Can't create dataset with encoding

I have a CSV which I can only read in Python via:

df = pd.read_csv(file_path, verbose=True, encoding='ascii', encoding_errors='surrogateescape')
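(For reference, encoding_errors='surrogateescape' keeps any byte that isn't valid ASCII as a lone surrogate code point in the U+DC80-U+DCFF range, so the dataframe can contain strings that are not valid UTF-8. A minimal illustration:)

# byte 0xdb is not valid ASCII, so surrogateescape maps it to '\udcdb'
text = b'\xdb'.decode('ascii', errors='surrogateescape')
print(repr(text))      # '\udcdb'
text.encode('utf-8')   # UnicodeEncodeError: surrogates not allowed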

Now I create a small dataset from a summarized dataframe, summarized_data, derived from it:

dataset = Dataset.from_pandas(summarized_data)

but I get this error:

---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
/home/ahmad/Desktop/aya/urdu_instruct_dataset/src/process_data.ipynb Cell 15 line 4
      1 from datasets import Dataset
      3 # convert to HuggingFace Dataset
----> 4 dataset = Dataset.from_pandas(summarized_data)

File ~/anaconda3/lib/python3.11/site-packages/datasets/arrow_dataset.py:846, in Dataset.from_pandas(cls, df, features, info, split, preserve_index)
    844     info = DatasetInfo()
    845 info.features = features
--> 846 table = InMemoryTable.from_pandas(
    847     df=df,
    848     preserve_index=preserve_index,
    849 )
    850 if features is not None:
    851     # more expensive cast than InMemoryTable.from_pandas(..., schema=features.arrow_schema)
    852     # needed to support the str to Audio conversion for instance
    853     table = table.cast(features.arrow_schema)

File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:747, in InMemoryTable.from_pandas(cls, *args, **kwargs)
    691 @classmethod
    692 def from_pandas(cls, *args, **kwargs):
    693     """
    694     Convert pandas.DataFrame to an Arrow Table.
    695 
   (...)
    745     ```
    746     """
--> 747     return cls(pa.Table.from_pandas(*args, **kwargs))

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/table.pxi:3557, in pyarrow.lib.Table.from_pandas()

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/pandas_compat.py:624, in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
    622     for i, maybe_fut in enumerate(arrays):
    623         if isinstance(maybe_fut, futures.Future):
--> 624             arrays[i] = maybe_fut.result()
    626 types = [x.type for x in arrays]
    628 if schema is None:

File ~/anaconda3/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
    447     raise CancelledError()
    448 elif self._state == FINISHED:
--> 449     return self.__get_result()
    451 self._condition.wait(timeout)
    453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:

File ~/anaconda3/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
    399 if self._exception:
    400     try:
--> 401         raise self._exception
    402     finally:
    403         # Break a reference cycle with the exception in self._exception
    404         self = None

File ~/anaconda3/lib/python3.11/concurrent/futures/thread.py:58, in _WorkItem.run(self)
     55     return
     57 try:
---> 58     result = self.fn(*self.args, **self.kwargs)
     59 except BaseException as exc:
     60     self.future.set_exception(exc)

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/pandas_compat.py:592, in dataframe_to_arrays.<locals>.convert_column(col, field)
    589     type_ = field.type
    591 try:
--> 592     result = pa.array(col, type=type_, from_pandas=True, safe=safe)
    593 except (pa.ArrowInvalid,
    594         pa.ArrowNotImplementedError,
    595         pa.ArrowTypeError) as e:
    596     e.args += ("Conversion failed for column {!s} with type {!s}"
    597                .format(col.name, col.dtype),)

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/array.pxi:316, in pyarrow.lib.array()

File ~/anaconda3/lib/python3.11/site-packages/pyarrow/array.pxi:83, in pyarrow.lib._ndarray_to_array()

UnicodeEncodeError: 'utf-8' codec can't encode character '\udcdb' in position 108: surrogates not allowed

How can I fix it?

How did you go from df to summarized_data? That step could help explain the error.

Also, do you get the same error if you just convert the df, i.e. with Dataset.from_pandas(df)? If so, that points to an issue with your input data file/encoding.
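For example (assuming df is the dataframe from your read_csv call):

from datasets import Dataset

# if this raises the same UnicodeEncodeError, the bad characters come
# from the source file rather than from your summarization step
dataset = Dataset.from_pandas(df)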

I suspect you need to fix the un-encodable characters in your input .csv file. Here is a bit more detail on a possible approach.
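One way to do that (a sketch, not tested on your data): round-trip the string columns through UTF-8 with errors='replace', which swaps each lone surrogate for '?' so PyArrow can encode the columns. This assumes summarized_data is a pandas DataFrame whose text columns have dtype object:

for col in summarized_data.select_dtypes(include='object').columns:
    # encoding with errors='replace' turns each lone surrogate into '?';
    # use errors='ignore' instead if you'd rather drop them entirely
    summarized_data[col] = (
        summarized_data[col]
        .str.encode('utf-8', errors='replace')
        .str.decode('utf-8')
    )

dataset = Dataset.from_pandas(summarized_data)

Longer term, it's worth checking what the file's real encoding is (e.g. with charset_normalizer or chardet); if it turns out to be something other than ASCII, reading it with the correct encoding avoids the surrogates in the first place.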