I have a CSV file that I can only read in Python via:
df = pd.read_csv(file_path, verbose=True, encoding='ascii', encoding_errors='surrogateescape')
Next, I build a smaller pandas DataFrame from it, named summarized_data, and try to convert it to a Hugging Face Dataset:
dataset = Dataset.from_pandas(summarized_data)
but I get this error:
---------------------------------------------------------------------------
UnicodeEncodeError Traceback (most recent call last)
/home/ahmad/Desktop/aya/urdu_instruct_dataset/src/process_data.ipynb Cell 15 line 4
1 from datasets import Dataset
3 # convert to HuggingFace Dataset
----> 4 dataset = Dataset.from_pandas(summarized_data)
File ~/anaconda3/lib/python3.11/site-packages/datasets/arrow_dataset.py:846, in Dataset.from_pandas(cls, df, features, info, split, preserve_index)
844 info = DatasetInfo()
845 info.features = features
--> 846 table = InMemoryTable.from_pandas(
847 df=df,
848 preserve_index=preserve_index,
849 )
850 if features is not None:
851 # more expensive cast than InMemoryTable.from_pandas(..., schema=features.arrow_schema)
852 # needed to support the str to Audio conversion for instance
853 table = table.cast(features.arrow_schema)
File ~/anaconda3/lib/python3.11/site-packages/datasets/table.py:747, in InMemoryTable.from_pandas(cls, *args, **kwargs)
691 @classmethod
692 def from_pandas(cls, *args, **kwargs):
693 """
694 Convert pandas.DataFrame to an Arrow Table.
695
(...)
745 ```
746 """
--> 747 return cls(pa.Table.from_pandas(*args, **kwargs))
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/table.pxi:3557, in pyarrow.lib.Table.from_pandas()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/pandas_compat.py:624, in dataframe_to_arrays(df, schema, preserve_index, nthreads, columns, safe)
622 for i, maybe_fut in enumerate(arrays):
623 if isinstance(maybe_fut, futures.Future):
--> 624 arrays[i] = maybe_fut.result()
626 types = [x.type for x in arrays]
628 if schema is None:
File ~/anaconda3/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout)
447 raise CancelledError()
448 elif self._state == FINISHED:
--> 449 return self.__get_result()
451 self._condition.wait(timeout)
453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
File ~/anaconda3/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self)
399 if self._exception:
400 try:
--> 401 raise self._exception
402 finally:
403 # Break a reference cycle with the exception in self._exception
404 self = None
File ~/anaconda3/lib/python3.11/concurrent/futures/thread.py:58, in _WorkItem.run(self)
55 return
57 try:
---> 58 result = self.fn(*self.args, **self.kwargs)
59 except BaseException as exc:
60 self.future.set_exception(exc)
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/pandas_compat.py:592, in dataframe_to_arrays.<locals>.convert_column(col, field)
589 type_ = field.type
591 try:
--> 592 result = pa.array(col, type=type_, from_pandas=True, safe=safe)
593 except (pa.ArrowInvalid,
594 pa.ArrowNotImplementedError,
595 pa.ArrowTypeError) as e:
596 e.args += ("Conversion failed for column {!s} with type {!s}"
597 .format(col.name, col.dtype),)
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/array.pxi:316, in pyarrow.lib.array()
File ~/anaconda3/lib/python3.11/site-packages/pyarrow/array.pxi:83, in pyarrow.lib._ndarray_to_array()
UnicodeEncodeError: 'utf-8' codec can't encode character '\udcdb' in position 108: surrogates not allowed
How can I fix this?