Chapter 7 questions

clavicula · April 3, 2024, 3:17am

When I call `to_tf_dataset`, I get the following error:

/opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197: FutureWarning: In the future `np.object` will be defined as the corresponding NumPy scalar.
  (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[24], line 1
----> 1 tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
      2     columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
      3     collate_fn=data_collator,
      4     shuffle=True,
      5     batch_size=16,
      6 )
      8 tf_eval_dataset = tokenized_datasets["validation"].to_tf_dataset(
      9     columns=["attention_mask", "input_ids", "labels", "token_type_ids"],
     10     collate_fn=data_collator,
     11     shuffle=False,
     12     batch_size=16,
     13 )

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:381, in TensorflowDatasetMixin.to_tf_dataset(self, columns, batch_size, shuffle, collate_fn, drop_remainder, collate_fn_args, label_cols, dummy_labels, prefetch)
    378 retained_columns = [key for key in self.features.keys() if key in cols_to_retain]
    379 dataset = self.with_format("numpy", columns=retained_columns)
--> 381 columns_to_dtypes, output_signature = self._get_output_signature(
    382     dataset, collate_fn, collate_fn_args, batch_size=batch_size if drop_remainder else None
    383 )
    384 all_columns = list(columns_to_dtypes.keys())
    385 all_dtypes = list(columns_to_dtypes.values())

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:244, in TensorflowDatasetMixin._get_output_signature(dataset, collate_fn, collate_fn_args, batch_size)
    242     raise ValueError("Unable to get the output signature because the dataset is empty.")
    243 test_batch_size = min(len(dataset), 4)
--> 244 test_batch = dataset[:test_batch_size]
    245 test_batch = [{key: value[i] for key, value in test_batch.items()} for i in range(test_batch_size)]
    246 test_batch = collate_fn(test_batch, **collate_fn_args)

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1764, in Dataset.__getitem__(self, key)
   1762 def __getitem__(self, key):  # noqa: F811
   1763     """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 1764     return self._getitem(
   1765         key,
   1766     )

File /opt/conda/lib/python3.10/site-packages/datasets/arrow_dataset.py:1749, in Dataset._getitem(self, key, decoded, **kwargs)
   1747 formatter = get_formatter(format_type, features=self.features, decoded=decoded, **format_kwargs)
   1748 pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
-> 1749 formatted_output = format_table(
   1750     pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns
   1751 )
   1752 return formatted_output

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:540, in format_table(table, key, formatter, format_columns, output_all_columns)
    538 else:
    539     pa_table_to_format = pa_table.drop(col for col in pa_table.column_names if col not in format_columns)
--> 540     formatted_output = formatter(pa_table_to_format, query_type=query_type)
    541     if output_all_columns:
    542         if isinstance(formatted_output, MutableMapping):

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:285, in Formatter.__call__(self, pa_table, query_type)
    283     return self.format_column(pa_table)
    284 elif query_type == "batch":
--> 285     return self.format_batch(pa_table)

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:346, in NumpyFormatter.format_batch(self, pa_table)
    345 def format_batch(self, pa_table: pa.Table) -> dict:
--> 346     batch = self.numpy_arrow_extractor(**self.np_array_kwargs).extract_batch(pa_table)
    347     if self.decoded:
    348         batch = self.python_features_decoder.decode_batch(batch)

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in NumpyArrowExtractor.extract_batch(self, pa_table)
    159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160     return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:160, in <dictcomp>(.0)
    159 def extract_batch(self, pa_table: pa.Table) -> dict:
--> 160     return {col: self._arrow_array_to_numpy(pa_table[col]) for col in pa_table.column_names}

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:196, in NumpyArrowExtractor._arrow_array_to_numpy(self, pa_array)
    194         array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    195 if len(array) > 0:
--> 196     if any(
    197         (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
    198         or (isinstance(x, float) and np.isnan(x))
    199         for x in array
    200     ):
    201         return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
    202 return np.array(array, copy=False, **self.np_array_kwargs)

File /opt/conda/lib/python3.10/site-packages/datasets/formatting/formatting.py:197, in <genexpr>(.0)
    194         array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
    195 if len(array) > 0:
    196     if any(
--> 197         (isinstance(x, np.ndarray) and (x.dtype == np.object or x.shape != array[0].shape))
    198         or (isinstance(x, float) and np.isnan(x))
    199         for x in array
    200     ):
    201         return np.array(array, copy=False, **{**self.np_array_kwargs, "dtype": np.object})
    202 return np.array(array, copy=False, **self.np_array_kwargs)

File /opt/conda/lib/python3.10/site-packages/numpy/__init__.py:324, in __getattr__(attr)
    319     warnings.warn(
    320         f"In the future `np.{attr}` will be defined as the "
    321         "corresponding NumPy scalar.", FutureWarning, stacklevel=2)
    323 if attr in __former_attrs__:
--> 324     raise AttributeError(__former_attrs__[attr])
    326 if attr == 'testing':
    327     import numpy.testing as testing

AttributeError: module 'numpy' has no attribute 'object'.
`np.object` was a deprecated alias for the builtin `object`. To avoid this error in existing code, use `object` by itself. Doing this will not modify any behavior and is safe. 
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

Topic		Replies	Views
Chapter 3 questions Course	151	10656	October 6, 2025
Fine Tuning IMDb tutorial - Unable to reproduce and adapt Beginners	19	8607	August 21, 2020
Transformers v3.0.0 is out! 🤗Transformers	0	1953	July 7, 2020
Seq2SeqTrainer: enabled must be a bool (got NoneType) 🤗Transformers	15	3972	December 5, 2022
Tutorial: Fine-tuning with custom datasets – sentiment, NER, and question answering 🤗Transformers	19	12953	February 12, 2024

Chapter 7 questions

Related topics