Having an issue with 'NoneType' after using the to_tf_dataset() function

Hi, all. I am working on chapter 3 of the Hugging Face NLP course. I have replicated the code in my IDE, but when I run it, producing the training and validation datasets ends in a “NoneType” error.

Here is my code:

from datasets import load_dataset
from transformers import AutoTokenizer 
from transformers import DataCollatorWithPadding

checkpoint = "bert-base-uncased"

raw_datasets = load_dataset("glue", "mrpc")

raw_train_dataset = raw_datasets["train"]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
     return tokenizer(example["sentence1"], example['sentence2'], truncation=True)


tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)

data_colator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_colator,
    batch_size=8,
)

tf_validation_dataset = tokenized_dataset["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_colator,
    batch_size=8,
)

print(tf_train_dataset)

This is the traceback produced:

2023-08-28 09:07:39.083722: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]
Map:  27%|██▋       | 1000/3668 [00:00<00:00, 7423.51 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 11583.81 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 11444.43 examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 12600.33 examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 14811.82 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 14503.85 examples/s]
/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/datasets/arrow_dataset.py:400: FutureWarning: The output of `to_tf_dataset` will change when a passing single element list for `labels` or `columns` in the next datasets version. To return a tuple structure rather than dict, pass a single string.
Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
  warnings.warn(
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
Exception ignored in: <function AtomicFunction.__del__ at 0x12d4c2e80>
Traceback (most recent call last):
  File "/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0x12d4c2e80>
Traceback (most recent call last):
  File "/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0x12d4c2e80>
Traceback (most recent call last):
  File "/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0x12d4c2e80>
Traceback (most recent call last):
  File "/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0x12d4c2e80>
Traceback (most recent call last):
  File "/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0x12d4c2e80>
Traceback (most recent call last):
  File "/Users/cdwalke8/anaconda3/envs/test/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable

Here are the package versions in my environment:

absl-py                   1.4.0                    pypi_0    pypi
aiohttp                   3.8.5                    pypi_0    pypi
aiosignal                 1.3.1                    pypi_0    pypi
astunparse                1.6.3                    pypi_0    pypi
async-timeout             4.0.3                    pypi_0    pypi
attrs                     23.1.0                   pypi_0    pypi
bzip2                     1.0.8                h0d85af4_4    conda-forge
ca-certificates           2022.9.24            h033912b_0    conda-forge
cachetools                5.3.1                    pypi_0    pypi
certifi                   2023.7.22                pypi_0    pypi
charset-normalizer        3.2.0                    pypi_0    pypi
datasets                  2.14.4                   pypi_0    pypi
dill                      0.3.7                    pypi_0    pypi
filelock                  3.12.2                   pypi_0    pypi
flatbuffers               23.5.26                  pypi_0    pypi
frozenlist                1.4.0                    pypi_0    pypi
fsspec                    2023.6.0                 pypi_0    pypi
gast                      0.4.0                    pypi_0    pypi
google-auth               2.22.0                   pypi_0    pypi
google-auth-oauthlib      1.0.0                    pypi_0    pypi
google-pasta              0.2.0                    pypi_0    pypi
grpcio                    1.57.0                   pypi_0    pypi
h5py                      3.9.0                    pypi_0    pypi
huggingface-hub           0.16.4                   pypi_0    pypi
idna                      3.4                      pypi_0    pypi
keras                     2.13.1                   pypi_0    pypi
libclang                  16.0.6                   pypi_0    pypi
libffi                    3.4.2                h0d85af4_5    conda-forge
libsqlite                 3.40.0               ha978bb4_0    conda-forge
libzlib                   1.2.13               hfd90126_4    conda-forge
markdown                  3.4.4                    pypi_0    pypi
markupsafe                2.1.3                    pypi_0    pypi
multidict                 6.0.4                    pypi_0    pypi
multiprocess              0.70.15                  pypi_0    pypi
ncurses                   6.3                  h96cf925_1    conda-forge
numpy                     1.24.3                   pypi_0    pypi
oauthlib                  3.2.2                    pypi_0    pypi
openssl                   3.0.7                hfd90126_0    conda-forge
opt-einsum                3.3.0                    pypi_0    pypi
packaging                 23.1                     pypi_0    pypi
pandas                    2.0.3                    pypi_0    pypi
pip                       22.3.1             pyhd8ed1ab_0    conda-forge
protobuf                  4.24.1                   pypi_0    pypi
pyarrow                   13.0.0                   pypi_0    pypi
pyasn1                    0.5.0                    pypi_0    pypi
pyasn1-modules            0.3.0                    pypi_0    pypi
python                    3.11.0          h559f36b_0_cpython    conda-forge
python-dateutil           2.8.2                    pypi_0    pypi
pytz                      2023.3                   pypi_0    pypi
pyyaml                    6.0.1                    pypi_0    pypi
readline                  8.1.2                h3899abd_0    conda-forge
regex                     2023.8.8                 pypi_0    pypi
requests                  2.31.0                   pypi_0    pypi
requests-oauthlib         1.3.1                    pypi_0    pypi
rsa                       4.9                      pypi_0    pypi
safetensors               0.3.2                    pypi_0    pypi
setuptools                65.5.1             pyhd8ed1ab_0    conda-forge
six                       1.16.0                   pypi_0    pypi
tensorboard               2.13.0                   pypi_0    pypi
tensorboard-data-server   0.7.1                    pypi_0    pypi
tensorflow                2.13.0                   pypi_0    pypi
tensorflow-estimator      2.13.0                   pypi_0    pypi
tensorflow-io-gcs-filesystem 0.33.0                   pypi_0    pypi
termcolor                 2.3.0                    pypi_0    pypi
tk                        8.6.12               h5dbffcc_0    conda-forge
tokenizers                0.13.3                   pypi_0    pypi
tqdm                      4.66.1                   pypi_0    pypi
transformers              4.31.0                   pypi_0    pypi
typing-extensions         4.5.0                    pypi_0    pypi
tzdata                    2023.3                   pypi_0    pypi
urllib3                   1.26.16                  pypi_0    pypi
werkzeug                  2.3.7                    pypi_0    pypi
wheel                     0.38.4             pyhd8ed1ab_0    conda-forge
wrapt                     1.15.0                   pypi_0    pypi
xxhash                    3.3.0                    pypi_0    pypi
xz                        5.2.6                h775f41a_0    conda-forge
yarl                      1.9.2                    pypi_0    pypi

Can someone let me know what I am doing wrong?

Thanks.

This code runs without issues in Colab, so the problem could be related to your environment (maybe try creating a new one).


What exactly should I install via pip? For example, I know I need transformers, and according to the list above I have version 4.31.0. Which particular library versions is the Colab using?
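For comparison, I could run the same quick version check locally and in the Colab (just the libraries this example imports directly):

# Print the versions of the libraries this example imports directly, so the
# local environment can be compared against the Colab one.
import datasets
import tensorflow
import transformers

print("transformers:", transformers.__version__)
print("datasets:", datasets.__version__)
print("tensorflow:", tensorflow.__version__)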

Did you ever get this fixed? I’m having the same issue, but in a slightly different form: the example runs fine when I run it in the Python REPL, but when I run it as a script I get the error.

REPL output

Python 3.11.7 (main, Jan 11 2024, 07:22:38) [GCC 12.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from datasets import load_dataset
>>> from transformers import AutoTokenizer
/usr/local/lib/python3.11/site-packages/transformers/utils/hub.py:123: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
>>> from transformers import DataCollatorWithPadding
>>> import tensorflow; print(tensorflow.__version__)
2.13.0
>>> 
>>> checkpoint = "bert-base-uncased"
>>> 
>>> raw_datasets = load_dataset("glue", "mrpc")
>>> 
>>> raw_train_dataset = raw_datasets["train"]
>>> 
>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
>>> 
>>> def tokenize_function(example):
...      return tokenizer(example["sentence1"], example['sentence2'], truncation=True)
... 
>>> 
>>> tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
Map: 100%|████████████████████████████████████████████████████████████████| 3668/3668 [00:00<00:00, 29494.49 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 23673.36 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████| 1725/1725 [00:00<00:00, 29761.03 examples/s]
>>> 
>>> data_colator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')
>>> 
>>> tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
...     columns=["attention_mask", "input_ids", "token_type_ids"],
...     label_cols=["labels"],
...     shuffle=True,
...     collate_fn=data_colator,
...     batch_size=8,
... )
/usr/local/lib/python3.11/site-packages/datasets/arrow_dataset.py:400: FutureWarning: The output of `to_tf_dataset` will change when a passing single element list for `labels` or `columns` in the next datasets version. To return a tuple structure rather than dict, pass a single string.
Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
  warnings.warn(
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
>>> 
>>> #tf_validation_dataset = tokenized_dataset["validation"].to_tf_dataset(
>>> #    columns=["attention_mask", "input_ids", "token_type_ids"],
>>> #    label_cols=["labels"],
>>> #    shuffle=False,
>>> #    collate_fn=data_colator,
>>> #    batch_size=8,
>>> #)
>>> 
>>> print(tf_train_dataset)
<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
>>> 

Script output

python test.py 
/usr/local/lib/python3.11/site-packages/transformers/utils/hub.py:123: FutureWarning: Using `TRANSFORMERS_CACHE` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.
  warnings.warn(
2.13.0
Map: 100%|██████████████████████████████████████████████████████████████████| 408/408 [00:00<00:00, 20562.04 examples/s]
/usr/local/lib/python3.11/site-packages/datasets/arrow_dataset.py:400: FutureWarning: The output of `to_tf_dataset` will change when a passing single element list for `labels` or `columns` in the next datasets version. To return a tuple structure rather than dict, pass a single string.
Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 
  warnings.warn(
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
<_PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>
Exception ignored in: <function AtomicFunction.__del__ at 0xffff35c45ee0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0xffff35c45ee0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable
Exception ignored in: <function AtomicFunction.__del__ at 0xffff35c45ee0>
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/site-packages/tensorflow/python/eager/polymorphic_function/atomic_function.py", line 218, in __del__
TypeError: 'NoneType' object is not subscriptable

Script content

from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import tensorflow; print(tensorflow.__version__)

checkpoint = "bert-base-uncased"

raw_datasets = load_dataset("glue", "mrpc")

raw_train_dataset = raw_datasets["train"]

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
     return tokenizer(example["sentence1"], example['sentence2'], truncation=True)


tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)

data_colator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='tf')

tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_colator,
    batch_size=8,
)

#tf_validation_dataset = tokenized_dataset["validation"].to_tf_dataset(
#    columns=["attention_mask", "input_ids", "token_type_ids"],
#    label_cols=["labels"],
#    shuffle=False,
#    collate_fn=data_colator,
#    batch_size=8,
#)

print(tf_train_dataset)
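All of the failures are reported as “Exception ignored in: AtomicFunction.__del__”, i.e. they happen while objects are being garbage-collected, apparently during interpreter shutdown, which might explain why the REPL session doesn’t show them. One thing I’m going to try, purely as a guess and not a confirmed fix, is keeping the tf.data objects out of module globals so they are released before the interpreter starts tearing modules down:

# Untested idea: build everything inside a main() function instead of at module
# level, so the tf.data.Dataset objects are released when main() returns rather
# than during interpreter shutdown (where the __del__ errors seem to occur).
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding


def main():
    checkpoint = "bert-base-uncased"
    raw_datasets = load_dataset("glue", "mrpc")
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize_function(example):
        return tokenizer(example["sentence1"], example["sentence2"], truncation=True)

    tokenized_dataset = raw_datasets.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

    tf_train_dataset = tokenized_dataset["train"].to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=8,
    )
    print(tf_train_dataset)


if __name__ == "__main__":
    main()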