Hello everyone!
I am currently training a GPT-2 model from scratch using my own CIF tokenizer. The goal is to generate crystallographic information files (CIFs) with an LLM. Since some of the CIFs contain more tokens than the context length, I use strided tokenization with return_overflowing_tokens=True, padding every window to the max context length. This approach has worked for all my other datasets (different materials, same format). However, with the MP-20 dataset I get a 'TypeError: Couldn't cast array of type int64 to null' during the dataset.map() call.
Dataset info and the first example of the train split:
Split: train
Number of examples: 27136
Column names: ['Database', 'Material ID', 'Reduced Formula', 'CIF']
Features: {'Database': Value(dtype='string', id=None), 'Material ID': Value(dtype='string', id=None), 'Reduced Formula': Value(dtype='string', id=None), 'CIF': Value(dtype='string', id=None)}
First example:
{'Database': 'MP-20', 'Material ID': 'mp-1221227', 'Reduced Formula': 'Na3MnCoNiO6', 'CIF': "data_Na6Mn2Co2Ni2O12\nloop_\n _atom_type_symbol\n _atom_type_electronegativity\n _atom_type_radius\n _atom_type_ionic_radius\n Na 0.9300 1.8000 1.1600\n [...] \n"}
Data loading script:
from transformers import DataCollatorForLanguageModeling


def tokenize_function(examples, tokenizer, context_length, stride):
    '''
    Tokenize a dataset using a sliding-window approach.
    Processes the "CIF" column in a batch of examples.
    '''
    # Add BOS and EOS tokens to each CIF sequence
    bos_token = tokenizer.bos_token
    eos_token = tokenizer.eos_token
    # Tokenize the CIF column with BOS and EOS tokens
    return tokenizer(
        [bos_token + example + eos_token for example in examples["CIF"]],
        truncation=True,
        max_length=context_length,
        padding="max_length",
        stride=stride,
        return_overflowing_tokens=True,
        return_special_tokens_mask=True,
        return_offsets_mapping=False,
    )


def load_data(tokenizer, dataset, context_length, stride, dataset_streaming=False):
    # Wrap the tokenize function so map() only needs to pass the examples
    def tokenize_wrapper(examples):
        return tokenize_function(examples, tokenizer, context_length, stride)

    # Apply tokenization to the dataset
    tokenized_dataset = dataset.map(
        tokenize_wrapper,        # the wrapped tokenize function
        batched=True,            # apply to batches of examples rather than individual examples
        remove_columns=["CIF"],  # remove the original "CIF" column; the model only needs the tokenized data
        # Only use num_proc if not streaming
        **({"num_proc": 8} if not dataset_streaming else {})
    )

    # Create a data collator for training (causal LM, so no masked language modeling)
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )
    return tokenized_dataset, data_collator
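For completeness, load_data is called from my training script roughly like this (the tokenizer path, data files, and the context_length/stride values below are placeholders, not my exact configuration):

from datasets import load_dataset
from transformers import PreTrainedTokenizerFast

# Placeholder paths and hyperparameters for illustration only
tokenizer = PreTrainedTokenizerFast.from_pretrained("path/to/cif-tokenizer")
dataset = load_dataset("csv", data_files={"train": "mp20_train.csv"})

tokenized_dataset, data_collator = load_data(
    tokenizer=tokenizer,
    dataset=dataset,
    context_length=1024,
    stride=512,
)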
Full error:
Conditionality is not activated
Map (num_proc=8):  91%|█████████████████████████████████▋       | 24744/27136 [00:04<00:00, 5200.22 examples/s]
multiprocess.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/multiprocess/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 678, in _write_generator_to_qu
eue
for i, result in enumerate(func(**kwargs)):
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3499, in _map_single
writer.write_batch(batch)
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/arrow_writer.py", line 605, in write_batch
arrays.append(pa.array(typed_sequence))
File "pyarrow/array.pxi", line 250, in pyarrow.lib.array
File "pyarrow/array.pxi", line 114, in pyarrow.lib._handle_arrow_array_protocol
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/arrow_writer.py", line 243, in __arrow_array__
out = cast_array_to_feature(
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/table.py", line 1797, in wrapper
return func(array, *args, **kwargs)
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/table.py", line 2065, in cast_array_to_feature
casted_array_values = _c(array.values, feature.feature)
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/table.py", line 1797, in wrapper
return func(array, *args, **kwargs)
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/table.py", line 2102, in cast_array_to_feature
return array_cast(
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/table.py", line 1797, in wrapper
return func(array, *args, **kwargs)
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/table.py", line 1948, in array_cast
raise TypeError(f"Couldn't cast array of type {_short_str(array.type)} to {_short_str(pa_type)}")
TypeError: Couldn't cast array of type int64 to null
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/cyprien/CrystaLLMv2_CG/_train.py", line 194, in <module>
main()
File "/home/cyprien/CrystaLLMv2_CG/_train.py", line 96, in main
tokenized_dataset, data_collator = load_data(
File "/home/cyprien/CrystaLLMv2_CG/_dataloader.py", line 106, in load_data
tokenized_dataset = dataset.map(
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/dataset_dict.py", line 886, in map
{
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/dataset_dict.py", line 887, in <dictcomp>
k: dataset.map(
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 560, in wrapper
out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3165, in map
for rank, done, content in iflatmap_unordered(
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 718, in iflatmap_unordered
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/datasets/utils/py_utils.py", line 718, in <listcomp>
[async_result.get(timeout=0.05) for async_result in async_results]
File "/home/cyprien/miniconda3/envs/crystallmv2_venv/lib/python3.10/site-packages/multiprocess/pool.py", line 774, in get
raise self._value
TypeError: Couldn't cast array of type int64 to null
If I increase the context length so that it is larger than the longest tokenized CIF (i.e. nothing overflows), I do not get the error. I have also tried specifying the features explicitly in map(), as suggested in some posts on this forum, but I am unsure whether I am doing it correctly; my attempt is shown below.
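For reference, my attempt looked roughly like this (the dtypes, and whether overflow_to_sample_mapping needs to be listed at all, are guesses on my part):

from datasets import Features, Sequence, Value

# My guess at an explicit schema for the tokenizer output columns
tokenized_features = Features({
    "input_ids": Sequence(Value("int64")),
    "attention_mask": Sequence(Value("int64")),
    "special_tokens_mask": Sequence(Value("int64")),
    "overflow_to_sample_mapping": Value("int64"),
})

tokenized_dataset = dataset.map(
    tokenize_wrapper,
    batched=True,
    remove_columns=["CIF"],
    features=tokenized_features,
    **({"num_proc": 8} if not dataset_streaming else {})
)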
Does anyone know how I could tackle this problem? I am still quite new to Hugging Face, so I am struggling to understand what has gone wrong for this particular dataset. Thank you in advance for the help.