I’m tokenizing my dataset with the following code:
```python
train_dataset = load_dataset(
    "json",
    data_files=os.path.join(data_path, 'train_data.json'),
    split='train',
)
tokenized_dataset = train_dataset.map(
    process_func,
    batched=True,
    remove_columns=train_dataset.column_names,
)
```
The `process_func` is:
```python
def process_func(self, sentence: dict) -> dict:
    input_ids, attention_mask, labels = [], [], []
    # Tokenize the source (modern Chinese) text without special tokens
    original_text = self.tokenizer(sentence['input'],
                                   add_special_tokens=False,
                                   max_length=self.max_source_length,
                                   truncation=True)
    # Tokenize the target (classical Chinese) text
    translation = self.tokenizer(sentence['output'],
                                 add_special_tokens=False,
                                 max_length=self.max_target_length,
                                 truncation=True)
    # Concatenate source + target, append the pad token as an end-of-sequence
    # marker, and mask the source portion of the labels with -100
    input_ids = original_text["input_ids"] + translation["input_ids"] + [self.tokenizer.pad_token_id]
    attention_mask = original_text["attention_mask"] + translation["attention_mask"] + [1]
    labels = [-100] * len(original_text["input_ids"]) + translation["input_ids"] + [self.tokenizer.pad_token_id]
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }
```
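For a single record (i.e., without batching) I expect the function to behave roughly like this sketch (token ids are placeholders, not real values):

```python
# One record as stored in the JSON file
sentence = {
    "input": "现代文:……古文:",
    "output": "其时蜀师十余万……",
}

# process_func returns three lists of the same length; schematically:
# input_ids      = <ids of sentence['input']> + <ids of sentence['output']> + [pad_token_id]
# attention_mask = [1] * len(input_ids)
# labels         = [-100] * len(<ids of sentence['input']>) + <ids of sentence['output']> + [pad_token_id]
```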
However, the `map` call raised the following errors (the first traceback comes from the VS Code debugger itself; the second is from my script):
```
Traceback (most recent call last):
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_comm.py", line 252, in _on_run
    self.process_net_command_json(self.py_db, json_contents)
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_process_net_command_json.py", line 193, in process_net_command_json
    cmd = on_request(py_db, request)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_process_net_command_json.py", line 981, in on_stacktrace_request
    self.api.request_stack(py_db, request.seq, thread_id, fmt=fmt, start_frame=start_frame, levels=levels)
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_api.py", line 207, in request_stack
    if internal_get_thread_stack.can_be_executed_by(get_current_thread_id(threading.current_thread())):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_comm.py", line 655, in can_be_executed_by
    self._cmd = py_db.cmd_factory.make_get_thread_stack_message(
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_net_command_factory_json.py", line 278, in make_get_thread_stack_message
    colno, endcolno = line_col_info.map_columns_to_line(line_text)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_frame_utils.py", line 101, in map_columns_to_line
    colno = _utf8_byte_offset_to_character_offset(original_line, self.colno)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_frame_utils.py", line 143, in _utf8_byte_offset_to_character_offset
    if byte_offset > offset:
       ^^^^^^^^^^^^^^^^^^^^
TypeError: '>' not supported between instances of 'int' and 'NoneType'
```
```
Traceback (most recent call last):
  File "e:\python\Lib\runpy.py", line 198, in _run_module_as_main
    return _run_code(code, main_globals, None,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "e:\python\Lib\runpy.py", line 88, in _run_code
    exec(code, run_globals)
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy\__main__.py", line 39, in <module>
    cli.main()
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 430, in main
    run()
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\adapter/../..\debugpy\launcher/../..\debugpy/..\debugpy\server\cli.py", line 284, in run_file
    runpy.run_path(target, run_name="__main__")
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 321, in run_path
    return _run_module_code(code, init_globals, run_name,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 135, in _run_module_code
    _run_code(code, mod_globals, init_globals,
  File "c:\Users\Xiao\.vscode\extensions\ms-python.debugpy-2024.10.0-win32-x64\bundled\libs\debugpy\_vendored\pydevd\_pydevd_bundle\pydevd_runpy.py", line 124, in _run_code
    exec(code, run_globals)
  File "E:\LLM\fine_tuning_test\finetune.py", line 43, in <module>
    tokenized_train_dataset = data_prcocessor.get_tokenized_dataset(train_dataset, data_path, 'train')
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\LLM\fine_tuning_test\utils\data_processor.py", line 62, in get_tokenized_dataset
    tokenized_dataset = dataset.map(
                        ^^^^^^^^^^^^
  File "E:\python\Lib\site-packages\datasets\arrow_dataset.py", line 602, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\python\Lib\site-packages\datasets\arrow_dataset.py", line 567, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
                                           ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\python\Lib\site-packages\datasets\arrow_dataset.py", line 3161, in map
    for rank, done, content in Dataset._map_single(**dataset_kwargs):
  File "E:\python\Lib\site-packages\datasets\arrow_dataset.py", line 3575, in _map_single
    writer.write_batch(batch)
  File "E:\python\Lib\site-packages\datasets\arrow_writer.py", line 568, in write_batch
    arrays.append(pa.array(typed_sequence))
                  ^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow\array.pxi", line 248, in pyarrow.lib.array
  File "pyarrow\array.pxi", line 112, in pyarrow.lib._handle_arrow_array_protocol
  File "E:\python\Lib\site-packages\datasets\arrow_writer.py", line 193, in __arrow_array__
    out = pa.array(cast_to_python_objects(data, only_1d_for_numpy=True))
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "pyarrow\array.pxi", line 368, in pyarrow.lib.array
  File "pyarrow\array.pxi", line 42, in pyarrow.lib._sequence_to_array
  File "pyarrow\error.pxi", line 155, in pyarrow.lib.pyarrow_internal_check_status
  File "pyarrow\error.pxi", line 92, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: cannot mix list and non-list, non-null values
```
My dataset is a JSON file like this (about 100,000 records):
```json
[
    {
        "input": "现代文:这时,蜀军尚有十多万人,自绵竹至汉州,陈兵在一千多里的防线上,首尾相连,但是都无心思跟入侵的唐军战斗。 古文:",
        "output": "其时蜀师十余万,自绵汉至于深渡千余里,首尾相继,皆无心斗敌。"
    },
    {
        "input": "现代文:齐国有个叫奉朝请的人,家中豪华奢侈,如不是自己亲手杀的牛,就觉得不好吃。 古文:",
        "output": "齐国有一奉朝请,家甚豪侈,非手杀牛,则噉之不美。"
    }
]
```
I saw a similar post (Question answering bot: fine-tuning with custom dataset) and converted my dataset to line-delimited JSON (JSON Lines), but the problem persists.
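In case it matters, I did the conversion with a small script along these lines (file names are placeholders):

```python
import json

# Convert the array-of-objects JSON file into line-delimited JSON (JSON Lines),
# writing one record per line.
with open('train_data.json', encoding='utf-8') as f:
    records = json.load(f)

with open('train_data.jsonl', 'w', encoding='utf-8') as f:
    for record in records:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')
```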
The strange thing is that I didn't encounter any problems when I used the same code to process a smaller dataset (about 20,000 records) in the same format. In addition, if I remove `batched=True`, the code runs without errors. Does batch processing impose additional requirements on the data format?
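For comparison, this is the variant that runs cleanly; it is identical except that `batched=True` is removed, so `process_func` receives one record at a time:

```python
tokenized_dataset = train_dataset.map(
    process_func,
    remove_columns=train_dataset.column_names,
)
```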