I want to train a model using SFTTrainer, with a dataset that streams the data from a local file (I don’t want to load everything into memory). The dataset seems to be working fine, but I’m getting an error with SFTTrainer. Here’s the code:
from datasets import load_dataset, Features, Value
from transformers import TrainingArguments
from trl import SFTTrainer

features = Features({'doc_id': Value('string'), 'text': Value('string')})
train_dataset = load_dataset('json', data_files="data/interim/collection.jsonl", split='train', streaming=True, features=features)
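# Sanity check: with streaming=True this is an IterableDataset, not a regular
# Dataset; peek at one record to confirm the JSONL parses as expected:
print(next(iter(train_dataset)))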
batch_size = 16
gradient_accumulation_steps = 1
num_train_epochs = 2
training_args = TrainingArguments(
output_dir="output/",
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size//2,
learning_rate=2e-4,
lr_scheduler_type="cosine",
warmup_ratio=0.1,
gradient_accumulation_steps=gradient_accumulation_steps,
gradient_checkpointing=True,
evaluation_strategy="epoch",
num_train_epochs=num_train_epochs,
# logging strategies
logging_strategy="steps",
logging_steps=1,
save_strategy="epoch", # saving is done at the end of each epoch
remove_unused_columns=True,
)
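# Aside (my assumption, separate from the error below): since a streaming
# IterableDataset has no __len__, I expect the Trainer to also need an explicit
# max_steps budget instead of num_train_epochs, something like:
#   training_args = TrainingArguments(..., max_steps=1000)  # hypothetical value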
trainer = SFTTrainer(
model,
train_dataset=train_dataset,
# eval_dataset=eval_dataset,
# packing=True, # pack samples together for efficient training
dataset_text_field="text",
data_collator=collator,
max_seq_length=512, # maximum sequence length
args=training_args,
# formatting_func=formatting_func, # format samples with a model schema
)
And the error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[22], line 1
----> 1 trainer = SFTTrainer(
2 model,
3 train_dataset=train_dataset,
4 # eval_dataset=eval_dataset,
5 # packing=True, # pack samples together for efficient training
6 dataset_text_field="text",
7 data_collator=collator,
8 max_seq_length=512, # maximum packed length
9 args=training_args,
10 # formatting_func=formatting_func, # format samples with a model schema
11 )
File /notebooks/env/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:283, in SFTTrainer.__init__(self, model, args, data_collator, train_dataset, eval_dataset, tokenizer, model_init, compute_metrics, callbacks, optimizers, preprocess_logits_for_metrics, peft_config, dataset_text_field, packing, formatting_func, max_seq_length, infinite, num_of_sequences, chars_per_token, dataset_num_proc, dataset_batch_size, neftune_noise_alpha, model_init_kwargs, dataset_kwargs, eval_packing)
281 dataset_kwargs = {}
282 if train_dataset is not None:
--> 283 train_dataset = self._prepare_dataset(
284 train_dataset,
285 tokenizer,
286 packing,
287 dataset_text_field,
288 max_seq_length,
289 formatting_func,
290 num_of_sequences,
291 chars_per_token,
292 remove_unused_columns=args.remove_unused_columns if args is not None else True,
293 **dataset_kwargs,
294 )
295 if eval_dataset is not None:
296 _multiple = isinstance(eval_dataset, dict)
File /notebooks/env/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:424, in SFTTrainer._prepare_dataset(self, dataset, tokenizer, packing, dataset_text_field, max_seq_length, formatting_func, num_of_sequences, chars_per_token, remove_unused_columns, append_concat_token, add_special_tokens, skip_prepare_dataset)
421 return dataset
423 if not packing:
--> 424 return self._prepare_non_packed_dataloader(
425 tokenizer,
426 dataset,
427 dataset_text_field,
428 max_seq_length,
429 formatting_func,
430 add_special_tokens,
431 remove_unused_columns,
432 )
434 else:
435 return self._prepare_packed_dataloader(
436 tokenizer,
437 dataset,
(...)
444 add_special_tokens,
445 )
File /notebooks/env/lib/python3.9/site-packages/trl/trainer/sft_trainer.py:492, in SFTTrainer._prepare_non_packed_dataloader(self, tokenizer, dataset, dataset_text_field, max_seq_length, formatting_func, add_special_tokens, remove_unused_columns)
486 if not remove_unused_columns and len(extra_columns) > 0:
487 warnings.warn(
488 "You passed `remove_unused_columns=False` on a non-packed dataset. This might create some issues with the default collator and yield to errors. If you want to "
489 f"inspect dataset other columns (in this case {extra_columns}), you can subclass `DataCollatorForLanguageModeling` in case you used the default collator and create your own data collator in order to inspect the unused dataset columns."
490 )
--> 492 tokenized_dataset = dataset.map(
493 tokenize,
494 batched=True,
495 remove_columns=dataset.column_names if remove_unused_columns else None,
496 num_proc=self.dataset_num_proc,
497 batch_size=self.dataset_batch_size,
498 )
500 return tokenized_dataset
TypeError: map() got an unexpected keyword argument 'num_proc'
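As far as I can tell from the traceback, the failure is the num_proc=self.dataset_num_proc keyword in the internal dataset.map(...) call. Because I loaded the data with streaming=True, train_dataset is a datasets.IterableDataset, and its map() doesn’t accept num_proc (that parameter only exists on the non-streaming Dataset.map()). A minimal sketch that reproduces the same error outside the trainer:

from datasets import load_dataset

stream = load_dataset('json', data_files="data/interim/collection.jsonl", split='train', streaming=True)

# IterableDataset.map() has no num_proc parameter, so passing it raises:
# TypeError: map() got an unexpected keyword argument 'num_proc'
stream.map(lambda batch: batch, batched=True, num_proc=1)

Is the right workaround to tokenize the stream myself and skip the trainer’s dataset preparation? The _prepare_dataset signature in the traceback takes a skip_prepare_dataset flag and __init__ accepts dataset_kwargs, so I’m assuming something like this would work (the tokenize helper is mine, and I assume tokenizer is the model’s tokenizer, already loaded):

def tokenize(batch):
    # pre-tokenize the text field myself, since the trainer can't map() the stream
    return tokenizer(batch["text"], truncation=True, max_length=512)

tokenized = train_dataset.map(tokenize, batched=True, remove_columns=["doc_id", "text"])

trainer = SFTTrainer(
    model,
    train_dataset=tokenized,
    data_collator=collator,
    args=training_args,
    dataset_kwargs={"skip_prepare_dataset": True},
)

Or is there a supported way to use a streaming dataset with SFTTrainer directly?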