In general, my question is: how do I pass a list of strings (or a .txt file) to a model and fine-tune it? This is related to this question/issue.

from transformers import TextDataset
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path=math_example_path,  # <-- path to .txt file
    block_size=128,
)
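For the list-of-strings case, this is a minimal sketch of what I imagine the data preparation could look like (my_strings and tokenize_fn are my own placeholders, assuming a causal-LM tokenizer is already loaded as tokenizer):

from datasets import Dataset

my_strings = ["2 + 2 = 4", "3 * 3 = 9"]  # placeholder examples

def tokenize_fn(batch):
    # Fixed-length encoding; assumes the tokenizer has a pad token set
    # (e.g. tokenizer.pad_token = tokenizer.eos_token for GPT-2-style models).
    enc = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    # For causal LM fine-tuning the labels are usually a copy of input_ids.
    enc["labels"] = enc["input_ids"].copy()
    return enc

list_dataset = Dataset.from_dict({"text": my_strings})
list_dataset = list_dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

Is something along these lines the intended way, or is there a more direct API for this?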
from transformers import (
    TrainingArguments,
    Trainer,
    default_data_collator,
)
import mlflow
mlflow.end_run()  # close any MLflow run left over from a previous attempt
training_args = TrainingArguments(
    output_dir='test_trainer',
    # evaluation_strategy='epoch',
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=20,
    num_train_epochs=2,
    fp16=False,
)
trainer = Trainer(
    model=llm_model,
    args=training_args,
    train_dataset=dataset,
    # eval_dataset=dataset,
    tokenizer=tokenizer,
    # Data collator will default to DataCollatorWithPadding, so we change it.
    data_collator=default_data_collator,
    compute_metrics=None,
    preprocess_logits_for_metrics=None,
)
trainer.train()

Running trainer.train() fails with the following traceback:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[28], line 29
9 training_args = TrainingArguments(output_dir='test_trainer',
10 #evaluation_strategy='epoch',
11 per_device_train_batch_size=1,
(...)
14 num_train_epochs = 2,
15 fp16=False,)
17 trainer = Trainer(
18 model=llm_model,
19 args=training_args,
(...)
26 preprocess_logits_for_metrics=None,
27 )
---> 29 trainer.train()
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1537 hf_hub_utils.enable_progress_bars()
1538 else:
-> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
1542 trial=trial,
1543 ignore_keys_for_eval=ignore_keys_for_eval,
1544 )
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\trainer.py:1836, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1833 rng_to_sync = True
1835 step = -1
-> 1836 for step, inputs in enumerate(epoch_iterator):
1837 total_batched_samples += 1
1839 if self.args.include_num_input_tokens_seen:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\accelerate\data_loader.py:451, in DataLoaderShard.__iter__(self)
449 # We iterate one batch ahead to check when we are at the end
450 try:
--> 451 current_batch = next(dataloader_iter)
452 except StopIteration:
453 yield
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\torch\utils\data\dataloader.py:634, in _BaseDataLoaderIter.__next__(self)
631 if self._sampler_iter is None:
632 # TODO(https://github.com/pytorch/pytorch/issues/76750)
633 self._reset() # type: ignore[call-arg]
--> 634 data = self._next_data()
635 self._num_yielded += 1
636 if self._dataset_kind == _DatasetKind.Iterable and \
637 self._IterableDataset_len_called is not None and \
638 self._num_yielded > self._IterableDataset_len_called:
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\torch\utils\data\dataloader.py:678, in _SingleProcessDataLoaderIter._next_data(self)
676 def _next_data(self):
677 index = self._next_index() # may raise StopIteration
--> 678 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
679 if self._pin_memory:
680 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\torch\utils\data\_utils\fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\trainer_utils.py:772, in RemoveColumnsCollator.__call__(self, features)
770 def __call__(self, features: List[dict]):
771 features = [self._remove_columns(feature) for feature in features]
--> 772 return self.data_collator(features)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\data\data_collator.py:92, in default_data_collator(features, return_tensors)
86 # In this function we'll make the assumption that all `features` in the batch
87 # have the same attributes.
88 # So we will look at the first element as a proxy for what attributes exist
89 # on the whole batch.
91 if return_tensors == "pt":
---> 92 return torch_default_data_collator(features)
93 elif return_tensors == "tf":
94 return tf_default_data_collator(features)
File ~\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\data\data_collator.py:158, in torch_default_data_collator(features)
156 batch[k] = torch.tensor(np.stack([f[k] for f in features]))
157 else:
--> 158 batch[k] = torch.tensor([f[k] for f in features])
160 return batch
RuntimeError: Could not infer dtype of DatasetInfo
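From the last frame, torch_default_data_collator seems to expect every example to be a dict whose values are tensors or equal-length lists of numbers, so that torch.tensor() can stack them into a batch. A toy illustration of input it would accept (my own example, not the data from the failing run):

from transformers import default_data_collator

example = {
    "input_ids": [101, 2023, 2003, 102],
    "attention_mask": [1, 1, 1, 1],
    "labels": [101, 2023, 2003, 102],
}
# the DataLoader hands the collator a list of such per-example dicts
batch = default_data_collator([example, example])
print(batch["input_ids"].shape)  # torch.Size([2, 4])

So my guess is that whatever I am passing as train_dataset yields a DatasetInfo object (or a column containing one) instead of token ids, which is why I am asking what the correct way to build the dataset is. I also wondered whether DataCollatorForLanguageModeling(tokenizer, mlm=False) would be a better fit here than default_data_collator, since it builds the labels for causal LM training automatically.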