I am trying to train a model to classify some diseases, following the HuggingFace tutorial to the letter. I ran the code on Kaggle, as I do not have a powerful GPU. I cannot include all my code here, so I will only show the pertinent lines:
- I created a pandas DataFrame and loaded it into a Dataset:
from datasets import Dataset
ds = Dataset.from_pandas(df)
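For reference, df looks roughly like this (the column names "input" and "Diagnosis" match the snippets below; these rows are made-up placeholders):
import pandas as pd
# Toy rows for illustration only; the real data has many more examples.
df = pd.DataFrame({
    "input": ["fever, dry cough and fatigue", "joint pain with morning stiffness"],
    "Diagnosis": ["Influenza", "Arthritis"],
})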
- The tokenizer:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
model_nm = 'microsoft/deberta-v3-small'
tokenz = AutoTokenizer.from_pretrained(model_nm)
def tokenize_func(x): return tokenz(x["input"])
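I then mapped this function over the dataset to produce the tok_ds used below; that line is in the omitted code, but it boils down to something like:
tok_ds = ds.map(tokenize_func, batched=True)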
- The data collator with padding:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenize_func)
- Training and test sets. I split the data as follows:
dds = tok_ds.train_test_split(0.25, seed=42)
- Metrics:
import numpy as np
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
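To illustrate what compute_metrics receives: eval_pred is a (logits, labels) pair, so for example:
logits = np.array([[0.1, 2.3], [1.5, 0.2]])
labels = np.array([1, 0])
compute_metrics((logits, labels))  # {'accuracy': 1.0}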
- Training
Before training the model, I created a map of the expected ids to their labels with id2label and label2id:
id2label = {i:dx for i, dx in enumerate(list(df['Diagnosis'].unique()))}
label2id = {dx:i for i, dx in enumerate(list(df['Diagnosis'].unique()))}
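The dataset also needs an integer label column for training; that part is in the omitted code, but it amounts to something like:
df["label"] = df["Diagnosis"].map(label2id)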
from transformers import TrainingArguments, Trainer

# lr, bs and epochs are defined earlier in the omitted code.
args = TrainingArguments(
    output_dir="outputs",
    learning_rate=lr,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
    report_to="none",
)
model = AutoModelForSequenceClassification.from_pretrained(
    model_nm, num_labels=len(id2label), id2label=id2label, label2id=label2id
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dds["train"],
    eval_dataset=dds["test"],
    tokenizer=tokenize_func,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
This is where I run into an AttributeError:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
Cell In[286], line 1
----> 1 trainer.train()
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1537 hf_hub_utils.enable_progress_bars()
1538 else:
-> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
1542 trial=trial,
1543 ignore_keys_for_eval=ignore_keys_for_eval,
1544 )
File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1836, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1833 rng_to_sync = True
1835 step = -1
-> 1836 for step, inputs in enumerate(epoch_iterator):
1837 total_batched_samples += 1
1839 if self.args.include_num_input_tokens_seen:
File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:451, in DataLoaderShard.__iter__(self)
449 # We iterate one batch ahead to check when we are at the end
450 try:
--> 451 current_batch = next(dataloader_iter)
452 except StopIteration:
453 yield
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
627 if self._sampler_iter is None:
628 # TODO(https://github.com/pytorch/pytorch/issues/76750)
629 self._reset() # type: ignore[call-arg]
--> 630 data = self._next_data()
631 self._num_yielded += 1
632 if self._dataset_kind == _DatasetKind.Iterable and \
633 self._IterableDataset_len_called is not None and \
634 self._num_yielded > self._IterableDataset_len_called:
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
672 def _next_data(self):
673 index = self._next_index() # may raise StopIteration
--> 674 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
675 if self._pin_memory:
676 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
52 else:
53 data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)
File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:271, in DataCollatorWithPadding.__call__(self, features)
270 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 271 batch = pad_without_fast_tokenizer_warning(
272 self.tokenizer,
273 features,
274 padding=self.padding,
275 max_length=self.max_length,
276 pad_to_multiple_of=self.pad_to_multiple_of,
277 return_tensors=self.return_tensors,
278 )
279 if "label" in batch:
280 batch["labels"] = batch["label"]
File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:59, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
57 # To avoid errors when using Feature extractors
58 if not hasattr(tokenizer, "deprecation_warnings"):
---> 59 return tokenizer.pad(*pad_args, **pad_kwargs)
61 # Save the state of the warning, then disable it
62 warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)
AttributeError: 'function' object has no attribute 'pad'
What do I do?