Huggingface tokenizer object has no attribute 'pad'

I am trying to train a model to classify some diseases, following the Hugging Face tutorial to the letter. I ran the code on Kaggle, as I do not have a powerful GPU. I can't include all my code here, so I will only insert the pertinent lines:

  1. I created a pandas dataframe and loaded it into a dataset.
    from datasets import Dataset
    ds = Dataset.from_pandas(df)
  1. The tokenizer:
    from transformers import AutoModelForSequenceClassification, AutoTokenizer
    model_nm = 'microsoft/deberta-v3-small'
    tokenz = AutoTokenizer.from_pretrained(model_nm)
    def tokenize_func(x): return tokenz(x["input"])
  1. Data collator with padding function:
    from transformers import DataCollatorWithPadding
    data_collator = DataCollatorWithPadding(tokenizer=tokenize_func)
  1. Training and test sets. I split the tokenized data as follows:
    dds = tok_ds.train_test_split(0.25, seed=42)
  1. Metrics & correlation
    import evaluate
    import numpy as np
    accuracy = evaluate.load("accuracy")
    def compute_metrics(eval_pred):
        predictions, labels = eval_pred
        predictions = np.argmax(predictions, axis=1)
        return accuracy.compute(predictions=predictions, references=labels)
  1. Training
    Before I started training the model, I created a map of the expected ids to their labels with id2label and label2id:
    id2label = {i:dx for i, dx in enumerate(list(df['Diagnosis'].unique()))}
    label2id = {dx:i for i, dx in enumerate(list(df['Diagnosis'].unique()))}
    
    from transformers import TrainingArguments,Trainer
    
    args = TrainingArguments(
        output_dir="outputs",
        learning_rate=lr,
        per_device_train_batch_size=bs,
        per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to='none',
    )
    
    model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=len(id2label), id2label=id2label, label2id=label2id)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dds['train'],
        eval_dataset=dds['test'],
        tokenizer=tokenize_func,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    
    trainer.train()

This is where I run into an AttributeError:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
Cell In[286], line 1
----> 1 trainer.train()

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1537         hf_hub_utils.enable_progress_bars()
   1538 else:
-> 1539     return inner_training_loop(
   1540         args=args,
   1541         resume_from_checkpoint=resume_from_checkpoint,
   1542         trial=trial,
   1543         ignore_keys_for_eval=ignore_keys_for_eval,
   1544     )

File /opt/conda/lib/python3.10/site-packages/transformers/trainer.py:1836, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1833     rng_to_sync = True
   1835 step = -1
-> 1836 for step, inputs in enumerate(epoch_iterator):
   1837     total_batched_samples += 1
   1839     if self.args.include_num_input_tokens_seen:

File /opt/conda/lib/python3.10/site-packages/accelerate/data_loader.py:451, in DataLoaderShard.__iter__(self)
    449 # We iterate one batch ahead to check when we are at the end
    450 try:
--> 451     current_batch = next(dataloader_iter)
    452 except StopIteration:
    453     yield

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:630, in _BaseDataLoaderIter.__next__(self)
    627 if self._sampler_iter is None:
    628     # TODO(https://github.com/pytorch/pytorch/issues/76750)
    629     self._reset()  # type: ignore[call-arg]
--> 630 data = self._next_data()
    631 self._num_yielded += 1
    632 if self._dataset_kind == _DatasetKind.Iterable and \
    633         self._IterableDataset_len_called is not None and \
    634         self._num_yielded > self._IterableDataset_len_called:

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/dataloader.py:674, in _SingleProcessDataLoaderIter._next_data(self)
    672 def _next_data(self):
    673     index = self._next_index()  # may raise StopIteration
--> 674     data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    675     if self._pin_memory:
    676         data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

File /opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py:54, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
     52 else:
     53     data = self.dataset[possibly_batched_index]
---> 54 return self.collate_fn(data)

File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:271, in DataCollatorWithPadding.__call__(self, features)
    270 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 271     batch = pad_without_fast_tokenizer_warning(
    272         self.tokenizer,
    273         features,
    274         padding=self.padding,
    275         max_length=self.max_length,
    276         pad_to_multiple_of=self.pad_to_multiple_of,
    277         return_tensors=self.return_tensors,
    278     )
    279     if "label" in batch:
    280         batch["labels"] = batch["label"]

File /opt/conda/lib/python3.10/site-packages/transformers/data/data_collator.py:59, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
     57 # To avoid errors when using Feature extractors
     58 if not hasattr(tokenizer, "deprecation_warnings"):
---> 59     return tokenizer.pad(*pad_args, **pad_kwargs)
     61 # Save the state of the warning, then disable it
     62 warning_state = tokenizer.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False)

AttributeError: 'function' object has no attribute 'pad'

What do I do?

Hi,

You're passing a function (tokenize_func) as the tokenizer argument; instead, you should pass the tokenizer object itself. Both DataCollatorWithPadding and Trainer expect the tokenizer instance, and the collator calls its pad method on each batch, which is why you get AttributeError: 'function' object has no attribute 'pad'.
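
For reference, here is a minimal sketch of the corrected pieces. It assumes tok_ds was produced by mapping tokenize_func over the dataset with Dataset.map (that step isn't shown in your question):

    # The tokenize function is only used to preprocess the dataset
    tok_ds = ds.map(tokenize_func, batched=True)   # assumed; not shown in the question
    dds = tok_ds.train_test_split(0.25, seed=42)

    # The collator and the Trainer get the tokenizer object (tokenz),
    # which is what actually has the .pad() method the collator calls
    data_collator = DataCollatorWithPadding(tokenizer=tokenz)

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dds['train'],
        eval_dataset=dds['test'],
        tokenizer=tokenz,            # tokenizer object, not tokenize_func
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

With the tokenizer object in place, each batch is padded to its longest sequence and trainer.train() should no longer hit the AttributeError.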