Error in fine-tuning BERT

As a follow-up from my previous question, I am trying to fine-tune a model, but I am getting an error: IndexError: tuple index out of range.

I am trying to classify individual sentences with a binary classification. I am using transformers version 4.2.1 and datasets version 1.2.1

The dataset(s) are .csv files with two columns: “sentence” and “label”. The following is the code that led to the error - if anyone can help identify my error, please let me know :smiley:

import numpy as np
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from datasets import load_dataset, load_metric

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

dataset = load_dataset('csv', data_files={'train': "train_data.csv",
                                          'test':  "test_data.csv"})

metric = load_metric('f1', 'accuracy')

# Tokenize the 'sentence' column of every split in batches, bypassing the on-disk cache.
encoded_dataset = dataset.map(
    lambda x: tokenizer(x['sentence'], padding=True, truncation=True),
    batched=True,
    load_from_cache_file=False,
)

batch_size = 16
args = TrainingArguments(
    evaluation_strategy = "epoch",

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked into a (predictions, labels) pair. The
    predictions are reduced to class ids by taking the argmax along
    axis 1, and the result of ``metric.compute`` is returned as-is.
    """
    raw_scores, references = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

trainer = Trainer(

All of that runs with no problem. However, I get the following error next:

IndexError                                Traceback (most recent call last)
<ipython-input-2-3435b262f1ae> in <module>
----> 1 trainer.train()

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer.py in train(self, model_path, trial)
    934             self.control = self.callback_handler.on_epoch_end(self.args, self.state, self.control)
--> 935             self._maybe_log_save_evaluate(tr_loss, model, trial, epoch)
    937             if self.args.tpu_metrics_debug or self.args.debug:

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer.py in _maybe_log_save_evaluate(self, tr_loss, model, trial, epoch)
   1002         metrics = None
   1003         if self.control.should_evaluate:
-> 1004             metrics = self.evaluate()
   1005             self._report_to_hp_search(trial, epoch, metrics)

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer.py in evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
   1440         start_time = time.time()
-> 1442         output = self.prediction_loop(
   1443             eval_dataloader,
   1444             description="Evaluation",

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer.py in prediction_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
   1569                 losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0)
   1570             if logits is not None:
-> 1571                 preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100)
   1572             if labels is not None:
   1573                 labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100)

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
     83     ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
     84     if isinstance(tensors, (list, tuple)):
---> 85         return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
     86     elif isinstance(tensors, torch.Tensor):
     87         return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer_pt_utils.py in <genexpr>(.0)
     83     ), f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}."
     84     if isinstance(tensors, (list, tuple)):
---> 85         return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
     86     elif isinstance(tensors, torch.Tensor):
     87         return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer_pt_utils.py in nested_concat(tensors, new_tensors, padding_index)
     85         return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors))
     86     elif isinstance(tensors, torch.Tensor):
---> 87         return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)
     88     elif isinstance(tensors, np.ndarray):
     89         return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index)

/usr/local/bin/miniconda3/envs/tfhub/lib/python3.8/site-packages/transformers/trainer_pt_utils.py in torch_pad_and_concatenate(tensor1, tensor2, padding_index)
     46 def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100):
     47     """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary."""
---> 48     if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]:
     49         return torch.cat((tensor1, tensor2), dim=0)

IndexError: tuple index out of range

Thanks in advance!

Hi @AlanFeder, judging by the stack trace my first guess is that the problem comes from a conflict between padding in the dataset.map operation vs padding on-the-fly in the Trainer.

As described in the Trainer docs, when you pass the tokenizer to the Trainer it will be used as follows:

The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs the maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model.

So it seems that in your code, you’re doing padding twice: once in dataset.map and then again during training.

Can you remove the padding=True argument from your tokenization step and see if that works?

1 Like

Thanks for the response @lewtun! Unfortunately, it did not work, and I received the same error with the same stack trace.

One thing I forgot to mention – the model seems to have run one training epoch successfully, and then crashed when doing the next step…

Not sure if that makes a difference, or adds any useful context.

Thanks again!

1 Like

Yes, you’re right that the problem is happening on the trainer.evaluate() step. It might be coming from the label_names argument in your TrainingArguments. From the docs we have:

The list of keys in your dictionary of inputs that correspond to the labels.

Will eventually default to ["labels"] except if the model used is one of the XxxForQuestionAnswering in which case it will default to ["start_positions", "end_positions"] .

So it seems you need to provide a list like ['label'] instead of the string. If that doesn’t work, you could try renaming the “label” column in your CSV files to “labels” and then dropping the label_names argument from TrainingArguments.

You can then check if it works by just running

trainer.evaluate()

which is faster than waiting for one epoch of training :slight_smile:

As a tip, I would also specify all the implicit arguments of your TrainingArguments and Trainer explicitly, e.g. use output_dir="test_20210201_1200" in TrainingArguments and similarly for model and args in Trainer.

PS. one thing that looks a bit odd is the way you load the metric:

metric = load_metric('f1', 'accuracy')

I don’t think you can load multiple metrics this way since the second argument refers to the “configuration” of the metric (e.g. GLUE has a config for each task). Nevertheless, this is probably not the source of the problem.


Yes! :tada: This worked! Specifically, I changed the column name (just wrapping label in “[” & “]” did not work).

I also made the other changes you suggested.

Last question – do you happen to know if there is a way to load multiple accuracy metrics (e.g. both f1 and accuracy)?

Thank you so much for all your assistance – my first fine tuning was successful!

1 Like

Great :partying_face:!

I think the simplest way to track both accuracy and F1 score would be to first load them independently:

accuracy_score = load_metric('accuracy')
f1_score = load_metric('f1')

Then you can include them in the compute_metrics function by simply returning a dict of entries:

def compute_metrics(pred):
    """Return F1 and accuracy for one evaluation pass as a single dict.

    ``pred`` is unpacked into a (predictions, labels) pair. The
    predictions are reduced to class ids with an argmax along axis 1,
    scored by the separately loaded ``f1_score`` and ``accuracy_score``
    metrics, and the two result dicts are merged.
    """
    # Unpack the function's own argument (the original referenced an undefined name).
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=1)
    # returns a dict like {'f1': 0.54221}
    f1 = f1_score.compute(predictions=predictions, references=labels)
    # returns a dict like {'accuracy': 0.3241}
    acc = accuracy_score.compute(predictions=predictions, references=labels)
    # merge the two dictionaries
    return {**f1, **acc}

Thanks so much for all your help!

1 Like

I have a similar problem!
I am working on a summarization model with Bart.
so far I can train the model and it is fine, but when I want to do trainer.evaluate() it returns this warning:
VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify ‘dtype=object’ when creating the ndarray.
result = getattr(asarray(obj), method)(*args, **kwds)
and this error:
ValueError : could not broadcast input array from shape (50,32,50265) into shape (50,)
50 is the number of rows in my evaluation dataset
32 is the max_evaluation_length
50265 is the vocabulary size, as shown in the model summary: BartForConditionalGeneration(
(model): BartModel(
(shared): Embedding(50265, 768, padding_idx=1)

here is the code:
from transformers import AutoTokenizer
# Name of the pretrained checkpoint passed to the tokenizer loader.
model_checkpoint = "facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 64
max_target_length = 32

def preprocess_function(example):
    """Tokenize the ``text`` and ``summary`` fields of ``example``.

    ``text`` is tokenized (padded, truncated to ``max_input_length``)
    into the model inputs. ``summary`` is tokenized inside the
    tokenizer's target-tokenizer context (padded, truncated to
    ``max_target_length``) and its ``input_ids`` are stored under the
    ``labels`` key of the returned inputs.
    """
    model_inputs = tokenizer(
        example['text'], max_length=max_input_length, padding=True, truncation=True
    )

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            example['summary'], max_length=max_target_length, padding=True, truncation=True
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# NOTE(review): the object `.map` is called on was lost from the original post; `dataset` is assumed — confirm.
# Apply preprocess_function in batches and drop the raw text columns.
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=['text', 'summary'])

from transformers import AutoModelForSeq2SeqLM

# Load the pretrained "facebook/bart-base" checkpoint via the seq2seq LM auto class.
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/bart-base")

from transformers import TrainingArguments

# Only the first positional argument is given; everything else keeps its default.
training_args = TrainingArguments("test_trainer")

from transformers import Trainer

import numpy as np

from datasets import load_metric

#metric = load_metric("accuracy")

# Load the "glue" metric with its "mrpc" configuration.
metric = load_metric("glue", "mrpc")

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    ``eval_pred`` is unpacked into (logits, labels). The logits are
    reduced with an argmax over the last axis and passed to
    ``metric.compute`` together with the labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(model=model,





        tokenizer = tokenizer)

then :slightly_smiling_face:
I train the model and it works fine but trainer.evaluate() returns that error

but it works at the start like this:
***** Running Evaluation *****
Num examples = 50
Batch size = 8

6%|▋ | 12/189 [13:07<3:13:36, 65.63s/it]
33%|███▎ | 62/189 [08:23<17:12, 8.13s/it]

then error!

I believe you can call load_metric the way the OP did. See the reference documentation; you are not limited to one metric at a time.

1 Like