Fine-tuning IMDb tutorial - Unable to reproduce and adapt

Hi! I am trying to reproduce this notebook: https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing. However, I cannot get it to work in Colab because I keep hitting an OOM error (with both the GPU and None runtimes). I also downloaded it and ran it on my laptop, but I couldn't make it work there either.

Despite this, I wanted to adapt that tutorial to use a different pre-trained model and a local CSV dataset. This is my code:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

train_dataset, validation_dataset = load_dataset(
    "csv",
    delimiter="\t",
    data_files={"train": "spanish_train.csv", "validation": "spanish_val.csv"},
    split=["train", "validation"],
)
train_dataset.remove_column_("id_str")
train_dataset.rename_column_("TWEET", "tweet")
train_dataset.rename_column_("LABEL", "label")
validation_dataset.remove_column_("id_str")
validation_dataset.rename_column_("TWEET", "tweet")
validation_dataset.rename_column_("LABEL", "label")

def tokenize(batch):
    return tokenizer(batch["tweet"], padding=True, truncation=True, max_length=10000)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=10)
validation_dataset = validation_dataset.map(tokenize, batched=True, batch_size=10)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)
trainer.train()

In this last line I am getting this error:

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-93-3435b262f1ae> in <module>()
----> 1 trainer.train()

10 frames
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path)
    490                 self._past = None
    491 
--> 492             for step, inputs in enumerate(epoch_iterator):
    493 
    494                 # Skip past any already trained steps if resuming training

/usr/local/lib/python3.6/dist-packages/tqdm/notebook.py in __iter__(self, *args, **kwargs)
    213     def __iter__(self, *args, **kwargs):
    214         try:
--> 215             for obj in super(tqdm_notebook, self).__iter__(*args, **kwargs):
    216                 # return super(tqdm...) will not catch exception
    217                 yield obj

/usr/local/lib/python3.6/dist-packages/tqdm/std.py in __iter__(self)
   1102                 fp_write=getattr(self.fp, 'write', sys.stderr.write))
   1103 
-> 1104         for obj in iterable:
   1105             yield obj
   1106             # Update and possibly print the progressbar.

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    361 
    362     def __next__(self):
--> 363         data = self._next_data()
    364         self._num_yielded += 1
    365         if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    401     def _next_data(self):
    402         index = self._next_index()  # may raise StopIteration
--> 403         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    404         if self._pin_memory:
    405             data = _utils.pin_memory.pin_memory(data)

/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
     42     def fetch(self, possibly_batched_index):
     43         if self.auto_collation:
---> 44             data = [self.dataset[idx] for idx in possibly_batched_index]
     45         else:
     46             data = self.dataset[possibly_batched_index]

/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in __getitem__(self, key)
    717             format_columns=self._format_columns,
    718             output_all_columns=self._output_all_columns,
--> 719             format_kwargs=self._format_kwargs,
    720         )
    721 

/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in _getitem(self, key, format_type, format_columns, output_all_columns, format_kwargs)
    705                 format_columns=format_columns,
    706                 output_all_columns=output_all_columns,
--> 707                 format_kwargs=format_kwargs,
    708             )
    709         return outputs

/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in _convert_outputs(self, outputs, format_type, format_columns, output_all_columns, format_kwargs)
    617                     continue
    618                 if format_columns is None or k in format_columns:
--> 619                     v = map_nested(command, v, **map_nested_kwargs)
    620                 output_dict[k] = v
    621         return output_dict

/usr/local/lib/python3.6/dist-packages/nlp/utils/py_utils.py in map_nested(function, data_struct, dict_only, map_list, map_tuple, map_numpy)
    189                 return np.array(mapped)
    190     # Singleton
--> 191     return function(data_struct)
    192 
    193 

TypeError: new(): invalid data type 'str'

Does anyone know what I am doing wrong?

max_length=10000 seems wrong. IIRC the tokenizer will pad up to max_length, so your batches will be of size bs x 10000. That will cause the OOM.
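
For reference, a minimal sketch of what the tokenize function could look like instead (assuming the usual 512-token limit of BERT-style models; just an illustration, not code from the notebook):

def tokenize(batch):
    # A sketch: truncate to the model's maximum length instead of padding out to 10000 tokens.
    # With padding=True the tokenizer only pads to the longest sequence in each batch passed to map().
    return tokenizer(batch["tweet"], padding=True, truncation=True, max_length=512)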

The OOM error happens in the example notebook I referenced, which I found at https://huggingface.co/transformers/training.html#additional-resources. In my case, the error I am getting is a different one: TypeError: new(): invalid data type 'str', and I don't know what it means.

Edit: I removed the max_length argument and the error is still happening.

Taking a look at the datasets' feature structure, I see a notable difference between the one in the example notebook (which is loaded from the Hugging Face repository):

{'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], names_file=None, id=None),
 'text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)}

And the one I load from my CSV file:

{'attention_mask': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'label': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'tweet': Value(dtype='string', id=None)}

Basically, the main difference is that the label feature has a ClassLabel type, while mine is just a Value. I also have an extra feature, token_type_ids, which does not appear in the example's dataset. However, I think the real problem triggering the error is that my label does not have the proper type, so how can I specify that the label feature holds my class labels? It seems that I need to pass a features argument to nlp.load_dataset(), but I could not find any example of how to do it.

I think the issue is that the label is a string when it should be an int, which is why nlp is throwing the error.
Mapping the string labels to ints should solve the issue.

And what about the OOM?
You can try reducing the batch_size.
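
For example, something like this (a sketch of the change to the TrainingArguments above; gradient_accumulation_steps is optional and just keeps the effective batch size closer to the original):

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=4,   # reduced from 16
    per_device_eval_batch_size=8,    # reduced from 64
    gradient_accumulation_steps=4,   # optional: effective train batch stays at 16
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)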

I modified the tutorial's per_device_train_batch_size and per_device_eval_batch_size to smaller values and the example is now reproducible. Thanks!

On the other hand, I don't know how to map the string labels to ints. I tried the following:

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
label_encoder.fit(train_dataset["LABEL"])
train_dataset.map(lambda examples: {"LABEL": label_encoder.transform(examples["LABEL"])})

And it is giving me the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-15-16ebda6fb306> in <module>()
      3 label_encoder = LabelEncoder()
      4 label_encoder.fit(train_dataset["LABEL"])
----> 5 train_dataset.map(lambda examples: {"LABEL": label_encoder.transform(examples["LABEL"])})

4 frames
/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in map(self, function, with_indices, batched, batch_size, remove_columns, keep_in_memory, load_from_cache_file, cache_file_name, writer_batch_size, features, disable_nullable, verbose)
    856         test_inputs = self[:2] if batched else self[0]
    857         test_indices = [0, 1] if batched else 0
--> 858         update_data = does_function_return_dict(test_inputs, test_indices)
    859 
    860         class NumExamplesMismatch(Exception):

/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in does_function_return_dict(inputs, indices)
    829         def does_function_return_dict(inputs, indices):
    830             """ Does the function returns a dict. """
--> 831             processed_inputs = function(inputs, indices) if with_indices else function(inputs)
    832             does_return_dict = isinstance(processed_inputs, Mapping)
    833 

<ipython-input-15-16ebda6fb306> in <lambda>(examples)
      3 label_encoder = LabelEncoder()
      4 label_encoder.fit(train_dataset["LABEL"])
----> 5 train_dataset.map(lambda examples: {"LABEL": label_encoder.transform(examples["LABEL"])})

/usr/local/lib/python3.6/dist-packages/sklearn/preprocessing/_label.py in transform(self, y)
    266         """
    267         check_is_fitted(self)
--> 268         y = column_or_1d(y, warn=True)
    269         # transform of empty array is empty array
    270         if _num_samples(y) == 0:

/usr/local/lib/python3.6/dist-packages/sklearn/utils/validation.py in column_or_1d(y, warn)
    795         return np.ravel(y)
    796 
--> 797     raise ValueError("bad input shape {0}".format(shape))
    798 
    799 

ValueError: bad input shape ()
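
For what it's worth, the bad input shape () most likely comes from calling map() without batched=True: the function then receives a single example, so examples["LABEL"] is a scalar string, which LabelEncoder.transform cannot handle. A hedged sketch of the batched variant (note that map() returns a new dataset, so the result has to be reassigned):

train_dataset = train_dataset.map(
    lambda examples: {"LABEL": label_encoder.transform(examples["LABEL"])},
    batched=True,  # examples["LABEL"] is now a list of strings, as transform expects
)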

I suspect that if I could configure the features of the dataset when loading it, the library would handle this automatically. However, I am unable to do so. This is what I tried:

train_dataset, validation_dataset = nlp.load_dataset(
    "csv",
    delimiter="\t",
    data_files={"train": "spanish_train.csv", "validation": "spanish_val.csv"},
    split=["train", "validation"],
    features=nlp.Features({
        "id_str": nlp.Value("float64"),
        "TWEET": nlp.Value("string"),
        "LABEL": nlp.ClassLabel(names=["AGAINST", "FAVOR", "NEUTRAL"])
    }),
)

But when I check the train_dataset.features value I am still getting the same information as in my previous post. If I try to use nlp.Features.from_dict like this:

train_dataset, validation_dataset = nlp.load_dataset(
    "csv",
    delimiter="\t",
    data_files={"train": "spanish_train.csv", "validation": "spanish_val.csv"},
    split=["train", "validation"],
    features=nlp.Features.from_dict({
        "id_str": {
                "dtype": "string",
                "_type": "Value"
        },
        "LABEL": {
                "names": ["AGAINST", "FAVOR", "NEUTRAL"],
                "_type": "ClassLabel"
        },
        "TWEET": {
                "dtype": "string",
                "_type": "Value"
        }
    }),
)

It isn't taken into account either, just like in the first example.

Can someone provide a working example of how to properly specify the features of this dataset, please?

I saw that the dataset has a cast_() method, which accepts an nlp.Features object. However, when I try to do this:

features = nlp.Features({
    "id_str": nlp.Value("string"),
    "TWEET": nlp.Value("string"),
    "LABEL": nlp.ClassLabel(names=["AGAINST", "FAVOR", "NEUTRAL"]),
})

train_dataset = nlp.load_dataset(
    "csv",
    delimiter="\t",
    data_files="spanish_train.csv",
    split="train",
)

train_dataset.cast_(features)

It gives me the following error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-38-d718935de6f0> in <module>()
     12 )
     13 
---> 14 train_dataset.cast_(features)

1 frames
/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in cast_(self, features)
    393         self._info.features = features
    394         schema = pa.schema(features.type)
--> 395         self._data = self._data.cast(schema)
    396 
    397     def remove_column_(self, column_name: str):

/usr/local/lib/python3.6/dist-packages/pyarrow/table.pxi in pyarrow.lib.Table.cast()

ValueError: Target schema's field names are not matching the table's field names: ['id_str', 'TWEET', 'LABEL'], ['LABEL', 'TWEET', 'id_str']

But when I switch the order in the nlp.Features dict it gives this error:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-40-9b8397e5244e> in <module>()
     12 )
     13 
---> 14 train_dataset.cast_(features)

/usr/local/lib/python3.6/dist-packages/nlp/arrow_dataset.py in cast_(self, features)
    387         if list(features) != self._data.column_names:
    388             raise ValueError(
--> 389                 f"The columns in features ({list(features)}) must be identical and in the same order "
    390                 f"as the columns in the dataset: {self._data.column_names}"
    391             )

ValueError: The columns in features (['LABEL', 'TWEET', 'id_str']) must be identical and in the same order as the columns in the dataset: ['id_str', 'TWEET', 'LABEL']

So, for some reason, my auto-configured features are in reverse order, while the dataset has the correct one. I can check this because if I print the train_dataset object, it gives me this:

Dataset(features: {'id_str': Value(dtype='float64', id=None), 'TWEET': Value(dtype='string', id=None), 'LABEL': Value(dtype='string', id=None)}, num_rows: 6046)

And when I print train_dataset.features, I get this:

{'LABEL': Value(dtype='string', id=None),
 'TWEET': Value(dtype='string', id=None),
 'id_str': Value(dtype='float64', id=None)}

I don't know how to proceed after this. Can anyone explain how I can configure my dataset's features properly, please?

Since I haven't been able to find a way to specify the features, I manipulated the CSV with pandas and then loaded it with the nlp.Dataset.from_pandas function. So now, my code looks like this:

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
import nlp
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import pandas as pd
from sklearn.preprocessing import LabelEncoder

model_name = "dccuchile/bert-base-spanish-wwm-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

label_encoder = LabelEncoder()
train_df = pd.read_csv("spanish_train.csv", sep="\t")
val_df = pd.read_csv("spanish_val.csv", sep="\t")
train_df["LABEL"] = label_encoder.fit_transform(train_df["LABEL"])
val_df["LABEL"] = label_encoder.transform(val_df["LABEL"])  # reuse the encoder fitted on the training labels

train_dataset = nlp.Dataset.from_pandas(train_df)
validation_dataset = nlp.Dataset.from_pandas(val_df)

train_dataset.remove_column_("id_str")
train_dataset.rename_column_("TWEET", "tweet")
train_dataset.rename_column_("LABEL", "label")
validation_dataset.remove_column_("id_str")
validation_dataset.rename_column_("TWEET", "tweet")
validation_dataset.rename_column_("LABEL", "label")

def tokenize(batch):
    return tokenizer(batch["tweet"], padding=True, truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True, batch_size=10)
validation_dataset = validation_dataset.map(tokenize, batched=True, batch_size=10)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
validation_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')  # 'binary' only works for two classes; this dataset has three
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset
)
trainer.train()

But now the error I get is the following:

/usr/local/lib/python3.6/dist-packages/nlp/utils/py_utils.py:191: UserWarning: The given NumPy array is not writeable, and PyTorch does not support non-writeable tensors. This means you can write to the underlying (supposedly non-writeable) NumPy array using the tensor. You may want to copy the array to protect its data or make it writeable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at  /pytorch/torch/csrc/utils/tensor_numpy.cpp:141.)
  return function(data_struct)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-52-3435b262f1ae> in <module>()
----> 1 trainer.train()

6 frames
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path)
    490                 self._past = None
    491 
--> 492             for step, inputs in enumerate(epoch_iterator):
    493 
    494                 # Skip past any already trained steps if resuming training

/usr/local/lib/python3.6/dist-packages/tqdm/notebook.py in __iter__(self, *args, **kwargs)
    213     def __iter__(self, *args, **kwargs):
    214         try:
--> 215             for obj in super(tqdm_notebook, self).__iter__(*args, **kwargs):
    216                 # return super(tqdm...) will not catch exception
    217                 yield obj

/usr/local/lib/python3.6/dist-packages/tqdm/std.py in __iter__(self)
   1102                 fp_write=getattr(self.fp, 'write', sys.stderr.write))
   1103 
-> 1104         for obj in iterable:
   1105             yield obj
   1106             # Update and possibly print the progressbar.

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    361 
    362     def __next__(self):
--> 363         data = self._next_data()
    364         self._num_yielded += 1
    365         if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    401     def _next_data(self):
    402         index = self._next_index()  # may raise StopIteration
--> 403         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    404         if self._pin_memory:
    405             data = _utils.pin_memory.pin_memory(data)

/usr/local/lib/python3.6/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     45         else:
     46             data = self.dataset[possibly_batched_index]
---> 47         return self.collate_fn(data)

/usr/local/lib/python3.6/dist-packages/transformers/data/data_collator.py in default_data_collator(features)
     59         if k not in ("label", "label_ids") and v is not None and not isinstance(v, str):
     60             if isinstance(v, torch.Tensor):
---> 61                 batch[k] = torch.stack([f[k] for f in features])
     62             else:
     63                 batch[k] = torch.tensor([f[k] for f in features], dtype=torch.long)

RuntimeError: stack expects each tensor to be equal size, but got [99] at entry 0 and [111] at entry 1

And I don't know where the root cause of the problem is, since I am padding and truncating during tokenization (which I guess is what the tensor-size error is referring to).

I changed the per_device_train_batch_size and per_device_eval_batch_size parameters in the TrainingArguments object to 1, and the trainer.train() method started training. However, after a couple of iterations it threw this error:

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-92-3435b262f1ae> in <module>()
----> 1 trainer.train()

7 frames
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path)
    497                     continue
    498 
--> 499                 tr_loss += self._training_step(model, inputs, optimizer)
    500 
    501                 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (

/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in _training_step(self, model, inputs, optimizer)
    620             inputs["mems"] = self._past
    621 
--> 622         outputs = model(**inputs)
    623         loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
    624 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states)
   1282             else:
   1283                 loss_fct = CrossEntropyLoss()
-> 1284                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
   1285             outputs = (loss,) + outputs
   1286 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
    946     def forward(self, input: Tensor, target: Tensor) -> Tensor:
    947         return F.cross_entropy(input, target, weight=self.weight,
--> 948                                ignore_index=self.ignore_index, reduction=self.reduction)
    949 
    950 

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
   2420     if size_average is not None or reduce is not None:
   2421         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2422     return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
   2423 
   2424 

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2216                          .format(input.size(0), target.size(0)))
   2217     if dim == 2:
-> 2218         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2219     elif dim == 4:
   2220         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

IndexError: Target 2 is out of bounds.

What does this error mean? And how can I solve it?

I just had this problem and it was fixed after setting padding="max_length" in the tokenize function. (This is in response to RuntimeError: stack expects each tensor to be equal size, but got [99] at entry 0 and [111] at entry 1.)
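
In other words, something along these lines (a sketch; with padding=True the tokenizer only pads to the longest sequence within each map() batch of 10, so different batches end up with different lengths, which is exactly what the stack error complains about; the 128 below is just an illustrative length):

def tokenize(batch):
    # Pad every example to the same fixed length so the default collator can stack the tensors.
    return tokenizer(batch["tweet"], padding="max_length", truncation=True, max_length=128)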

Pinging @joeddav here, who created this notebook.

In case someone wants to test the code with the files, you can download them here.

After checking your other post, I set up the tokenize function just like you did, and I got this error when calling trainer.train():

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-31-3435b262f1ae> in <module>()
----> 1 trainer.train()

7 frames
/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in train(self, model_path)
    497                     continue
    498 
--> 499                 tr_loss += self._training_step(model, inputs, optimizer)
    500 
    501                 if (step + 1) % self.args.gradient_accumulation_steps == 0 or (

/usr/local/lib/python3.6/dist-packages/transformers/trainer.py in _training_step(self, model, inputs, optimizer)
    620             inputs["mems"] = self._past
    621 
--> 622         outputs = model(**inputs)
    623         loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
    624 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/transformers/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states)
   1282             else:
   1283                 loss_fct = CrossEntropyLoss()
-> 1284                 loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
   1285             outputs = (loss,) + outputs
   1286 

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
    720             result = self._slow_forward(*input, **kwargs)
    721         else:
--> 722             result = self.forward(*input, **kwargs)
    723         for hook in itertools.chain(
    724                 _global_forward_hooks.values(),

/usr/local/lib/python3.6/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
    946     def forward(self, input: Tensor, target: Tensor) -> Tensor:
    947         return F.cross_entropy(input, target, weight=self.weight,
--> 948                                ignore_index=self.ignore_index, reduction=self.reduction)
    949 
    950 

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction)
   2420     if size_average is not None or reduce is not None:
   2421         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2422     return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
   2423 
   2424 

/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
   2216                          .format(input.size(0), target.size(0)))
   2217     if dim == 2:
-> 2218         ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
   2219     elif dim == 4:
   2220         ret = torch._C._nn.nll_loss2d(input, target, weight, _Reduction.get_enum(reduction), ignore_index)

IndexError: Target 2 is out of bounds.

So I guess I’m still missing something :sweat:

EDIT: Since I am training a classifier with 3 classes, I guess the LabelEncoder encoded them as 0, 1 and 2, and maybe I need to specify that there are 3 classes, but I don't know how to do so.

Hi @putopavel, to map your str labels to int with nlp you can simply do the following:
if you have 3 labels "A", "B" and "C", create a dict with the string-to-int label mapping, i.e.
label_to_int = {"A": 0, "B": 1, "C": 2}, and then

train_dataset = train_dataset.map(lambda example: {"label": label_to_int[example["LABEL"]]})

and to specify the number of classes in the model, set the num_labels property on the config object:

model.config.num_labels = 3

Thanks for your suggestions, @valhalla. With the map operation the dataset loading looks much more concise now. However, I am still getting that IndexError: Target 2 is out of bounds. error when I call trainer.train(). This is the complete code I have right now:

# [...]
dataset = nlp.load_dataset(
    "csv",
    delimiter="\t",
    data_files={"train": "spanish_train.csv", "validation": "spanish_val.csv"},
)

label_to_int = {
    "AGAINST": 0,
    "FAVOR": 1,
    "NEUTRAL": 2
}
dataset = dataset.map(lambda example: {"LABEL": label_to_int[example["LABEL"]]})
# [...]
trainer.train()

Maybe the model.config.num_labels = 3 statement is not being interpreted by the model?

EDIT: By checking what's in model.config, I got the following:

BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "type_vocab_size": 2,
  "vocab_size": 31002
}

I see that the labels in id2label and label2id are generic. Might this be a problem?

@putopavel Sorry, my mistake. This is how it's done:

config = AutoConfig.from_pretrained(model_name, num_lables=3)
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)

Also, it would be nice if you only posted the relevant code snippet and a Colab link for the full code. It's kinda hard to read when there's so much code.

Hi @valhalla, I tried that new approach and I’m still having that same error.

You are right, I was thinking that having all the code written here is very tedious. Here I uploaded my notebook and adapted it so you can directly download the dataset.

Typo here: num_lables -> num_labels.

@putopavel LMK if that doesn't solve it, and if not, please repost the error you're having.
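
Putting the two posts together, the corrected snippet would read:

from transformers import AutoConfig, AutoModelForSequenceClassification

model_name = "dccuchile/bert-base-spanish-wwm-cased"
config = AutoConfig.from_pretrained(model_name, num_labels=3)  # AGAINST, FAVOR, NEUTRAL
model = AutoModelForSequenceClassification.from_pretrained(model_name, config=config)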

Yes! Now it seems to be training. Let's see what happens once it gets trained. I'll set like 10 epochs or so, since I think it takes way too much time, and then I'll try to use this model to predict on the test dataset.

Thank you all for your help and support! It is very much appreciated.
