Failing to format sentiment140 for Trainer

I'm trying out one of the examples of using nlp with the Trainer class, but I can't seem to get the data formatted correctly, or maybe there is a bug. Any ideas?

from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

model_dir = r"distilbert-base-uncased"
dataset_name = r"sentiment140"


def tokenize(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=140)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

print("Loading data")
train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1000)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1000)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])


model = DistilBertForSequenceClassification.from_pretrained(model_dir)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)

print("Loading data")
train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1000)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1000)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])

print("Loading Trainer")
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluate_during_training=True,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

>>> RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 96 and 61 in dimension 1 at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\aten\src\TH/generic/THTensor.cpp:612

versions:
transformers 3.0.2 pypi_0 pypi
nlp 0.3.0 pypi_0 pypi

Hi @swayson, the reason is that you have set padding to True while your batch size for data processing is 1000. When padding is True, the tokenizer pads to the longest sequence in each batch, so the maximum sequence length is not the same across preprocessing batches. Your training batch size is 64, so a single training batch can mix examples that were padded to different lengths, which is why the tensors cannot be stacked.

You can set padding to 'max_length' to pad every example to the length specified in max_length, or to the maximum acceptable input length for the model if no length is provided.
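For example, the tokenize function from the snippet above could be changed along these lines (a minimal sketch, keeping the same max_length of 140):

def tokenize(batch):
    # Pad every example to the same fixed length so tensors from different
    # preprocessing batches can be stacked into a single training batch.
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=140)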


It works! 🙂
Your explanation is clear and I now understand the root issue much better. Much appreciated!
Thank you @valhalla