Trying out one of the examples of using nlp
with the Trainer
class. But can’t seem to get the data formatted correctly or there is a bug. Any ideas?
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast, Trainer, TrainingArguments
from nlp import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
model_dir = r"distilbert-base-uncased"
dataset_name = r"sentiment140"
def tokenize(batch):
return tokenizer(batch['text'], padding=True, truncation=True, max_length=140)
def compute_metrics(pred):
labels = pred.label_ids
preds = pred.predictions.argmax(-1)
precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
acc = accuracy_score(labels, preds)
return {
'accuracy': acc,
'f1': f1,
'precision': precision,
'recall': recall
}
print("Loading data")
train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1000)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1000)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])
model = DistilBertForSequenceClassification.from_pretrained(model_dir)
tokenizer = DistilBertTokenizerFast.from_pretrained(model_dir)
print("Loading data")
train_dataset, test_dataset = load_dataset(dataset_name, split=['train', 'test'])
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=1000)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=1000)
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'sentiment'])
print("Loading Trainer")
training_args = TrainingArguments(
output_dir='./results',
num_train_epochs=1,
per_device_train_batch_size=64,
per_device_eval_batch_size=64,
warmup_steps=500,
weight_decay=0.01,
evaluate_during_training=True,
logging_dir='./logs',
)
trainer = Trainer(
model=model,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=train_dataset,
eval_dataset=test_dataset
)
trainer.train()
>>> RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 96 and 61 in dimension 1 at C:\w\1\s\tmp_conda_3.7_100118\conda\conda-bld\pytorch_1579082551706\work\aten\src\TH/generic/THTensor.cpp:612
versions:
transformers 3.0.2 pypi_0 pypi
nlp 0.3.0 pypi_0 pypi