Hello everyone,
I’m trying to reproduce the IMDB sentiment analysis model of the tutorial. I already had the data on my personal machine under a slight different form, but after transforming it into a list of strings, each one containing a review, it should be the same. But I get an error. I don’t know if I did something wrong or if the library changed since the tutorial was created.
My code looks like this (the script isn’t finished as it doesn’t contain the evaluation part but the training is already failing):
import os
import pandas as pd
import pickle
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch import cuda
from torch.utils.data import Dataset
import time
from transformers.integrations import TensorBoardCallback
from train_and_eval_lstm import print_evaluation_scores
device = 'cuda' if cuda.is_available() else 'cpu'
def main():
clean_text_train_fn = os.path.join(os.getcwd(), "Transformed_data/clean_text_train.csv")
df_clean_text_train = pd.read_csv(clean_text_train_fn)
clean_text_train = df_clean_text_train["clean_text"].tolist()
clean_text_valid_fn = os.path.join(os.getcwd(), "Transformed_data/clean_text_valid.csv")
df_clean_text_valid = pd.read_csv(clean_text_valid_fn)
clean_text_valid = df_clean_text_valid["clean_text"].tolist()
clean_text_test_fn = os.path.join(os.getcwd(), "Transformed_data/clean_text_test.csv")
df_clean_text_test = pd.read_csv(clean_text_test_fn)
clean_text_test = df_clean_text_test["clean_text"].tolist()
## Load binary labels
y_binary_train_fn = os.path.join(os.getcwd(), 'Transformed_data/Labels/y_binary_train.pkl')
with open(y_binary_train_fn, mode='rb') as f:
y_binary_train = pickle.load(f)
y_binary_valid_fn = os.path.join(os.getcwd(), 'Transformed_data/Labels/y_binary_valid.pkl')
with open(y_binary_valid_fn, mode='rb') as f:
y_binary_valid = pickle.load(f)
y_binary_test_fn = os.path.join(os.getcwd(), 'Transformed_data/Labels/y_binary_test.pkl')
with open(y_binary_test_fn, mode='rb') as f:
y_binary_test = pickle.load(f)
## Using pretrained Tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)
start = time.time()
train_encodings = tokenizer(clean_text_train, truncation=True, padding=True)
stop = time.time()
print(f"Time to tokenize training set: {stop - start}")
val_encodings = tokenizer(clean_text_valid, truncation=True, padding=True)
test_encodings = tokenizer(clean_text_test, truncation=True, padding=True)
class IMDbDataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
item['labels'] = torch.tensor(self.labels[idx])
return item
def __len__(self):
return len(self.labels)
n_toy = 500
toy_dataset = IMDbDataset(train_encodings[:n_toy], y_binary_train[:n_toy])
train_dataset = IMDbDataset(train_encodings, y_binary_train)
val_dataset = IMDbDataset(val_encodings, y_binary_valid)
test_dataset = IMDbDataset(test_encodings, y_binary_test)
training_args = TrainingArguments(
output_dir='./results', # output directory
num_train_epochs=1, # total number of training epochs
per_device_train_batch_size=16, # batch size per device during training
per_device_eval_batch_size=64, # batch size for evaluation
warmup_steps=500, # number of warmup steps for learning rate scheduler
weight_decay=0.01, # strength of weight decay
logging_dir='./logs', # directory for storing logs
logging_steps=10,
)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
trainer = Trainer(
model=model, # the instantiated 🤗 Transformers model to be trained
args=training_args, # training arguments, defined above
train_dataset=toy_dataset, # training dataset
eval_dataset=val_dataset, # evaluation dataset
callbacks=[TensorBoardCallback]
)
start = time.time()
trainer.train()
stop = time.time()
print(f"Time to train the model: {stop-start}")
model_dir = os.path.join(os.getcwd(), "Saved_models")
model.save_pretrained(model_dir)
if __name__ == "__main__":
main()
And if I execute it, I receive the following error:
File "/home/me/Documents/CS_Programming_Machine_Learning/Projects/IMDB_sentiment_analysis_2/Comparison_models/train_and_eval_DistilBERT.py", line 61, in __getitem__
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
AttributeError: 'list' object has no attribute 'items'
If I use debugging, I see that indeed, self.encodings is a python list.
I guess that I can fix it on my own, but I was wondering if I did something wrong or if the docs are outdated (I use version 4.5.1 of HF Transformers).