Python 3.7.6
Transformers 4.4.2
Pytorch 1.8.0
Hi HF Community!
I would like to finetune BERT for sequence classification on some training data I have and also evaluate the resulting model. I am using the Trainer
class to do the training and am a little confused about what the evaluation is doing. Below is my code:
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
import pandas as pd
class MyDataset(Dataset):
    """Text-classification dataset reading (text, label) pairs from a CSV file.

    Labels are binarized: 'label_a' -> 0, any other raw label -> 1.
    """

    def __init__(self, csv_file: str):
        # ISO-8859-1 tolerates non-UTF-8 bytes that may appear in the raw CSV.
        self.df = pd.read_csv(csv_file, encoding='ISO-8859-1')
        # Load the tokenizer ONCE here; the original data_collator reloaded it
        # from disk for every single batch, which is extremely slow.
        self.tokenizer = BertTokenizer.from_pretrained(
            "bert-base-uncased", padding_side='right', local_files_only=True)
        self.label_list = self.df['label'].value_counts().keys().to_list()

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx: int) -> tuple:
        """Return the (text, label) pair at row `idx`.

        Assumes column 1 holds the text and column 3 the raw label
        string — TODO confirm against the actual CSV schema.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        text = self.df.iloc[idx, 1]
        tmp_label = self.df.iloc[idx, 3]
        # Binary target: everything that is not 'label_a' maps to class 1.
        label = 0 if tmp_label == 'label_a' else 1
        return (text, label)

    def data_collator(self, dataset_samples_list):
        """Collate a list of (text, label) samples into a model-ready batch.

        Returns a dict with 'input_ids', 'attention_mask' and 'labels'
        tensors, padded to the longest sequence in the batch.
        """
        texts = [sample[0] for sample in dataset_samples_list]
        # Reuse the tokenizer built in __init__ (bug fix: the original
        # called BertTokenizer.from_pretrained here on every batch).
        encoded = self.tokenizer(texts, padding=True, truncation=True,
                                 return_tensors='pt',
                                 return_attention_mask=True)
        # With return_tensors='pt' the tokenizer already returns stacked
        # batch tensors, so the per-row torch.stack calls were redundant.
        return {
            'input_ids': encoded['input_ids'],
            'attention_mask': encoded['attention_mask'],
            'labels': torch.tensor([sample[1] for sample in dataset_samples_list]),
        }
# Build the train/eval datasets.
train_data_obj = MyDataset('/path/to/train/data.csv')
eval_data_obj = MyDataset('/path/to/eval/data.csv')

model = BertForSequenceClassification.from_pretrained("bert-base-uncased")

training_args = TrainingArguments(
    output_dir='/path/to/output/dir',
    do_train=True,
    do_eval=True,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy='epoch',  # evaluate on eval_dataset after each epoch
    num_train_epochs=2,
    save_steps=10,
    gradient_accumulation_steps=4,
    dataloader_drop_last=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_obj,
    eval_dataset=eval_data_obj,
    # Bug fix: `data_collator` is a method of MyDataset, not a module-level
    # name — the bare reference raised NameError. Bind it to an instance.
    data_collator=train_data_obj.data_collator,
)

trainer.train()
trainer.save_model("/path/to/model/save/dir")
# NOTE(review): evaluate() runs on the in-memory model (the same weights just
# saved), and with no compute_metrics passed to Trainer the returned dict
# contains only eval_loss plus runtime stats — no accuracy by default.
trainer.evaluate()
As I understand, once trainer.train()
is called, after each epoch the model will be evaluated on the dataset from eval_data_obj
and those results will be displayed. After the training is done and the model is saved using trainer.save_model("/path/to/model/save/dir")
, trainer.evaluate()
will evaluate the saved model on the eval_data_obj
and return a dict
containing the evaluation loss. Are there other metrics like accuracy that are included in this dict
by default? Thank you in advance for your help!