I am trying to fine-tune cardiffnlp/twitter-roberta-base-sentiment-latest on my own sentiment-analysis dataset, which has 8 labels.
The data is in CSV format. I tokenized and encoded it with the tokenizer mentioned in the Hugging Face docs, then used a PyTorch Dataset class to convert those encodings into PyTorch tensors. I don't understand why I am getting this error:

ValueError: Expected input batch_size (16) to match target batch_size (128).

(I notice that 128 is exactly 16 × 8, my per-device train batch size times my number of labels.)

I appreciate your help. Thank you so much.
P.S. I have previously fine-tuned flan-t5 (full fine-tuning, PEFT, and RLHF), but I don't have experience with custom models that are already fine-tuned.
model = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(model)
config = AutoConfig.from_pretrained(model)
config.num_labels = 8
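For context, here is roughly how I produced the encodings (a sketch; data.csv and the column names text and label are placeholders for my actual file):

import pandas as pd
from sklearn.model_selection import train_test_split

# Placeholder path and columns; labels are integers 0..7
df = pd.read_csv("data.csv")
X_train, X_val, y_train, y_val = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)
# Tokenize the raw text; padding/truncation give equal-length sequences
train_encodings = tokenizer(X_train.to_list(), truncation=True, padding=True)
val_encodings = tokenizer(X_val.to_list(), truncation=True, padding=True)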
import torch
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    """
    Custom Dataset class for handling tokenized text data and corresponding labels.
    Inherits from torch.utils.data.Dataset.
    """
    def __init__(self, encodings, labels):
        """
        Initializes the dataset with encodings and labels.

        Args:
            encodings (dict): A dictionary containing tokenized input text data
                (e.g., 'input_ids', 'token_type_ids', 'attention_mask').
            labels (list): A list of integer labels for the input text data.
        """
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """
        Returns a dictionary containing the tokenized data and the corresponding
        label for a given index.

        Args:
            idx (int): The index of the data item to retrieve.

        Returns:
            item (dict): A dictionary with the tokenized data and the label.
        """
        # Retrieve the tokenized data for the given index
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Add the one-hot encoded label for the given index to the item dictionary
        item['labels'] = torch.nn.functional.one_hot(torch.tensor(self.labels[idx]), num_classes=8)
        return item

    def __len__(self):
        """
        Returns the number of data items in the dataset.
        """
        return len(self.labels)
Assuming train_encodings and val_encodings are the tokenizer outputs (dictionaries of token-id lists, converted to tensors in __getitem__):
train_dataset = TextDataset(train_encodings, y_train.to_list())
val_dataset = TextDataset(val_encodings, y_val.to_list())
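As a sanity check, inspecting a single item (sketch):

sample = train_dataset[0]
print({key: value.shape for key, value in sample.items()})
# 'labels' has shape torch.Size([8]) because of the one_hot call above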
Finally, I use the Trainer class to train as usual:
from transformers import AutoModelForSequenceClassification, TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=1e-4,
    logging_dir='./logs',
    eval_steps=50,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    save_strategy="steps",
)
from transformers import Trainer
model = AutoModelForSequenceClassification.from_config(config)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # defined elsewhere in my script
)
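Training is then launched with:

trainer.train()  # the ValueError above is raised during this call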