I’ve tried a couple of sequence classifiers such as BERT and RoBERTa so far, and I’m unable to make them overfit, or even learn, the following toy dataset. Any suggestions on why they don’t train on this simple toy task? It’s a simple binary classification problem; shouldn’t this be easy for such models to learn?
import torch
import random
import pandas as pd
from datasets import Dataset
from transformers import (
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)
def generate_addition_data(num_samples):
    data = []
    for _ in range(num_samples):
        a = random.randint(0, 999)
        b = random.randint(0, 999)
        question = f"{a} + {b}"
        correct_answer = a + b
        is_correct = random.choice([True, False])
        if is_correct:
            answer = correct_answer
        else:
            # Generate an incorrect answer by adding or subtracting a random value
            answer = correct_answer + random.choice([-10, -5, -1, 1, 5, 10])
        data.append({
            'question': question,
            'answer': str(answer),
            'label': 1 if is_correct else 0
        })
    return data
data = generate_addition_data(10_000) # Generate 10,000 samples
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)
checkpoint = "FacebookAI/roberta-base"
def preprocess_function(examples):
    inputs = [f"Calculate: {q} = {a}" for q, a in zip(examples['question'], examples['answer'])]
    # return_tensors='pt' is unnecessary inside a batched map(); the Dataset stores lists anyway
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)
    model_inputs['labels'] = examples['label']
    return model_inputs
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenized_datasets = dataset.map(preprocess_function, batched=True)
train_test_split = tokenized_datasets.train_test_split(test_size=2000, seed=2024)
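# Inverse-frequency class weights, n_samples / (n_classes * class_count), computed on the train split only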
train_labels = train_test_split['train'].to_pandas().label
pos_weights = len(train_labels) / (2 * train_labels.value_counts()[1])
neg_weights = len(train_labels) / (2 * train_labels.value_counts()[0])
print(f"Pos weights: {pos_weights}, Neg weights: {neg_weights}")
config = AutoConfig.from_pretrained(checkpoint)
config.num_labels = 2
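# NOTE: from_config() builds the architecture with randomly initialized weights;
# from_pretrained(checkpoint, num_labels=2) would load the pretrained encoder instead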
model = AutoModelForSequenceClassification.from_config(config)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)
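# (with padding="max_length" already applied in preprocess_function, dynamic padding here is effectively a no-op)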
training_args = TrainingArguments(
    output_dir="./tseqclassifier_checkpoints/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=140,
    per_device_eval_batch_size=140,
    num_train_epochs=200,
    weight_decay=0.01,
    warmup_steps=500,
    # use_cpu=True
    load_best_model_at_end=True,
)
class WeightedCELossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # Get the model's predictions
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # CrossEntropyLoss takes one weight per class, indexed by class id:
        # index 0 is the negative class, index 1 the positive class
        loss_fct = torch.nn.CrossEntropyLoss(
            weight=torch.tensor([neg_weights, pos_weights], device=model.device, dtype=logits.dtype)
        )
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss
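def compute_metrics(eval_pred):
    # Minimal accuracy metric for the Trainer's evaluation loop
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {"accuracy": (preds == labels).mean()}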
trainer = WeightedCELossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
def predict(question):
    model.eval()
    inputs = tokenizer(f"Calculate: {question}", return_tensors="pt").input_ids.cuda()
    with torch.no_grad():
        outputs = model(inputs)
    # The classification head returns logits over the two classes, not token ids,
    # so take the argmax as the predicted label instead of tokenizer.decode()
    return outputs.logits.argmax(dim=1).item()
question1 = "15 + 27 = 43"  # incorrect sum (15 + 27 = 42), expect label 0
question2 = "15 + 27 = 42"  # correct sum, expect label 1
print(f"Question: {question1}")
print(f"Predicted label: {predict(question1)}")
print(f"Question: {question2}")
print(f"Predicted label: {predict(question2)}")
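For completeness, a quick sanity check I’d run to rule out a preprocessing bug (using the variables defined above): decode one tokenized training example back to text and confirm the model actually sees the "Calculate: a + b = c" string.

sample = train_test_split['train'][0]
print(tokenizer.decode(sample['input_ids'], skip_special_tokens=True))
print("label:", sample['labels'])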