Dear all!
(This is my first post on the forum, so I'm sorry if anything is off or the code looks weird… I tried to fix it up as best I can. I'm still learning!)
I'm fairly new to NLP and I've run into an issue I can't seem to solve. I'm attempting to fine-tune RoBERTa on a dataset that classifies text into 199 different categories (representing various wellbeing triggers). Basically, we have a set of textual data (around 15,000 lines of text) that's classified into various triggers of wellbeing (sample data below).
The problem: after training, when I use my fine-tuned model for inference (even on data it has already seen), it always predicts the very first class ("acculturation stress"). I can't get it to select any other class… it's effectively stuck on one label. I'm really not sure what I'm doing wrong.
Weirdly enough, the training process itself doesn't throw errors, and my training metrics look amazing. During the test-prediction step it even classifies everything correctly. In fact, I get the following results:
| eval_loss | eval_accuracy | eval_weighted_f1 | eval_macro_f1 | eval_runtime (s) | epoch |
| --- | --- | --- | --- | --- | --- |
| 0.002152 | 0.99965 | 0.999646 | 0.999646 | 909.2079 | 6 |
Everything seems near-perfect from the training side, so I'm not sure what's going wrong at inference time. Any insights or tips would be greatly appreciated. Not even Qwen, ChatGPT, or Claude managed to crack it!
EDIT: I did notice that the "adapter_model.safetensors" file in the "full_model" directory (the location of the final merged model) is empty, while the adapter file saved before merging is about 7 MB. However, just copying that file over manually doesn't solve the problem. So perhaps there is an issue with the merging step?
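In case it helps, here's roughly how I compared the two adapter files (a quick sketch using the safetensors API; the paths are just from my local setup):

import os
from safetensors import safe_open

for path in ["./roberta_output/adapter_model.safetensors",               # saved before merging (~7 MB)
             "./roberta_output/full_model/adapter_model.safetensors"]:   # saved after merging (empty?)
    # Print the file size plus the number of tensors stored inside
    with safe_open(path, framework="pt") as f:
        print(path, "->", os.path.getsize(path), "bytes,", len(list(f.keys())), "tensors")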
Dataset Example
Here’s the basic structure of the data:
| Domain | Sub Category (label) | Example (text) |
| --- | --- | --- |
| life demands | acculturation stress | I really hate it in the Netherlands, even though I chose to move here. |
| life demands | acculturation stress | I want to integrate and feel at home but the people here make it so difficult. |
| wellbeing | cognitive flexibility | I enjoy collaborating because it forces me to flex my thinking. |
| wellbeing | affect balance: positive vs negative affect | I try to focus on positive moments rather than dwelling on the negatives. |
| life resources | appreciation & recognition | My boss always tells me how much he appreciates the work I do after we complete a big project. |
| life resources | career development opportunities | Being able to shadow colleagues helped me see how my skills transfer to new roles. |
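For context, this is roughly how I inspect the class counts before any balancing (a sketch; it assumes the same train.xlsx and column names as in my training script below):

import pandas as pd

df = pd.read_excel("train.xlsx", engine="openpyxl")
print(df["Sub Category"].nunique())                  # 199 categories
print(df["Sub Category"].value_counts().describe())  # how skewed the classes are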
Fine-Tuning Code
# ----------------------------------------------
# 1. Import Necessary Libraries
# ----------------------------------------------
import torch
import os
import json
import logging
import pandas as pd
from datasets import Dataset
from transformers import (
RobertaTokenizer,
RobertaForSequenceClassification,
TrainingArguments,
Trainer,
TrainerState
)
from peft import LoraConfig, get_peft_model, TaskType, PeftModel # !!! CHANGED !!!
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import bitsandbytes as bnb
from sklearn.utils import resample # Ensure this import exists
# ----------------------------------------------
# 🛠 2. Configuration
# ----------------------------------------------
class Config:
model_name = "roberta-base"
data_path = "train.xlsx"
batch_size = 32 # Reduced for 16GB VRAM
    epochs = 1  # 6 for the full run that produced the results above
gradient_accumulation_steps = 1 # Effective batch size = batch_size * grad_accum_steps
max_seq_length = 512 # Memory optimization
learning_rate = 3e-5
weight_decay = 0.01
output_dir = "./roberta_output"
log_file = "training.log"
results_csv = "training_results.csv"
predictions_csv = "test_predictions.csv"
metric_for_best_model = "weighted_f1" # !!! CHANGED !!! (Unify best model metric)
greater_is_better = True
evaluation_strategy = "epoch" # !!! CHANGED !!! (Align with actual usage)
#eval_steps = 300 # Evaluate every 300 steps
save_strategy = "epoch" # !!! CHANGED !!! (Align with actual usage)
#save_steps = 300 # !!! CHANGED !!! (Add for step-based saving)
save_total_limit = 2
max_grad_norm = 1.0
logging_steps = 300
min_samples = 1
# Check the model's maximum sequence length
from transformers import RobertaConfig
config_check = RobertaConfig.from_pretrained(Config.model_name)
print(f"Maximum allowed tokens: {config_check.max_position_embeddings}")  # roberta-base reports 514 (512 usable tokens + 2 padding offset)
# Validate configuration parameters
required_params = [
'model_name', 'data_path', 'batch_size', 'epochs',
'output_dir', 'learning_rate', 'min_samples', 'log_file',
'results_csv', 'predictions_csv'
]
for param in required_params:
if not hasattr(Config, param):
raise AttributeError(f"Missing config parameter: {param}")
# ----------------------------------------------
# 3. Logging Setup
# ----------------------------------------------
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
handlers=[
logging.FileHandler(Config.log_file, encoding="utf-8"),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
# ----------------------------------------------
# 4. Check GPU Availability
# ----------------------------------------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {DEVICE}")
logger.info(f"Torch version: {torch.__version__}")
logger.info(f"CUDA Available: {torch.cuda.is_available()}")
logger.info(f"BitsandBytes Available: {hasattr(bnb, 'nn')}")
# ----------------------------------------------
# 5. Load & Preprocess Data
# ----------------------------------------------
def load_and_preprocess_data(file_path):
"""Loads, preprocesses, and balances the dataset."""
logger.info(f"Loading dataset from {file_path}...")
df = pd.read_excel(file_path, engine="openpyxl") if file_path.endswith(".xlsx") else pd.read_csv(file_path)
df.dropna(subset=["Sub Category", "Example"], inplace=True)
# Add data validation
if df.empty:
raise ValueError("Empty dataset after loading")
df["Sub Category"] = df["Sub Category"].astype(str).str.replace(" ", "_").str.strip()
df["Example"] = df["Example"].str.lower().str.strip()
label_counts = df["Sub Category"].value_counts()
valid_labels = label_counts[label_counts >= Config.min_samples].index
df = df[df["Sub Category"].isin(valid_labels)]
if df.empty:
raise ValueError(f"No categories meet min_samples={Config.min_samples} requirement")
def balance_dataset(df_):
label_counts_ = df_["Sub Category"].value_counts()
max_samples = label_counts_.max()
df_balanced = df_.groupby("Sub Category", group_keys=False).apply(
lambda x: resample(
x,
replace=True,
n_samples=max_samples,
random_state=42
)
).reset_index(drop=True)
return df_balanced
df = balance_dataset(df)
logger.info(f"Final dataset size after balancing: {len(df)}")
return df
# ----------------------------------------------
# 6. Tokenization
# ----------------------------------------------
# Load the tokenizer once at module level (reloading it on every map() batch is slow)
tokenizer = RobertaTokenizer.from_pretrained(Config.model_name)

def tokenize_function(examples):
    """Tokenizes text using the RoBERTa tokenizer."""
    tokenized_inputs = tokenizer(
        examples["Example"],
        padding="max_length",
        truncation=True,
        max_length=Config.max_seq_length
    )
    # Use long (integer) labels, as expected for single-label classification
    tokenized_inputs["labels"] = torch.tensor(examples["labels"], dtype=torch.long)
    return tokenized_inputs
# ----------------------------------------------
# 7. Dataset Preparation
# ----------------------------------------------
def prepare_datasets(df):
"""Creates stratified datasets with proper label mapping."""
label_mapping = {label: idx for idx, label in enumerate(df["Sub Category"].unique())}
Config.num_labels = len(label_mapping)
logger.info(f"Number of categories: {Config.num_labels}")
# !!! CHANGED !!! - Create output dir if not existing
if not os.path.exists(Config.output_dir):
os.makedirs(Config.output_dir)
with open(f"{Config.output_dir}/label_mapping.json", "w") as f:
json.dump(label_mapping, f)
df["label"] = df["Sub Category"].map(label_mapping).astype(int) # ✅ Convert to float explicitly
# Stratified splits
train_df, eval_test_df = train_test_split(
df,
test_size=0.3,
stratify=df["label"],
random_state=42
)
eval_df, test_df = train_test_split(
eval_test_df,
test_size=0.5,
stratify=eval_test_df["label"],
random_state=42
)
datasets = []
for split_df in [train_df, eval_df, test_df]:
dataset = Dataset.from_pandas(split_df).map(
lambda x: {"labels": x["label"]},
remove_columns=["label"]
)
datasets.append(dataset)
return tuple(datasets) + (label_mapping,)
# ----------------------------------------------
# 8. Compute Evaluation Metrics
# ----------------------------------------------
def compute_metrics(eval_pred):
"""Calculates multiple evaluation metrics."""
logits, labels = eval_pred
preds = logits.argmax(axis=-1)
acc = accuracy_score(labels, preds)
w_f1 = f1_score(labels, preds, average="weighted")
m_f1 = f1_score(labels, preds, average="macro")
return {
"accuracy": acc,
"weighted_f1": w_f1,
"macro_f1": m_f1
}
# ------------------------------------------------------------------------------
# 🚀 9. Fine-Tune RoBERTa with LoRA + Auto-Resume
# ------------------------------------------------------------------------------
def train_model(train_dataset, eval_dataset, test_dataset, label_mapping):
"""Trains RoBERTa model with LoRA and ensures all required files are saved."""
tokenizer = RobertaTokenizer.from_pretrained(Config.model_name)
# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)
num_labels = len(label_mapping)
# !!! CHANGED !!!: We'll detect a checkpoint directory ourselves
last_checkpoint = None
if os.path.isdir(Config.output_dir) and any(fname.startswith("checkpoint-") for fname in os.listdir(Config.output_dir)):
# Attempt to find the most recent checkpoint folder
checkpoints = [d for d in os.listdir(Config.output_dir) if d.startswith("checkpoint-")]
if checkpoints:
# Sort by step
checkpoints.sort(key=lambda x: int(x.split("-")[-1]))
last_checkpoint = os.path.join(Config.output_dir, checkpoints[-1])
logger.info(f" Found a possible checkpoint to resume from: {last_checkpoint}")
# Initialize model
if last_checkpoint:
logger.info(f"Resuming from {last_checkpoint}")
model = RobertaForSequenceClassification.from_pretrained(last_checkpoint, num_labels=num_labels)
else:
logger.info("No valid checkpoint found. Starting fresh training.")
model = RobertaForSequenceClassification.from_pretrained(Config.model_name, num_labels=num_labels)
model = model.to(DEVICE)
# Apply LoRA Adapters
lora_config = LoraConfig(
task_type=TaskType.SEQ_CLS,
r=32,
lora_alpha=128,
lora_dropout=0.1,
bias="none"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()
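    # (My understanding from the PEFT docs: with TaskType.SEQ_CLS, the classification
    # head should be registered under modules_to_save, so its weights get stored
    # alongside the LoRA adapter when calling save_pretrained.)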
# !!! CHANGED !!!: Gradient Accumulation & Seed
training_args = TrainingArguments(
output_dir=Config.output_dir,
evaluation_strategy=Config.evaluation_strategy,
save_strategy=Config.save_strategy,
#save_steps=Config.save_steps,
#eval_steps=Config.eval_steps,
save_total_limit=Config.save_total_limit,
per_device_train_batch_size=Config.batch_size,
per_device_eval_batch_size=Config.batch_size,
num_train_epochs=Config.epochs,
learning_rate=Config.learning_rate,
weight_decay=Config.weight_decay,
logging_dir="./logs",
logging_steps=Config.logging_steps,
report_to="none",
load_best_model_at_end=True,
metric_for_best_model=Config.metric_for_best_model,
greater_is_better=Config.greater_is_better,
gradient_accumulation_steps=Config.gradient_accumulation_steps, # !!! CHANGED !!!
seed=42 # !!! CHANGED !!!
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
compute_metrics=compute_metrics,
tokenizer=tokenizer,
)
logger.info("Starting training...")
# !!! CHANGED !!!: Actually pass `resume_from_checkpoint` to do auto-resume
trainer.train(resume_from_checkpoint=last_checkpoint)
# Save Final LoRA Adapter & Tokenizer
logger.info("Saving final model, LoRA adapters, and tokenizer...")
model.save_pretrained(Config.output_dir)
tokenizer.save_pretrained(Config.output_dir)
# Save Trainer State
trainer.state.save_to_json(f"{Config.output_dir}/trainer_state.json")
# Save Label Mapping for Inference
label_mapping_path = f"{Config.output_dir}/label_mapping.json"
with open(label_mapping_path, "w") as f:
json.dump(label_mapping, f)
logger.info(f"Label mapping saved to {label_mapping_path}")
# Verify Label Mapping Integrity
with open(label_mapping_path, "r") as f:
loaded_mapping = json.load(f)
if loaded_mapping == label_mapping:
logger.info(" Label mapping verification successful.")
else:
logger.error(" Label mapping mismatch! Check saved file.")
# Evaluate & Save Results
logger.info(" Evaluating model...")
eval_results = trainer.evaluate()
eval_df = pd.DataFrame([eval_results])
eval_df.to_csv(Config.results_csv, index=False)
logger.info(f" Evaluation results saved to {Config.results_csv}")
# Save Predictions on Test Set
logger.info(" Running predictions on test dataset...")
test_predictions = trainer.predict(test_dataset)
test_preds = test_predictions.predictions.argmax(axis=1)
test_results_df = pd.DataFrame({
"Text": test_dataset["Example"],
"Predicted Label": [list(label_mapping.keys())[p] for p in test_preds],
"Actual Label": [list(label_mapping.keys())[int(l)] for l in test_dataset["labels"]], # ✅ Convert to int
"Correct": test_preds == test_dataset["labels"]
})
test_results_df.to_csv(Config.predictions_csv, index=False)
logger.info(f" Test predictions saved to {Config.predictions_csv}")
test_metrics = compute_metrics((test_predictions.predictions, test_predictions.label_ids))
logger.info(f"Test metrics: {test_metrics}")
correct_preds = test_results_df["Correct"].sum()
total_preds = len(test_results_df)
test_accuracy = correct_preds / total_preds
logger.info(f"Test Accuracy: {test_accuracy}")
# !!! CHANGED !!!: Use official PEFT merge
logger.info(" Merging LoRA adapters into base model for AWS deployment...")
full_model_path = f"{Config.output_dir}/full_model"
if not os.path.exists(full_model_path):
os.makedirs(full_model_path)
# Load the LoRA-adapted model
adapter_model = PeftModel.from_pretrained(
model,
Config.output_dir
)
# Merge LoRA weights into base and unload
adapter_model = adapter_model.merge_and_unload() # merges LoRA into base weights
# Now adapter_model is effectively the base model with LoRA merges
adapter_model.save_pretrained("./roberta_output/full_model")
# Save Full Model Configuration & Tokenizer for AWS
adapter_model.config.to_json_file(f"{full_model_path}/config.json")
tokenizer.save_pretrained(full_model_path)
logger.info(" Full model saved for AWS deployment!")
print(os.listdir(Config.output_dir))
return model, trainer
# ----------------------------------------------
# 10. Main Execution Pipeline
# ----------------------------------------------
if __name__ == "__main__":
try:
df = load_and_preprocess_data(Config.data_path)
train_dataset, eval_dataset, test_dataset, label_mapping = prepare_datasets(df)
model, trainer = train_model(train_dataset, eval_dataset, test_dataset, label_mapping)
logger.info("Training completed successfully!")
except Exception as e:
logger.error(f"Training failed: {str(e)}", exc_info=True)
raise
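One sanity check I'm considering adding at the very end of train_model (a sketch; it reuses full_model_path and tokenizer from that function, and the sample line comes from my dataset above): reload the merged model from disk and see whether it still predicts sensibly.

from transformers import RobertaForSequenceClassification

# Reload the merged model exactly as the inference script would (CPU is fine here)
reloaded = RobertaForSequenceClassification.from_pretrained(full_model_path).eval()
sample = tokenizer("i want to integrate and feel at home but the people here make it so difficult.",
                   return_tensors="pt")
with torch.no_grad():
    print("Reloaded model predicts class:", reloaded(**sample).logits.argmax(-1).item())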
The files it produces are:
roberta_output/
└─ full_model/
├─ adapter_config.json
├─ adapter_model.bin
├─ adapter_model.safetensors
├─ config.json
├─ merges.txt
├─ README.md
├─ special_tokens_map.json
├─ tokenizer_config.json
└─ vocab.json
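For comparison, this is the minimal merge flow as I understand it from the PEFT docs (just a sketch, loading the saved adapter onto a fresh base model instead of re-wrapping the live training model the way my script does; num_labels=199 for my data):

from peft import PeftModel
from transformers import RobertaForSequenceClassification

# Fresh base model with the same label count as training
base = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=199)
# Load the adapter that model.save_pretrained() wrote to ./roberta_output
peft_model = PeftModel.from_pretrained(base, "./roberta_output")
# Fold the LoRA weights into the base model and drop the PEFT wrapper
merged = peft_model.merge_and_unload()
merged.save_pretrained("./roberta_output/full_model")  # I'd expect model.safetensors here, not adapter files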
Prediction Script
import os
import json
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification
MODEL_DIR = "./roberta_output/full_model"
LABEL_MAPPING_PATH = "./roberta_output/label_mapping.json"
# Load label mapping
with open(LABEL_MAPPING_PATH, "r") as f:
label_mapping = json.load(f)
# Create correct mappings
id2label = {str(v): k for k, v in label_mapping.items()}
label2id = {k: v for k, v in label_mapping.items()}
# Load merged model with explicit config
tokenizer = RobertaTokenizer.from_pretrained(MODEL_DIR)
model = RobertaForSequenceClassification.from_pretrained(
MODEL_DIR,
num_labels=len(label_mapping),
id2label=id2label,
label2id=label2id,
    problem_type="single_label_classification"  # make sure the head is treated as single-label
).eval().to("cuda" if torch.cuda.is_available() else "cpu")
# Test samples
samples = [
"I feel so exhausted. Everything is overwhelming me these days.",
"I love spending time with my family and traveling on weekends!",
"Whenever I get recognized at work, my motivation goes up."
]
for text in samples:
inputs = tokenizer(
text.lower().strip(),
max_length=512,
padding="max_length",
truncation=True,
return_tensors="pt"
).to(model.device)
with torch.no_grad():
outputs = model(**inputs)
probs = torch.softmax(outputs.logits, dim=-1)[0]
pred_id = probs.argmax().item()
print(f"\nText: {text}")
print(f"Predicted: {id2label[str(pred_id)]}")
print("Top 3 probabilities:")
for prob, idx in zip(*probs.topk(3)):
print(f"- {id2label[str(idx.item())]}: {prob.item():.2%}")
Thank you so much for taking the time to read through this long post and for helping me brainstorm ways to fix the problem!