Here’s the code for the simple example.
import evaluate
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
data = load_dataset("/data/for/binary/text/classification/")
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
def preprocess_data(examples):
    # Join each row's list of numbers into one space-separated string,
    # then tokenize the resulting strings.
    txt_data = []
    for example in examples['values']:
        num2str = ' '.join(str(token) for token in example)
        txt_data.append(num2str)
    examples['text'] = txt_data
    return tokenizer(txt_data, truncation=True)
tokenized_data = data.map(preprocess_data, batched=True)
print(tokenized_data)
DatasetDict({
    train: Dataset({
        features: ['values', 'answer', 'cat', 'text', 'input_ids', 'attention_mask'],
        num_rows: 100000
    })
    validation: Dataset({
        features: ['values', 'answer', 'cat', 'text', 'input_ids', 'attention_mask'],
        num_rows: 14988
    })
    test: Dataset({
        features: ['values', 'answer', 'cat', 'text', 'input_ids', 'attention_mask'],
        num_rows: 15000
    })
})
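To make the preprocessing concrete, here's roughly what `preprocess_data` does to a single batch of rows (the numbers here are made up for illustration):

# Hypothetical mini-batch; real rows come from the dataset loaded above.
sample = {'values': [[12, 7, 42]]}
encoded = preprocess_data(sample)
print(sample['text'])           # ['12 7 42']  -- the numbers joined into one string
print(encoded['input_ids'][0])  # subword ids produced by the roberta-base tokenizer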
tokenized_data = tokenized_data.rename_column("answer", "labels")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')
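In case it helps to see it, `DataCollatorWithPadding` pads at batch-construction time, each batch to its own longest sequence rather than padding the whole dataset up front. A small sketch (exact shapes depend on the tokenizer):

features = [tokenizer("1 2 3"), tokenizer("1 2 3 4 5 6")]
batch = data_collator(features)
print(batch['input_ids'].shape)    # e.g. torch.Size([2, 8]) -- padded to the longest item
print(batch['attention_mask'][0])  # trailing zeros mark the padded positions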
config = AutoConfig.from_pretrained('roberta-base')
config.num_labels = 2
model = AutoModelForSequenceClassification.from_config(config)
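One thing worth noting about this step: `from_config` builds the architecture with freshly initialized random weights, so none of the roberta-base pretraining is loaded. Purely for comparison (not what the script above does), fine-tuning the pretrained encoder would instead look like:

# Loads the pretrained encoder weights; only the new classification
# head starts from random initialization.
model = AutoModelForSequenceClassification.from_pretrained('roberta-base', num_labels=2)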
metrics = evaluate.combine(['accuracy', 'f1', 'precision', 'recall'])
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return metrics.compute(predictions=predictions, references=labels)
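A quick sanity check of the metric function with made-up logits, to show the expected shape of its input and output:

fake_logits = np.array([[0.2, 0.8], [0.6, 0.4]])  # argmax -> [1, 0]
fake_labels = np.array([1, 1])
print(compute_metrics((fake_logits, fake_labels)))
# -> roughly {'accuracy': 0.5, 'f1': 0.667, 'precision': 1.0, 'recall': 0.5}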
training_args = TrainingArguments(
    output_dir="roberta_checkpoints",
    learning_rate=2e-5,
    per_device_train_batch_size=640,
    per_device_eval_batch_size=640,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer.train()
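(As a cross-check on the configuration: 100000 training examples at a batch size of 640 gives ceil(100000 / 640) = 157 steps per epoch, so 314 optimization steps over 2 epochs, which matches the log below.)

import math
print(math.ceil(100000 / 640) * 2)  # 314 -- matches "Total optimization steps"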
And here’s the output during training.
The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: cat, values, text.
/python3.12/site-packages/transformers/optimization.py:306: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
warnings.warn(
***** Running training *****
Num examples = 100000
Num Epochs = 2
Instantaneous batch size per device = 640
Total train batch size (w. parallel, distributed & accumulation) = 640
Gradient Accumulation steps = 1
Total optimization steps = 314
Epoch    Training Loss    Validation Loss    Accuracy    F1          Precision    Recall
1        No log           0.693468           0.497131    0.000000    0.000000     0.000000
2        No log           0.693163           0.497131    0.000000    0.000000     0.000000
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: cat, values, text.
***** Running Evaluation *****
Num examples = 14988
Batch size = 640
/python3.12/site-packages/sklearn/metrics/_classification.py:1469: **UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.**
_warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to roberta_checkpoints/checkpoint-157
Configuration saved in roberta_checkpoints/checkpoint-157/config.json
Model weights saved in roberta_checkpoints/checkpoint-157/pytorch_model.bin
tokenizer config file saved in roberta_checkpoints/checkpoint-157/tokenizer_config.json
Special tokens file saved in roberta_checkpoints/checkpoint-157/special_tokens_map.json
Finally, the trainer outputs the following:
TrainOutput(global_step=314, training_loss=0.7030252833275279, metrics={'train_runtime': 188.2461, 'train_samples_per_second': 1062.439, 'train_steps_per_second': 1.668, 'total_flos': 2261110632000000.0, 'train_loss': 0.7030252833275279, 'epoch': 2.0})
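For what it's worth, both losses sit at ln 2 ≈ 0.6931, which is exactly the binary cross-entropy of a constant 50/50 prediction, and the zero precision/recall (plus the `UndefinedMetricWarning` above) suggests the model never predicts the positive class:

import math
print(math.log(2))  # 0.6931... -- the loss of a model at chance level on binary labels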
Any help understanding what's causing this behavior (the loss stuck at chance level and all predictions apparently falling in a single class) would be appreciated.