I am training a summarization model in Google Colab with transformers version 4.46.1.
I keep getting the following warning:
“Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.”
I changed my code to pass the tokenizer via processing_class, but I am still getting the warning.
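For context, the only related change I made is in the trainer construction; a minimal before/after sketch (the "before" line is my understanding of the deprecated usage, and model, training_args, and processor are all defined in the full code below):

# Previously (the deprecated usage the warning refers to):
# trainer = Seq2SeqTrainer(model=model, args=training_args, tokenizer=processor, ...)

# Now (my current code, shown in full below; the warning still appears during training):
# trainer = Seq2SeqTrainer(model=model, args=training_args, processing_class=processor, ...)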
Relevant Code
!pip install datasets evaluate wandb accelerate py7zr rouge_score sentencepiece sacrebleu bert-score fsspec transformers -U -qq
from pathlib import Path
import numpy as np
import evaluate
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, DatasetDict
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed
from transformers import AutoProcessor, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoConfig, GenerationConfig
samsum_dataset = load_dataset("samsum", trust_remote_code=True)
# Use the provided train and validation splits for fine-tuning
train_split = samsum_dataset["train"]
valid_split = samsum_dataset["validation"]
test_split = samsum_dataset["test"]
train_val_subset = DatasetDict(
    {"train": train_split, "valid": valid_split}
)
checkpoint = "google/pegasus-cnn_dailymail"
processor = AutoProcessor.from_pretrained(checkpoint)
def tokenize_fn(batch):
    input_encodings = processor(
        batch["dialogue"], truncation=True, max_length=512
    )
    target_encodings = processor(
        text_target=batch["summary"], truncation=True, max_length=128
    )
    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }
tokenized_dataset = train_val_subset.map(tokenize_fn, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(
    train_val_subset["train"].column_names
)
config = AutoConfig.from_pretrained(checkpoint)
generation_config = GenerationConfig.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    config=config,
)
rouge_score = evaluate.load("rouge")
def compute_metrics(eval_pred):
    # Separate predictions and labels
    predictions, labels = eval_pred
    # Decode generated summaries into text, removing any special tokens
    decoded_predictions = processor.batch_decode(
        predictions, skip_special_tokens=True
    )
    # Replace -100 in the labels with the padding token ID, as -100 is commonly used to indicate ignored tokens
    labels = np.where(labels != -100, labels, processor.pad_token_id)
    # Decode reference summaries into text, removing any special tokens
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence; tokenize and join sentences with newline characters
    decoded_predictions = [
        "\n".join(sent_tokenize(pred.strip())) for pred in decoded_predictions
    ]
    decoded_labels = [
        "\n".join(sent_tokenize(label.strip())) for label in decoded_labels
    ]
    # Compute ROUGE scores between the predictions and references, using stemming
    score = rouge_score.compute(
        predictions=decoded_predictions,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True,
    )
    # Scale the scores by 100 and round to four decimal places before returning
    return {key: round(value * 100, 4) for key, value in score.items()}
# Define the directory where model checkpoints will be saved
model_folder = Path("/content/samsum_pegasus")
# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)
# Configure training parameters
training_args = Seq2SeqTrainingArguments(
    # Training-specific configurations
    num_train_epochs=1,  # Total number of training epochs
    weight_decay=0.01,  # Apply L2 regularization to prevent overfitting
    learning_rate=5e-5,  # Step size for the optimizer during training
    optim="adamw_torch",  # Optimizer
    warmup_steps=10,
    predict_with_generate=True,
    generation_config=generation_config,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,  # Number of samples per eval batch for each device
    # Memory-related settings
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    # Speed-related settings
    fp16=False,
    bf16=True,
    tf32=False,
    # Evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    eval_strategy="steps",  # Evaluate model at specified step intervals
    eval_steps=10,  # Perform evaluation every 10 training steps
    # Checkpoint settings
    save_strategy="steps",  # Save a model checkpoint at specified step intervals
    save_steps=10,  # Save a model checkpoint every 10 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent checkpoints
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Experiment logging configurations
    logging_strategy="steps",
    logging_steps=10,
    report_to="wandb",  # Log metrics and results to Weights & Biases
    run_name="samsum_pegasus",  # Experiment name for Weights & Biases
)
data_collator = DataCollatorForSeq2Seq(
    tokenizer=processor,
    model=model,
)
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    processing_class=processor,
)
trainer.train() # start training
Are there any changes I can make to avoid the warning?