Trainer warning with the new version

I am training a summarization model in Google Colab with transformers version 4.46.1. I keep getting the following warning:

“Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.”

I changed my code to use processing_class, but I am still getting the warning.
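Judging from the wording of the message, the old attribute seems to be a deprecated alias for the new one, so I assume anything that still reads trainer.tokenizer (even code inside the library) will trigger it. A minimal illustration, using the trainer built in the full code below:

# Assumption based only on the warning text: the old attribute still works
# but logs the deprecation message, while the new attribute does not.
tok = trainer.tokenizer          # logs "Trainer.tokenizer is now deprecated ..."
tok = trainer.processing_class   # new attribute, no warning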

Relevant Code

  !pip install datasets evaluate wandb accelerate py7zr rouge_score sentencepiece sacrebleu bert-score fsspec transformers -U -qq

import numpy as np
import evaluate
import nltk
from pathlib import Path
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, DatasetDict
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, set_seed
from transformers import AutoProcessor, DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM
from transformers import AutoConfig, GenerationConfig

nltk.download("punkt")  # sentence tokenizer data used by sent_tokenize

samsum_dataset = load_dataset("samsum", trust_remote_code=True)
# Use the existing train/validation/test splits
train_split = samsum_dataset["train"]
valid_split = samsum_dataset["validation"]
test_split = samsum_dataset["test"]
train_val_subset = DatasetDict(
    {"train": train_split, "valid": valid_split})

checkpoint =  "google/pegasus-cnn_dailymail"
processor = AutoProcessor.from_pretrained(checkpoint)

def tokenize_fn(batch):
    input_encodings = processor(
        batch["dialogue"], truncation=True, max_length=512)

    target_encodings = processor(
        text_target=batch["summary"], truncation=True, max_length=128
    )

    return {
        "input_ids": input_encodings["input_ids"],
        "attention_mask": input_encodings["attention_mask"],
        "labels": target_encodings["input_ids"],
    }

tokenized_dataset = train_val_subset.map(tokenize_fn, batched=True)

tokenized_dataset = tokenized_dataset.remove_columns(
    train_val_subset["train"].column_names
)
config = AutoConfig.from_pretrained(checkpoint)
generation_config = GenerationConfig.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(
    checkpoint,
    config=config,
)

rouge_score = evaluate.load("rouge")
def compute_metrics(eval_pred):
    # Separate predictions and labels
    predictions, labels = eval_pred

    # Decode generated summaries into text, removing any special tokens
    decoded_predictions = processor.batch_decode(
        predictions, skip_special_tokens=True)

    # Replace -100 in the labels with padding token ID, as -100 is commonly used to indicate ignored tokens
    labels = np.where(labels != -100, labels, processor.pad_token_id)

    # Decode reference summaries into text, removing any special tokens
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)

    # ROUGE expects a newline after each sentence; tokenize and join sentences with newline characters
    decoded_predictions = [
        "\n".join(sent_tokenize(pred.strip())) for pred in decoded_predictions
    ]
    decoded_labels = [
        "\n".join(sent_tokenize(label.strip())) for label in decoded_labels
    ]

    # Compute ROUGE scores between the predictions and references, using stemming

    score = rouge_score.compute(
        predictions=decoded_predictions,
        references=decoded_labels,
        use_stemmer=True,
        use_aggregator=True,
    )

    # Scale the scores by 100 and round to four decimal places before returning
    return {key: round(value * 100, 4) for key, value in score.items()}

# Define the directory where model checkpoints will be saved
model_folder = Path("/content/samsum_pegasus")
# Create the directory if it doesn't exist
model_folder.mkdir(exist_ok=True, parents=True)

# Configure training parameters
training_args = Seq2SeqTrainingArguments(
    # Training-specific configurations
    num_train_epochs=1,  # Total number of training epochs
    weight_decay=0.01,  # Apply L2 regularization to prevent overfitting
    learning_rate=5e-5,  # Step size for the optimizer during training
    optim="adamw_torch",  # Optimizer,
    warmup_steps=10,
    predict_with_generate=True,
    generation_config=generation_config,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,  # Number of samples per eval batch for each device

    # memory related settings
    gradient_accumulation_steps=1,  # memory
    gradient_checkpointing=True,  # memory
    fp16=False,  # speed
    bf16=True,
    tf32=False,  # speed

    # evaluation settings
    output_dir=str(model_folder),  # Directory to save model checkpoints
    eval_strategy="steps",  # Evaluate model at specified step intervals
    eval_steps=10,  # Perform evaluation every 10 training steps
    
    # Checkpoint settings
    save_strategy="steps",  # Save model checkpoint at specified step intervals
    save_steps=10,  # Save a model checkpoint every 10 training steps
    load_best_model_at_end=True,  # Reload the best model at the end of training
    save_total_limit=2,  # Retain only the best and the most recent model checkpoints
    metric_for_best_model='eval_loss',
    greater_is_better=False,
    
    # Experiment logging configurations 
    logging_strategy="steps",
    logging_steps=10,
    report_to="wandb",  # Log metrics and results to Weights & Biases platform
    # Experiment name for Weights & Biases
    run_name="samsum_pegasus",

data_collator = DataCollatorForSeq2Seq(
    tokenizer=processor,
    model=model,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["valid"],
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    processing_class=processor,
)

trainer.train()  # start training

Are there any changes I can make to avoid the warning?

Maybe this?
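If the goal is simply to silence that specific message, here is a minimal sketch of two possible workarounds. The caveat: I am assuming how the message is emitted (through the transformers logger, or as a Python warning); both calls are standard APIs, but which one applies depends on the library internals.

import warnings
from transformers.utils import logging as hf_logging

# Option 1 (assumption: the message goes through the transformers logger):
# lower the verbosity so library warnings, including this one, are hidden.
hf_logging.set_verbosity_error()

# Option 2 (assumption: the message is raised as a Python warning):
# ignore only messages that start with this text.
warnings.filterwarnings("ignore", message="Trainer.tokenizer is now deprecated")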