Translation with MarianMT: early stopping stuck

Hi. I am fine-tuning MarianMT for a translation task, and as soon as I added early stopping, it crashes without even giving an error message:

from transformers import DataCollatorForSeq2Seq, MarianMTModel, MarianTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, Seq2SeqTrainingArguments, Seq2SeqTrainer
from contracts_translation.src.finetunning.src.logger import CSVLoggerCallback
import evaluate

import warnings
warnings.filterwarnings("ignore")

class MarianFineTuner:
    def __init__(self, model_name: str, device: str, config: dict):
        self.model_name = model_name
        self.device = device
        self.config = config
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(device)
        self.data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=self.model)

    def tokenize_dataset(self, dataset, source_col: str, target_col: str):
        def tokenize_function(examples):
            model_inputs = self.tokenizer(examples[source_col], truncation=True)
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(examples[target_col], truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        return dataset.map(tokenize_function, batched=True, remove_columns=[source_col, target_col])

    def compute_metrics(self, eval_preds):
        preds, labels = eval_preds
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Strip
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        bleu = evaluate.load("bleu")
        meteor = evaluate.load("meteor")

        bleu_result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
        meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

        return {
            "bleu": round(bleu_result["bleu"], 4),
            "meteor": round(meteor_result["meteor"], 4)
        }

    def train(self, train_dataset, val_dataset, experiment_name):

        training_args = Seq2SeqTrainingArguments(
            output_dir=self.config["temp_output_dir"],
            per_device_train_batch_size=self.config["batch_size"],
            per_device_eval_batch_size=self.config["batch_size"],
            num_train_epochs=self.config["num_train_epochs"],
            eval_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            do_train=True,
            do_eval=True,
            report_to=[],
            load_best_model_at_end=True,
            metric_for_best_model="meteor",
            greater_is_better=True,
            predict_with_generate=True,
            torch_empty_cache_steps=2,
            eval_accumulation_steps=10,
        )

        log_path = f"results/epoch_logs/epoch_log_{experiment_name}.csv"

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            callbacks=[
                CSVLoggerCallback(log_path, experiment_name),
                EarlyStoppingCallback(early_stopping_patience=2)
            ],
            compute_metrics=self.compute_metrics
        )

        trainer.train()

        trainer.save_model(f"models/{experiment_name}")

This is how I am instantiating my model:

def main():
with open(f"config/{args.experiment_name}.yaml") as f:
config = yaml.safe_load(f)

cols = ["MARKETING_REFERENCE_fr", "MARKETING_REFERENCE_root", config["source_column"], config["target_column"]]

train_ds, val_ds = load_datasets(
    config["data"]["train_path"], config["data"]["val_path"], cols
)

torch.cuda.empty_cache()

output_dir = Path("models/temp_output")
output_dir.mkdir(parents=True, exist_ok=True)

config["temp_output_dir"] = str(output_dir)

trainer = MarianFineTuner(model_name=config["model_name"], device=config["device"], config=config)
train_ds = trainer.tokenize_dataset(train_ds, config["source_column"], config["target_column"])
val_ds = trainer.tokenize_dataset(val_ds, config["source_column"], config["target_column"])

Path("results").mkdir(parents=True, exist_ok=True)

trainer.train(train_ds, val_ds, experiment_name=args.experiment_name)

shutil.rmtree(output_dir)

Do you know why this would happen?


If no error message appears, the process has most likely finished normally (even if the result is not what you expect), so the problem is probably somewhere in the evaluation step. The question linked below does show an error message, so it is a slightly different case, but it looks related…

However, even if a callback is set up so that the early stopping condition is always true, it seems training still runs for at least one epoch, since the callback is only checked at evaluation time…

https://stackoverflow.com/questions/76330546/how-to-determine-the-value-of-early-stopping-patience-in-huggingfaces-seq2seqtr
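
As a side note, one frequent cause of evaluation-time failures in this kind of Seq2SeqTrainer setup (only a guess about your run, not a confirmed diagnosis) is that DataCollatorForSeq2Seq pads the labels with -100, which tokenizer.batch_decode cannot decode. Replacing -100 with the pad token id before decoding avoids that. A minimal sketch of such a guard, written as a standalone function rather than a method:

import numpy as np
import evaluate

def compute_metrics_safe(eval_preds, tokenizer):
    # Hypothetical drop-in for the compute_metrics method above.
    preds, labels = eval_preds
    if isinstance(preds, tuple):  # generate() can return extra outputs
        preds = preds[0]

    # Labels are padded with -100 by DataCollatorForSeq2Seq; batch_decode
    # cannot handle -100, so swap it for the real pad token id first.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = [p.strip() for p in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [l.strip() for l in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    bleu = evaluate.load("bleu").compute(
        predictions=decoded_preds, references=[[l] for l in decoded_labels]
    )
    meteor = evaluate.load("meteor").compute(
        predictions=decoded_preds, references=decoded_labels
    )
    return {"bleu": round(bleu["bleu"], 4), "meteor": round(meteor["meteor"], 4)}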

import os
import yaml
import torch
import shutil
from pathlib import Path
from datasets import load_dataset
from transformers import (
    MarianMTModel, MarianTokenizer,
    DataCollatorForSeq2Seq, Seq2SeqTrainer,
    Seq2SeqTrainingArguments, EarlyStoppingCallback
)
import evaluate

class MarianFineTuner:
    def __init__(self, model_name: str, device: str, config: dict):
        self.model_name = model_name
        self.device = device
        self.config = config

        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(device)
        self.data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model
        )

    def tokenize_dataset(self, dataset, source_col, target_col):
        def tokenize_fn(example):
            inputs = self.tokenizer(example[source_col], truncation=True)
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(example[target_col], truncation=True)
            inputs["labels"] = labels["input_ids"]
            return inputs

        return dataset.map(tokenize_fn, batched=True, remove_columns=[source_col, target_col])

    def compute_metrics(self, eval_preds):
        preds, labels = eval_preds
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        bleu = evaluate.load("bleu")
        meteor = evaluate.load("meteor")

        bleu_result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
        meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

        return {
            "bleu": round(bleu_result["bleu"], 4),
            "meteor": round(meteor_result["meteor"], 4)
        }

    def train(self, train_dataset, val_dataset, experiment_name):
        output_dir = Path(f"models/{experiment_name}")
        output_dir.mkdir(parents=True, exist_ok=True)

        training_args = Seq2SeqTrainingArguments(
            output_dir=str(output_dir),
            per_device_train_batch_size=self.config["train_batch_size"],
            per_device_eval_batch_size=self.config["eval_batch_size"],
            learning_rate=self.config["lr"],
            num_train_epochs=self.config["epochs"],
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="bleu",
            greater_is_better=True,
            logging_strategy="epoch",
            report_to="none",
            seed=self.config.get("seed", 42)
        )

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Clean up safely after training finishes
        if output_dir.exists():
            shutil.rmtree(output_dir)

def main():
    # Load config
    with open("config/train_config.yaml", "r") as f:
        config = yaml.safe_load(f)

source_col = config["source_column"]
target_col = config["target_column"]
model_name = config["model_name"]
experiment_name = config["experiment_name"]

# Load and prepare data
dataset = load_dataset(config["data"]["hf_loader"], data_files={
    "train": config["data"]["train_path"],
    "validation": config["data"]["val_path"]
})

trainer = MarianFineTuner(
    model_name=model_name,
    device="cuda" if torch.cuda.is_available() else "cpu",
    config=config
)

train_ds = trainer.tokenize_dataset(dataset["train"], source_col, target_col)
val_ds = trainer.tokenize_dataset(dataset["validation"], source_col, target_col)

torch.cuda.empty_cache()

trainer.train(train_ds, val_ds, experiment_name)

if __name__ == "__main__":
    main()

An example of the required config/train_config.yaml:

model_name: Helsinki-NLP/opus-mt-en-fr
experiment_name: translation_run_01

train_batch_size: 4
eval_batch_size: 4
lr: 5e-5
epochs: 10

source_column: en_text
target_column: fr_text

data:
  hf_loader: json
  train_path: ./data/train.json
  val_path: ./data/val.json

Solution provided by Triskel Data Deterministic AI.


This is my config example:

description: "baseline marianmt model, with input data split by dot. Only sentences with more than 5 words are kept after splitting"

model_name: "Helsinki-NLP/opus-mt-en-fr"
source_column: "sentences_en"
target_column: "sentences_fr"
batch_size: 2
num_train_epochs: 3
max_length:
fp16: true
device: "cuda:0"
data:
  train_path: "data/dot_len/train_dot_len.xlsx"
  val_path: "data/dot_len/validation_dot_len.xlsx"
  test_path: "data/dot_len/test_dot_len.xlsx"


Perhaps the Hugging Face datasets library was not used to create the dataset. pandas can read xlsx files by itself, so it is easy to end up with a plain DataFrame instead of a datasets.Dataset…
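
If the splits really are coming straight from pandas, a minimal sketch of wrapping them in datasets.Dataset objects before tokenization. The load_datasets helper from the original question is not shown, so the function name below is hypothetical and the paths and column names are just taken from the config above:

import pandas as pd
from datasets import Dataset

def load_xlsx_as_dataset(path, columns):
    # Hypothetical helper: read an Excel split with pandas (needs openpyxl)
    # and wrap it in a datasets.Dataset so .map() and the Trainer behave
    # as expected.
    df = pd.read_excel(path)
    df = df[columns]
    return Dataset.from_pandas(df, preserve_index=False)

train_ds = load_xlsx_as_dataset("data/dot_len/train_dot_len.xlsx", ["sentences_en", "sentences_fr"])
val_ds = load_xlsx_as_dataset("data/dot_len/validation_dot_len.xlsx", ["sentences_en", "sentences_fr"])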