Translation with MarianMT: early stopping stuck

Hi. I am fine-tuning MarianMT for a translation task, and as soon as I added early stopping, it crashes without even giving an error message:

from transformers import DataCollatorForSeq2Seq, MarianMTModel, MarianTokenizer, Trainer, TrainingArguments, EarlyStoppingCallback, Seq2SeqTrainingArguments, Seq2SeqTrainer
from contracts_translation.src.finetunning.src.logger import CSVLoggerCallback
import evaluate

import warnings
warnings.filterwarnings("ignore")

class MarianFineTuner:
    def __init__(self, model_name: str, device: str, config: dict):
        self.model_name = model_name
        self.device = device
        self.config = config
        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(device)
        self.data_collator = DataCollatorForSeq2Seq(tokenizer=self.tokenizer, model=self.model)

    def tokenize_dataset(self, dataset, source_col: str, target_col: str):
        def tokenize_function(examples):
            model_inputs = self.tokenizer(examples[source_col], truncation=True)
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(examples[target_col], truncation=True)
            model_inputs["labels"] = labels["input_ids"]
            return model_inputs

        return dataset.map(tokenize_function, batched=True, remove_columns=[source_col, target_col])

    def compute_metrics(self, eval_preds):
        preds, labels = eval_preds
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        # Strip
        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        bleu = evaluate.load("bleu")
        meteor = evaluate.load("meteor")

        bleu_result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
        meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

        return {
            "bleu": round(bleu_result["bleu"], 4),
            "meteor": round(meteor_result["meteor"], 4)
        }

    def train(self, train_dataset, val_dataset, experiment_name):

        training_args = Seq2SeqTrainingArguments(
            output_dir=self.config["temp_output_dir"],
            per_device_train_batch_size=self.config["batch_size"],
            per_device_eval_batch_size=self.config["batch_size"],
            num_train_epochs=self.config["num_train_epochs"],
            eval_strategy="epoch",
            logging_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            do_train=True,
            do_eval=True,
            report_to=[],
            load_best_model_at_end=True,
            metric_for_best_model="meteor",
            greater_is_better=True,
            predict_with_generate=True,
            torch_empty_cache_steps=2,
            eval_accumulation_steps=10,
        )

        log_path = f"results/epoch_logs/epoch_log_{experiment_name}.csv"

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            callbacks=[
                CSVLoggerCallback(log_path, experiment_name),
                EarlyStoppingCallback(early_stopping_patience=2)
            ],
            compute_metrics=self.compute_metrics
        )

        trainer.train()

        trainer.save_model(f"models/{experiment_name}")

This is how I am instantiating my model:

def main():
with open(f"config/{args.experiment_name}.yaml") as f:
config = yaml.safe_load(f)

cols = ["MARKETING_REFERENCE_fr", "MARKETING_REFERENCE_root", config["source_column"], config["target_column"]]

train_ds, val_ds = load_datasets(
    config["data"]["train_path"], config["data"]["val_path"], cols
)

torch.cuda.empty_cache()

output_dir = Path("models/temp_output")
output_dir.mkdir(parents=True, exist_ok=True)

config["temp_output_dir"] = str(output_dir)

trainer = MarianFineTuner(model_name=config["model_name"], device=config["device"], config=config)
train_ds = trainer.tokenize_dataset(train_ds, config["source_column"], config["target_column"])
val_ds = trainer.tokenize_dataset(val_ds, config["source_column"], config["target_column"])

Path("results").mkdir(parents=True, exist_ok=True)

trainer.train(train_ds, val_ds, experiment_name=args.experiment_name)

shutil.rmtree(output_dir)

Do you know why this would happen?


If no error message appears, the process has most likely finished normally (even if the result is not what you expect), so the problem is probably somewhere in the evaluation step. The question linked below does show an error message, so it is a slightly different case, but it looks related…

However, even if a callback is set up so that the early stopping condition is always true, it seems training still runs for at least one epoch, since the callback is only checked at evaluation time…

https://stackoverflow.com/questions/76330546/how-to-determine-the-value-of-early-stopping-patience-in-huggingfaces-seq2seqtr
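
As a side note, one frequent cause of evaluation-time failures in this kind of Seq2SeqTrainer setup (only a guess about your run, not a confirmed diagnosis) is that DataCollatorForSeq2Seq pads the labels with -100, which tokenizer.batch_decode cannot decode. Replacing -100 with the pad token id before decoding avoids that. A minimal sketch of such a guard, written as a standalone function rather than a method:

import numpy as np
import evaluate

def compute_metrics_safe(eval_preds, tokenizer):
    # Hypothetical drop-in for the compute_metrics method above.
    preds, labels = eval_preds
    if isinstance(preds, tuple):  # generate() can return extra outputs
        preds = preds[0]

    # Labels are padded with -100 by DataCollatorForSeq2Seq; batch_decode
    # cannot handle -100, so swap it for the real pad token id first.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = [p.strip() for p in tokenizer.batch_decode(preds, skip_special_tokens=True)]
    decoded_labels = [l.strip() for l in tokenizer.batch_decode(labels, skip_special_tokens=True)]

    bleu = evaluate.load("bleu").compute(
        predictions=decoded_preds, references=[[l] for l in decoded_labels]
    )
    meteor = evaluate.load("meteor").compute(
        predictions=decoded_preds, references=decoded_labels
    )
    return {"bleu": round(bleu["bleu"], 4), "meteor": round(meteor["meteor"], 4)}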

import os
import yaml
import torch
import shutil
from pathlib import Path
from datasets import load_dataset
from transformers import (
    MarianMTModel, MarianTokenizer,
    DataCollatorForSeq2Seq, Seq2SeqTrainer,
    Seq2SeqTrainingArguments, EarlyStoppingCallback
)
import evaluate

class MarianFineTuner:
    def __init__(self, model_name: str, device: str, config: dict):
        self.model_name = model_name
        self.device = device
        self.config = config

        self.tokenizer = MarianTokenizer.from_pretrained(model_name)
        self.model = MarianMTModel.from_pretrained(model_name).to(device)
        self.data_collator = DataCollatorForSeq2Seq(
            tokenizer=self.tokenizer,
            model=self.model
        )

    def tokenize_dataset(self, dataset, source_col, target_col):
        def tokenize_fn(example):
            inputs = self.tokenizer(example[source_col], truncation=True)
            with self.tokenizer.as_target_tokenizer():
                labels = self.tokenizer(example[target_col], truncation=True)
            inputs["labels"] = labels["input_ids"]
            return inputs

        return dataset.map(tokenize_fn, batched=True, remove_columns=[source_col, target_col])

    def compute_metrics(self, eval_preds):
        preds, labels = eval_preds
        decoded_preds = self.tokenizer.batch_decode(preds, skip_special_tokens=True)
        decoded_labels = self.tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_preds = [pred.strip() for pred in decoded_preds]
        decoded_labels = [label.strip() for label in decoded_labels]

        bleu = evaluate.load("bleu")
        meteor = evaluate.load("meteor")

        bleu_result = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
        meteor_result = meteor.compute(predictions=decoded_preds, references=decoded_labels)

        return {
            "bleu": round(bleu_result["bleu"], 4),
            "meteor": round(meteor_result["meteor"], 4)
        }

    def train(self, train_dataset, val_dataset, experiment_name):
        output_dir = Path(f"models/{experiment_name}")
        output_dir.mkdir(parents=True, exist_ok=True)

        training_args = Seq2SeqTrainingArguments(
            output_dir=str(output_dir),
            per_device_train_batch_size=self.config["train_batch_size"],
            per_device_eval_batch_size=self.config["eval_batch_size"],
            learning_rate=self.config["lr"],
            num_train_epochs=self.config["epochs"],
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=2,
            load_best_model_at_end=True,
            metric_for_best_model="bleu",
            greater_is_better=True,
            logging_strategy="epoch",
            report_to="none",
            seed=self.config.get("seed", 42)
        )

        trainer = Seq2SeqTrainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=self.tokenizer,
            data_collator=self.data_collator,
            compute_metrics=self.compute_metrics,
            callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
        )

        trainer.train()

        # Clean up safely after training finishes
        if output_dir.exists():
            shutil.rmtree(output_dir)

def main():
    # Load config
    with open("config/train_config.yaml", "r") as f:
        config = yaml.safe_load(f)

source_col = config["source_column"]
target_col = config["target_column"]
model_name = config["model_name"]
experiment_name = config["experiment_name"]

# Load and prepare data
dataset = load_dataset(config["data"]["hf_loader"], data_files={
    "train": config["data"]["train_path"],
    "validation": config["data"]["val_path"]
})

trainer = MarianFineTuner(
    model_name=model_name,
    device="cuda" if torch.cuda.is_available() else "cpu",
    config=config
)

train_ds = trainer.tokenize_dataset(dataset["train"], source_col, target_col)
val_ds = trainer.tokenize_dataset(dataset["validation"], source_col, target_col)

torch.cuda.empty_cache()

trainer.train(train_ds, val_ds, experiment_name)

if __name__ == "__main__":
    main()

An example of the required config/train_config.yaml:

model_name: Helsinki-NLP/opus-mt-en-fr
experiment_name: translation_run_01

train_batch_size: 4
eval_batch_size: 4
lr: 5e-5
epochs: 10

source_column: en_text
target_column: fr_text

data:
  hf_loader: json
  train_path: ./data/train.json
  val_path: ./data/val.json

Solution provided by Triskel Data Deterministic AI.


This is my config example:

description: "baseline marianmt model, with input data split by dot. Only sentences with more than 5 words are kept after splitting"

model_name: "Helsinki-NLP/opus-mt-en-fr"
source_column: "sentences_en"
target_column: "sentences_fr"
batch_size: 2
num_train_epochs: 3
max_length:
fp16: true
device: "cuda:0"
data:
  train_path: "data/dot_len/train_dot_len.xlsx"
  val_path: "data/dot_len/validation_dot_len.xlsx"
  test_path: "data/dot_len/test_dot_len.xlsx"


Perhaps the Hugging Face datasets library was not used to create the dataset. pandas can read xlsx files by itself, so it is easy to end up with a plain DataFrame instead of a datasets.Dataset…
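
If the splits really are coming straight from pandas, a minimal sketch of wrapping them in datasets.Dataset objects before tokenization. The load_datasets helper from the original question is not shown, so the function name below is hypothetical and the paths and column names are just taken from the config above:

import pandas as pd
from datasets import Dataset

def load_xlsx_as_dataset(path, columns):
    # Hypothetical helper: read an Excel split with pandas (needs openpyxl)
    # and wrap it in a datasets.Dataset so .map() and the Trainer behave
    # as expected.
    df = pd.read_excel(path)
    df = df[columns]
    return Dataset.from_pandas(df, preserve_index=False)

train_ds = load_xlsx_as_dataset("data/dot_len/train_dot_len.xlsx", ["sentences_en", "sentences_fr"])
val_ds = load_xlsx_as_dataset("data/dot_len/validation_dot_len.xlsx", ["sentences_en", "sentences_fr"])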