HF Trainer: HF Trainer causes a problem while fine-tuning T5 (T5 doesn't generate the eos token at the proper point)

Hi, I’m trying to fine-tune T5 on a new task. I first trained T5 without the HF Trainer (just the HF model & tokenizer & AdamW) successfully, and the prediction results look like this:

prediction: "2 explanation: ""A double standard like never seen before in the history of our Country"""
target_label: "1 explanation: ""A double standard like never seen before in the history of our Country"""

The classification was wrong, but there’s no problem with the form of the generated text.

But if I train T5 with the HF Trainer, something goes wrong, as you can see in the following results:

prediction: "2 explanation: \"A double standard like never seen before in the history of our Country\" 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 explanation 2 2 2 explanation explanation explanation explanation explanation explanation explanation explanation explanation 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2"
target_label: "1 explanation: \"A double standard like never seen before in the history of our Country\""

T5 could not generate the eos token at the proper point.

Except for the batch size (1 vs. 4), all the settings in both cases are exactly the same (I checked twice).

I have no idea what’s going on here… What causes this problem? Is there anything I can try or look into?
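
(For reference, a minimal sanity check on the tokenized targets would look like this, assuming standard T5Tokenizer behavior; the target string below is one example from my data.)

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("t5-base")

# One example target in the same format as my labels
target_text = '1 explanation: "A double standard like never seen before in the history of our Country"'

# T5Tokenizer appends the eos token (</s>, id 1) by default
target_ids = tokenizer(target_text, max_length=512, truncation=True).input_ids
print(target_ids[-1] == tokenizer.eos_token_id)  # expected: True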

Thanks in advance :blush:

+ Actually, the reason I have to use the HF Trainer is DeepSpeed integration: I need to fine-tune the 11B model. If you have specific example code for using DeepSpeed without the HF Trainer, please leave it in the comments.
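
In case it helps frame the question, this is roughly the bare DeepSpeed loop I have in mind instead of the Trainer. It is only a rough, untested sketch built around deepspeed.initialize; ds_config.json is a placeholder config file, and training_loader is the same DataLoader as in my train.py below.

import deepspeed
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-base")

# DeepSpeed wraps the model and builds the optimizer/scheduler from the config file
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="ds_config.json",  # placeholder DeepSpeed config (ZeRO stage, optimizer, batch size, ...)
)

for batch in training_loader:
    outputs = model_engine(
        input_ids=batch["source_ids"].to(model_engine.device),
        attention_mask=batch["source_mask"].to(model_engine.device),
        labels=batch["target_ids"].to(model_engine.device),
    )
    model_engine.backward(outputs.loss)  # DeepSpeed handles loss scaling / gradient accumulation
    model_engine.step()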

+ Below is the code I used.

  1. Without HF Trainer

.sh script

python train.py --max_source_len 512 \
  --max_target_len 512 \
  --batch_size 1 \
  --lr 0.00001 \
  --num_workers 4

train.py

# Importing libraries
import os
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import json
import argparse
from tqdm import tqdm

# Importing the T5 modules from huggingface/transformers
from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer
)

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

def train(tokenizer, model, device, loader, optimizer):
    
    """
    Function to be called for training with the parameters passed from main function

    """

    model.train()
    losses = []
    
    for batch in tqdm(loader, desc="Training Process"):

        input_ids = batch["source_ids"].to(device)
        attention_mask = batch["source_mask"].to(device)
        labels = batch["target_ids"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        losses.append(loss.item())  # store a plain float, not the graph-attached tensor
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    return losses

def validate(tokenizer, model, device, loader):
    
    """
    Function to evaluate model for predictions

    """

    model.eval()
    predictions = []
    targets = []

    pair_ids = []
    docs = []
    
    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluation Process"):
            input_ids = batch["source_ids"].to(device)
            attention_mask = batch["source_mask"].to(device)
            target = batch["target"]
            pair_id = batch["pair_id"]
            doc = [src.split(" claim1: ")[1].split(" claim2: ")[0] for src in batch["source"]]

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask, 
                max_length=512,
                num_beams=2,
                repetition_penalty=2.5, 
                length_penalty=1.0, 
                early_stopping=True
            )
            prediction = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]

            predictions.extend(prediction)
            targets.extend(target)
            pair_ids.extend(pair_id)
            docs.extend(doc)

    meta_info = {"pair_ids": pair_ids, "docs": docs}

    return predictions, targets, meta_info

def T5Trainer(
    model_params, data_dir="./ClaimDiff/", ckpt_dir="./checkpoints/", output_dir='./outputs/'
    ):

    """
    T5 trainer
    model_params is a dictionary containing model parameters for T5 training:

    MODEL: "t5-base", model_type: t5-base/t5-large
    BATCH_SIZE: 8, batch size
    EPOCHS: 6, number of training epochs
    LEARNING_RATE: 1e-4, learning rate
    MAX_SOURCE_TEXT_LENGTH: 512, max length of source text
    MAX_TARGET_TEXT_LENGTH: 50, max length of target text
    SEED: 42, set seed for reproducibility
    """

    # Set random seeds and deterministic pytorch for reproducibility
    torch.manual_seed(model_params["SEED"])  # pytorch random seed
    np.random.seed(model_params["SEED"])  # numpy random seed
    torch.backends.cudnn.deterministic = True

    # logging
    print(f"""[Model]: Loading {model_params["MODEL"]}...\n""")

    # tokenzier for encoding the text
    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])

    # Defining the model (T5 with a language modeling head for conditional generation).
    # The model is then sent to the device (GPU/CPU).
    model = T5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    # logging
    print(f"[Data]: Reading data...\n")

    # Importing the raw dataset
    train_df, val_df = get_data(data_dir, rationale=True)
    
    # Creation of Dataset and Dataloader
    # Creating the Training and Validation dataset for further creation of Dataloader
    train_dataset = ClaimDiff(
        data=train_df,
        tokenizer=tokenizer,
        source_max_token_len=model_params["MAX_SOURCE_TEXT_LENGTH"],
        target_max_token_len=model_params["MAX_TARGET_TEXT_LENGTH"],
    )
    
    val_dataset = ClaimDiff(
        data=val_df,
        tokenizer=tokenizer,
        source_max_token_len=model_params["MAX_SOURCE_TEXT_LENGTH"],
        target_max_token_len=model_params["MAX_TARGET_TEXT_LENGTH"],
    )

    # Defining the parameters for creation of dataloaders
    train_params = {
        "batch_size": model_params["BATCH_SIZE"],
        "num_workers": model_params["NUM_WORKERS"],
    }

    val_params = {
        "batch_size": model_params["BATCH_SIZE"],
        "num_workers": model_params["NUM_WORKERS"],
    }

    # Creation of DataLoaders for training and validation.
    training_loader = DataLoader(
        train_dataset,
        **train_params
    )
    
    val_loader = DataLoader(
        val_dataset,
        **val_params
    )
    
    # Defining the optimizer that will be used to tune the weights of the network in the training session.
    optimizer = AdamW(
        params=model.parameters(), lr=model_params["LEARNING_RATE"]
    )

    # Training loop
    print(f"[Initiating Fine Tuning]...\n")

    for epoch in range(model_params["EPOCHS"]):
        print(f"Start training epoch {epoch}")
        losses = train(tokenizer, model, device, training_loader, optimizer)
        print(f"Epoch {epoch} average losses: {sum(losses)/len(losses)}")

    # Saving the model after training
    print(f"[Saving Model]...\n")
    path = os.path.join(ckpt_dir, "model_files")
    model.save_pretrained(path)
    tokenizer.save_pretrained(path)

    
    print(f"[Loading Model]...\n")
    trained_model = T5ForConditionalGeneration.from_pretrained(path).to(device)
    
    # evaluating test dataset
    print(f"[Initiating Validation]...\n")
    predictions, actuals, meta_info = validate(tokenizer, trained_model, device, val_loader)
    final_df = pd.DataFrame({
        "pair_id": meta_info["pair_ids"],
        "generated_text": predictions,
        "actual_text": actuals,
        "doc": meta_info["docs"],
    })
    final_df.to_csv(os.path.join(output_dir, "predictions.csv"))

    print(f"[Validation Completed.]\n")
    
    print(
        f"""[Model] Model saved @ {os.path.join(ckpt_dir, 'model_files')}\n"""
    )
    
    print(
        f"""[Validation] Generation on Validation data saved @ {os.path.join(output_dir,'predictions.csv')}\n"""
    )


def parse_args():
    parser = argparse.ArgumentParser()

    # Training & Inference
    parser.add_argument('--seed', default=42, type=int, help='seed')
    parser.add_argument('--data_dir', default='data/', type=str, help='data directory')
    parser.add_argument('--model_name', default='t5-base', type=str, help='model file name')
    parser.add_argument("--parallelize", default=False, action="store_true", help="model parallelism")
    parser.add_argument('--num_workers', default=4, type=int, help='number of workers')
    parser.add_argument('--n_epochs', default=3, type=int, help='number of epochs')
    parser.add_argument('--batch_size', default=1, type=int, help='batch size')
    parser.add_argument('--lr', default=0.00001, type=float, help='learning rate')
    parser.add_argument('--max_source_len', default=512, type=int, help='max source sequence length')
    parser.add_argument('--max_target_len', default=512, type=int, help='max target sequence length')

    args = parser.parse_args()

    return args


if __name__=="__main__":
    
    args = parse_args()
    
    model_params = {
        "MODEL": args.model_name,  # model_type: t5-base/t5-large
        "BATCH_SIZE": args.batch_size,  # batch size
        "NUM_WORKERS": args.num_workers, # number of CPU workers
        "EPOCHS": args.n_epochs,  # number of epochs
        "LEARNING_RATE": args.lr,  # learning rate
        "MAX_SOURCE_TEXT_LENGTH": args.max_source_len,  # max length of source text
        "MAX_TARGET_TEXT_LENGTH": args.max_target_len,  # max length of target text
        "SEED": args.seed,  # set seed for reproducibility
        "PARALLEL": args.parallelize
    }

    print("Training settings")
    for k, v in model_params.items():
        print(f'{k}: {v}')
    
    T5Trainer(
        model_params=model_params
    )

  2. Using HF Trainer

.sh script

export BS=1

python train.py \
  --model_name_or_path t5-base \
  --do_train \
  --do_eval \
  --do_predict \
  --max_source_length 512 \
  --max_target_length 512 \
  --per_device_train_batch_size $BS \
  --per_device_eval_batch_size $BS \
  --learning_rate 1e-4 \
  --num_train_epochs 2 \
  --output_dir output/ \
  --save_steps 10000 \
  --overwrite_output_dir \
  --logging_first_step \
  --logging_steps 100

train.py

import logging
import os
import json
import random
import sys
from dataclasses import dataclass, field
from typing import Optional
from torch.utils.data import Dataset

import datasets

import pandas as pd
import numpy as np

import transformers
from transformers import (
    T5ForConditionalGeneration,
    T5Tokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version
from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.17.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r requirements.txt")

logger = logging.getLogger(__name__)


@dataclass
class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    max_source_length: int = field(
        default=512,
        metadata={
            "help": "The maximum total input source sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    max_target_length: int = field(
        default=512,
        metadata={
            "help": "The maximum total input target sequence length after tokenization. Sequences longer "
            "than this will be truncated, sequences shorter will be padded."
        },
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": "Whether to pad all samples to `max_seq_length`. "
            "If False, will pad the samples dynamically when batching to the maximum length in the batch."
        },
    )

@dataclass
class ModelArguments:
    """
    Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
    """

    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    tokenizer_name: Optional[str] = field(
        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
    )
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"},
    )

def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Load the raw dataset (same get_data / ClaimDiff helpers as in the script above)
    data_dir = "ClaimDiff/"
    train_df, val_df = get_data(data_dir, rationale=True)

    # Load pretrained model and tokenizer
    tokenizer = T5Tokenizer.from_pretrained(model_args.model_name_or_path)

    model = T5ForConditionalGeneration.from_pretrained(model_args.model_name_or_path, use_cache=False)

    train_dataset = ClaimDiff(
        data=train_df,
        tokenizer=tokenizer,
        source_max_token_len=data_args.max_source_length,
        target_max_token_len=data_args.max_target_length,
    )
    
    eval_dataset = ClaimDiff(
        data=val_df.iloc[:4, :],
        tokenizer=tokenizer,
        source_max_token_len=data_args.max_source_length,
        target_max_token_len=data_args.max_target_length,
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        tokenizer=tokenizer,
    )

    # Training
    if training_args.do_train:
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint

        trainer.train(resume_from_checkpoint=checkpoint)

        trainer.save_model()  # Saves the tokenizer too for easy upload
        trainer.save_state()

    if training_args.do_predict:
        logger.info("*** Predict ***")

        # Run prediction on the evaluation subset
        outputs = trainer.predict(eval_dataset)
        
        # outputs.predictions[0] holds the LM logits with shape (batch, target_len, vocab_size);
        # argmax over the vocabulary gives the predicted token id at each position
        output_seq_ids = outputs.predictions[0]
        output_seq_ids = np.argmax(output_seq_ids, axis=2)

        output_label_seq_ids = outputs.label_ids

        predictions = []
        for i, (output_ids, output_label_ids) in enumerate(zip(output_seq_ids, output_label_seq_ids)):
            # Prediction
            output_ids = np.asarray([output_id for output_id in output_ids if output_id != -100])
            prediction = tokenizer.decode(output_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

            # Target label
            output_label_ids = np.asarray([output_label_id for output_label_id in output_label_ids if output_label_id != -100])
            label = tokenizer.decode(output_label_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)

            predictions.append({
                "index": i,
                "prediction": prediction,
                "label": label
            })

        output_predict_file = os.path.join(training_args.output_dir, "predict_results_claimdiff.json")
        if trainer.is_world_process_zero():
            with open(output_predict_file, 'w') as output:
                logger.info("***** Predict Results ClaimDiff *****")
                json.dump(predictions, output)


def _mp_fn(index):
    # For xla_spawn (TPUs)
    main()


if __name__ == "__main__":
    main()
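
+ One more detail I’m not sure matters: in the first script the predictions come from model.generate (beam search), while in the HF Trainer version they come from the argmax over the logits returned by trainer.predict. Below is a minimal, untested sketch of how I would run free generation with the checkpoint that the Trainer saved to output/ (source_text is a placeholder for one raw input string in my dataset's format).

import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model/tokenizer saved by trainer.save_model()
tokenizer = T5Tokenizer.from_pretrained("output/")
model = T5ForConditionalGeneration.from_pretrained("output/").to(device)

inputs = tokenizer(source_text, max_length=512, truncation=True, return_tensors="pt").to(device)

generated_ids = model.generate(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_length=512,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))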