Training a CausalLM from scratch for a machine translation task

Hello everyone!
So, for an experiment of mine I wanted to train a CausalLM like meta-llama/Llama-3.2-1B-Instruct from scratch on a machine translation task, e.g. English to Italian (en → it).

I imagined the task to be something like this:

<TARGET_LANGUAGE_CODE> <START_SYMBOL_source> source sentence <END_SYMBOL_SOURCE> <START_SYMBOL_TARGET> target sentence <END_SYMBOL_TARGET>
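For example, one training sample for en → it would look like this (made-up sentence pair):

it <START_SYMBOL_source> Hello, how are you? <END_SYMBOL_SOURCE> <START_SYMBOL_TARGET> Ciao, come stai? <END_SYMBOL_TARGET>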

Unfortunately, after training the model and testing it on new sentences, what I get as output is the exact sentence I gave as input.

After preprocessing the data, I loaded the tokenizer and added the special tokens.

from transformers import AutoTokenizer

model_path = "meta-llama/Llama-3.2-1B-Instruct"


TARGET_LANGUAGE_CODE = "it"
START_SYMBOL_SOURCE = "<START_SYMBOL_source>"
END_SYMBOL_SOURCE = "<END_SYMBOL_SOURCE>"
START_SYMBOL_TARGET = "<START_SYMBOL_TARGET>"
END_SYMBOL_TARGET = "<END_SYMBOL_TARGET>"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define special tokens
special_tokens = {
    "additional_special_tokens": [
        START_SYMBOL_SOURCE, END_SYMBOL_SOURCE, START_SYMBOL_TARGET, END_SYMBOL_TARGET
    ]
}

# Add a padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Add the special tokens to the tokenizer
tokenizer.add_special_tokens(special_tokens)

def process(example):
    source = example['Source_clean']
    target = example['Target_clean']
    input_text = f"{TARGET_LANGUAGE_CODE} {START_SYMBOL_SOURCE} {source} {END_SYMBOL_SOURCE}"
    target_text = f"{START_SYMBOL_TARGET} {target} {END_SYMBOL_TARGET}"
    model_input = tokenizer(
        input_text,
        text_target=target_text,
        truncation=True,
        max_length=96,
        padding="max_length"  # Ensure consistent length
    )
    return model_input

tokenized_dataset = dataset.map(process, remove_columns=["Source_clean", "Target_clean"])  # process handles one example at a time
tokenized_dataset

Then I set up the model without pre-trained weights:

import gc
import torch
from bitsandbytes.optim import Adam8bit
from transformers import (
    AutoConfig, AutoModelForCausalLM, DataCollatorForSeq2Seq,
    Trainer, TrainerCallback, TrainingArguments,
)

config = AutoConfig.from_pretrained(model_path)
config.num_hidden_layers = 6  # Use a 6-layer model instead of the full depth

# Build the model from the config only (random weights, no pre-trained checkpoint)
model = AutoModelForCausalLM.from_config(config)
# Resize the token embeddings to accommodate the new special tokens
model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
# Set training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/llama-translation2",
    eval_strategy="steps",
    eval_steps=100,
    #save_steps=20,
    logging_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=64,
    gradient_checkpointing=True,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=True,  # Mixed precision for faster training
    push_to_hub=False,
    max_grad_norm=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)
class ClearMemoryCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Clearing GPU memory after epoch {state.epoch}...")
        torch.cuda.empty_cache()
        gc.collect()
optimizer = Adam8bit(model.parameters(), lr=5e-5)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    callbacks=[ClearMemoryCallback()]
)
trainer.train()

And I test the model by using:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Path to the saved model folder
MODEL_PATH = "/content/drive/MyDrive/llama-translation2/final_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Define the translation function
def translate_sentence(sentence, model, tokenizer, max_length=512):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculations for testing
        # Format the input for translation
        input_text = f"{TARGET_LANGUAGE_CODE} {START_SYMBOL_SOURCE} {sentence} {END_SYMBOL_SOURCE}"
        inputs = tokenizer(input_text, return_tensors="pt")
        
        # Generate translation
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=5,  # Use beam search for better translations
            early_stopping=True
        )
        
        # Decode the output
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Test the model with a sample sentence
sample_sentence = "Hello, how are you?"
translated_sentence = translate_sentence(sample_sentence, model, tokenizer)
print(f"Translated Sentence: {translated_sentence}")

But unfortunately the translated sentence is still “Hello, how are you?”

Is there something I might not have considered?
Is it just a matter of not having trained on enough data? (As I already said, this is an experiment.)

Thanks for the attention!

DataCollatorForSeq2Seq is for seq2seq (encoder-decoder) models like T5. Since Llama is a decoder-only model, I would recommend using the Supervised Fine-tuning Trainer (SFTTrainer) together with DataCollatorForCompletionOnlyLM instead.
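A minimal sketch of what that could look like, reusing the names from your post (the exact SFTTrainer arguments vary between trl versions, and the single "text" column holding the full formatted string is an assumption on my side):

from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# Mask everything before the target marker so the loss is only computed on the translation
collator = DataCollatorForCompletionOnlyLM(
    response_template=START_SYMBOL_TARGET,
    tokenizer=tokenizer,
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    dataset_text_field="text",  # column with the full "<lang> <START_SYMBOL_source> ... <END_SYMBOL_TARGET>" string
    data_collator=collator,
    tokenizer=tokenizer,
)
trainer.train()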

Using the plain string it as TARGET_LANGUAGE_CODE doesn’t sound right. If it gets treated as a special token, every instance of the string "it" in the English text will be mapped to that token, and things won’t be what they should be. Maybe use something like <lang-code-it> here?
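For example, something along these lines (the token name is just a suggestion):

LANG_CODE_IT = "<lang-code-it>"

# Register the language code as its own special token so it never collides with the English word "it"
tokenizer.add_special_tokens({"additional_special_tokens": [LANG_CODE_IT]})
model.resize_token_embeddings(len(tokenizer))

# Then use it in the prompt format
input_text = f"{LANG_CODE_IT} {START_SYMBOL_SOURCE} {source} {END_SYMBOL_SOURCE}"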

It feels a bit unconventional, as normally when training a causal LM the targets are the same as the inputs but shifted by one token. Here, though, for translation you set the targets to be text in a different language.

Did you manage to track down the cause of your problem? I suspect that either the [PAD] tokens in the target are not being set to -100 properly, or your targets are simply dropped and replaced with the inputs, hence your model never learned the other language.
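A quick sanity check on one processed example (assuming the tokenized_dataset from your post) would be to decode what actually ends up in the labels:

sample = tokenized_dataset["train"][0]

# The inputs should be the English prompt...
print(tokenizer.decode(sample["input_ids"]))

# ...and the labels should decode to the Italian target, not a copy of the input
label_ids = [t for t in sample["labels"] if t != -100]
print(tokenizer.decode(label_ids))

# If the tail of the labels is the pad token id rather than -100,
# the loss is also being computed on padding
print(sample["labels"][-10:])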
