Train a CausalLM for machine translation

For an experiment of mine, I am trying to train a causal LM from scratch, in particular Qwen/Qwen2.5-0.5B-Instruct, for a machine translation task.

Since this is an experiment and I am aware that achieving good performance would require a great amount of both time and resources, I decided to use around 25% of my data (roughly 30000 observations) as the training dataset.

My idea was to train the model specifically on the following task:
<TARGET_LANGUAGE_CODE> <START_SYMBOL_source> source sentence <END_SYMBOL_SOURCE> <START_SYMBOL_TARGET> target sentence <END_SYMBOL_TARGET>.
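For example, a single training sequence following this template would look like the one below (the language code and both sentences are just made-up placeholders, not taken from my dataset):

example_sequence = (
    "it <START_SYMBOL_source> The cat sleeps on the sofa. <END_SYMBOL_SOURCE> "
    "<START_SYMBOL_TARGET> Il gatto dorme sul divano. <END_SYMBOL_TARGET>"
)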

Unfortunately, after training the model, giving it new sentences as input results in getting back the same source sentence plus random words if max_length hasn't been reached.

What I would like to know is whether this is just caused by the model not being trained on enough data, or whether there is an error in my code.

After performing data preprocessing, I transform the data using:

model = "Qwen/Qwen2.5-0.5B-Instruct"```
def preprocess(example):
    source = example['Source_clean']
    target = example['Target_clean']
    # Combine source and target into a single sequence
    sequence = (
        f"<START_SYMBOL_source> {source} <END_SYMBOL_SOURCE> "
        f"<START_SYMBOL_TARGET> {target} <END_SYMBOL_TARGET>"
    )
    return {"sequence": sequence}

# Apply preprocessing
dataset = dataset.map(preprocess)

Next, I add the special tokens to the tokenizer:

tokenizer = AutoTokenizer.from_pretrained(model)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<START_SYMBOL_source>',
        '<END_SYMBOL_SOURCE>',
        '<START_SYMBOL_TARGET>',
        '<END_SYMBOL_TARGET>'
    ]
})
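Just as a sanity check (this is only a sketch, not part of my training script), one can verify that the markers are registered as single tokens with their own IDs:

# Each marker should map to one dedicated ID appended at the end of the vocabulary,
# and tokenize() should return it as a single piece rather than several sub-word tokens.
for tok in ['<START_SYMBOL_source>', '<END_SYMBOL_SOURCE>',
            '<START_SYMBOL_TARGET>', '<END_SYMBOL_TARGET>']:
    print(tok, tokenizer.convert_tokens_to_ids(tok), tokenizer.tokenize(tok))
print(len(tokenizer))  # vocabulary size including the added tokens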

Then I apply tokenization:

def tokenize_function(example):
    # Tokenize the combined input
    sequence = example['sequence']
    # Tokenize the sequence using the updated tokenizer
    tokenized = tokenizer(sequence, padding='max_length', truncation=True, max_length=96)  # You can adjust max_length as needed
    return tokenized

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_function, batched=True)

tokenized_dataset = tokenized_dataset.remove_columns(
    ['Source_clean', 'Target_clean', 'sequence']
)

This way, input_ids and attention_mask are the only two features left in the data.
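To double-check (sketch only, assuming the dataset is already split into train/test as it is used later for the Trainer):

print(tokenized_dataset['train'].column_names)  # expecting ['input_ids', 'attention_mask']
# Decoding one example back shows the markers plus the padding,
# which appears as repeated EOS tokens since pad_token was set to eos_token above.
print(tokenizer.decode(tokenized_dataset['train'][0]['input_ids']))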

Model Setup

#model setup
config = AutoConfig.from_pretrained(model)
#config.num_hidden_layers = 12

# Ensure the model configuration is set up for causal language modeling
config.is_decoder = True  # Set the model as a decoder for causal language modeling
config.add_cross_attention = False  # Optional, only for encoder-decoder models
model = AutoModelForCausalLM.from_config(config)  # from_config builds the architecture with randomly initialized weights (training from scratch)

Callbacks

class ClearMemoryCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Clearing GPU memory after epoch {state.epoch}...")
        torch.cuda.empty_cache()
        gc.collect()

# Inside callbacks
class StateCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, metrics=None, **kwargs):
        logging.info(f"Epoch {state.epoch} ended.")
        if metrics:
            logging.info(f"Metrics: {metrics}")
# Set data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # Causal model doesn't use masked LM
    pad_to_multiple_of=8
)
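With mlm=False the collator just copies input_ids into labels and replaces the padding positions with -100 so they are ignored by the loss (the shift by one position happens inside the model). One collated batch can be inspected like this (sketch only):

# Collate two arbitrary examples and look at the tensors the model will receive.
batch = data_collator([tokenized_dataset['train'][i] for i in range(2)])
print(batch['input_ids'].shape, batch['labels'].shape)  # both (2, 96) with max_length=96
print((batch['labels'] == -100).sum())  # number of padded positions masked out of the loss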

# Set training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/qwen-translation2/checkpoint",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=64,
    gradient_checkpointing=True,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=True,  # Mixed precision for faster training
    push_to_hub=False,
    max_grad_norm=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)
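In case it is relevant: with these settings the effective batch size is fairly large compared to my ~30000 training examples, so (assuming a single GPU) there are only a few hundred optimizer steps in total:

# Rough arithmetic, assuming a single GPU and ~30000 training examples
per_device_batch = 2
grad_accum = 64
n_train = 30000

effective_batch = per_device_batch * grad_accum  # 128 sequences per optimizer step
steps_per_epoch = n_train // effective_batch     # ~234 optimizer steps per epoch
total_steps = 3 * steps_per_epoch                # ~702 steps for the whole run
print(effective_batch, steps_per_epoch, total_steps)

If that arithmetic is right, warmup_steps=500 alone covers more than two epochs, and the first evaluation at eval_steps=500 only happens near the end of training.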



model.gradient_checkpointing_enable()
# Training setup
from transformers import Trainer

from bitsandbytes.optim import Adam8bit  # missing import: Adam8bit comes from the bitsandbytes package
optimizer = Adam8bit(model.parameters(), lr=5e-5)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    callbacks=[ClearMemoryCallback()]
)
# Move model and data to the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.config.use_cache = False

logging.info("Starting training...")
trainer.train()
logging.info("Finished training.")

After training, if I don't set save_safetensors=False, the training stops just before the end of the last epoch and I get the following warning:

There were missing keys in the checkpoint model loaded: ['lm_head.weight']
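From what I can tell, this might be related to weight tying: the 0.5B Qwen config ties the LM head to the input embeddings, so lm_head.weight is not saved as a separate tensor in a safetensors checkpoint and then gets reported as a missing key when the checkpoint is reloaded. A quick check (sketch):

# If this prints True, lm_head.weight shares its storage with the input embeddings
# and is re-tied after loading, so the "missing key" should be harmless.
print(config.tie_word_embeddings)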

Model Testing

folder = "/content/drive/MyDrive/qwen-translation2/checkpoint"


# Load the model and tokenizer from the folder
trained_model = AutoModelForCausalLM.from_pretrained(folder)
trained_tokenizer = AutoTokenizer.from_pretrained(folder)

# Set the model to evaluation mode
trained_model.eval()
prompt = "Once upon a time in a faraway land"

# Ensure that attention mask and pad_token_id are set correctly
inputs = trained_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Add attention mask to the generation function
outputs = trained_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],  # Pass attention mask
    max_length=96,  # Maximum length of generated text
    early_stopping=True,
    num_return_sequences=1,  # Generate a single output
    no_repeat_ngram_size=2,  # Prevent repeating n-grams
    temperature=0.7,  # Sampling temperature (lower is more deterministic)
    top_p=0.9,  # Nucleus sampling (controls diversity)
    top_k=50,  # Limits the sampling to the top 50 tokens
    do_sample=True,  # Enable sampling
    pad_token_id=trained_model.config.pad_token_id  # Explicitly set pad_token_id
)

generated_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
generated_text
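In case it matters, a prompt that mirrors the training format would look like the sketch below (the source sentence is a placeholder), stopping right after the target-start marker so that generation is supposed to continue with the translation:

prompt = (
    "<START_SYMBOL_source> Once upon a time in a faraway land <END_SYMBOL_SOURCE> "
    "<START_SYMBOL_TARGET>"
)
inputs = trained_tokenizer(prompt, return_tensors="pt")
outputs = trained_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_new_tokens=64,
    do_sample=False,  # greedy decoding, just to inspect what the model has learned
    eos_token_id=trained_tokenizer.convert_tokens_to_ids("<END_SYMBOL_TARGET>"),
)
print(trained_tokenizer.decode(outputs[0], skip_special_tokens=False))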

Could you help me solve this issue?
Thanks for your attention.


Maybe this is an unresolved issue? Or maybe it hasn't been made into an issue on the transformers GitHub yet.