Hello everyone!
So, for an experiment of mine I wanted to train a CausalLM such as meta-llama/Llama-3.2-1B-Instruct from scratch on a machine translation task, e.g. from en to it.
I imagined the task to be something like this:
<TARGET_LANGUAGE_CODE> <START_SYMBOL_source> source sentence <END_SYMBOL_SOURCE> <START_SYMBOL_TARGET> target sentence <END_SYMBOL_TARGET>
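For example (with a made-up sentence pair), a serialized training sample would look like:
it <START_SYMBOL_source> The weather is nice today. <END_SYMBOL_SOURCE> <START_SYMBOL_TARGET> Oggi il tempo è bello. <END_SYMBOL_TARGET>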
Unfortunately, after training the model and testing it on new sentences, what I get as output is the exact sentence I gave as input.
After preprocessing the data, I loaded the tokenizer and added the special tokens:
from transformers import AutoTokenizer

model_path = "meta-llama/Llama-3.2-1B-Instruct"
TARGET_LANGUAGE_CODE = "it"
START_SYMBOL_SOURCE = "<START_SYMBOL_source>"
END_SYMBOL_SOURCE = "<END_SYMBOL_SOURCE>"
START_SYMBOL_TARGET = "<START_SYMBOL_TARGET>"
END_SYMBOL_TARGET = "<END_SYMBOL_TARGET>"

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define special tokens
special_tokens = {
    "additional_special_tokens": [
        START_SYMBOL_SOURCE, END_SYMBOL_SOURCE, START_SYMBOL_TARGET, END_SYMBOL_TARGET
    ]
}

# Add a padding token if not already present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Add the special tokens to the tokenizer
tokenizer.add_special_tokens(special_tokens)
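(Just as a sanity check, not part of the original pipeline: something like this should confirm that each marker is registered as a single token id; the exact ids depend on the base vocabulary.)

for tok in special_tokens["additional_special_tokens"]:
    print(tok, tokenizer.convert_tokens_to_ids(tok))  # each marker should map to one new id
print("Vocab size after adding tokens:", len(tokenizer))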
def process(example):
    source = example['Source_clean']
    target = example['Target_clean']
    input_text = f"{TARGET_LANGUAGE_CODE} {START_SYMBOL_SOURCE} {source} {END_SYMBOL_SOURCE}"
    target_text = f"{START_SYMBOL_TARGET} {target} {END_SYMBOL_TARGET}"
    model_input = tokenizer(
        input_text,
        text_target=target_text,
        truncation=True,
        max_length=96,
        padding="max_length"  # Ensure consistent length
    )
    return model_input

# process() formats one example at a time, so batched=True must not be used here
tokenized_dataset = dataset.map(process, remove_columns=["Source_clean", "Target_clean"])
tokenized_dataset
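(To double-check what the model actually sees, one can decode a processed example; this is just an illustrative check and assumes the map above ran per example:)

sample = tokenized_dataset['train'][0]
print(tokenizer.decode(sample['input_ids']))   # source prompt with markers (plus padding)
print(tokenizer.decode(sample['labels']))      # target side with markers (plus padding)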
Then I set up the model without pre-trained weights:
from transformers import AutoConfig, AutoModelForCausalLM, DataCollatorForSeq2Seq

config = AutoConfig.from_pretrained(model_path)
config.num_hidden_layers = 6  # Shrink the architecture to 6 hidden layers (weights are randomly initialised)

model = AutoModelForCausalLM.from_config(config)
# Resize the model's token embeddings to accommodate the new tokens
model.resize_token_embeddings(len(tokenizer))

data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)
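(Purely as a quick size check of the randomly initialised 6-layer model, not part of the training logic:)

# Rough parameter count of the shrunken model
print(f"Model parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")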
from transformers import TrainingArguments

# Set training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/llama-translation2",
    eval_strategy="steps",
    eval_steps=100,
    #save_steps=20,
    logging_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=64,
    gradient_checkpointing=True,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=True,  # Mixed precision for faster training
    push_to_hub=False,
    max_grad_norm=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)
import gc
import torch
from bitsandbytes.optim import Adam8bit
from transformers import Trainer, TrainerCallback

class ClearMemoryCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Clearing GPU memory after epoch {state.epoch}...")
        torch.cuda.empty_cache()
        gc.collect()

optimizer = Adam8bit(model.parameters(), lr=5e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    callbacks=[ClearMemoryCallback()]
)

trainer.train()
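After training I save the final checkpoint so it can be reloaded for testing; roughly like this (the exact call may differ):

# Save model and tokenizer to the folder used in the test script below
trainer.save_model("/content/drive/MyDrive/llama-translation2/final_model")
tokenizer.save_pretrained("/content/drive/MyDrive/llama-translation2/final_model")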
And I test the model by using:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Path to the saved model folder
MODEL_PATH = "/content/drive/MyDrive/llama-translation2/final_model"

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH)

# Define the translation function
def translate_sentence(sentence, model, tokenizer, max_length=512):
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculations for testing
        # Format the input for translation
        input_text = f"{TARGET_LANGUAGE_CODE} {START_SYMBOL_SOURCE} {sentence} {END_SYMBOL_SOURCE}"
        inputs = tokenizer(input_text, return_tensors="pt")

        # Generate translation
        outputs = model.generate(
            **inputs,
            max_length=max_length,
            num_beams=5,  # Use beam search for better translations
            early_stopping=True
        )

        # Decode the output
        translated_sentence = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return translated_sentence

# Test the model with a sample sentence
sample_sentence = "Hello, how are you?"
translated_sentence = translate_sentence(sample_sentence, model, tokenizer)
print(f"Translated Sentence: {translated_sentence}")
But unfortunately the translated sentence is still “Hello, how are you?”
Is there something I might not have considered?
Or is it just a matter of not having trained on enough data? (As already said, this is an experiment.)
Thanks for your attention!