For an experiment of mine, I am trying to train a causal LM from scratch, specifically Qwen/Qwen2.5-0.5B-Instruct, for a machine translation task.
Since this is an experiment and I am aware that achieving good performance would require a great amount of both time and resources, I decided to use only around 25% of my data (around 30,000 observations) as the training set.
My idea was to train the model to solve the following task specifically:
<TARGET_LANGUAGE_CODE> <START_SYMBOL_source> source sentence <END_SYMBOL_SOURCE> <START_SYMBOL_TARGET> target sentence <END_SYMBOL_TARGET>.
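For example, a single training sequence would look like this (the sentence pair below is made up purely to illustrate the format, with `<TARGET_LANGUAGE_CODE>` standing for the actual target-language tag):

```python
# Hypothetical English→Italian pair, only to illustrate the training format
example_sequence = (
    "<TARGET_LANGUAGE_CODE> "
    "<START_SYMBOL_source> The cat sleeps on the sofa. <END_SYMBOL_SOURCE> "
    "<START_SYMBOL_TARGET> Il gatto dorme sul divano. <END_SYMBOL_TARGET>"
)
```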
Unfortunately, after training the model, giving it new sentences as input just returns the same source sentence plus random words if max_length hasn't been reached yet.
What I would like to know is whether this is simply caused by the model not being trained on enough data, or whether there is an error in my code.
After performing the data preprocessing, I transform the data using:
model = "Qwen/Qwen2.5-0.5B-Instruct"```
def preprocess(example):
source = example['Source_clean']
target = example['Target_clean']
# Combine source and target into a single sequence
sequence = (
f"<START_SYMBOL_source> {source} <END_SYMBOL_SOURCE> "
f"<START_SYMBOL_TARGET> {target} <END_SYMBOL_TARGET>"
)
return {"sequence": sequence}
# Apply preprocessing
dataset = dataset.map(preprocess)
Then I add the special tokens to the tokenizer:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_special_tokens({
    'additional_special_tokens': [
        '<START_SYMBOL_source>',
        '<END_SYMBOL_SOURCE>',
        '<START_SYMBOL_TARGET>',
        '<END_SYMBOL_TARGET>'
    ]
})
```
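As a quick sanity check (the printed values are only indicative, they depend on the tokenizer), I verify that the markers were registered as single tokens:

```python
# Sanity check: each marker should now be a single token, not split into sub-word pieces
print(len(tokenizer))                               # vocabulary size after adding the markers
print(tokenizer.tokenize("<START_SYMBOL_TARGET>"))  # expected: ['<START_SYMBOL_TARGET>']
print(tokenizer.convert_tokens_to_ids("<END_SYMBOL_TARGET>"))  # id assigned to the new token
```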
Then I apply the tokenization:

```python
def tokenize_function(example):
    # Tokenize the combined sequence with the updated tokenizer
    sequence = example['sequence']
    tokenized = tokenizer(sequence, padding='max_length', truncation=True, max_length=96)  # max_length can be adjusted
    return tokenized

# Apply tokenization and drop the raw text columns
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['Source_clean', 'Target_clean', 'sequence']
)
```
This way, input_ids and attention_mask are the only two features left in the data.
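(Assuming the train/test split used below, this is how I check it:)

```python
# Only the tokenizer outputs should remain as columns after the map
print(tokenized_dataset['train'].column_names)  # e.g. ['input_ids', 'attention_mask']
```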
Model Setup
```python
from transformers import AutoConfig, AutoModelForCausalLM

# Model setup: instantiate the architecture from the config, i.e. with randomly initialized weights
config = AutoConfig.from_pretrained(model_name)
# config.num_hidden_layers = 12
config.is_decoder = True             # set the model up for causal language modeling
config.add_cross_attention = False   # only relevant for encoder-decoder models
model = AutoModelForCausalLM.from_config(config)
```
Callbacks
```python
import gc
import logging

import torch
from transformers import TrainerCallback

class ClearMemoryCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, **kwargs):
        print(f"Clearing GPU memory after epoch {state.epoch}...")
        torch.cuda.empty_cache()
        gc.collect()

class StateCallback(TrainerCallback):
    def on_epoch_end(self, args, state, control, metrics=None, **kwargs):
        logging.info(f"Epoch {state.epoch} ended.")
        if metrics:
            logging.info(f"Metrics: {metrics}")
```
```python
from transformers import DataCollatorForLanguageModeling, TrainingArguments

# Set data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # causal LM objective, no masked LM
    pad_to_multiple_of=8
)

# Set training arguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/qwen-translation2/checkpoint",
    eval_strategy="steps",
    eval_steps=500,
    save_steps=500,
    logging_steps=100,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=64,
    gradient_checkpointing=True,
    num_train_epochs=3,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    save_total_limit=3,
    fp16=True,  # mixed precision for faster training
    push_to_hub=False,
    max_grad_norm=0.1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
)
```
```python
from bitsandbytes.optim import Adam8bit
from transformers import Trainer

model.gradient_checkpointing_enable()

# Training setup
optimizer = Adam8bit(model.parameters(), lr=5e-5)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
    callbacks=[ClearMemoryCallback()]
)

# Move the model to the GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.config.use_cache = False  # caching is incompatible with gradient checkpointing

logging.info("Starting training...")
trainer.train()
logging.info("Finished training.")
```
After training, if I don't set save_safetensors=False, the training stops just before the end of the last epoch and I get the following warning:
There were missing keys in the checkpoint model loaded: ['lm_head.weight']
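For clarity, this is the flag I mean (set in TrainingArguments; every other argument stays exactly as above):

```python
from transformers import TrainingArguments

# Work-around I currently use: save checkpoints as .bin instead of safetensors
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/qwen-translation2/checkpoint",
    save_safetensors=False,  # with the default (True) I get the missing 'lm_head.weight' warning
    # ... all other arguments unchanged ...
)
```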
Model Testing
```python
folder = "/content/drive/MyDrive/qwen-translation2/checkpoint"

# Load the trained model and tokenizer from the checkpoint folder
trained_model = AutoModelForCausalLM.from_pretrained(folder)
trained_tokenizer = AutoTokenizer.from_pretrained(folder)

# Set the model to evaluation mode
trained_model.eval()

prompt = "Once upon a time in a faraway land"

# Ensure that the attention mask and pad_token_id are set correctly
inputs = trained_tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

# Generate, passing the attention mask explicitly
outputs = trained_model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    max_length=96,               # maximum length of the generated text
    early_stopping=True,
    num_return_sequences=1,      # generate a single output
    no_repeat_ngram_size=2,      # prevent repeating n-grams
    temperature=0.7,             # sampling temperature (lower is more deterministic)
    top_p=0.9,                   # nucleus sampling (controls diversity)
    top_k=50,                    # limit sampling to the top 50 tokens
    do_sample=True,              # enable sampling
    pad_token_id=trained_model.config.pad_token_id  # explicitly set pad_token_id
)

generated_text = trained_tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)
```
Could you help me solve this issue?
Thank you for your attention.