Hi everyone,
I’m fine-tuning a LLaMA-based model (universitytehran/PersianMind-v1.0) with QLoRA and BitsAndBytes in 4-bit precision on a Kaggle T4 GPU. The task is converting informal Persian text to formal text using the ParsMap dataset (about 40,000 training records), and a full fine-tuning run takes roughly 75 hours.
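For context, here is the rough arithmetic behind that figure, assuming the 90/10 split and effective batch size of 16 configured in the code below (approximate numbers, just to frame the question):

# Back-of-envelope step count for the run described above (illustrative only).
train_rows = int(0.9 * 40_000)              # ~36,000 examples after the 90/10 split
steps_per_epoch = train_rows // 16          # effective batch size 16 -> ~2,250 steps
total_steps = steps_per_epoch * 3           # 3 epochs -> ~6,750 optimizer steps
seconds_per_step = 75 * 3600 / total_steps  # ~40 s per optimizer step on the T4
print(steps_per_epoch, total_steps, round(seconds_per_step, 1))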
Here is my code:
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,
DataCollatorForLanguageModeling, Trainer, TrainingArguments)

base_model_id = "universitytehran/PersianMind-v1.0"
compute_dtype = torch.bfloat16 if torch.cuda.get_device_capability(0)[0] >= 8 else torch.float16
print("Compute dtype:", compute_dtype)
def safe_str(x):
    return "" if x is None or (isinstance(x, float) and np.isnan(x)) else str(x)
df = df_parsmap.copy()  # df_parsmap: the ParsMap data, loaded earlier as a pandas DataFrame
df = df.dropna(subset=["inFormalForm","formalForm"]) # keep only rows with both sides
def make_text(row):
    informal = safe_str(row["inFormalForm"])
    formal = safe_str(row["formalForm"])
    return f"<s><|startoftext|>[Informal]{informal}[Formal]{formal}<|endoftext|>"
df["text"] = df.apply(make_text, axis=1)
perm = np.random.permutation(len(df))
cut = int(0.9*len(df))
train_df = df.iloc[perm[:cut]].reset_index(drop=True)
val_df = df.iloc[perm[cut:]].reset_index(drop=True)
ds = DatasetDict({
"train": Dataset.from_pandas(train_df[["text"]]),
"validation": Dataset.from_pandas(val_df[["text"]]),
})
len(ds["train"]), len(ds["validation"])
tokenizer = AutoTokenizer.from_pretrained(base_model_id, use_fast=True, trust_remote_code=True)
specials = {
"bos_token": "<s>",
"eos_token": "</s>",
"pad_token": "<pad>",
}
for k, v in specials.items():
    if getattr(tokenizer, k, None) != v:
        tokenizer.add_special_tokens({k: v})
added = tokenizer.add_tokens(["<|startoftext|>", "<|endoftext|>", "[Informal]", "[Formal]", "<sep>"], special_tokens=True)
print("Added new tokens:", added)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
bnb_4bit_use_double_quant=True,
bnb_4bit_compute_dtype=compute_dtype,
)
model = AutoModelForCausalLM.from_pretrained(
base_model_id,
trust_remote_code=True,
quantization_config=bnb_config,
device_map="auto",
)
model.resize_token_embeddings(len(tokenizer))
model = prepare_model_for_kbit_training(model)
model.config.use_cache = False
lora_config = LoraConfig(
r=16, lora_alpha=32, lora_dropout=0.1, bias="none", task_type="CAUSAL_LM",
target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_enable()
# quick param report
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable: {trainable:,} / Total: {total:,} ({100*trainable/total:.2f}%)")
max_length = 128
def tokenize_batch(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
tokenized = ds.map(tokenize_batch, batched=True, remove_columns=ds["train"].column_names)
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
effective_bs = 16
per_device_train_bs = 2
per_device_eval_bs = 2
grad_accum = max(1, effective_bs // per_device_train_bs)
epochs = 3
args = TrainingArguments(
output_dir="./persianmind-formalizer-lora",
num_train_epochs=epochs,
per_device_train_batch_size=per_device_train_bs,
per_device_eval_batch_size=per_device_eval_bs,
gradient_accumulation_steps=grad_accum,
learning_rate=1e-5,
warmup_ratio=0.03,
lr_scheduler_type="cosine",
weight_decay=0.0,
logging_steps=50,
eva_strategy="steps",
eval_steps=2000,
save_strategy="epoch",
save_total_limit=2,
load_best_model_at_end=True,
bf16=(compute_dtype==torch.bfloat16),
fp16=(compute_dtype==torch.float16),
optim="paged_adamw_8bit",
gradient_checkpointing=True,
gradient_checkpointing_kwargs={"use_reentrant": False},
dataloader_num_workers=4,
dataloader_pin_memory=True,
dataloader_persistent_workers=True,
group_by_length=True,
tf32=(compute_dtype == torch.bfloat16),  # TF32 needs Ampere or newer; a T4 (sm_75) does not support it
report_to="none",
)
trainer = Trainer(
model=model,
args=args,
train_dataset=tokenized["train"],
eval_dataset=tokenized["validation"],
data_collator=collator,
tokenizer=tokenizer,
)
trainer.train()
Any insights or references to similar cases would be greatly appreciated!
Thanks in advance.