I have been fine-tuning the IndicBART model for Hindi text summarisation, and there is not even a slight change over 10 epochs. I have no idea what's wrong. Can someone guide me?
Here’s my training code:
from tqdm.auto import tqdm
import torch
import numpy as np
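
# (Setup omitted from my original snippet; roughly what I have, as a sketch.
# The exact names and values here are assumptions, not my real config:)
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)
num_train_epochs = 10
num_training_steps = num_train_epochs * len(train_dataloader)
record = []  # per-step training losses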
progress_bar = tqdm(range(num_training_steps))

for epoch in range(num_train_epochs):
    # Training
    model.train()
    for step, batch in enumerate(train_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        record.append(loss.item())  # .item() stores a plain float instead of a tensor that keeps the graph alive
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)
    # Evaluation
    model.eval()
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            generated_tokens = accelerator.unwrap_model(model).generate(
                batch["input_ids"],
                attention_mask=batch["attention_mask"],
                min_length=15,  # increased min length
                max_length=512,
                top_k=50,  # note: has no effect unless do_sample=True
                num_beams=5,  # use beam search
                repetition_penalty=2.0,  # increased repetition penalty
                length_penalty=1.0,
                early_stopping=True,
                pad_token_id=pad_id,
                bos_token_id=bos_id,
                eos_token_id=eos_id,
                decoder_start_token_id=tokenizer._convert_token_to_id_with_added_voc("<2hi>"),
            )

        generated_tokens = accelerator.pad_across_processes(
            generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
        )
        # If we did not pad to max length, we need to pad the labels too
        labels = accelerator.pad_across_processes(
            batch["labels"], dim=1, pad_index=tokenizer.pad_token_id
        )
        generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
        labels = accelerator.gather(labels).cpu().numpy()

        # Replace -100 in the labels as we can't decode them
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        if isinstance(generated_tokens, tuple):
            generated_tokens = generated_tokens[0]

        decoded_preds = tokenizer.batch_decode(
            generated_tokens, skip_special_tokens=True
        )
        decoded_labels = tokenizer.batch_decode(
            labels, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        decoded_preds, decoded_labels = postprocess_text(
            decoded_preds, decoded_labels
        )
        rouge_score.add_batch(predictions=decoded_preds, references=decoded_labels)

    # Compute metrics for this epoch
    result = rouge_score.compute()
    # Convert scores to percentages
    result = {key: value * 100 for key, value in result.items()}
    result = {k: round(v, 4) for k, v in result.items()}
    # Note: `loss` here is only the last training batch's loss
    print(f"Epoch {epoch}:", result, f"Loss: {loss.item():.4f}")
    # Save and upload (currently disabled)
    """accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process:
        tokenizer.save_pretrained(output_dir)
        repo.push_to_hub(
            commit_message=f"Training in progress epoch {epoch}", blocking=False
        )"""
And the post-processing function applied before computing the ROUGE scores:
import indicnlp.tokenize.sentence_tokenize as tok

def postprocess_text(preds, labels):
    # rougeLsum expects newline-separated sentences, so split each text
    # into sentences with the Indic NLP sentence splitter
    preds = [
        "\n".join(tok.sentence_split(pred.strip(), lang='hi', delim_pat='auto'))
        for pred in preds
    ]
    labels = [
        "\n".join(tok.sentence_split(label.strip(), lang='hi', delim_pat='auto'))
        for label in labels
    ]
    return preds, labels
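For a quick sanity check of the splitter (the Hindi sample below is just illustrative):

sample_preds = ["यह पहला वाक्य है। यह दूसरा वाक्य है।"]
sample_labels = ["यह एक संदर्भ सारांश है।"]
print(postprocess_text(sample_preds, sample_labels))
# The prediction should come back as two newline-separated sentences.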