Hi, I'm fine-tuning the model "pszemraj/led-large-book-summary" for text summarization.
The Trainer gives no response when training with compute_metrics set up for the ROUGE score: it completes the first training steps and then gets stuck at the evaluation step. I'm looking for your help, since I'm stuck on this.
If compute_metrics is not set, the model trains without issues.
Initially it gave a CUDA out-of-memory error, which the solutions listed below fixed:
`OutOfMemoryError: CUDA out of memory. Tried to allocate 8.00 GiB. GPU 0 has a total capacity of 16.00 GiB of which 0 bytes is free. Of the allocated memory 25.41 GiB is allocated by PyTorch, and 812.29 MiB is reserved by PyTorch but unallocated.`
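My understanding of the original OOM (an assumption on my part, not something the trace states): without preprocess_logits_for_metrics, the evaluation loop keeps the full per-token logits around for the metric, and at this context size a single sample's logits are already enormous:

# back-of-the-envelope: fp16 logits kept for metrics, per eval sample
# (50265 is the BART/LED vocab size; assumed for this checkpoint)
seq_len, vocab_size, fp16_bytes = 16384, 50265, 2
print(seq_len * vocab_size * fp16_bytes / 1024**3)  # ≈ 1.53 GiB per sample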
Solutions tried:
- set predict_with_generate=True.
- reduced per_device_eval_batch_size to 1.
- reduced the eval dataset to just 2 samples.
- set eval_accumulation_steps to 1.
- added a preprocess_logits_for_metrics function.
Observations:
I added a print statement to the compute_metrics function, but it is never reached during training.
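To narrow down where it hangs, I am thinking of attaching a debugging callback (a minimal sketch; the class name is made up, but on_prediction_step is a real TrainerCallback hook) to tell apart "the generation loop itself stalls" from "it hangs before compute_metrics is ever called":

from transformers import TrainerCallback

class EvalProgressCallback(TrainerCallback):
    """Hypothetical debugging helper: prints on every prediction step of
    the evaluation loop, so a silent hang can be located."""
    def on_prediction_step(self, args, state, control, **kwargs):
        print("eval prediction step reached", flush=True)

# would be passed to the trainer via callbacks=[EvalProgressCallback()]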
Here are some related previous issues I tried, but they didn't work:
- CUDA out of memory when using Trainer with compute_metrics
- CUDA out of memory only during validation not training
- Cuda out of memory during evaluation but training is fine
- python - OutOfMemoryError: CUDA out of memory while using compute_metrics function in Hugging Face Trainer - Stack Overflow
- Trainer doesn't call compute_metrics during evaluation
- Trainer crashes during predict and with compute_metrics
- Adding compute_metrics produces Cuda OutOfMemoryError
Versions
- CUDA = 12.4
- transformers = 4.44.2
- OS = Windows 11
- GPU = NVIDIA RTX 3080 16GB
Features of the dataset
features: ['document_content', 'golden_summary', 'input_ids', 'attention_mask', 'global_attention_mask', 'labels']
NOTE: in addition to the "attention_mask", there is also a "global_attention_mask", which is needed for LED models.
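For reference, the global_attention_mask column follows the usual LED convention of global attention on the first token. A minimal sketch of how such a column can be built (the helper name is illustrative, not my exact preprocessing):

def add_global_attention_mask(batch):
    # LED convention: global attention on the first (<s>) token, local elsewhere
    batch["global_attention_mask"] = [
        [1] + [0] * (len(ids) - 1) for ids in batch["input_ids"]
    ]
    return batch

# e.g. tokenized_dataset = tokenized_dataset.map(add_global_attention_mask, batched=True)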
Here is the fine-tuning code:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training

context_size = 16384
max_summary_length = 512
model_id = "pszemraj/led-large-book-summary"
output_dir = "led-large-fine-tuned"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    padding_side="right",
    add_eos_token=True,
    add_bos_token=True,
    use_fast=False,
)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = context_size
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
)
device_map = {"": 0}
original_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_id,
    device_map=device_map,
    quantization_config=bnb_config,
    trust_remote_code=True,
    use_auth_token=True,
    use_cache=False,
)
original_model.config.max_position_embeddings = context_size
original_model = prepare_model_for_kbit_training(original_model)
peft_config = LoraConfig(
    lora_alpha=128,
    lora_dropout=0.05,
    r=256,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    task_type=TaskType.SEQ_2_SEQ_LM,
)
original_model.gradient_checkpointing_enable()
peft_model = get_peft_model(original_model, peft_config)

# generation settings picked up by generate() during evaluation
peft_model.config.num_beams = 2
peft_model.config.length_penalty = 2.0
peft_model.config.early_stopping = True
peft_model.config.no_repeat_ngram_size = 3
tokenized_train_dataset = tokenized_dataset["train"]
tokenized_val_dataset = tokenized_dataset["validation"]

# labels are padded with -100 so padding is ignored by the loss
label_pad_token_id = -100
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=peft_model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)
import evaluate
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize
# nltk.download("punkt")
# Metric
metric = evaluate.load("rouge")
# helper function to postprocess text
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    # rougeLSum expects a newline after each sentence
    preds = ["\n".join(sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(sent_tokenize(label)) for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    result = {k: round(v * 100, 4) for k, v in result.items()}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result
def preprocess_logits_for_metrics(logits, labels):
    """
    The original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels
batch_size = 1
logging_steps = 5
no_of_epochs = 1

peft_training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    learning_rate=2e-4,
    output_dir=output_dir,
    logging_steps=logging_steps,
    eval_steps=25,
    save_steps=25,
    warmup_steps=2,
    gradient_accumulation_steps=4,
    eval_accumulation_steps=1,
    num_train_epochs=no_of_epochs,
    predict_with_generate=True,
    gradient_checkpointing=True,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir=f"{output_dir}/logs",
    report_to="tensorboard",
    group_by_length=True,
    push_to_hub=False,
    hub_strategy="every_save",
    hub_model_id=output_dir,
)
peft_model.config.use_cache = False

peft_trainer = Seq2SeqTrainer(
    # peft_trainer = Trainer(
    model=peft_model,
    tokenizer=tokenizer,
    args=peft_training_args,
    data_collator=data_collator,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
)

peft_trainer.train()
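One variant I have not ruled out yet (a sketch, not something I have confirmed for this setup): with predict_with_generate=True the evaluation loop calls model.generate() using the model-config settings above, so capping generation explicitly might matter. generation_max_length and generation_num_beams are existing Seq2SeqTrainingArguments parameters; the values below are just guesses based on my max_summary_length:

# hypothetical cheaper-eval variant of the arguments above
peft_training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    predict_with_generate=True,
    generation_max_length=max_summary_length,  # 512 instead of the model default
    generation_num_beams=1,                    # greedy decoding during eval
    # ... all remaining arguments as above ...
)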