Training loss is zero from the first step and model generation is empty after training?

Sorry for the long message with lots of code. Thanks for your help! In the code below, I am trying to fine-tune the model “flan-t5-base” to generate a question from a context and an answer. It is a kind of PEFT tuning that combines prompt tuning and LoRA.

#loading the model and the tokenizer:

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name_or_path = "google/flan-t5-base"
G_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    device_map='auto',
)
G_tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
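
As a quick optional check (plain PyTorch, nothing extra needed), I also print the dtype the weights actually loaded in, since I pass torch_dtype=torch.float16 above:

# Optional sanity check: confirm the dtype the base model weights were loaded in
print(next(G_model.parameters()).dtype)  # expecting torch.float16 given the load call above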

#freezing both the embeddings and the model weights:

import torch.nn as nn
# Freeze original model weights
for param in G_model.parameters():
  param.requires_grad = False

# freeze the embedding layer
for name, param in G_model.named_parameters():
    if "embeddings" in name:
        param.requires_grad = False

G_model.gradient_checkpointing_enable()
G_model.enable_input_require_grads()

class CastOutputToFloat(nn.Sequential):
  def forward(self, x): return super().forward(x).to(torch.float16)
G_model.lm_head = CastOutputToFloat(G_model.lm_head)

#prompt tuning & LoRA configurations:

from peft import PromptTuningConfig, TaskType, PromptTuningInit, get_peft_model
prompt_config = PromptTuningConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    num_virtual_tokens=60,
    prompt_tuning_init=PromptTuningInit.TEXT,
    prompt_tuning_init_text="Generate a question for this answer:",
    tokenizer_name_or_path=model_name_or_path
)

P_model = get_peft_model(G_model, prompt_config)
from peft import LoraConfig
Lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM
)

L_model = get_peft_model(P_model, Lora_config)
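
As an optional sanity check (print_trainable_parameters is the standard PEFT helper, as far as I know), I verify that some parameters are actually trainable after stacking the two adapters:

# Optional: confirm some parameters are trainable after prompt tuning + LoRA wrapping
L_model.print_trainable_parameters()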

#preparing and preprocessing the data

from transformers import DataCollatorForSeq2Seq
from datasets import Dataset, load_dataset

# Load SQuAD dataset
train_squad = load_dataset("squad")["train"].shuffle(seed=42).select(range(5000)).remove_columns(["id", "title"])
test_squad = load_dataset("squad")["validation"].shuffle(seed=42).select(range(5000, 6000)).remove_columns(["id", "title"])

def preprocess_function(examples):
    # SQuAD keeps the gold answers under the "answers" key; build "context [SEP] answer" inputs
    if "answers" in examples:
        input_text = [ctx + " [SEP] " + ans["text"][0] for ctx, ans in zip(examples["context"], examples["answers"])]
    else:
        input_text = examples["context"]
    model_inputs = G_tokenizer(input_text, padding="longest", truncation=True)
    if "question" in examples:
        model_inputs["labels"] = G_tokenizer(examples["question"], padding="longest", truncation=True)["input_ids"]
    return model_inputs
# Tokenize datasets
train_data = train_squad.map(preprocess_function, batched=True, remove_columns=["context", "answers", "question"])
test_data = test_squad.map(preprocess_function, batched=True, remove_columns=["context", "answers", "question"])
data_collator = DataCollatorForSeq2Seq(G_tokenizer, model=L_model)
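
To double-check the preprocessing, I decode one tokenized example back to text (optional, just for inspection):

# Optional: decode one preprocessed example to verify inputs and labels look as intended
sample = train_data[0]
print(G_tokenizer.decode(sample["input_ids"], skip_special_tokens=True))
print(G_tokenizer.decode(sample["labels"], skip_special_tokens=True))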

#start the training

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    per_device_train_batch_size=32,
    gradient_accumulation_steps=32,
    warmup_steps=10,
    num_train_epochs=100,
    learning_rate=5e-3,
    weight_decay=0.01,
    logging_steps=1,
    output_dir='outputs',
    logging_dir='logs',
)

trainer = Trainer(
    model=L_model,
    train_dataset=train_data,
    args=training_args,
    data_collator=data_collator,
)

# Additional configuration
L_model.config.use_cache = False

# Start training
trainer.train()
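
To debug the zero loss, one probe I can run right after training (assuming the PEFT-wrapped model accepts a collated batch the same way the Trainer feeds it during training) is to push a single batch through the model and print the loss directly:

# Optional debug: run one collated batch through the model and inspect the loss directly
debug_batch = data_collator([train_data[i] for i in range(4)])
debug_batch = {k: v.to(L_model.device) for k, v in debug_batch.items()}
with torch.no_grad():
    debug_out = L_model(**debug_batch)
print(debug_out.loss)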

#generate questions

import torch
# Move input data to the same device as the model (GPU)
device = L_model.device
print(device)
input_ids_tensor = torch.tensor(test_data["input_ids"]).to(device)
attention_mask_tensor = torch.tensor(test_data["attention_mask"]).to(device)

# Generate sequences
generated_sequences = L_model.generate(
    input_ids=input_ids_tensor,
    attention_mask=attention_mask_tensor,
    max_length=64,
    early_stopping=True,
    num_beams=5,
    num_return_sequences=1,
)
# Decode the generated sequences
decoded_sequences = []
for generated_sequence in generated_sequences:
    generated_question = G_tokenizer.decode(generated_sequence, skip_special_tokens=True)
    decoded_sequences.append(generated_question)

# Print or inspect the decoded sequences
for decoded_sequence in decoded_sequences:
    print(decoded_sequence)
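
For a quick side-by-side look (optional), I also compare a few generated questions against the reference questions kept in test_squad:

# Optional: compare a few generated questions with the reference questions
for generated, reference in zip(decoded_sequences[:5], test_squad["question"][:5]):
    print("generated:", generated)
    print("reference:", reference)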

Again, thanks for your time and for reading.