Hello,
I am trying to fine-tune a t5-base model to generate an appropriate question for each compliance item. Compliance items are paragraphs of text, and my target questions are essentially those paragraphs rephrased in the past tense as questions. I have trained the model, saved it, and loaded it back for future use cases.
The problem is that when I use the model to generate questions for unseen statements, the response comes back incomplete.
Code:
import pandas as pd
import torch
from datasets import Dataset
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, T5Tokenizer
df = pd.read_csv(r'/content/questionsgenerator.csv', encoding='unicode_escape')
df.head()
# Load pre-trained model and tokenizer
model_name = "t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./output_dir",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    logging_steps=100,
    save_steps=5000,
    eval_steps=5000,
    num_train_epochs=3,
    learning_rate=1e-4,
    warmup_steps=1000,
    save_total_limit=3,
)
# Define the training dataset
train_dataset = Dataset.from_pandas(df.rename(columns={"Compliance Item": "input_text", "Question": "target_text"}))
# Define the function to preprocess the dataset
def preprocess_function(examples):
    inputs = [f"compliance item: {ci}" for ci in examples["input_text"]]
    targets = [f"{question} </s>" for question in examples["target_text"]]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=32, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs
# Preprocess the dataset
train_dataset = train_dataset.map(preprocess_function, batched=True)
# Define the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
# Fine-tune the model on the dataset
trainer.train()
model.save_pretrained("./fine_tuned_model_question_generation")
tokenizer = T5Tokenizer.from_pretrained("t5-large")
model = transformers.AutoModelForSeq2SeqLM.from_pretrained("./fine_tuned_model_question_generation")
context = 'When the Installment Due Date falls on a non-business day, the Mortgagee must consider a Borrower’s Notice of Intent to Prepay or the receipt of the prepayment amount for a Mortgage closed before January 21, 2015 timely if received on the next business day.'
encoding = tokenizer.encode_plus(context, return_tensors="pt")
input_ids = encoding["input_ids"]
attention_mask = encoding["attention_mask"]
output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=1000)
decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
decoded_output
Here the response is:
When the Installment Due Date fell on a non-business day, was the Borrower’s Notice of Intent to Prepay or the receipt of the prepayment amount for
which is cut off mid-sentence.
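One thing I am not sure about is whether the max_length=32 I use for the labels is truncating some of my target questions during preprocessing. A quick check I could run (just a sketch, reusing the tokenizer and dataframe from the code above):
# Sketch: count how many target questions are longer than the 32-token label limit
target_token_lengths = [len(tokenizer(str(q)).input_ids) for q in df["Question"]]
print("longest target (tokens):", max(target_token_lengths))
print("targets over 32 tokens:", sum(length > 32 for length in target_token_lengths))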
So my question is: what do I need to do to get the complete question as output?
- Should I increase the number of training epochs?
- Or is there a better model for this task?
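- Or should I be changing the generation settings rather than the training setup? Below is a sketch of the kind of call I am considering; min_length, num_beams, and no_repeat_ngram_size are values I am only guessing at, not something I have verified:
# Sketch: generation settings I am considering trying (values are guesses)
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=128,          # cap on the number of generated tokens
    min_length=10,           # keep generating at least this many tokens
    num_beams=4,             # beam search instead of greedy decoding
    no_repeat_ngram_size=3,  # avoid repeating the same phrase
    early_stopping=True,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))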
Please help with this.