Hi everyone, after fine-tuning a T5 model I wrote a script for inference:
def generate_summary(text, max_length, temperature, num_beams=4, min_length=None):
    """Generate an abstractive summary for *text* with the fine-tuned T5 model.

    Args:
        text: Input document (string) to summarize.
        max_length: Hard cap on the number of decoder tokens. NOTE: generation
            stops at EOS *or* at this cap, whichever comes first — if the model
            has not emitted EOS by ``max_length`` tokens, the output is simply
            truncated mid-sentence. The cap cannot make the model wrap up
            early; use ``min_length`` / length penalty to steer length instead.
        temperature: Sampling temperature (only meaningful because
            ``do_sample=True`` below; higher = more random).
        num_beams: Beam count for beam-sample decoding (default 4, as before).
        min_length: Optional lower bound on generated length; ``None`` keeps
            the library default.

    Returns:
        The decoded summary string (special tokens stripped).
    """
    # Encoder input is truncated/padded to the same 1024-token budget used
    # when building the training dataset.
    inputs = tokenizer(
        text=text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=1024,
    ).to(device)

    gen_kwargs = dict(
        max_length=max_length,
        temperature=temperature,
        num_beams=num_beams,
        no_repeat_ngram_size=2,
        early_stopping=True,  # only has an effect when num_beams > 1
        do_sample=True,       # combined with num_beams this is beam-sampling
    )
    if min_length is not None:
        gen_kwargs["min_length"] = min_length

    outputs = model.generate(**inputs, **gen_kwargs)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
Here in the generate_summary function, I pass three arguments: text for the input context, temperature to control randomness, and max_length to control the output length. However, I noticed that while the temperature works well, max_length does not: the text is simply cut off (truncated) when the maximum length is reached.
Am I doing something wrong here? The goal is to tell the model to generate the output within the pre-set length.
FYI, in the fine-tuning process, I created a custom Dataset class:
# LOAD AND PROCESS THE DATASET
class CustomDataset(Dataset):
    """Length-filtered (context, summary) pairs tokenized for seq2seq fine-tuning.

    Examples whose tokenized context or summary exceed the given caps are
    dropped; the rest are padded to fixed lengths on access.
    """

    def __init__(self, data, context_max_length, summary_max_length):
        """
        Args:
            data: Iterable of dicts with "context" and "summary" string fields.
            context_max_length: Token cap for the encoder input.
            summary_max_length: Token cap for the target summary.
        """
        self.context_max_length = context_max_length
        self.summary_max_length = summary_max_length
        self.tokenizer = AutoTokenizer.from_pretrained(
            pretrained_model_name_or_path=parameters.model_name,
            cache_dir=parameters.model_cache_dir,
        )
        # Keep only examples whose tokenized context AND summary fit the caps.
        self.data = [
            item
            for item in data
            if len(self.tokenizer.encode(item["context"])) <= self.context_max_length
            and len(self.tokenizer.encode(item["summary"])) <= self.summary_max_length
        ]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return input_ids / attention_mask / labels tensors for one example."""
        context_encodings = self.tokenizer(
            self.data[idx]["context"],
            max_length=self.context_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        summary_encodings = self.tokenizer(
            self.data[idx]["summary"],
            max_length=self.summary_max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        labels = summary_encodings["input_ids"].squeeze().clone()
        # Padded label positions must be -100 so the cross-entropy loss ignores
        # them. Without this, the model is trained to keep emitting pad tokens
        # after EOS up to summary_max_length, which hurts its ability to stop
        # naturally (and produce variable-length output) at generation time.
        labels[labels == self.tokenizer.pad_token_id] = -100
        return {
            "input_ids": context_encodings["input_ids"].squeeze(),
            "labels": labels,
            "attention_mask": context_encodings["attention_mask"].squeeze(),
        }
# Load the cached BigSum train/validation splits from disk and wrap them in
# CustomDataset (filters over-length examples, tokenizes and pads on access).
print("Load the dataset")
train_dataset= load_from_disk(f"{parameters.datasets_cache_dir}/big_sum_dataset/train")
validation_dataset= load_from_disk(f"{parameters.datasets_cache_dir}/big_sum_dataset/validation")
# Caps match the inference-time tokenizer budget: 1024 context / 512 summary tokens.
tokenized_train_dataset= CustomDataset(data= train_dataset, context_max_length= 1024, summary_max_length= 512)
tokenized_validation_dataset= CustomDataset(data= validation_dataset, context_max_length= 1024, summary_max_length= 512)
Here I do some preprocessing on the dataset:
- I filtered out all the instances that do not satisfy the pre-set length
- I padded the length of the input context to 1024 and the output summary to 512.
I don’t know whether this custom Dataset is what prevents the model from generating variable-length output.
Thank you.