Hello, everyone.
I was trying to fine-tune the T5 model on the STS-B dataset without task prefixes. However, when I decoded some of the predictions, the output was a sentence instead of a number. I don’t understand why this happened. Any help is appreciated, thanks a lot!
My code is:
from transformers import T5Config, T5ForConditionalGeneration, get_scheduler,T5Tokenizer
from transformers.optimization import Adafactor, AdafactorSchedule
from datasets import load_dataset, load_dataset_builder, load_metric
from tqdm import tqdm
import torch
from torch.utils.data import DataLoader
import time
from dataclasses import dataclass
import numpy as np
def same_seed(seed):
    """Seed PyTorch's random number generators and configure cuDNN.

    Sets the cuDNN ``deterministic`` flag to True and the ``benchmark`` flag
    to False, seeds the default torch generator with *seed*, and, when CUDA
    is available, seeds every CUDA device with the same value.

    Args:
        seed: Integer seed passed to ``torch.manual_seed`` and
            ``torch.cuda.manual_seed_all``.
    """
    cudnn = torch.backends.cudnn
    cudnn.deterministic, cudnn.benchmark = True, False

    torch.manual_seed(seed)

    # Nothing more to seed on a machine without CUDA.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
def _parse_score(text, default=-1.0):
    """Convert decoded text to a float, returning *default* if it is not numeric."""
    try:
        return float(text)
    except ValueError:
        return default


def evaluate_model(model, dataloader, metric, device, tokenizer):
    """Generate predictions for every batch and score them with *metric*.

    For each batch the model generates up to 16 tokens, the generated ids and
    the label ids are decoded to text, both are converted to floats, and the
    pair is accumulated into *metric*.

    A freshly fine-tuned seq2seq model can emit arbitrary text rather than a
    number. Such predictions are mapped to ``-1.0`` instead of raising
    ``ValueError``, so one bad generation does not abort the whole evaluation
    (it still counts against the score).

    Args:
        model: Model exposing ``eval()``, ``to()`` and ``generate()``.
        dataloader: Iterable of dicts holding ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        metric: Object exposing ``add_batch(predictions=, references=)`` and
            ``compute()``.
        device: Device the model and each batch are moved to.
        tokenizer: Tokenizer used to decode generated ids and label ids.

    Returns:
        Whatever ``metric.compute()`` returns after all batches were added.

    Raises:
        ValueError: If a decoded *reference* label is not numeric; that
            indicates a data-preparation problem and is not masked.
    """
    model.eval().to(device)

    # Use the tokenizer's own padding id instead of a hard-coded 0; fall back
    # to 0 (the value the original code used) if the tokenizer defines none.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = 0

    for batch in tqdm(dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            # `early_stopping` only affects beam search, so it is omitted for
            # this single-beam generation call.
            outputs = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                max_length=16,
            )
        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        # Positions set to -100 by the collator are not valid token ids;
        # swap them for the pad id on a copy so the batch is left untouched.
        label_ids = batch['labels'].masked_fill(batch['labels'] == -100, pad_id)
        references = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        metric.add_batch(
            predictions=[_parse_score(text) for text in predictions],
            references=[float(text) for text in references],
        )

    return metric.compute()
def add_eos_to_examples(example):
    """Build the text-to-text input and target strings for one STS-B example.

    ``input_text`` is ``sentence1`` and ``sentence2`` joined by the literal
    ``'</s>'``. ``target_text`` is the label snapped to an even number of
    tenths (0.0, 0.2, 0.4, ...) and rendered with one decimal place.

    Args:
        example: Mapping with ``sentence1``, ``sentence2`` and a numeric
            ``label``. It is modified in place.

    Returns:
        The same *example* with ``input_text`` and ``target_text`` added.
    """
    # Both sentences must be present; the pair is what the score describes.
    example['input_text'] = '%s</s>%s' % (example['sentence1'], example['sentence2'])

    label = example['label']
    snapped = round(label, 1)

    # round() rather than int(): x * 10 can land just below an integer in
    # floating point, and truncation would then misjudge the parity.
    if round(snapped * 10) % 2 != 0:
        lower = snapped - 0.1
        upper = snapped + 0.1
        # Pick the even-tenth neighbour nearer to the original label;
        # ties go to the lower neighbour.
        snapped = lower if (label - lower) <= (upper - label) else upper

    # Round once more so float noise such as 1.2000000000000002 prints as 1.2.
    example['target_text'] = '%s' % round(snapped, 1)
    return example
@dataclass
class T2TDataCollator:
    """Callable that merges a list of tokenized samples into one batch."""

    def __call__(self, batch):
        """
        Stack the per-sample tensors of *batch* along a new leading dimension.

        Every sample must provide ``input_ids``, ``target_ids`` and
        ``attention_mask`` tensors of matching shapes across samples.

        Returns:
            A dictionary with ``input_ids``, ``attention_mask`` and
            ``labels``; ``labels`` is the stacked ``target_ids`` with every
            0 replaced by -100.
        """
        def stack_field(key):
            return torch.stack([sample[key] for sample in batch])

        input_ids = stack_field('input_ids')

        labels = stack_field('target_ids')
        # NOTE(review): 0 is presumably the tokenizer's padding id and -100
        # the value the model's loss ignores; confirm against the model.
        labels.masked_fill_(labels == 0, -100)

        attention_mask = stack_field('attention_mask')

        collated = {}
        collated['input_ids'] = input_ids
        collated['attention_mask'] = attention_mask
        collated['labels'] = labels
        return collated
def main():
    """Fine-tune T5 on GLUE STS-B as a text-to-text task.

    Loads the dataset, builds input/target strings with
    ``add_eos_to_examples``, tokenizes them, trains with AdamW and a linear
    learning-rate schedule, and after every epoch prints the metric computed
    by ``evaluate_model`` on the training and validation splits.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    config = {
        'seed': 5201314,
        'model_name': 't5-base',
        'dataset': 'stsb',
        'num_epochs': 20,
        'batch_size': 2,
        'learning_rate': 3e-4,
        'optimizer': 'Adamw',
        'datetime': time.strftime('%Y%m%d-%H%M%S'),
        'save_path': 'results'
    }
    same_seed(config['seed'])
    tokenizer = T5Tokenizer.from_pretrained(config['model_name'], model_max_length=512)

    def convert_to_features(example_batch):
        """Tokenize a batch of input/target strings, padded to fixed lengths."""
        max_input_len = 512
        max_target_len = 16
        input_encodings = tokenizer.batch_encode_plus(
            example_batch['input_text'],
            padding='max_length',
            max_length=max_input_len,
            truncation=True,
        )
        target_encodings = tokenizer.batch_encode_plus(
            example_batch['target_text'],
            padding='max_length',
            max_length=max_target_len,
            truncation=True,
        )
        return {
            'input_ids': input_encodings['input_ids'],
            'attention_mask': input_encodings['attention_mask'],
            'target_ids': target_encodings['input_ids'],
        }

    raw_datasets = load_dataset('glue', config['dataset'])
    tokenized_datasets = raw_datasets.map(add_eos_to_examples)
    tokenized_datasets = tokenized_datasets.map(convert_to_features, batched=True)
    columns = ['input_ids', 'target_ids', 'attention_mask']
    tokenized_datasets.set_format(type='torch', columns=columns)

    data_collator = T2TDataCollator()
    train_dataloader = DataLoader(
        tokenized_datasets['train'],
        shuffle=True,
        batch_size=config['batch_size'],
        collate_fn=data_collator,
    )
    eval_dataloader = DataLoader(
        tokenized_datasets['validation'],
        batch_size=config['batch_size'],
        collate_fn=data_collator,
    )

    model = T5ForConditionalGeneration.from_pretrained(config['model_name'], return_dict=True)
    # Move the model once up front instead of on every epoch.
    model.to(device)
    optimizer = torch.optim.AdamW(params=model.parameters(), lr=config['learning_rate'])

    num_epochs = config['num_epochs']
    num_training_steps = num_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        'linear',
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    metric = load_metric('glue', config['dataset'])

    for epoch in range(num_epochs):
        # evaluate_model() switches the model to eval mode, so training mode
        # has to be restored at the start of each epoch.
        model.train()
        for batch in tqdm(train_dataloader):
            optimizer.zero_grad()
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

        # These are metric results from evaluate_model(), not loss values.
        train_metrics = evaluate_model(model, train_dataloader, metric, device, tokenizer)
        val_metrics = evaluate_model(model, eval_dataloader, metric, device, tokenizer)
        # Show progress as current/total epochs.
        print(f'Epoch [{epoch+1}/{num_epochs}]: Train set: {train_metrics}, Validation set: {val_metrics}')
# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()