Here is my code:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, EarlyStoppingCallback, BertTokenizer, MT5ForConditionalGeneration
from transformers.data.data_collator import DataCollatorForSeq2Seq, default_data_collator
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import math, os
import numpy as np
from tqdm import tqdm
import torch
from datasets import load_dataset, load_metric
os.environ['MASTER_PORT'] = '777'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# model trained from https://huggingface.co/uer/t5-base-chinese-cluecorpussmall
pretrain_task22_fewshot_zh = './results/baidu/pretrain-table-task22-lowdata/checkpoint-8'
model = MT5ForConditionalGeneration.from_pretrained(pretrain_task22_fewshot_zh)
tokenizer = BertTokenizer.from_pretrained(pretrain_task22_fewshot_zh)
device = 'cuda:0'
train_args = Seq2SeqTrainingArguments(
    output_dir='./results/baidu/finetune/task22-lowdata',
    evaluation_strategy='epoch',
    per_device_train_batch_size=32,
    weight_decay=0,
    learning_rate=0.00005,
    num_train_epochs=100,
    lr_scheduler_type='constant_with_warmup',
    warmup_ratio=0.1,
    logging_strategy='steps',
    save_strategy='epoch',
    fp16_backend='amp',
    fp16=False,
    gradient_accumulation_steps=2,
    load_best_model_at_end=True,
    logging_steps=1)  # ,deepspeed='./zero2_auto_config.json', save_total_limit=3)
def load_data(path):
    data = []
    with open(path, encoding='utf-8') as w:
        while True:
            line = w.readline()
            if not line:
                break
            data.append(line)
    return data
class T5dataset(Dataset):
    def __init__(self, data_set, tokenizer, maxlen, label_maxlen):
        self.tokenizer = tokenizer
        self.maxlen = maxlen
        self.label_maxlen = label_maxlen
        self.data_set = data_set

    def __len__(self):
        return len(self.data_set)

    def __getitem__(self, index):
        data = self.data_set[index]
        table, text = data.split('\t')
        model_input = self.tokenizer(table, padding='max_length', truncation=True, max_length=self.maxlen)
        label = self.tokenizer(text, truncation=True, max_length=self.label_maxlen)
        model_input['labels'] = label['input_ids']
        return {"input_ids": model_input['input_ids'], "attention_mask": model_input['attention_mask'], "labels": label['input_ids']}
baidu_lowdata = './data/baidu_compete/finetune/lowdata/'
train_data = load_data(baidu_lowdata + 'train.txt')
val_data = load_data(baidu_lowdata + 'val.txt')
train_data = T5dataset(train_data, tokenizer, 64, 256)
val_data = T5dataset(val_data, tokenizer, 64, 256)
early_stop = EarlyStoppingCallback(early_stopping_patience=2, early_stopping_threshold=0)
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    label_pad_token_id=-100,
    padding='max_length',
    max_length=64
)
metric = load_metric("sacrebleu")
def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [[label.strip()] for label in labels]
    return preds, labels
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    # print('preds:', preds[0])
    # print('len:', preds[0].shape)
    if isinstance(preds, tuple):
        preds = preds[0]
    print('preds:', preds)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    # if data_args.ignore_pad_token_for_loss:
    #     # Replace -100 in the labels as we can't decode them.
    #     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result
trainer = Seq2SeqTrainer(
    model=model,
    args=train_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    callbacks=[early_stop],
    compute_metrics=compute_metrics
)
trainer.train()
Here is part of what I got:
***** Running Evaluation *****
Num examples = 11
Batch size = 8
preds: [[[ -6.9859548 -6.9850636 -6.9853897 ... -6.985799 -6.9857574
-6.985038 ]
[ -6.9859576 -6.985067 -6.9853916 ... -6.9858017 -6.9857593
-6.985041 ]
[ -7.4163866 -7.41599 -7.41603 ... -7.4164863 -7.416518
-7.415782 ]
...
[ -8.480153 -8.479599 -8.479667 ... -8.480127 -8.480097
-8.47964 ]
[ -8.4777355 -8.477188 -8.477254 ... -8.47771 -8.4776745
-8.477233 ]
[ -8.475657 -8.47512 -8.475176 ... -8.475634 -8.475585
-8.475155 ]]
...
Traceback (most recent call last):
File "tmp.py", line 118, in <module>
trainer.train()
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 1342, in train
self._maybe_log_save_evaluate(tr_loss, model, trial, epoch, ignore_keys_for_eval)
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 1437, in _maybe_log_save_evaluate
metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer_seq2seq.py", line 75, in evaluate
return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix)
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 2042, in evaluate
metric_key_prefix=metric_key_prefix,
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/trainer.py", line 2273, in evaluation_loop
metrics = self.compute_metrics(EvalPrediction(predictions=all_preds, label_ids=all_labels))
File "tmp.py", line 91, in compute_metrics
decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 3133, in batch_decode
for seq in sequences
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 3133, in <listcomp>
for seq in sequences
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils_base.py", line 3169, in decode
**kwargs,
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 743, in _decode
filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
File "/search/odin/imer/anaconda3/envs/torch1.7/lib/python3.7/site-packages/transformers/tokenization_utils.py", line 718, in convert_ids_to_tokens
index = int(index)
TypeError: int() argument must be a string, a bytes-like object or a number, not 'list'
The code computing BLEU was copied from run_translation.py in the huggingface/transformers repository on GitHub (master branch).
I also ran that script and printed preds inside compute_metrics, and there they were all integers. I think my main problem is that the preds printed in my code are not integers, so they cannot be decoded.
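For reference, here is a small debugging sketch (my own throwaway helper, not part of the script above; the name debug_preds is made up for illustration) that I could call at the top of compute_metrics to confirm what preds actually looks like. Generated token ids should arrive as a 2-D integer array of shape (num_examples, seq_len), while a 3-D float array of shape (num_examples, seq_len, vocab_size) would mean I am getting raw logits instead:

import numpy as np

def debug_preds(eval_preds):
    # Inspect the predictions passed to compute_metrics.
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    arr = np.asarray(preds)
    # Token ids -> 2-D integer array; raw logits -> 3-D float array.
    print('preds shape:', arr.shape, 'dtype:', arr.dtype)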
This has been bothering me for two days. Could someone please point out what is wrong? Thank you!