Hello, I use `transformers.Trainer` to fine-tune CodeLlama-7b-Python. I want to use BLEU to evaluate the model during training, but when I override the `compute_metrics` function and decode the labels and input_ids, I find that the decoded result is not the original text I encoded. Here is my encode function:
def tokenize2(prompt, tokenizer, cutoff_len, padding=False):
    """Tokenize *prompt*, truncating to at most *cutoff_len* tokens.

    Args:
        prompt: text to tokenize.
        tokenizer: a HF-style tokenizer; called with keyword arguments and
            expected to return a mapping containing "input_ids".
        cutoff_len: maximum sequence length passed as ``max_length``.
        padding: if True, pad to ``cutoff_len`` ("max_length"); otherwise no
            padding.

    Returns:
        dict with "input_ids" and "labels", where "labels" is an independent
        copy of "input_ids" so later in-place label masking cannot clobber
        the input ids.
    """
    # NOTE: the original paste had mojibake smart quotes (âmax_lengthâ etc.);
    # restored to plain string literals so the code is valid Python.
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=cutoff_len,
        padding="max_length" if padding else False,
        return_tensors=None,
    )
    return {
        "input_ids": result["input_ids"],
        # Deep copy so masking labels in place does not mutate input_ids.
        "labels": copy.deepcopy(result["input_ids"]),
    }
def tokenize_prompt(data_point: Dict, tokenizer=tokenizer, raw_source_len=256, cutoff_len=512):
    """Build (input_ids, labels) for causal-LM fine-tuning.

    The source ("text") portion of the sequence is masked with -100 in
    ``labels`` so the loss is computed only on the target ("target") tokens
    plus the EOS token.

    NOTE(review): decoding ``labels`` directly will NOT reproduce the
    original text — the -100 sentinel entries are not valid token ids.
    In ``compute_metrics``, replace -100 with ``tokenizer.pad_token_id``
    before calling ``tokenizer.decode``.
    NOTE(review): re-tokenizing ``src + tgt`` may merge tokens at the
    boundary, so the first ``source_len`` tokens of the combined encoding
    are not guaranteed to equal the source-only encoding — confirm for the
    specific tokenizer, or tokenize source and target separately and
    concatenate the id lists.

    Args:
        data_point: mapping with "text" (source) and "target" (response).
        tokenizer: HF-style tokenizer (default bound at module import time).
        raw_source_len: max token length for the source alone.
        cutoff_len: max token length for source + target + EOS.

    Returns:
        dict with equal-length "input_ids" and "labels".

    Raises:
        ValueError: on an over-long source, empty target, or length mismatch.
    """
    src = data_point['text']
    tgt = data_point['target']
    tokenized_result = tokenize2(src, tokenizer, raw_source_len, padding=False)
    source_len = len(tokenized_result['input_ids'])
    # Explicit raises instead of `assert`: asserts are stripped under -O.
    if source_len > raw_source_len:
        raise ValueError(
            f"source encodes to {source_len} tokens, exceeds raw_source_len={raw_source_len}"
        )
    if len(tgt) == 0:
        raise ValueError("empty target")
    # Round-trip decode so the concatenated prompt reflects the truncated source.
    src = tokenizer.decode(tokenized_result['input_ids'], skip_special_tokens=True,
                           clean_up_tokenization_spaces=True)
    prompt_with_response = src + tgt + " " + tokenizer.eos_token
    tokenized_with_response = tokenize2(prompt_with_response, tokenizer, cutoff_len, padding=False)
    # Mask the prompt prefix so only the response contributes to the loss.
    tokenized_with_response["labels"] = (
        [-100] * source_len + tokenized_with_response["labels"][source_len:]
    )
    if len(tokenized_with_response["input_ids"]) != len(tokenized_with_response["labels"]):
        raise ValueError("input_ids and labels lengths diverged after masking")
    return {'input_ids': tokenized_with_response["input_ids"],
            'labels': tokenized_with_response["labels"]}