Hi!
I want to create a chatbot with a particular character using the Transformers library. I prepared a simple dataset for teaching the character's tone and started training a model with the code below. However, the training loss became 0 and eval_loss became NaN. What approach should I take?
I created a collator class for the DataLoader so that every label other than the lines of the character I want to create is set to -100, and passed it to the Trainer. Is there a problem there?
The tokenize_dataset function generates the token ids of the full text as tokenized_full, and the token ids of the text without the desired character's lines as tokenized_no_output.
I only have about 400 examples for testing, because I want to expand the dataset after confirming that the training pipeline works correctly. Could this be the cause?
I’m a beginner in machine learning, so please excuse me if I’m asking an off-topic question.
If you need to see a sample of the dataset or the output, I’d be happy to show you. Thank you.
code
# Load the tokenizer, config and causal-LM weights for the base checkpoint.
model_name = "rinna/japanese-gpt-neox-3.6b-instruction-ppo"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
# `use_fast` is a tokenizer-only option, so it is not passed to AutoConfig.
config = AutoConfig.from_pretrained(model_name)
# NOTE(review): the weights are loaded in 8-bit and this same `model` object is
# later handed to transformers.Trainer. 8-bit checkpoints are usually
# fine-tuned through adapters (e.g. LoRA) rather than trained directly --
# confirm whether this contributes to the loss=0 / eval_loss=NaN reported
# for this run.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    device_map="auto",
    load_in_8bit=True,
)
# Number of examples held out for validation: 5% of the raw examples.
VAL_SET_SIZE = int(len(new_dataset) * 0.05)

# Turn the sequence of per-example dicts into one column-oriented Dataset,
# taking the column names from the first example.
new_dataset = Dataset.from_dict(
    {column: [row[column] for row in new_dataset] for column in new_dataset[0]}
)

# Shuffled train/validation split with a fixed seed.
train_val = new_dataset.train_test_split(
    test_size=VAL_SET_SIZE,
    shuffle=True,
    seed=1990,
)
train_data, val_data = train_val["train"], train_val["test"]

# Tokenize both splits, then pull one collated batch to check the special tokens.
tokenized_train = tokenize_dataset(train_data, tokenizer)
tokenized_val = tokenize_dataset(val_data, tokenizer)
collator = InstructCollator(tokenizer)
loader = DataLoader(
    tokenized_train,
    batch_size=8,
    shuffle=True,
    collate_fn=collator,
)
batch = next(iter(loader))
# Bare expression: echoes the batch when run in a notebook cell or REPL.
batch
# Step intervals for evaluation, checkpointing and logging.
eval_steps = 11
save_steps = 33
logging_steps = 3

# Per-device batch size and the target batch size; gradient accumulation
# below is derived as BATCH_SIZE // MICRO_BATCH_SIZE.
MICRO_BATCH_SIZE = 2
BATCH_SIZE = 32

trainer = transformers.Trainer(
    model=model,
    args=transformers.TrainingArguments(
        # Output and reporting.
        output_dir="./output",
        report_to="wandb",
        logging_steps=logging_steps,
        # Optimisation schedule.
        num_train_epochs=1,
        learning_rate=3e-5,
        # Batching. Mixed-precision flags (bf16/fp16) are not set here, so the
        # TrainingArguments defaults apply.
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        per_device_eval_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=BATCH_SIZE // MICRO_BATCH_SIZE,
        auto_find_batch_size=True,
        dataloader_num_workers=12,
        # Evaluation and checkpointing; the best checkpoint is the one with
        # the lowest eval_loss.
        evaluation_strategy="steps",
        eval_steps=eval_steps,
        save_strategy="steps",
        save_steps=save_steps,
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False,
    ),
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=collator,
)

# Turn off the model's `use_cache` config flag before starting training.
model.config.use_cache = False
trainer.train()
result
{'loss': 78882.7083, 'learning_rate': 2.7428571428571428e-05, 'epoch': 0.42}
{'loss': 0.0, 'learning_rate': 2.485714285714286e-05, 'epoch': 0.85}
{'loss': 0.0, 'learning_rate': 2.2285714285714287e-05, 'epoch': 1.27}
{'eval_loss': nan, 'eval_runtime': 2.3267, 'eval_samples_per_second': 4.728, 'eval_steps_per_second': 2.579, 'epoch': 1.56}
{'loss': 0.0, 'learning_rate': 1.9714285714285714e-05, 'epoch': 1.7}
{'loss': 0.0, 'learning_rate': 1.7142857142857142e-05, 'epoch': 2.12}
{'loss': 0.0, 'learning_rate': 1.4571428571428571e-05, 'epoch': 2.55}
{'loss': 0.0, 'learning_rate': 1.2e-05, 'epoch': 2.97}
{'eval_loss': nan, 'eval_runtime': 2.3607, 'eval_samples_per_second': 4.66, 'eval_steps_per_second': 2.542, 'epoch': 3.12}
{'loss': 0.0, 'learning_rate': 9.428571428571428e-06, 'epoch': 3.4}
{'loss': 0.0, 'learning_rate': 6.857142857142857e-06, 'epoch': 3.82}
{'loss': 0.0, 'learning_rate': 4.2857142857142855e-06, 'epoch': 4.25}
{'loss': 0.0, 'learning_rate': 1.7142857142857143e-06, 'epoch': 4.67}
{'eval_loss': nan, 'eval_runtime': 2.2353, 'eval_samples_per_second': 4.921, 'eval_steps_per_second': 2.684, 'epoch': 4.67}
configure dataset
class InstructCollator():
    """Collate tokenized examples into one right-padded batch.

    Each example is a mapping with 1-D tensors under ``'input_ids'`` and
    ``'labels'``. Calling the collator returns a dict with:

    * ``'input_ids'``: padded with ``tokenizer.pad_token_id``
    * ``'labels'``: padded with ``ignore_index``
    * ``'attention_mask'``: boolean, True on real tokens, False on padding

    Args:
        tokenizer: object exposing ``pad_token_id``.
        ignore_index: label value written into padded label positions.
    """

    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        # Use the caller-supplied value; it was previously hard-coded to -100.
        self.ignore_index = ignore_index

    def __call__(self, examples):
        input_batch = [example['input_ids'] for example in examples]
        label_batch = [example['labels'] for example in examples]

        input_ids = pad_sequence(
            input_batch, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        # Padded label positions get ignore_index, like the masked prompt tokens.
        labels = pad_sequence(
            label_batch, batch_first=True, padding_value=self.ignore_index
        )
        # Build the mask from the true sequence lengths instead of comparing
        # against pad_token_id, so a real token that shares the pad id (for
        # example when pad and EOS are the same token) is not masked out.
        # The mask stays boolean, as before.
        attention_mask = pad_sequence(
            [ids.new_ones(ids.shape[0]) for ids in input_batch],
            batch_first=True,
            padding_value=0,
        ).bool()

        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask
        }
def _shared_prefix_length(first, second):
    """Return how many leading items of *first* and *second* are equal."""
    count = 0
    for left, right in zip(first, second):
        if left != right:
            break
        count += 1
    return count


def tokenize_dataset(data_point, tokenizer, ignore_index=-100, max_length=2048):
    """Tokenize instruction examples and mask the prompt part of the labels.

    Every example is rendered as
    ``[INST]\\n{instruction}\\n[/INST]\\n{input}{output}{eos}`` (the instruction
    line is omitted when it is empty). The labels are a copy of the token ids
    in which the prompt tokens are replaced by ``ignore_index``, so only the
    ``output`` part and the end-of-sequence tokens are supervised.

    Args:
        data_point: iterable of mappings with ``'instruction'``, ``'input'``
            and ``'output'`` string entries.
        tokenizer: callable tokenizer exposing ``eos_token``; called with
            ``return_tensors='pt'``.
        ignore_index: label value for tokens excluded from the loss.
        max_length: examples whose encoding reaches this many tokens are
            dropped.

    Returns:
        A list of ``{'input_ids': tensor, 'labels': tensor}`` dicts holding
        1-D tensors of equal length.
    """
    features = []
    for data in data_point:
        instruction_text = ""
        if data['instruction'] != "":
            instruction_text = data['instruction'] + "\n"
        prompt_no_output = f"[INST]\n{instruction_text}[/INST]\n{data['input']}"
        prompt_full = f"{prompt_no_output}{data['output']}{tokenizer.eos_token}"

        # Tokenize once; truncation caps the length at max_length, so the
        # check below still catches every over-long example.
        tokenized_full = tokenizer(
            prompt_full,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        input_ids = tokenized_full['input_ids'][0]
        if len(input_ids) >= max_length:
            continue

        tokenized_no_output = tokenizer(
            prompt_no_output,
            truncation=True,
            max_length=max_length,
            return_tensors='pt'
        )
        prompt_ids = tokenized_no_output['input_ids'][0]

        # Mask only the tokens both encodings share. If the tokenizer appends
        # special tokens (such as EOS) to every text, the prompt-only encoding
        # is longer than the prompt portion of the full encoding; using its
        # raw length would also mask the first token(s) of the response.
        source_len = _shared_prefix_length(input_ids.tolist(), prompt_ids.tolist())
        if source_len >= len(input_ids):
            # Nothing left to supervise: a batch made only of such examples
            # can produce a NaN loss, so drop the example.
            continue

        labels = input_ids.clone()
        labels[:source_len] = ignore_index
        features.append({
            'input_ids': input_ids,
            'labels': labels
        })
    return features