Fine-tuning results in loss = 0 and eval_loss = NaN. How can I start training the right way?

Hi!
I want to create a chatbot with a specific character (persona) using Transformers. I prepared a simple dataset for teaching its tone and started training a model with the code below. However, the loss became 0 and eval_loss became NaN. What approach should I take?

I created a collator class for the DataLoader that sets the labels to -100 for every token other than the target character's lines, and passed it to the Trainer. Is there a problem there?
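
My understanding is that the -100 values act as a mask because PyTorch's cross-entropy loss skips positions whose label equals ignore_index, which defaults to -100. A minimal check of that assumption (toy values, not my real data):

import torch
import torch.nn.functional as F

# Toy logits: 4 token positions over a 5-token vocabulary (made-up values)
logits = torch.randn(4, 5)
labels = torch.tensor([2, -100, -100, 1])  # two positions masked like prompt tokens

# Positions labelled -100 are skipped; the loss is averaged over the two real labels only
loss = F.cross_entropy(logits, labels, ignore_index=-100)
print(loss)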

The tokenize_dataset function tokenizes the full text (prompt plus the character's reply) into tokenized_full, and tokenizes the same text without the character's reply into tokenized_no_output; the length of the latter is then used to mask the prompt tokens in the labels.
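
To check that this masking step does what I intend, here is a toy example with made-up token ids (the real ids of course come from the tokenizer):

import copy
import torch

# Hypothetical ids: the full prompt tokenizes to 7 ids, and the prompt
# without the character's reply tokenizes to the first 4 of them
input_ids = torch.tensor([5, 8, 1, 9, 42, 7, 2])  # prompt + reply + eos
source_len = 4                                    # length of tokenized_no_output

labels = copy.deepcopy(input_ids)
labels[:source_len] = -100  # mask the prompt; the loss is computed on the reply only
print(labels)               # tensor([-100, -100, -100, -100, 42, 7, 2])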

I only have about 400 examples, because I want to expand the dataset after confirming that the training pipeline works correctly. Could this be the cause?
I’m a beginner in machine learning, so please excuse me if I’m asking an off-topic question.

If you need to see a sample of the dataset or the output, I'd be happy to share it. Thank you.

code

import torch
import transformers
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

model_name = "rinna/japanese-gpt-neox-3.6b-instruction-ppo"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
config = AutoConfig.from_pretrained(model_name)  # use_fast is a tokenizer argument, not a config one
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    config=config,
    device_map="auto",
    load_in_8bit=True
)
# new_dataset starts out as a list of dicts; hold out 5% of it for evaluation
VAL_SET_SIZE = int(len(new_dataset) * 0.05)

new_dataset = Dataset.from_dict({k: [dic[k] for dic in new_dataset] for k in new_dataset[0]})

train_val = new_dataset.train_test_split(
    test_size=VAL_SET_SIZE, shuffle=True, seed=1990
)
train_data = train_val["train"]
val_data = train_val["test"]

# Check the special tokens (tokenize the splits and inspect one collated batch)
tokenized_train = tokenize_dataset(train_data, tokenizer)
tokenized_val = tokenize_dataset(val_data, tokenizer)
collator = InstructCollator(tokenizer)
loader = DataLoader(tokenized_train, collate_fn=collator, batch_size=8, shuffle=True)
batch = next(iter(loader))
batch  # notebook cell: display input_ids / labels / attention_mask as a sanity check
eval_steps = 11
save_steps = 33
logging_steps = 3
MICRO_BATCH_SIZE = 2  # per-device batch size
BATCH_SIZE = 32       # effective batch size, reached via gradient accumulation

trainer = transformers.Trainer(
    #model=model.to(torch.bfloat16),
    model=model,
    data_collator=collator,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    args=transformers.TrainingArguments(
        num_train_epochs=1,
        learning_rate=3e-5,
        evaluation_strategy="steps",
        save_strategy="steps",
        eval_steps=eval_steps,
        save_steps=save_steps,
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        per_device_eval_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=BATCH_SIZE // MICRO_BATCH_SIZE,
        #bf16=True,
        dataloader_num_workers=12,
        logging_steps=logging_steps,
        output_dir="./output",
        report_to="wandb",
        save_total_limit=1,
        load_best_model_at_end=True,
        greater_is_better=False,
        metric_for_best_model="eval_loss",
        #fp16=False,
        auto_find_batch_size=True
    )
)
model.config.use_cache = False  # disable the KV cache during training (re-enable it for generation)
trainer.train()

result

{'loss': 78882.7083, 'learning_rate': 2.7428571428571428e-05, 'epoch': 0.42}
{'loss': 0.0, 'learning_rate': 2.485714285714286e-05, 'epoch': 0.85}
{'loss': 0.0, 'learning_rate': 2.2285714285714287e-05, 'epoch': 1.27}
{'eval_loss': nan, 'eval_runtime': 2.3267, 'eval_samples_per_second': 4.728, 'eval_steps_per_second': 2.579, 'epoch': 1.56}
{'loss': 0.0, 'learning_rate': 1.9714285714285714e-05, 'epoch': 1.7}
{'loss': 0.0, 'learning_rate': 1.7142857142857142e-05, 'epoch': 2.12}
{'loss': 0.0, 'learning_rate': 1.4571428571428571e-05, 'epoch': 2.55}
{'loss': 0.0, 'learning_rate': 1.2e-05, 'epoch': 2.97}
{'eval_loss': nan, 'eval_runtime': 2.3607, 'eval_samples_per_second': 4.66, 'eval_steps_per_second': 2.542, 'epoch': 3.12}
{'loss': 0.0, 'learning_rate': 9.428571428571428e-06, 'epoch': 3.4}
{'loss': 0.0, 'learning_rate': 6.857142857142857e-06, 'epoch': 3.82}
{'loss': 0.0, 'learning_rate': 4.2857142857142855e-06, 'epoch': 4.25}
{'loss': 0.0, 'learning_rate': 1.7142857142857143e-06, 'epoch': 4.67}
{'eval_loss': nan, 'eval_runtime': 2.2353, 'eval_samples_per_second': 4.921, 'eval_steps_per_second': 2.684, 'epoch': 4.67}

configure dataset

import copy

from torch.nn.utils.rnn import pad_sequence


class InstructCollator():
    def __init__(self, tokenizer, ignore_index=-100):
        self.tokenizer = tokenizer
        self.ignore_index = ignore_index

    def __call__(self, examples):
        input_batch = []
        label_batch = []
        for example in examples:
            input_batch.append(example['input_ids'])
            label_batch.append(example['labels'])

        input_ids = pad_sequence(
            input_batch, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )

        # As before, pad the labels with ignore_index, i.e. -100
        labels = pad_sequence(
            label_batch, batch_first=True, padding_value=self.ignore_index
        )

        # The attention_mask can apparently also be a bool tensor
        attention_mask = input_ids.ne(self.tokenizer.pad_token_id)
        
        return {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': attention_mask
        }

def tokenize_dataset(data_point, tokenizer, ignore_index=-100):
    features = []

    for data in data_point:
        instruction_text = ""
        if data['instruction'] != "":
            instruction_text = data['instruction'] + "\n"
        prompt_full = f"[INST]\n{instruction_text}[/INST]\n{data['input']}{data['output']}{tokenizer.eos_token}"
        prompt_no_output = f"[INST]\n{instruction_text}[/INST]\n{data['input']}"

        # Skip examples that exceed the 2048-token context window
        if len(tokenizer.encode(prompt_full)) >= 2048:
            continue
        tokenized_full = tokenizer(
            prompt_full,
            padding='longest',
            truncation=True,
            max_length=2048,
            return_tensors='pt'
        )

        tokenized_no_output = tokenizer(
            prompt_no_output,
            padding='longest',
            truncation=True,
            max_length=2048,
            return_length=True,
            return_tensors='pt'
        )

        input_ids = tokenized_full['input_ids'][0]
        labels = copy.deepcopy(input_ids)
        source_len = tokenized_no_output['length'][0]

        # Mask the prompt part so only the character's reply contributes to the loss
        labels[:source_len] = ignore_index

        features.append({
            'input_ids': input_ids,
            'labels': labels
        })
    return features