Error when increasing max_length for tokenizer - OverflowError: out of range integral type conversion attempted

GonRos22 · April 18, 2024, 7:52am

I’m trying to fine-tune a Mistral 7B model locally for a regression task. The code works and the loss is decreasing, but the outputs when I run trainer.predict(test_data) are cut off in the middle, so I assumed the cause is the max_length parameter in the tokenizer.

After I increased the length from 1024 to 2048, this error occurred.

here is my code:

import torch
import transformers
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
from sklearn.model_selection import KFold
import os
import numpy as np

from dataset_utils import load_questionnaire


# Maximum tokenized sequence length; tokenize() truncates and pads to this size.
CUTOFF_LEN = 2048

# LoRA hyper-parameters.
# NOTE(review): the LoraConfig(...) call in the script body hard-codes
# r=16, lora_alpha=16, lora_dropout=0.05 and does not read these constants.
LORA_R = 8
LORA_ALPHA = LORA_R * 2
LORA_DROPOUT = 0.1



def tokenize(prompt):
    """Tokenize ``prompt`` with the tokenizer's EOS token appended.

    The text is truncated and padded to exactly ``CUTOFF_LEN`` tokens.
    Uses the script-level ``tokenizer`` and ``CUTOFF_LEN`` names, so the
    tokenizer must be created before this function is called.

    Returns whatever the tokenizer call returns for the given text.
    """
    text_with_eos = prompt + tokenizer.eos_token
    encode_options = {
        "truncation": True,
        "max_length": CUTOFF_LEN,
        "padding": "max_length",
    }
    return tokenizer(text_with_eos, **encode_options)


def generate_prompt(user_query,is_instruct):
    """Build the training prompt string for one questionnaire row.

    Args:
        user_query: mapping with a ``"text"`` entry (the question) and a
            ``"score"`` entry (the target, rendered with ``str``).
        is_instruct: when true, wrap the instructions and question in
            ``[INST]`` ... ``[/INST]`` markers; otherwise emit them bare.

    Returns:
        The prompt followed by the JSON-style answer and a closing ``</s>``.
    """
    instructions = (
        "answer the following question:.\n"
        "Provide your answer in the following JSON format: {\"score\": \"predicted_score\"}\n"
    )
    # Instructions, a blank separator line, then the question text.
    question = instructions + "\n" + user_query["text"]
    answer = "{\"score\":" + str(user_query["score"]) + "}</s>"
    if is_instruct:
        return "<s> [INST]" + question + "[/INST]" + answer
    return "<s>" + question + answer


if __name__ == '__main__':
    # K-fold cross-validated LoRA fine-tuning of a 4-bit quantized causal LM.
    # For every fold: load tokenizer + model, attach LoRA adapters, train,
    # run predict/evaluate on the held-out split, and collect the metrics.
    #model_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
    model_name = "mistralai/Mistral-7B-v0.1"
    # The prompt format depends only on the model name, so decide it once.
    is_instruct = 'instruct' in model_name.lower()
    questionnaire = load_questionnaire('data/my_data.csv')

    k = 5
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    total_test_results = []
    for i, (train_index, test_index) in enumerate(kf.split(questionnaire)):
        print(f"Fold {i}")
        train_data = questionnaire.iloc[train_index]
        test_data = questionnaire.iloc[test_index]

        # Debug mode: only keep the first 15 rows of each split.
        print('debug')
        train_data = train_data[:15]
        test_data = test_data[:15]

        train_data = Dataset.from_pandas(train_data)
        test_data = Dataset.from_pandas(test_data)

        # 4-bit NF4 quantization settings used when loading the base model.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
            bnb_4bit_use_double_quant=False,
        )
        tokenizer = AutoTokenizer.from_pretrained(model_name,
                                                  padding_side="left",
                                                  add_eos_token=True,
                                                  add_bos_token=True)
        # Reuse EOS as the padding token.
        tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(model_name,
                                                     quantization_config=bnb_config,
                                                     torch_dtype=torch.float16,
                                                     device_map="auto")

        # Prepare model for k-bit training
        model = prepare_model_for_kbit_training(model)

        config = LoraConfig(
            r=16,
            lora_alpha=16,
            lora_dropout=0.05,
            bias="none",
            task_type="CAUSAL_LM",
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
        )
        model = get_peft_model(model, config)

        # Turn each row into a tokenized prompt and drop the raw columns.
        train_data = train_data.shuffle().map(
            lambda x: tokenize(generate_prompt(x, is_instruct=is_instruct)),
            remove_columns=["text", "score"])
        test_data = test_data.map(
            lambda x: tokenize(generate_prompt(x, is_instruct=is_instruct)),
            remove_columns=["text", "score"])
        trainer = Trainer(
            model=model,
            train_dataset=train_data,
            args=TrainingArguments(
                per_device_train_batch_size=1,
                gradient_accumulation_steps=4,
                num_train_epochs=6,
                learning_rate=1e-4,
                logging_steps=2,
                optim="adamw_torch",
                save_strategy="epoch",
                output_dir = f"mistral 7B lora-instruct-Qs (fold-{i})"
            ),
            data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
        )
        model.config.use_cache = False

        trainer.train()
        predictions = trainer.predict(test_data)
        # predictions[1] holds the label ids. The language-modeling collator
        # marks pad-token positions with -100 so the loss ignores them, and
        # the fast tokenizer cannot convert -100 to a token id
        # ("OverflowError: out of range integral type conversion attempted").
        # Put the pad token id back before decoding.
        label_ids = predictions[1]
        label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)
        # skip_special_tokens drops the restored padding from the decoded text.
        # NOTE(review): these are the reference labels (the tokenized prompts),
        # not text generated by the model; if generated answers are wanted,
        # decode model outputs instead - confirm the intent.
        outputs = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        test_results = trainer.evaluate(test_data)
        print(f'test results for fold {i}:{test_results}')
        total_test_results.append(test_results)
    # One metrics entry per fold.
    np.save('mixtral-moe-lora-instruct-Qs.npy', np.array(total_test_results))

the full stack trace:

Traceback (most recent call last):
  File "/specific/a/home/cc/students/csguests/gros/.pycharm_helpers/pydev/_pydevd_bundle/pydevd_exec2.py", line 3, in Exec
    exec(exp, global_vars, local_vars)
  File "<input>", line 1, in <module>
AttributeError: 'LlamaTokenizerFast' object has no attribute 'special_tokens'

No chat template is defined for this tokenizer - using the default template for the LlamaTokenizerFast class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

Traceback (most recent call last):
  File "/specific/a/home/cc/students/csguests/gros/.pycharm_helpers/pydev/pydevd.py", line 1534, in _exec
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "/specific/a/home/cc/students/csguests/gros/.pycharm_helpers/pydev/_pydev_imps/_pydev_execfile.py", line 18, in execfile
    exec(compile(contents+"\n", file, 'exec'), glob, loc)
  File "/home/ai_center/ai_users/gros/firstPred/finetune_mixtral_7Bx8.py", line 143, in <module>
    outputs = tokenizer.batch_decode(predictions[1])
  File "/home/ai_center/ai_users/gros/miniconda3/envs/psyq/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3785, in batch_decode
    return [
  File "/home/ai_center/ai_users/gros/miniconda3/envs/psyq/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3786, in <listcomp>
    self.decode(
  File "/home/ai_center/ai_users/gros/miniconda3/envs/psyq/lib/python3.9/site-packages/transformers/tokenization_utils_base.py", line 3825, in decode
    return self._decode(
  File "/home/ai_center/ai_users/gros/miniconda3/envs/psyq/lib/python3.9/site-packages/transformers/tokenization_utils_fast.py", line 625, in _decode
    text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
OverflowError: out of range integral type conversion attempted

Process finished with exit code 1

Topic		Replies	Views
Bug in Summarization tutorial Site Feedback	2	1961	March 21, 2024
Fine-tune transformers for language model Beginners	2	662	August 14, 2022
How do I increase max_new_tokens Beginners	3	29262	August 19, 2023
Model_max_length error in some models 🤗Transformers	0	199	April 1, 2024
Predictions with pipeline fails to truncate test set 🤗Transformers	0	180	January 23, 2024

Error when increasing max_length for tokenizer - OverflowError: out of range integral type conversion attempted

Related topics