TypeError: argument 'ids': 'list' object cannot be interpreted as an integer during LoRA training

I’m doing LoRA training, but when it reaches 33%, it always throws this error:

File "/root/autodl-tmp/train_lora_optuna.py", line 140, in <module> trainer.train() 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 2171, in train return inner_training_loop( 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 2625, in _inner_training_loop self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time) 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 3071, in _maybe_log_save_evaluate metrics = self._evaluate(trial, ignore_keys_for_eval) 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 3025, in _evaluate metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 4073, in evaluate output = eval_loop( 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 4362, in evaluation_loop metrics = self.compute_metrics( 
File "/root/autodl-tmp/train_lora_optuna.py", line 71, in eval_metric predictions_str = tokenizer.batch_decode(predictions, skip_special_tokens=True) 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3811, in batch_decode return [ 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3812, in <listcomp> self.decode( 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3851, in decode return self._decode( 
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 668, in _decode text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) 
TypeError: argument 'ids': 'list' object cannot be interpreted as an integer 
33%|███▎ | 1740/5220 [03:18<06:36, 8.77it/s]

I think something is going wrong in the eval part, but I can’t figure out the reason.

import os

import torch
import optuna
import evaluate
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    EarlyStoppingCallback,
    Seq2SeqTrainingArguments,
    Trainer,
    default_data_collator,
    get_linear_schedule_with_warmup,
)
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

# Set the environment variable to optimize GPU memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'


device = "cuda"
tokenizer_name_or_path = "LLM4Binary/llm4decompile-1.3b-v1.5"
model_name_or_path = "LLM4Binary/llm4decompile-1.3b-v1.5"
dataset_name = "asm2c"
text_column = "asm text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 50
batch_size = 8

#device_ids = [0, 1]  # use GPU 0 and GPU 1

dataset = load_dataset("json", data_files="./traindata.jsonl")
#datasets = dataset.train_test_split(test_size=0.1)
dataset = dataset["train"].train_test_split(0.1)


tokenizer = AutoTokenizer.from_pretrained("LLM4Binary/llm4decompile-1.3b-v1.5")

def preprocess_function(examples):
    inputs = examples["input"]
    outputs = examples["output"]

    # Merge the input and output columns into one text
    merged_texts = [f"{input} {output_text}" for input, output_text in zip(inputs, outputs)]
    
    model_inputs = tokenizer(merged_texts, truncation=True, padding="max_length", max_length=256)
    model_inputs["labels"] = model_inputs["input_ids"].copy()  # 设置labels
    return model_inputs

processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]

# Load the evaluation metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    
    # Make sure predictions and labels are both 2-D
    if len(predictions.shape) == 1:
        predictions = predictions.unsqueeze(0)  # convert to 2-D
    if len(labels.shape) == 1:
        labels = labels.unsqueeze(0)  # convert to 2-D
    
    # Inspect the shape of the predictions before converting types
    print(f"predictions shape: {predictions.shape}")
    
    # Decode token IDs into text
    predictions_str = tokenizer.batch_decode(predictions.tolist(), skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)
    
    # Compute BLEU score
    bleu_results = bleu_metric.compute(predictions=predictions_str, references=[[ref] for ref in labels_str])
    
    # Compute ROUGE scores
    rouge_results = rouge_metric.compute(predictions=predictions_str, references=labels_str)
    
    # Combine results
    metrics = {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
    }
    return metrics



peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=8,
    #target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    init_lora_weights="gaussian",
    bias='lora_only'
)
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
    "/", "_"
)

# creating model
model = AutoModelForCausalLM.from_pretrained("LLM4Binary/llm4decompile-1.3b-v1.5")

#model = torch.nn.DataParallel(model, device_ids=device_ids)


#model = prepare_model_for_kbit_training(model)

peft_model = get_peft_model(model, peft_config)


training_args = Seq2SeqTrainingArguments(
    output_dir="./results",             # 保存模型的目录
    eval_strategy="epoch",         # 每个 epoch 进行评估
    save_strategy="epoch",           # 每个 epoch 结束时保存模型              
    learning_rate=2e-5,
    per_device_train_batch_size=1,      # 训练时的batch_size
    per_device_eval_batch_size=8,      # 验证时的batch_size
    logging_steps=30,                    # log 打印的频率
    num_train_epochs=3,
    weight_decay=0.008,
    load_best_model_at_end=True,
    gradient_accumulation_steps=1,
    metric_for_best_model="bleu",
    fp16=True,
    predict_with_generate=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
    compute_metrics=eval_metric,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # early stopping with patience of 2
)

trainer.train()

# Automatic hyperparameter search

def default_hp_space_optuna(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
        "seed": trial.suggest_int("seed", 1, 40),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]),
        "optim": trial.suggest_categorical("optim", ["adamw_hf"]),
    }

trainer.hyperparameter_search(
    hp_space=default_hp_space_optuna, 
    compute_objective=lambda x: x.get("bleu", 0), 
    direction="maximize", 
    n_trials=10
    )

lora_adapter = "./lora_adapter"
peft_model.save_pretrained(lora_adapter, save_adapter=True, save_config=True)

model_to_merge = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(model_name_or_path).to("cuda"), lora_adapter)

merged_model = model_to_merge.merge_and_unload()
merged_model.save_pretrained("./merged_model1")
tokenizer.save_pretrained("./merged_model1")

I think it’s similar to the symptoms described at the first URL.

I read that post before, but it doesn’t seem to be the same issue (that one was about a summarization model).
I tried predict_with_generate=True, but it didn’t help.

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",             # directory to save the model
    eval_strategy="epoch",              # evaluate every epoch
    save_strategy="epoch",              # save the model at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,      # batch size during training
    per_device_eval_batch_size=8,       # batch size during evaluation
    logging_steps=30,                   # logging frequency
    num_train_epochs=3,
    weight_decay=0.008,
    load_best_model_at_end=True,
    gradient_accumulation_steps=1,
    metric_for_best_model="bleu",
    fp16=True,
    predict_with_generate=True
)


For now, it looks like evaluation fails 100% of the time: the crash at 33% is exactly the end of the first of your three epochs (100 / 3 ≈ 33), i.e. the first time evaluation runs.

One way around it is to set eval_strategy="no", but you’ll want to fix the evaluation anyway…

    eval_strategy="epoch",         # 每个 epoch 进行评估
    num_train_epochs=3,

As a temporary suggestion, first cut the amount of training down to the absolute minimum (or switch to eval_strategy="steps") and modify the code until the evaluation function starts working.

    eval_strategy="steps",         # maybe crash at first

I tried debugging it a little. I found the cause, or rather the trigger, but I’m not sure what the right fix is…
The cause is that the evaluation function is trying to decode something that the tokenizer can’t decode.

    # Decode token IDs into text
    print("predictions:", predictions)
    print("labels:", labels)
    #predictions_str = tokenizer.batch_decode(predictions.tolist(), skip_special_tokens=True)
    # ^ cannot be decoded... predictions is a <class 'numpy.ndarray'> of floats:
    # "predictions: [[[1.87890625, 8.796875, 11.3359375, 8.6328125, 7.796875, 10.8125, 11.3046875, 13.625, 9.8046875, 11.984375, ..."
    labels_str = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)
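
One way to make it at least not crash (a sketch, assuming the predictions really are raw logits of shape (batch, seq_len, vocab_size)): argmax them into token IDs before decoding, and replace any -100 padding in the labels with a real token ID. Note that the argmax of logits gives teacher-forced next-token predictions, not true generations, so the BLEU/ROUGE numbers will be optimistic, but they should still rank checkpoints roughly correctly.

import numpy as np

def eval_metric(eval_predict):
    predictions, labels = eval_predict

    # With the plain Trainer, predictions are raw logits of shape
    # (batch, seq_len, vocab_size); take the argmax to get token IDs.
    if predictions.ndim == 3:
        predictions = np.argmax(predictions, axis=-1)

    # Labels may contain -100 at ignored positions, which cannot be decoded;
    # replace them with the pad (or eos) token ID first.
    pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
    labels = np.where(labels != -100, labels, pad_id)

    predictions_str = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels, skip_special_tokens=True)

    bleu_results = bleu_metric.compute(predictions=predictions_str,
                                       references=[[ref] for ref in labels_str])
    rouge_results = rouge_metric.compute(predictions=predictions_str, references=labels_str)
    return {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
    }

If memory is a concern, the same argmax can be done earlier via the Trainer’s preprocess_logits_for_metrics argument, so the full logits are never accumulated over the eval set.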

Thank you! I may just try not using this eval.


Or, you could improve the evaluation. If the evaluation doesn’t work properly, there is a risk that incorrect evaluations will be used to pick checkpoints and you end up with a worse model…
(Because load_best_model_at_end=True is specified.)

I think the evaluation function fulfills its role, at a minimum, as long as it doesn’t rank bad checkpoints above good ones.
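
If you want the metrics computed on actually generated text, the usual route (a sketch, not tested with this particular decoder-only model) is to switch from Trainer to Seq2SeqTrainer, because predict_with_generate=True is only honored by Seq2SeqTrainer; the plain Trainer ignores it and hands logits to compute_metrics. With generation enabled, compute_metrics receives generated token IDs that batch_decode can handle directly.

from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Sketch: Seq2SeqTrainer + predict_with_generate=True passes generated token IDs
# (not logits) to compute_metrics during evaluation.
trainer = Seq2SeqTrainer(
    model=peft_model,          # train the LoRA-wrapped model rather than the base model
    args=training_args,        # the Seq2SeqTrainingArguments with predict_with_generate=True
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, model=peft_model),
    compute_metrics=eval_metric,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)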

Hello, I hit the same error: "text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
TypeError: argument 'ids': 'list' object cannot be interpreted as an integer"

I hope this is helpful: in my case the problem was that I had forgotten the for output in batch_outputs: loop and was passing the whole batch to tokenizer.decode. :rofl:

My corrected code is as follows:

batch_outputs = model.generate(
    input_ids=prompt_ids.input_ids,
)

for output in batch_outputs:
    print("output is ", tokenizer.decode(output, skip_special_tokens=True))
