I’m doing LoRA training, but when it gets to 33% it always throws this error:
File "/root/autodl-tmp/train_lora_optuna.py", line 140, in <module> trainer.train()
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 2171, in train return inner_training_loop(
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 2625, in _inner_training_loop self._maybe_log_save_evaluate(tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time)
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 3071, in _maybe_log_save_evaluate metrics = self._evaluate(trial, ignore_keys_for_eval)
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 3025, in _evaluate metrics = self.evaluate(ignore_keys=ignore_keys_for_eval)
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 4073, in evaluate output = eval_loop(
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/trainer.py", line 4362, in evaluation_loop metrics = self.compute_metrics(
File "/root/autodl-tmp/train_lora_optuna.py", line 71, in eval_metric predictions_str = tokenizer.batch_decode(predictions, skip_special_tokens=True)
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3811, in batch_decode return [
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3812, in <listcomp> self.decode(
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_base.py", line 3851, in decode return self._decode(
File "/root/miniconda3/envs/llm4decompile/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py", line 668, in _decode text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
TypeError: argument 'ids': 'list' object cannot be interpreted as an integer
33%|███▎ | 1740/5220 [03:18<06:36, 8.77it/s]
I think something goes wrong in the eval step, but I can’t figure out the reason.
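My rough guess (only an assumption, I have not confirmed it against the actual eval batch) is that batch_decode ends up with nested lists instead of flat token-ID sequences, so the inner decode() receives a list where it expects a single int. This is a minimal sketch of that guess, run separately from training; the hard-coded IDs are just placeholders:

# Minimal sketch of my guess, not taken from the real eval batch.
# batch_decode expects a 2-D batch of token IDs; if each "sequence" is itself
# a list of lists (i.e. 3-D input), the underlying decode gets a list where it
# expects an int.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("LLM4Binary/llm4decompile-1.3b-v1.5")

flat_batch = [[100, 200, 300], [400, 500, 600]]   # 2-D: decodes fine
print(tok.batch_decode(flat_batch, skip_special_tokens=True))

nested_batch = [[[100, 200], [300, 400]]]         # 3-D: each "id" is a list
tok.batch_decode(nested_batch, skip_special_tokens=True)
# expected: TypeError: argument 'ids': 'list' object cannot be interpreted as an integer

But I don’t see where a shape like that would come from in my compute_metrics, or why it would only show up at the 33% mark. The full script is below: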
from transformers import *
from peft import *
import torch
from datasets import load_dataset
import os
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from tensorboard import *
import optuna
import evaluate
# Set environment variable to optimize GPU memory allocation
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
device = "cuda"
tokenizer_name_or_path = "LLM4Binary/llm4decompile-1.3b-v1.5"
model_name_or_path = "LLM4Binary/llm4decompile-1.3b-v1.5"
dataset_name = "asm2c"
text_column = "asm text"
label_column = "text_label"
max_length = 64
lr = 3e-2
num_epochs = 50
batch_size = 8
#device_ids = [0, 1]  # use GPU 0 and GPU 1
dataset = load_dataset("json", data_files="./traindata.jsonl")
#datasets = dataset.train_test_split(test_size=0.1)
dataset = dataset["train"].train_test_split(0.1)
tokenizer = AutoTokenizer.from_pretrained("LLM4Binary/llm4decompile-1.3b-v1.5")
def preprocess_function(examples):
    inputs = examples["input"]
    outputs = examples["output"]
    # Merge the input and output columns into one text
    merged_texts = [f"{input} {output_text}" for input, output_text in zip(inputs, outputs)]
    model_inputs = tokenizer(merged_texts, truncation=True, padding="max_length", max_length=256)
    model_inputs["labels"] = model_inputs["input_ids"].copy()  # set labels
    return model_inputs
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=dataset["train"].column_names,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["test"]
# Load evaluation metrics
bleu_metric = evaluate.load("bleu")
rouge_metric = evaluate.load("rouge")
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    # Make sure predictions and labels are both 2-D
    if len(predictions.shape) == 1:
        predictions = predictions.unsqueeze(0)  # convert to 2-D
    if len(labels.shape) == 1:
        labels = labels.unsqueeze(0)  # convert to 2-D
    # Print the prediction shape before any type conversion
    print(f"predictions shape: {predictions.shape}")
    # Decode token IDs back to text
    predictions_str = tokenizer.batch_decode(predictions.tolist(), skip_special_tokens=True)
    labels_str = tokenizer.batch_decode(labels.tolist(), skip_special_tokens=True)
    # Compute BLEU score
    bleu_results = bleu_metric.compute(predictions=predictions_str, references=[[ref] for ref in labels_str])
    # Compute ROUGE score
    rouge_results = rouge_metric.compute(predictions=predictions_str, references=labels_str)
    # Merge the results
    metrics = {
        "bleu": bleu_results["bleu"],
        "rouge1": rouge_results["rouge1"],
        "rouge2": rouge_results["rouge2"],
        "rougeL": rouge_results["rougeL"],
    }
    return metrics
peft_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    r=8,
    lora_alpha=8,
    #target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    init_lora_weights="gaussian",
    bias='lora_only'
)
checkpoint_name = f"{dataset_name}_{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}_v1.pt".replace(
"/", "_"
)
# creating model
model = AutoModelForCausalLM.from_pretrained("LLM4Binary/llm4decompile-1.3b-v1.5")
#model = torch.nn.DataParallel(model, device_ids=device_ids)
#model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(model, peft_config)
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",          # directory where the model is saved
    eval_strategy="epoch",           # evaluate every epoch
    save_strategy="epoch",           # save the model at the end of each epoch
    learning_rate=2e-5,
    per_device_train_batch_size=1,   # batch size during training
    per_device_eval_batch_size=8,    # batch size during evaluation
    logging_steps=30,                # logging frequency
    num_train_epochs=3,
    weight_decay=0.008,
    load_best_model_at_end=True,
    gradient_accumulation_steps=1,
    metric_for_best_model="bleu",
    fp16=True,
    predict_with_generate=True
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    #data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)
    compute_metrics=eval_metric,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]  # patience for early stopping
)
trainer.train()
# Automatic hyperparameter search
def default_hp_space_optuna(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5),
        "seed": trial.suggest_int("seed", 1, 40),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]),
        "optim": trial.suggest_categorical("optim", ["adamw_hf"]),
    }
trainer.hyperparameter_search(
    hp_space=default_hp_space_optuna,
    compute_objective=lambda x: x.get("bleu", 0),
    direction="maximize",
    n_trials=10
)
lora_adapter = "./lora_adapter"
peft_model.save_pretrained(lora_adapter, save_adapter=True, save_config=True)
model_to_merge = PeftModel.from_pretrained(AutoModelForCausalLM.from_pretrained(model_name_or_path).to("cuda"), lora_adapter)
merged_model = model_to_merge.merge_and_unload()
merged_model.save_pretrained("./merged_model1")
tokenizer.save_pretrained("./merged_model1")