The error message is as follows:
raise RuntimeError(
RuntimeError: Error(s) in loading state_dict for PeftModelForCausalLM:
size mismatch for base_model.model.model.embed_tokens.base_layer.weight: copying a param with shape torch.Size([128257, 3072]) from checkpoint, the shape in current model is torch.Size([128256, 3072]).
size mismatch for base_model.model.model.embed_tokens.lora_embedding_A.default: copying a param with shape torch.Size([64, 128257]) from checkpoint, the shape in current model is torch.Size([64, 128256]).
size mismatch for base_model.model.lm_head.base_layer.weight: copying a param with shape torch.Size([128257, 3072]) from checkpoint, the shape in current model is torch.Size([128256, 3072]).
size mismatch for base_model.model.lm_head.lora_B.default.weight: copying a param with shape torch.Size([128257, 64]) from checkpoint, the shape in current model is torch.Size([128256, 64]).
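If I read the shapes right, the off-by-one comes from the [PAD] token I add during training: the checkpoint expects 128257 embedding rows, while a freshly loaded base model still has 128256. A quick check that should reproduce the numbers (model folder as in my code below; the printed values in the comments are what I expect, not verified output):

from transformers import AutoTokenizer, AutoModelForCausalLM

model_id = 'llama-3.2-Korean-Bllossom-3B'

tokenizer = AutoTokenizer.from_pretrained(model_id)
print(len(tokenizer))  # expected: 128256 (matches the "current model" shape)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
print(len(tokenizer))  # expected: 128257 (matches the checkpoint shape)

model = AutoModelForCausalLM.from_pretrained(model_id)
print(model.get_input_embeddings().weight.shape)  # expected: torch.Size([128256, 3072])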
Here is the training code I used:
import os
import json
import random
import torch
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import Dataset
from peft import get_peft_model, LoraConfig
from torch.optim import AdamW
from accelerate import Accelerator
# Command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=2)
parser.add_argument("--micro_batch_size", type=int, default=5)  # added micro batch size
parser.add_argument("--lr", type=float, default=5e-5)
args = parser.parse_args()
# Model ID
model_id = 'llama-3.2-Korean-Bllossom-3B'
# Load config.json
config_file = f"{model_id}/config.json"
if not os.path.exists(config_file):
    raise FileNotFoundError(f"Config file not found: {config_file}")
with open(config_file, 'r') as f:
    config = json.load(f)
# Read max_position_embeddings, defaulting to 512
max_position_embeddings = config.get('max_position_embeddings', 512)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Add a padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
pad_token_id = tokenizer.pad_token_id
# LoRA configuration
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "embed_tokens", "lm_head"]
)
# Load the model on CPU
model = AutoModelForCausalLM.from_pretrained(model_id)
model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, lora_config)
# Set pad_token_id explicitly
model.generation_config.pad_token_id = pad_token_id
# Initialize Accelerator
accelerator = Accelerator()
# Prepare the model with Accelerator
model, tokenizer = accelerator.prepare(model, tokenizer)
# Load processed_dataset.json
dataset_file = 'processed_dataset.json'
if not os.path.exists(dataset_file):
    raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
with open(dataset_file, 'r', encoding='utf-8') as f:
    full_dataset = json.load(f)
# The data is a list of records
texts = [item['text'] for item in full_dataset]
random.shuffle(texts)
# Train/validation split
split_index = int(len(texts) * 0.8)
train_texts = texts[:split_index]
val_texts = texts[split_index:]
# Build datasets
train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})
# Remove unwanted strings
def remove_unwanted_strings(examples):
    examples['text'] = [text.replace('<>', '').replace('<>', '').strip() for text in examples['text']]
    return examples
# Apply the cleanup
train_dataset = train_dataset.map(remove_unwanted_strings, batched=True)
val_dataset = val_dataset.map(remove_unwanted_strings, batched=True)
# Preprocessing function
def preprocess_function(examples):
    tokenized = tokenizer(
        examples["text"],
        max_length=512,
        truncation=True,
        padding="max_length",  # pad to max_length
        return_tensors='pt'
    )
    labels = tokenized["input_ids"].clone()
    labels[labels == pad_token_id] = -100  # mask padding tokens so they are excluded from the loss
    tokenized["labels"] = labels
    return tokenized
# Tokenize the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
# TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=args.batch_size,
    num_train_epochs=5,
    learning_rate=args.lr,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="epoch",  # evaluate every epoch
    save_strategy="epoch",  # save every epoch
    report_to="wandb",
    logging_first_step=True,
    bf16=True,  # enable bf16 mixed precision
    gradient_accumulation_steps=3,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)
# Optimizer
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    optimizers=(optimizer, None),  # custom optimizer
)
# Gradient clipping
trainer.args.max_grad_norm = 1.0
# Start training
trainer.train()
# Save the model and tokenizer after training
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')
# Inference function
def infer(text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors='pt', max_length=128, truncation=True)
        inputs = {k: v.to(accelerator.device) for k, v in inputs.items()}
        outputs = model.generate(**inputs)
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Free cached GPU memory
torch.cuda.empty_cache()
The above code has gone through many modifications; it was written with help from GPT-3, MS Copilot, and advice from many people. Fine-tuning completed successfully. For merging, I used merge_peft_adapters.py from GitHub.
The command I entered was:
python merge_peft_adapters.py --base_model_name_or_path llama-3.2-Korean-Bllossom-3B --peft_model_path results/checkpoint-65 --device cpu
I used the CPU because it is a small model, but the merge failed with the size-mismatch error shown above.
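My current guess: the merge script loads the base model with its original 128256-row embedding and then tries to load the adapter, whose saved base_layer weights already have 128257 rows because of the added [PAD] token. So I think the base model needs to be resized to the fine-tuned tokenizer before the adapter is loaded. Here is a minimal sketch of what I mean; it is not the actual merge_peft_adapters.py code, and the dtype and output folder are my own assumptions:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model_path = 'llama-3.2-Korean-Bllossom-3B'
adapter_path = 'results/checkpoint-65'
tokenizer_path = './results'  # tokenizer saved after training, includes the [PAD] token

tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
base_model = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.bfloat16)

# Grow embed_tokens and lm_head to len(tokenizer) == 128257 rows so the adapter shapes match
base_model.resize_token_embeddings(len(tokenizer))

model = PeftModel.from_pretrained(base_model, adapter_path)
merged = model.merge_and_unload()

merged.save_pretrained('./merged_model')   # output folder is just an example
tokenizer.save_pretrained('./merged_model')

Is resizing before loading the adapter the right way to handle the extra [PAD] row, or is there something else I am missing in the merge step?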