I want to create my own Llama model.
Many people recommended xwin-mlewd-13b, but I ran into a lot of limitations while using it.
The biggest problem was language support.
So I am trying to fine-tune a model on the novel data I have.
However, I am not sure whether my approach is correct, so I am asking here.
I asked GPT-3 and it said what I am doing is right, but GPT's answers are often wrong, so I would like your opinions.
What I want to do is simple:
I want to generate new content based on the novel dataset, or have conversations with the characters in the novel.
xwin-mlewd-13b is otherwise suitable for this, but its language support was the limiting factor for me.
I built the dataset in a simple way.
All I did was correct the grammar of the novel text and convert it into a JSON file.
Each entry in the file looks like this:
"text": "You will see at a glance how money comes in and goes out."
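For reference, here is a minimal sketch of how such a file could be produced from a plain-text novel. The input file name novel.txt and the one-entry-per-line split are assumptions; adapt them to however the source text is actually structured. The output shape (a JSON list of objects with a "text" key) is what the training script below expects.
import json

# Hypothetical input: one grammar-corrected sentence or paragraph per line
with open('novel.txt', 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

# Build a list of {"text": ...} records, the shape the training script expects
records = [{'text': line} for line in lines]

with open('processed_dataset.json', 'w', encoding='utf-8') as f:
    json.dump(records, f, ensure_ascii=False, indent=2)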
I wrote the code below based on data in that format.
import os
import json
import random
import torch
import argparse
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorForSeq2Seq
from datasets import Dataset
from peft import get_peft_model, LoraConfig
from torch.optim import AdamW
from accelerate import Accelerator
# Set command line arguments
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=2)
parser.add_argument("--lr", type=float, default=3e-4)
args = parser.parse_args()
# Set model ID (a local directory in this script)
model_id = 'llama-3.2-Korean-Bllossom-3B'  # Enter desired model ID
# Load the config.json file
config_file = f"{model_id}/config.json"
if not os.path.exists(config_file):
    raise FileNotFoundError(f"Config file not found: {config_file}")
with open(config_file, 'r') as f:
    config = json.load(f)
# Get the max_position_embeddings value (fall back to 30000 if it is missing)
max_position_embeddings = config.get('max_position_embeddings', 30000)
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Set padding token
if tokenizer.pad_token is None:
    # If pad_token is not present, use eos_token
    tokenizer.pad_token = tokenizer.eos_token
# Set pad_token_id
pad_token_id = tokenizer.pad_token_id
if pad_token_id is None or pad_token_id == -1:
    raise ValueError("pad_token_id is not set correctly; please check the tokenizer.")
# LoRA setup
lora_config = LoraConfig(
    r=3,  # Reduce r to lower memory usage
    lora_alpha=16,
    lora_dropout=0.1,
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"]
)
# Load the model on CPU (the Trainer moves it to the GPU later if one is available)
model = AutoModelForCausalLM.from_pretrained(model_id)
model = get_peft_model(model, lora_config)
# Set pad_token_id explicitly
model.generation_config.pad_token_id = pad_token_id
# Accelerator initialization
accelerator = Accelerator()
# Prepare only the model with Accelerator (the tokenizer is not a torch module)
model = accelerator.prepare(model)
# Load the processed_dataset.json file
dataset_file = 'processed_dataset.json'
if not os.path.exists(dataset_file):
    raise FileNotFoundError(f"Dataset file not found: {dataset_file}")
with open(dataset_file, 'r', encoding='utf-8') as f:
    full_dataset = json.load(f)
# The data is a list of {"text": ...} records
texts = [item['text'] for item in full_dataset]
random.shuffle(texts)
# Split the data
split_index = int(len(texts) * 0.8)  # Use 80% as training data
train_texts = texts[:split_index]
val_texts = texts[split_index:]
# Create datasets
train_dataset = Dataset.from_dict({"text": train_texts})
val_dataset = Dataset.from_dict({"text": val_texts})
# Function to remove unnecessary strings
def remove_unwanted_strings(examples):
    examples['text'] = [text.replace('<>', '').replace('<>', '').strip() for text in examples['text']]
    return examples
# Apply string removal
train_dataset = train_dataset.map(remove_unwanted_strings, batched=True)
val_dataset = val_dataset.map(remove_unwanted_strings, batched=True)
# Data preprocessing function
def preprocess_function(examples):
    # First pass: tokenize only (no padding/truncation) to measure lengths
    tokenized_examples = tokenizer(examples['text'])
    # Compute the maximum length within the batch
    max_len = max(len(ids) for ids in tokenized_examples['input_ids'])
    max_len = min(max_len, max_position_embeddings)  # Limit to the model's maximum length
    # Second pass: apply padding and truncation to the batch maximum
    model_inputs = tokenizer(
        examples['text'],
        max_length=max_len,
        truncation=True,
        padding='max_length',
    )
    # Use input_ids as labels; replace padding positions with -100 so they are ignored in the loss
    model_inputs['labels'] = [
        [(tok if tok != pad_token_id else -100) for tok in ids]
        for ids in model_inputs['input_ids']
    ]
    return model_inputs
# Preprocess the datasets
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)
# Set up TrainingArguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=args.batch_size,
    num_train_epochs=3,
    learning_rate=args.lr,
    logging_dir='./logs',
    logging_steps=10,
    eval_strategy="steps",  # Validate at a fixed step interval
    eval_steps=500,  # Validate every 500 steps
    save_strategy="steps",  # Must match eval_strategy when load_best_model_at_end is used
    save_steps=500,
    load_best_model_at_end=True,  # Required for early stopping (see the callback below)
    metric_for_best_model="eval_loss",
    report_to="tensorboard",
    logging_first_step=True,
    fp16=True,  # Use mixed precision (requires a GPU)
    gradient_accumulation_steps=2,  # Adjusted gradient accumulation
    save_total_limit=3,  # Limit the number of checkpoints kept
)
# Optimizer setup
optimizer = AdamW(model.parameters(), lr=training_args.learning_rate)
# Data collator: pads input_ids, attention_mask and labels to the longest example in each batch
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100, padding=True)
# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,  # Validation dataset
    data_collator=data_collator,
    optimizers=(optimizer, None),  # Use the AdamW optimizer defined above; the Trainer creates the scheduler
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop if eval loss does not improve for 3 evaluations
)
# Start training
trainer.train()
# Save the LoRA adapter and tokenizer after training
model.save_pretrained('./results')
tokenizer.save_pretrained('./results')
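After training, this is roughly how I intend to use the result for generation and character conversations. It is only a minimal sketch, assuming the adapter and tokenizer were saved to ./results as in the script above; the prompt and the sampling settings are placeholder values.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model_id = 'llama-3.2-Korean-Bllossom-3B'  # the same base model used for training
adapter_dir = './results'  # where the LoRA adapter and tokenizer were saved

tokenizer = AutoTokenizer.from_pretrained(adapter_dir)
model = AutoModelForCausalLM.from_pretrained(base_model_id)
model = PeftModel.from_pretrained(model, adapter_dir)  # attach the trained LoRA adapter
model.eval()

# Placeholder prompt: e.g. the opening of a scene to continue, or a line addressed to a character
prompt = "..."
inputs = tokenizer(prompt, return_tensors='pt')
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tokenizer.pad_token_id,
    )
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
Does the overall approach look right, or is there something I should change?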