I put together this code to fine-tune a Llama-2 model for causal language modeling:
import time
import torch
from transformers import (
    Trainer,
    TrainingArguments,
    BitsAndBytesConfig,
    LlamaTokenizer,
    LlamaForCausalLM,
)
from datasets import Dataset
from huggingface_hub import HfFolder

# Make float16 the global default dtype for newly created tensors
torch.set_default_dtype(torch.float16)
token = "1223"  # placeholder for my actual Hugging Face access token
HfFolder.save_token(token)
document_paths = ["./texts/snippet5krandomclean.txt"]
hugmodel = "meta-llama/Llama-2-7b-hf"
outputdir = "./my_finetuned_model-snippet-llama-2-5krandomclean"
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True, load_in_4bit=False
)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
print(f"{device}, gpus: {n_gpu}")
tokenizer = LlamaTokenizer.from_pretrained(hugmodel)  # use_fast is an AutoTokenizer option; LlamaTokenizer is the slow one
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
def tokenize_function(examples):
    outputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=1024)
    # For causal LM, labels are simply a copy of the input IDs
    outputs["labels"] = [ids.copy() for ids in outputs["input_ids"]]
    return outputs
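# For reference, I believe the standard alternative to building labels by hand
# is transformers' DataCollatorForLanguageModeling with mlm=False, which derives
# causal-LM labels from input_ids automatically (untested in this script):
#   from transformers import DataCollatorForLanguageModeling
#   data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
#   # ...and then pass data_collator=data_collator to the Trainer below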
chunk_size = 1024
with open(document_paths[0], 'r', encoding='utf-8') as f:
    content = f.read()
# Split the raw text into fixed-size character chunks
lines = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]
raw_datasets = Dataset.from_dict({"text": lines})
print(raw_datasets)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, batch_size=1024)
torch.cuda.empty_cache()
model = LlamaForCausalLM.from_pretrained(
    hugmodel,
    low_cpu_mem_usage=True,
    quantization_config=quantization_config,  # load_in_8bit is already set in the config; passing it twice errors on recent transformers
)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)
start_time = time.time()
# Training configuration
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=3,
    do_train=True,
    output_dir="./results",
    save_total_limit=1,
    remove_unused_columns=False,
    gradient_accumulation_steps=6,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)
trainer.train()
# Unwrap DataParallel before saving so the raw model weights are written out
if isinstance(model, torch.nn.DataParallel):
    model.module.save_pretrained(outputdir)
else:
    model.save_pretrained(outputdir)
tokenizer.save_pretrained(outputdir)
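For what it's worth, most 8-bit fine-tuning examples I have come across do not train the quantized weights directly but attach LoRA adapters with the peft library first. I have not tried this yet; a minimal sketch of what I mean, assuming peft is installed and that q_proj/v_proj are the right target modules for Llama-2, would be:

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Prepare the 8-bit base model for training (per the peft docs)
model = prepare_model_for_kbit_training(model)

# Attach small trainable LoRA adapters to the attention projections
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

I do not know whether skipping a step like this is what breaks my run, but I wanted to mention it.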
And then I try to prompt it with this code:
import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer
mpath = "/mnt/d/Newfolder/my_finetuned_model-snippet-llama-2-5krandomclean"
model = LlamaForCausalLM.from_pretrained(mpath)
tokenizer = LlamaTokenizer.from_pretrained(mpath)
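# I was not sure whether weights saved from the 8-bit run need to be reloaded
# with the same quantization config instead of plain from_pretrained; something
# like this is what I mean (untested):
#   from transformers import BitsAndBytesConfig
#   bnb_config = BitsAndBytesConfig(load_in_8bit=True)
#   model = LlamaForCausalLM.from_pretrained(mpath, quantization_config=bnb_config, device_map="auto")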
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    remove_invalid_values=True,
    device_map="auto",
)
sequences = pipeline(
    'What is a llama?',
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")
The result is gibberish like this:
Result: What is a llama? watപ)), ol bugião�appy соответ California numericaliej suppose familie></They regex appointedvalumenLar gatherDEFonces vert健功 stead placed consumemaster Sint Geschäft колиTA pelos Pennsylvaniaædiaiiiспо Че Russ constraintgetElementById erst&enserrench Kal MissouriStandnýchты史ym образова veröff coneexportsdownload}lines马ち dragcacheECK Mountain figaci modoicбургenter Editionคations scri lblArgument Alertmo watautore claimsiat BooksErrorデOINwhererror…
The document I am using for training is just a large (~15 MB) plain-text file of stories. Is there something I am missing in my process?
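One sanity check I have been meaning to run is pushing the same prompt through the untouched base model with the same pipeline code, to rule out the inference side entirely. A minimal sketch, assuming access to meta-llama/Llama-2-7b-hf and enough GPU memory for fp16:

import torch
import transformers
from transformers import LlamaForCausalLM, LlamaTokenizer

base_name = "meta-llama/Llama-2-7b-hf"
base_model = LlamaForCausalLM.from_pretrained(base_name, torch_dtype=torch.float16, device_map="auto")
base_tokenizer = LlamaTokenizer.from_pretrained(base_name)

# Same generation settings as above, but with the unmodified base model
pipe = transformers.pipeline("text-generation", model=base_model, tokenizer=base_tokenizer)
print(pipe("What is a llama?", do_sample=True, max_length=200)[0]["generated_text"])

If the base model answers sensibly here, the problem presumably sits in my training or saving step rather than in how I generate.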