Prompt printing gibberish

I put together this code to do causal fine-tuning of a Llama-2 model:

import os
import time
import torch  # Import PyTorch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig, LlamaTokenizer, LlamaForCausalLM
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder
torch.set_default_dtype(torch.float16)

token = "1223"  
HfFolder.save_token(token)


document_paths = ["./texts/snippet5krandomclean.txt"]
hugmodel = "meta-llama/Llama-2-7b-hf"
outputdir = "./my_finetuned_model-snippet-llama-2-5krandomclean"

quantization_config = BitsAndBytesConfig(
    load_in_8bit=True, load_in_4bit=False
)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
n_gpu = torch.cuda.device_count()
print(f"{device}, gpus: {n_gpu}")


tokenizer = LlamaTokenizer.from_pretrained(hugmodel, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token, so reuse EOS

def tokenize_function(examples):
    outputs = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=1024)
    # Convert lists to tensors and clone
    outputs["labels"] = [torch.tensor(o).clone() for o in outputs["input_ids"]]
    return outputs

chunk_size = 1024
# split the raw text into fixed-size character chunks
with open('./texts/snippet5krandomclean.txt', 'r', encoding='utf-8') as f:
    content = f.read()
    lines = [content[i:i+chunk_size] for i in range(0, len(content), chunk_size)]

raw_datasets = Dataset.from_dict({"text": lines})
print(raw_datasets)
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, batch_size=1024)

torch.cuda.empty_cache()
model = LlamaForCausalLM.from_pretrained(hugmodel, low_cpu_mem_usage=True, load_in_8bit=True, quantization_config=quantization_config)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)



start_time = time.time()
# Initialize trainer
training_args = TrainingArguments(
    per_device_train_batch_size=1,
    num_train_epochs=1,
    logging_dir="./logs",
    logging_steps=3,
    do_train=True,
    output_dir="./results",
    save_total_limit=1,
    remove_unused_columns=False,
    gradient_accumulation_steps=6,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer.train()


if isinstance(model, torch.nn.DataParallel):
    model.module.save_pretrained(outputdir)
else:
    model.save_pretrained(outputdir)

tokenizer.save_pretrained(outputdir)
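
One sanity check I could add after the tokenization step (just a sketch, reusing the names from the script above) is to decode a chunk back and confirm it is still readable story text and that the labels mirror the input_ids:

sample = tokenized_datasets[0]
print(tokenizer.decode(sample["input_ids"], skip_special_tokens=True)[:300])
print(sample["labels"][:20] == sample["input_ids"][:20])  # should print True for causal LM training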

And then I try to prompt it with this code:

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, LlamaForCausalLM, LlamaTokenizer
import transformers
import torch

mpath = "/mnt/d/Newfolder/my_finetuned_model-snippet-llama-2-5krandomclean"

model = LlamaForCausalLM.from_pretrained(mpath)

tokenizer = LlamaTokenizer.from_pretrained(mpath)

pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    remove_invalid_values=True,
    tokenizer=tokenizer,
    device_map="auto",
)

sequences = pipeline(
    'What is a llama?',
    do_sample=True,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    max_length=200,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

The result is gibberish like this:
Result: What is a llama? watപ)), ol bugião�appy соответ California numericaliej suppose familie></They regex appointedvalumenLar gatherDEFonces vert健功 stead placed consumemaster Sint Geschäft колиTA pelos Pennsylvaniaædiaiiiспо Че Russ constraintgetElementById erst&enserrench Kal MissouriStandnýchты史ym образова veröff coneexportsdownload}lines马ち dragcacheECK Mountain figaci modoicбургenter Editionคations scri lblArgument Alertmo watautore claimsiat BooksErrorデOINwhererror…

The document I am using for training is just a large (~15 MB) text file of plain-text stories. Is there something I am missing in my process?
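
One comparison I have not run yet: prompting the untouched base checkpoint with the same pipeline call, to rule out the generation code itself. A rough sketch (same libraries as the prompting script above):

import torch
import transformers

pipe = transformers.pipeline(
    "text-generation",
    model="meta-llama/Llama-2-7b-hf",
    torch_dtype=torch.float16,
    device_map="auto",
)
print(pipe("What is a llama?", do_sample=True, max_new_tokens=50)[0]["generated_text"])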


I wonder if there is an issue with the weights or precision.

I am loading it with load_in_8bit=True.
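
To check that, I was thinking of inspecting the saved weights directly, something like this (sketch, with mpath being the fine-tuned output directory from the prompting script):

import torch
from transformers import LlamaForCausalLM

m = LlamaForCausalLM.from_pretrained(mpath)
print({p.dtype for p in m.parameters()})  # what precision did the weights end up in?
print(any(torch.isnan(p).any().item() for p in m.parameters() if p.is_floating_point()))  # any NaNs?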

Since I am starting from a HF model, I don't need to run the weight conversion script, right?