Hello. Help me solve the problem. I want to convert Llama3.1-8B using the script convert_llama_weights_to_hf.py, but I get the following errors:
First error: RuntimeError: Internal: could not parse ModelProto from C:\Users\evhac\.llama\checkpoints\Llama3.1-8B\tokenizer.model
Second error: ValueError: Failed to instantiate tokenizer. Please, make sure you have sentencepiece and protobuf installed.
I installed sentencepiece and protobuf, but the errors persist.
1 Like
Apart from a very small number of special-case models, there is usually no need to convert anything with that script. The error is probably occurring because you are trying to convert something that does not need to be converted. If you need Llama 3 in HF format, you can also download it already converted from the following link.
I solved this error, but a new error appeared when I ran the following script:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from datasets import load_dataset, concatenate_datasets
# Fine-tune a local HF-format Llama 3.1 8B checkpoint on two dialogue datasets.
# The pure helper functions are defined first; loading, preprocessing and
# training then run top to bottom inside the ``try`` block below.

# Label value skipped by the loss function (PyTorch CrossEntropyLoss default).
IGNORE_INDEX = -100


def preprocess_persona(example):
    """Reduce one persona-chat record to a single ``dialogue`` column.

    NOTE(review): falls back to an empty list when the record has no
    ``dialogue`` key -- confirm the dataset really exposes that column.
    """
    return {"dialogue": example.get("dialogue", [])}


def preprocess_dailydialog(example):
    """Reduce one dailydialog record to a single ``dialogue`` column.

    NOTE(review): falls back to an empty list when the record has no
    ``dialogue`` key -- confirm the dataset really exposes that column.
    """
    return {"dialogue": example.get("dialogue", [])}


def preprocess_dialogue(example):
    """Flatten the turns of one dialogue into a ``"role: text"`` transcript.

    Args:
        example: record with a ``dialogue`` list of turns.

    Returns:
        ``{"text": transcript}``; the transcript is empty when no turn is
        usable.
    """
    # Only dict turns carrying both keys are kept.  The isinstance check
    # matters: on a plain string, ``"role" in turn`` is a substring test and
    # ``turn["role"]`` would raise TypeError.
    lines = [
        f"{turn['role']}: {turn['text']} \n"
        for turn in example["dialogue"]
        if isinstance(turn, dict) and "role" in turn and "text" in turn
    ]
    return {"text": "".join(lines)}


def mask_padding_labels(input_ids, attention_mask):
    """Build labels from ``input_ids`` with padded positions set to -100.

    The attention mask is used instead of the pad token id because the pad
    token is the EOS token here, and genuine EOS tokens must stay in the loss.

    Args:
        input_ids: list of token-id lists (one per sequence).
        attention_mask: matching list of 0/1 lists; 0 marks padding.

    Returns:
        New list of label lists; the inputs are not modified.
    """
    return [
        [token if keep else IGNORE_INDEX for token, keep in zip(ids, mask)]
        for ids, mask in zip(input_ids, attention_mask)
    ]


def preprocess_for_model(example):
    """Tokenize a batch of transcripts and attach causal-LM labels.

    Called with ``batched=True``, so ``example["text"]`` is a list of strings.
    Uses the module-level ``tokenizer`` created in the ``try`` block below.
    """
    # Truncate/pad every sequence to exactly 256 tokens
    tokenized = tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)
    # Labels mirror input_ids, except that padding is excluded from the loss
    tokenized["labels"] = mask_padding_labels(tokenized["input_ids"], tokenized["attention_mask"])
    return tokenized


try:
    # Load tokenizer and model from the local HF-format checkpoint
    model_path = "C:\\Users\\evhac\\.llama\\checkpoints\\Llama3.1-8B-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path)

    # Setting eos_token as pad_token (needed for padding="max_length")
    tokenizer.pad_token = tokenizer.eos_token

    # Transfer the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Loading datasets
    persona_chat_dataset = load_dataset("AlekseyKorshuk/persona-chat", split="train")
    dailydialog_dataset = load_dataset("roskoN/dailydialog", split="train")

    # Transforming datasets: keep only the ``dialogue`` column so they can be concatenated
    persona_chat_dataset = persona_chat_dataset.map(
        preprocess_persona,
        remove_columns=persona_chat_dataset.column_names,
    )
    dailydialog_dataset = dailydialog_dataset.map(
        preprocess_dailydialog,
        remove_columns=dailydialog_dataset.column_names,
    )
    combined_dataset = concatenate_datasets([persona_chat_dataset, dailydialog_dataset])

    # Transforming dialogues into plain-text transcripts
    processed_dataset = combined_dataset.map(preprocess_dialogue)

    # Drop dialogues that produced no text: they would consist of padding
    # only and contribute nothing (or NaN) to the loss.
    processed_dataset = processed_dataset.filter(lambda example: bool(example["text"]))
    if len(processed_dataset) == 0:
        raise ValueError(
            "No non-empty dialogues after preprocessing; check that the datasets "
            "provide a 'dialogue' column whose turns have 'role' and 'text' keys."
        )

    # Applying tokenization to the processed dataset.  The raw ``dialogue`` and
    # ``text`` columns are removed here because remove_unused_columns=False
    # below would otherwise hand them to the data collator and the model.
    processed_dataset = processed_dataset.map(
        preprocess_for_model,
        batched=True,
        remove_columns=processed_dataset.column_names,
    )

    # Training parameters
    training_args = TrainingArguments(
        output_dir="./llama-chatbot",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        remove_unused_columns=False,
    )

    # Creating and running Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_dataset,
    )
    trainer.train()
except Exception as e:
    import traceback

    print("An error has occurred:", e)
    # Also show where the failure happened; the message alone is rarely enough to debug
    traceback.print_exc()
1 Like