Hello. I want to create my own AI chatbot based on Meta's Llama 3.1 model, but when I fine-tune it on dialogue datasets, the process crashes with the following error: "Could not convert to integer: 3221225477. Path 'exitCode'." How can I solve this?
I am using:
- Windows 11 Home version 23H2
- Microsoft Visual Studio 2022
- Python 3.11.0
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch
from datasets import load_dataset, concatenate_datasets
import faulthandler
import traceback

# Exit code 3221225477 is 0xC0000005 (Windows access violation). The process
# dies inside native code, so the ``except`` at the bottom never runs for it.
# faulthandler prints the Python stack at the moment of such a crash, which
# shows which call (model load, ``.to(device)``, a training step, ...) was
# executing when it happened.
faulthandler.enable()

# Label value skipped by the loss; used to exclude padding positions.
IGNORE_INDEX = -100

try:
    # Download tokenizer and model
    model_path = "C:\\Users\\evhac\\.llama\\checkpoints\\Llama3.1-8B-hf"
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    # Setting eos_token as pad_token
    tokenizer.pad_token = tokenizer.eos_token

    # Transfer the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # from_pretrained loads float32 weights by default, i.e. roughly 32 GB for
    # an 8B-parameter model before any gradients or optimizer state exist.
    # Use bfloat16 when the GPU supports it and avoid the extra full-size copy
    # made during loading.
    # NOTE(review): full fine-tuning of an 8B model still needs far more memory
    # than the weights alone; if this keeps crashing, try a smaller model or a
    # parameter-efficient method such as LoRA.
    use_bf16 = device.type == "cuda" and torch.cuda.is_bf16_supported()
    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16 if use_bf16 else torch.float32,
        low_cpu_mem_usage=True,
    )
    model.to(device)

    # Loading datasets
    persona_chat_dataset = load_dataset("AlekseyKorshuk/persona-chat", split="train")
    dailydialog_dataset = load_dataset("roskoN/dailydialog", split="train")

    # Transforming datasets
    # NOTE(review): both helpers read a "dialogue" column and fall back to an
    # empty list when it is absent -- confirm the real column names of each
    # dataset, otherwise every row ends up empty (detected further below).
    def preprocess_persona(example):
        """Return the row reduced to its ``dialogue`` list (empty if missing)."""
        dialogue = example.get("dialogue", [])
        return {"dialogue": dialogue}

    def preprocess_dailydialog(example):
        """Return the row reduced to its ``dialogue`` list (empty if missing)."""
        dialogue = example.get("dialogue", [])
        return {"dialogue": dialogue}

    persona_chat_dataset = persona_chat_dataset.map(
        preprocess_persona, remove_columns=persona_chat_dataset.column_names
    )
    dailydialog_dataset = dailydialog_dataset.map(
        preprocess_dailydialog, remove_columns=dailydialog_dataset.column_names
    )
    combined_dataset = concatenate_datasets([persona_chat_dataset, dailydialog_dataset])

    # Transforming Dialogues
    def preprocess_dialogue(example):
        """Flatten one dialogue into a single ``"role: text \\n"`` string.

        Turns that are not dicts, or that lack ``role``/``text``, are skipped.
        """
        lines = [
            f"{turn['role']}: {turn['text']} \n"
            for turn in example["dialogue"]
            # isinstance guard: on a plain string, ``'role' in turn`` would be
            # a substring test and ``turn['role']`` would raise TypeError.
            if isinstance(turn, dict) and 'role' in turn and 'text' in turn
        ]
        return {"text": "".join(lines)}

    processed_dataset = combined_dataset.map(preprocess_dialogue)

    # Drop rows with no text: they would tokenize to pure padding and
    # contribute nothing but an all-ignored label row.
    processed_dataset = processed_dataset.filter(
        lambda example: bool(example["text"].strip())
    )
    if processed_dataset.num_rows == 0:
        raise ValueError(
            "No dialogue text was extracted from the datasets; check that the "
            "column and key names used in the preprocessing functions match "
            "the datasets' actual schema."
        )

    # Tokenization and tagging
    def preprocess_for_model(example):
        """Tokenize a batch of texts and build labels for causal-LM training.

        Labels equal ``input_ids`` except at padding positions, which are set
        to ``IGNORE_INDEX`` so they do not contribute to the loss.
        """
        # Tokenize text and trim to fit model requirements
        tokenized = tokenizer(
            example["text"], truncation=True, padding="max_length", max_length=256
        )
        # Mask by attention_mask rather than by token id: pad_token is the
        # eos_token here, so an id comparison would also mask genuine EOS.
        tokenized["labels"] = [
            [token if keep else IGNORE_INDEX for token, keep in zip(ids, mask)]
            for ids, mask in zip(tokenized["input_ids"], tokenized["attention_mask"])
        ]
        return tokenized

    # Applying tokenization to the processed dataset. The raw "dialogue" and
    # "text" columns are dropped here because remove_unused_columns=False
    # below would otherwise pass them through to the data collator / model.
    processed_dataset = processed_dataset.map(
        preprocess_for_model,
        batched=True,
        remove_columns=processed_dataset.column_names,
    )

    # Training parameters
    training_args = TrainingArguments(
        output_dir="./llama-chatbot",
        num_train_epochs=1,
        per_device_train_batch_size=2,
        save_steps=500,
        save_total_limit=2,
        fp16=False,
        remove_unused_columns=False,
    )

    # Creating and running Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=processed_dataset,
    )
    trainer.train()
except Exception as e:
    print("An error has occurred:", e)
    # Keep the full traceback; the message alone rarely locates the failure.
    traceback.print_exc()