import torch
from datasets import Dataset
from transformers import AdamW, AutoTokenizer, AutoModelForCausalLM, DataCollatorWithPadding,TrainingArguments, Trainer
# Toy prompt/response pairs used to demonstrate the fine-tuning pipeline.
# (Straight ASCII quotes are required — the curly “smart quotes” this script
# was pasted with are a SyntaxError in Python.)
data = {
    "messages": ["Hello, how are you?", "What's your name?"],
    "responses": ["I'm fine, thank you!", "I am a chatbot."],
}
# Reshape the column-oriented dict into one record per prompt/response pair,
# which is the layout Dataset.from_list expects.
formatted_data = [
    {"messages": msg, "responses": resp}
    for msg, resp in zip(data["messages"], data["responses"])
]
raw_dataset = Dataset.from_list(formatted_data)
# Fixed: string literals used curly quotes, which do not parse in Python.
checkpoint = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# DialoGPT (GPT-2 family) ships with no pad token; reuse EOS for padding.
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(example):
    """Tokenize a batch of prompt/response pairs for causal-LM fine-tuning.

    Called by ``Dataset.map(..., batched=True)``, so ``example["messages"]``
    and ``example["responses"]`` are parallel lists of strings.

    Returns lists (not tensors) of ``input_ids``, ``attention_mask`` and
    ``labels`` — one entry per example, as batched ``map`` requires.

    Fixes vs. the original:
    - The original used ``return_tensors="pt"`` and returned ``[...][0]``,
      which kept only the FIRST example of every batch and produced
      column-length mismatches under ``batched=True``.
    - Labels were the tokenized *responses*, whose length differs from the
      inputs; for causal LM, labels must align token-for-token with
      ``input_ids``. We train on prompt+response concatenated and mask
      padding positions with -100 so they are ignored by the loss.
    """
    # One training text per pair: prompt <eos> response <eos>.
    texts = [
        msg + tokenizer.eos_token + resp + tokenizer.eos_token
        for msg, resp in zip(example["messages"], example["responses"])
    ]
    # padding=True pads to the longest sequence in this map batch, so all
    # returned lists have equal length and downstream collation is safe.
    tokens = tokenizer(texts, truncation=True, padding=True)
    # Labels mirror input_ids; padding positions get -100 (the ignore_index
    # of the cross-entropy loss used by HF causal-LM heads).
    tokens["labels"] = [
        [tok if mask == 1 else -100 for tok, mask in zip(ids, attn)]
        for ids, attn in zip(tokens["input_ids"], tokens["attention_mask"])
    ]
    return tokens
# Tokenize in batches and DROP the raw text columns ("messages"/"responses")
# — the original kept them, and string columns crash the data collator.
tokenized_datasets = raw_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=raw_dataset.column_names,
)
# Causal-LM collator: unlike DataCollatorWithPadding, it also pads the
# "labels" column (with -100, which the loss ignores). mlm=False selects
# the causal (next-token) objective rather than masked-LM.
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
print(raw_dataset)

# TRAINING TIME -----------------------------------
training_args = TrainingArguments("test-trainer", report_to="none")
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# NOTE: Trainer is already imported at the top of the file; the duplicate
# `from transformers import Trainer` that lived here was redundant.
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets,
    # NOTE(review): evaluating on the training set — fine for a demo, but
    # use a held-out split for any real run.
    eval_dataset=tokenized_datasets,
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()

# Persist the fine-tuned weights (tokenizer is saved separately if needed).
model.save_pretrained("directory_on_my_computer")