Fine-tuned transformers model generates nonsensical results

I am experimenting with fine-tuning an LLM on a custom dataset: a text file of about 60,000 words of user comments scraped from a news website. I am using transformers with PyTorch.

I have a tokenizer script and a fine-tuning script. I am experimenting with two models: Bulgarian versions of GPT-2 and BERT. I chose those because they are relatively light and can run locally on my humble NVIDIA RTX 2060 laptop.

However, after training, when I interact with my bot, it behaves weirdly. The GPT-2 replies with a series of words from the same category, e.g. only adjectives, or only place-related words, while the BERT just returns . Also, both repeat my question before attempting to reply.

I know this is a tough one to troubleshoot, but I am looking for any general troubleshooting tips. How should I inspect the dataset/tokenizer/model for inconsistencies? And are there any easier methods for fine-tuning a pre-trained model on custom, non-conversational data (that also support Bulgarian)?
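
To make the first question concrete, this is roughly the kind of check I have in mind for the tokenizer (the sample comment below is made up; the model name is the one used in my scripts), but I am not sure what to look for in the output:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("rmihaylov/gpt2-small-bg")

# Made-up comment ("This is a sample comment from the site.")
sample = "Това е примерен коментар от сайта."

encoded = tokenizer(sample)
print(encoded["input_ids"])
print(tokenizer.convert_ids_to_tokens(encoded["input_ids"]))
# Does the text survive a round trip through the tokenizer?
print(tokenizer.decode(encoded["input_ids"]))
# Which special tokens does this tokenizer actually define?
print("pad:", tokenizer.pad_token, "eos:", tokenizer.eos_token,
      "bos:", tokenizer.bos_token, "unk:", tokenizer.unk_token)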

Tokenizer script:

from transformers import AutoTokenizer
from datasets import Dataset

# Path to your .txt file
file_path = r"txtfilepath"

# Read the .txt file
with open(file_path, "r", encoding="utf-8") as f:
    comments = f.readlines()

# Strip any extra whitespace or newline characters
comments = [comment.strip() for comment in comments]

# Load tokenizer for the GPT-2 model from Hugging Face
tokenizer = AutoTokenizer.from_pretrained("rmihaylov/gpt2-small-bg")

# Tokenize comments
tokenized_comments = tokenizer(comments, padding=True, truncation=True, return_tensors="pt")

# Add labels to the tokenized dataset
tokenized_comments["labels"] = tokenized_comments["input_ids"].clone()

# Convert tokenized comments into a Dataset object
dataset = Dataset.from_dict({
    'input_ids': tokenized_comments['input_ids'],
    'attention_mask': tokenized_comments['attention_mask'],
    'labels': tokenized_comments['labels']
})

# Save the tokenized dataset
dataset.save_to_disk(r"tokenizedsavepath")
print("Tokenized dataset saved successfully.")

Fine-tuning script:

import os
import numpy as np
from transformers import AutoModelForCausalLM, Trainer, TrainingArguments, AutoTokenizer
from datasets import load_from_disk, DatasetDict

# Set environment variables for debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Load the new tokenized dataset
dataset_path = r"tokenizeddatasetpath"
dataset = load_from_disk(dataset_path)

# Split the dataset into train and test sets
train_test_split = dataset.train_test_split(test_size=0.1)  # 10% for test, 90% for train
dataset = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

# Load pre-trained GPT-2 model
model_name = "rmihaylov/gpt2-small-bg"
model = AutoModelForCausalLM.from_pretrained(model_name)

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add special tokens if they are not already added
special_tokens_dict = {'additional_special_tokens': ['<pad>', '<eos>', '<bos>', '<unk>']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Number of special tokens added: {num_added_toks}")

# Resize the model embeddings to match the tokenizer's vocabulary size
model.resize_token_embeddings(len(tokenizer))
print(f"Model embeddings resized to: {model.config.vocab_size}")

# Verify tokenizer and special tokens
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
print(f"Special tokens: {tokenizer.all_special_tokens}")
print(f"Special token IDs: {tokenizer.all_special_ids}")

# Define a function to compute metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": (predictions == labels).mean()}

# Define training arguments with evaluation strategy
training_args = TrainingArguments(
    output_dir='./results_gpt2_small_bg',  # Required argument
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir='./logs_gpt2_small_bg',
    logging_steps=500,
    evaluation_strategy="epoch",  # Evaluate at the end of each epoch
    remove_unused_columns=False,  # Ensure 'labels' column is used
)

# Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],  # Use the test dataset for evaluation
    compute_metrics=compute_metrics,
)

# Start fine-tuning
trainer.train()

# Save the fine-tuned model and tokenizer
output_model_dir = "outputfolder"
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)
print("Fine-tuning completed and model saved successfully.")

Interaction script:

import os
from transformers import AutoModelForCausalLM, AutoTokenizer

# Set environment variables for debugging
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

# Load the fine-tuned model and tokenizer
model_dir = "./fine_tuned_model_bert_pamela"
model = AutoModelForCausalLM.from_pretrained(model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Ensure the model is in evaluation mode
model.eval()

# Interaction script
print("Welcome to the Fine-Tuned Chatbot!")
print("You can start chatting. Type 'exit' to end the conversation.")

while True:
    user_input = input("You: ")
    if user_input.lower() == 'exit':
        break

    # Tokenize the input
    inputs = tokenizer(user_input, return_tensors='pt')

    try:
        # Generate a response
        output_ids = model.generate(
            inputs['input_ids'],
            max_length=50,
            num_return_sequences=1,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

        # Decode the response
        response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        print(f"Chatbot: {response}")
    except RuntimeError as e:
        print(f"Error: {e}")

print("Conversation ended.")