Fine-tuning distilbert/distilgpt2 for text-to-SQL yields weird characters

Hi there, I'm trying to fine-tune distilgpt2 for a text-to-SQL application on the b-mc2/sql-create-context dataset, but after fine-tuning the model generates garbled output that mostly displays as ? symbols.

The base model does not produce this kind of output.
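To check whether the ? glyphs are real Unicode replacement characters (U+FFFD) rather than characters my notebook font simply cannot render, I paste the generated text into a small snippet like this (purely a diagnostic, not part of the training code; the "????" string is just a placeholder):

generated = "????"  # <- paste the model output here
print([hex(ord(ch)) for ch in generated])  # '0xfffd' means a genuine replacement character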

Here is my code:

import pandas as pd
from datasets import load_dataset
from IPython.display import HTML, display

dataset_name = "b-mc2/sql-create-context"
dataset = load_dataset(dataset_name, split="train")


def display_table(dataset_or_sample):
    # A helper function to nicely display a Hugging Face dataset or a single sample containing multi-line strings
    pd.set_option("display.max_colwidth", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", None)

    if isinstance(dataset_or_sample, dict):
        df = pd.DataFrame(dataset_or_sample, index=[0])
    else:
        df = pd.DataFrame(dataset_or_sample)

    html = df.to_html().replace("\\n", "<br>")
    styled_html = f"""<style> .dataframe th, .dataframe tbody td {{ text-align: left; padding-right: 30px; }} </style> {html}"""
    display(HTML(styled_html))


display_table(dataset.select(range(3)))

split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]

print(f"Training dataset contains {len(train_dataset)} text-to-SQL pairs")
print(f"Test dataset contains {len(test_dataset)} text-to-SQL pairs")

PROMPT_TEMPLATE = """You are a powerful text-to-SQL model. Given the SQL tables and natural language question, your job is to write SQL query that answers the question.

### Table:
{context}

### Question:
{question}

### Response:
{output}"""


def apply_prompt_template(row):
    prompt = PROMPT_TEMPLATE.format(
        question=row["question"],
        context=row["context"],
        output=row["answer"],
    )
    return {"prompt": prompt}


train_dataset = train_dataset.map(apply_prompt_template)
display_table(train_dataset.select(range(1)))

from transformers import AutoTokenizer, GPT2Tokenizer

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, GPT2Model, GPT2LMHeadModel


quantization_config = BitsAndBytesConfig(
    # Load the model with 4-bit quantization
    load_in_4bit=True,
    # Use double quantization
    bnb_4bit_use_double_quant=True,
    # Use 4-bit Normal Float for storing the base model weights in GPU memory
    bnb_4bit_quant_type="nf4",
    # De-quantize the weights to 16-bit (Brain) float before the forward/backward pass
    bnb_4bit_compute_dtype=torch.bfloat16,
    # This allows CPU offload.
    # llm_int8_enable_fp32_cpu_offload=True,
)


base_model_id = "distilbert/distilgpt2"

model = GPT2LMHeadModel.from_pretrained(
    base_model_id,
    quantization_config=quantization_config,
    # low_cpu_mem_usage=True,
    # device_map="auto",
    # torch_dtype=torch.float16
)
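As a quick sanity check that the 4-bit load actually worked, the memory footprint can be printed right after loading; distilgpt2 has roughly 82M parameters, so an fp32 copy of the weights alone would be around 330 MB:

# Optional sanity check: the 4-bit quantized weights should take far less memory
# than an fp32 copy of distilgpt2 (~82M parameters, roughly 330 MB) would need.
print(f"Memory footprint: {model.get_memory_footprint() / 1e6:.1f} MB")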

# You can use a different max length if your custom dataset has shorter/longer input sequences.
MAX_LENGTH = 256

tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=MAX_LENGTH,
    padding_side="left",
    add_eos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_and_pad_to_fixed_length(sample):
    result = tokenizer(
        sample["prompt"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result

tokenized_train_dataset = train_dataset.map(tokenize_and_pad_to_fixed_length)

assert all(len(x["input_ids"]) == MAX_LENGTH for x in tokenized_train_dataset)

display_table(tokenized_train_dataset.select(range(1)))
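Decoding one padded sample back to text is an easy way to double-check what the model is actually trained on (left padding with the EOS token, prompt and answer in a single sequence). This is only a diagnostic and reuses the tokenizer and dataset defined above:

# Diagnostic: decode one padded training sample to see exactly what the model is trained on.
sample_ids = tokenized_train_dataset[0]["input_ids"]
print("pad/eos token id:", tokenizer.pad_token_id, tokenizer.eos_token_id)
print(tokenizer.decode(sample_ids))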


import transformers

tokenizer_no_padding = AutoTokenizer.from_pretrained(base_model_id)
tokenizer_no_padding.pad_token = tokenizer_no_padding.eos_token
pipeline = transformers.pipeline(model=model, tokenizer=tokenizer_no_padding, task="text-generation")

sample = test_dataset[1]
prompt = PROMPT_TEMPLATE.format(
    context=sample["context"], question=sample["question"], output=""
)  # Leave the answer part blank

with torch.no_grad():
    response = pipeline(prompt, max_new_tokens=256, repetition_penalty=1.15, return_full_text=False)

display_table({"prompt": prompt, "generated_query": response[0]["generated_text"]})

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Enable gradient checkpointing to reduce memory usage during training
model.gradient_checkpointing_enable()
# Prepare the quantized model for k-bit (QLoRA) training, e.g. casting some layers and freezing the base weights
model_kbit = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # The rank of the low-rank matrices A and B learned during fine-tuning. A smaller number saves GPU memory but may hurt performance.
    r=32,
    # The scaling coefficient for the learned ΔW, so a larger value typically results in a larger behavior change after fine-tuning.
    lora_alpha=64,
    # Dropout ratio for the LoRA adapter layers A and B.
    lora_dropout=0.1,
    # We fine-tune all linear layers in the model. It might sound like a lot, but the trainable adapter is still only 1.16% of the whole model.
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    # Bias parameters to train. 'none' is recommended so the base model behaves identically when the adapter is disabled.
    bias="none",
)

peft_model = get_peft_model(model_kbit, peft_config)
peft_model.print_trainable_parameters()
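I am not 100% sure that every name in target_modules above actually exists in distilgpt2, so here is a small diagnostic that lists the leaf-module names of the base model (ignoring the injected LoRA layers) for comparison; it does not change the config:

# Diagnostic: leaf-module names actually present in the base model, ignoring the
# injected LoRA layers, to compare against the target_modules list above.
leaf_names = sorted({
    name.split(".")[-1]
    for name, module in peft_model.named_modules()
    if not list(module.children()) and "lora" not in name
})
print(leaf_names)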

from datetime import datetime

import transformers
from transformers import TrainingArguments

import mlflow

# DeepSpeed requires a distributed environment even when only one process is used.
# This emulates a launcher in the notebook
import os

os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"

training_args = TrainingArguments(
    # Set this to mlflow for logging your training
    report_to="mlflow",
    # Name the MLflow run
    run_name=f"distilbert-SQL-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%s')}",
    # Replace with your output destination
    output_dir="YOUR_OUTPUT_DIR",
    # For the following arguments, refer to https://huggingface.co/docs/transformers/main_classes/trainer
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    bf16=True,
    learning_rate=2e-5,
    lr_scheduler_type="constant",
    max_steps=500,
    save_steps=100,
    logging_steps=100,
    warmup_steps=5,
    # https://discuss.huggingface.co/t/training-llama-with-lora-on-multiple-gpus-may-exist-bug/47005/3
    ddp_find_unused_parameters=False,
    # deepspeed="ds_zero3_config.json",
)

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=tokenized_train_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer_no_padding, mlm=False),
    args=training_args,
)

# use_cache=True is incompatible with gradient checkpointing.
peft_model.config.use_cache = False

trainer.train()

import transformers

pipeline = transformers.pipeline(model=trainer.model, tokenizer=tokenizer_no_padding, task="text-generation")

sample = test_dataset[1]
prompt = PROMPT_TEMPLATE.format(
    context=sample["context"], question=sample["question"], output=""
)  # Leave the answer part blank

with torch.no_grad():
    response = pipeline(prompt, max_new_tokens=256, repetition_penalty=1.15, return_full_text=False)

display_table({"prompt": prompt, "generated_query": response[0]["generated_text"]})

Can anyone help me understand what is causing these weird characters in the output?