Hi there, I'm trying to fine-tune a model for a text-to-SQL application on a dataset, but it generates the following output:
The base model would not generate such weird output (it displays as ? symbols).
Here is my code:
import pandas as pd
from datasets import load_dataset
from IPython.display import HTML, display
dataset_name = "b-mc2/sql-create-context"
dataset = load_dataset(dataset_name, split="train")
def display_table(dataset_or_sample):
    # A helper function to nicely display a Transformers dataset or a single sample containing multi-line strings
    pd.set_option("display.max_colwidth", None)
    pd.set_option("display.width", None)
    pd.set_option("display.max_rows", None)
    if isinstance(dataset_or_sample, dict):
        df = pd.DataFrame(dataset_or_sample, index=[0])
    else:
        df = pd.DataFrame(dataset_or_sample)
    html = df.to_html().replace("\n", "<br>")
    styled_html = f"""<style> .dataframe th, .dataframe tbody td {{ text-align: left; padding-right: 30px; }} </style> {html}"""
    display(HTML(styled_html))
display_table(dataset.select(range(3)))
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]
print(f"Training dataset contains {len(train_dataset)} text-to-SQL pairs")
print(f"Test dataset contains {len(test_dataset)} text-to-SQL pairs")
PROMPT_TEMPLATE = """You are a powerful text-to-SQL model. Given the SQL tables and a natural language question, your job is to write a SQL query that answers the question.
### Table:
{context}
### Question:
{question}
### Response:
{output}"""
def apply_prompt_template(row):
    prompt = PROMPT_TEMPLATE.format(
        question=row["question"],
        context=row["context"],
        output=row["answer"],
    )
    return {"prompt": prompt}
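# Quick sanity check with a made-up sample in this dataset's schema (the
# question/context/answer values below are illustrative, not taken from the data):
example_row = {
    "question": "How many heads of the departments are older than 56?",
    "context": "CREATE TABLE head (age INTEGER)",
    "answer": "SELECT COUNT(*) FROM head WHERE age > 56",
}
print(apply_prompt_template(example_row)["prompt"])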
train_dataset = train_dataset.map(apply_prompt_template)
display_table(train_dataset.select(range(1)))
from transformers import AutoTokenizer
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, GPT2LMHeadModel
quantization_config = BitsAndBytesConfig(
    # Load the model with 4-bit quantization
    load_in_4bit=True,
    # Use double quantization
    bnb_4bit_use_double_quant=True,
    # Use 4-bit NormalFloat for storing the base model weights in GPU memory
    bnb_4bit_quant_type="nf4",
    # De-quantize the weights to 16-bit (Brain) float before the forward/backward pass
    bnb_4bit_compute_dtype=torch.bfloat16,
    # This allows CPU offload.
    # llm_int8_enable_fp32_cpu_offload=True,
)
base_model_id = "distilbert/distilgpt2"
model = GPT2LMHeadModel.from_pretrained(
    base_model_id,
    quantization_config=quantization_config,
    # low_cpu_mem_usage=True,
    # device_map="auto",
    # torch_dtype=torch.float16
)
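# Optional sanity check (assuming the bitsandbytes-backed load succeeded): the
# 4-bit model should report a much smaller memory footprint than a full-precision load.
print(f"Model memory footprint: {model.get_memory_footprint() / 1e6:.1f} MB")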
# You can use a different max length if your custom dataset has shorter/longer input sequences.
MAX_LENGTH = 256
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    model_max_length=MAX_LENGTH,
    padding_side="left",
    add_eos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
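# distilgpt2 ships without a dedicated pad token, so EOS is reused for padding.
# Caveat (assumption about tokenizer behavior): add_eos_token is a LLaMA-tokenizer
# option and the GPT-2 tokenizer may silently ignore it, i.e. no EOS gets appended
# to the training prompts. Quick check of the resulting special-token setup:
print(tokenizer.pad_token_id, tokenizer.eos_token_id)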
def tokenize_and_pad_to_fixed_length(sample):
    result = tokenizer(
        sample["prompt"],
        truncation=True,
        max_length=MAX_LENGTH,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result
tokenized_train_dataset = train_dataset.map(tokenize_and_pad_to_fixed_length)
assert all(len(x["input_ids"]) == MAX_LENGTH for x in tokenized_train_dataset)
display_table(tokenized_train_dataset.select(range(1)))
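# Spot check: decoding one padded example back to text should show the prompt
# preceded by repeated <|endoftext|> tokens, since padding_side="left" and
# pad_token == eos_token.
print(tokenizer.decode(tokenized_train_dataset[0]["input_ids"]))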
import transformers
tokenizer_no_padding = AutoTokenizer.from_pretrained(base_model_id)
tokenizer_no_padding.pad_token = tokenizer_no_padding.eos_token
pipeline = transformers.pipeline(model=model, tokenizer=tokenizer_no_padding, task="text-generation")
sample = test_dataset[1]
prompt = PROMPT_TEMPLATE.format(
    context=sample["context"], question=sample["question"], output=""
)  # Leave the answer part blank
with torch.no_grad():
    response = pipeline(prompt, max_new_tokens=256, repetition_penalty=1.15, return_full_text=False)
display_table({"prompt": prompt, "generated_query": response[0]["generated_text"]})
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
# Enable gradient checkpointing to make training more memory-efficient
model.gradient_checkpointing_enable()
# Prepare the model for k-bit (quantized) training, e.g. casting layers, freezing parameters, enabling input gradients
model_kbit = prepare_model_for_kbit_training(model)
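# If unsure which module names exist in the base model (GPT-2 naming differs from
# the LLaMA-style names used in many QLoRA examples), list the leaf module names:
print(sorted({name.split(".")[-1] for name, _ in model_kbit.named_modules() if name}))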
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    # This is the rank of the decomposed matrices A and B to be learned during fine-tuning. A smaller number saves more GPU memory but might result in worse performance.
    r=32,
    # This is the scaling coefficient for the learned ΔW factor, so a larger number will typically result in a larger behavior change after fine-tuning.
    lora_alpha=64,
    # Dropout ratio for the LoRA adapter matrices A and B.
    lora_dropout=0.1,
    # Target the attention and MLP projections. Note that GPT-2-family models such as
    # distilgpt2 name these modules c_attn/c_proj/c_fc; the q_proj/k_proj/v_proj/...
    # names used in many QLoRA tutorials only exist in LLaMA-style architectures.
    target_modules=[
        "c_attn",
        "c_proj",
        "c_fc",
    ],
    # Bias parameters to train. 'none' is recommended to keep the original model performing equally when the adapter is turned off.
    bias="none",
)
peft_model = get_peft_model(model_kbit, peft_config)
peft_model.print_trainable_parameters()
from datetime import datetime
import transformers
from transformers import TrainingArguments
import mlflow
# DeepSpeed requires a distributed environment even when only one process is used.
# This emulates a launcher in the notebook
import os
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "9994" # modify if RuntimeError: Address already in use
os.environ["RANK"] = "0"
os.environ["LOCAL_RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["NCCL_DEBUG"] = "INFO"
training_args = TrainingArguments(
    # Set this to mlflow for logging your training
    report_to="mlflow",
    # Name the MLflow run
    run_name=f"distilbert-SQL-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    # Replace with your output destination
    output_dir="YOUR_OUTPUT_DIR",
    # For the following arguments, refer to https://huggingface.co/docs/transformers/main_classes/trainer
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    bf16=True,
    learning_rate=2e-5,
    lr_scheduler_type="constant",
    max_steps=500,
    save_steps=100,
    logging_steps=100,
    warmup_steps=5,
    # https://discuss.huggingface.co/t/training-llama-with-lora-on-multiple-gpus-may-exist-bug/47005/3
    ddp_find_unused_parameters=False,
    # deepspeed="ds_zero3_config.json",
)
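# Note: with per_device_train_batch_size=1 and gradient_accumulation_steps=1 the
# effective batch size is 1, so max_steps=500 visits only 500 training samples.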
trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=tokenized_train_dataset,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer_no_padding, mlm=False),
    args=training_args,
)
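# Note (assumption about DataCollatorForLanguageModeling internals): with mlm=False
# the collator rebuilds labels from input_ids and sets pad-token positions to -100
# so they are ignored by the loss. Because pad_token == eos_token here, this also
# replaces the "labels" column created in tokenize_and_pad_to_fixed_length.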
# use_cache=True is incompatible with gradient checkpointing.
peft_model.config.use_cache = False
trainer.train()
import transformers
pipeline = transformers.pipeline(model=trainer.model, tokenizer=tokenizer_no_padding, task="text-generation")
sample = test_dataset[1]
prompt = PROMPT_TEMPLATE.format(
    context=sample["context"], question=sample["question"], output=""
)  # Leave the answer part blank
with torch.no_grad():
    response = pipeline(prompt, max_new_tokens=256, repetition_penalty=1.15, return_full_text=False)
display_table({"prompt": prompt, "generated_query": response[0]["generated_text"]})
Can anyone help me understand what caused those weird characters in the output?