@Chahnwoo Is there any way to train without adding any special tokens, using only the existing tokens?
<s> ### User : {example['input']} ### Response : {example['output']} </s>
Is this prompt format OK for training with my data?
My input example looks like:
find men's wallets below 1000
and the output is:
{"prompt": "find men's wallets below 1000","action_intent": "search","filters": {"category_name": "Wallets","brand_name": "","price_min": "0","price_max": "1000","provider_name": "","query_entity_type": "product","colour": "","size" : ""}}
The problem I am facing now is that the model generates extra, repeated content after the response until max_new_tokens is reached.
I am using the code below to train Mistral 7B:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import wandb
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from accelerate import Accelerator
from datasets import load_dataset
from huggingface_hub import login
import transformers
from datetime import datetime
# Load data
data = pd.read_excel("./Sample_data.xlsx")
# Format data
formatted_data = [
    {"input": "Please create a json block for this input query text. Input : " + row["Input"], "output": row["Output"]}
    for _, row in data.iterrows()
]
# Split data
train_data, test_data = train_test_split(formatted_data, test_size=0.2, random_state=42)
# Save train and test data
train_file_path = "notes.jsonl"
test_file_path = "notes_validation.jsonl"
with open(train_file_path, 'w') as f:
    for entry in train_data:
        f.write(json.dumps(entry) + '\n')
with open(test_file_path, 'w') as f:
    for entry in test_data:
        f.write(json.dumps(entry) + '\n')
# Load datasets
train_dataset = load_dataset('json', data_files='notes.jsonl', split='train')
eval_dataset = load_dataset('json', data_files='notes_validation.jsonl', split='train')
# Login to WandB and Hugging Face Hub
wandb.login()
wandb_project = "json-finetune-02.07.2024"
if wandb_project:
    os.environ["WANDB_PROJECT"] = wandb_project
login(token="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")
# Initialize tokenizer
base_model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    padding_side="right",
    add_eos_token=True,
    add_bos_token=True,
    use_auth_token=True,
)
tokenizer.pad_token = tokenizer.eos_token
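# No new special tokens are added here; the existing EOS token is reused as the pad token.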
# Define formatting function
def formatting_func(example):
    return f"<s> ### User : {example['input']} ### Response : {example['output']} </s>"
# Generate and tokenize prompts
max_length = 512
def generate_and_tokenize_prompt(prompt):
    result = tokenizer(
        formatting_func(prompt),
        truncation=True,
        max_length=max_length,
        padding="max_length",
    )
    result["labels"] = result["input_ids"].copy()
    return result
tokenized_train_dataset = train_dataset.map(generate_and_tokenize_prompt)
tokenized_val_dataset = eval_dataset.map(generate_and_tokenize_prompt)
# Load model with quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model = AutoModelForCausalLM.from_pretrained(base_model_id, quantization_config=bnb_config, device_map="auto", use_auth_token=True)
# Prepare model for k-bit training
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "lm_head"
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)
if torch.cuda.device_count() > 1:
    model.is_parallelizable = True
    model.model_parallel = True
accelerator = Accelerator()
model = accelerator.prepare_model(model)
# Define trainer
project = "json-finetune-Jul02"
base_model_name = "mistral"
run_name = base_model_name + "-" + project
output_dir = "./" + run_name
trainer = transformers.Trainer(
    model=model,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    args=transformers.TrainingArguments(
        output_dir=output_dir,
        warmup_steps=2,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=1,
        gradient_checkpointing=True,
        max_steps=500,
        learning_rate=2.5e-5,
        bf16=True,
        optim="paged_adamw_8bit",
        logging_steps=25,
        logging_dir="./logs",
        save_strategy="steps",
        save_steps=50,
        evaluation_strategy="steps",
        eval_steps=25,
        do_eval=True,
        report_to="wandb",
        run_name=f"{run_name}-{datetime.now().strftime('%Y-%m-%d-%H-%M')}",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
model.config.use_cache = False
trainer.train()
model.save_pretrained("./Model/")
# Evaluation
eval_prompt = """<s> ### User : Please create a json block for this input query text. Please do not add any explanation and show only json.
find green sarees below 3000 with zari border
### Response :
"""
eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
)
model_input = eval_tokenizer(eval_prompt, return_tensors="pt").to("cuda")
model.eval()
with torch.no_grad():
    generated_tokens = model.generate(**model_input, max_new_tokens=256, repetition_penalty=1.15)
response = eval_tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
print(response)
In the code above I didn't use any additional special tokens. Will that work fine? Also, the model is not determining price_min and price_max properly.
Please help me resolve this!