Hi, I’m pretty new to Hugging Face and I don’t really have experience fine-tuning LLMs; specifically, I’m having a hard time preparing the dataset to use with the Trainer API.
I want to fine-tune google/gemma-3-1b-it to perform Extractive Question Answering, a problem where, given a specific piece of text, the model should identify where the answer lies in the provided text. I found the paper Adapting Pre-trained Generative Models for Extractive Question Answering, where the same problem is reformulated as: given these sentences and a question, tell me which sentences answer the question.
I am using a processed version of MultiSpanQA (the same dataset used in the paper) from which I extracted the question, sentences and answers, since the original dataset is designed for BIO tagging (…).
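To give an idea of the format, a single processed record looks roughly like this (the values below are made up just for illustration):
{
  "question": "When was the Eiffel Tower completed?",
  "sentences": ["The Eiffel Tower is located in Paris.", "It was completed in 1889."],
  "num_labels": 1,
  "answers": "1"
}
so the model is expected to output the indices of the sentences that contain the answer.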
Here is my code (only part of it)
# imports
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)

# unprocessed dataset
ds = load_dataset('json', data_files={
    'train': 'dataset/train.json',
    'validation': 'dataset/valid.json'
})
# ds['train']
# Dataset({
#     features: ['question', 'sentences', 'num_labels', 'answers'],
#     num_rows: 5230
# })
# model settings
base_model = 'google/gemma-3-1b-it'
attention_implementation = 'sdpa'  # or 'flash_attention_2'
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)
tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation=attention_implementation
)
# prepare dataset for training
system_prompt = """Your objective is to find the answers for a question in a list of sentences.
You should identify which sentences contain the answer to the question and output a list of numbers.
Each number represents the index of the sentence.
The output format is the following:
int, ..., int
"""
user_prompt = """**Sentences**:
{context}
**Question**: {question}
"""
def tokenize_dataset(item):
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt.format(context=item['sentences'], question=item['question'])}
    ]
    # tokenize the full chat (system + user prompt) as the model input
    tokenized_chat = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        padding=True,
        max_length=2048,
        truncation=True,
    )
    # tokenize the expected answer (the sentence indices) as the labels
    tokenized_labels = tokenizer(
        item['answers'],
        add_special_tokens=False,
        truncation=True,
        max_length=56
    )
    return {
        'input_ids': tokenized_chat,
        'attention_mask': [1] * len(tokenized_chat),
        'labels': tokenized_labels
    }
train_ds = ds['train'].shuffle(seed=15).select(range(300)).map(tokenize_dataset)
eval_ds = ds['validation'].shuffle(seed=15).select(range(100)).map(tokenize_dataset)
train_ds = train_ds.remove_columns(['question', 'sentences', 'num_labels', 'answers'])
eval_ds = eval_ds.remove_columns(['question', 'sentences', 'num_labels', 'answers'])
ft_model_id = 'gemma-3-1b-ft-extractqa'
# training settings
epochs = 1
batch_size = 8
gradient_accumulation_steps = 8 # -> effective batch size is 8 * 8 = 64
optimizer = 'paged_adamw_8bit'
# PEFT settings
lora_rank = 8
lora_alpha = 16
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.1,
    bias="none"
)
# backend settings
torch_empty_cache_steps = 4
torch_compile_backend = 'inductor' # apparently useless with T4 on colab
model = get_peft_model(model, lora_config)
args = TrainingArguments(
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optimizer,
    gradient_checkpointing=True,
    fp16=True,
    torch_empty_cache_steps=torch_empty_cache_steps,
    torch_compile_backend=torch_compile_backend,
    label_names=['labels'],
    report_to=None
)
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=eval_ds
)
trainer.train()
Now, when running that code I get the following error:
ValueError Traceback (most recent call last)
<ipython-input-87-3435b262f1ae> in <cell line: 0>()
----> 1 trainer.train()
8 frames
/usr/local/lib/python3.11/dist-packages/transformers/data/data_collator.py in torch_default_data_collator(features)
157 batch[k] = torch.from_numpy(np.stack([f[k] for f in features]))
158 else:
--> 159 batch[k] = torch.tensor([f[k] for f in features])
160
161 return batch
ValueError: expected sequence of length 164 at dim 1 (got 273)
I really think this is caused by how I’m processing the dataset, but I’m still not sure, since “expected sequence of length …” is vague to me; all I can tell is that it has something to do with how the dataset is prepared.
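My suspicion is that the tokenized prompts simply end up with different lengths for different examples, so they cannot be stacked into a single tensor. A quick check like this (just a diagnostic, the exact numbers will obviously depend on the examples) should show whether that is the case:
# print the tokenized prompt length of the first few training examples
print([len(ids) for ids in train_ds['input_ids'][:8]])
But even if that is the problem, I am not sure what the correct way to handle it with the Trainer is.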
I hope someone can point me to the solution.