Dataset Tokenization to Fine-Tune Gemma 3 1B

Hi, I’m pretty new to Hugging Face and I don’t really have experience fine-tuning LLMs; specifically, I’m having a hard time preparing the dataset for use with the Trainer API.

I want to fine-tune google/gemma-3-1b-it to perform Extractive Question Answering, a task where, given a specific piece of text, the model should identify where the answer lies in the provided text. I found the paper Adapting Pre-trained Generative Models for Extractive Question Answering, where the same problem is reframed as: given a list of sentences and a question, tell me which sentences answer the question.

I am using a processed version of MultiSpanQA (the same dataset as in the paper), from which I extracted the question, sentences, and answers, since the original dataset is designed for BIO tagging (…).
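For reference, a single processed item looks roughly like this (the values here are invented purely for illustration; the field names match the dataset features shown in the code below):

# hypothetical processed item (illustrative values only)
item = {
    'question': 'When was the company founded?',
    'sentences': '0: The company was founded in 1998. 1: It went public in 2004.',
    'num_labels': 1,
    'answers': '0'
}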

Here is my code (only part of it):

# unprocessed dataset
ds = load_dataset('json', data_files={
    'train': 'dataset/train.json',
    'validation': 'dataset/valid.json'
})
# ds['train']
# Dataset({
#     features: ['question', 'sentences', 'num_labels', 'answers'],
#     num_rows: 5230
# })

# model settings
base_model = 'google/gemma-3-1b-it'
attention_implementation = 'sdpa' # or 'flash_attention_2'
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(base_model, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map="auto",
    quantization_config=quantization_config,
    attn_implementation=attention_implementation
)

# prepare dataset for training
system_prompt = """Your objective is to find the answers for a question in list of sentences.
You should identify what sentences contain the answer for the question and output a list of numbers.

Each number represents the index of the sentence.
The output format is the following:
int, ..., int
"""

user_prompt = """**Sentences**:
{context}

**Question**: {question}
"""

def tokenize_dataset(item):
    messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_prompt.format(context=item['sentences'], question=item['question'])}
    ]

    tokenized_chat = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        padding=True,  
        max_length=2048,
        truncation=True,
    )

    tokenized_labels = tokenizer(
        item['answers'], 
        add_special_tokens=False,
        truncation=True,
        max_length=56
    )

    return {
        'input_ids': tokenized_chat,
        'attention_mask': [1] * len(tokenized_chat),
        'labels': tokenized_labels
    }

train_ds = ds['train'].shuffle(seed=15).select(range(300)).map(tokenize_dataset)
eval_ds = ds['validation'].shuffle(seed=15).select(range(100)).map(tokenize_dataset)

train_ds = train_ds.remove_columns(['question', 'sentences', 'num_labels', 'answers'])
eval_ds = eval_ds.remove_columns(['question', 'sentences', 'num_labels', 'answers'])


ft_model_id = 'gemma-3-1b-ft-extractqa'

# training settings
epochs = 1
batch_size = 8
gradient_accumulation_steps = 8 # -> effective batch size is 8 * 8 = 64
optimizer = 'paged_adamw_8bit'

# PEFT settings
lora_rank = 8
lora_alpha = 16
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    target_modules=['q_proj', 'v_proj'],
    lora_dropout=0.1,
    bias="none"
)

# backend settings
torch_empty_cache_steps = 4
torch_compile_backend = 'inductor' # apparently useless with T4 on colab

model = get_peft_model(model, lora_config)

args = TrainingArguments(
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optimizer,
    gradient_checkpointing=True,
    fp16=True,
    torch_empty_cache_steps=torch_empty_cache_steps,
    torch_compile_backend=torch_compile_backend,
    label_names=['labels'],
    report_to=None
)

trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=eval_ds
)

trainer.train()

Now, when running that code I get the following error:

ValueError                                Traceback (most recent call last)
<ipython-input-87-3435b262f1ae> in <cell line: 0>()
----> 1 trainer.train()

8 frames
/usr/local/lib/python3.11/dist-packages/transformers/data/data_collator.py in torch_default_data_collator(features)
    157                 batch[k] = torch.from_numpy(np.stack([f[k] for f in features]))
    158             else:
--> 159                 batch[k] = torch.tensor([f[k] for f in features])
    160 
    161     return batch

ValueError: expected sequence of length 164 at dim 1 (got 273)

I really think this is caused by how I’m processing the dataset; however, I am still not sure, since “expected sequence of length …” is vague to me. All I can tell is that it has something to do with how the dataset is processed.
I hope someone can point me to the solution.


This error occurs when the data collator is unable to collate the batch as expected: the default collator tries to stack your tokenized examples into a single tensor, but they have different lengths (164 vs. 273 tokens here) because nothing pads them to a common length (padding=True in apply_chat_template has no effect when each example is tokenized on its own). There are multiple possible solutions, but I think the smartest thing to do would be to either use an existing data collator that pads dynamically or write a new one.
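If you prefer to keep your current prompt/label tokenization, keep in mind that for a decoder-only model like Gemma the labels are normally expected to have the same length as input_ids, with prompt positions set to -100 so only the answer tokens contribute to the loss. A minimal sketch of that idea plus a padding collator could look like the following (build_example and pad_collator are just illustrative names, not library functions):

import torch

# Illustrative sketch: append the answer tokens to the prompt tokens and mask
# the prompt part of the labels with -100, then pad each batch in a collator.
def build_example(prompt_ids, answer_ids, eos_id):
    input_ids = prompt_ids + answer_ids + [eos_id]
    labels = [-100] * len(prompt_ids) + answer_ids + [eos_id]
    return {
        'input_ids': input_ids,
        'attention_mask': [1] * len(input_ids),
        'labels': labels,
    }

def pad_collator(features, pad_id):
    # Pad every field to the longest sequence in the batch.
    max_len = max(len(f['input_ids']) for f in features)
    batch = {'input_ids': [], 'attention_mask': [], 'labels': []}
    for f in features:
        pad = max_len - len(f['input_ids'])
        # left padding, to match padding_side="left" in your tokenizer
        batch['input_ids'].append([pad_id] * pad + f['input_ids'])
        batch['attention_mask'].append([0] * pad + f['attention_mask'])
        batch['labels'].append([-100] * pad + f['labels'])
    return {k: torch.tensor(v) for k, v in batch.items()}

You would then pass something like data_collator=lambda feats: pad_collator(feats, tokenizer.pad_token_id) to the Trainer.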

Alternatively, you could leave tokenization entirely to the Trainer, but that would mean a significant change to the current code, so I’m not sure whether that is the right path here. In the meantime, here is a version of your script that relies on max-length padding and an existing collator (DataCollatorForLanguageModeling):

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
import torch

# Load the JSON dataset
dataset = load_dataset('json', data_files={
    'train': 'dataset/train.json',
    'validation': 'dataset/valid.json'
})

# Base Gemma tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-1b-it")

# Tokenization function
def tokenize_example(example):
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f"Question: {example['question']}\nSentences: {example['sentences']}"}
    ]
    # Build the chat-style text
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False)

    # Tokenize the input
    model_inputs = tokenizer(chat_text, padding="max_length", truncation=True, max_length=2048)

    # Tokenize the expected answer to the question
    labels = tokenizer(example["answers"], padding="max_length", truncation=True, max_length=2048)

    # Add the labels to the inputs
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply tokenization to the whole dataset
tokenized_dataset = dataset.map(tokenize_example, remove_columns=dataset["train"].column_names)

# Load the base model
model = AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")

# Use a data collator that handles padding automatically
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=2,
    report_to="none"
)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()