The dataset contains only non-tokenized data under a column named "text". When initializing the SFTTrainer, I set the "tokenizer" parameter to my tokenizer and "dataset_text_field" to "text".
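For context, the training data is just plain strings; it has roughly this shape (a minimal sketch with placeholder rows, not my actual data):

import pandas as pd

# Placeholder rows -- the real data is different text, but the shape is the same:
# a single untokenized "text" column of plain strings.
raw_data = pd.DataFrame({
    "text": [
        "first raw training example as plain text ...",
        "second raw training example as plain text ...",
    ]
})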
A snippet of my code:
from torch.utils.data import Dataset
from accelerate import Accelerator
from trl import SFTConfig, SFTTrainer
from transformers import get_linear_schedule_with_warmup

class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # each item is a dict holding the raw, untokenized text
        input_text = self.data.dataset.iloc[idx]
        batch = {
            "text": input_text["text"]
        }
        return batch
current_device = Accelerator().local_process_index
# Define training arguments
training_args = SFTConfig(
    output_dir=datasetPath,
    # overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_total_limit=5,
    evaluation_strategy="steps",
    save_strategy="epoch",
    # save_steps=5000,
    eval_steps=50,
    logging_dir=datasetPath,
    logging_strategy="steps",
    logging_steps=10,
    do_eval=True,
    do_train=True,
    learning_rate=5e-4,
    adam_epsilon=1e-08,
    warmup_steps=100,
    eval_accumulation_steps=1,
    gradient_checkpointing=False,
    auto_find_batch_size=False,
    gradient_accumulation_steps=1,
    dataloader_drop_last=True,
    save_safetensors=False,
    dataset_text_field="text",
)
# Define optimizer
from torch.optim import AdamW as PyTorchAdamW
params = model.parameters()
# Define the optimizer with specified parameters
optimizer = PyTorchAdamW(
    params,
    lr=5e-4,
    # betas=(0.9, 0.999), ## the default value
    eps=1e-08,
    # weight_decay=0.1, ## maybe later
    # correct_bias=True,
)
t_total = (len(train_dataloader) // training_args.gradient_accumulation_steps) * training_args.num_train_epochs
# Create the scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=t_total,
)
# Create Trainer instance
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    optimizers=(optimizer, scheduler),
    max_seq_length=768,
    tokenizer=tokenizer,
)
trainer.train()
The error:
/usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py in pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
   3297         # The model's main input name, usually `input_ids`, has be passed for padding
   3298         if self.model_input_names[0] not in encoded_inputs:
-> 3299             raise ValueError(
   3300                 "You should supply an encoding or a list of encodings to this method "
   3301                 f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided