Hi,
I was following the question-answering tutorial from the HF Transformers docs, and though I have the exact same code as in the tutorial, I kept receiving a
pyarrow.lib.ArrowInvalid: Column 2 named start_positions expected length 1000 but got length 1
error when running the program.
This is the full traceback:
Map: 0%| | 0/4000 [00:00<?, ? examples/s]
Traceback (most recent call last):
  File "/home/polina/repo/policyDQA/hflearning/questans.py", line 80, in <module>
    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/dataset_dict.py", line 853, in map
    {
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/dataset_dict.py", line 854, in <dictcomp>
    k: dataset.map(
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3097, in map
    for rank, done, content in Dataset._map_single(**dataset_kwargs):
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3493, in _map_single
    writer.write_batch(batch)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_writer.py", line 558, in write_batch
    pa_table = pa.Table.from_arrays(arrays, schema=schema)
  File "pyarrow/table.pxi", line 3798, in pyarrow.lib.Table.from_arrays
  File "pyarrow/table.pxi", line 2962, in pyarrow.lib.Table.validate
  File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Column 2 named start_positions expected length 1000 but got length 1
The problem seems to arise when the dataset 'tokenized_squad' is created by passing chunks of the 'squad' dataset to preprocess_function in batches of 1000 examples at a time.
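As far as I understand, with batched=True, map hands preprocess_function a dict of lists and expects every column in the returned dict to have the batch's length; a column of a different length is exactly what trips the Arrow check. A minimal sketch of that contract (toy data, not my actual code):

from datasets import Dataset

toy = Dataset.from_dict({"text": ["a", "b", "c"]})

def bad_batch(examples):
    # the batch has 3 rows, but the returned column has length 1,
    # which triggers the same kind of ArrowInvalid error as above
    return {"label": [0]}

# toy.map(bad_batch, batched=True)  # raises pyarrow.lib.ArrowInvalid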
But when I changed batched from True to False in the line
tokenized_squad = squad.map(preprocess_function, batched=False, remove_columns=squad["train"].column_names)
I got a new error:
TypeError: when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.
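If I understand this second error correctly, it makes sense: with batched=False, examples["question"] is a single string, so the list comprehension in preprocess_function iterates over its characters, and the tokenizer then receives a list of single characters as text but one plain string as text_pair. A quick standalone illustration of what I mean:

example = {"question": " Who wrote it? ", "context": "Some context."}
questions = [q.strip() for q in example["question"]]
print(questions[:5])  # ['', 'W', 'h', 'o', ''] -- characters, not questions
# tokenizer(questions, example["context"], ...) then fails because
# text is a list of 15 items while text_pair is a single string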
This is my full program:
from datasets import load_dataset
#create dataset named squad from first 5000 entries in squad dataset
squad = load_dataset("squad", split="train[:5000]")
#split squad dataset into 20% for testing and 80% for training
squad = squad.train_test_split(test_size=0.2)
#create a tokenizer using "distilbert-base-uncased" model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def preprocess_function(examples):
    #strip the spaces around each question in the batch and collect them into list 'questions'
    questions = [q.strip() for q in examples["question"]]
    #tokenize 'questions' paired with examples["context"]
    inputs = tokenizer(
        questions, #text to be encoded
        examples["context"], #text to be encoded
        max_length=384, #max length of output
        truncation="only_second", #truncate 2nd sequence in a pair
        return_offsets_mapping=True, #return char start and end for each token
        padding="max_length", #pad to maximum length 384
    )
    #remove "offset_mapping" (storing each subtoken's start and end position) from 'inputs'
    offset_mapping = inputs.pop("offset_mapping")
    #get the answers for the batch
    answers = examples["answers"]
    start_positions = []
    end_positions = []
    #traverse 'offset_mapping', one offset list per example
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)
        # find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1
        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # otherwise label using the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    #assert set(len(column_values) for column_values in returned_batch.values()) == 1, "Mismatch in the number of elements"
    return inputs
#create dataset 'tokenized_squad' by passing chunks of dataset 'squad' in batches of 1000 examples at a time to preprocess_function
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
#create a data collator object to form a batch from a list of dataset elements
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()
#load 'distilbert' model for question-answering
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)
trainer.train()
trainer.push_to_hub()
I have read satpalsr's solution to a similar problem, as well as the Ch. 5 Datasets doc about the map method. But the shape mismatch in my error isn't 1000 vs. 1500; it's a column of length 1 where 1000 was expected.
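For completeness, a length check along these lines just before return inputs should show which column ends up with length 1 (hypothetical debugging code, same idea as my commented-out assert):

# inside preprocess_function, just before `return inputs`:
for key in ["input_ids", "attention_mask", "start_positions", "end_positions"]:
    print(key, len(inputs[key]))  # every column should match the batch size (1000)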
What am I doing wrong?