Getting pyarrow.lib.ArrowInvalid: Column 2 named start_positions expected length 1000 but got length 1

Hi,
I was following the question-answering tutorial in the HF Transformers docs, and although I have the exact same code as in the tutorial, I kept receiving a
pyarrow.lib.ArrowInvalid: Column 2 named start_positions expected length 1000 but got length 1
error when running the program.

This is the full traceback:

Map:   0%|                                                                                                                                            | 0/4000 [00:00<?, ? examples/s]
Traceback (most recent call last):
  File "/home/polina/repo/policyDQA/hflearning/questans.py", line 80, in <module>
    tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/dataset_dict.py", line 853, in map
    {
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/dataset_dict.py", line 854, in <dictcomp>
    k: dataset.map(
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 592, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 557, in wrapper
    out: Union["Dataset", "DatasetDict"] = func(self, *args, **kwargs)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3097, in map
    for rank, done, content in Dataset._map_single(**dataset_kwargs):
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_dataset.py", line 3493, in _map_single
    writer.write_batch(batch)
  File "/home/polina/.local/lib/python3.10/site-packages/datasets/arrow_writer.py", line 558, in write_batch
    pa_table = pa.Table.from_arrays(arrays, schema=schema)
  File "pyarrow/table.pxi", line 3798, in pyarrow.lib.Table.from_arrays
  File "pyarrow/table.pxi", line 2962, in pyarrow.lib.Table.validate
  File "pyarrow/error.pxi", line 100, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: Column 2 named start_positions expected length 1000 but got length 1

The problem seems to come from the step where the dataset ‘tokenized_squad’ is created by passing chunks of the ‘squad’ dataset to preprocess_function in batches of 1000 examples at a time.

But when I changed batched from True to False in the line
tokenized_squad = squad.map(preprocess_function, batched=False, remove_columns=squad["train"].column_names),
I got a new error:

TypeError: when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.

This is my full program:

from datasets import load_dataset

#create dataset named squad from first 5000 entries in squad dataset
squad = load_dataset("squad", split="train[:5000]")

#split squad dataset into 20% for testing and 80% for training
squad = squad.train_test_split(test_size=0.2)

#create a tokenizer using "distilbert-base-uncased" model
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    #strip surrounding whitespace from each question in the batch
    questions = [q.strip() for q in examples["question"]]

    #tokenize the question/context pairs
    inputs = tokenizer(
        questions, #text to be encoded
        examples["context"], #text to be encoded
        max_length=384, #max length of output
        truncation="only_second", #truncate 2nd sequence in a pair
        return_offsets_mapping=True, #return char start and end for each token
        padding="max_length", #pad to maximum length 384
    )

    #pop "offset_mapping" (the character start and end of each token) out of inputs
    offset_mapping = inputs.pop("offset_mapping")
    #the gold answers for each example in the batch
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    #for each example, find the start and end token indices of the answer
    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # otherwise label using the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions

        #assert set(len(column_values) for column_values in returned_batch.values()) == 1, "Mismatch in the number of elements"
        return inputs

#create dataset 'tokenized_squad' by passing chunks of dataset 'squad' in batches of 1000 examples at a time to preprocess_function
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

#create a data collator to form batches from lists of dataset elements
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()
    
#load 'distilbert' model for question-answering
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")

training_args = TrainingArguments(
    output_dir="my_awesome_qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

trainer.push_to_hub()

I have read satpalsr’s solution to a similar problem, as well as the Chapter 5 Datasets docs on the map method. But the length mismatch I’m getting isn’t 1000 vs. 1500; it’s 1000 vs. 1.

What am I doing wrong?

This error means that the values in the dictionary (inputs) returned by the map transform do not all have the same length (this assertion has to hold: assert len(set(len(x) for x in inputs.values())) == 1).
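
In your case the cause looks like an indentation slip: inputs["start_positions"] = start_positions, inputs["end_positions"] = end_positions, and return inputs sit inside the for loop, so preprocess_function returns after labelling only the first example of each batch. The tokenizer columns (input_ids and so on) then have length 1000 while start_positions has length 1, which matches the mismatch in the traceback. Below is a sketch of the function with those three lines de-dented so they run once per batch, after the loop; apart from that (and lighter comments) it is the same code you posted:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def preprocess_function(examples):
    # strip surrounding whitespace from each question in the batch
    questions = [q.strip() for q in examples["question"]]

    # tokenize the question/context pairs
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # if the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # otherwise label with the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    # outside the loop: one label per example, so every column has the same length
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

With that change, squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) should produce batches whose columns all have length 1000, and the ArrowInvalid error should go away. Keep batched=True: with batched=False each call receives a single example, so examples["question"] is one string, the list comprehension iterates over its characters, and that is why the tokenizer raises the text_pair TypeError you saw.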