Map function skipping rows (only 8k out of 1.6M rows)

Hi,
I have been trying to write a preprocess function which supports batching but my final attempt yields in not all rows being processed (I counted in the function, it gets only 8K rows out of 1.6M)

If anyone could please guide me with respect to the best way to write such a function Iā€™d be grateful.

sentiment_dataset = load_dataset("sentiment140")

def processdata(examples):
    global x
    num_example = len(examples)
    x+= num_example

    for i in range(num_example):
        examples['text'][i] = f"Classify the sentiment of the following text: {examples['text'][i]}\n"
    token_dict = {
        "input_ids": [[] for _ in range(num_example)],
        "attention_mask": [[] for _ in range(num_example)],
        "labels": [[] for _ in range(num_example)],
    }
    encoding = tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=False,
    )
    labels = [
        [-100] * len(encoding["input_ids"][i]) + tokenizer(examples["sentiment"][i]).input_ids
          for i in range(num_example)
    ]
    for i in range(num_example):
        token_dict["input_ids"][i].extend(
            encoding["input_ids"][i]
        )
        token_dict["attention_mask"][i].extend(
            encoding["attention_mask"][i]
        )
        token_dict["labels"][i].extend(labels[i])

        pad_length = max_length - len(token_dict["input_ids"][i])

        if pad_length < 0:
            # Truncates too long samples
            for key in ["input_ids", "attention_mask", "labels"]:
                token_dict[key][i] = token_dict[key][i][:pad_length]
        else:
            # Pads too short samples
            pad_token_id = tokenizer.pad_token_id
            token_dict["input_ids"][i].extend(
                [pad_token_id for _ in range(pad_length)]
            )
            token_dict["attention_mask"][i].extend(
                [0 for _ in range(pad_length)]
            )
            token_dict["labels"][i].extend(
                [-100 for _ in range(pad_length)]
            )

processed = sentiment_dataset.map(processdata,remove_columns=sentiment_dataset['train'].column_names, batched=True, load_from_cache_file=False)

ā€˜processedā€™ above has only 8000 training rows and 5 test rows whereas the sentiment_dataset has 1.6M training and 498 test rows. Im baffled.

EDIT: This was solved. Apparently map evaluates batches lazily so unless you do len(examples[column_name]) it just returns 5.
Fixed code below for anyone who encounters something like this later:

def processdata(examples):
    global x
    num_example = len(examples['text'])
    x+= num_example

    for i in range(num_example):
        examples['text'][i] = f"Classify the sentiment of the following text: {examples['text'][i]}\n"
    token_dict = {
        "input_ids": [[] for _ in range(num_example)],
        "attention_mask": [[] for _ in range(num_example)],
        "labels": [[] for _ in range(num_example)],
    }
    encoding = tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=False,
    )
    labels = [
        [-100] * len(encoding["input_ids"][i]) + tokenizer(examples["sentiment"][i]).input_ids
          for i in range(num_example)
    ]
    for i in range(num_example):
        token_dict["input_ids"][i].extend(
            encoding["input_ids"][i]
        )
        token_dict["attention_mask"][i].extend(
            encoding["attention_mask"][i]
        )
        token_dict["labels"][i].extend(labels[i])

        pad_length = max_length - len(token_dict["input_ids"][i])

        if pad_length < 0:
            # Truncates too long samples
            for key in ["input_ids", "attention_mask", "labels"]:
                token_dict[key][i] = token_dict[key][i][:pad_length]
        else:
            # Pads too short samples
            pad_token_id = tokenizer.pad_token_id
            token_dict["input_ids"][i].extend(
                [pad_token_id for _ in range(pad_length)]
            )
            token_dict["attention_mask"][i].extend(
                [0 for _ in range(pad_length)]
            )
            token_dict["labels"][i].extend(
                [-100 for _ in range(pad_length)]
            )