Hi,
I have been trying to write a preprocessing function that supports batching, but my latest attempt results in not all rows being processed (I count rows inside the function, and it only sees 8K out of 1.6M).
If anyone could guide me toward the best way to write such a function, I'd be grateful.
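For context, tokenizer, max_length, and the counter x come from the rest of my script; roughly the following (the model name and max_length here are just placeholders, my actual values differ):

from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model
if tokenizer.pad_token_id is None:
    tokenizer.pad_token = tokenizer.eos_token      # make sure padding is defined
max_length = 512                                   # fixed length I pad/truncate to
x = 0                                              # global counter of rows the function sees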
sentiment_dataset = load_dataset("sentiment140")
def processdata(examples):
    global x
    num_example = len(examples)
    x += num_example

    # Prepend the classification instruction to every text in the batch
    for i in range(num_example):
        examples['text'][i] = f"Classify the sentiment of the following text: {examples['text'][i]}\n"

    token_dict = {
        "input_ids": [[] for _ in range(num_example)],
        "attention_mask": [[] for _ in range(num_example)],
        "labels": [[] for _ in range(num_example)],
    }

    encoding = tokenizer(
        examples['text'],
        add_special_tokens=True,
        truncation=False,
    )

    # Loss masking: prompt positions get -100 (ignored by the loss),
    # followed by the tokenized label
    labels = [
        [-100] * len(encoding["input_ids"][i]) + tokenizer(examples["sentiment"][i]).input_ids
        for i in range(num_example)
    ]

    for i in range(num_example):
        token_dict["input_ids"][i].extend(encoding["input_ids"][i])
        token_dict["attention_mask"][i].extend(encoding["attention_mask"][i])
        token_dict["labels"][i].extend(labels[i])

        pad_length = max_length - len(token_dict["input_ids"][i])
        if pad_length < 0:
            # Truncates too long samples
            for key in ["input_ids", "attention_mask", "labels"]:
                token_dict[key][i] = token_dict[key][i][:pad_length]
        else:
            # Pads too short samples
            pad_token_id = tokenizer.pad_token_id
            token_dict["input_ids"][i].extend([pad_token_id] * pad_length)
            token_dict["attention_mask"][i].extend([0] * pad_length)
            token_dict["labels"][i].extend([-100] * pad_length)

    return token_dict
processed = sentiment_dataset.map(
    processdata,
    remove_columns=sentiment_dataset['train'].column_names,
    batched=True,
    load_from_cache_file=False,
)
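To compare row counts before and after the map call, I print both dataset dicts plus my counter:

print(sentiment_dataset)  # ~1.6M train rows, 498 test rows
print(processed)          # 8000 train rows, 5 test rows after map
print(x)                  # how many rows my counter registered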
‘processed’ above has only 8000 training rows and 5 test rows, whereas sentiment_dataset has 1.6M training rows and 498 test rows. I'm baffled.
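In case it helps anyone diagnose this, here is a tiny probe I plan to run to see exactly what a batched map function receives (sketch, separate from processdata):

def inspect_batch(examples):
    # each batch is a dict: column name -> list of values for that batch
    print(type(examples), list(examples.keys()), len(examples["text"]))
    return examples

sentiment_dataset["train"].select(range(10)).map(inspect_batch, batched=True)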