import time

import torch
from torch.utils.data import Dataset
from datasets import load_dataset


class SQUAD(Dataset):
    def __init__(self):
        # Load the SQuAD training split; convert_to_features_context and
        # convert_to_features_question are my tokenization functions defined elsewhere.
        self.dataset = load_dataset("squad", split="train")
        self.encoded_context = self.dataset.map(
            convert_to_features_context, batched=True
        )
        self.encoded_question = self.dataset.map(
            convert_to_features_question, batched=True
        )
        # flatten() returns a new dataset, so the result has to be reassigned
        self.encoded_context = self.encoded_context.flatten()
        self.encoded_question = self.encoded_question.flatten()
        # Format the datasets to output torch.Tensor so a PyTorch model can train on them
        columns = ["input_ids", "start_positions", "end_positions"]
        self.encoded_context.set_format(type="torch", columns=columns)
        self.encoded_question.set_format(type="torch", columns=["input_ids"])
        self.length = len(self.encoded_context)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        t1 = time.time()
        input_ids_context = self.encoded_context["input_ids"][idx]
        print("context", time.time() - t1)  # ~0.38 s
        t2 = time.time()
        input_ids_question = self.encoded_question["input_ids"][idx]
        print("question", time.time() - t2)  # ~0.40 s
        t3 = time.time()
        start_positions = self.encoded_context["start_positions"][idx]
        print("start", time.time() - t3)  # ~0.40 s
        t4 = time.time()
        end_positions = self.encoded_context["end_positions"][idx]
        print("end", time.time() - t4)  # ~0.68 s
        return (
            input_ids_context,
            input_ids_question,
            start_positions,
            end_positions,
        )


train_dataset = SQUAD()
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=20)
for batch in dataloader:
    pass  # training step omitted
I wrote my own custom Dataset class and loaded the SQuAD dataset from Hugging Face.
The problem is that the __getitem__ function takes far too much time: fetching each tensor takes about 0.5 s on average.
With batch_size=20 that works out to roughly 0.5 * 4 * 20 = 40 s per batch, since __getitem__ returns 4 tensors.
That is clearly far too slow.
I suspect that wrapping a Hugging Face dataset inside a custom PyTorch Dataset might be causing the problem.
Either way, querying an entire column of the large SQuAD dataset on every __getitem__ call might be what takes so long.
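To show what I mean, here is a minimal standalone sketch (the ds name and the timed field are just for illustration, not my training code) comparing column-first access, which materializes the whole column before indexing, with row-first access, which fetches a single row:

import time
from datasets import load_dataset

ds = load_dataset("squad", split="train")

t = time.time()
_ = ds["question"][0]  # builds the entire 'question' column, then indexes it
print("column-first:", time.time() - t)

t = time.time()
_ = ds[0]["question"]  # fetches only row 0, then picks one field
print("row-first:", time.time() - t)

If that is the issue, the fix would presumably be to index the row first, e.g. self.encoded_context[idx]["input_ids"], rather than pulling the whole column on every call.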
How can I fix this?