I am currently trying to use extra steps for the tokenisation:
token_ids = []
mask_ids = []
seg_ids = []
y = []
for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
MAX_LEN = 255
premise_id = self.tokenizer.encode(premise, add_special_tokens = False, max_length=MAX_LEN, truncation=True)
demarkator_premise_id = self.tokenizer.encode('{', add_special_tokens = False, max_length=MAX_LEN, truncation=True)
hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False, max_length=MAX_LEN, truncation=True)
demarkator_hypothesis_id = self.tokenizer.encode('}', add_special_tokens = False, max_length=MAX_LEN, truncation=True)
pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + demarkator_premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + demarkator_hypothesis_id + [self.tokenizer.sep_token_id]
premise_len = len(premise_id)
hypothesis_len = len(hypothesis_id)
segment_ids = torch.tensor([0] * (premise_len + 3) + [1] * (hypothesis_len + 2)) # sentence 0 and sentence 1
attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 5)) # mask padded values
token_ids.append(torch.tensor(pair_token_ids))
seg_ids.append(segment_ids)
mask_ids.append(attention_mask_ids)
y.append(self.label_dict[label])
token_ids = pad_sequence(token_ids, batch_first=True)
mask_ids = pad_sequence(mask_ids, batch_first=True)
seg_ids = pad_sequence(seg_ids, batch_first=True)
y = torch.tensor(y)
dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
data_loader = DataLoader(
dataset ,
shuffle=shuffle,
batch_size=batch_size
)
return data_loader
How can I use a data collator to perform my own steps (adding the special demarcator characters to premise–hypothesis pairs) after tokenization?