I keep receiving the following warning when training the deberta-v3 model.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the __call__ method is faster than using a method to encode the text followed by a call to the pad method to get a padded encoding.
I want to pad the inputs dynamically to the longest sequence in each batch, so I encode the text first and then use a DataCollatorWithPadding for the padding. How can I combine these two steps into a single __call__ while keeping the dynamic padding? I sketched what I have in mind after my current code below.
My code is as follows:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DataCollatorWithPadding


def prepare_input(cfg, text):
    # Tokenize a single text; no padding here, padding is done later by the collator
    inputs = cfg.tokenizer(
        text,
        return_tensors=None,
        return_token_type_ids=False,
        add_special_tokens=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df["full_text"].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        item = prepare_input(self.cfg, self.texts[i])
        item["labels"] = torch.tensor(self.labels[i], dtype=torch.float)
        return item
collate_fn = DataCollatorWithPadding(tokenizer=CFG.tokenizer, padding="longest")

trn_ds = TrainDataset(CFG, trn_folds)
trn_loader = DataLoader(
    trn_ds,
    batch_size=CFG.batch_size,
    shuffle=True,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=True,
    collate_fn=collate_fn,
)
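Is something like the following what the warning is pointing at? This is an untested sketch of what I have in mind: the Dataset returns the raw text and labels, and a custom collate function tokenizes the whole batch, so a single __call__ with padding="longest" does both the encoding and the dynamic padding. RawTextDataset and tokenize_collate are just placeholder names I made up; CFG and trn_folds are the same objects as above.

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader


class RawTextDataset(Dataset):
    def __init__(self, cfg, df):
        self.texts = df["full_text"].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, i):
        # Return raw text; tokenization happens later, once per batch
        return self.texts[i], self.labels[i]


def tokenize_collate(batch):
    texts, labels = zip(*batch)
    # One tokenizer __call__ encodes the whole batch and pads it
    # dynamically to the longest sequence in that batch
    inputs = CFG.tokenizer(
        list(texts),
        add_special_tokens=True,
        return_token_type_ids=False,
        padding="longest",
        return_tensors="pt",
    )
    inputs["labels"] = torch.tensor(np.stack(labels), dtype=torch.float)
    return inputs


trn_loader = DataLoader(
    RawTextDataset(CFG, trn_folds),
    batch_size=CFG.batch_size,
    shuffle=True,
    num_workers=CFG.num_workers,
    pin_memory=True,
    drop_last=True,
    collate_fn=tokenize_collate,
)

I'm mainly unsure whether moving the tokenization into the collate function like this is the intended pattern, or whether it causes problems with num_workers > 0.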