Hey Guys,
I was trying to process a pair of sentences together using the tokenizer for the Longformer model, and the problem is that the token_type_ids
is always a list containing only zeros.
This is a snippet of what the data looks like.
This is the code to create my CustomDataset:
class PlagiarismDetectorDataset(Dataset):
    """Dataset pairing each training text with the original (source) text of its task.

    Each item tokenizes ``(review, original)`` as a sentence pair and returns the
    tokenizer encodings together with the plagiarism label.

    NOTE(review): Longformer's tokenizer is RoBERTa-based; like RoBERTa it does
    not use segment embeddings, so ``token_type_ids`` is always all zeros even
    for a sentence pair — the pair is separated by special tokens instead.

    Args:
        data: DataFrame with at least the columns ``Task``, ``Datatype``
            (``'train'`` / ``'orig'``), ``Text`` and ``Class``.
        tokenizer: a HuggingFace tokenizer (callable with ``text``/``text_pair``).
        max_token_len: truncation length passed to the tokenizer
            (4096 is Longformer's maximum).
    """

    def __init__(self, data: pd.DataFrame, tokenizer, max_token_len: int = 4096):
        self.data = data
        self.tokenizer = tokenizer
        self.max_token_len = max_token_len
        # Bug fix: the original __len__ counted only 'train' rows while
        # __getitem__ indexed the *whole* frame, so an index could land on an
        # 'orig' row or run past the end. Pre-filter the train rows once so
        # both methods agree.
        self.train_data = data[data['Datatype'] == 'train'].reset_index(drop=True)

    def __len__(self):
        # Number of training examples (not the full frame).
        return len(self.train_data)

    def __getitem__(self, item: int):
        data_row = self.train_data.iloc[item]
        review = data_row.Text
        # The untouched source document for the same task.
        original = self.data[(self.data['Task'] == data_row.Task)
                             & (self.data.Datatype == 'orig')]['Text'].iloc[0]
        label = data_row.Class
        encoding = self.tokenizer(
            text=review,
            text_pair=original,
            add_special_tokens=True,
            max_length=self.max_token_len,
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt")
        return dict(
            review=review,
            original=original,
            label=label,
            input_ids=encoding["input_ids"].flatten(),
            token_type_ids=encoding["token_type_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            # Bug fix: torch.DoubleTensor(label) interprets an int label as a
            # tensor *size* (uninitialized contents) and raises for floats;
            # wrap the value itself instead.
            labels=torch.tensor(label, dtype=torch.double)
        )
Finally, this the way to get a sample of the processed data:
from transformers import AutoTokenizer, LongformerForSequenceClassification

# Build the tokenizer for the pretrained Longformer checkpoint, wrap the
# raw frame in the dataset, and pull a single processed example to inspect.
checkpoint = 'allenai/longformer-base-4096'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
train_dataset = PlagiarismDetectorDataset(data=aux, tokenizer=tokenizer)
sample_data = train_dataset[0]
Based on the preprocessing video, I was expecting token_type_ids to contain
0s and 1s. However, the result I get is a list containing only zeros. What am I doing wrong?