I'm fine-tuning a GPT-2 model. Because the dataset contains negative examples, I had to modify the data collator to handle a third element, negative_mask.
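Each entry in my dataset is a dict with a 'text' string and a 'negative' flag marking whether it is one of those negative examples; roughly like this (the actual strings here are made up, just to show the shape):

examples = [
    {'text': "an ordinary training sentence", 'negative': 0},
    {'text': "a sentence that should be treated as a negative example", 'negative': 1},
]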
The dataset class extends PyTorch's base Dataset:
import torch
from torch.utils.data import Dataset

class CustomTextDataset(Dataset):
    def __init__(self, tokenizer, examples, block_size=128):
        self.examples = examples
        self.tokenizer = tokenizer
        self.block_size = block_size
        texts = [example['text'] for example in examples]
        negative_masks = [example['negative'] for example in examples]
        self.encodings = tokenizer(texts, padding=True, truncation=True, max_length=self.block_size, return_tensors='pt')
        self.negative_masks = torch.tensor(negative_masks, dtype=torch.float)
        print(f"Encodings: {self.encodings}")
        print(f"Negative masks: {self.negative_masks}")

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['negative_mask'] = self.negative_masks[idx]
        if 'negative_mask' not in item:  # debug check, never triggers
            print(f"Error: negative_mask is missing in dataset item {idx}")
        print(f"Dataset item {idx}: {item}")
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])
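For completeness, a minimal standalone check of the dataset on its own looks roughly like this (assuming the standard GPT-2 tokenizer with its pad token set to the EOS token, which matches the 50256 padding visible in the output further down; the examples list is the illustrative one from above):

from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

dataset = CustomTextDataset(tokenizer, examples)
print(dataset[0].keys())  # 'input_ids', 'attention_mask' and 'negative_mask' are all present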
At this stage, negative_mask is still present, based on the debug output:
Negative masks: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0.,
0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
However, by the time the CustomDataCollator class is called:
from transformers import DataCollatorForLanguageModeling

class CustomDataCollator:
    def __init__(self, tokenizer, mlm=False, mlm_probability=0.15):
        self.tokenizer = tokenizer
        self.mlm = mlm
        self.mlm_probability = mlm_probability
        self.data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm, mlm_probability=mlm_probability)

    def __call__(self, features):
        print("Features received by CustomDataCollator:", features)
        for i, feature in enumerate(features):  # debug
            if 'negative_mask' not in feature:
                print(f"Feature {i} is missing negative_mask: {feature}")
                raise ValueError(f"negative_mask is missing in feature {i}: {feature}")  # raised here -----
        input_ids = torch.stack([f['input_ids'] for f in features])
        attention_mask = torch.stack([f['attention_mask'] for f in features])
        negative_mask = torch.tensor([f['negative_mask'] for f in features], dtype=torch.float)
        batch = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'negative_mask': negative_mask
        }
        if 'labels' in features[0]:
            batch['labels'] = torch.stack([f['labels'] for f in features])
        print(f"Batch prepared in CustomDataCollator: {batch}")
        return batch
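To narrow down where the key disappears, one check worth doing is calling the collator by hand on items taken straight from the dataset, outside of any training loop (reusing the tokenizer and dataset from the sketch above):

collator = CustomDataCollator(tokenizer)
manual_batch = collator([dataset[0], dataset[1]])
print(manual_batch['negative_mask'])

If that direct call goes through but the error still appears during training, then whatever sits between the dataset and the collator in the training pipeline must be dropping the key.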
The exception is raised every time, indicating that negative_mask is getting lost somewhere. Considering that the features come from the Dataset class, and its debug output confirms that negative_mask is present there, I really cannot see how it could be dropped by the time the data collator is called. The exception raised is:
ValueError: negative_mask is missing in feature 0: {'input_ids': tensor([ 70, 4024, 10163, 11, 23624, 6188, 25, 1210, 826, 9087,
20064, 11, 5529, 1944, 20334, 685, 45, 7156, 37045, 60,
50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0])}
Any help is greatly appreciated.