I run the following code and it outputs an error.
import torch
import torch.nn as nn

def prepare_calibration_input(model, dataloader, device):
    use_cache = model.config.use_cache
    model.config.use_cache = False
    layers = model.model.layers

    # dev = model.hf_device_map["model.embed_tokens"]
    if "model.embed_tokens" in model.hf_device_map:
        device = model.hf_device_map["model.embed_tokens"]

    dtype = next(iter(model.parameters())).dtype
    # Buffer for the hidden states entering the first transformer layer (128 calibration samples)
    inps = torch.zeros((128, model.seqlen, model.config.hidden_size), dtype=dtype, device=device)
    inps.requires_grad = False
    cache = {'i': 0, 'attention_mask': None, 'position_ids': None}

    class Catcher(nn.Module):
        def __init__(self, module):
            super().__init__()
            self.module = module

        # def forward(self, inp, **kwargs):
        #     inps[cache['i']] = inp
        #     cache['i'] += 1
        #     cache['attention_mask'] = kwargs['attention_mask']
        #     cache['position_ids'] = kwargs['position_ids']
        #     raise ValueError

        def forward(self, inp, **kwargs):
            inps[cache['i']] = inp
            cache['i'] += 1
            cache['attention_mask'] = kwargs.get('attention_mask', None)
            cache['position_ids'] = kwargs.get('position_ids', None)
            raise ValueError  # intentionally raise an exception to stop the forward pass

    layers[0] = Catcher(layers[0])

    # for batch in dataloader:
    #     try:
    #         model(batch[0].to(device))
    #     except ValueError:
    #         pass

    # Check whether an attention_mask is provided by the dataloader
    for batch in dataloader:
        if 'attention_mask' not in batch or batch['attention_mask'] is None:
            raise ValueError("attention_mask is missing in the dataloader")
        try:
            model(batch[0].to(device))
        except ValueError:
            continue  # continue, since the error is expected

    layers[0] = layers[0].module
    outs = torch.zeros_like(inps)
    attention_mask = cache['attention_mask']
    position_ids = cache['position_ids']
    model.config.use_cache = use_cache
    return inps, outs, attention_mask, position_ids
So why is there no attention_mask in the dataloader? Is this a version issue with the datasets library?
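For reference, the calibration dataloader is built roughly like the sketch below. This is only my assumption of what the loader looks like (a wanda-style loader that returns a plain Python list of (input_ids, target) tuples sampled from C4); the function name get_calibration_loader and the exact dataset/tokenizer arguments are placeholders, not the exact code I ran:

```python
import random
from datasets import load_dataset

def get_calibration_loader(tokenizer, nsamples=128, seqlen=2048, seed=0):
    """Sketch of a wanda-style calibration loader: a plain list of
    (input_ids, target) tuples; no dicts and no attention_mask key."""
    traindata = load_dataset(
        'allenai/c4',
        data_files={'train': 'en/c4-train.00000-of-01024.json.gz'},
        split='train',
    )
    random.seed(seed)
    loader = []
    for _ in range(nsamples):
        # keep sampling documents until one is long enough for a full window
        while True:
            i = random.randint(0, len(traindata) - 1)
            enc = tokenizer(traindata[i]['text'], return_tensors='pt')
            if enc.input_ids.shape[1] > seqlen:
                break
        start = random.randint(0, enc.input_ids.shape[1] - seqlen - 1)
        inp = enc.input_ids[:, start:start + seqlen]
        tar = inp.clone()
        tar[:, :-1] = -100  # mask out everything except the last position
        loader.append((inp, tar))
    return loader
```

Each batch in this sketch is just an (input_ids, target) tuple of tensors, so batch[0] is the input_ids tensor passed to model(...) above.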