I am trying to train a T5 model for binary classification. When I try to use the dataset class I defined, I get an error that points to this part of the transformers T5 code:
batch_size, seq_length = input_shape
1017 # required mask seq length can be calculated via length of past
1018 mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length
If I use unsqueeze(0) on input_ids and attention_mask, I get the following error:
ValueError: too many values to unpack (expected 2)
If I use squeeze(0), flatten(), or neither of them, I get:
ValueError: not enough values to unpack (expected 2, got 1)
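If I understand correctly, both messages come from T5 unpacking input_ids.shape into exactly two values, so whatever reaches the model has to be exactly 2-D. A minimal sketch that reproduces the same messages with plain tensors (the shapes are just examples):

import torch

# T5's forward effectively does: batch_size, seq_length = input_shape,
# so the tensor must be exactly 2-D.
x3 = torch.zeros(1, 61, 1000)          # 3-D, e.g. after an extra unsqueeze(0)
try:
    batch_size, seq_length = x3.shape
except ValueError as e:
    print(e)                           # too many values to unpack (expected 2)

x1 = torch.zeros(1000)                 # 1-D, e.g. after flatten() on a single example
try:
    batch_size, seq_length = x1.shape
except ValueError as e:
    print(e)                           # not enough values to unpack (expected 2, got 1)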
The dataset class is below:
…
    def __init__(self,
                 df: pd.DataFrame,
                 tokenizer: T5Tokenizer,
                 source_max_token_length: int = 1000,
                 target_max_token_length: int = 400):
        self.tokenizer = tokenizer
        self.df = df
        self.source_max_token_length = source_max_token_length
        self.target_max_token_length = target_max_token_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index: int):
        data_row = self.df.iloc[index]
        source_encoding = self.tokenizer.batch_encode_plus(
            str(data_row['question']),
            # str(data_row['context']),
            max_length=self.source_max_token_length,
            padding='max_length',
            truncation='only_second',
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        target_encoding = self.tokenizer.batch_encode_plus(
            str(data_row['answers']),
            max_length=self.target_max_token_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )
        labels = target_encoding['input_ids']
        labels[labels == 0] = -100  # ignore pad tokens (id 0) in the loss
        return dict(
            question=data_row['question'],
            # context=data_row['context'],
            answer_text=data_row['answers'],
            # input_ids=np.array([source_encoding['input_ids']]),            # also tried this
            # attention_mask=np.array([source_encoding['attention_mask']]),
            # labels=labels
            input_ids=source_encoding['input_ids'].unsqueeze(0),             # also tried .squeeze(0) and .flatten()
            attention_mask=source_encoding['attention_mask'].unsqueeze(0),
            labels=labels.flatten()
        )
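For reference, this is roughly how I inspect what comes out of the dataset and out of a default DataLoader (the class and variable names here are placeholders for my actual ones):

from torch.utils.data import DataLoader

dataset = QADataset(df, tokenizer)          # placeholder name for the class above
sample = dataset[0]
print(sample['input_ids'].shape)            # shape of a single example

loader = DataLoader(dataset, batch_size=4)
batch = next(iter(loader))
print(batch['input_ids'].shape)             # shape after default collation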
I even tried debugging by running the tokenizer call directly on one row:
example = tokenizer.batch_encode_plus(
    str(data_row['question']),
    # str(data_row['context']),
    max_length=1000,
    padding='max_length',
    truncation='only_second',
    return_attention_mask=True,
    add_special_tokens=True,
    return_tensors='pt'
)
The shape of the resulting example is:

example['input_ids'].shape
torch.Size([61, 1000])
I think I should not need unsqueeze(0) or flatten() at all, but nothing I have tried works.
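For comparison, a plain 2-D batch of shape [batch_size, seq_length] goes through the model without the unpacking error (a minimal sketch; t5-small and the dummy token ids are only for illustration):

import torch
from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("t5-small")

input_ids = torch.ones(4, 1000, dtype=torch.long)       # [batch_size, seq_length]
attention_mask = torch.ones(4, 1000, dtype=torch.long)
labels = torch.ones(4, 400, dtype=torch.long)

out = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
print(out.loss)                                         # no unpacking error here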
sagemaker: 2.221.0
transformers: 4.41.0
pytorch_lightning: 1.1.3