TypeError: forward() got an unexpected keyword argument 'token_type_ids'

I get the following error when finetuning Bert for classification. My forwad() function does not take any token inputs ?


PRE_TRAINED_MODEL_NAME = '/Bert/sm'
tokenizer = FlaubertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
PRE_TRAINED_MODEL = FlaubertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)


class FlauBertForSequenceClassification(FlaubertModel):
	"""
	FlauBert Model for Classification Tasks.

	"""
	def __init__(self, config, num_labels, freeze_encoder=False):

		"""
		@param    FlauBert: a FlauBertModel object
		@param    classifier: a torch.nn.Module classifier
		@param    freeze_encoder (bool): Set `False` to fine-tune the FlauBERT model
		
		"""

		# instantiate the parent class FlaubertModel
		super().__init__(config)
		
		# Specify hidden size of FB hidden size of our classifier, and number of labels

		# instantiate num. of classes
		self.num_labels = num_labels
		
		# instantiate and load a pretrained FlaubertModel 
		self.encoder = FlaubertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
		

		
		# freeze the encoder parameters if required (Q1)
		if freeze_encoder: 
		  for param in self.encoder.parameters():
			  param.requires_grad = False

		# the classifier: a feed-forward layer attached to the encoder's head
		self.classifier = torch.nn.Sequential(
											  torch.nn.Linear(in_features=config.emb_dim, out_features=512),
											  torch.nn.Tanh(),  # or nn.ReLU()
											  torch.nn.Dropout(p=0.1), 
											  torch.nn.Linear(in_features=512, out_features=self.num_labels, bias=True),
											  )
		# instantiate a dropout function for the classifier's input
		self.dropout = torch.nn.Dropout(p=0.1)


	def forward(
		self,
		input_ids=None,
		attention_mask=None,
		head_mask=None,
		inputs_embeds=None,
		labels=None,
		output_attentions=None,
		output_hidden_states=None,
	):
		# encode a batch of sequences
		encoder_output = self.encoder(
			input_ids=input_ids,
			attention_mask=attention_mask,
			head_mask=head_mask,
			inputs_embeds=inputs_embeds,
			output_attentions=output_attentions,
			output_hidden_states=output_hidden_states,
		)
		# extract the hidden representations from the encoder output
		hidden_state = encoder_output[0]  # (bs, seq_len, dim)
		pooled_output = hidden_state[:, 0]  # (bs, dim)
		# apply dropout
		pooled_output = self.dropout(pooled_output)  # (bs, dim)
		# feed into the classifier
		logits = self.classifier(pooled_output)  # (bs, dim)

		outputs = (logits,) + encoder_output[1:]
		
		if labels is not None:
			
			#multilabel
			#loss_fct = BCEWithLogitsLoss()
			#loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1, self.num_labels))
			
			#binaryclassification
			#loss_fct = torch.nn.CrossEntropyLoss()  #crossEntropyLoss
			#loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
			# aggregate outputs
			#outputs = (loss,) + outputs
			
			#multiclassification
			loss_fct = torch.nn.CrossEntropyLoss()  #crossEntropyLoss
			loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
			# aggregate outputs
			outputs = (loss,) + outputs

		return outputs  # (loss), logits, (hidden_states), (attentions)


# instantiate model
model = FlauBertForSequenceClassification(
    config=PRE_TRAINED_MODEL.config, num_labels=3, freeze_encoder = False
    )
# Create torch dataset
	class Dataset(torch.utils.data.Dataset):
		def __init__(self, encodings, labels=None):
			self.encodings = encodings
			self.labels = labels

		def __getitem__(self, idx):
			item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
			if self.labels:
				item["labels"] = torch.tensor(self.labels[idx])
			return item

		def __len__(self):
			return len(self.encodings["input_ids"])

	var, l = input_file(path1)
	
	X = list(var)
	y = list(l)
	
	X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
	X_train_tokenized = tokenizer(X_train, padding="max_length", truncation=True, max_length=512)
	print(X_train_tokenized)
	X_val_tokenized = tokenizer(X_val, padding="max_length", truncation=True, max_length=512)
 
	train_dataset = Dataset(X_train_tokenized, y_train)
	val_dataset = Dataset(X_val_tokenized, y_val)

	training_args = TrainingArguments(
	output_dir='/ghf/sm',          
	logging_dir='/ogs/sm',
	do_train=True,
	do_eval=False,
	evaluation_strategy="steps",
	logging_first_step=True,
	logging_steps=10,
	num_train_epochs=2.0,              
	per_device_train_batch_size=8,
	per_device_eval_batch_size=8,	
	learning_rate=5e-5,
	weight_decay=0.01
	)

	trainer = Trainer(
		model=model,                         
		args=training_args,                  
		train_dataset=train_dataset,
		eval_dataset=val_dataset,	
		compute_metrics=compute_metrics
		)
	print("Train")	
	trainer.train()

STACKTRACE

***** Running training *****
  Num examples = 536
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 34

  0%|          | 0/34 [00:00<?, ?it/s]Traceback (most recent call last):
  File "//finetuning.py", line 371, in <module>
    trainer.train()
  File "/ython3.9/site-packages/transformers/trainer.py", line 1269, in train
    tr_loss += self.training_step(model, inputs)
  File "/lw/.conda/envs/bert/lib/python3.9/site-packages/transformers/trainer.py", line 1754, in training_step
    loss = self.compute_loss(model, inputs)
  File "/luw/.conda/envs/bert/lib/python3.9/site-packages/transformers/trainer.py", line 1786, in compute_loss
    outputs = model(**inputs)
  File "uw/.conda/envs/bert/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "uw/.conda/envs/bert/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 168, in forward
    outputs = self.parallel_apply(replicas, inputs, kwargs)
  File "/uw/.conda/envs/bert/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py", line 178, in parallel_apply
    return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
  File "/uw/.conda/envs/bert/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 86, in parallel_apply
    output.reraise()
  File "uw/.conda/envs/bert/lib/python3.9/site-packages/torch/_utils.py", line 425, in reraise
    raise self.exc_type(msg)
TypeError: Caught TypeError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/uw/.conda/envs/bert/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
    output = module(*input, **kwargs)
  File "/uw/.conda/envs/bert/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
TypeError: forward() got an unexpected keyword argument 'token_type_ids'


Did you figure out a solution for this? Was it resolved?

1 Like

Yes I find it.

Can you please provide the solution