item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()} raises AttributeError: 'list' object has no attribute 'items'

I am trying to classify a file using a model obtained from fine-tuning BertForSequenceClassification.

I get this error when loading a new test file (it has no labels, since it is unseen data).

import torch


# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        # encodings is expected to be a dict of lists, e.g. {"input_ids": [...], "attention_mask": [...]}
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # Build one sample by taking the idx-th element of every encoding field
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])
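
For context, __getitem__ assumes that encodings is a dictionary of lists, with one list per tokenizer field. A minimal sketch of that structure (the token ids here are made up):

encodings = {
    "input_ids": [[101, 2023, 102], [101, 2178, 102]],
    "attention_mask": [[1, 1, 1], [1, 1, 1]],
}
dataset = Dataset(encodings)
print(dataset[0])  # {'input_ids': tensor([ 101, 2023,  102]), 'attention_mask': tensor([1, 1, 1])}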


# prepare data for classification

tokenizer = XXXTokenizer.from_pretrained(model_name)

print("Transform xml file to pandas series core...")
text, file_name = transform_xml_to_pd(file)  # transform the xml file into a pandas structure

print("Preprocess text with spacy model...")
clean_text = make_new_traindata(text['sent'])  # cleaned text; index 0 = raw text, etc.

X = list(clean_text)
X_text_tokenized = []

# Tokenize each sentence separately, which produces a *list* of encoding dicts
for x in X:
    x_encoded = tokenizer(str(x), padding="max_length", truncation=True, max_length=512)
    X_text_tokenized.append(x_encoded)

X_data = Dataset(X_text_tokenized)

print(type(X_data))
print(X_data['input_ids'])  # this indexing call is what raises the error below

Error:


  File "/scriptTraitements/classifying.py", line 153, in __getitem__
    item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
AttributeError: 'list' object has no attribute 'items'

Hi,

this line is the problem:

X_data = Dataset(X_text_tokenized)

because X_text_tokenized is a list of dictionaries (one encoding per sentence), not the dictionary of lists that the Dataset class expects. You can fix this with the following conversion:

def list_of_dicts_to_dict_of_lists(list_of_dicts):
    # All encodings share the same keys, so take the field names from the first one
    keys = list_of_dicts[0].keys()
    # Gather each encoding's values, then transpose so every field gets its own list
    values = [d.values() for d in list_of_dicts]
    return {key: list(value) for key, value in zip(keys, zip(*values))}

X_data = Dataset(list_of_dicts_to_dict_of_lists(X_text_tokenized))
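
With the converted encodings, indexing the dataset now returns a dict of tensors:

item = X_data[0]  # encodings for the first sentence
print(item["input_ids"].shape)  # torch.Size([512]) because of max_length padding

Alternatively, assuming XXXTokenizer is a standard Hugging Face tokenizer, you can avoid the conversion entirely by tokenizing the whole list in one call; the tokenizer then returns a single dict-like BatchEncoding of lists:

X_text_tokenized = tokenizer([str(x) for x in X], padding="max_length", truncation=True, max_length=512)
X_data = Dataset(X_text_tokenized)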