I am trying to classify a file using the model obtained from finetuning bertsequenceclassification.
I get this error when loading the new test file (it does not have label), it is unknown file.
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
def __init__(self, encodings, labels=None):
self.encodings = encodings
self.labels = labels
def __getitem__(self, idx):
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
if self.labels:
item["labels"] = torch.tensor(self.labels[idx])
#print(item)
return item
def __len__(self):
print(len(self.encodings["input_ids"]))
return len(self.encodings["input_ids"])
# prepare dat for classification
tokenizer = XXXTokenizer.from_pretrained(model_name)
print("Transform xml file to pandas series core...")
text, file_name = transform_xml_to_pd(file) # transform xml file to pd
# Xtest_emb, s = get_xxx_layer(Xtest['sent'], path_to_model_lge) # index 2 correspond to sentences
#print(text)
print("Preprocess text with spacy model...")
clean_text = make_new_traindata(text['sent'])
#print(clean_text[1]) # clean text ; 0 = raw text ; and etc...
X = list(clean_text)
X_text_tokenized = []
for x in X:
#print(type(x))
x_encoded = tokenizer(str(x), padding="max_length", truncation=True, max_length=512)
#print(type(x_encoded))
#print(x_encoded)
X_text_tokenized.append(x_encoded)
#print(type(X_text_tokenized))
X_data = Dataset(X_text_tokenized)
print(type(X_data))
print(X_data['input_ids'])
Error
File "/scriptTraitements/classifying.py", line 153, in __getitem__
item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
AttributeError: 'list' object has no attribute 'items'