I made a CSV by referring to the SQUAD data structure. Then trying use script from here to training Question answering using customized CSV.
When converting from CSV to Dataset, I understood that it was necessary to convert the feature in the answers column to Sequence (feature…, so I converted it as follows.
#Custom features
ans_feature = Features({'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)})
custom_features = Features(
{
"id": Value(dtype='string', id=None),
"title": Value(dtype='string', id=None),
"context": Value(dtype='string', id=None),
"question": Value(dtype='string', id=None),
"answers": Sequence(ans_feature, length=-1, id=None),
}
)
#Create train datasets
df = pd.read_csv("train.csv", encoding="utf_8")
train_l = []
for index, row in df.iterrows():
train_l.append({'id': row[0], 'title': row[1], 'context': row[2], 'question': row[3], 'answers': row[4]})
train_dataset = Dataset.from_pandas(pd.DataFrame(data=train_l), features=custom_features)
#Create validation datasets
df = pd.read_csv("validation.csv", encoding="utf_8")
train_v = []
for index, row in df.iterrows():
train_v.append({'id': row[0], 'title': row[1], 'context': row[2], 'question': row[3], 'answers': row[4]})
validation_dataset = Dataset.from_pandas(pd.DataFrame(data=train_v), features=custom_features)
#Create datasets
raw_datasets = DatasetDict({
"train": train_dataset,
"validation": validation_dataset,
})
However, the conversion process did not go well and the following error occurred.
File "/Users/tphan/.pyenv/versions/anaconda3-2023.03/envs/transformers/lib/python3.10/site-packages/datasets/table.py", line 2140, in cast_array_to_feature
raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
TypeError: Couldn't cast array of type
string
to
Sequence(feature={'text': Value(dtype='string', id=None), 'answer_start': Value(dtype='int32', id=None)}, length=-1, id=None)
Can someone please tell me the correct way and the code?