I’m trying to follow the tutorial Hugging Face Transformers with Keras: Fine-tune a non-English BERT for Named Entity Recognition to fine-tune BERT for a NER task on my own dataset. Below is my shortened code and the error raised by its last line. I’m new to all of this, so thank you in advance for your help!
# imports (shortened from my full script)
import ast

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorForTokenClassification

# load dataset; the 'tokens' and 'labels' columns are stored as stringified lists in the CSV
df_converters = {'tokens': ast.literal_eval, 'labels': ast.literal_eval}
train_df = pd.read_csv("train_df_pretokenization.csv", converters=df_converters)
train_df = train_df.head(10)
# model and pretrained tokenizer
model_ckpt = "indobenchmark/indobert-base-p2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
# tokenize and align labels with the sub-word tokens
def tokenize_and_align_labels(batch):
    tag2int = {'B-POI': 0, 'B-STR': 1, 'E-POI': 2, 'E-STR': 3, 'I-POI': 4,
               'I-STR': 5, 'S-POI': 6, 'S-STR': 7, 'O': 8}
    # tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True, padding=True)
    tokenized_inputs = tokenizer(batch['tokens'], is_split_into_words=True, truncation=True)
    labels = []
    for idx, label in enumerate(batch['labels']):
        word_ids = tokenized_inputs.word_ids(batch_index=idx)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # special tokens ([CLS], [SEP]) get -100 so the loss ignores them
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # first sub-token of a word gets the word's label
                label_ids.append(tag2int[label[word_idx]])
            else:
                # remaining sub-tokens of the same word get the same label
                label_ids.append(tag2int[label[word_idx]])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs['tags'] = labels
    return tokenized_inputs
def encode_dataset(ds):
    return ds.map(tokenize_and_align_labels, batched=True, batch_size=10,
                  remove_columns=['labels', 'tokens', 'index'])
train_ds = Dataset.from_pandas(train_df)
train_ds_encoded = encode_dataset(train_ds)
# prepare model input
data_collator = DataCollatorForTokenClassification(tokenizer, return_tensors="tf")
tf_train_dataset = train_ds_encoded.to_tf_dataset(
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'tags'],
    shuffle=False,
    batch_size=5,
    collate_fn=data_collator
)
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.
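My mental model of DataCollatorForTokenClassification is that it dynamically pads every batch it receives to the longest sequence in that batch, something like the toy example below (the ids and labels are made up, not from my real data, and I use the standard 'labels' key here rather than my 'tags' column):

features = [
    {"input_ids": [2, 45, 67, 3], "attention_mask": [1, 1, 1, 1], "labels": [-100, 8, 0, -100]},
    {"input_ids": [2, 45, 3],     "attention_mask": [1, 1, 1],    "labels": [-100, 8, -100]},
]
padded = data_collator(features)                  # reuses the collator defined above
print({k: v.shape for k, v in padded.items()})    # I expect every tensor to come back with shape (2, 4)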
Given that, I thought the data collator was supposed to take care of the padding for the requested batch size, and I don’t understand why feeding in sequences of different lengths causes this error. Indeed, the tutorial runs fine without specifying padding or truncation. My code does run if I add padding=True to the tokenizer call inside the function (the line I commented out above; shown again below for reference), but I don’t think that is the right place to add padding.
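For reference, this is the only change that makes the pipeline above run, i.e. the commented-out variant of the tokenizer call inside tokenize_and_align_labels:

tokenized_inputs = tokenizer(
    batch['tokens'],
    is_split_into_words=True,
    truncation=True,
    padding=True,   # pads each map() batch of 10 examples to the same length
)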