I was reading and working through the “Token Classification with W-NUT Emerging Entities” tutorial on the Fine-tuning with custom datasets page of the transformers 4.5.0.dev0 documentation, using different data.
To replicate the data structure used in the tutorial, I used the code below to insert a blank row between sentences/tags:
# insert a blank row in the DataFrame wherever sentence_id changes,
# so that sentences/tag sequences are separated as in the tutorial's data
import pandas as pd

mask = DF['sentence_id'].ne(DF['sentence_id'].shift(-1))
DF1 = pd.DataFrame('', index=mask.index[mask] + .5, columns=DF.columns)
DF2 = pd.concat([DF, DF1]).sort_index().reset_index(drop=True).iloc[:-1]
DF2.head(18)
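To illustrate the transformation, here is a toy run (the column names and values are made up for illustration, not the real data):

import pandas as pd

toy = pd.DataFrame({'sentence_id': [0, 0, 1, 1],
                    'word': ['Hello', 'world', 'Good', 'morning'],
                    'tag': ['O', 'O', 'O', 'O']})
m = toy['sentence_id'].ne(toy['sentence_id'].shift(-1))
blanks = pd.DataFrame('', index=m.index[m] + .5, columns=toy.columns)
out = pd.concat([toy, blanks]).sort_index().reset_index(drop=True).iloc[:-1]
print(out)  # a blank row now separates the two sentences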
I then wrote the data to a file
DF2.to_csv(r'/content/drive/MyDrive/Colab Notebooks/model/dataset.txt',
header=None, index=None, sep='\t', mode='a', encoding="utf-8")
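To confirm the file has the same shape as the tutorial's W-NUT file, a quick check (my own addition, not from the tutorial) is to print the raw lines:

# each line should be "token<TAB>tag", with a blank (or tab-only) line between sentences
with open('/content/drive/MyDrive/Colab Notebooks/model/dataset.txt', encoding='utf-8') as f:
    for line in f.readlines()[:15]:
        print(repr(line))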
I read the data back using the code provided in the tutorial
import re
from pathlib import Path

def read_wnut(file_path):
    file_path = Path(file_path)
    raw_text = file_path.read_text(encoding='utf-8').strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)
    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):
            token, tag = line.split('\t')
            tokens.append(token)
            tags.append(tag)
        token_docs.append(tokens)
        tag_docs.append(tags)
    return token_docs, tag_docs
texts_df, tags_df = read_wnut(colab_file_path)
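A quick sanity check (not from the tutorial) that the parsed token and tag lists line up:

# every sentence should have exactly one tag per token
assert len(texts_df) == len(tags_df)
assert all(len(toks) == len(tgs) for toks, tgs in zip(texts_df, tags_df))
print(len(texts_df), 'sentences read')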
Next, I split up the data into training, validation, and test sets.
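(A sketch of that step, assuming sklearn's train_test_split; the ratios and random_state are illustrative:)

from sklearn.model_selection import train_test_split

# hold out 20% for test, then 20% of the remainder for validation (illustrative split)
train_texts, test_texts, train_tags, test_tags = train_test_split(
    texts_df, tags_df, test_size=.2, random_state=42)
train_texts, val_texts, train_tags, val_tags = train_test_split(
    train_texts, train_tags, test_size=.2, random_state=42)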
I then created encodings for the tags and the tokens, as seen below:
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}
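For completeness, unique_tags is the set of all distinct tags in the data, built as in the tutorial (assuming the tags_df variable from read_wnut above):

# all distinct tags across the documents (variable name assumed)
unique_tags = set(tag for doc in tags_df for tag in doc)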
# import the transformers module
from transformers import BertTokenizerFast
# import the small bert tokenizer
model_name = "google/bert_uncased_L-4_H-512_A-8"
tokenizer = BertTokenizerFast.from_pretrained(model_name)
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
test_encodings = tokenizer(test_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
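To see what the offset mapping contains (my own check, not part of the tutorial): special tokens come back as (0, 0), and sub-word continuations start at a non-zero character offset:

# peek at the first few offsets and tokens of the first training example
print(train_encodings.offset_mapping[0][:10])
print(tokenizer.convert_ids_to_tokens(train_encodings['input_ids'][0])[:10])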
It’s when I run the tag-encoding function, which uses the offset mapping, that I get an error message:
import numpy as np

def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)
        # set labels whose first offset position is 0 and the second is not 0
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels

# encode the labels for each split
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)
test_labels = encode_tags(test_tags, test_encodings)
The error message is below:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-18-7290e6aeda9c> in <module>()
17
18 # return the encoded labels
---> 19 train_labels = encode_tags(train_tags, train_encodings)
20 val_labels = encode_tags(val_tags, val_encodings)
21 test_labels = encode_tags(test_tags, test_encodings)
<ipython-input-18-7290e6aeda9c> in encode_tags(tags, encodings)
11
12 # set labels whose first offset position is 0 and the second is not 0
---> 13 doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
14 encoded_labels.append(doc_enc_labels.tolist())
15
ValueError: NumPy boolean array indexing assignment cannot assign 236 input values to the 120 output values where the mask is true
I am not sure where my error is, as I have tried to replicate what is in the tutorial.
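To narrow it down, here is a minimal per-document check (my own sketch, not from the tutorial) that compares the number of word labels against the number of word-start offsets the mask selects:

import numpy as np

labels = [[tag2id[tag] for tag in doc] for doc in train_tags]
for i, (doc_labels, doc_offset) in enumerate(zip(labels, train_encodings.offset_mapping)):
    arr_offset = np.array(doc_offset)
    n_word_starts = int(((arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)).sum())
    if n_word_starts != len(doc_labels):
        print(f"doc {i}: {len(doc_labels)} labels vs {n_word_starts} word-start offsets")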