I am using the following code to extract word embeddings from well-known NLP models such as BERT:
from transformers import pipeline, AutoTokenizer, AutoModel
import numpy as np
import re

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# pass the model object (not the name) so the pipeline uses the resized model
model_pipeline = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
def find_wordNo_sentence(word, sentence):
    # return the 0-based position of `word` in the whitespace-split sentence
    splitted_sen = sentence.split(" ")
    for i, w in enumerate(splitted_sen):
        if word == w:
            return i
    print("not found")
def return_bert_embedding(word, sentence):
    # strip punctuation and collapse whitespace in both the word and the sentence
    word = re.sub(r'[^\w]', ' ', word)
    word = " ".join(word.split())
    sentence = re.sub(r'[^\w]', ' ', sentence)
    sentence = " ".join(sentence.split())

    id_word = find_wordNo_sentence(word, sentence)
    try:
        data = model_pipeline(sentence)
        n_words = len(sentence.split(" "))
        n_embs = len(data[0])  # number of token vectors returned by the pipeline
        print(n_embs, n_words)
        if n_words != n_embs:
            print("There is an extra tokenized word")  # [CLS]/[SEP] or subword pieces
        results = data[0][id_word]
        return np.array(results)
    except Exception:
        return "word not found"
print(return_bert_embedding('your', "what is your name?"))
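As far as I understand, the mismatch between n_words and n_embs comes from the [CLS]/[SEP] special tokens and the WordPiece sub-tokens that the feature-extraction pipeline includes in its output. This is just how I checked that on my side (the variable names are my own):

sentence = "what is your name"
pieces = tokenizer.tokenize(sentence)      # WordPiece tokens, no special tokens
ids = tokenizer(sentence)["input_ids"]     # includes [CLS] and [SEP]
print(pieces)
print(len(pieces), len(ids))               # here len(ids) is len(pieces) + 2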
However, I know that these NLP models have multiple layers. How can I specify which layer the features are extracted from?
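The closest I have found so far is loading the model with output_hidden_states=True and reading outputs.hidden_states, which (if I read the docs correctly) is a tuple containing the embedding output followed by one tensor per layer, so 13 entries for bert-base-uncased. Below is only a sketch of what I have in mind; embedding_from_layer, layer_index and token_position are my own placeholder names and I am not sure this is the right approach:

import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True)

def embedding_from_layer(sentence, token_position, layer_index):
    # hidden_states: (embedding output, layer 1, ..., layer 12) for bert-base
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    layer = outputs.hidden_states[layer_index]   # shape: (1, seq_len, 768)
    return layer[0, token_position].numpy()      # one token's vector from that layer

# token_position here counts [CLS] as 0, so "your" in "what is your name" is 3
print(embedding_from_layer("what is your name", 3, -1))  # -1 = last layer

If there is instead a way to choose the layer directly through the feature-extraction pipeline, that would also work for me.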