Following up on my previous post, I am trying to extract NLP features from well-known models such as BERT or T5.
As you may know, these models consist of many layers. Here is my code:
from transformers import pipeline, AutoTokenizer, AutoModel
import numpy as np
import re
# In[7]:
# --- Model setup ---
# Pretrained checkpoint to load.
model_name = "xlnet-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# XLNet's tokenizer ships without a pad token; add one and resize the
# model's embedding matrix so the new token id has an embedding row.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# BUG FIX: pass the resized `model` object, not the `model_name` string.
# Passing the string made the pipeline reload a fresh checkpoint from the
# hub, discarding the resize_token_embeddings() call above and leaving the
# model out of sync with the padded tokenizer.
model_pipeline = pipeline('feature-extraction', model=model, tokenizer=tokenizer)
# In[8]:
def find_wordNo_sentence(word, sentence):
    """Return the 0-based position of `word` in the whitespace-split `sentence`.

    Prints the sentence and its split form (debug output kept from the
    original), and prints "not found" and returns None when the word is
    not present.
    """
    print(sentence)
    splitted_sen = sentence.split(" ")
    print(splitted_sen)
    # BUG FIX: the original called splitted_sen.index(word) before this loop,
    # which raised ValueError for a missing word and made the "not found"
    # branch below unreachable. The loop alone already finds the first match.
    for i, w in enumerate(splitted_sen):
        if word == w:
            return i
    print("not found")  # 0 base
    return None
# In[13]:
def return_xlnet_embedding(word, sentence):
    """Return the XLNet embedding of `word` within `sentence`.

    The word may be split into several sub-word tokens by the tokenizer;
    the returned vector is the mean of the embeddings at those token
    positions. Returns a 1-D numpy array of the model's hidden size, or
    None (after printing "word is wrong") when the word cannot be located.
    """
    # Normalize both strings the same way (strip punctuation, collapse
    # whitespace) so the whitespace-split index of the word lines up with
    # the tokenizer's word_ids() numbering.
    word = re.sub(r'[^\w]', " ", word)
    word = " ".join(word.split())
    sentence = re.sub(r'[^\w]', ' ', sentence)
    sentence = " ".join(sentence.split())

    id_word = find_wordNo_sentence(word, sentence)

    try:
        # data has shape [1][n_tokens][hidden_size] (last hidden layer).
        data = model_pipeline(sentence)

        inputs = tokenizer(sentence)
        # Token positions whose word id matches the target word's position.
        list_of_word_ids = [i for i, j in enumerate(inputs.word_ids()) if j == id_word]
        if not list_of_word_ids:
            # Guard: the original divided by len(list_of_word_ids) and would
            # have raised ZeroDivisionError here (swallowed by the bare except).
            print("word is wrong")
            return None

        # BUG FIX: the original indexed data[0][i] with the *loop counter*,
        # averaging the first k token embeddings of the sentence instead of
        # the embeddings at the word's actual token positions.
        results = np.zeros(len(data[0][0]))
        for token_pos in list_of_word_ids:
            results += np.array(data[0][token_pos])
        results /= len(list_of_word_ids)
        return results
    except Exception:
        # Narrowed from a bare `except:` (which also caught KeyboardInterrupt
        # and SystemExit); keep the original best-effort behavior of printing
        # a message and implicitly returning None.
        print("word is wrong")
        return None
How can I specify which layer to extract the features from?