I need the embedding of a sentence, and before using the BERT model I want to fine-tune it on my specific domain and then get the embeddings (the pooler_output from AutoModel).
This was my plan:
- Fine-tune the masked language model on the specific domain
- Load this fine-tuned model into AutoModel, and then get the embeddings
However, the embeddings I got from the second step don't seem to be correct.
(I did a simple check: for a particular sentence, I computed the cosine similarity between the pooler output from standalone step 2 and the pooler output from step 2 preceded by step 1. This similarity was low, which I think it shouldn't be, since I only fine-tuned on a small dataset of 500 data points.)
Code:
#step 1: fine-tune a masked language model on the domain data
import pandas as pd
import datasets
from datasets import load_dataset
from transformers import AutoModelForMaskedLM, AutoTokenizer

bert_maskedML = AutoModelForMaskedLM.from_pretrained('bert-base-uncased')
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

df = pd.read_csv('/kaggle/input/inputs-n500/Regression_inputs_n500.csv')
dataset = load_dataset("csv", data_files='.fintech_inputs_n500.csv', split=datasets.Split.TRAIN)

# tokenize, drop the raw columns, and keep torch tensors
tok_oup = dataset.map(lambda x: bert_tokenizer(x['text'], padding='max_length', truncation=True), batched=True)
tok_oup = tok_oup.remove_columns(['text', 'Unnamed: 0'])
tok_oup.set_format("torch", columns=["input_ids", "token_type_ids", "attention_mask"])
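
A quick sanity check on the processed dataset (just inspecting what the map produced):

print(tok_oup)                         # column names and number of rows
print(tok_oup[0]['input_ids'].shape)   # padded to the model max length (512 for bert-base-uncased)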

from transformers import DataCollatorForLanguageModeling

# randomly mask 15% of tokens for the MLM objective
data_collator = DataCollatorForLanguageModeling(
    tokenizer=bert_tokenizer, mlm=True, mlm_probability=0.15
)
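
To sanity-check the masking, the collator can be applied to a couple of tokenized examples directly (a small inspection sketch, reusing tok_oup from above):

batch = data_collator([tok_oup[i] for i in range(2)])
print(batch['input_ids'][0])   # some tokens replaced by [MASK] (id 103 for bert-base-uncased)
print(batch['labels'][0])      # -100 everywhere except the masked positions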

from transformers import Trainer, TrainingArguments

training_args = TrainingArguments("test-trainer3")
trainer = Trainer(
    model=bert_maskedML,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tok_oup,
)
trainer.train()
trainer.save_model('./MaskedLM')
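
Saving the tokenizer next to the model keeps the checkpoint directory self-contained (optional, but it avoids accidentally pairing the checkpoint with a different tokenizer later):

bert_tokenizer.save_pretrained('./MaskedLM')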

#step 2: load the fine-tuned checkpoint into a plain AutoModel
from transformers import AutoModel
bertMasked_auto = AutoModel.from_pretrained('/kaggle/working/MaskedLM')
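
The same load can also report which weights were actually found in the checkpoint versus newly initialized (output_loading_info=True is standard transformers behavior; shown here purely as a diagnostic):

bertMasked_auto, loading_info = AutoModel.from_pretrained(
    '/kaggle/working/MaskedLM', output_loading_info=True
)
print(loading_info['missing_keys'])     # weights that had to be freshly initialized
print(loading_info['unexpected_keys'])  # checkpoint weights that were not used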

#step 2 without step 1: the baseline pretrained model
from transformers import AutoModel
bert_auto = AutoModel.from_pretrained('bert-base-uncased')

#comparison
import torch
from torch.nn import CosineSimilarity

inputs = ["no you say that if i make a late payment there is no late fee"]
inputs = bert_tokenizer(inputs, padding='max_length', truncation=True, return_tensors='pt')

with torch.no_grad():
    bert_masked_predctn = bertMasked_auto(**inputs)
    bert_auto_predctn = bert_auto(**inputs)

cos = CosineSimilarity(dim=0, eps=1e-6)
auto_pooler = bert_auto_predctn['pooler_output']
Masked_auto_pooler = bert_masked_predctn['pooler_output']
cos(auto_pooler[0], Masked_auto_pooler[0])
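
For reference, the same check can be run on mean-pooled token embeddings instead of pooler_output (a sketch reusing the models and inputs above; mean pooling over last_hidden_state is just one common alternative):

mask = inputs['attention_mask'].unsqueeze(-1)   # (batch, seq_len, 1)
with torch.no_grad():
    hid_auto = bert_auto(**inputs).last_hidden_state
    hid_masked = bertMasked_auto(**inputs).last_hidden_state
mean_auto = (hid_auto * mask).sum(1) / mask.sum(1)
mean_masked = (hid_masked * mask).sum(1) / mask.sum(1)
cos(mean_auto[0], mean_masked[0])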