Hey guys,
I am using xlm-roberta-base with transformers==4.29.2 torch==1.13.1
For any given pair of random text, I always get a high cosine similarity.
model_id = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizerFast.from_pretrained(model_id)
xlmr_model = XLMRobertaModel.from_pretrained(model_id)
xlmr_model.eval()
def get_model_output(text, max_length=30):
text_input = tokenizer(
text,
padding="max_length",
max_length=max_length,
truncation=True,
return_tensors="pt",
add_special_tokens=True
)
print(text_input)
text_embedding = xlmr_model(
**text_input
)
embedding = text_embedding.pooler_output.flatten().tolist()
return embedding
def cosine_similarity(v1, v2):
return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
text1 = "white joota with blue lace" # (white shoes with blue lace)
text2 = "vcvbhjook jjjjj"
embedding1 = get_model_output(text1)
embedding2 = get_model_output(text2)
print(cosine_similarity(embedding1, embedding2)) # 0.99