I have a pretrained TF RoBERTa model (ruRoberta).
I load the model and tokenizer from the TF weights like this:

import torch
from transformers import RobertaModel, RobertaTokenizer

model = RobertaModel.from_pretrained('iroberta_tf_module', from_tf=True, local_files_only=True)
tokenizer = RobertaTokenizer.from_pretrained('iroberta_tf_module', add_prefix_space=True, local_files_only=True)
Then I saved it with Torch weights:

model.save_pretrained('iroberta_torch_module')
tokenizer.save_pretrained('iroberta_torch_module')
The problem is that inference with the TF weights and the Torch weights gives different embeddings for the same input text:
def mean_pooling(model_output, attention_mask):
    """Mean-pool token embeddings over the sequence, ignoring padded positions.

    :param model_output: output of the model forward pass
    :param attention_mask: attention mask from the tokenizer
    """
    token_embeddings = model_output[0]  # last_hidden_state: (batch, seq_len, hidden)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
input_text = 'Привет как дела!'
input_ids = tokenizer([input_text], return_tensors='pt')
# {'input_ids': tensor([[ 1, 16523, 414, 2510, 5, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
with torch.no_grad():
    model_output = model(**input_ids)

# With TF weights:
mean_pooling(model_output, input_ids['attention_mask'])
# tensor([[-1.3579, -3.1302, -1.0000, ..., -0.3164, -0.7615, 1.1477]])
# With Torch weights, loaded as:
# model = RobertaModel.from_pretrained('iroberta_torch_module', local_files_only=True)
# tokenizer = RobertaTokenizer.from_pretrained('iroberta_torch_module', add_prefix_space=True, local_files_only=True)
input_ids = tokenizer([input_text], return_tensors='pt')
# {'input_ids': tensor([[ 1, 16523, 414, 2510, 5, 2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}
with torch.no_grad():
    model_output = model(**input_ids)
mean_pooling(model_output, input_ids['attention_mask'])
# tensor([[ 1.7111, -0.8426, -1.0796, ..., 0.2451, 0.6090, -0.7238]])
transformers.__version__ == '4.11.3'
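For completeness, here is a minimal self-contained sketch of the side-by-side check I use to confirm the mismatch. It assumes the two directories above and compares the raw last_hidden_state of both checkpoints on one input (variable names model_tf / model_pt are mine):

import torch
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('iroberta_tf_module', add_prefix_space=True, local_files_only=True)
model_tf = RobertaModel.from_pretrained('iroberta_tf_module', from_tf=True, local_files_only=True)
model_pt = RobertaModel.from_pretrained('iroberta_torch_module', local_files_only=True)
model_tf.eval()
model_pt.eval()

inputs = tokenizer(['Привет как дела!'], return_tensors='pt')
with torch.no_grad():
    out_tf = model_tf(**inputs).last_hidden_state
    out_pt = model_pt(**inputs).last_hidden_state

# If the conversion were lossless this would print True,
# but it prints False for me.
print(torch.allclose(out_tf, out_pt, atol=1e-5))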