It seems you’ve unearthed an ancient bug. A living fossil of a bug.
from transformers import AutoTokenizer, AutoModelForTokenClassification, TokenClassificationPipeline
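# Each NER_MODEL_NAME below was tried in turn with the script at the bottom;
# the quoted blocks record the pipeline output for that model with
# aggregation_strategy="simple".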
NER_MODEL_NAME = "elastic/distilbert-base-cased-finetuned-conll03-english"
"""
C -> PER (0.50)
##ôme -> ORG (0.40)
Ai -> LOC (0.62)
##x -> LOC (0.97)
- -> LOC (0.64)
en -> LOC (0.91)
- -> LOC (0.81)
Provence -> LOC (0.87)
et travaille pour l ’ INRIA -> ORG (0.89)
"""
NER_MODEL_NAME = "CATIE-AQ/NERmembert-base-4entities"
"""
CĂ´me -> PER (0.67)
Aix-en-Provence -> LOC (1.00)
INRIA -> ORG (1.00)
"""
NER_MODEL_NAME = "elastic/distilbert-base-uncased-finetuned-conll03-english"
# aix - en - provence et travaille pour l ’ inria -> ORG (0.87)
NER_MODEL_NAME = "distilbert/distilbert-base-multilingual-cased"
"""
CĂ´me -> LABEL_0 (0.57)
habite -> LABEL_1 (0.53)
à -> LABEL_0 (0.53)
Aix - en - -> LABEL_1 (0.52)
Provence et travaille pour l -> LABEL_0 (0.55)
’ -> LABEL_1 (0.51)
INRIA. -> LABEL_0 (0.55)
"""
NER_MODEL_NAME = "distilbert/distilbert-base-uncased"
"""
come -> LABEL_0 (0.54)
habit -> LABEL_1 (0.53)
##e -> LABEL_0 (0.52)
a aix -> LABEL_1 (0.52)
- -> LABEL_0 (0.52)
en - provence et -> LABEL_1 (0.55)
tr -> LABEL_0 (0.51)
##ava -> LABEL_1 (0.50)
##ille -> LABEL_0 (0.51)
pour -> LABEL_1 (0.54)
l -> LABEL_0 (0.52)
’ -> LABEL_1 (0.52)
inria -> LABEL_0 (0.52)
. -> LABEL_1 (0.50)
"""
tokenizer = AutoTokenizer.from_pretrained(NER_MODEL_NAME)
model = AutoModelForTokenClassification.from_pretrained(NER_MODEL_NAME).to("cuda")  # assumes a CUDA GPU is available
# Create a pipeline for NER
ner_pipeline = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    # aggregation_strategy="first",  # this would work
    aggregation_strategy="simple",
)
# Run NER on a French test sentence
# ("Côme lives in Aix-en-Provence and works for INRIA.")
text = "Côme habite à Aix-en-Provence et travaille pour l’INRIA."
entities = ner_pipeline(text)
# Print results
for entity in entities:
    print(f"{entity['word']} -> {entity['entity_group']} ({entity['score']:.2f})")
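For completeness, here is a minimal sketch of the workaround from the commented-out line above (the ner_pipeline_first name is just illustrative): with aggregation_strategy="first", sub-tokens are regrouped into whole words and each word takes the label of its first sub-token, so fragments like C / ##ôme come back as a single Côme entity instead of straddling two labels.

# Workaround sketch: same model and tokenizer, but with the "first" strategy.
# Note that the word-level strategies ("first", "average", "max") require a
# fast tokenizer, which AutoTokenizer returns by default for these checkpoints.
ner_pipeline_first = TokenClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    aggregation_strategy="first",
)
for entity in ner_pipeline_first(text):
    print(f"{entity['word']} -> {entity['entity_group']} ({entity['score']:.2f})")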