Hi
I fine-tuned XLM-RoBERTa to detect the language of a text. After training, I uploaded the model to the Hugging Face Hub. When I try to test the model with the Inference API on its model page, I get the following error:
Can’t load tokenizer using from_pretrained, please update its configuration: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 83 column 3
from transformers import AutoTokenizer
# Load the pretrained tokenizer for the 'xlm-roberta-base' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are padded to ``max_length`` and truncated, with
    ``max_length=128``, so every encoded example has the same length.

    Args:
        examples: Mapping with a 'text' entry holding the raw input text(s).

    Returns:
        The result of calling the module-level ``tokenizer`` on that text.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)
# Apply tokenization to each split; batched=True hands tokenize_function a
# batch of examples per call instead of a single example.
# NOTE(review): `dataset` is not defined in the visible snippet — presumably a
# datasets.DatasetDict with a 'text' column loaded earlier; confirm.
dataset = dataset.map(tokenize_function, batched=True)
from transformers import AutoModelForSequenceClassification
# Load 'xlm-roberta-base' for sequence classification with 21 output labels.
# NOTE(review): no id2label/label2id mapping is passed, so predictions will
# presumably be reported under generic label names — confirm if that matters.
model = AutoModelForSequenceClassification.from_pretrained('xlm-roberta-base', num_labels=21)
from transformers import Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation run.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    # Predicted class = index of the highest score along the last axis.
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
# Fine-tuning configuration for the Trainer.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule and optimisation settings.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Per-device batch sizes for training and evaluation.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    # Evaluate and save a checkpoint once per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
)
# Wire the model, the training configuration, the 'train' / 'validation'
# splits and the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['validation'],
    compute_metrics=compute_metrics,
)
# Run fine-tuning with the Trainer configured above.
trainer.train()
from huggingface_hub import notebook_login
# Log in to the Hugging Face Hub from the notebook before uploading.
notebook_login()
repo_name = "my_repo"
# Upload the model and the tokenizer to the same Hub repository.
# NOTE(review): the reported "untagged enum PyPreTokenizerTypeWrapper" error
# presumably means the uploaded tokenizer.json was written by a newer
# `tokenizers` release than the one loading it — confirm the library versions
# on both sides.
model.push_to_hub(repo_name)
tokenizer.push_to_hub(repo_name)
Why could this be?