hey guys I’m really new to training models for NLP and i created my first notebook similar to this guide
only difference being I want to do language detection which is why I’m using the papluca language-identification dataset.
The code always gives an error when trying to train the model, telling me I should implement padding and truncation which I did.
Hope anyone can help me out here.
Here is the given Code:
!pip install transformers
!pip install datasets evaluate huggingface_hub
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding
from transformers import Trainer, TrainingArguments
data = load_dataset("papluca/language-identification")
def languageFilter(dataset):
wanted_languages = ['de', 'en', 'es', 'fr']
if dataset['labels'] in wanted_languages:
return True
else:
return False
filtered_data = data.filter(languageFilter)
for x in filtered_data:
print(f"There are {len(filtered_data[x])} in the {x}-Split")
filtered_data["test"][0]
tokenizer = AutoTokenizer.from_pretrained("papluca/xlm-roberta-base-language-detection")
model = AutoModelForSequenceClassification.from_pretrained("papluca/xlm-roberta-base-language-detection")
id2label = {0: "de",
1: "en",
2: "fr",
3: "es"}
label2id = {"de": 0,
"en": 1,
"fr": 2,
"es": 3
}
def tokenize_function(examples):
return tokenizer(examples['text'], truncation=True, padding = "longest")
tokenized_data = filtered_data.map(tokenize_function)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
predictions, labels = eval_pred
predictions = np.argmax(predictions, axis=-1)
return accuracy.compute(predictions=predictions, references=labels)
training_args = TrainingArguments(
output_dir="my_awesome_model",
learning_rate=2e-5,
num_train_epochs=3,
weight_decay=0.01,
evaluation_strategy="epoch",
save_strategy="epoch",
load_best_model_at_end=True,
)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_data["train"],
eval_dataset=tokenized_data["test"],
tokenizer=tokenizer,
data_collator=data_collator,
compute_metrics=compute_metrics
)
trainer.train()