Hello, I am loading a fine-tuned model that I trained so that I can test it on a new dataset. The dataset comes straight from the Hugging Face Hub to simplify the process. However, when I run the evaluation I get this error:
ValueError: You need to specify either `text` or `text_target`.
Here is my code:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import pipeline
import evaluate
import re
#------------------------------------------------------------------------------#
#--------------------- Loading Fine-tuned Model and Tokenizer -----------------#
#------------------------------------------------------------------------------#
#-- Load your model
# Directory that holds the fine-tuned checkpoint (config, weights and tokenizer files).
model_path = "/content/drive/MyDrive/NLP!!/Sentiment Analysis/Airline_reviews/Model 1 - 80_20 Split"

#-- Load tokenizer and model
# Both are restored from the same directory so the vocabulary matches the weights.
loaded_tokenizer = AutoTokenizer.from_pretrained(model_path)
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path)
#------------------------------------------------------------------------------#
#------------------------ Dealing with the new dataset ------------------------#
#------------------------------------------------------------------------------#
#-- Load the new dataset
# Only the "test" split is downloaded, so `new_dataset` is a single split
# rather than a dict of splits.
new_dataset = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset", split="test")
new_dataset  # notebook display of the dataset summary
#— Pre-process new data in the same way you did as the data you used to train your model
def preprocess_text(text: object) -> str:
    """Clean one raw text entry the same way the training data was cleaned.

    Args:
        text: The raw entry. Anything that is not a ``str`` (for example
            ``None`` for a missing value) is treated as an empty string.

    Returns:
        The text with URLs and "@" mentions removed. Surrounding whitespace
        is not otherwise normalised.
    """
    if not isinstance(text, str):
        # Missing or non-text entries become "" so downstream code always gets a str.
        text = ""
    # Remove URLs: "http" followed by any run of non-whitespace characters.
    text = re.sub(r'http\S+', '', text)
    # Remove "@" mentions and usernames, along with any whitespace right after them.
    text = re.sub(r'@\w+\s*', '', text)
    return text
#-- Get the text data from the 'test' split
texts = new_dataset['text']
texts  # notebook display of the raw text column

#-- Make it a list and apply your preprocessing function
preprocessed_texts = [preprocess_text(text) for text in texts]
preprocessed_texts  # notebook display of the cleaned texts
#------------------------------------------------------------------------------#
#----------------------------------- Inference --------------------------------#
#------------------------------------------------------------------------------#
#-- Setting up pipeline for inference
# device=0 selects the first CUDA GPU; the pipeline tokenizes entries for you.
sentiment_pipeline = pipeline(
    task="text-classification",
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    device=0,
)

# A quick test
# This succeeds because `preprocessed_texts` contains only strings.
sentiment_pipeline(preprocessed_texts)  # the pipeline works

# Reference labels for the same split.
ground_truth_labels = new_dataset['label']
#------------------------------------------------------------------------------#
#----------------------------------- Evaluation -------------------------------#
#------------------------------------------------------------------------------#
# Initialize the task evaluator
# Only `import evaluate` is in scope, so the factory must be reached through the module.
task_evaluator = evaluate.evaluator("text-classification")

# Clean the text column before evaluation. The evaluator feeds `input_column`
# straight into the pipeline, so a non-string entry (e.g. None) reaches the
# tokenizer and raises "You need to specify either `text` or `text_target`".
# NOTE(review): this presumes the raw 'text' column contains such entries,
# which is what `preprocess_text` guards against -- confirm on the dataset.
eval_dataset = new_dataset.map(
    lambda example: {"text": preprocess_text(example["text"])}
)

# Define label mapping for your dataset
# NOTE(review): the keys must match the label names the model emits
# (`loaded_model.config.id2label`); if the checkpoint reports "LABEL_0",
# "LABEL_1", ... use those names as keys instead -- confirm.
eval_results = task_evaluator.compute(
    model_or_pipeline=loaded_model,
    tokenizer=loaded_tokenizer,
    metric="accuracy",
    data=eval_dataset,
    input_column="text",
    label_column="label",
    label_mapping={"positive": 2, "neutral": 1, "negative": 0},
)
Thank you in advance! Any help would be greatly appreciated.