I’ve been trying to save with trainer.save_model("./model")
like so:
from datasets import Dataset
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer
from transformers.trainer_utils import EvalPrediction
import evaluate
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
# prepare dataset
df = pd.read_csv('test_dataset.csv')
# tokenise
BASE_MODEL_NAME = "google-bert/bert-base-cased"
print("Tokenising...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
def process_data(row):
text = str(row["text"])
encodings = tokenizer(text, padding="max_length", truncation=True)
encodings["label"] = "Positive" if row["is_positive"] else "Negative"
encodings["text"] = row["text"]
return encodings
processed_data = []
for i in range(len(df)):
processed_data.append(process_data(df.iloc[i]))
new_df = pd.DataFrame(processed_data)
train_df, valid_df = train_test_split(
new_df,
test_size=0.2,
random_state=2024,
stratify=df["is_postiive"]
)
train_hg = Dataset(pa.Table.from_pandas(train_df)).class_encode_column("label")
valid_hg = Dataset(pa.Table.from_pandas(valid_df)).class_encode_column("label")
# Train
torch.cuda.set_device(0)
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred: EvalPrediction):
logits, labels = eval_pred
predictions = np.argmax(logits, axis=-1)
return metric.compute(predictions=predictions, references=labels)
model = AutoModelForSequenceClassification.from_pretrained(
BASE_MODEL_NAME,
num_labels=2
)
print("Training...")
training_args = TrainingArguments(
output_dir="test_trainer",
eval_strategy="epoch",
torch_compile=True,
disable_tqdm=True,
num_train_epochs=5,
fp16=True,
)
training_args.set_training(num_epochs=10)
trainer = Trainer(
model=model,
args=training_args,
train_dataset=train_hg,
eval_dataset=valid_hg,
compute_metrics=compute_metrics
)
trainer.train()
print("Evaluating...")
trainer.evaluate()
trainer.save_model('./model/')
Then using this one to load it (with a test interface just to sanity check the model).
from transformers import AutoModelForSequenceClassification, pipeline, AutoTokenizer
model = AutoModelForSequenceClassification.from_pretrained('./model/')
BASE_MODEL_NAME = "google-bert/bert-base-cased"
print("Tokenising...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
while True:
text = input("Input string to classify:")
print(classifier(text))
```
When I run this loading script, it doesn't output the two labels I defined instead saying "LABEL1". Additionally different inputs lead to very similar values each time, perhaps indicating the tokenizer isn't loading properly? (effectively putting random tokens into the model for classification).
Thank you