Hi, I have tried multi-label classification with SetFit multiple times, but the code always crashes in my Colab notebook. Can someone please help?
from setfit import SetFitModel
from datasets import load_dataset
# Checkpoint name passed to SetFitModel.from_pretrained below.
model_id = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Load the "multilabel" configuration of the "ethos" dataset.
dataset = load_dataset("ethos", "multilabel")
# NOTE(review): the Trainer later in this script is given `model2`, not this
# instance — confirm `model` is still needed, since it holds a second copy of
# the same checkpoint in memory.
model = SetFitModel.from_pretrained(model_id, multi_target_strategy="one-vs-rest")
import numpy as np
# Every column of the train split except "text" is treated as one label.
features = dataset["train"].column_names
features.remove("text")
# Number of row indices drawn for each label column.
num_samples = 16
# For each label column, pick `num_samples` indices among the rows where the
# column is non-zero, then join the picks for all labels into one index array.
# np.random.choice samples with replacement by default, so the result may
# contain repeated indices.
samples = np.concatenate(
    [
        np.random.choice(np.where(dataset["train"][f])[0], num_samples)
        for f in features
    ]
)
def encode_labels(record):
    """Collect the per-label columns of ``record`` into a single list.

    Args:
        record: Mapping that has an entry for every name in the
            module-level ``features`` list.

    Returns:
        dict: ``{"labels": [...]}`` holding one value per name in
        ``features``, in the same order as ``features``.

    Raises:
        KeyError: If ``record`` lacks one of the names in ``features``
            (assuming ``record`` behaves like a dict).
    """
    return {"labels": [record[feature] for feature in features]}
# Add the combined "labels" column produced by encode_labels.
dataset = dataset.map(encode_labels)
# Training set: the rows whose indices were drawn into `samples`.
train_dataset = dataset["train"].select(samples)
# Evaluation set: every row of the train split that is not in `samples`.
eval_dataset = dataset["train"].select(
    np.setdiff1d(np.arange(len(dataset["train"])), samples)
)
from setfit import SetFitModel,Trainer,TrainingArguments
from sentence_transformers.losses import CosineSimilarityLoss
# Model instance that is handed to the Trainer.
model2 = SetFitModel.from_pretrained(model_id, multi_target_strategy="one-vs-rest")
# Training configuration: evaluate and save once per epoch, and reload the
# best checkpoint when training finishes.
args = TrainingArguments(
    batch_size=4,
    num_epochs=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)
trainer = Trainer(
    model=model2,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    metric="accuracy",
    # Map dataset columns to text/label expected by trainer
    column_mapping={"text": "text", "labels": "label"},
)
# Train and evaluate
trainer.train()
I have tried reducing the batch size, but it did not help. Please let me know if you have any ideas!