RuntimeError: result type Float can't be cast to the desired output type Long

I'm trying to fine-tune a model for multi-label classification of job titles, but `trainer.train()` fails with the error above. Here is my code:

import pandas as pd

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
from sklearn.model_selection import train_test_split
# Example dataset
data = [
    {"job_title": "Asia Finance Controller", "tags": ["Manager", "Director"]},
    {"job_title": "Assistant Audit Manager AVP", "tags": ["Manager", "Director"]},
    {"job_title": "Business Controller", "tags": ["Manager", "Director",'officer']}
]

# Preprocess data
df = pd.DataFrame(data)
mlb = MultiLabelBinarizer()
df['labels'] = list(mlb.fit_transform(df['tags']))

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df[['job_title','labels']])

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(mlb.classes_))
model = AutoModelForSequenceClassification.from_pretrained(
    "huawei-noah/TinyBERT_General_6L_768D",
    num_labels=len(mlb.classes_),  # Adjust for your task
    problem_type="multi_label_classification",  # For multi-label classification
)

# Tokenize data
def preprocess_function(examples):
    # Tokenize the job titles; the existing 'labels' column is carried over by map
    return tokenizer(examples['job_title'], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# X_train, X_test, y_train, y_test = train_test_split(df['job_title'],df['labels'], test_size=0.2)


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=2,
    logging_dir="./logs",
   
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

# Train model
trainer.train()

There seem to be various possible causes, but is this it?
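For reference, one common cause of this exact error with problem_type="multi_label_classification" is that the one-hot label matrix produced by MultiLabelBinarizer stays integer, so set_format hands the Trainer Long tensors while BCEWithLogitsLoss expects float targets. A minimal sketch of that fix, reusing the data, mlb, and df names from the code above and only casting the labels to float32 before building the Dataset:

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset

df = pd.DataFrame(data)
mlb = MultiLabelBinarizer()
# Cast the one-hot label matrix to float32 so the torch-formatted dataset
# yields float label tensors, which is what BCEWithLogitsLoss expects.
df['labels'] = list(mlb.fit_transform(df['tags']).astype(np.float32))

dataset = Dataset.from_pandas(df[['job_title', 'labels']])

Everything else (tokenization, TrainingArguments, Trainer) can stay as in the original snippet; only the label dtype changes.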