import pandas as pd
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# Example dataset: each record maps a raw job title to one or more tags.
data = [
    {"job_title": "Asia Finance Controller", "tags": ["Manager", "Director"]},
    {"job_title": "Assistant Audit Manager AVP", "tags": ["Manager", "Director"]},
    {"job_title": "Business Controller", "tags": ["Manager", "Director", 'officer']},
]
# Preprocess data: one-hot encode each tag list into a fixed-width label vector.
df = pd.DataFrame(data)
mlb = MultiLabelBinarizer()
# Cast to float: with problem_type="multi_label_classification" the model uses
# BCEWithLogitsLoss, which requires float targets (the int matrix that
# MultiLabelBinarizer returns raises a dtype error at train time).
df['labels'] = list(mlb.fit_transform(df['tags']).astype(float))
# Convert to a Hugging Face dataset, keeping only the model-relevant columns.
dataset = Dataset.from_pandas(df[['job_title', 'labels']])
# Load tokenizer and model.
# The tokenizer is taken from the SAME checkpoint as the model so vocabulary
# and special tokens are guaranteed to match (the original paired a
# bert-base-uncased tokenizer with a TinyBERT model).
MODEL_NAME = "huawei-noah/TinyBERT_General_6L_768D"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(mlb.classes_),               # one output logit per tag
    problem_type="multi_label_classification",  # sigmoid + BCE loss per label
)
# Tokenize data
def preprocess_function(examples):
    """Tokenize a batch of job titles; `dataset.map` carries 'labels' through."""
    return tokenizer(examples['job_title'], truncation=True, padding=True)
# Tokenize the whole dataset in batches; map() keeps the existing 'labels' column.
tokenized_dataset = dataset.map(preprocess_function, batched=True)
# Expose only the tensors the model consumes, converted to torch.Tensor.
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# X_train, X_test, y_train, y_test = train_test_split(df['job_title'],df['labels'], test_size=0.2)
# Training arguments (minimal demo configuration).
training_args = TrainingArguments(
output_dir="./results",  # checkpoints and final model are written here
evaluation_strategy="epoch",  # run evaluation once per epoch
save_strategy="epoch",  # checkpoint once per epoch
num_train_epochs=1,
per_device_train_batch_size=2,  # tiny batch: the example dataset has only 3 rows
logging_dir="./logs",
)
# Trainer wires the model, arguments and tokenized data together.
trainer = Trainer(
model=model,
args=training_args,
train_dataset=tokenized_dataset,
# NOTE(review): eval_dataset is the training set itself, so eval metrics will
# be optimistic — split the data (e.g. train_test_split above) for real use.
eval_dataset=tokenized_dataset,
tokenizer=tokenizer
)
# Train model
trainer.train()
# NOTE(review): the section below was a mojibake-damaged duplicate of the
# script above — every quote had been mangled into "ā" and the '#' comment
# markers stripped, making it a SyntaxError. Restored to valid Python, with
# the same fixes (pandas import, float labels, matching tokenizer checkpoint).
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.preprocessing import MultiLabelBinarizer
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Example dataset: each record maps a raw job title to one or more tags.
data = [
    {"job_title": "Asia Finance Controller", "tags": ["Manager", "Director"]},
    {"job_title": "Assistant Audit Manager AVP", "tags": ["Manager", "Director"]},
    {"job_title": "Business Controller", "tags": ["Manager", "Director", "officer"]},
]

# Preprocess data: one-hot encode each tag list into a fixed-width label vector.
df = pd.DataFrame(data)
mlb = MultiLabelBinarizer()
# Float labels: required by BCEWithLogitsLoss under multi_label_classification.
df['labels'] = list(mlb.fit_transform(df['tags']).astype(float))

# Convert to a Hugging Face dataset, keeping only the model-relevant columns.
dataset = Dataset.from_pandas(df[['job_title', 'labels']])

# Load tokenizer and model from the SAME checkpoint so vocabularies match.
MODEL_NAME = "huawei-noah/TinyBERT_General_6L_768D"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(mlb.classes_),               # one output logit per tag
    problem_type="multi_label_classification",  # sigmoid + BCE loss per label
)

# Tokenize data.
def preprocess_function(examples):
    """Tokenize a batch of job titles; `dataset.map` carries 'labels' through."""
    return tokenizer(examples['job_title'], truncation=True, padding=True)

tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Training arguments (minimal demo configuration).
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=1,
    per_device_train_batch_size=2,  # tiny batch: the example dataset has only 3 rows
    logging_dir="./logs",
)

# Trainer wires the model, arguments and tokenized data together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # NOTE(review): eval on the training set itself — metrics will be optimistic.
    eval_dataset=tokenized_dataset,
    tokenizer=tokenizer,
)

# Train model.
trainer.train()