# Install dependencies (the PyPI package is "scikit-learn"; the bare "sklearn"
# alias is deprecated and fails on recent pip versions).
!pip install -q transformers datasets scikit-learn

# Imports: stdlib first, then third-party.
from random import shuffle

import numpy as np
import pandas as pd
import torch

import datasets
from datasets import load_dataset
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoConfig,
    BertForSequenceClassification,  # correct head for whole-sequence (sentiment) labels
    BertForTokenClassification,
    BertTokenizer,
    Trainer,
    TrainingArguments,
)
# Load 200 Yelp polarity reviews (binary sentiment: 0 = negative, 1 = positive).
dataset = datasets.load_dataset('yelp_polarity', split='train[800:1000]')

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Tokenize every review to a fixed length of 512 tokens (padded/truncated),
# returning PyTorch tensors with a leading batch dimension of 1.
encoded_dataset = [
    tokenizer(item['text'], return_tensors="pt", padding='max_length',
              truncation=True, max_length=512)
    for item in dataset
]

# Attach the single sentiment label of each review to its encoding.
for enc_item, item in zip(encoded_dataset, dataset):
    enc_item['labels'] = torch.LongTensor([item['label']])

# Sanity check: inspect the tensor shapes of one encoded example.
for key, val in encoded_dataset[3].items():
    print(f'key: {key}, content: {val.size()}')

# BUG FIX: this task assigns ONE label per review, so it is sequence
# classification, not token classification. BertForTokenClassification emits a
# prediction per token (batch 4 * seq_len 512 = 2048 logits) against only 4
# labels, which produced the error:
#   "Expected input batch_size (2048) to match target batch_size (4)"
# BertForSequenceClassification with num_labels=2 emits one logit pair per review.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# The tokenizer returned tensors shaped (1, 512); squeeze away the leading
# batch dim so the Trainer's collator can stack individual examples itself.
for item in encoded_dataset:
    for key in item:
        item[key] = torch.squeeze(item[key])

# 100 examples for training, the remaining 100 held out for evaluation.
train_set = encoded_dataset[:100]
test_set = encoded_dataset[100:]

training_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    output_dir='results',
    logging_dir='logs',
    no_cuda=True,  # force CPU training
)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,  # previously created but never used; enables trainer.evaluate()
)
trainer.train()
# Original error seen when training with BertForTokenClassification:
#   "Expected input batch_size (2048) to match target batch_size (4)"
# Explanation: 2048 = batch_size 4 * sequence length 512 — the token-classification
# head predicts one label per token, but only one label per example was supplied.