ValueError: Expected input batch_size (4096) to match target batch_size (8)

/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:309: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  FutureWarning,
***** Running training *****
  Num examples = 1000
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 625
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-21-3435b262f1ae> in <module>()
----> 1 trainer.train()

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in cross_entropy(input, target, weight, size_average, ignore_index, reduce, reduction, label_smoothing)
   2844     if size_average is not None or reduce is not None:
   2845         reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2846     return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
   2847 
   2848 

ValueError: Expected input batch_size (4096) to match target batch_size (8).

I'm attempting to fine-tune a model and getting this error when running trainer.train(). Also, just to test whether I can get this running at all before splitting out a proper train and eval dataset, I'm passing the same small dataset to the Trainer for both training and evaluation.

How do I process the dataset to fit the smaller batch size?

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=4096,
    per_device_eval_batch_size=4096,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

from transformers import Trainer

trainer = Trainer(model=model, args=training_args, train_dataset=small_train_dataset, eval_dataset=small_train_dataset)
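
For context, 4096 is very likely 8 (the batch size in the training log) × 512 (the tokenizer's maximum sequence length): the model is producing one prediction per token while the dataset supplies one label per example, so raising per_device_train_batch_size to 4096 will not make the shapes match. Below is a minimal sketch of the usual way to line them up, assuming the task is single-label sequence classification; the checkpoint name, the max_length of 512, the "text"/"label" column names, and small_train_dataset being a datasets.Dataset are assumptions, not details from the post above.

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Hypothetical checkpoint; any model with a sequence-classification head works the same way.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize(batch):
    # Pad/truncate every example to the same length so each batch is (batch_size, 512).
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=512)

# Assumes small_train_dataset is a datasets.Dataset with "text" and "label" columns.
small_train_dataset = small_train_dataset.map(tokenize, batched=True)

# One label per example calls for a classification head: its logits have shape
# (batch_size, num_labels), which matches the 8 targets in each batch.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,   # counts examples per step, not tokens
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_train_dataset,
)
trainer.train()

If the labels really are one per token (token classification or language modelling), the fix goes the other way: the labels tensor must have the same sequence length as input_ids.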

Please provide more details. Also, have a look at this: Here

I am also experiencing this. Here is my code.

from transformers import AutoTokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
import torch.utils.data
from pathlib import Path
from sklearn.model_selection import train_test_split


def read_training_data_split(split_dir):
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir/label_dir).iterdir():
            texts.append(text_file.read_text())
            labels.append(0 if label_dir == "neg" else 1)

    return texts, labels


class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        labels = torch.tensor(self.labels[idx])
        item['labels'] = labels
        return item

    def __len__(self):
        return len(self.labels)


def train_model():
    train_texts, train_labels = read_training_data_split('train')
    train_texts, eval_texts, train_labels, eval_labels = train_test_split(train_texts, train_labels, test_size=.2)

    if None in train_texts or None in eval_texts:
        raise Exception("Not enough training data")

    print(train_texts)
    print(eval_texts)

    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
    eval_encodings = tokenizer(eval_texts, truncation=True, padding=True)

    print(train_encodings)
    print(eval_encodings)

    eval_dataset = Dataset(eval_encodings, eval_labels)
    train_dataset = Dataset(train_encodings, train_labels)

    training_args = TrainingArguments(
        output_dir='./model',
        num_train_epochs=3,  # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,  # batch size for evaluation
        warmup_steps=500,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir='./logs',  # directory for storing logs
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )
    trainer.train()


tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})
model = GPT2LMHeadModel.from_pretrained("gpt2")
train_model()

inputs = tokenizer.encode("Hello, my dog is cute", return_tensors="pt")
outputs = model.generate(inputs, max_length=50)

response = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(response)

I have had the same problem and managed to solve it as shown here. As @ahmedbr points out, more details, in particular which task you are training for, are required to solve this.
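
For instance, if the goal in the snippet above is binary sentiment classification (which the pos/neg folders suggest), GPT2LMHeadModel is the wrong head: it computes a token-level language-modelling loss, so its logits are flattened to (batch_size × sequence_length, vocab_size) while the targets are only batch_size class labels, which reproduces exactly this mismatch. Here is a minimal sketch of the classification variant, keeping everything else from the snippet unchanged; num_labels=2 and the pad-token handling are assumptions based on that code, not a confirmed fix for it.

from transformers import AutoTokenizer, GPT2ForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# A classification head emits one logit vector per sequence, so the loss compares
# (batch_size, 2) logits against batch_size labels.
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)

# GPT-2 ships without a pad token; after adding one to the tokenizer, the
# embedding matrix and the model config have to be told about it.
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

# The Dataset class, TrainingArguments, Trainer and train_model() from the
# snippet above can stay as they are; only these model/tokenizer lines change.

If the goal is instead to fine-tune GPT-2 as a language model (which the model.generate() call at the end implies), the labels passed to GPT2LMHeadModel must have the same shape as input_ids, typically a copy of them that the model shifts internally, rather than a single 0/1 per example.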