ETA for training time is 60k hours for first generation

I’m using the “Fine-tuning for Image Classification with :hugs: Transformers” notebook on my own custom dataset, fine-tuning from “apple/mobilevitv2-1.0-imagenet1k-256”. There were no errors before I ran trainer.train(). After 15 minutes with no progress bar, the bar finally appeared but showed no progress, and another 7 minutes later it advanced by one step and reported an ETA of 60,000 hours.
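In case it’s relevant: the TensorFlow check in the code below only tells me what TensorFlow sees, so here is the extra check I ran (my own addition, not part of the notebook) to confirm whether the installed torch build has CUDA support at all under WSL2:

import torch

print(torch.__version__)          # a "+cpu" suffix would mean a CPU-only build
print(torch.version.cuda)         # None if torch was installed without CUDA support
print(torch.cuda.is_available())  # must be True for the Trainer to use the GPU
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))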

here’s my code:

pip install torch==1.11.0
pip install -q datasets transformers
pip install fsspec==2023.9.2
import tensorflow as tf
tf.config.list_physical_devices('GPU')
model_checkpoint = "apple/mobilevitv2-1.0-imagenet1k-256" # pre-trained model from which to fine-tune
batch_size = 32 # batch size for training and evaluation
pip install ipywidgets
from huggingface_hub import notebook_login

notebook_login()
%%capture
!git config --global credential.helper store
from transformers.utils import send_example_telemetry

send_example_telemetry("image_classification_notebook", framework="pytorch")
import datasets
datasets.__version__
from datasets import load_dataset 

# load a custom dataset from local/remote files or folders using the ImageFolder feature

# option 1: local/remote files (supporting the following formats: tar, gzip, zip, xz, rar, zstd)
dataset = load_dataset("imagefolder", data_files=r"Desktop/Finished.zip")

# note that you can also provide several splits:
# dataset = load_dataset("imagefolder", data_files={"train": ["path/to/file1", "path/to/file2"], "test": ["path/to/file3", "path/to/file4"]})

# note that you can push your dataset to the hub very easily (and reload afterwards using load_dataset)!
# dataset.push_to_hub("nielsr/eurosat")
# dataset.push_to_hub("nielsr/eurosat", private=True)

# option 2: local folder
# dataset = load_dataset("imagefolder", data_dir= r"Desktop/Finished.zip")

# option 3: just load any existing dataset from the hub, like CIFAR-10, FashionMNIST ...
# dataset = load_dataset("food101")
from datasets import load_metric

metric = load_metric("accuracy", trust_remote_code=True)
dataset
pip install Pillow
import PIL
example = dataset["train"][10]
example
dataset["train"].features
example['image']
example['image'].resize((200, 200))
example['label']
labels = dataset["train"].features["label"].names
label2id, id2label = dict(), dict()
for i, label in enumerate(labels):
    label2id[label] = i
    id2label[i] = label

id2label[2779]
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(model_checkpoint)
image_processor
from torchvision.transforms import (
    CenterCrop,
    Compose,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
    Resize,
    ToTensor,
)

# normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
if "height" in image_processor.size:
    size = (image_processor.size["height"], image_processor.size["width"])
    crop_size = size
    max_size = None
elif "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
    crop_size = (size, size)
    max_size = image_processor.size.get("longest_edge")

train_transforms = Compose(
        [
            RandomResizedCrop(crop_size),
            RandomHorizontalFlip(),
            ToTensor(),
            # normalize,
        ]
    )

val_transforms = Compose(
        [
            Resize(size),
            CenterCrop(crop_size),
            ToTensor(),
            # normalize,
        ]
    )

def preprocess_train(example_batch):
    """Apply train_transforms across a batch."""
    example_batch["pixel_values"] = [
        train_transforms(image.convert("RGB")) for image in example_batch["image"]
    ]
    return example_batch

def preprocess_val(example_batch):
    """Apply val_transforms across a batch."""
    example_batch["pixel_values"] = [val_transforms(image.convert("RGB")) for image in example_batch["image"]]
    return example_batch
# split up training into training + validation
splits = dataset["train"].train_test_split(test_size=0.1)
train_ds = splits['train']
val_ds = splits['test']
train_ds.set_transform(preprocess_train)
val_ds.set_transform(preprocess_val)
train_ds[0]
import torch
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(device)
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    model_checkpoint, 
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes = True, 
).to(device)

! pip install -U accelerate
model_name = model_checkpoint.split("/")[-1]

args = TrainingArguments(
    f"{model_name}",
    remove_unused_columns=False,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=batch_size,
    gradient_checkpointing = True,
    gradient_checkpointing_kwargs={'use_reentrant':False},
    fp16=True,
    num_train_epochs=15,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_strategy = "steps",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
)
import numpy as np

# the compute_metrics function takes a Named Tuple as input:
# predictions, which are the logits of the model as Numpy arrays,
# and label_ids, which are the ground-truth labels as Numpy arrays.
def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
import torch

def collate_fn(examples):
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples])
    return {"pixel_values": pixel_values, "labels": labels}
trainer = Trainer(
    model,
    args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
    data_collator=collate_fn,
)
train_results = trainer.train()
trainer.save_model()
trainer.log_metrics("train", train_results.metrics)
trainer.save_metrics("train", train_results.metrics)
trainer.save_state()
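For reference, here is the back-of-envelope step count I’d expect the progress bar to cover (my own sanity check, not from the notebook), given per_device_train_batch_size=32, gradient_accumulation_steps=2 and num_train_epochs=15:

import math

num_train_images = len(train_ds)                      # the 90% split from train_test_split above
batches_per_epoch = math.ceil(num_train_images / 32)  # per_device_train_batch_size
steps_per_epoch = math.ceil(batches_per_epoch / 2)    # gradient_accumulation_steps
total_steps = steps_per_epoch * 15                    # num_train_epochs
print(f"{steps_per_epoch} optimizer steps per epoch, {total_steps} in total")

Versions in my environment: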
  • Python 3.10.14

  • pip 24.0

  • tensorflow 2.10.0

  • torch 1.11.0

  • torchvision 0.12.0

  • transformers 4.40.0

  • scikit-learn 1.3.0

  • safetensors 0.4.3

  • pillow 10.3.0

  • opencv 4.9.0.80

  • numpy 1.26.4

  • datasets 2.19.0

  • fsspec 2023.9.2

  • ipywidgets 8.1.2

  • huggingface-hub 0.22.2

I’m using Conda on WSL2 (Ubuntu 22.04.3).