Labeling model with hugginface Dataset

I have the following code

from scipy.spatial.distance import dice, directed_hausdorff
from sklearn.metrics        import f1_score
from segments import SegmentsClient
from segments import SegmentsDataset
from datasets import load_dataset
from segments.huggingface import release2dataset
from huggingface_hub import cached_download, hf_hub_url
from transformers import TrainingArguments
import numpy as np
from transformers import Trainer
from torchvision.transforms import ColorJitter
from transformers import SegformerFeatureExtractor

#!huggingface-cli login
api_key = "..."
#etc

client = SegmentsClient(api_key)
dataset_identifier = "kasumi222/busigt"
vers = "v0.1"
release = client.get_release(dataset_identifier, vers)


ds = release2dataset(release)
ds = ds.shuffle(seed=1)
ds = ds.train_test_split(test_size=0.2)
train_ds = ds["train"]
test_ds = ds["test"]

Here I’m transforming from the SegmentAi API format to a Huggingface Dataset.
However, I would like to extract the label for the following code to work:

# repo_id = f"datasets/{hf_dataset_identifier}"
filename = "dataset_infos.json"
#id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename)), "r"))
id2label = {"benigno":0, "maligno":1}
label2id = {0:"benigno",1:"maligno"}
num_labels = len(id2label)


feature_extractor = SegformerFeatureExtractor()
jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) 

def train_transforms(example_batch):
    images = [jitter(x) for x in example_batch['pixel_values']]
    labels = [x for x in example_batch['label']]
    inputs = feature_extractor(images, labels)
    return inputs


def val_transforms(example_batch):
    images = [x for x in example_batch['pixel_values']]
    labels = [x for x in example_batch['label']]
    inputs = feature_extractor(images, labels)
    return inputs


# Set transforms
train_ds.set_transform(train_transforms)
test_ds.set_transform(val_trans)

from transformers import SegformerForSemanticSegmentation

pretrained_model_name = "nvidia/mit-b0" 
model = SegformerForSemanticSegmentation.from_pretrained(
    pretrained_model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id
)


epochs = 1
lr = 0.00006
batch_size = 1

hub_model_id = "segformer-b0-finetuned-busigt"

training_args = TrainingArguments(
    "segformer-b0-finetuned-busigt-outputs",
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_total_limit=3,
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=20,
    eval_steps=20,
    logging_steps=1,
    eval_accumulation_steps=5,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id=hub_model_id,
    hub_strategy="end",
)


metric = load_metric("mean_iou")

def compute_metrics(eval_pred):
  with torch.no_grad():
    logits, labels = eval_pred
    logits_tensor = torch.from_numpy(logits)
    # scale the logits to the size of the label
    logits_tensor = nn.functional.interpolate(
        logits_tensor,
        size=labels.shape[-2:],
        mode="bilinear",
        align_corners=False,
    ).argmax(dim=1)

    pred_labels = logits_tensor.detach().cpu().numpy()
    metrics = metric.compute(predictions=pred_labels, references=labels, 
                                   num_labels=num_labels, 
                                   ignore_index=0,
                                   reduce_labels=feature_extractor.reduce_labels)
    for key, value in metrics.items():
      if type(value) is np.ndarray:
        metrics[key] = value.tolist()
    return metrics



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
    )

trainer.train()

Inside each sample of the dataset there is a field label.annotations, which contains a list with only one element, that is a dictionary with the label field, named category_id.

I suppose that the problem with the code is that it is not finding that as a label, because on the last line a warning appears

The following columns in the training set don't have a corresponding argument in `SegformerForSemanticSegmentation.forward` and have been ignored: label.annotations, image, status, uuid, name, label.segmentation_bitmap. If label.annotations, image, status, uuid, name, label.segmentation_bitmap are not expected by `SegformerForSemanticSegmentation.forward`,  you can safely ignore this message.

So what can I do to correctly train the model?

Edit:

I changed the transform functions to:

def train_transforms(example_batch):
    images = [jitter(x) for x in example_batch['image']]
    labels = [x for x in example_batch['label.annotations'][0]["category_id"]]
    inputs = feature_extractor(images, labels)
    return inputs


def val_transforms(example_batch):
    images = [jitter(x) for x in example_batch['image']]
    labels = [x for x in example_batch['label.annotations'][0]["category_id"]]
    inputs = feature_extractor(images, labels)
    return inputs

Same error:

The following columns in the training set don't have a corresponding argument in `SegformerForSemanticSegmentation.forward` and have been ignored: image, name, uuid, label.annotations, status, label.segmentation_bitmap. If image, name, uuid, label.annotations, status, label.segmentation_bitmap are not expected by `SegformerForSemanticSegmentation.forward`,  you can safely ignore this message.
/usr/local/lib/python3.7/dist-packages/transformers/optimization.py:310: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set `no_deprecation_warning=True` to disable this warning
  FutureWarning,
***** Running training *****
  Num examples = 517
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 517

---------------------------------------------------------------------------

ZeroDivisionError                         Traceback (most recent call last)

<ipython-input-31-d8203853aa11> in <module>
      1 import numpy as np
----> 2 trainer.train()

10 frames

/usr/local/lib/python3.7/dist-packages/datasets/formatting/formatting.py in _query_table(table, key)
     79     """
     80     if isinstance(key, int):
---> 81         return table.fast_slice(key % table.num_rows, 1)
     82     if isinstance(key, slice):
     83         key = range(*key.indices(table.num_rows))

ZeroDivisionError: integer division or modulo by zero
1 Like