### System Info
- `transformers` version: 4.30.2
- Platform: Linux-4.19.0-25…-cloud-amd64-x86_64-with-debian-10.13
- Python version: 3.7.12
- Huggingface_hub version: 0.15.1
- Safetensors version: 0.3.1 but is ignored because of PyTorch version too old.
- PyTorch version (GPU?): 1.9.0+cu111 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: Yes
- Using distributed or parallel set-up in script?: No
### Who can help?
@NielsRogge
### Information
- [ ] The official example scripts
- [X] My own modified scripts
### Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
### Reproduction
```
from torch.utils.data import Dataset
import av
import numpy as np
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, AutoModelForCausalLM, TrainingArguments, Trainer
from generativeimage2text.make_dataset import create_video_captions
from typing import Union, List
import json
import glob
import os
import math
from datasets import load_dataset
import shutil
from tqdm import tqdm
from moviepy.editor import VideoFileClip
from evaluate import load
processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
np.random.seed(45)
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def sample_frame_indices(clip_len, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    frame_sample_rate = (seg_len / clip_len) - 1
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

class VideoCaptioningDataset(Dataset):
    def __init__(self, videos, captions, processor, num_frames):
        self.videos = videos
        self.captions = captions
        self.processor = processor
        self.num_frames = num_frames
        self.cache = {}  # to store processed samples

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        if idx in self.cache:
            return self.cache[idx]
        video_file = list(self.videos)[idx]
        caption = self.captions[idx]
        container = av.open(video_file)
        indices = sample_frame_indices(
            clip_len=self.num_frames, seg_len=container.streams.video[0].frames
        )
        frames = read_video_pyav(container, indices)
        # process the pixel values and caption with the processor
        pixel_values = self.processor(images=list(frames), return_tensors="pt").pixel_values
        # pixel_values = pixel_values.squeeze(0)
        inputs = self.processor(text=caption, return_tensors="pt", padding="max_length", max_length=50)
        sample = {
            "pixel_values": pixel_values,
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": inputs["input_ids"],
        }
        self.cache[idx] = sample  # store the processed sample for reuse
        return sample
from sklearn.model_selection import train_test_split
videos = ['/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_03_segment_0.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_0.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_1.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_2.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_3.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_4.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_5.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_6.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_7.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_8.mp4']
captions = ['hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi']  # demo captions -- the real data is PHI, but I can confirm the video files exist and are in the same format, so that isn't the issue.
dataset = VideoCaptioningDataset(videos, captions, processor, 6)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.1)
training_args = TrainingArguments(
    output_dir="video_finetune_1",
    learning_rate=5e-5,
    num_train_epochs=50,
    fp16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    save_total_limit=3,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=True,
)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
    bleu_score = bleu.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"wer_score": wer_score, "bleu_score": bleu_score}

wer = load("wer")
bleu = load("bleu")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
val = train_dataset[0]["pixel_values"].ndim
print(f"the dim is {val}")
trainer.train()
```
At this point, the print shows a dimension of 5 (as expected). But when I print the dimension of `pixel_values` in the first line of `forward` in `modeling_git.py`, the dimension is 6. Because of this I get the error:
```
raise ValueError("pixel_values must be of rank 4 or 5")
ValueError: pixel_values must be of rank 4 or 5
```
This is the full stack trace for reference:
```
File "generativeimage2text/video_finetune.py", line 231, in <module>
trainer.train()
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 1649, in train
ignore_keys_for_eval=ignore_keys_for_eval,
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 2759, in training_step
loss = self.compute_loss(model, inputs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 2784, in compute_loss
outputs = model(**inputs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/accelerate/utils/operations.py", line 553, in forward
return model_forward(*args, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/accelerate/utils/operations.py", line 541, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/torch/cuda/amp/autocast_mode.py", line 141, in decorate_autocast
return func(*args, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/models/git/modeling_git.py", line 1507, in forward
return_dict=return_dict,
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/models/git/modeling_git.py", line 1250, in forward
raise ValueError("pixel_values must be of rank 4 or 5")
ValueError: pixel_values must be of rank 4 or 5
```
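If it helps with triaging, here is a minimal, self-contained sketch (with hypothetical shapes, not my real data) of what I suspect is happening: the default data collator stacks each sample's tensors along a new batch dimension, so a rank-5 per-sample `pixel_values` would arrive at `forward` with rank 6.
```
import torch
from transformers import default_data_collator

# one fake sample shaped like what my dataset's print reports (rank 5)
sample = {"pixel_values": torch.zeros(1, 6, 3, 224, 224)}
batch = default_data_collator([sample, sample])
print(batch["pixel_values"].ndim)  # 6 -> "pixel_values must be of rank 4 or 5"
```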
### Expected behavior
Ideally, the dimension of `pixel_values` inside `forward` would also be 5, and the fine-tuning of git-base-vatex on video would work. This is a blocking issue and any help would be really appreciated!
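For reference, this is a sketch of the kind of custom collate function I would expect to restore that shape (the name `collate_video_batch` is mine and it is untested against my full setup): it drops the extra leading dimension per sample so that batch stacking yields rank-5 `pixel_values`.
```
import torch

def collate_video_batch(features):
    # squeeze the per-sample leading dim so stacking gives (batch, num_frames, 3, H, W)
    return {
        "pixel_values": torch.stack([f["pixel_values"].squeeze(0) for f in features]),
        "input_ids": torch.cat([f["input_ids"] for f in features], dim=0),
        "attention_mask": torch.cat([f["attention_mask"] for f in features], dim=0),
        "labels": torch.cat([f["labels"] for f in features], dim=0),
    }

# trainer = Trainer(..., data_collator=collate_video_batch)
```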