### System Info
- `transformers` version: 4.30.2
- Platform: Linux-4.19.0-25…-cloud-amd64-x86_64-with-debian-10.13
- Python version: 3.7.12
- Huggingface_hub version: 0.15.1
- Safetensors version: 0.3.1 but is ignored because of PyTorch version too old.
- PyTorch version (GPU?): 1.9.0+cu111 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: Yes
- Using distributed or parallel set-up in script?: No
### Who can help?
@NielsRogge
### Information
- [ ] The official example scripts
- [X] My own modified scripts
### Tasks
- [ ] An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)
- [X] My own task or dataset (give details below)
### Reproduction
```
from torch.utils.data import Dataset
import av
import numpy as np
import torch
from PIL import Image
from huggingface_hub import hf_hub_download
from transformers import AutoProcessor, AutoModelForCausalLM, TrainingArguments, Trainer
from generativeimage2text.make_dataset import create_video_captions
from typing import Union, List
import json
import glob
import os
import math
from datasets import load_dataset
import shutil
from tqdm import tqdm
from moviepy.editor import VideoFileClip
from evaluate import load
processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
np.random.seed(45)
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def sample_frame_indices(clip_len, seg_len):
    '''
    Sample a given number of frame indices from the video.
    Args:
        clip_len (`int`): Total number of frames to sample.
        seg_len (`int`): Maximum allowed index of sample's last frame.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    frame_sample_rate = (seg_len / clip_len) - 1
    converted_len = int(clip_len * frame_sample_rate)
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices

class VideoCaptioningDataset(Dataset):
    def __init__(self, videos, captions, processor, num_frames):
        self.videos = videos
        self.captions = captions
        self.processor = processor
        self.num_frames = num_frames
        self.cache = {}  # to store processed samples

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        if idx in self.cache:
            return self.cache[idx]
        video_file = list(self.videos)[idx]
        caption = self.captions[idx]
        container = av.open(video_file)
        indices = sample_frame_indices(
            clip_len=self.num_frames, seg_len=container.streams.video[0].frames
        )
        frames = read_video_pyav(container, indices)
        # process the pixel values and caption with the processor
        pixel_values = self.processor(images=list(frames), return_tensors="pt").pixel_values
        # pixel_values = pixel_values.squeeze(0)
        inputs = self.processor(text=caption, return_tensors="pt", padding="max_length", max_length=50)
        sample = {
            "pixel_values": pixel_values,
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": inputs["input_ids"],
        }
        self.cache[idx] = sample  # store the processed sample for reuse
        return sample
from sklearn.model_selection import train_test_split
videos = ['/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_03_segment_0.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_0.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_1.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_2.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_3.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_4.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_5.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_6.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_7.mp4',
'/home/name/GenerativeImage2Text/generativeimage2text/output_videos/clip_07_segment_8.mp4']
captions = ['hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi', 'hi']  # demo captions -- the real data is PHI, but I can confirm the video files exist and are in the same format, so that isn't the issue.
dataset = VideoCaptioningDataset(videos, captions, processor, 6)
train_dataset, val_dataset = train_test_split(dataset, test_size=0.1)
training_args = TrainingArguments(
    output_dir="video_finetune_1",
    learning_rate=5e-5,
    num_train_epochs=50,
    fp16=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=16,
    save_total_limit=3,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=50,
    logging_steps=50,
    remove_unused_columns=False,
    push_to_hub=False,
    label_names=["labels"],
    load_best_model_at_end=True,
)
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predicted = logits.argmax(-1)
    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
    wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
    bleu_score = bleu.compute(predictions=decoded_predictions, references=decoded_labels)
    return {"wer_score": wer_score, "bleu_score": bleu_score}

wer = load("wer")
bleu = load("bleu")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
val = train_dataset[0]["pixel_values"].ndim
print(f"the dim is {val}")
trainer.train()
```
At this point, the print shows a dimension of 5 (as expected). But when I print the dimension of `pixel_values` in the first line of `forward` in `modeling_git.py`, the dimension is 6. Because of this I get the error:
```
raise ValueError("pixel_values must be of rank 4 or 5")
ValueError: pixel_values must be of rank 4 or 5
```
This is the full stack trace for reference:
```
File "generativeimage2text/video_finetune.py", line 231, in <module>
trainer.train()
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 1649, in train
ignore_keys_for_eval=ignore_keys_for_eval,
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 1938, in _inner_training_loop
tr_loss_step = self.training_step(model, inputs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 2759, in training_step
loss = self.compute_loss(model, inputs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/trainer.py", line 2784, in compute_loss
outputs = model(**inputs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/accelerate/utils/operations.py", line 553, in forward
return model_forward(*args, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/accelerate/utils/operations.py", line 541, in __call__
return convert_to_fp32(self.model_forward(*args, **kwargs))
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/torch/cuda/amp/autocast_mode.py", line 141, in decorate_autocast
return func(*args, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/models/git/modeling_git.py", line 1507, in forward
return_dict=return_dict,
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/name/GenerativeImage2Text/git2/lib/python3.7/site-packages/transformers/models/git/modeling_git.py", line 1250, in forward
raise ValueError("pixel_values must be of rank 4 or 5")
ValueError: pixel_values must be of rank 4 or 5
```
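If it helps with triaging, here is a minimal, self-contained sketch (with hypothetical shapes, not my real data) of what I suspect is happening: the default data collator stacks each sample's tensors along a new batch dimension, so a rank-5 per-sample `pixel_values` would arrive at `forward` with rank 6.
```
import torch
from transformers import default_data_collator

# one fake sample shaped like what my dataset's print reports (rank 5)
sample = {"pixel_values": torch.zeros(1, 6, 3, 224, 224)}
batch = default_data_collator([sample, sample])
print(batch["pixel_values"].ndim)  # 6 -> "pixel_values must be of rank 4 or 5"
```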
### Expected behavior
Ideally, the dimension of `pixel_values` inside `forward` would also be 5, and the fine-tuning of git-base-vatex on video would work. This is a blocking issue and any help would be really appreciated!
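For reference, this is a sketch of the kind of custom collate function I would expect to restore that shape (the name `collate_video_batch` is mine and it is untested against my full setup): it drops the extra leading dimension per sample so that batch stacking yields rank-5 `pixel_values`.
```
import torch

def collate_video_batch(features):
    # squeeze the per-sample leading dim so stacking gives (batch, num_frames, 3, H, W)
    return {
        "pixel_values": torch.stack([f["pixel_values"].squeeze(0) for f in features]),
        "input_ids": torch.cat([f["input_ids"] for f in features], dim=0),
        "attention_mask": torch.cat([f["attention_mask"] for f in features], dim=0),
        "labels": torch.cat([f["labels"] for f in features], dim=0),
    }

# trainer = Trainer(..., data_collator=collate_video_batch)
```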