Hi, I am an absolute beginner and want to run a model that transcribes audio into Luxembourgish text. I found this model: ZLSCompLing/whisper_large_lb_ZLS_v4_38h
I get the following error when running my program:
Traceback (most recent call last):
  File "/.stt/whisperLuxLarge.py", line 55, in <module>
    whisper_model.load_state_dict(hf_state_dict)
  File "/.stt/stt-lux/lib/python3.9/site-packages/torch/nn/modules/module.py", line 2153, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for Whisper:
        size mismatch for encoder.conv1.weight: copying a param with shape torch.Size([1280, 80, 3]) from checkpoint, the shape in current model is torch.Size([1280, 128, 3]).
        size mismatch for decoder.token_embedding.weight: copying a param with shape torch.Size([51865, 1280]) from checkpoint, the shape in current model is torch.Size([51866, 1280]).
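
From the error, the checkpoint seems to expect 80 mel bins and a vocabulary of 51865, while the model my script builds expects 128 and 51866. To double-check, I inspected the two tensors directly in the checkpoint (a minimal sketch; the key names are my assumption, inferred from the renaming table in my program below):

import torch

ckpt = torch.load(
    "./models/ZLSCompLing/whisper_large_lb_ZLS_v4_38h/pytorch_model.bin",
    map_location="cpu",
)
# Hugging Face key names, i.e. before any renaming (assumed)
print(ckpt["model.encoder.conv1.weight"].shape)         # torch.Size([1280, 80, 3])
print(ckpt["model.decoder.embed_tokens.weight"].shape)  # torch.Size([51865, 1280])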
The program is the following:
# Load model directly
import time

import torch
import whisper

myAudioPath = "./audio/"
#MODEL_PATH = "./models/"
MODEL_PATH = "./models/ZLSCompLing/whisper_large_lb_ZLS_v4_38h/pytorch_model.bin"
#MODEL_PATH = "./models/steja/whisper-small-luxembourgish/pytorch_model.bin"
def hf_to_whisper_states(text):
    # Map Hugging Face Whisper parameter names to OpenAI Whisper names
    return (
        text
        .replace("model.", "")
        .replace("layers", "blocks")
        .replace("fc1", "mlp.0")
        .replace("fc2", "mlp.2")
        .replace("final_layer_norm", "mlp_ln")
        .replace(".self_attn.q_proj", ".attn.query")
        .replace(".self_attn.k_proj", ".attn.key")
        .replace(".self_attn.v_proj", ".attn.value")
        .replace(".self_attn_layer_norm", ".attn_ln")
        .replace(".self_attn.out_proj", ".attn.out")
        .replace(".encoder_attn.q_proj", ".cross_attn.query")
        .replace(".encoder_attn.k_proj", ".cross_attn.key")
        .replace(".encoder_attn.v_proj", ".cross_attn.value")
        .replace(".encoder_attn_layer_norm", ".cross_attn_ln")
        .replace(".encoder_attn.out_proj", ".cross_attn.out")
        .replace("decoder.layer_norm.", "decoder.ln.")
        .replace("encoder.layer_norm.", "encoder.ln_post.")
        .replace("embed_tokens", "token_embedding")
        .replace("encoder.embed_positions.weight", "encoder.positional_embedding")
        .replace("decoder.embed_positions.weight", "decoder.positional_embedding")
        .replace("layer_norm", "ln_post")
    )
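
# For example (assumed sample key), the renaming turns
#   "model.encoder.layers.0.self_attn.q_proj.weight"
# into
#   "encoder.blocks.0.attn.query.weight"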
# Load the Hugging Face checkpoint (the pytorch_model.bin file)
print("loading model", MODEL_PATH)
hf_state_dict = torch.load(MODEL_PATH, map_location=torch.device('cpu'))
print("model loaded")
# Rename the layers from Hugging Face to OpenAI Whisper naming
print("renaming layers")
for key in list(hf_state_dict.keys()):
    new_key = hf_to_whisper_states(key)
    hf_state_dict[new_key] = hf_state_dict.pop(key)

print("initialising model")
# Init Whisper model and replace its weights
whisper_model = whisper.load_model('large')  # must be the same size as the fine-tuned model
print("model created")
whisper_model.load_state_dict(hf_state_dict)
print("transcribing")
# set timer
start = time.time()
result = whisper_model.transcribe("./audio/audio-test.mp3")
end = time.time()
print(f' The text in audio: \n {result["text"]}')
print(f' it took \n {end - start}')
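
While experimenting, I also compared the dimensions of the model that whisper.load_model() builds against the checkpoint (a minimal sketch; I am assuming my installed openai-whisper resolves 'large' to the newest large checkpoint):

import whisper

m = whisper.load_model("large")         # same call as in my program
print(m.dims.n_mels, m.dims.n_vocab)    # here: 128 51866, as in the error

m2 = whisper.load_model("large-v2")     # the older large checkpoint
print(m2.dims.n_mels, m2.dims.n_vocab)  # 80 51865, matching the checkpoint

The 'large-v2' dimensions seem to match pytorch_model.bin, but I am not sure whether simply changing the model name is the right fix, or why the sizes differ in the first place.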
Could anyone please help me resolve, and understand, this problem?