Problems running a Whisper model locally on a Mac

Hi, I am an absolute beginner and want to run a model that transcribes audio into Luxembourgish text. I found this model: ZLSCompLing/whisper_large_lb_ZLS_v4_38h
I get the following errors when running my program:
Traceback (most recent call last):
  File "/.stt/whisperLuxLarge.py", line 55, in <module>
    whisper_model.load_state_dict(hf_state_dict)
  File "/.stt/stt-lux/lib/python3.9/site-packages/torch/nn/modules/module.py", line 2153, in load_state_dict
    raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
RuntimeError: Error(s) in loading state_dict for Whisper:
    size mismatch for encoder.conv1.weight: copying a param with shape torch.Size([1280, 80, 3]) from checkpoint, the shape in current model is torch.Size([1280, 128, 3]).
    size mismatch for decoder.token_embedding.weight: copying a param with shape torch.Size([51865, 1280]) from checkpoint, the shape in current model is torch.Size([51866, 1280]).

The program is the following:

# Load model directly
import whisper
import torch
import time

myAudioPath = "./audio/"
#MODEL_PATH = "./models/"
MODEL_PATH = "./models/ZLSCompLing/whisper_large_lb_ZLS_v4_38h/pytorch_model.bin"
#MODEL_PATH = "./models/steja/whisper-small-luxembourgish/pytorch_model.bin"

def hf_to_whisper_states(text):
    # Map Hugging Face parameter names onto openai-whisper parameter names
    return (text
        .replace("model.", "")
        .replace("layers", "blocks")
        .replace("fc1", "mlp.0")
        .replace("fc2", "mlp.2")
        .replace("final_layer_norm", "mlp_ln")
        .replace(".self_attn.q_proj", ".attn.query")
        .replace(".self_attn.k_proj", ".attn.key")
        .replace(".self_attn.v_proj", ".attn.value")
        .replace(".self_attn_layer_norm", ".attn_ln")
        .replace(".self_attn.out_proj", ".attn.out")
        .replace(".encoder_attn.q_proj", ".cross_attn.query")
        .replace(".encoder_attn.k_proj", ".cross_attn.key")
        .replace(".encoder_attn.v_proj", ".cross_attn.value")
        .replace(".encoder_attn_layer_norm", ".cross_attn_ln")
        .replace(".encoder_attn.out_proj", ".cross_attn.out")
        .replace("decoder.layer_norm.", "decoder.ln.")
        .replace("encoder.layer_norm.", "encoder.ln_post.")
        .replace("embed_tokens", "token_embedding")
        .replace("encoder.embed_positions.weight", "encoder.positional_embedding")
        .replace("decoder.embed_positions.weight", "decoder.positional_embedding")
        .replace("layer_norm", "ln_post")
    )

# Load HF Model
print("loading Model ",MODEL_PATH )
hf_state_dict = torch.load(MODEL_PATH, map_location=torch.device('cpu'))    # pytorch_model.bin file
print("Model loaded")

# Rename layers
print("renaming layers")
for key in list(hf_state_dict.keys()):
    new_key = hf_to_whisper_states(key)
    hf_state_dict[new_key] = hf_state_dict.pop(key)

print(" initialising Model")
# Init Whisper Model and replace model weights
whisper_model = whisper.load_model('large')  # need to use the same size than the model used
print("model created")
whisper_model.load_state_dict(hf_state_dict)

print("transcribing")
# set timer
start = time.time()
result = whisper_model.transcribe("./audio/audio-test.mp3")
end = time.time()
print(f'The text in the audio:\n{result["text"]}')
print(f'It took {end - start} seconds')

Could anyone please help me resolve, and understand, this problem?

Hi,

The model you refer to (ZLSCompLing/whisper_large_lb_ZLS_v4_38h) appears to be compatible with the Transformers library, so you can use code along these lines to transcribe audio.
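
A minimal sketch with the Transformers ASR pipeline (the model id is the one from your post; chunk_length_s and the MPS/CPU device selection are assumptions on my side, not something the model card prescribes):

import torch
from transformers import pipeline

# Use the Apple-silicon GPU (MPS) if available, otherwise fall back to the CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"

# Build an automatic-speech-recognition pipeline around the fine-tuned checkpoint
pipe = pipeline(
    "automatic-speech-recognition",
    model="ZLSCompLing/whisper_large_lb_ZLS_v4_38h",
    chunk_length_s=30,  # split long recordings into 30-second chunks
    device=device,
)

result = pipe("./audio/audio-test.mp3")
print(result["text"])

As for understanding the error itself: the two size mismatches (80 vs. 128 input mel channels, 51865 vs. 51866 token embeddings) suggest that the checkpoint was fine-tuned from large-v2, while whisper.load_model('large') nowadays resolves to large-v3, which uses a 128-bin mel spectrogram and has one extra vocabulary token. If you want to keep your renaming script and the openai-whisper package, loading the matching architecture should make the shapes line up:

whisper_model = whisper.load_model('large-v2')  # matches the 80-mel, 51865-token checkpoint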