import torch
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration, WhisperTokenizer, WhisperProcessor
from peft import get_peft_model, LoraConfig, TaskType
import os
from torch.utils.data import Dataset, DataLoader
import torchaudio
import pandas as pd
# Load the feature extractor, tokenizer, and processor
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-small")
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small", language="Tamil", task="transcribe")
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Tamil", task="transcribe")

# Load the model
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Define the target modules for LoRA based on the actual model structure.
# whisper-small has 12 encoder and 12 decoder layers; LoRA is applied to every
# self-attention / cross-attention projection and both feed-forward layers.
attn_projs = ["k_proj", "v_proj", "q_proj", "out_proj"]
target_modules = []
for i in range(12):
    # Encoder layers
    target_modules += [f"model.encoder.layers.{i}.self_attn.{p}" for p in attn_projs]
    target_modules += [f"model.encoder.layers.{i}.fc1", f"model.encoder.layers.{i}.fc2"]
    # Decoder layers
    target_modules += [f"model.decoder.layers.{i}.self_attn.{p}" for p in attn_projs]
    target_modules += [f"model.decoder.layers.{i}.encoder_attn.{p}" for p in attn_projs]
    target_modules += [f"model.decoder.layers.{i}.fc1", f"model.decoder.layers.{i}.fc2"]
# Configure LoRA
config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=8,
    lora_alpha=16,
    target_modules=target_modules,
    lora_dropout=0.1,
)

# Print module names to double-check the target module paths
for name, module in model.named_modules():
    print(name)
# Apply LoRA to the model
lora_model = get_peft_model(model, config)

# Set model to training mode
lora_model.train()
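# Optional sanity check: print_trainable_parameters() on the PEFT model should
# report only the LoRA adapter weights as trainable if the target_modules
# paths above actually matched modules in the base model.
lora_model.print_trainable_parameters()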
class WhisperDataset(Dataset):
    def __init__(self, csv_file, audio_dir, processor):
        self.data = pd.read_csv(csv_file)
        self.audio_dir = audio_dir
        self.processor = processor

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        audio_path = f"{self.audio_dir}/{self.data.iloc[idx, 0]}"
        transcription = self.data.iloc[idx, 1]

        # Load audio file
        speech_array, sampling_rate = torchaudio.load(audio_path)

        # Process the audio and transcription
        input_features = self.processor.feature_extractor(
            speech_array.squeeze().numpy(), sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        labels = self.processor.tokenizer(transcription, return_tensors="pt").input_ids

        return {
            "input_features": input_features.squeeze(),
            "labels": labels.squeeze()
        }
# Define a custom collate function to pad sequences
def collate_fn(batch):
    input_features = [item["input_features"] for item in batch]
    labels = [item["labels"] for item in batch]

    # Pad sequences
    input_features_padded = torch.nn.utils.rnn.pad_sequence(input_features, batch_first=True, padding_value=0)
    labels_padded = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=tokenizer.pad_token_id)

    return {
        "input_features": input_features_padded,
        "labels": labels_padded
    }
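# Quick shape check for the collator with dummy tensors (hypothetical values,
# assuming whisper-small's 80 mel bins x 3000 frames): labels of different
# lengths should pad to the longest sequence in the batch.
_demo = collate_fn([
    {"input_features": torch.randn(80, 3000), "labels": torch.tensor([1, 2, 3])},
    {"input_features": torch.randn(80, 3000), "labels": torch.tensor([4, 5])},
])
print(_demo["input_features"].shape, _demo["labels"].shape)  # expected: (2, 80, 3000) and (2, 3)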
# Example usage
csv_file = "./dup.csv"  # Path to your CSV file (first column: audio filename, second column: transcription)
audio_dir = "."  # Path to the directory containing your audio files
processor = WhisperProcessor.from_pretrained("openai/whisper-small", language="Tamil", task="transcribe")

dataset = WhisperDataset(csv_file, audio_dir, processor)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)
# Define the optimizer
optimizer = torch.optim.Adam(lora_model.parameters(), lr=1e-4)

# Fine-tuning loop
num_epochs = 3  # Define the number of epochs
for epoch in range(num_epochs):
    for batch in dataloader:
        input_features = batch["input_features"].to(lora_model.device)
        labels = batch["labels"].to(lora_model.device)

        # Debug output for the batch being fed to the model
        print(f"Batch input features shape: {input_features.shape}")
        print(f"Batch labels shape: {labels.shape}")
        print(f"Batch labels: {labels}")

        # Forward pass
        outputs = lora_model(input_features=input_features, labels=labels)
        loss = outputs.loss

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")
# Save the fine-tuned model
lora_model.save_pretrained("/content/fine_tuned_whisper_with_lora")

Running the code above produces the following error:

TypeError: WhisperForConditionalGeneration.forward() got an unexpected keyword argument 'input_ids'
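For reference, the training loop only passes input_features and labels, and the base WhisperForConditionalGeneration should accept exactly that call when used without PEFT, so the extra input_ids keyword appears to be introduced somewhere inside the PEFT wrapper rather than by my code. A minimal check with dummy tensors (shapes and token ids are arbitrary, assuming whisper-small's 80 mel bins and 3000 frames):

import torch
from transformers import WhisperForConditionalGeneration

base = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
dummy_features = torch.randn(1, 80, 3000)  # (batch, mel bins, frames)
dummy_labels = torch.tensor([[50258, 50259, 50359, 50363, 100, 50257]])  # arbitrary token ids
outputs = base(input_features=dummy_features, labels=dummy_labels)
print(outputs.loss)  # runs without the input_ids TypeError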