I have used wav2vec2 pretrained model of wav2vec2-xls-r-300m, and finetuned it to 1000hrs Bengali dataset. Training took 4 full days with 20 epochs. But, there is issue in decoding. It is decoding in some arbitrary fashion, basically outputs random combination of Bengali letters (which does not have any meaning as confirmed by Bengali natives). It is showing a WER of 100% for all the sentences.
My code is based on the notebook at Google Colab
@sanchit-gandhi Pls suggest on what could have gone wrong. Should I use fairseq & redo the experiments?
================ Code snippet ===============
Code below
def compute_metrics(pred):
tokenizer = Wav2Vec2CTCTokenizer(vocabFile, unk_token=“[UNK]”, pad_token=“[PAD]”, word_delimiter_token=“|”)
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
#The feature extractor and tokenizer are wrapped into a single Wav2Vec2Processor class
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
pred_logits = pred.predictions
pred_ids = np.argmax(pred_logits, axis=-1)
pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
pred_str = processor.batch_decode(pred_ids)
# we do not want to group tokens when computing the metrics
label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
wer_metric = load_metric("wer")
wer = wer_metric.compute(predictions=pred_str, references=label_str)
return {"wer": wer}
def extract_all_chars(batch):
all_text = " ".join(batch[“transcription”])
vocab = list(set(all_text))
return {“vocab”: [vocab], “all_text”: [all_text]}
def vocabCreation(train_data, vocabFile):
vocabs = train_data.map(extract_all_chars,batched=True,batch_size=-1,keep_in_memory=True,remove_columns=train_data.column_names)
vocab_list = list(set(vocabs[“vocab”][0]) | set(vocabs[“vocab”][0]))
vocab_dict = {v: k for k, v in enumerate(vocab_list)}
vocab_dict[“|”] = vocab_dict[" “]
del vocab_dict[” “]
vocab_dict[”[UNK]“] = len(vocab_dict)
vocab_dict[”[PAD]"] = len(vocab_dict)
#vocab_dict[“NOISE”] = len(vocab_dict)
##Save the vocabulary as a json file
with io.open(vocabFile, ‘w’, encoding=‘utf8’) as vocab_file:
#with io.open(vocabFile, ‘w’, encoding=‘utf-16le’) as vocab_file:
json.dump(vocab_dict, vocab_file, ensure_ascii=False)
def readDatasets(csv_file):
df = pd.read_csv(csv_file, sep = “:”, low_memory=False)
transcription = df[‘transcription’]
fullWavFile = df[‘wav’] ## Add absolute path of wave file
data = Dataset.from_pandas(df)
new_features = data.features.copy()
new_features[“wav”] = Audio(sampling_rate = 16000)
data_out = data.cast(new_features)
return data_out #df, transcription, fullWavFile
def prepare_dataset(batch):
#Tokenizer that processes the model’s output format to text. We use the json file to instantiate an object of the Wav2Vec2CTCTokenizer class
tokenizer = Wav2Vec2CTCTokenizer(vocabFile, unk_token=“[UNK]”, pad_token=“[PAD]”, word_delimiter_token=“|”)
#Feature extractor that processes the speech signal to the model’s input format
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
#The feature extractor and tokenizer are wrapped into a single Wav2Vec2Processor class
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
#load and resample the audio data
audio = batch[“wav”]
#batched output is “un-batched” to ensure mapping is correct
#extract the input_values from the loaded audio file
batch[“input_values”] = processor(audio[“array”], sampling_rate=audio[“sampling_rate”]).input_values[0]
batch[“input_length”] = len(batch[“input_values”])
#encode the transcriptions to label ids
with processor.as_target_processor():
batch[“labels”] = processor(batch[“transcription”]).input_ids
return batch
train_data = readDatasets(trainFile)
vocabCreation(train_data, vocabFile)
tokenizer = Wav2Vec2CTCTokenizer(vocabFile, unk_token=“[UNK]”, pad_token=“[PAD]”, word_delimiter_token=“|”)
#Feature extractor that processes the speech signal to the model’s input format
feature_extractor = Wav2Vec2FeatureExtractor(feature_size=1, sampling_rate=16000, padding_value=0.0, do_normalize=True, return_attention_mask=False)
#The feature extractor and tokenizer are wrapped into a single Wav2Vec2Processor class
processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
train_ds = train_data.map(prepare_dataset, remove_columns=train_data.column_names)
valid_data = readDatasets(validFile)
valid_ds = valid_data.map(prepare_dataset, remove_columns=valid_data.column_names)
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
wv_model = Wav2Vec2ForCTC.from_pretrained(
“facebook/wav2vec2-xls-r-300m”,
#“facebook/wav2vec2-xls-r-1b”,
attention_dropout=0.0,
hidden_dropout=0.0,
feat_proj_dropout=0.0,
mask_time_prob=0.05,
layerdrop=0.0,
ctc_loss_reduction=“mean”,
pad_token_id=processor.tokenizer.pad_token_id,
vocab_size=len(processor.tokenizer)+2,
)
#set the requires_grad to False for all parameters of the feature extraction part
wv_model.freeze_feature_extractor()
logDir = ‘/exp/logs/’
training_args = TrainingArguments(
output_dir = logDir,
group_by_length=True,
per_device_train_batch_size=8,
gradient_accumulation_steps=2,
evaluation_strategy=“steps”,
num_train_epochs=20,
gradient_checkpointing=True,
fp16=True,
save_steps=500,
eval_steps=500,
logging_steps=500,
learning_rate=3e-4,
warmup_steps=500,
save_total_limit=2,
push_to_hub=False,
#report_to=“none”,
)
#All the instances can be passed to the Trainer and we are ready to start the training
trainer = Trainer(
model=wv_model,
data_collator=data_collator,
args=training_args,
compute_metrics=compute_metrics,
train_dataset=train_ds,
eval_dataset=valid_ds,
tokenizer=processor.feature_extractor,
)
os.environ[“WANDB_DISABLED”] = “true”
#Training
trainer.train()
modelSavePath = ‘/exp/models/xlsr_300m_ft_bn_model-full/’
trainer.save_model(modelSavePath)
Thanks