Good morning,
I have fine tuned the whisper tiny model following this blog, and after training I have saved it locally with training.save_model(output_dir). My problem is that I don’t manage to load again the model and resume training. I have read several posts and tried different combination but it still does not work. Any help is welcome
my training code looks like this (I skipped some details):
@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
processor: Any
def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
# split inputs and labels since they have to be of different lengths and need different padding methods
# first treat the audio inputs by simply returning torch tensors
input_features = [{"input_features": feature["input_features"]} for feature in features]
batch = self.processor.feature_extractor.pad(input_features, return_tensors="pt")
# get the tokenized label sequences
label_features = [{"input_ids": feature["labels"]} for feature in features]
# pad the labels to max length
labels_batch = self.processor.tokenizer.pad(label_features, return_tensors="pt")
# replace padding with -100 to ignore loss correctly
labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
# if bos token is appended in previous tokenization step,
# cut bos token here as it's append later anyways
if (labels[:, 0] == self.processor.tokenizer.bos_token_id).all().cpu().item():
labels = labels[:, 1:]
batch["labels"] = labels
return batch
def main():
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
common_voice_train["train"], common_voice_train["test"] =
get_and_process_common_voice_data(commonvoice_dir) #following the blog
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
training_args = Seq2SeqTrainingArgs(# ...)
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=common_voice["train"],
eval_dataset=common_voice["test"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=processor.feature_extractor,
)
trainer.train() # this produces a lot of notices that we cannot use cache, but it auto-fixes itself...
trainer.save_model(DIRECTORY))
To resume training I do:
def resume_training():
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
data["train"], data["test"] =
get_and_process_common_voice_data(commonvoice_dir) #following the blog
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
training_args = torch.load(DIRECTORY/"training_args.bin")
config = WhisperConfig.from_json_file(DIRECTORY/"config.json")
model = WhisperModel(config)
state_dict = torch.load(DIRECTORY/"pytorch_model.bin", map_location=torch.device('cpu'))
# I have here to correct the keys of the state dict
model.load_state_dict(state_dict)
# these settings are needed for fine-tuning --> understand better why
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []
trainer = Seq2SeqTrainer(
args=training_args,
model=model,
train_dataset=common_voice["train"],
eval_dataset=common_voice["test"],
data_collator=data_collator,
compute_metrics=compute_metrics,
tokenizer=processor.feature_extractor,
)
trainer.train()
What happens is that the model seems correctly loaded, the data seems correct (contains input_features, and labels), but when trainer.train() calls the data_collator, it fails, since the features that are passed are a list composed of one dictionary, whose only key is “input_features”. the key 'labels"is missing.
Thank you in advance for any help you might provide.