Model.generate generates the same output for different inputs

Hi! I trained a SpeechT5 model on an ASR task, but I always get the same output sequence from model.generate() for different inputs… During training I use compute_metrics, and there the output actually matches the labels, but once training is finished the model does weird things. Does anybody know why this occurs? Below is the code I used to configure my model and to prepare my data… I already read something about right-shifting your labels when using seq2seq models, but I don't know if that really helps…
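For context, my understanding of the right shift is that the decoder inputs are just the labels shifted by one position with decoder_start_token_id prepended. This is only a minimal sketch of that pattern (along the lines of the usual Hugging Face shift_tokens_right helper), not necessarily exactly what SpeechT5 does internally:

import torch

def shift_tokens_right(labels: torch.Tensor, pad_token_id: int, decoder_start_token_id: int) -> torch.Tensor:
    # Prepend the decoder start token and drop the last label, so the decoder
    # learns to predict token t from the tokens before t (teacher forcing).
    shifted = labels.new_zeros(labels.shape)
    shifted[:, 1:] = labels[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    # Any -100 placeholders in the labels are mapped back to the pad token.
    shifted.masked_fill_(shifted == -100, pad_token_id)
    return shifted

As far as I understand, the model builds the decoder_input_ids like this on its own when you only pass labels, so I did not shift anything manually.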

import torch
from transformers import (
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    SpeechT5Config,
    SpeechT5ForSpeechToText,
)

config = SpeechT5Config(
    return_dict=False,
    sampling_rate=16000,
    vocab_size=tokenizer.vocab_size,
    use_cache=False,
    activation_function='relu',
    max_text_positions=2450,
    max_speech_positions=160000,
    pad_token_id=3,
    bos_token_id=1,
    eos_token_id=2,
    unk_token_id=0,
    decoder_start_token_id=2
)

model = SpeechT5ForSpeechToText(config)
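Just to rule out a mismatch in the hard-coded special token ids, I compare them against the tokenizer (tokenizer and model as defined above):

# Check that the hard-coded special token ids match the tokenizer,
# since a wrong decoder_start/eos/pad id would break generation.
print(tokenizer.pad_token_id, tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.unk_token_id)
print(model.config.decoder_start_token_id, model.config.eos_token_id, model.config.pad_token_id)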

training_args = Seq2SeqTrainingArguments(
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=250,
    save_steps=250,
    logging_steps=250,
    output_dir=r"/home/ec2-user/SageMaker/SpeechT5/Model 2 Char 10Sek 14Ep/Output Dir 18Ep",
    num_train_epochs=18,
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_dir=r"/home/ec2-user/SageMaker/SpeechT5/Model 2 Char 10Sek 14Ep/Log Dir 18Ep",
    gradient_accumulation_steps=1,
    report_to="tensorboard",
    fp16=True,
    fp16_full_eval=True,
    lr_scheduler_type="linear",
    seed=42
)
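One thing I'm not sure about: I did not set predict_with_generate, so as far as I understand the predictions that reach compute_metrics during evaluation come from the teacher-forced logits rather than from model.generate(). If that matters, the variant I would try looks like this (same arguments as above, only the output_dir repeated here):

training_args = Seq2SeqTrainingArguments(
    output_dir=r"/home/ec2-user/SageMaker/SpeechT5/Model 2 Char 10Sek 14Ep/Output Dir 18Ep",
    predict_with_generate=True,   # let the trainer call generate() during eval
    generation_max_length=2450,   # matches max_text_positions above
    # ... remaining arguments as in the block above
)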

torch.cuda.empty_cache()

torch.manual_seed(42)

model.to(device)
model.train()

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
    tokenizer=processor
)

trainer.train()
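For completeness, this is roughly how I run inference after training (test_sample is just a placeholder for one example from my test set; processor, model and device as above):

# Rough sketch of my inference call; test_sample is a placeholder example.
model.eval()
model.config.use_cache = True  # re-enable the cache that was disabled in the config

inputs = processor(
    audio=test_sample["audio"]["array"],
    sampling_rate=16000,
    return_tensors="pt",
)

with torch.no_grad():
    generated_ids = model.generate(
        inputs["input_values"].to(device),
        max_length=2450,  # up to max_text_positions
    )

print(processor.batch_decode(generated_ids, skip_special_tokens=True))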

def preprocess_function(examples):
    audio = examples["audio"]

    input_data = processor(
        audio=audio["array"],
        text_target=examples["transcription"],
        sampling_rate=16000
    )

    return input_data
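I apply this with datasets.map; raw_dataset is just my name here for the loaded DatasetDict with "audio" and "transcription" columns:

# raw_dataset is a placeholder name for my loaded DatasetDict.
train_dataset = raw_dataset["train"].map(
    preprocess_function,
    remove_columns=raw_dataset["train"].column_names,
)
valid_dataset = raw_dataset["validation"].map(
    preprocess_function,
    remove_columns=raw_dataset["validation"].column_names,
)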


from dataclasses import dataclass
from typing import Any, Dict, List, Union


@dataclass
class DataCollatorForSeq2Seq:
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Separate the audio inputs and the tokenized transcriptions,
        # since they need different padding.
        input_features = [{"input_values": list(feature["input_values"][0])} for feature in features]
        label_features = [{"input_ids": feature["labels"]} for feature in features]

        # Pad the audio inputs to a fixed length.
        batch = self.processor.pad(
            input_values=input_features,
            return_tensors="pt",
            return_attention_mask=True,
            padding='max_length',
            max_length=160000,
        )

        # Pad the tokenized transcriptions to a fixed length.
        labels_batch = self.processor.pad(
            labels=label_features,
            return_tensors="pt",
            return_attention_mask=True,
            padding='max_length',
            max_length=2450,
        )

        batch["labels"] = labels_batch["input_ids"]
        batch["decoder_attention_mask"] = labels_batch["attention_mask"]

        # print("Input values after padding:", batch["input_values"].shape)
        # print("Attention mask after padding:", batch["attention_mask"].shape)
        # print("Labels after padding:", batch["labels"].shape)
        # print("Decoder attention mask after padding:", batch["decoder_attention_mask"].shape)

        return batch


data_collator = DataCollatorForSeq2Seq(processor=processor)
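And a quick check that the collator returns the shapes I expect (using two examples from train_dataset as prepared above):

# Sanity check of the collator output on two preprocessed examples.
sample_batch = data_collator([train_dataset[0], train_dataset[1]])
print(sample_batch["input_values"].shape, sample_batch["attention_mask"].shape)
print(sample_batch["labels"].shape, sample_batch["decoder_attention_mask"].shape)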

@Luan77777 Hi, I faced the same issue when using the GPT2 model to generate sequences. The output is always the same even when there is no input. Have you solved the problem?