Inference API error

Hello!
I have pushed my fine-tuned conversational model to the Hub, and the hosted Inference API is returning this error:

It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture
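
For context, as far as I understand `encoder_no_repeat_ngram_size` is a generation setting that only applies to encoder-decoder models. A minimal sketch for checking whether it is still set on the pushed checkpoint (with `<user>/test_bot` standing in for the actual repo id):

from transformers import GenerationConfig

# "<user>/test_bot" is a placeholder for the real Hub repo id
gen_config = GenerationConfig.from_pretrained("<user>/test_bot")
print(gen_config.encoder_no_repeat_ngram_size)  # prints 3 if the setting is still there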

Here is the code:

import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# raw string so the backslash in the Windows path is not treated as an escape
raw_datasets = load_dataset("IlyaGusev/gpt_roleplay_realm", revision="main", cache_dir=r"E:\HuggingFace")
dia_sets = raw_datasets["en"]
checkpoint = "facebook/blenderbot-400M-distill"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# AutoModelForCausalLM loads the decoder-only variant of the checkpoint
model = AutoModelForCausalLM.from_pretrained(checkpoint)

# make sure the tokenizer has a padding token for batching
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

def tokenize_function(batch):
    # flatten: each example's "dialogues" holds chats whose messages carry a "content" string
    texts = [message["content"] for dialogues in batch["dialogues"] for dialogue in dialogues for message in dialogue["chat"]]
    return tokenizer(texts, padding=True, truncation=True, return_overflowing_tokens=True)

tokenized_datasets = dia_sets.map(tokenize_function, batched=True, remove_columns=dia_sets.column_names)

# mlm=False gives standard (causal) language-modeling labels
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments("test_trainer",
                                  save_strategy="epoch",
                                  per_device_train_batch_size=4,
                                  gradient_accumulation_steps=8,
                                  optim="adafactor",
                                  eval_accumulation_steps=1,
                                  num_train_epochs=3,
                                  )


def compute_metrics(eval_preds):
    metric = evaluate.load("IlyaGusev/gpt_roleplay_realm", cache_dir=r"E:\HuggingFace")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets,
    # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

path = r".env\test_bot"
trainer.save_model(path)

model = AutoModelForCausalLM.from_pretrained(path)
tokenizer = AutoTokenizer.from_pretrained(path)

model.push_to_hub("test_bot")
tokenizer.push_to_hub("test_bot")

Is there anything I need to tweak so that generation does not try to use an encoder inside the model when I have it generate a dialogue? Also, the Inference API labeled the model as text generation instead of conversational on the model card.
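
For what it's worth, here is a sketch that should reproduce the error locally if it comes from the saved generation settings rather than from the Inference API itself (names reuse the variables above):

# generate() should raise the same ValueError when
# encoder_no_repeat_ngram_size is still set on a decoder-only model
inputs = tokenizer("Hello, how are you?", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=40)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))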

Edit: I have tried deleting `"encoder_no_repeat_ngram_size": 3` from the config before pushing to the Hub, but the error still occurred.
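
In case it matters: recent transformers versions keep generation defaults in a separate generation_config.json, not only in config.json, so deleting the key from one file may not be enough. A sketch of clearing it in both places before pushing (assuming a transformers version recent enough that `model.generation_config` exists):

# 0 disables the setting (it is the documented default)
model.config.encoder_no_repeat_ngram_size = 0             # legacy location (config.json)
model.generation_config.encoder_no_repeat_ngram_size = 0  # generation_config.json
model.push_to_hub("test_bot")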