EncoderDecoder training/prediction with two tokenizers

I am currently trying to get the hang of the EncoderDecoder model for seq2seq tasks built from pretrained models.
I am a bit confused about how to create an encoder-decoder from two pretrained BERT models with different tokenizers and different vocabs. I am currently trying to build a seq2seq English-to-German encoder-decoder model with bert-base-uncased and dbmdz/bert-base-german-uncased. I know that there are models which have a tokenizer with a shared vocab, but for my future project I specifically want to work with two models that do not have that.

I was able to create a model and start training by adding the pad, eos, and cls token IDs of the decoder tokenizer to the EncoderDecoder config (as described in the linked Google Colab notebook).
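In short, this is the setup (the same lines appear again in the full script below):

from transformers import AutoTokenizer, EncoderDecoderModel

model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "dbmdz/bert-base-german-uncased")
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
tokenizer_ger = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")

# the special tokens for the decoder side are taken from the German tokenizer
model.config.decoder_start_token_id = tokenizer_ger.cls_token_id
model.config.eos_token_id = tokenizer_ger.sep_token_id
model.config.pad_token_id = tokenizer_ger.pad_token_id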

During training the loss decreases to nearly 0, but when I use the model to generate a translation, I only get start tokens.
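For illustration, this is roughly how I check the raw output of generate() (the sentence is just a made-up example, and I keep skip_special_tokens=False so the special tokens stay visible):

# quick check of what generate() actually returns
inputs = tokenizer_en("hello my name is Georg", return_tensors="pt")
outputs = model.generate(inputs.input_ids.to(model.device), attention_mask=inputs.attention_mask.to(model.device), max_length=20)
print(outputs)  # raw token IDs
print(tokenizer_ger.batch_decode(outputs, skip_special_tokens=False))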

Is this a problem with how I set up the model, or a problem with the fine-tuning?

Below is the code I used for model creation, training, and generation.

from transformers import AutoTokenizer
from transformers import EncoderDecoderModel
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np
import pandas as pd
import torch
import datasets

# loading model and tokenizers
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased","dbmdz/bert-base-german-uncased")

tokenizer_ger = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")

model.config.decoder_start_token_id = tokenizer_ger.cls_token_id
model.config.eos_token_id = tokenizer_ger.sep_token_id
model.config.pad_token_id = tokenizer_ger.pad_token_id

# preparing the dataset

wmt14 = datasets.load_dataset("wmt14", "de-en")
train_dataset = pd.DataFrame([x["translation"] for x in wmt14["train"]])
test_dataset = pd.DataFrame([x["translation"] for x in wmt14["test"]])


# smaller subset for training / troubleshooting

train_dataset = train_dataset.sample(1000)

train_dataset["e_length"] = [len(tokenizer_en(x).input_ids) for x in train_dataset["en"]]

train_dataset["d_length"] = [len(tokenizer_ger(x).input_ids) for x in train_dataset["de"]]


# cap the length at 100 tokens, since the majority of samples is shorter

train_dataset = train_dataset[train_dataset["e_length"]<100]
train_dataset = train_dataset[train_dataset["d_length"]<100]


test_dataset["e_length"] = [len(tokenizer_en(x).input_ids) for x in test_dataset["en"]]
test_dataset["d_length"] = [len(tokenizer_ger(x).input_ids) for x in test_dataset["de"]]
test_dataset = test_dataset[test_dataset["e_length"]<100]
test_dataset = test_dataset[test_dataset["d_length"]<100]

encoder_max_length=100
decoder_max_length=100

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  inputs = tokenizer_en(batch["en"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer_ger(batch["de"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer_ger.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

batch_size=12

# convert the pandas DataFrames into datasets.Dataset objects so that .map() and .set_format() work
train_dataset = datasets.Dataset.from_pandas(train_dataset, preserve_index=False)
test_dataset = datasets.Dataset.from_pandas(test_dataset, preserve_index=False)

train_dataset = train_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["e_length", "d_length", "en", "de"]
)

test_dataset = test_dataset.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["e_length", "d_length", "en","de"]
)



train_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

test_dataset.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

# training

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True, 
    output_dir="/home/Documents/EN2GER",
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_strategy = "epoch",

    save_total_limit=4,
)


trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    #compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()


#generation

def generate_result(batch):
    inputs = tokenizer_en(batch["english"].to_list(), padding="max_length", truncation=True, max_length=100, return_tensors="pt")
    input_ids = inputs.input_ids
    attention_mask = inputs.attention_mask

    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer_ger.batch_decode(outputs, skip_special_tokens=True)

    batch["translations"] = output_str

    return batch

df = pd.DataFrame({"english":["hello my name is Georg. nice to meet you", "what is your name?","I have been feeling under the weather lately", "How about you?"]})

generate_result(df)


Have you figured it out?
Is it possible to use two tokenizers, one for the encoder and one for the decoder?