I am currently trying to get the hang of the EncoderDecoder model for seq2seq tasks built from pretrained encoder and decoder models.
I am a bit confused about how to create an encoder-decoder from two pretrained BERT models with different tokenizers and different vocabs. I am currently trying to build a seq2seq English-to-German encoder-decoder model from bert-base-uncased and dbmdz/bert-base-german-uncased. I know that there are models which share a tokenizer and vocab, but for my future project I specifically want to work with two models that do not.
I was able to create a model and start training by adding the pad, eos and cls token IDs of the decoder tokenizer to the EncoderDecoder config (as was described in the Colab notebook on warm-starting encoder-decoder models).
During training the loss decreases to nearly 0, but when I use the model to generate a translation, I only get start tokens.
Is this a problem with how I set up the model, or a problem with the fine-tuning?
Below is the code I used for creating, training and generating.
import datasets
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, EncoderDecoderModel, Seq2SeqTrainingArguments, Seq2SeqTrainer
# loading the model and both tokenizers
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased","dbmdz/bert-base-german-uncased")
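# note: from_encoder_decoder_pretrained warm-starts both BERTs, but the decoder's cross-attention
# weights are newly initialized (transformers warns about this) and are only learned during fine-tuning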
tokenizer_ger = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-uncased")
tokenizer_en = AutoTokenizer.from_pretrained("bert-base-uncased")
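# the two vocabs are different, so every generation-related special token in the EncoderDecoder
# config below has to come from the decoder (German) tokenizer, not the encoder one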
model.config.decoder_start_token_id = tokenizer_ger.cls_token_id
model.config.eos_token_id = tokenizer_ger.sep_token_id
model.config.pad_token_id = tokenizer_ger.pad_token_id
# preparing the dataset
train = datasets.load_dataset("wmt14","de-en")
train_dataset = pd.DataFrame([x["translation"] for x in train["train"]])
test_dataset = pd.DataFrame([x["translation"] for x in train["test"]])
#smaller subset for training /troubleshooting
train_dataset = train_dataset.sample(1000)
train_dataset["e_length"] = [len(tokenizer_en(x).input_ids) for x in train_dataset["en"]]
train_dataset["d_length"] = [len(tokenizer_ger(x).input_ids) for x in train_dataset["de"]]
# cap lengths at 100 tokens, since the majority of samples are shorter
train_dataset = train_dataset[train_dataset["e_length"]<100]
train_dataset = train_dataset[train_dataset["d_length"]<100]
test_dataset["e_length"] = [len(tokenizer_en(x).input_ids) for x in test_dataset["en"]]
test_dataset["d_length"] = [len(tokenizer_ger(x).input_ids) for x in test_dataset["de"]]
test_dataset = test_dataset[test_dataset["e_length"]<100]
test_dataset = test_dataset[test_dataset["d_length"]<100]
encoder_max_length=100
decoder_max_length=100
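# the function below tokenizes English with the encoder tokenizer and German with the decoder
# tokenizer, and replaces pad token IDs in the labels with -100 so they are ignored by the loss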
def process_data_to_model_inputs(batch):
    # tokenize the inputs and labels
    inputs = tokenizer_en(batch["en"], padding="max_length", truncation=True, max_length=encoder_max_length)
    outputs = tokenizer_ger(batch["de"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()

    # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
    # We have to make sure that the PAD token is ignored
    batch["labels"] = [[-100 if token == tokenizer_ger.pad_token_id else token for token in labels] for labels in batch["labels"]]
    return batch
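# optional sanity check on a tiny hand-made batch (illustration only, not part of the pipeline):
# sample = process_data_to_model_inputs({"en": ["hello world"], "de": ["hallo welt"]})
# assert sample["labels"][0].count(-100) > 0  # padded positions are masked out of the loss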
batch_size=12
# the pandas DataFrames have to be turned back into datasets.Dataset objects for .map / .set_format
train_dataset = datasets.Dataset.from_pandas(train_dataset)
test_dataset = datasets.Dataset.from_pandas(test_dataset)

train_dataset = train_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["e_length", "d_length", "en", "de"]
)
test_dataset = test_dataset.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=["e_length", "d_length", "en", "de"]
)
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)
# training
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    output_dir="/home/Documents/EN2GER",
    num_train_epochs=30,
    weight_decay=0.01,
    load_best_model_at_end=True,
    save_strategy="epoch",
    save_total_limit=4,
)
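# with predict_with_generate=True, the Seq2SeqTrainer calls model.generate() during evaluation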
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    # compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()
#generation
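# model.generate() starts every sequence from config.decoder_start_token_id (the German [CLS])
# and stops once config.eos_token_id (the German [SEP]) is produced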
def generate_result(batch):
    inputs = tokenizer_en(batch["english"].to_list(), padding="max_length", truncation=True, max_length=100, return_tensors="pt")
    # make sure the inputs live on the same device as the (possibly GPU-resident) model
    input_ids = inputs.input_ids.to(model.device)
    attention_mask = inputs.attention_mask.to(model.device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer_ger.batch_decode(outputs, skip_special_tokens=True)
    batch["translations"] = output_str
    return batch
df = pd.DataFrame({"english":["hello my name is Georg. nice to meet you", "what is your name?","I have been feeling under the weather lately", "How about you?"]})
generate_result(df)
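# expected result: the DataFrame gains a "translations" column holding the decoded German outputs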