Hey, I tried to train the roberta2roberta model like this:
from transformers import EncoderDecoderModel, RobertaTokenizer
import torch
import argparse
from ff_dataloader import CNNDataset, DataLoader, collate_fn
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()
# sequence-length settings the dataset expects, packed into an args namespace
args = argparse.Namespace(max_src_len=512, max_dst_len=128)
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
# encoder and decoder are both warm-started from roberta-base; the cross-attention weights are newly initialised
model = EncoderDecoderModel.from_encoder_decoder_pretrained('roberta-base', 'roberta-base')
model = model.to(0)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
dataset_path = 'dataset/example.json'
vocab_path = 'dataset/vocab.txt'
dataset = CNNDataset(dataset_path, vocab_path, args)
dataloader = DataLoader(dataset, batch_size=16, collate_fn=collate_fn)
cnt = 0
for epoch in range(20):
    for src, dst in dataloader:
        src = torch.stack(src).to(0)
        # attention mask: 1 for every non-pad position (this assumes the pad id is 0)
        mask = (src != 0).long()
        # replace pad tokens in the targets with -100 so the loss ignores them
        labels = [[-100 if token == tokenizer.pad_token_id else token for token in seq] for seq in dst]
        dst = torch.stack(dst).to(0)
        labels = torch.tensor(labels).to(0)
        outputs = model(input_ids=src, attention_mask=mask, decoder_input_ids=dst, labels=labels, return_dict=True)
        loss, logits = outputs.loss, outputs.logits
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if cnt % 100 == 0:
            writer.add_scalar('loss', loss.item(), cnt)
        if cnt % 1000 == 0:
            model.save_pretrained('roberta2roberta-mask')
        cnt += 1
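In case it helps, collate_fn and CNNDataset come from my own ff_dataloader module; all the training loop above needs from collate_fn is two lists of equal-length, already-padded 1-D LongTensors (so that torch.stack works). A minimal stand-in, assuming padding with id 0 and fixed lengths, would be something like:

# hypothetical stand-in for ff_dataloader.collate_fn: pad every example in the
# batch to a fixed length with id 0 and return (src_tensors, dst_tensors)
def collate_fn(batch, src_len=512, dst_len=128, pad_id=0):
    srcs, dsts = [], []
    for src_ids, dst_ids in batch:
        src = torch.full((src_len,), pad_id, dtype=torch.long)
        dst = torch.full((dst_len,), pad_id, dtype=torch.long)
        src_ids = torch.as_tensor(src_ids[:src_len], dtype=torch.long)
        dst_ids = torch.as_tensor(dst_ids[:dst_len], dtype=torch.long)
        src[:len(src_ids)] = src_ids
        dst[:len(dst_ids)] = dst_ids
        srcs.append(src)
        dsts.append(dst)
    return srcs, dsts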
When I then tried to test the model like this:
from transformers import EncoderDecoderModel, RobertaTokenizer
from datasets import load_from_disk, load_metric
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model = EncoderDecoderModel.from_pretrained("roberta2roberta-mask")
model.to("cuda")
test_dataset = load_from_disk("test_dataset")
batch_size = 128
tokenizer.bos_token = tokenizer.cls_token
# SEP token will work as EOS token
tokenizer.eos_token = tokenizer.sep_token
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
# map data correctly
def generate_summary(batch):
    # the tokenizer automatically wraps the text as <s> ... </s>
    # and cuts it off at RoBERTa's maximum length of 512
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")
    outputs = model.generate(input_ids, attention_mask=attention_mask, decoder_start_token_id=model.config.decoder_start_token_id)
    # all special tokens, including padding, are removed from the decoded strings
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    batch["pred"] = output_str
    return batch
results = test_dataset.map(generate_summary, batched=True, batch_size=batch_size, remove_columns=["article"])
# load rouge for validation
rouge = load_metric("rouge")
pred_str = results["pred"]
label_str = results["highlights"]
rouge_output = rouge.compute(predictions=pred_str, references=label_str)
print('roberta2roberta')
print(rouge_output)
It generated all pad tokens.
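Could the problem be that I pass both decoder_input_ids and the unshifted labels during training? If I read the EncoderDecoderModel docs correctly, setting the special token ids on the config and passing only labels should make the model build the shifted decoder inputs itself. A rough, untested sketch of what I mean (same data and padding assumptions as my loop above):

# untested sketch: set the special token ids once, before training
model.config.decoder_start_token_id = tokenizer.cls_token_id   # <s>
model.config.eos_token_id = tokenizer.sep_token_id             # </s>
model.config.pad_token_id = tokenizer.pad_token_id

for src, dst in dataloader:
    src = torch.stack(src).to(0)
    mask = (src != 0).long()                      # same pad-id assumption as above
    labels = torch.stack(dst).to(0)
    labels[labels == tokenizer.pad_token_id] = -100
    # no decoder_input_ids here: the model is supposed to create them
    # internally by shifting the labels one position to the right
    outputs = model(input_ids=src, attention_mask=mask, labels=labels, return_dict=True)
    outputs.loss.backward()
    optimizer.step()
    optimizer.zero_grad()

Is that the intended way to train this model, or is something else causing the all-pad generations?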