I am a beginner around here. I would like to generate some samples of abstractive text summaries using Pegasus. I used the code snippet from the "Pegasus — transformers 4.3.0 documentation" page. However, I noticed that the summary is the same every time I call generate and decode the result.
How should I generate distinct samples of summaries? Thank you in advance.
#! pip install transformers
#! pip install datasets
#! pip install sentencepiece
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import datasets
# Load the model and its tokenizer from the same pretrained checkpoint
model = PegasusForConditionalGeneration.from_pretrained(
    "sshleifer/distill-pegasus-xsum-16-4"
)
tokenizer = PegasusTokenizer.from_pretrained(
    "sshleifer/distill-pegasus-xsum-16-4"
)

# Fetch the first ten examples of the XSum validation split
data = datasets.load_dataset("xsum", split="validation[:10]")

# Use the documents at positions 0 and 3 as the texts to summarize
text2summarize_1, text2summarize_2 = (data[i]["document"] for i in (0, 3))
# print(text2summarize_1)
# print(text2summarize_2)
def generate_for_sample(sample, **kwargs):
    """
    Summarize ``sample`` and return the decoded summaries as a list of strings.

    ``sample`` is handed to the module-level ``tokenizer`` and truncated to
    at most 1024 tokens. A list of texts is padded to a common length so that
    it can be summarized as one batch.

    kwargs are passed on to the model's generate function (for example
    ``num_beams``, ``do_sample``, ``top_k``, ``top_p``). An ``attention_mask``
    supplied by the caller takes precedence over the tokenizer's.

    The returned list holds one decoded string per sequence that ``generate``
    produces.
    """
    inputs = tokenizer(
        sample,
        truncation=True,
        max_length=1024,
        padding=True,
        return_tensors='pt',
    )
    # Forward the attention mask together with the ids so that padded
    # positions in a batch are ignored by the model.
    kwargs.setdefault('attention_mask', inputs['attention_mask'])
    summary_ids = model.generate(inputs['input_ids'], **kwargs)
    return tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
# First pass: no generation arguments are supplied
print("Summaries generated with default parameters:")
summary_1, summary_2 = (
    generate_for_sample(text)
    for text in (text2summarize_1, text2summarize_2)
)
print("summary_1: {}".format(summary_1))
print("summary_2: {}".format(summary_2))

# Show a few of the generation settings stored on the model config
print(
    "Some default parameter values: ",
    "num_beams={}, do_sample={}, top_k={}, top_p={}".format(
        model.config.num_beams,
        model.config.do_sample,
        model.config.top_k,
        model.config.top_p,
    ),
)

# Second pass: pass explicit generation arguments for each text
print("Summaries generated with custom parameter values:")
summary_1 = generate_for_sample(text2summarize_1, num_beams=4)
summary_2 = generate_for_sample(
    text2summarize_2,
    do_sample=True,
    top_k=10,
    top_p=0.8,
)
print("summary_1: {}".format(summary_1))
print("summary_2: {}".format(summary_2))
Output:
Summaries generated with default parameters:
summary_1: ['Apple has been accused of misleading customers in Australia over its new iPad.']
summary_2: ["The world's first marine energy system has been installed in the North Sea."]
Some default parameter values: num_beams=8, do_sample=False, top_k=50, top_p=1.0
Summaries generated with custom parameter values:
summary_1: ['Apple is facing legal action in Australia over its new iPad with wi-fi and 4G.']
summary_2: ['A marine energy system has been installed in the North Sea for the first time.']