Marian: Language Discovery questions

Back-translation snippet (English → Cebuano → English) using MarianMT models


# Round-trip (back-translation) demo: translate English -> Cebuano with one
# MarianMT model, then translate the result back to English with the reverse
# model. Requires `transformers` and `torch`.
import torch  # fix: original snippet used torch.cuda without importing torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

mname_fwd = 'Helsinki-NLP/opus-mt-en-ceb'  # ceb=cebuano https://en.wikipedia.org/wiki/Cebuano_language
mname_bwd = 'Helsinki-NLP/opus-mt-ceb-en'
src_text = ['I am a small frog with tiny legs.']

torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Forward (en->ceb) and backward (ceb->en) models + their tokenizers.
fwd = AutoModelForSeq2SeqLM.from_pretrained(mname_fwd).to(torch_device)
fwd_tok = AutoTokenizer.from_pretrained(mname_fwd)
bwd_tok = AutoTokenizer.from_pretrained(mname_bwd)
bwd = AutoModelForSeq2SeqLM.from_pretrained(mname_bwd).to(torch_device)

# fp16 halves memory and speeds up GPU inference; stay fp32 on CPU,
# where half precision is slow or unsupported.
if torch_device == 'cuda':
    fwd = fwd.half()
    bwd = bwd.half()

# Forward pass: English -> Cebuano.
fwd_batch = fwd_tok(src_text, return_tensors='pt').to(torch_device)
translated = fwd.generate(**fwd_batch, num_beams=2)
translated_txt = fwd_tok.batch_decode(translated, skip_special_tokens=True)

# Backward pass: Cebuano -> English (the back-translation).
bwd_batch = bwd_tok(translated_txt, return_tensors='pt').to(torch_device)
backtranslated = bwd.generate(**bwd_batch, num_beams=2)
result = bwd_tok.batch_decode(backtranslated, skip_special_tokens=True)
# ['I am a small toad with small feet.']
3 Likes