The problem Trying to back-translate 90k csv rows (fr-en-fr) using MarianModel huggingface. I keep getting different kinds of memory problems, at first my session just crashed and if I had a small csv to try it went extremely slow
What I tried
I took a code that uses GPU (colab pro) and also read the csv in chunks, but now I am getting CUDA out of memory
. I am unsure what to do, I would be eternally grateful for anyone that can point me in the right direction. Thank you!
This is the code:
from transformers import MarianMTModel, MarianTokenizer
target_model_name = 'Helsinki-NLP/opus-mt-fr-en'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name).to('cuda:0')
en_model_name = 'Helsinki-NLP/opus-mt-en-fr'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name).to('cuda:0')
def translate(texts, model, tokenizer, language="en"):
# Prepare the text data into appropriate format for the model
template = lambda text: f"{text}" if language == "fr" else f">>{language}<< {text}"
src_texts = [template(text) for text in texts]
# Tokenize the texts
encoded = tokenizer.prepare_seq2seq_batch(src_texts,
return_tensors='pt').to('cuda:0')
# Generate translation using model
translated = model.generate(**encoded)
# Convert the generated tokens indices back into text
translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
return translated_texts
def back_translate(texts, target_lang="en", source_lang="fr"):
# Translate to target language
fr_texts = translate(texts, target_model, target_tokenizer,
language=target_lang)
# Translate from target language back to source language
back_translated_texts = translate(fr_texts, en_model, en_tokenizer,
language=source_lang)
return back_translated_texts
# lets do the work
import pandas as pd
csv_file="train.csv"
out_csv= "out_records.csv"
chunksize = 100
for chunk in pd.read_csv(csv_file, chunksize=chunksize):
chunk_results = list(chunk['text'])
#chunk_translate = perform_back_translation(chunk_results)
chunk_translate = back_translate(chunk_results, source_lang="fr", target_lang="en")
df = pd.DataFrame({'col':chunk_translate})
df.to_csv(out_csv, mode='a', sep='|', encoding='utf-8', header=None, index=False)