I am trying to make a dataset translator that translates alpaca_data_cleaned into different languages. I successfully translated part of the dataset (1,000 lines of CSV), but on the large dataset (50,000 lines) this code takes about 1 hour 30 minutes. Can anyone please guide me on how to adjust my code and make it faster? (I am a beginner.)
Here is the code, which works but is slow:
import pandas as pd
from deep_translator import GoogleTranslator
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
# Path to the input CSV file
input_path = r"alpaca_data_100line.csv"
# Path to the output CSV file
output_path = r"aaaalpaca_data_translated.csv"
# Read the input CSV file
data = pd.read_csv(input_path)
# Split the data into roughly 10 chunks (max(1, ...) guards against very small inputs)
chunk_size = max(1, len(data) // 10)
data_chunks = [data.iloc[i:i + chunk_size].copy() for i in range(0, len(data), chunk_size)]
def translate_row(subset):
    try:
        subset['output'] = subset['output'].apply(lambda x: GoogleTranslator(source='en', target='bho').translate(x))
        subset['input'] = subset['input'].apply(lambda x: GoogleTranslator(source='en', target='bho').translate(x) if pd.notnull(x) else 'NaN')
        subset['instruction'] = subset['instruction'].apply(lambda x: GoogleTranslator(source='en', target='bho').translate(x))
    except Exception as e:
        print(f"Translation error: {e}")
    return subset
if __name__ == "__main__":
    translated_chunks = []
    # Create a thread pool with an appropriate number of worker threads
    with ThreadPoolExecutor() as executor:
        # Submit translation tasks to the executor
        futures = [executor.submit(translate_row, chunk) for chunk in tqdm(data_chunks, desc="Submitting tasks")]
        # Collect the translated chunks
        translated_chunks = [future.result() for future in tqdm(futures, desc="Translating chunks", total=len(futures))]
    # Combine the translated chunks and write them to a single CSV file
    combined_data = pd.concat(translated_chunks, ignore_index=True)
    combined_data.to_csv(output_path, index=False)
    print("Done!")