Hello,
I am having trouble running tokenization with the XLSum tokenizer.
It looks like an issue on the Hugging Face side, since the file named in the error message changes every time I run my script.
The error message:
requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum/resolve/main/tokenizer_config.json
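For reference, here is a minimal sketch of how the tokenizer could be cached locally, assuming the 502s come from the tokenizer files being fetched from the Hub repeatedly (the tokenizer_cache/ path below is just an illustrative name, not part of my actual script):

from transformers import AutoTokenizer

model_name = "csebuetnlp/mT5_multilingual_XLSum"
local_dir = "tokenizer_cache/mT5_multilingual_XLSum"  # illustrative path

# download once and keep a local copy on disk
AutoTokenizer.from_pretrained(model_name).save_pretrained(local_dir)

# afterwards, load from disk without contacting the Hub
tokenizer = AutoTokenizer.from_pretrained(local_dir, local_files_only=True)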
Here is my code:
from datasets import Dataset, concatenate_datasets, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
import pandas as pd
import glob
import os
import re
# PARAMETERS
encoder_max_length = 512
decoder_max_length = 150
batch_size = 100
model_name = "csebuetnlp/mT5_multilingual_XLSum"
num_processes_to_tokenize = 12
WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))

def load_description_and_transcript_data(lang):
    # concat all CSVs and load them as one pandas dataframe
    all_files = glob.glob('data/{}/description_and_transcript*.csv'.format(lang))
    li = []
    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        li.append(df)
    df = pd.concat(li, axis=0, ignore_index=True)
    # keep only 3 columns: episode_uri, episode_description_cleaned, transcript_text
    df = df[['episode_uri', 'episode_description_cleaned', 'transcript_text']]
    df = df.dropna()
    # clean text (source https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum)
    df["transcript_text"] = df["transcript_text"].apply(WHITESPACE_HANDLER)
    df["episode_description_cleaned"] = df["episode_description_cleaned"].apply(WHITESPACE_HANDLER)
    return df

def load_train_dev_splits_data(lang):
    dev = pd.read_csv('data/{}/out.final.filtered.dev.tsv'.format(lang), index_col=None, header=0, sep='\t')
    train = pd.read_csv('data/{}/out.final.filtered.train.tsv'.format(lang), index_col=None, header=0, sep='\t')
    return train, dev

# Copied from https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb
def process_data_to_model_inputs(batch, lang):
    # load tokenizer (note: this runs once per batch in every map worker)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    inputs = tokenizer(
        batch["transcript_text"],
        # return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length
    )
    outputs = tokenizer(
        batch["episode_description_cleaned"],
        # return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids
    return batch

def tokenize_data(hf_dataset, lang):
    processed_data = hf_dataset.map(
        lambda batch: process_data_to_model_inputs(batch, lang),
        num_proc=num_processes_to_tokenize,
        batched=True,
        batch_size=batch_size,
        remove_columns=["transcript_text", "episode_description_cleaned"],
    )
    # convert the Python lists to PyTorch tensors
    processed_data.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )
    return processed_data

def preprocess_data(lang):
    """
    Steps:
    - load the splits out.final.filtered.dev.tsv and out.final.filtered.train.tsv
    - load the dataset containing both the episode descriptions and the transcript texts
    - split the dataset from the previous step into train and dev
    - tokenize the data
    - save the tokenized datasets to disk
    :param lang: language code, e.g. 'en_XX' or 'pt_XX'
    :return:
    """
    print("START preprocessing language:", lang)
    desc_transc = load_description_and_transcript_data(lang)
    print('desc_transc:', len(desc_transc))
    train, dev = load_train_dev_splits_data(lang)
    print('train/dev:', len(train), len(dev))
    desc_and_transc_train = desc_transc[desc_transc['episode_uri'].isin(train['episode_uri'].tolist())]
    desc_and_transc_dev = desc_transc[desc_transc['episode_uri'].isin(dev['episode_uri'].tolist())]
    # Load as HF dataset
    print('Loading as HF dataset...')
    podcast_train = Dataset.from_pandas(desc_and_transc_train)
    podcast_dev = Dataset.from_pandas(desc_and_transc_dev)
    # remove the column __index_level_0__ added by from_pandas
    podcast_train = podcast_train.map(lambda x: x, remove_columns=['__index_level_0__'])
    podcast_dev = podcast_dev.map(lambda x: x, remove_columns=['__index_level_0__'])
    # Tokenize and save data
    print('Processing TRAIN data...')
    tokenized_data = tokenize_data(podcast_train, lang)
    tokenized_data.save_to_disk(f'data/preprocessed/{lang}/train/')
    print('Processing TRAIN data FINISHED')
    # Tokenize and save data
    print('Processing DEV data...')
    tokenized_data = tokenize_data(podcast_dev, lang)
    tokenized_data.save_to_disk(f'data/preprocessed/{lang}/dev/')
    print('Processing DEV data FINISHED')

def read_lines(file_path):
    with open(file_path, "r") as f:
        return f.readlines()

def merge_and_shuffle():
    en_data = load_from_disk('data/preprocessed/en_XX/train/')
    pt_data = load_from_disk('data/preprocessed/pt_XX/train/')
    all_train = concatenate_datasets(dsets=[en_data, pt_data])
    all_train = all_train.shuffle()
    all_train.save_to_disk('data/preprocessed/all/train/')
    en_data = load_from_disk('data/preprocessed/en_XX/dev/')
    pt_data = load_from_disk('data/preprocessed/pt_XX/dev/')
    all_dev = concatenate_datasets(dsets=[en_data, pt_data])
    all_dev = all_dev.shuffle()
    all_dev.save_to_disk('data/preprocessed/all/dev/')

def main():
    preprocess_data('pt_XX')
    preprocess_data('en_XX')
    merge_and_shuffle()

if __name__ == "__main__":
    main()