HTTP 502 Bad Gateway for url

Hello,

I am having trouble running tokenization with the XLSum tokenizer.
It seems to be an issue on the Hugging Face side, as the file named in the error message changes every time I run my script.

The error message:

requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum/resolve/main/tokenizer_config.json
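
Since the failing file changes on every run, I suspect these are transient Hub errors rather than something specific in my script. As a temporary workaround I am trying to wrap the tokenizer load in a simple retry loop. This is only a sketch, and the retry count and sleep time are arbitrary values I picked:

import time
import requests
from transformers import AutoTokenizer

def load_tokenizer_with_retries(name, retries=5, wait_seconds=30):
    # retry on transient HTTP errors from the Hub (retry/sleep values are arbitrary)
    last_error = None
    for attempt in range(retries):
        try:
            return AutoTokenizer.from_pretrained(name)
        except requests.exceptions.HTTPError as e:
            last_error = e
            print(f"Attempt {attempt + 1}/{retries} failed: {e}")
            time.sleep(wait_seconds)
    raise RuntimeError(f"Could not load tokenizer '{name}'") from last_error

The idea is just to ride out short outages; if the 502s persist, retrying obviously will not help.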

Here is my code:

from datasets import Dataset, concatenate_datasets, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from sklearn.model_selection import train_test_split
import pandas as pd
import glob
import os
import re

# PARAMETERS
encoder_max_length = 512
decoder_max_length = 150
batch_size = 100
model_name = "csebuetnlp/mT5_multilingual_XLSum"
num_processes_to_tokenize = 12
WHITESPACE_HANDLER = lambda k: re.sub(r'\s+', ' ', re.sub(r'\n+', ' ', k.strip()))


def load_description_and_transcript_data(lang):
    # concat all CSVs and load them as one pandas dataframe
    all_files = glob.glob('data/{}/description_and_transcript*.csv'.format(lang))
    li = []

    for filename in all_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        li.append(df)

    df = pd.concat(li, axis=0, ignore_index=True)

    # keep only 3 columns: episode_uri, episode_description_cleaned, transcript_text
    df = df[['episode_uri', 'episode_description_cleaned', 'transcript_text']]
    df = df.dropna()

    # clean text (source https://huggingface.co/csebuetnlp/mT5_multilingual_XLSum)
    df["transcript_text"] = df["transcript_text"].apply(WHITESPACE_HANDLER)
    df["episode_description_cleaned"] = df["episode_description_cleaned"].apply(WHITESPACE_HANDLER)

    return df


def load_train_dev_splits_data(lang):
    dev = pd.read_csv('data/{}/out.final.filtered.dev.tsv'.format(lang), index_col=None, header=0, sep='\t')
    train = pd.read_csv('data/{}/out.final.filtered.train.tsv'.format(lang), index_col=None, header=0, sep='\t')

    return train, dev


# Copied from https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb
def process_data_to_model_inputs(batch, lang):
    # load tokenizer (note: this runs again for every batch, in each worker process)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    inputs = tokenizer(
        batch["transcript_text"],
        # return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length
    )

    outputs = tokenizer(
        batch["episode_description_cleaned"],
        # return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids

    return batch


def tokenize_data(hf_dataset, lang):
    processed_data = hf_dataset.map(
        lambda batch: process_data_to_model_inputs(batch, lang),
        num_proc=num_processes_to_tokenize,
        batched=True,
        batch_size=batch_size,
        remove_columns=["transcript_text", "episode_description_cleaned"],
    )

    # convert the Python lists to PyTorch tensors
    processed_data.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

    return processed_data


def preprocess_data(lang):
    """
    Steps:
    - load splits out.final.filtered.dev.tsv and out.final.filtered.train.tsv
    - load the dataset containing both the episode description and the transcript text
    - split the dataset from the previous step into train and dev
    - tokenize the data
    - save tokenized
    :param lang: language code, e.g. 'en_XX' or 'pt_XX'
    :return: None; the tokenized splits are saved to disk
    """
    print("START preprocessing language:", lang)

    desc_transc = load_description_and_transcript_data(lang)
    print('desc_transc:', len(desc_transc))

    train, dev = load_train_dev_splits_data(lang)
    print('train/dev:', len(train), len(dev))

    desc_and_transc_train = desc_transc[desc_transc['episode_uri'].isin(train['episode_uri'].tolist())]
    desc_and_transc_dev = desc_transc[desc_transc['episode_uri'].isin(dev['episode_uri'].tolist())]

    # Load as HF dataset
    print('Loading as HF dataset...')
    podcast_train = Dataset.from_pandas(desc_and_transc_train)
    podcast_dev = Dataset.from_pandas(desc_and_transc_dev)

    # remove column __index_level_0__ added by Dataset.from_pandas
    podcast_train = podcast_train.remove_columns(['__index_level_0__'])
    podcast_dev = podcast_dev.remove_columns(['__index_level_0__'])

    # Tokenize and save data
    print('Processing TRAIN data...')
    tokenized_data = tokenize_data(podcast_train, lang)
    tokenized_data.save_to_disk(f'data/preprocessed/{lang}/train/')
    print('Processing TRAIN data FINISHED')

    # Tokenize and save data
    print('Processing DEV data...')
    tokenized_data = tokenize_data(podcast_dev, lang)
    tokenized_data.save_to_disk(f'data/preprocessed/{lang}/dev/')
    print('Processing DEV data FINISHED')


def read_lines(file_path):
    with open(file_path, "r") as f:
        return f.readlines()


def merge_and_shuffle():
    en_data = load_from_disk('data/preprocessed/en_XX/train/')
    pt_data = load_from_disk('data/preprocessed/pt_XX/train/')
    all_train = concatenate_datasets(dsets=[en_data, pt_data])
    all_train = all_train.shuffle()
    all_train.save_to_disk('data/preprocessed/all/train/')

    en_data = load_from_disk('data/preprocessed/en_XX/dev/')
    pt_data = load_from_disk('data/preprocessed/pt_XX/dev/')
    all_dev = concatenate_datasets(dsets=[en_data, pt_data])
    all_dev = all_dev.shuffle()
    all_dev.save_to_disk('data/preprocessed/all/dev/')


def main():
    preprocess_data('pt_XX')
    preprocess_data('en_XX')
    merge_and_shuffle()


main()
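
One more thing I noticed while looking at this: process_data_to_model_inputs calls AutoTokenizer.from_pretrained for every batch, and map runs with num_proc=12, so a single run fires a lot of requests at huggingface.co. Assuming that is what triggers (or at least aggravates) the 502s, I am thinking of downloading the tokenizer once up front and having the workers load it from local files only. A rough sketch of what I mean; the local directory path is just a placeholder I made up:

from transformers import AutoTokenizer

MODEL_NAME = "csebuetnlp/mT5_multilingual_XLSum"
LOCAL_TOKENIZER_DIR = "data/tokenizer_cache/mT5_multilingual_XLSum"  # placeholder path

def download_tokenizer_once():
    # one request to the Hub, done before any multiprocessing starts
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.save_pretrained(LOCAL_TOKENIZER_DIR)

def load_local_tokenizer():
    # workers read from disk only, so no network calls during map()
    return AutoTokenizer.from_pretrained(LOCAL_TOKENIZER_DIR, local_files_only=True)

Then process_data_to_model_inputs would call load_local_tokenizer() instead of going to the Hub each time. Does that sound reasonable, or is the 502 purely on the server side?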

I have been getting the same error since yesterday. Here is mine:

requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: https://huggingface.co/api/models/roberta-base

I am facing the same error too. Has anyone found a solution?