Getting PermissionError: [WinError 32] when using load_dataset()

I am trying to use the load_dataset command to create a dataset from my CSV train and test files. However, when attempting to load my CSV files, I’m getting a Windows error.

My code:

from datasets import load_dataset

# load data
train_dataset = load_dataset('csv', data_files='C:/Users/WTF/Desktop/cleaned_dataset/train.csv')

The error returned:

Downloading and preparing dataset csv/default-6eb8a0ce457cdcea (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\WTF\.cache\huggingface\datasets\csv\default-6eb8a0ce457cdcea\0.0.0\2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2...




0 tables [00:00, ? tables/s]
---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in incomplete_dir(dirname)
    484                     try:
--> 485                         yield tmp_dir
    486                         if os.path.isdir(dirname):

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, **download_and_prepare_kwargs)
    526                         self._download_and_prepare(
--> 527                             dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
    528                         )

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
    603                 # Prepare split will record examples associated to the split
--> 604                 self._prepare_split(split_generator, **prepare_split_kwargs)
    605             except OSError as e:

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in _prepare_split(self, split_generator)
    958         not_verbose = bool(logger.getEffectiveLevel() > WARNING)
--> 959         for key, table in utils.tqdm(generator, unit=" tables", leave=False, disable=not_verbose):
    960             writer.write_table(table)

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\tqdm\std.py in __iter__(self)
   1103 
-> 1104         for obj in iterable:
   1105             yield obj

C:\Users\WTF\.cache\huggingface\modules\datasets_modules\datasets\csv\2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2\csv.py in _generate_tables(self, files)
    126                 float_precision=self.config.float_precision,
--> 127                 chunksize=self.config.chunksize,
    128             )

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
    701 
--> 702         return _read(filepath_or_buffer, kwds)
    703 

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
    428     # Create the parser.
--> 429     parser = TextFileReader(filepath_or_buffer, **kwds)
    430 

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
    894 
--> 895         self._make_engine(self.engine)
    896 

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
   1121         if engine == 'c':
-> 1122             self._engine = CParserWrapper(self.f, **self.options)
   1123         else:

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
   1852 
-> 1853         self._reader = parsers.TextReader(src, **kwds)
   1854         self.unnamed_cols = self._reader.unnamed_cols

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()

pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._get_header()

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x91 in position 57: invalid start byte

During handling of the above exception, another exception occurred:

PermissionError                           Traceback (most recent call last)
<ipython-input-10-ab3af5dabfdc> in <module>()
      2 
      3 # load data
----> 4 train_dataset = load_dataset('csv', data_files='C:/Users/WTF/Desktop/cleaned_dataset/train.csv')
      5 test_dataset = load_dataset('csv', data_files='C:/Users/WTF/Desktop/cleaned_dataset/test.csv')

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, save_infos, script_version, **config_kwargs)
    610         download_config=download_config,
    611         download_mode=download_mode,
--> 612         ignore_verifications=ignore_verifications,
    613     )
    614 

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, **download_and_prepare_kwargs)
    532                     self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
    533                     # Save info
--> 534                     self._save_info()
    535 
    536             # Download post processing resources

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\contextlib.py in __exit__(self, type, value, traceback)
    128                 value = type()
    129             try:
--> 130                 self.gen.throw(type, value, traceback)
    131             except StopIteration as exc:
    132                 # Suppress StopIteration *unless* it's the same exception that

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in incomplete_dir(dirname)
    489                     finally:
    490                         if os.path.exists(tmp_dir):
--> 491                             shutil.rmtree(tmp_dir)
    492 
    493             # Print is intentional: we want this to always go to stdout so user has

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\shutil.py in rmtree(path, ignore_errors, onerror)
    514             # can't continue even if onerror hook returns
    515             return
--> 516         return _rmtree_unsafe(path, onerror)
    517 
    518 # Allow introspection of whether or not the hardening against symlink

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\shutil.py in _rmtree_unsafe(path, onerror)
    398                 os.unlink(fullname)
    399             except OSError:
--> 400                 onerror(os.unlink, fullname, sys.exc_info())
    401     try:
    402         os.rmdir(path)

C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\shutil.py in _rmtree_unsafe(path, onerror)
    396         else:
    397             try:
--> 398                 os.unlink(fullname)
    399             except OSError:
    400                 onerror(os.unlink, fullname, sys.exc_info())

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\WTF\\.cache\\huggingface\\datasets\\csv\\default-6eb8a0ce457cdcea\\0.0.0\\2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2.incomplete\\csv-train.arrow'

cc @lhoestq

Hi ! It looks like the program tried to delete a directory that contains an arrow file that is still loaded by datasets somehow.

Can you close any Python instances that may be conflicting, clear the bad directory, and then try again?

Actually, I didn’t notice at first, but the permission issue is caused by another error, a UnicodeDecodeError, that is raised when generating the examples from the CSV file.

The permission error, which only happens on Windows, is just a consequence of the UnicodeDecodeError.

The UnicodeDecodeError is

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x91 in position 57: invalid start byte

It looks like your CSV file can’t be read with the utf-8 encoding. Can you try using another encoding?

train_dataset = load_dataset(
    'csv',
    data_files='C:/Users/WTF/Desktop/cleaned_dataset/train.csv',
    encoding=...
)
1 Like

Many thanks! encoding='cp1252' did the trick for me.