I am trying to use the `load_dataset` command to create a dataset from my CSV train and test files. However, when attempting to load my CSV files, I'm getting a Windows error.
My code:
from datasets import load_dataset
# load data
train_dataset = load_dataset('csv', data_files='C:/Users/WTF/Desktop/cleaned_dataset/train.csv')
The error returned:
Downloading and preparing dataset csv/default-6eb8a0ce457cdcea (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\WTF\.cache\huggingface\datasets\csv\default-6eb8a0ce457cdcea\0.0.0\2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2...
0 tables [00:00, ? tables/s]
---------------------------------------------------------------------------
UnicodeDecodeError Traceback (most recent call last)
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in incomplete_dir(dirname)
484 try:
--> 485 yield tmp_dir
486 if os.path.isdir(dirname):
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, **download_and_prepare_kwargs)
526 self._download_and_prepare(
--> 527 dl_manager=dl_manager, verify_infos=verify_infos, **download_and_prepare_kwargs
528 )
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in _download_and_prepare(self, dl_manager, verify_infos, **prepare_split_kwargs)
603 # Prepare split will record examples associated to the split
--> 604 self._prepare_split(split_generator, **prepare_split_kwargs)
605 except OSError as e:
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in _prepare_split(self, split_generator)
958 not_verbose = bool(logger.getEffectiveLevel() > WARNING)
--> 959 for key, table in utils.tqdm(generator, unit=" tables", leave=False, disable=not_verbose):
960 writer.write_table(table)
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\tqdm\std.py in __iter__(self)
1103
-> 1104 for obj in iterable:
1105 yield obj
C:\Users\WTF\.cache\huggingface\modules\datasets_modules\datasets\csv\2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2\csv.py in _generate_tables(self, files)
126 float_precision=self.config.float_precision,
--> 127 chunksize=self.config.chunksize,
128 )
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, delim_whitespace, low_memory, memory_map, float_precision)
701
--> 702 return _read(filepath_or_buffer, kwds)
703
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in _read(filepath_or_buffer, kwds)
428 # Create the parser.
--> 429 parser = TextFileReader(filepath_or_buffer, **kwds)
430
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in __init__(self, f, engine, **kwds)
894
--> 895 self._make_engine(self.engine)
896
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in _make_engine(self, engine)
1121 if engine == 'c':
-> 1122 self._engine = CParserWrapper(self.f, **self.options)
1123 else:
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\pandas\io\parsers.py in __init__(self, src, **kwds)
1852
-> 1853 self._reader = parsers.TextReader(src, **kwds)
1854 self.unnamed_cols = self._reader.unnamed_cols
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader.__cinit__()
pandas\_libs\parsers.pyx in pandas._libs.parsers.TextReader._get_header()
UnicodeDecodeError: 'utf-8' codec can't decode byte 0x91 in position 57: invalid start byte
During handling of the above exception, another exception occurred:
PermissionError Traceback (most recent call last)
<ipython-input-10-ab3af5dabfdc> in <module>()
2
3 # load data
----> 4 train_dataset = load_dataset('csv', data_files='C:/Users/WTF/Desktop/cleaned_dataset/train.csv')
5 test_dataset = load_dataset('csv', data_files='C:/Users/WTF/Desktop/cleaned_dataset/test.csv')
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\load.py in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, save_infos, script_version, **config_kwargs)
610 download_config=download_config,
611 download_mode=download_mode,
--> 612 ignore_verifications=ignore_verifications,
613 )
614
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in download_and_prepare(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, **download_and_prepare_kwargs)
532 self.info.size_in_bytes = self.info.dataset_size + self.info.download_size
533 # Save info
--> 534 self._save_info()
535
536 # Download post processing resources
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\contextlib.py in __exit__(self, type, value, traceback)
128 value = type()
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
132 # Suppress StopIteration *unless* it's the same exception that
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\site-packages\datasets\builder.py in incomplete_dir(dirname)
489 finally:
490 if os.path.exists(tmp_dir):
--> 491 shutil.rmtree(tmp_dir)
492
493 # Print is intentional: we want this to always go to stdout so user has
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\shutil.py in rmtree(path, ignore_errors, onerror)
514 # can't continue even if onerror hook returns
515 return
--> 516 return _rmtree_unsafe(path, onerror)
517
518 # Allow introspection of whether or not the hardening against symlink
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\shutil.py in _rmtree_unsafe(path, onerror)
398 os.unlink(fullname)
399 except OSError:
--> 400 onerror(os.unlink, fullname, sys.exc_info())
401 try:
402 os.rmdir(path)
C:\Users\WTF\AppData\Local\Programs\Python\Python37\lib\shutil.py in _rmtree_unsafe(path, onerror)
396 else:
397 try:
--> 398 os.unlink(fullname)
399 except OSError:
400 onerror(os.unlink, fullname, sys.exc_info())
PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: 'C:\\Users\\WTF\\.cache\\huggingface\\datasets\\csv\\default-6eb8a0ce457cdcea\\0.0.0\\2960f95a26e85d40ca41a230ac88787f715ee3003edaacb8b1f0891e9f04dda2.incomplete\\csv-train.arrow'