load_dataset for Natural Questions gets stuck at checksum (Windows)

The following code for loading the Natural Questions dataset gets stuck after computing the checksums.

Code:
from datasets import load_dataset
dataset = load_dataset("natural_questions", beam_runner='DirectRunner', cache_dir='./cache', ignore_verifications=True)

Here is the traceback after I interrupted the kernel:


KeyboardInterrupt Traceback (most recent call last)
File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1417, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:623, in apache_beam.runners.common.SimpleInvoker.invoke_process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1571, in apache_beam.runners.common._OutputHandler.handle_process_outputs()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\filebasedsource.py:394, in _ReadRange.process(self, element, *args, **kwargs)
392 source = source_list[0].source
→ 394 for record in source.read(range.new_tracker()):
395 if self._with_filename:

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:224, in _TextSource.read_records(self, file_name, range_tracker)
223 while range_tracker.try_claim(next_record_start_position):
→ 224 record, num_bytes_to_next_record = self._read_record(file_to_read,
225 read_buffer)
226 # For compressed text files that use an unsplittable OffsetRangeTracker
227 # with infinity as the end position, above ‘try_claim()’ invocation
228 # would pass for an empty record at the end of file that is not
229 # followed by a new line character. Since such a record is at the last
230 # position of a file, it should not be a part of the considered range.
231 # We do this check to ignore such records.

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:369, in _TextSource._read_record(self, file_to_read, read_buffer)
368 record_start_position_in_buffer = read_buffer.position
→ 369 sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
370 read_buffer.position = sep_bounds[1] if sep_bounds else len(
371 read_buffer.data)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:290, in _TextSource._find_separator_bounds(self, file_to_read, read_buffer)
287 if current_pos >= len(read_buffer.data) - delimiter_len + 1:
288 # Ensuring that there are enough bytes to determine
289 # at current_pos.
→ 290 if not self._try_to_ensure_num_bytes_in_buffer(
291 file_to_read, read_buffer, current_pos + delimiter_len):
292 return

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:340, in _TextSource._try_to_ensure_num_bytes_in_buffer(self, file_to_read, read_buffer, num_bytes)
338 return False
→ 340 read_buffer.data += read_data
342 return True

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:76, in _TextSource.ReadBuffer.data(self, value)
74 @data.setter
75 def data(self, value):
—> 76 assert isinstance(value, bytes)
77 self._data = value

KeyboardInterrupt:

During handling of the above exception, another exception occurred:

RuntimeError Traceback (most recent call last)
File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\datasets\builder.py:811, in DatasetBuilder.download_and_prepare..incomplete_dir(dirname)
810 try:
→ 811 yield tmp_dir
812 if os.path.isdir(dirname):

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\datasets\builder.py:860, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
859 prepare_split_kwargs[“num_proc”] = num_proc
→ 860 self._download_and_prepare(
861 dl_manager=dl_manager,
862 verify_infos=verify_infos,
863 **prepare_split_kwargs,
864 **download_and_prepare_kwargs,
865 )
866 # Sync info

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\datasets\builder.py:1971, in BeamBasedBuilder._download_and_prepare(self, dl_manager, verify_infos, **prepare_splits_kwargs)
1970 # Run pipeline
→ 1971 pipeline_results = pipeline.run()
1972 pipeline_results.wait_until_finish()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\pipeline.py:577, in Pipeline.run(self, test_runner_api)
576 shutil.rmtree(tmpdir)
→ 577 return self.runner.run_pipeline(self, self._options)
578 finally:

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\direct\direct_runner.py:131, in SwitchingDirectRunner.run_pipeline(self, pipeline, options)
129 runner = BundleBasedDirectRunner()
→ 131 return runner.run_pipeline(pipeline, options)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\fn_runner.py:201, in FnApiRunner.run_pipeline(self, pipeline, options)
198 self._profiler_factory = Profile.factory_from_options(
199 options.view_as(pipeline_options.ProfilingOptions))
→ 201 self._latest_run_result = self.run_via_runner_api(
202 pipeline.to_runner_api(default_environment=self._default_environment),
203 options)
204 return self._latest_run_result

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\fn_runner.py:222, in FnApiRunner.run_via_runner_api(self, pipeline_proto, options)
221 stage_context, stages = self.create_stages(pipeline_proto)
→ 222 return self.run_stages(stage_context, stages)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\fn_runner.py:453, in FnApiRunner.run_stages(self, stage_context, stages)
452 bundle_counter += 1
→ 453 bundle_results = self._execute_bundle(
454 runner_execution_context, bundle_context_manager, bundle_input)
456 if consuming_stage_name in monitoring_infos_by_stage:

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\fn_runner.py:781, in FnApiRunner._execute_bundle(self, runner_execution_context, bundle_context_manager, bundle_input)
778 bundle_manager = self._get_bundle_manager(bundle_context_manager)
780 last_result, deferred_inputs, newly_set_timers, watermark_updates = (
→ 781 self._run_bundle(
782 runner_execution_context,
783 bundle_context_manager,
784 bundle_input,
785 bundle_context_manager.stage_data_outputs,
786 bundle_context_manager.stage_timer_outputs,
787 bundle_manager))
789 for pc_name, watermark in watermark_updates.items():

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\fn_runner.py:1010, in FnApiRunner._run_bundle(self, runner_execution_context, bundle_context_manager, bundle_input, data_output, expected_timer_output, bundle_manager)
1002 self._run_bundle_multiple_times_for_testing(
1003 runner_execution_context,
1004 bundle_manager,
(…)
1007 input_timers,
1008 expected_timer_output)
→ 1010 result, splits = bundle_manager.process_bundle(
1011 data_input, data_output, input_timers, expected_timer_output)
1012 # Now we collect all the deferred inputs remaining from bundle execution.
1013 # Deferred inputs can be:
1014 # - timers
1015 # - SDK-initiated deferred applications of root elements
1016 # - Runner-initiated deferred applications of root elements

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\fn_runner.py:1346, in BundleManager.process_bundle(self, inputs, expected_outputs, fired_timers, expected_output_timers, dry_run)
1340 process_bundle_req = beam_fn_api_pb2.InstructionRequest(
1341 instruction_id=process_bundle_id,
1342 process_bundle=beam_fn_api_pb2.ProcessBundleRequest(
1343 process_bundle_descriptor_id=self.bundle_context_manager.
1344 process_bundle_descriptor.id,
1345 cache_tokens=[next(self._cache_token_generator)]))
→ 1346 result_future = self._worker_handler.control_conn.push(process_bundle_req)
1348 split_results = [] # type: List[beam_fn_api_pb2.ProcessBundleSplitResponse]

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\portability\fn_api_runner\worker_handlers.py:379, in EmbeddedWorkerHandler.push(self, request)
378 request.instruction_id = ‘control_%s’ % self._uid_counter
→ 379 response = self.worker.do_instruction(request)
380 return ControlFuture(request.instruction_id, response)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\sdk_worker.py:596, in SdkWorker.do_instruction(self, request)
594 if request_type:
595 # E.g. if register is set, this will call self.register(request.register))
→ 596 return getattr(self, request_type)(
597 getattr(request, request_type), request.instruction_id)
598 else:

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\sdk_worker.py:634, in SdkWorker.process_bundle(self, request, instruction_id)
632 with self.maybe_profile(instruction_id):
633 delayed_applications, requests_finalization = (
→ 634 bundle_processor.process_bundle(instruction_id))
635 monitoring_infos = bundle_processor.monitoring_infos()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\bundle_processor.py:1003, in BundleProcessor.process_bundle(self, instruction_id)
1002 elif isinstance(element, beam_fn_api_pb2.Elements.Data):
→ 1003 input_op_by_transform_id[element.transform_id].process_encoded(
1004 element.data)
1006 # Finish all operations.

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\bundle_processor.py:227, in DataInputOperation.process_encoded(self, encoded_windowed_values)
225 decoded_value = self.windowed_coder_impl.decode_from_stream(
226 input_stream, True)
→ 227 self.output(decoded_value)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:526, in apache_beam.runners.worker.operations.Operation.output()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:528, in apache_beam.runners.worker.operations.Operation.output()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:237, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:907, in apache_beam.runners.worker.operations.DoOperation.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:908, in apache_beam.runners.worker.operations.DoOperation.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1419, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1491, in apache_beam.runners.common.DoFnRunner._reraise_augmented()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1417, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:623, in apache_beam.runners.common.SimpleInvoker.invoke_process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1581, in apache_beam.runners.common._OutputHandler.handle_process_outputs()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1694, in apache_beam.runners.common._OutputHandler._write_value_to_tag()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:907, in apache_beam.runners.worker.operations.DoOperation.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:908, in apache_beam.runners.worker.operations.DoOperation.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1419, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1491, in apache_beam.runners.common.DoFnRunner._reraise_augmented()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1417, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:623, in apache_beam.runners.common.SimpleInvoker.invoke_process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1581, in apache_beam.runners.common._OutputHandler.handle_process_outputs()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1694, in apache_beam.runners.common._OutputHandler._write_value_to_tag()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:240, in apache_beam.runners.worker.operations.SingletonElementConsumerSet.receive()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:907, in apache_beam.runners.worker.operations.DoOperation.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\worker\operations.py:908, in apache_beam.runners.worker.operations.DoOperation.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1419, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1507, in apache_beam.runners.common.DoFnRunner._reraise_augmented()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1417, in apache_beam.runners.common.DoFnRunner.process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:623, in apache_beam.runners.common.SimpleInvoker.invoke_process()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\runners\common.py:1571, in apache_beam.runners.common._OutputHandler.handle_process_outputs()

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\filebasedsource.py:394, in _ReadRange.process(self, element, *args, **kwargs)
392 source = source_list[0].source
→ 394 for record in source.read(range.new_tracker()):
395 if self._with_filename:

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:224, in _TextSource.read_records(self, file_name, range_tracker)
223 while range_tracker.try_claim(next_record_start_position):
→ 224 record, num_bytes_to_next_record = self._read_record(file_to_read,
225 read_buffer)
226 # For compressed text files that use an unsplittable OffsetRangeTracker
227 # with infinity as the end position, above ‘try_claim()’ invocation
228 # would pass for an empty record at the end of file that is not
229 # followed by a new line character. Since such a record is at the last
230 # position of a file, it should not be a part of the considered range.
231 # We do this check to ignore such records.

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:369, in _TextSource._read_record(self, file_to_read, read_buffer)
368 record_start_position_in_buffer = read_buffer.position
→ 369 sep_bounds = self._find_separator_bounds(file_to_read, read_buffer)
370 read_buffer.position = sep_bounds[1] if sep_bounds else len(
371 read_buffer.data)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:290, in _TextSource._find_separator_bounds(self, file_to_read, read_buffer)
287 if current_pos >= len(read_buffer.data) - delimiter_len + 1:
288 # Ensuring that there are enough bytes to determine
289 # at current_pos.
→ 290 if not self._try_to_ensure_num_bytes_in_buffer(
291 file_to_read, read_buffer, current_pos + delimiter_len):
292 return

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:340, in _TextSource._try_to_ensure_num_bytes_in_buffer(self, file_to_read, read_buffer, num_bytes)
338 return False
→ 340 read_buffer.data += read_data
342 return True

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\apache_beam\io\textio.py:76, in _TextSource.ReadBuffer.data(self, value)
74 @data.setter
75 def data(self, value):
—> 76 assert isinstance(value, bytes)
77 self._data = value

RuntimeError: KeyboardInterrupt [while running ‘train/ReadAllFromText/ReadAllFiles/ReadRange’]

During handling of the above exception, another exception occurred:

PermissionError Traceback (most recent call last)
Cell In[2], line 3
1 if __name__ == "__main__":
----> 3 dataset = load_dataset(“natural_questions”, beam_runner=‘DirectRunner’, cache_dir=‘./cache’)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\datasets\load.py:1758, in load_dataset(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, revision, use_auth_token, task, streaming, num_proc, **config_kwargs)
1755 try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES
1757 # Download and prepare data
→ 1758 builder_instance.download_and_prepare(
1759 download_config=download_config,
1760 download_mode=download_mode,
1761 ignore_verifications=ignore_verifications,
1762 try_from_hf_gcs=try_from_hf_gcs,
1763 num_proc=num_proc,
1764 )
1766 # Build dataset for splits
1767 keep_in_memory = (
1768 keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
1769 )

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\datasets\builder.py:839, in DatasetBuilder.download_and_prepare(self, output_dir, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, file_format, max_shard_size, num_proc, storage_options, **download_and_prepare_kwargs)
836 self._check_manual_download(dl_manager)
838 # Create a tmp dir and rename to self._output_dir on successful exit.
→ 839 with incomplete_dir(self._output_dir) as tmp_output_dir:
840 # Temporarily assign _output_dir to tmp_data_dir to avoid having to forward
841 # it to every sub function.
842 with temporary_assignment(self, “_output_dir”, tmp_output_dir):
843
844 # Try to download the already prepared dataset files
845 downloaded_from_gcs = False

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\contextlib.py:153, in _GeneratorContextManager.__exit__(self, typ, value, traceback)
151 value = typ()
152 try:
→ 153 self.gen.throw(typ, value, traceback)
154 except StopIteration as exc:
155 # Suppress StopIteration unless it’s the same exception that
156 # was passed to throw(). This prevents a StopIteration
157 # raised inside the “with” statement from being suppressed.
158 return exc is not value

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\site-packages\datasets\builder.py:818, in DatasetBuilder.download_and_prepare..incomplete_dir(dirname)
816 finally:
817 if os.path.exists(tmp_dir):
→ 818 shutil.rmtree(tmp_dir)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\shutil.py:750, in rmtree(path, ignore_errors, onerror)
748 # can’t continue even if onerror hook returns
749 return
→ 750 return _rmtree_unsafe(path, onerror)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\shutil.py:615, in _rmtree_unsafe(path, onerror)
613 onerror(os.path.islink, fullname, sys.exc_info())
614 continue
→ 615 _rmtree_unsafe(fullname, onerror)
616 else:
617 try:

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\shutil.py:620, in _rmtree_unsafe(path, onerror)
618 os.unlink(fullname)
619 except OSError:
→ 620 onerror(os.unlink, fullname, sys.exc_info())
621 try:
622 os.rmdir(path)

File ~\AppData\Local\Continuum\miniconda3\envs\dsi\lib\shutil.py:618, in _rmtree_unsafe(path, onerror)
616 else:
617 try:
→ 618 os.unlink(fullname)
619 except OSError:
620 onerror(os.unlink, fullname, sys.exc_info())

PermissionError: [WinError 32] The process cannot access the file because it is being used by another process: ‘D:/DSI/cache/natural_questions/default/0.0.4/da8124c83e3394df62c0f9bbc6c07652bbe9288ad833053134d5f0e978bb4ee5.incomplete\beam-temp-natural_questions-train-af38e534a92411eda4f110e7c6488664\0449d810-381e-4c5b-9dc2-4c2c15a87db6.natural_questions-train’

Hi! What makes you think it is because of the checksum?

If I’m not mistaken, the error says it was interrupted during the Beam step “train/ReadAllFromText/ReadAllFiles/ReadRange”.