I’m looking at accelerate’s find_executable_batch_size code:
if len(params) < (len(args) + 1):
    arg_str = ", ".join([f"{arg}={value}" for arg, value in zip(params[1:], args[1:])])
    raise TypeError(
        f"Batch size was passed into `{function.__name__}` as the first argument when called."
        f"Remove this as the decorator already does so: `{function.__name__}({arg_str})`"
    )
while True:
    if batch_size == 0:
        raise RuntimeError("No executable batch size found, reached zero.")
    try:
        return function(batch_size, *args, **kwargs)
    except Exception as e:
        if should_reduce_batch_size(e):
            gc.collect()
            if not is_xpu_available():
                torch.cuda.empty_cache()
            else:
                torch.xpu.empty_cache()
            batch_size //= 2
        else:
            raise
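
To make the control flow concrete, here is a minimal, self-contained sketch of the same retry-and-halve mechanism (a simplified stand-in, not the real accelerate implementation: find_executable_batch_size_sketch and train are made-up names, and the OOM is simulated with a plain RuntimeError):

# Minimal sketch of the retry/halve mechanism (simplified stand-in for accelerate's
# find_executable_batch_size; the OOM here is simulated, not a real CUDA OOM).
def find_executable_batch_size_sketch(function, starting_batch_size):
    def decorator(*args, **kwargs):
        batch_size = starting_batch_size
        while True:
            if batch_size == 0:
                raise RuntimeError("No executable batch size found, reached zero.")
            try:
                # The wrapped function is re-entered from its first line on every
                # retry; no state from the failed attempt is carried over here.
                return function(batch_size, *args, **kwargs)
            except RuntimeError as e:
                if "out of memory" in str(e):
                    batch_size //= 2
                else:
                    raise
    return decorator

attempts = []

def train(batch_size):
    attempts.append(batch_size)
    if batch_size > 16:  # pretend that anything above 16 runs out of memory
        raise RuntimeError("CUDA out of memory.")
    return f"finished with batch_size={batch_size}"

loop = find_executable_batch_size_sketch(train, starting_batch_size=64)
print(loop())    # finished with batch_size=16
print(attempts)  # [64, 32, 16]: each attempt ran train() again from the top

The real decorator additionally routes the exception through should_reduce_batch_size and empties the CUDA/XPU cache, as shown above, but the retry shape is the same: the wrapped function restarts from its first line on every attempt.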
This decorator is used in the Hugging Face Trainer, in train():
if resume_from_checkpoint is not None and not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled:
    self._load_from_checkpoint(resume_from_checkpoint)

# If model was re-initialized, put it on the right device and update self.model_wrapped
if model_reloaded:
    if self.place_model_on_device:
        self._move_model_to_device(self.model, args.device)
    self.model_wrapped = self.model

inner_training_loop = find_executable_batch_size(
    self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
)
return inner_training_loop(
    args=args,
    resume_from_checkpoint=resume_from_checkpoint,
    trial=trial,
    ignore_keys_for_eval=ignore_keys_for_eval,
)
def _inner_training_loop(
It seems that if a given batch size fails, the decorator halves it and calls _inner_training_loop again from the top with the new value, so a brand-new dataloader is instantiated.
Looking at the dataloader construction in get_train_dataloader(), nothing is passed in to indicate that training should pick up from the samples that were already consumed before the failure (see the sketch after the dataloader code below):
"collate_fn": data_collator,
"num_workers": self.args.dataloader_num_workers,
"pin_memory": self.args.dataloader_pin_memory,
}
if not isinstance(train_dataset, torch.utils.data.IterableDataset):
dataloader_params["sampler"] = self._get_train_sampler()
dataloader_params["drop_last"] = self.args.dataloader_drop_last
dataloader_params["worker_init_fn"] = seed_worker
return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
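
To illustrate that in isolation: a freshly constructed DataLoader simply starts a new pass over the dataset; by itself it has no notion of skipping what a previous loader already yielded. A minimal sketch in plain PyTorch, outside the Trainer (the dataset and batch sizes here are made up):

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(8))

# First attempt: batch size 4, but we "fail" after the first batch.
first_loader = DataLoader(dataset, batch_size=4, shuffle=False)
print(next(iter(first_loader))[0].tolist())  # [0, 1, 2, 3]

# Retry with the halved batch size: a brand-new DataLoader starts from the
# beginning of the dataset again, so the first samples are revisited.
second_loader = DataLoader(dataset, batch_size=2, shuffle=False)
print(next(iter(second_loader))[0].tolist())  # [0, 1]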
But the training output seems to indicate that training is continuing from where it left off.
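
One way to check this empirically (a hedged sketch with a hypothetical IndexLoggingDataset wrapper, not anything provided by the Trainer) is to record which sample indices actually get fetched and see whether the ones consumed before the failure show up again after the batch size drops:

from torch.utils.data import Dataset

class IndexLoggingDataset(Dataset):
    """Wraps a map-style dataset and records every index that gets fetched."""

    def __init__(self, dataset):
        self.dataset = dataset
        self.seen = []

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        self.seen.append(idx)
        return self.dataset[idx]

# Hypothetical usage with the Trainer (requires dataloader_num_workers=0 so that
# __getitem__ runs in the main process and self.seen is actually populated):
# wrapped = IndexLoggingDataset(train_dataset)
# trainer = Trainer(model=model, args=training_args, train_dataset=wrapped, ...)
# trainer.train()
# print(wrapped.seen)  # repeated indices after the drop => the loop restarted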