Cannot resume trainer from checkpoint

I am trying to resume a training session from a checkpoint. I load the original model and then call trainer.train("path/to/checkpoint") with the path to the checkpoint, but it refuses to resume. The model was trained with FSDP; could that affect how the checkpoints are saved? The error I get is below.
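For context, here is roughly what the resume flow looks like (a minimal, self-contained sketch: the tiny GPT-2 model and toy dataset below are only placeholders for the actual LLaMA model and Alpaca data in train.py, and this sketch does not use FSDP, so it will not reproduce the error on its own):

import torch
import transformers

# Placeholder model/tokenizer; stands in for the real model loaded in train.py.
model_name = "sshleifer/tiny-gpt2"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = transformers.AutoModelForCausalLM.from_pretrained(model_name)

# Toy dataset so the sketch runs end to end.
enc = tokenizer(["hello world, this is a test"] * 16, padding=True, return_tensors="pt")

class ToyDataset(torch.utils.data.Dataset):
    def __len__(self):
        return enc["input_ids"].shape[0]
    def __getitem__(self, i):
        return {"input_ids": enc["input_ids"][i],
                "attention_mask": enc["attention_mask"][i],
                "labels": enc["input_ids"][i]}

training_args = transformers.TrainingArguments(
    output_dir="output",
    max_steps=10,
    save_steps=5,
    report_to=[],
)
trainer = transformers.Trainer(model=model, args=training_args, train_dataset=ToyDataset())

trainer.train()  # writes output/checkpoint-5 and output/checkpoint-10
# On a later run, passing a checkpoint path (or resume_from_checkpoint=True to
# pick the latest checkpoint in output_dir) is supposed to resume from it:
trainer.train(resume_from_checkpoint="output/checkpoint-5")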

Loading model from output/checkpoint-18000.
Traceback (most recent call last):
  File "/home/ubuntu/alpaca/stanford_alpaca/train.py", line 246, in <module>
    train()
  File "/home/ubuntu/alpaca/stanford_alpaca/train.py", line 239, in train
    trainer.train("output/checkpoint-18000")
  File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/trainer.py", line 1617, in train
    self._load_from_checkpoint(resume_from_checkpoint)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2120, in _load_from_checkpoint
    load_result = load_sharded_checkpoint(model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled())
  File "/home/ubuntu/.local/lib/python3.10/site-packages/transformers/modeling_utils.py", line 385, in load_sharded_checkpoint
    state_dict = torch.load(os.path.join(folder, shard_file), map_location="cpu")
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/serialization.py", line 809, in load
    return _load(opened_zipfile, map_location, pickle_module, **pickle_load_args)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/serialization.py", line 1172, in _load
    result = unpickler.load()
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/_utils.py", line 169, in _rebuild_tensor_v2
    tensor = _rebuild_tensor(storage, storage_offset, size, stride)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/_utils.py", line 148, in _rebuild_tensor
    return t.set_(storage._untyped_storage, storage_offset, size, stride)
RuntimeError: Trying to resize storage that is not resizable
(The same "Loading model from output/checkpoint-18000." message and an identical traceback are printed by the other ranks.)
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 123046 closing signal SIGTERM
WARNING:torch.distributed.elastic.multiprocessing.api:Sending process 123048 closing signal SIGTERM
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 1 (pid: 123047) of binary: /usr/local/bin/python3.10
Traceback (most recent call last):
  File "/home/ubuntu/.local/bin/torchrun", line 8, in <module>
    sys.exit(main())
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/ubuntu/.local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:

Same error. Any solutions?

I'm hitting the same error when loading a checkpoint directly with torch.load(checkpoint).
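For reference, this is the kind of call I mean (the shard filename below is illustrative; substitute whichever pytorch_model-*.bin shard your checkpoint directory actually contains):

import os
import torch

ckpt_dir = "output/checkpoint-18000"
# Illustrative shard name; load_sharded_checkpoint in transformers does the same
# torch.load(..., map_location="cpu") per shard, as shown in the traceback above.
shard_file = os.path.join(ckpt_dir, "pytorch_model-00001-of-00003.bin")
state_dict = torch.load(shard_file, map_location="cpu")
# ^ This is the call that fails with:
#   RuntimeError: Trying to resize storage that is not resizable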