I am getting the following error while saving the model.
I used this function to save the model:
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: str):
    """Collect the model's state dict, move it to CPU, and save it to disk.

    Args:
        trainer: Trainer whose ``model`` is saved; ``trainer.args.should_save``
            decides whether this call writes anything.
        output_dir: Directory passed through to ``trainer._save``.
    """
    # Fetched before the ``should_save`` check, so it runs for every caller.
    # NOTE(review): presumably required so that all ranks take part when the
    # state dict is sharded -- confirm against the training setup.
    state_dict = trainer.model.state_dict()
    if trainer.args.should_save:
        # Call ``.cpu()`` on every value before handing the dict to the saver.
        cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
        # Drop this function's reference to the original dict before saving.
        del state_dict
        trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
ERROR:torch.distributed.elastic.agent.server.api:Error waiting on exit barrier. Elapsed: 305.2644510269165 seconds
Traceback (most recent call last):
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/elastic/agent/server/api.py", line 906, in _exit_barrier
store_util.barrier(
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/elastic/utils/store.py", line 78, in barrier
synchronize(store, data, rank, world_size, key_prefix, barrier_timeout)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/elastic/utils/store.py", line 64, in synchronize
agent_data = get_all(store, rank, key_prefix, world_size)
File "/usr/local/lib/python3.8/dist-packages/torch/distributed/elastic/utils/store.py", line 34, in get_all
data = store.get(f"{prefix}{idx}")