I am using the Hugging Face `transformers.Trainer`. I was using Accelerate with it, but could not get it to do DDP, so now I am launching with `torchrun` instead.
I used to set `torch_compile=True`. However, now I get this cryptic error message and I have no idea what is going on:
[rank1]: Traceback (most recent call last):
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 3489, in <module>
[rank1]: main()
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 3482, in main
[rank1]: globals = debugger.run(setup['file'], None, None, is_module)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 2510, in run
[rank1]: return self._exec(is_module, entry_point_fn, module_name, file, globals, locals)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/pydevd.py", line 2517, in _exec
[rank1]: globals = pydevd_runpy.run_path(file, globals, '__main__')
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 321, in run_path
[rank1]: return _run_module_code(code, init_globals, run_name,
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 135, in _run_module_code
[rank1]: _run_code(code, mod_globals, init_globals,
[rank1]: File "/homes/55/cornelius/.vscode-server/extensions/ms-python.debugpy-2024.10.0-linux-x64/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py", line 124, in _run_code
[rank1]: exec(code, run_globals)
[rank1]: File "train_model.py", line 447, in <module>
[rank1]: main()
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/main.py", line 94, in decorated_main
[rank1]: _run_hydra(
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank1]: _run_app(
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank1]: run_and_report(
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank1]: raise ex
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank1]: return func()
[rank1]: ^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
[rank1]: lambda: hydra.run(
[rank1]: ^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank1]: _ = ret.return_value
[rank1]: ^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/core/utils.py", line 260, in return_value
[rank1]: raise self._return_value
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/hydra/core/utils.py", line 186, in run_job
[rank1]: ret.return_value = task_function(task_cfg)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "train_model.py", line 436, in main
[rank1]: trainer.train()
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/transformers/trainer.py", line 1938, in train
[rank1]: return inner_training_loop(
[rank1]: ^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/transformers/trainer.py", line 2279, in _inner_training_loop
[rank1]: tr_loss_step = self.training_step(model, inputs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/transformers/trainer.py", line 3318, in training_step
[rank1]: loss = self.compute_loss(model, inputs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/transformers/trainer.py", line 3363, in compute_loss
[rank1]: outputs = model(**inputs)
[rank1]: ^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/eval_frame.py", line 451, in _fn
[rank1]: return fn(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/external_utils.py", line 36, in inner
[rank1]: return fn(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1593, in forward
[rank1]: else self._run_ddp_forward(*inputs, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/parallel/distributed.py", line 1411, in _run_ddp_forward
[rank1]: return self.module(*inputs, **kwargs) # type: ignore[index]
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
[rank1]: return self._call_impl(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
[rank1]: return forward_call(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 917, in catch_errors
[rank1]: return hijacked_callback(frame, cache_entry, hooks, frame_state)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 786, in _convert_frame
[rank1]: result = inner_convert(
[rank1]: ^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 400, in _convert_frame_assert
[rank1]: return _compile(
[rank1]: ^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/contextlib.py", line 81, in inner
[rank1]: return func(*args, **kwds)
[rank1]: ^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 703, in _compile
[rank1]: raise InternalTorchDynamoError(str(e)).with_traceback(
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 676, in _compile
[rank1]: guarded_code = compile_inner(code, one_graph, hooks, transform)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/utils.py", line 262, in time_wrapper
[rank1]: r = func(*args, **kwargs)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 535, in compile_inner
[rank1]: out_code = transform_code_object(code, transform)
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/bytecode_transformation.py", line 1036, in transform_code_object
[rank1]: transformations(instructions, code_options)
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/_dynamo/convert_frame.py", line 176, in _fn
[rank1]: torch.cuda.set_rng_state(cuda_rng_state) # type: ignore[possibly-undefined]
[rank1]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/cuda/random.py", line 74, in set_rng_state
[rank1]: _lazy_call(cb)
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/cuda/__init__.py", line 223, in _lazy_call
[rank1]: callable()
[rank1]: File "/homes/55/cornelius/anaconda3/envs/llm/lib/python3.11/site-packages/torch/cuda/random.py", line 72, in cb
[rank1]: default_generator.set_state(new_state_copy)
[rank1]: torch._dynamo.exc.InternalTorchDynamoError: CUDA error: an illegal memory access was encountered
[rank1]: CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
[rank1]: For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
[rank1]: Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
[rank1]: You can suppress this exception and fall back to eager by setting:
[rank1]: import torch._dynamo
[rank1]: torch._dynamo.config.suppress_errors = True
W0828 18:01:32.177000 139647980343424 torch/distributed/elastic/multiprocessing/api.py:851] Sending process 28078 closing signal SIGTERM
E0828 18:01:32.628000 139647980343424 torch/distributed/elastic/multiprocessing/api.py:826] failed (exitcode: 1) local_rank: 1 (pid: 28079) of binary: /homes/55/cornelius/anaconda3/envs/llm/bin/python