The following is the gist of our training code:
# Fine-tuning setup: load a Llama checkpoint from a local path in bf16,
# load its matching (slow) SentencePiece tokenizer, then hand both to the
# HF Trainer together with the prepared data module and kick off training.
checkpoint_path = model_args.input_model_local_path

model = transformers.LlamaForCausalLM.from_pretrained(
    pretrained_model_name_or_path=checkpoint_path,
    cache_dir=training_args.cache_dir,
    torch_dtype=torch.bfloat16,  # bf16 weights to halve memory vs fp32
)

tokenizer = transformers.LlamaTokenizer.from_pretrained(
    pretrained_model_name_or_path=checkpoint_path,
    cache_dir=training_args.cache_dir,
    model_max_length=training_args.model_max_length,
    padding_side="right",  # pad on the right for causal-LM training
    use_fast=False,        # force the slow (Python/SentencePiece) tokenizer
)

# data_module is expected to supply train_dataset / eval_dataset /
# data_collator keyword arguments — TODO confirm against its definition.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    callbacks=[ManifoldTensorBoardLoggerCallback()],
    **data_module,
)
trainer.train()
But when running it, we got the following error:
trainer/0 [1]: trainer.train()
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/transformers/trainer.py", line 1555, in train
trainer/0 [1]: return inner_training_loop(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/transformers/trainer.py", line 1674, in _inner_training_loop
trainer/0 [1]: self.model = self.accelerator.prepare(self.model)
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/accelerate/accelerator.py", line 1270, in prepare
trainer/0 [1]: result = tuple(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/accelerate/accelerator.py", line 1271, in <genexpr>
trainer/0 [1]: self._prepare_one(obj, first_pass=True, device_placement=d) for obj, d in zip(args, device_placement)
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/accelerate/accelerator.py", line 1083, in _prepare_one
trainer/0 [1]: return self.prepare_model(obj, device_placement=device_placement)
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/accelerate/accelerator.py", line 1429, in prepare_model
trainer/0 [1]: model = FSDP(model, **kwargs)
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 471, in __init__
trainer/0 [1]: _auto_wrap(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/_wrap_utils.py", line 101, in _auto_wrap
trainer/0 [1]: _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs) # type: ignore[arg-type]
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
trainer/0 [1]: wrapped_child, num_wrapped_params = _recursive_wrap(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
trainer/0 [1]: wrapped_child, num_wrapped_params = _recursive_wrap(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/wrap.py", line 543, in _recursive_wrap
trainer/0 [1]: wrapped_child, num_wrapped_params = _recursive_wrap(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/wrap.py", line 561, in _recursive_wrap
trainer/0 [1]: return _wrap(module, wrapper_cls, **kwargs), nonwrapped_numel
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/wrap.py", line 490, in _wrap
trainer/0 [1]: return wrapper_cls(module, **kwargs)
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 497, in __init__
trainer/0 [1]: _init_param_handle_from_module(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/_init_utils.py", line 597, in _init_param_handle_from_module
trainer/0 [1]: _sync_module_params_and_buffers(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/fsdp/_init_utils.py", line 1072, in _sync_module_params_and_buffers
trainer/0 [1]: _sync_params_and_buffers(
trainer/0 [1]: File "/tmp/jetter.ye5l5sro/torch/distributed/utils.py", line 306, in _sync_params_and_buffers
trainer/0 [1]: dist._broadcast_coalesced(
trainer/0 [1]:RuntimeError: CUDA error: device kernel image is invalid
trainer/0 [1]:CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
trainer/0 [1]:For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
trainer/0 [1]:Device-side assertion tracking was not enabled by user.
trainer/0 [0]:2023-10-03 16:02:46,982 - comment_gen_llama.train - INFO - Complete tokenizer loading...
As far as I know, the CUDA driver on the hardware is version 12.0. Since "device kernel image is invalid" typically means the installed PyTorch build was compiled for a CUDA version (or GPU compute capability) that does not match the machine, could this be a PyTorch/CUDA build mismatch, and how should we resolve it?