When training a causal language model from scratch, I am getting this error:
‘’---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[258], line 3
1 from transformers import Trainer, TrainingArguments
----> 3 args = TrainingArguments(
4     output_dir="codeparrot-ds",
5     per_device_train_batch_size=32,
6     per_device_eval_batch_size=32,
7     evaluation_strategy="steps",
8     eval_steps=5_000,
9     logging_steps=5_000,
10     gradient_accumulation_steps=8,
11     num_train_epochs=1,
12     weight_decay=0.1,
13     warmup_steps=1_000,
14     lr_scheduler_type="cosine",
15     learning_rate=5e-4,
16     save_steps=5_000,
17     fp16=True,
18     push_to_hub=True,
19 )
21 trainer = Trainer(
22     model=model,
23     tokenizer=tokenizer,
(…)
27     eval_dataset=tokenized_datasets["valid"],
28 )
File <string>:125, in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, dataloader_prefetch_factor, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, accelerator_config, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, eval_do_concat_batches, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, 
torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha, optim_target_modules)
File ~\AppData\Local\Programs\Python\Python311\Lib\site-packages\transformers\training_args.py:1612, in TrainingArguments.__post_init__(self)
1600         raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")
1602 if (
1603     self.framework == "pt"
1604     and is_torch_available()
(…)
1610     and (self.fp16 or self.fp16_full_eval)
1611 ):
-> 1612     raise ValueError(
1613         "FP16 Mixed precision training with AMP or APEX (--fp16) and FP16 half precision evaluation"
1614         " (--fp16_full_eval) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX)."
1615     )
1617 if (
1618     self.framework == "pt"
1619     and is_torch_available()
(…)
1627     and (self.bf16 or self.bf16_full_eval)
1628 ):
1629     raise ValueError(
1630         "BF16 Mixed precision training with AMP (--bf16) and BF16 half precision evaluation"
1631         " (--bf16_full_eval) can only be used on CUDA, XPU (with IPEX), NPU, MLU or CPU/TPU/NeuronCore devices."
1632     )
ValueError: FP16 Mixed precision training with AMP or APEX (--fp16) and FP16 half precision evaluation (--fp16_full_eval) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX).
‘’