Accelerate version errors in Trainer

I upgraded accelerate to 0.28.0 (with ‘pip install accelerate -U’), but the same error keeps repeating. The accelerate version seems to change midway through the session. Is there a workaround?

I’m using JupyterLab running in a virtualized environment, with Python 3.11.

This is my Trainer code.

!pip install transformers[torch]
!pip install accelerate -U

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-large-v3-ko",  # change to a repo name of your choice
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=1e-5,
    warmup_steps=500,
    max_steps=4000,
    gradient_checkpointing=True,
    fp16=True,
    evaluation_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    save_steps=1000,
    eval_steps=1000,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="cer",
    greater_is_better=False,
    push_to_hub=True,
)
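In case the install is landing in a different environment than the one the notebook kernel runs in, a quick check like the following (a sketch; paths and versions will differ per setup) shows which interpreter the kernel uses and which accelerate version it can actually see:

import sys
import importlib.metadata

# Interpreter the notebook kernel is running on
print("kernel python:", sys.executable)

# accelerate version visible to that interpreter, if any
try:
    print("accelerate:", importlib.metadata.version("accelerate"))
except importlib.metadata.PackageNotFoundError:
    print("accelerate: not installed in this environment")

# '!pip' can point at a different interpreter than the kernel; installing
# through the kernel's own interpreter avoids that mismatch:
# !{sys.executable} -m pip install -U accelerate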

This is the error output.

Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 290.1/290.1 kB 7.4 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.28.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv

ImportError                               Traceback (most recent call last)
Cell In[18], line 6
      2 get_ipython().system('pip install accelerate -U')
      4 from transformers import Seq2SeqTrainingArguments
----> 6 training_args = Seq2SeqTrainingArguments(
      7     output_dir="./whisper-large-v3-ko",  # change to a repo name of your choice
      8     per_device_train_batch_size=16,
      9     gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
     10     learning_rate=1e-5,
     11     warmup_steps=500,
     12     max_steps=4000,
     13     gradient_checkpointing=True,
     14     fp16=True,
     15     evaluation_strategy="steps",
     16     per_device_eval_batch_size=8,
     17     predict_with_generate=True,
     18     generation_max_length=225,
     19     save_steps=1000,
     20     eval_steps=1000,
     21     logging_steps=25,
     22     report_to=["tensorboard"],
     23     load_best_model_at_end=True,
     24     metric_for_best_model="cer",
     25     greater_is_better=False,
     26     push_to_hub=True,
     27 )

File <string>:129, in __init__(self, output_dir, overwrite_output_dir, do_train, do_eval, do_predict, evaluation_strategy, prediction_loss_only, per_device_train_batch_size, per_device_eval_batch_size, per_gpu_train_batch_size, per_gpu_eval_batch_size, gradient_accumulation_steps, eval_accumulation_steps, eval_delay, learning_rate, weight_decay, adam_beta1, adam_beta2, adam_epsilon, max_grad_norm, num_train_epochs, max_steps, lr_scheduler_type, lr_scheduler_kwargs, warmup_ratio, warmup_steps, log_level, log_level_replica, log_on_each_node, logging_dir, logging_strategy, logging_first_step, logging_steps, logging_nan_inf_filter, save_strategy, save_steps, save_total_limit, save_safetensors, save_on_each_node, save_only_model, no_cuda, use_cpu, use_mps_device, seed, data_seed, jit_mode_eval, use_ipex, bf16, fp16, fp16_opt_level, half_precision_backend, bf16_full_eval, fp16_full_eval, tf32, local_rank, ddp_backend, tpu_num_cores, tpu_metrics_debug, debug, dataloader_drop_last, eval_steps, dataloader_num_workers, dataloader_prefetch_factor, past_index, run_name, disable_tqdm, remove_unused_columns, label_names, load_best_model_at_end, metric_for_best_model, greater_is_better, ignore_data_skip, fsdp, fsdp_min_num_params, fsdp_config, fsdp_transformer_layer_cls_to_wrap, accelerator_config, deepspeed, label_smoothing_factor, optim, optim_args, adafactor, group_by_length, length_column_name, report_to, ddp_find_unused_parameters, ddp_bucket_cap_mb, ddp_broadcast_buffers, dataloader_pin_memory, dataloader_persistent_workers, skip_memory_metrics, use_legacy_prediction_loop, push_to_hub, resume_from_checkpoint, hub_model_id, hub_strategy, hub_token, hub_private_repo, hub_always_push, gradient_checkpointing, gradient_checkpointing_kwargs, include_inputs_for_metrics, fp16_backend, push_to_hub_model_id, push_to_hub_organization, push_to_hub_token, mp_parameters, auto_find_batch_size, full_determinism, torchdynamo, ray_scope, ddp_timeout, torch_compile, torch_compile_backend, torch_compile_mode, dispatch_batches, split_batches, include_tokens_per_second, include_num_input_tokens_seen, neftune_noise_alpha, optim_target_modules, sortish_sampler, predict_with_generate, generation_max_length, generation_num_beams, generation_config)

File ~/anaconda3/lib/python3.11/site-packages/transformers/training_args.py:1552, in TrainingArguments.__post_init__(self)
   1546     if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16:
   1547         raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0")
   1549 if (
   1550     self.framework == "pt"
   1551     and is_torch_available()
-> 1552     and (self.device.type != "cuda")
   1553     and (self.device.type != "mlu")
   1554     and (self.device.type != "npu")
   1555     and (self.device.type != "xpu")
   1556     and (get_xla_device_type(self.device) not in ["GPU", "CUDA"])
   1557     and (self.fp16 or self.fp16_full_eval)
   1558 ):
   1559     raise ValueError(
   1560         "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation"
   1561         " (`--fp16_full_eval`) can only be used on CUDA or MLU devices or NPU devices or certain XPU devices (with IPEX)."
   1562     )
   1564 if (
   1565     self.framework == "pt"
   1566     and is_torch_available()
   (...)
   1574     and (self.bf16 or self.bf16_full_eval)
   1575 ):

File ~/anaconda3/lib/python3.11/site-packages/transformers/training_args.py:2034, in TrainingArguments.device(self)
   2030 """
   2031 The device used by this process.
   2032 """
   2033 requires_backends(self, ["torch"])
-> 2034 return self._setup_devices

File ~/anaconda3/lib/python3.11/site-packages/transformers/utils/generic.py:63, in cached_property.__get__(self, obj, objtype)
     61 cached = getattr(obj, attr, None)
     62 if cached is None:
---> 63     cached = self.fget(obj)
     64     setattr(obj, attr, cached)
     65 return cached

File ~/anaconda3/lib/python3.11/site-packages/transformers/training_args.py:1940, in TrainingArguments._setup_devices(self)
   1938 if not is_sagemaker_mp_enabled():
   1939     if not is_accelerate_available():
-> 1940         raise ImportError(
   1941             f"Using the `Trainer` with `PyTorch` requires `accelerate>={ACCELERATE_MIN_VERSION}`: "
   1942             "Please run `pip install transformers[torch]` or `pip install accelerate -U`"
   1943         )
   1944     AcceleratorState._reset_state(reset_partial_state=True)
   1945 self.distributed_state = None

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.21.0`: Please run `pip install transformers[torch]` or `pip install accelerate -U`

If so, you need to restart your session (kernel) so the newly installed libraries can actually be loaded.
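For example, after restarting, a fresh cell like this (a minimal sketch) confirms which versions the kernel has actually loaded before Seq2SeqTrainingArguments is created:

# Run after restarting the kernel, before any Trainer code in this session
import accelerate
import transformers

print("accelerate:", accelerate.__version__)      # needs to be >= 0.21.0
print("transformers:", transformers.__version__)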

Thank you, but if I run the accelerate installation commands at the top of the notebook and then rerun the code, I get the same problem. I’ve restarted the session several times and switched environments (Colab, Jupyter, Conda) several times too…

Hi @BBIBIBBIBI, could you share a minimal reproducer on Colab that calls is_accelerate_available? That check seems to be what is failing here.
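A minimal cell for that check could look like this (a sketch; it mirrors the availability check that training_args.py runs internally):

# Minimal reproducer for the failing check
from transformers.utils import is_accelerate_available

print(is_accelerate_available())  # the Trainer raises ImportError when this is False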

Thanks for your reply. I don’t know what caused the issue, but it’s fixed now! I rebooted a few days later, without any other fix, and it worked… Maybe there was an update. :slight_smile:
