Hi everyone,
I’ve been trying to do grid search for hyper parameter tuning with the new trainer API and ray tune.
I have to say there is always something messing up during that time.
Has anyone successfully done grid search using Trainer API and BERT?
tune_config = {
"per_device_train_batch_size": 32,
"per_device_eval_batch_size": 32,
"num_train_epochs": tune.choice([2, 3, 4, 5]),
"weight_decay": tune.uniform(0.0, 0.3),
"num_epochs": tune.choice([2, 3, 4, 5]),
#"max_steps": 1 if smoke_test else -1, # Used for smoke test.
}
training_args = TrainingArguments("test", eval_steps=500, disable_tqdm=True)
trainer = Trainer(
args=training_args,
tokenizer=tokenizer,
train_dataset=tokenized_datasets_train,
eval_dataset=tokenized_datasets_val,
model_init=model_init,
compute_metrics=compute_metrics,
)
trainer.hyperparameter_search(
direction="maximize",
backend="ray",
hp_space=lambda _: tune_config)
I’ve used ray tune but it tends to give errors like these:
(pid=991) 2021-08-30 12:30:53,900 ERROR function_runner.py:266 -- Runner Thread raised error.
(pid=991) Traceback (most recent call last):
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
(pid=991) self._entrypoint()
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
(pid=991) self._status_reporter.get_checkpoint())
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
(pid=991) output = fn()
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py", line 344, in inner
(pid=991) trainable(config, **fn_kwargs)
(pid=991) File "/usr/local/lib/python3.7/dist-packages/transformers/integrations.py", line 162, in _objective
(pid=991) local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
(pid=991) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 1031, in train
(pid=991) self._hp_search_setup(trial)
(pid=991) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 860, in _hp_search_setup
(pid=991) f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`."
(pid=991) AttributeError: Trying to set num_epochs in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
(pid=991) Exception in thread Thread-2:
(pid=991) Traceback (most recent call last):
(pid=991) File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
(pid=991) self.run()
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 279, in run
(pid=991) raise e
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
(pid=991) self._entrypoint()
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
(pid=991) self._status_reporter.get_checkpoint())
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
(pid=991) output = fn()
(pid=991) File "/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py", line 344, in inner
(pid=991) trainable(config, **fn_kwargs)
(pid=991) File "/usr/local/lib/python3.7/dist-packages/transformers/integrations.py", line 162, in _objective
(pid=991) local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
(pid=991) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 1031, in train
(pid=991) self._hp_search_setup(trial)
(pid=991) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 860, in _hp_search_setup
(pid=991) f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`."
(pid=991) AttributeError: Trying to set num_epochs in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
(pid=991)
2021-08-30 12:30:54,010 ERROR trial_runner.py:773 -- Trial _objective_19f77_00000: Error processing event.
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 739, in _process_trial
results = self.trial_executor.fetch_result(trial)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 746, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 82, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1621, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): ray::ImplicitFunc.train_buffered() (pid=991, ip=172.28.0.2, repr=<ray.tune.function_runner.ImplicitFunc object at 0x7f208feca750>)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 178, in train_buffered
result = self.train()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 237, in train
result = self.step()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 379, in step
self._report_thread_runner_error(block=True)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 527, in _report_thread_runner_error
("Trial raised an exception. Traceback:\n{}".format(err_tb_str)
ray.tune.error.TuneError: Trial raised an exception. Traceback:
ray::ImplicitFunc.train_buffered() (pid=991, ip=172.28.0.2, repr=<ray.tune.function_runner.ImplicitFunc object at 0x7f208feca750>)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
self._entrypoint()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
self._status_reporter.get_checkpoint())
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
output = fn()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py", line 344, in inner
trainable(config, **fn_kwargs)
File "/usr/local/lib/python3.7/dist-packages/transformers/integrations.py", line 162, in _objective
local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 1031, in train
self._hp_search_setup(trial)
File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 860, in _hp_search_setup
f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`."
AttributeError: Trying to set num_epochs in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Result for _objective_19f77_00000:
{}
== Status ==
Memory usage on this node: 5.0/25.5 GiB
Using FIFO scheduling algorithm.
Resources requested: 1.0/4 CPUs, 1.0/1 GPUs, 0.0/15.0 GiB heap, 0.0/7.5 GiB objects (0.0/1.0 accelerator_type:T4)
Result logdir: /root/ray_results/_objective_2021-08-30_12-30-46
Number of trials: 18/20 (1 ERROR, 16 PENDING, 1 RUNNING)
+------------------------+----------+-------+--------------+--------------------+----------------+
| Trial name | status | loc | num_epochs | num_train_epochs | weight_decay |
|------------------------+----------+-------+--------------+--------------------+----------------|
| _objective_19f77_00001 | RUNNING | | 2 | 4 | 0.233907 |
| _objective_19f77_00002 | PENDING | | 4 | 4 | 0.13375 |
| _objective_19f77_00003 | PENDING | | 2 | 4 | 0.137775 |
| _objective_19f77_00004 | PENDING | | 4 | 5 | 0.04286 |
| _objective_19f77_00005 | PENDING | | 5 | 3 | 0.0169235 |
| _objective_19f77_00006 | PENDING | | 3 | 5 | 0.281566 |
| _objective_19f77_00007 | PENDING | | 2 | 5 | 0.297663 |
| _objective_19f77_00008 | PENDING | | 2 | 5 | 0.183496 |
| _objective_19f77_00009 | PENDING | | 4 | 5 | 0.00691873 |
| _objective_19f77_00010 | PENDING | | 5 | 4 | 0.119958 |
| _objective_19f77_00011 | PENDING | | 4 | 5 | 0.292127 |
| _objective_19f77_00012 | PENDING | | 3 | 3 | 0.0271819 |
| _objective_19f77_00013 | PENDING | | 5 | 4 | 0.114739 |
| _objective_19f77_00014 | PENDING | | 2 | 5 | 0.140029 |
| _objective_19f77_00015 | PENDING | | 2 | 4 | 0.204092 |
| _objective_19f77_00016 | PENDING | | 2 | 4 | 0.00397949 |
| _objective_19f77_00017 | PENDING | | 3 | 5 | 0.168986 |
| _objective_19f77_00000 | ERROR | | 4 | 4 | 0.238963 |
+------------------------+----------+-------+--------------+--------------------+----------------+
Number of errored trials: 1
+------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name | # failures | error file |
|------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------|
| _objective_19f77_00000 | 1 | /root/ray_results/_objective_2021-08-30_12-30-46/_objective_19f77_00000_0_num_epochs=4,num_train_epochs=4,weight_decay=0.23896_2021-08-30_12-30-46/error.txt |
+------------------------+--------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------+
(pid=989) 2021-08-30 12:31:01,040 ERROR function_runner.py:266 -- Runner Thread raised error.
(pid=989) Traceback (most recent call last):
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
(pid=989) self._entrypoint()
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
(pid=989) self._status_reporter.get_checkpoint())
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
(pid=989) output = fn()
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py", line 344, in inner
(pid=989) trainable(config, **fn_kwargs)
(pid=989) File "/usr/local/lib/python3.7/dist-packages/transformers/integrations.py", line 162, in _objective
(pid=989) local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
(pid=989) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 1031, in train
(pid=989) self._hp_search_setup(trial)
(pid=989) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 860, in _hp_search_setup
(pid=989) f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`."
(pid=989) AttributeError: Trying to set num_epochs in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
(pid=989) Exception in thread Thread-2:
(pid=989) Traceback (most recent call last):
(pid=989) File "/usr/lib/python3.7/threading.py", line 926, in _bootstrap_inner
(pid=989) self.run()
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 279, in run
(pid=989) raise e
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
(pid=989) self._entrypoint()
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
(pid=989) self._status_reporter.get_checkpoint())
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
(pid=989) output = fn()
(pid=989) File "/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py", line 344, in inner
(pid=989) trainable(config, **fn_kwargs)
(pid=989) File "/usr/local/lib/python3.7/dist-packages/transformers/integrations.py", line 162, in _objective
(pid=989) local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
(pid=989) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 1031, in train
(pid=989) self._hp_search_setup(trial)
(pid=989) File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 860, in _hp_search_setup
(pid=989) f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`."
(pid=989) AttributeError: Trying to set num_epochs in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
(pid=989)
2021-08-30 12:31:01,211 ERROR trial_runner.py:773 -- Trial _objective_19f77_00001: Error processing event.
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trial_runner.py", line 739, in _process_trial
results = self.trial_executor.fetch_result(trial)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/ray_trial_executor.py", line 746, in fetch_result
result = ray.get(trial_future[0], timeout=DEFAULT_GET_TIMEOUT)
File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 82, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.7/dist-packages/ray/worker.py", line 1621, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TuneError): ray::ImplicitFunc.train_buffered() (pid=989, ip=172.28.0.2, repr=<ray.tune.function_runner.ImplicitFunc object at 0x7f4ad6a4cc50>)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 178, in train_buffered
result = self.train()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/trainable.py", line 237, in train
result = self.step()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 379, in step
self._report_thread_runner_error(block=True)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 527, in _report_thread_runner_error
("Trial raised an exception. Traceback:\n{}".format(err_tb_str)
ray.tune.error.TuneError: Trial raised an exception. Traceback:
ray::ImplicitFunc.train_buffered() (pid=989, ip=172.28.0.2, repr=<ray.tune.function_runner.ImplicitFunc object at 0x7f4ad6a4cc50>)
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 260, in run
self._entrypoint()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 329, in entrypoint
self._status_reporter.get_checkpoint())
File "/usr/local/lib/python3.7/dist-packages/ray/tune/function_runner.py", line 594, in _trainable_func
output = fn()
File "/usr/local/lib/python3.7/dist-packages/ray/tune/utils/trainable.py", line 344, in inner
trainable(config, **fn_kwargs)
File "/usr/local/lib/python3.7/dist-packages/transformers/integrations.py", line 162, in _objective
local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 1031, in train
self._hp_search_setup(trial)
File "/usr/local/lib/python3.7/dist-packages/transformers/trainer.py", line 860, in _hp_search_setup
f"Trying to set {key} in the hyperparameter search but there is no corresponding field in `TrainingArguments`."
AttributeError: Trying to set num_epochs in the hyperparameter search but there is no corresponding field in `TrainingArguments`.
Result for _objective_19f77_00001:
{}
Let me know, please!