Good morning!
I’m experimenting with hyperparameters and am doing a fairly large search of the parameter space. I have a single Python file, and I load BERT again for every iteration of the search. When doing this, I find myself running out of GPU memory (output below).
This may be more of a Python question than a Hugging Face question, but is there a way to remove the BERT model from memory between iterations? I reuse the same variable name each time, but I believe the previous model is still in memory, given that I’m still running out of memory.
Thanks for the help!
2022-12-06 16:11:50.263539: I tensorflow/tsl/framework/bfc_allocator.cc:1104] total_region_allocated_bytes_: 11039080448 memory_limit_: 11039080448 available bytes: 0 curr_region_allocation_bytes_: 22078160896
2022-12-06 16:11:50.263552: I tensorflow/tsl/framework/bfc_allocator.cc:1110] Stats:
Limit: 11039080448
InUse: 10998232576
MaxInUse: 10998232576
NumAllocs: 3107
MaxAllocSize: 205778944
Reserved: 0
PeakReserved: 0
LargestFreeBlock: 0
2022-12-06 16:11:50.263622: W tensorflow/tsl/framework/bfc_allocator.cc:492] ****************************************************************************************************
2022-12-06 16:11:50.263665: W tensorflow/core/framework/op_kernel.cc:1818] RESOURCE_EXHAUSTED: failed to allocate memory
Traceback (most recent call last):
File "/home/jovyan/main.py", line 464, in <module>
DOE(DOE_RunNumber= 16, DOE_dense_1=192, DOE_dense_2=48, DOE_dropout=.1, DOE_learning_rate=3e-5, DOE_EPOCH=50,
File "/home/jovyan/main.py", line 411, in DOE
m_history = model_full(df_data_path, dense_1 = DOE_dense_1, dense_2=DOE_dense_2, dropout=DOE_dropout, learning_rate=DOE_learning_rate, epoch=DOE_EPOCH,
File "/home/jovyan/main.py", line 256, in model_full
history = model.fit({'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler
raise e.with_traceback(filtered_tb) from None
File "/opt/conda/envs/tf/lib/python3.9/site-packages/tensorflow/python/eager/execute.py", line 52, in quick_execute
tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
tensorflow.python.framework.errors_impl.ResourceExhaustedError: Graph execution error:
Detected at node 'model/tf_bert_model/bert/encoder/layer_._11/intermediate/Gelu/mul_1' defined at (most recent call last):
File "/home/jovyan/main.py", line 464, in <module>
DOE(DOE_RunNumber= 16, DOE_dense_1=192, DOE_dense_2=48, DOE_dropout=.1, DOE_learning_rate=3e-5, DOE_EPOCH=50,
File "/home/jovyan/main.py", line 411, in DOE
m_history = model_full(df_data_path, dense_1 = DOE_dense_1, dense_2=DOE_dense_2, dropout=DOE_dropout, learning_rate=DOE_learning_rate, epoch=DOE_EPOCH,
File "/home/jovyan/main.py", line 256, in model_full
history = model.fit({'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1409, in fit
tmp_logs = self.train_function(iterator)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1051, in train_function
return step_function(self, iterator)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1040, in step_function
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 1030, in run_step
outputs = model.train_step(data)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 889, in train_step
y_pred = self(x, training=True)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/functional.py", line 458, in call
return self._run_internal_graph(
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/functional.py", line 596, in _run_internal_graph
outputs = node.layer(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/training.py", line 490, in __call__
return super().__call__(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1090, in run_call_with_unpacked_inputs
# (and avoid unnecessary warnings).
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 1118, in call
outputs = self.bert(
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/modeling_tf_utils.py", line 1090, in run_call_with_unpacked_inputs
# (and avoid unnecessary warnings).
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 873, in call
encoder_outputs = self.encoder(
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 558, in call
for i, layer_module in enumerate(self.layer):
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 564, in call
layer_outputs = layer_module(
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 520, in call
intermediate_output = self.intermediate(hidden_states=attention_output)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/engine/base_layer.py", line 1014, in __call__
outputs = call_fn(inputs, *args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/utils/traceback_utils.py", line 92, in error_handler
return fn(*args, **kwargs)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/transformers/models/bert/modeling_tf_bert.py", line 424, in call
hidden_states = self.intermediate_act_fn(hidden_states)
File "/opt/conda/envs/tf/lib/python3.9/site-packages/keras/activations.py", line 351, in gelu
return tf.nn.gelu(x, approximate)
Node: 'model/tf_bert_model/bert/encoder/layer_._11/intermediate/Gelu/mul_1'
failed to allocate memory
[[{{node model/tf_bert_model/bert/encoder/layer_._11/intermediate/Gelu/mul_1}}]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.
[Op:__inference_train_function_31575]