Hi,

for inference I created a Hugging Face model deployment on AWS SageMaker, and it works fine for summarization on most texts. But as soon as I send one particular long text, I get the error below, and afterwards no summarization task works anymore; even requests with short input texts keep failing. I have to create a new endpoint to get it working again.

What could I do to prevent this error? And is there a way to make the endpoint recover without a new deployment? One idea I had is to clip the input client-side before invoking the endpoint; see the sketch right after this message.

Your help is highly appreciated.

Kindest regards, Phil
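Since facebook/bart-large-cnn accepts at most 1024 input tokens, my idea is to truncate the text with the model's own tokenizer before it ever reaches the endpoint. A minimal sketch of that idea (the helper name is mine, and I have not verified that this actually prevents the CUDA error):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

def clip_for_bart(text: str, max_tokens: int = 1024) -> str:
    # BART's encoder has 1024 positions, so clip the input client-side
    # before sending it to the endpoint
    ids = tokenizer(text, truncation=True, max_length=max_tokens)["input_ids"]
    return tokenizer.decode(ids, skip_special_tokens=True)

Would that be the right approach, or is there a server-side setting I should use instead?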
Here is how I create the model:

import sagemaker
from sagemaker.huggingface import HuggingFaceModel

role = sagemaker.get_execution_role()

model_id = 'facebook/bart-large-cnn'

# Hub model configuration
hub = {
    'HF_MODEL_ID': model_id,
    'HF_TASK': 'summarization'
}

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    transformers_version="4.26.0",  # transformers version used
    pytorch_version="1.13.1",       # pytorch version used
    py_version='py39',              # python version used
    env=hub,
    role=role,
)
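The endpoint itself is then created with deploy(), roughly like this (the instance type shown here is just an example of a single-GPU instance, not necessarily my exact configuration):

# deploy the model to a real-time endpoint (instance type is an example)
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
)

# quick smoke test with a short input
print(predictor.predict({"inputs": "A short text to summarize."}))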
This is the traceback I get when I send the long text:

Traceback (most recent call last):
File "EU10_004_ACCOUNT/venv/lib/python3.11/site-packages/langchain/llms/sagemaker_endpoint.py", line 234, in _call
response = self.client.invoke_endpoint(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/botocore/client.py", line 530, in _api_call
return self._make_api_call(operation_name, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/botocore/client.py", line 964, in _make_api_call
raise error_class(parsed_response, operation_name)
botocore.errorfactory.ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasLtMatmul( ltHandle, computeDesc.descriptor(), &alpha_val, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), &beta_val, result_ptr, Cdesc.descriptor(), result_ptr, Cdesc.descriptor(), &heuristicResult.algo, workspace.data_ptr(), workspaceSize, at::cuda::getCurrentCUDAStream())`"
}
"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/chain/sagemaker_endpoint/run.py", line 93, in <module>
print(run(docs, content_handler=content_handler))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/chain/sagemaker_endpoint/run.py", line 75, in run
return chain.run(docs)
^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/chains/base.py", line 236, in run
return self(args[0], callbacks=callbacks)[self.output_keys[0]]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/chains/base.py", line 140, in __call__
raise e
File "/venv/lib/python3.11/site-packages/langchain/chains/base.py", line 134, in __call__
self._call(inputs, run_manager=run_manager)
File "/venv/lib/python3.11/site-packages/langchain/chains/combine_documents/base.py", line 84, in _call
output, extra_return_dict = self.combine_docs(
^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/chains/combine_documents/map_reduce.py", line 144, in combine_docs
results = self.llm_chain.apply(
^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/chains/llm.py", line 157, in apply
raise e
File "/venv/lib/python3.11/site-packages/langchain/chains/llm.py", line 154, in apply
response = self.generate(input_list, run_manager=run_manager)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/chains/llm.py", line 79, in generate
return self.llm.generate_prompt(
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/llms/base.py", line 134, in generate_prompt
return self.generate(prompt_strings, stop=stop, callbacks=callbacks)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/venv/lib/python3.11/site-packages/langchain/llms/base.py", line 191, in generate
raise e
File "/venv/lib/python3.11/site-packages/langchain/llms/base.py", line 185, in generate
self._generate(prompts, stop=stop, run_manager=run_manager)
File "/venv/lib/python3.11/site-packages/langchain/llms/base.py", line 405, in _generate
self._call(prompt, stop=stop, run_manager=run_manager)
File "/venv/lib/python3.11/site-packages/langchain/llms/sagemaker_endpoint.py", line 242, in _call
raise ValueError(f"Error raised by inference endpoint: {e}")
ValueError: Error raised by inference endpoint: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (400) from primary with message "{
  "code": 400,
  "type": "InternalServerException",
  "message": "CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasLtMatmul( ltHandle, computeDesc.descriptor(), &alpha_val, mat1_ptr, Adesc.descriptor(), mat2_ptr, Bdesc.descriptor(), &beta_val, result_ptr, Cdesc.descriptor(), result_ptr, Cdesc.descriptor(), &heuristicResult.algo, workspace.data_ptr(), workspaceSize, at::cuda::getCurrentCUDAStream())`"
}
"
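For reference, the client side that produces this traceback looks roughly like this; the endpoint name, region, and document contents are placeholders for my actual values:

import json

from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document
from langchain.llms.sagemaker_endpoint import SagemakerEndpoint, LLMContentHandler


class ContentHandler(LLMContentHandler):
    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompt: str, model_kwargs: dict) -> bytes:
        # payload format expected by the Hugging Face inference toolkit
        return json.dumps({"inputs": prompt, **model_kwargs}).encode("utf-8")

    def transform_output(self, output: bytes) -> str:
        return json.loads(output.read().decode("utf-8"))[0]["summary_text"]


llm = SagemakerEndpoint(
    endpoint_name="my-bart-endpoint",  # placeholder for my endpoint name
    region_name="eu-central-1",        # placeholder for my region
    content_handler=ContentHandler(),
)

docs = [Document(page_content="...")]  # loaded from my source texts

# map_reduce summarization over the documents, as in the traceback
chain = load_summarize_chain(llm, chain_type="map_reduce")
print(chain.run(docs))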