I want to deploy Qwen2-VL-2B-Instruct to a SageMaker endpoint. I know the model needs additional dependencies such as qwen-vl-utils, and it also requires the latest version of transformers (4.45.2) to work properly. Since this version of transformers is not available in the DLC, I added a requirements.txt and an inference.py inside a code/ folder in the model.tar.gz (the packaging is sketched after the file listings below). Here is the content of the files:
requirements.txt:
transformers==4.45.2
accelerate==0.34.2
bitsandbytes==0.43.3
qwen-vl-utils==0.0.4
inference.py:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def model_fn(model_dir):
    # Load the model and processor once at container startup
    model = Qwen2VLForConditionalGeneration.from_pretrained(
        model_dir, torch_dtype=torch.bfloat16
    ).to(device)
    processor = AutoProcessor.from_pretrained(model_dir)
    return model, processor

def predict_fn(data, model_and_processor):
    model, processor = model_and_processor
    # Build the chat prompt and extract the image/video inputs from the messages
    text = processor.apply_chat_template(data, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(data)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(device)
    with torch.no_grad():
        generated_ids = model.generate(**inputs, max_new_tokens=496)
    # Strip the prompt tokens so only the generated continuation is decoded
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    return {"response": output_text[0]}
Deployment code:

import json

from sagemaker.huggingface import HuggingFaceModel

image_uri = "763104351884.dkr.ecr.eu-central-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.4-tgi2.3-gpu-py311-cu124-ubuntu22.04"

config = {
    "HF_MODEL_ID": "Qwen/Qwen2-VL-2B-Instruct",
    "SM_NUM_GPUS": json.dumps(1),
    "MAX_INPUT_LENGTH": json.dumps(1024),
    "MAX_TOTAL_TOKENS": json.dumps(2048),
}

huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,
    role=execution_role_arn,
    image_uri=image_uri,
    sagemaker_session=sess,
    env=config,
)

predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g5.xlarge",
    endpoint_name="Qwen2-VL",
)
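For context, this is how I intended to invoke the endpoint once it is up (a sketch: the image URL is a placeholder, and the payload just mirrors the messages format that predict_fn and qwen_vl_utils expect):

payload = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "https://example.com/sample.jpg"},  # placeholder URL
            {"type": "text", "text": "Describe this image."},
        ],
    }
]
result = predictor.predict(payload)
print(result["response"])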
The error I get:
UnexpectedStatusException: Error hosting endpoint Qwen2-VL: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint.
What I found in the logs:
ValueError: Unsupported model type qwen2_vl
Any idea what could have gone wrong?