Need help to reduce CLIP image embedding time

I am working on an image clustering API using CLIP. Users can optionally enter labels, and the model tries to match each image to one of them (zero-shot image classification). Images that don't match any label are clustered based on how similar they are to each other. I'm working with Next.js and AWS Lambda.

On the Next.js side, I generate a zip file of images and send it to my /api route, where I compute the image and text embeddings with CLIP via Transformers.js. I then send those embeddings to my AWS Lambda function (written in Python) to perform the clustering: images that match a label are grouped under that label, and the rest are clustered by mutual similarity.
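
For context, the client-side upload looks roughly like this (a minimal sketch, not my exact code; the endpoint name and helper are illustrative):

import JSZip from 'jszip';

// Sketch: zip the selected images, base64-encode the archive, and POST it to the /api route shown below.
async function uploadImages(files: File[], labels: string) {
  const zip = new JSZip();
  for (const file of files) {
    zip.file(file.name, await file.arrayBuffer());
  }
  // Base64 so the archive can travel inside a JSON body
  const base64String = await zip.generateAsync({ type: 'base64' });
  const res = await fetch('/api/runPythonScript', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({ base64String, labels }),
  });
  return res.json();
}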

This is the /api route that extracts the images and computes the embeddings:

import { NextApiHandler, NextApiRequest, NextApiResponse } from "next";
import fs from "fs";
import path from "path";
import os from "os";
import { AutoProcessor, AutoTokenizer, CLIPVisionModelWithProjection, RawImage, CLIPTextModelWithProjection } from '@xenova/transformers';
import JSZip from 'jszip';
import axios from "axios";
export const config = {
  maxDuration: 300,
  api: {
    bodyParser: {
      sizeLimit: '50mb' // Set desired value here
    }
  }
};

const runPythonScript: NextApiHandler = async (req: NextApiRequest, res: NextApiResponse) => {
  if (req.method === "POST") {
    const { base64String, labels } = req.body;
    // buffer data is a zip file containing a folder containing images
    // extract the images from the zip file and embed them using the vision model
    const objectOfImages = {}
    const objectOfTexts = {};
    try {
      // Extract images from the zip file
      const buffer = Buffer.from(base64String, 'base64');
      // fs.writeFileSync('images.zip', buffer);
      const directory = path.join(os.tmpdir(), 'images');
      const zip = new JSZip();
      await zip.loadAsync(buffer);
      const tokenizer = await AutoTokenizer.from_pretrained('Xenova/clip-vit-base-patch32');
      const processor = await AutoProcessor.from_pretrained('Xenova/clip-vit-base-patch32');
      const vision_model = await CLIPVisionModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch32');
      const textModel = await CLIPTextModelWithProjection.from_pretrained('Xenova/clip-vit-base-patch32');
      
      // if labels exist, run the tokenizer and text model
      if (labels) {
        const labelsArray = labels.split(',').map(label => label.trim());
        for (const label of labelsArray) {
          const text_inputs = await tokenizer(label, { padding: true, truncation: true });
          const { text_embeds } = await textModel(text_inputs);
          objectOfTexts[label] = text_embeds;
        }
      }
      
      await Promise.all(Object.entries(zip.files).map(async ([relativePath, file]) => {
        if (!file.dir) {
          // Extract the image to a temp file
          const filePath = path.join(directory, relativePath);
          await fs.promises.mkdir(path.dirname(filePath), { recursive: true });
          const content = await file.async('nodebuffer');
          await fs.promises.writeFile(filePath, content);
          // Read the image, preprocess it, and compute its embedding
          const image = await RawImage.read(filePath);
          const image_inputs = await processor(image);
          const { image_embeds } = await vision_model(image_inputs);
          objectOfImages[relativePath] = image_embeds;
        }
      }));
      console.log('Text Embeddings:', objectOfTexts);
      console.log('Image Embeddings:', objectOfImages);
    }
    catch (error) {
      console.error("Error occurred while extracting images from zip file:", error);
      return res.status(500).json({ message: "Error occurred while extracting images from zip file" });
    }
    try {
      const AWSLambdaEndpoint = 'myAWSLambdaEndpoint';
      const response = await axios.post(AWSLambdaEndpoint, { objectOfImageEmbeddings: objectOfImages, objectOfTextEmbeddings: objectOfTexts});
      console.log('Response from the Lambda function:', response.data);
      return res.status(200).json({ message: "Successfully ran the vision model", data: response.data});
    }
    catch (error) {
      console.error("Error occurred while communicating with the AWS Lambda endpoint:", error);
      return res.status(500).json({ message: "Error occurred while communicating with the AWS Lambda endpoint" });
    }
  }
  return res.status(405).json({ message: "Method not allowed" });
};

export default runPythonScript;

I know this isn't strictly relevant, but I figured I would provide it for context. This is the AWS Lambda Python script. Thanks to LexCybermac/smlr on GitHub (a simple image clustering script using CLIP and hierarchical clustering) for the clustering implementation:

import json
import numpy as np
from annoy import AnnoyIndex
from scipy.cluster.hierarchy import linkage, fcluster
from collections import defaultdict

def build_annoy_index(all_embeddings):
    embeddings = np.array(all_embeddings)
    n_dimensions = embeddings.shape[1]

    annoy_index = AnnoyIndex(n_dimensions, "angular")
    for i, embedding in enumerate(embeddings):
        annoy_index.add_item(i, embedding)

    annoy_index.build(100)
    return annoy_index

# Compute the distance matrix of the embeddings using the Annoy index
def compute_distance_matrix(all_embeddings, annoy_index):
    n = len(all_embeddings)
    distances = []

    for i in range(n):
        for j in range(i + 1, n):
            distance = annoy_index.get_distance(i, j)
            distances.append(distance)

    return distances

# Apply hierarchical clustering on the computed distance matrix with the given threshold
def apply_clustering(distances, threshold):
    condensed_distances = np.array(distances)
    Z = linkage(condensed_distances, method='average', optimal_ordering=True)
    return fcluster(Z, t=threshold, criterion='distance')

# Build clusters of image ids based on the clustering labels
def build_image_clusters(all_image_ids, labels):
    image_id_clusters = defaultdict(set)

    for image_id, cluster_label in zip(all_image_ids, labels):
        image_id_clusters[cluster_label].add(image_id)

    return image_id_clusters

def lambda_handler(event, context):
    try:
        body = json.loads(event['body'])
        objectOfImageEmbeddings = body.get('objectOfImageEmbeddings')
        objectOfTextEmbeddings = body.get('objectOfTextEmbeddings')
        similarity_threshold = 0.27

        if not objectOfImageEmbeddings or not objectOfTextEmbeddings:
            return {
                'statusCode': 400,
                'body': json.dumps({'error': 'Invalid request payload'})
            }

        # Convert image embeddings to NumPy arrays
        image_embeddings = []
        image_names = []
        for image_name, tensor in objectOfImageEmbeddings.items():
            data = tensor.get('data')
            if isinstance(data, dict):
                image_embeddings.append(np.array(list(data.values()), dtype=np.float32))
                image_names.append(image_name)
            else:
                print(f"Unsupported data structure for tensor: {tensor}")

        # Convert text embeddings to NumPy arrays
        text_embeddings = []
        text_labels = []
        for text_label, tensor in objectOfTextEmbeddings.items():
            data = tensor.get('data')
            if isinstance(data, dict):
                text_embeddings.append(np.array(list(data.values()), dtype=np.float32))
                text_labels.append(text_label)
            else:
                print(f"Unsupported data structure for tensor: {tensor}")

        # Calculate cosine similarity
        cos_scores = []
        for i, image_embedding in enumerate(image_embeddings):
            image_scores = []
            for j, text_embedding in enumerate(text_embeddings):
                dot_product = np.dot(image_embedding, text_embedding)
                norm_image = np.linalg.norm(image_embedding)
                norm_text = np.linalg.norm(text_embedding)
                cos_sim = dot_product / (norm_image * norm_text)
                image_scores.append({
                    'score': cos_sim,
                    'image_name': image_names[i],
                    'text_label': text_labels[j]
                })
            cos_scores.append(image_scores)

        # Create a list of image-text pairs with similarity scores above the threshold, convert float32 to float
        labeled_clusters = []
        unlabeled_images = []
        unlabeled_image_names = []  # initialized here so the final response doesn't fail when every image matches a label
        unlabeled_clusters = []
        for i, image_scores in enumerate(cos_scores):
            max_score = max(score['score'] for score in image_scores)
            if max_score > similarity_threshold:
                for j, score in enumerate(image_scores):
                    if score['score'] == max_score:
                        labeled_clusters.append({
                            'image_name': score['image_name'],
                            'text_label': score['text_label'],
                            'score': float(score['score'])
                        })
            else:
                unlabeled_images.append({
                    'image_name': image_names[i],
                    'image_embedding': image_embeddings[i].tolist()
                })

        # Cluster unlabeled images based on cosine similarity
        if unlabeled_images:
            # Get the embeddings of the unlabeled images
            unlabeled_image_embeddings = [image['image_embedding'] for image in unlabeled_images]
            unlabeled_image_names = [image['image_name'] for image in unlabeled_images]
            annoy_index = build_annoy_index(unlabeled_image_embeddings)
            distances = compute_distance_matrix(unlabeled_image_embeddings, annoy_index)
            labels = apply_clustering(distances, 0.55)
            image_id_clusters = build_image_clusters(unlabeled_image_names, labels.tolist())
            for cluster_label, image_ids in image_id_clusters.items():
                unlabeled_clusters.append({
                    'cluster_label': cluster_label,
                    'image_ids': list(image_ids)
                })

        return {
            'statusCode': 200,
            'body': json.dumps({
                'message': 'Cosine similarity calculation successful',
                'labeled_clusters': labeled_clusters,
                'unlabeled_images': unlabeled_image_names,
                'unlabeled_clusters': unlabeled_clusters,
            })
        }
    except Exception as e:
        print(f"Error occurred during cosine similarity calculation: {e}")
        return {
            'statusCode': 500,
            'body': json.dumps({'error': str(e)})
        }

The actual implementation seems to be working well for my goals here. However, the request takes very long to return a response: with around 21 images, it takes about 30 seconds. I narrowed it down to the image embedding step being the part that takes too long.
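
Timing the per-image steps looks something like this (a sketch wrapped around the extraction loop, using the same processor / vision_model names as in the route above):

// Sketch: rough per-image timing to see where the time goes (same variables as in the /api route).
const t0 = Date.now();
const image = await RawImage.read(filePath);
console.log(`read: ${Date.now() - t0} ms`);

const t1 = Date.now();
const image_inputs = await processor(image);
console.log(`preprocess: ${Date.now() - t1} ms`);

const t2 = Date.now();
const { image_embeds } = await vision_model(image_inputs);
console.log(`embed: ${Date.now() - t2} ms`); // this per-image cost adds up across the ~21 images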

On AWS Lambda, the reported DurationInMS is less than a second, so I'm not sure why the request takes that long. I also tried returning objectOfImages and objectOfTexts directly, without calling the Lambda endpoint at all, and it still took about 30-40 seconds. I changed the implementation to use the base-32 model instead, which reduced the embedding time by about 10 seconds; that is still not ideal, and the model isn't accurate enough for my liking, so I switched back to the base-16 model. I've been scratching my head at this forever. I was initially using Transformers in Python (on a server) to do the image embeddings and thought that might be why it was so slow, so I switched to Transformers.js to embed the images instead, but it's still quite slow (it helps keep server costs low, too).
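
The model swap I'm referring to is just a different checkpoint id passed to from_pretrained, roughly like this (a sketch; both ids are published by Xenova on the Hugging Face hub):

// Sketch: switching CLIP checkpoints is only a matter of the model id.
const MODEL_ID = 'Xenova/clip-vit-base-patch32'; // faster, but not accurate enough for my drawings
// const MODEL_ID = 'Xenova/clip-vit-base-patch16'; // more accurate, but adds roughly 10 seconds for my 21 images

const tokenizer = await AutoTokenizer.from_pretrained(MODEL_ID);
const processor = await AutoProcessor.from_pretrained(MODEL_ID);
const vision_model = await CLIPVisionModelWithProjection.from_pretrained(MODEL_ID);
const textModel = await CLIPTextModelWithProjection.from_pretrained(MODEL_ID);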

Any ideas as to why this is happening? Any help would be greatly appreciated. I can tell that my implementation works as intended; it's just very slow. I apologize if I didn't provide enough information or if my question is dumb; I'm completely new to ML and AI implementations. For context, this is what gets returned when I call this function for the 21 images:

{
    "message": "Successfully ran the vision model",
    "data": {
        "message": "Cosine similarity calculation successful",
        "labeled_clusters": [
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/aae31c6b-f2a8-476a-b638-aab9830e1573.png",
                "text_label": "a drawing of a ladder",
                "score": 0.29822564125061035
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/45b58cda-cfdb-4256-9158-26342111e58b.png",
                "text_label": "a drawing of a smile",
                "score": 0.2904837727546692
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/8fc819a2-f5e6-49b5-a627-8c11a2ffc563.png",
                "text_label": "a drawing of a smile",
                "score": 0.29255419969558716
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/f5390270-88d4-4991-8551-5c29b51ab80e.png",
                "text_label": "a drawing of a smile",
                "score": 0.2797013521194458
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/9da5db0b-16ba-4424-94fb-6f2fdd643a1e.png",
                "text_label": "a drawing of a smile",
                "score": 0.28653350472450256
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/5b9cfb98-b2c6-4327-82b1-54f43066e4ee.png",
                "text_label": "a drawing of a ladder",
                "score": 0.29258492588996887
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/82c8f665-dcd8-4923-808b-7374625177bb.png",
                "text_label": "a drawing of a ladder",
                "score": 0.275624543428421
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/b6ac7b3f-39d9-45d1-81cd-ea8be156835c.png",
                "text_label": "a drawing of a ladder",
                "score": 0.3456617593765259
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/7c36d7fe-d41c-4d12-8c04-5f5f4d3d8401.png",
                "text_label": "a drawing of a smile",
                "score": 0.2944153845310211
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/05dbe3f1-915c-4dfe-9b1d-036269ceb137.png",
                "text_label": "a drawing of a smile",
                "score": 0.30991873145103455
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/2ee95ec4-2e28-44b7-9697-f9580ffd025e.png",
                "text_label": "a drawing of a smile",
                "score": 0.2918935716152191
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/b0d2d76e-bab2-48bd-89ad-a6dbcce43796.png",
                "text_label": "a drawing of a ladder",
                "score": 0.3389083445072174
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/c5d52f93-80ee-4667-99ed-3d918aa02acd.png",
                "text_label": "a drawing of a ladder",
                "score": 0.3246758282184601
            },
            {
                "image_name": "Cluster Snakes, Ladders, Smile, Sad/4677548a-bf98-4a51-986b-9d8ac70db78d.png",
                "text_label": "a drawing of a ladder",
                "score": 0.31894147396087646
            }
        ],
        "unlabeled_images": [
            "Cluster Snakes, Ladders, Smile, Sad/4b50d855-53af-426e-8b63-14edbc820bc5.png",
            "Cluster Snakes, Ladders, Smile, Sad/89fc5ae9-6202-442a-8b69-c6157c2f8b28.png",
            "Cluster Snakes, Ladders, Smile, Sad/80bf1bb9-8e94-4142-bc72-1b1aded7b307.png",
            "Cluster Snakes, Ladders, Smile, Sad/a2fa5264-46e0-4b63-9b77-c1ef4ec261cb.png",
            "Cluster Snakes, Ladders, Smile, Sad/874c1481-41cf-454e-973a-2fea482e9ebf.png",
            "Cluster Snakes, Ladders, Smile, Sad/96d0ddb6-8a85-4d44-af47-9e5d1e7e5b50.png",
            "Cluster Snakes, Ladders, Smile, Sad/409a0bfa-43ae-4e12-8857-0e2c97eea67c.png"
        ],
        "unlabeled_clusters": [
            {
                "cluster_label": 3,
                "image_ids": [
                    "Cluster Snakes, Ladders, Smile, Sad/4b50d855-53af-426e-8b63-14edbc820bc5.png"
                ]
            },
            {
                "cluster_label": 1,
                "image_ids": [
                    "Cluster Snakes, Ladders, Smile, Sad/89fc5ae9-6202-442a-8b69-c6157c2f8b28.png",
                    "Cluster Snakes, Ladders, Smile, Sad/96d0ddb6-8a85-4d44-af47-9e5d1e7e5b50.png"
                ]
            },
            {
                "cluster_label": 2,
                "image_ids": [
                    "Cluster Snakes, Ladders, Smile, Sad/409a0bfa-43ae-4e12-8857-0e2c97eea67c.png",
                    "Cluster Snakes, Ladders, Smile, Sad/a2fa5264-46e0-4b63-9b77-c1ef4ec261cb.png",
                    "Cluster Snakes, Ladders, Smile, Sad/80bf1bb9-8e94-4142-bc72-1b1aded7b307.png",
                    "Cluster Snakes, Ladders, Smile, Sad/874c1481-41cf-454e-973a-2fea482e9ebf.png"
                ]
            }
        ]
    }
}

Screenshot of Response Time: