Vanilla JS web app using a depth estimation model

I am using transformers.js to create a web app that runs Xenova/dpt-large and displays the depth map produced by the model together with the coordinates of each detected object — basically a combination of object detection and depth estimation. I wrote the code for it, but for some reason it does not load the dpt-large model, although it does load the object detection model.

Here is my javascript code:
// script.js
// NOTE: straight ASCII quotes are required — the original used typographic
// quotes (‘…’), which are a JavaScript syntax error.
// NOTE: the 'depth-estimation' task used below was added to transformers.js
// in v2.14, so the 2.6.0 pin is too old; pin a release that supports it.
import { pipeline, env } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.17.2';

// Always fetch models from the Hugging Face Hub, never from the local server.
env.allowLocalModels = false;

// DOM handles used throughout the script.
const status = document.getElementById('status');
const fileUpload = document.getElementById('file-upload');
const imageContainer = document.getElementById('image-container');

// Populated asynchronously by loadModels().
let objectDetector, depthEstimator;

/**
 * Load both pipelines once at startup.
 *
 * Bug fix: DPT depth models are not supported by the 'image-to-image' task —
 * that is why Xenova/dpt-large never loaded. Depth estimation has its own
 * task name, 'depth-estimation' (requires @xenova/transformers >= 2.14).
 */
async function loadModels() {
    status.textContent = 'Loading models…';
    objectDetector = await pipeline('object-detection', 'Xenova/detr-resnet-50');
    depthEstimator = await pipeline('depth-estimation', 'Xenova/dpt-large');
    status.textContent = 'Models are ready';
}

// Surface load failures in the UI instead of leaving a floating rejected promise.
loadModels().catch((err) => {
    status.textContent = `Failed to load models: ${err.message}`;
    console.error(err);
});

// Read the chosen file as a data URL, show it, and run both models on it.
fileUpload.addEventListener('change', function (e) {
    const file = e.target.files[0];
    if (!file) return;

    const reader = new FileReader();
    reader.onload = function (loadEvent) { // renamed to avoid shadowing the outer `e`
        imageContainer.innerHTML = '';
        const img = new Image();
        // Attach onload BEFORE assigning src: a cached/data-URL image can fire
        // onload synchronously, and width/height are only valid after decode.
        img.onload = async () => {
            await processImage(img);
        };
        img.src = loadEvent.target.result;
        imageContainer.appendChild(img);
    };
    reader.readAsDataURL(file);
});

/**
 * Run object detection and depth estimation on a decoded image and render
 * the results into the image container.
 *
 * @param {HTMLImageElement} img - fully loaded image element.
 */
async function processImage(img) {
    status.textContent = 'Detecting objects…';
    const objectDetectionOutput = await objectDetector(img);
    status.textContent = 'Estimating depth…';
    const depthEstimationOutput = await depthEstimator(img);
    status.textContent = '';

    // Each detection is { box: {xmin, ymin, xmax, ymax}, label, score }.
    objectDetectionOutput.forEach((detection) => {
        renderBox(detection, img);
    });

    // The depth pipeline returns { predicted_depth, depth } — see renderDepth.
    renderDepth(depthEstimationOutput, img);
}

/**
 * Overlay one labelled bounding box on the displayed image.
 *
 * Bug fix: transformers.js object-detection boxes are in *pixels* of the
 * input image by default (not normalized 0-1), so multiplying xmin by
 * img.width placed every box far off-screen. Scale pixel coordinates by the
 * displayed-size / natural-size ratio instead.
 *
 * @param {{box: {xmin:number, ymin:number, xmax:number, ymax:number}, label: string}} detection
 * @param {HTMLImageElement} img - the displayed image the box belongs to.
 */
function renderBox(detection, img) {
    const { box, label } = detection;
    const { xmax, xmin, ymax, ymin } = box;

    // Displayed size may differ from the model's input (natural) size.
    const scaleX = img.width / (img.naturalWidth || img.width);
    const scaleY = img.height / (img.naturalHeight || img.height);

    const boxElement = document.createElement('div');
    boxElement.className = 'bounding-box';
    boxElement.style.left = `${xmin * scaleX}px`;
    boxElement.style.top = `${ymin * scaleY}px`;
    boxElement.style.width = `${(xmax - xmin) * scaleX}px`;
    boxElement.style.height = `${(ymax - ymin) * scaleY}px`;

    const labelElement = document.createElement('span');
    labelElement.className = 'bounding-box-label';
    labelElement.textContent = label;

    boxElement.appendChild(labelElement);
    imageContainer.appendChild(boxElement);
}

/**
 * Render the estimated depth map as a grayscale canvas next to the image.
 *
 * Bug fix: the 'depth-estimation' pipeline returns an object
 * { predicted_depth: Tensor, depth: RawImage } — NOT an array — so the
 * original `depthData.forEach(...)` threw at runtime. We draw the
 * single-channel RawImage (`depth`) into an ImageData and paint it.
 *
 * @param {{depth?: {width:number, height:number, channels:number, data:Uint8Array|Uint8ClampedArray}}} depthData
 * @param {HTMLImageElement} img - source image (kept for interface compatibility).
 */
function renderDepth(depthData, img) {
    // Some pipelines hand back the RawImage directly; accept either shape.
    const depthImage = depthData?.depth ?? depthData;
    if (!depthImage || !depthImage.data) {
        console.warn('Unexpected depth output format', depthData);
        return;
    }

    const canvas = document.createElement('canvas');
    canvas.className = 'depth-map';
    canvas.width = depthImage.width;
    canvas.height = depthImage.height;

    const ctx = canvas.getContext('2d');
    const imageData = ctx.createImageData(depthImage.width, depthImage.height);
    const { data, channels } = depthImage; // depth RawImage is single-channel (0-255)
    const pixelCount = depthImage.width * depthImage.height;
    for (let i = 0; i < pixelCount; ++i) {
        const v = data[i * channels];
        imageData.data[4 * i] = v;     // R
        imageData.data[4 * i + 1] = v; // G
        imageData.data[4 * i + 2] = v; // B
        imageData.data[4 * i + 3] = 255; // opaque
    }
    ctx.putImageData(imageData, 0, 0);
    imageContainer.appendChild(canvas);
}