Hi, I am building a custom classifier for coding job descriptions. I have two text inputs that I concatenate, preprocess, and embed using a SentenceTransformer during training (in Python) and the feature-extraction pipeline during prediction (Transformers.js). At training time, I use TensorFlow to train a multilayer classifier that also takes another multi-hot encoded input. I save my classifier as an ONNX model. I have written a web app that uses my classifier, but I would love to have a pipeline that others could use like:
let job_classifier = pipeline('job-classifer','DR/job_classifer_v1')
My current (very incomplete) code looks like this:
import { env, pipeline, Pipeline } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.3.2';
import * as ort from 'https://cdn.jsdelivr.net/npm/onnxruntime-web@1.19.2/dist/ort.webgpu.bundle.min.mjs';
/**
 * Work-in-progress pipeline for classifying job descriptions.
 *
 * On first use it loads `config.json` and `abbrev.json` from `modelPath`,
 * every crosswalk listed under `config.known_crosswalks`, and a Transformers.js
 * feature-extraction pipeline used to embed the preprocessed text.
 *
 * The final classification step (running the ONNX classifier on the
 * embeddings plus the crosswalk-derived input) is not implemented yet.
 */
class JobClassificationPipeline extends Pipeline {
  /**
   * @param {string} modelPath - Base URL/path that `config.json` and
   *   `abbrev.json` are fetched from.
   * @param {{ embeddingModel?: string | null }} [options]
   * @param {string | null} [options.embeddingModel] - Model id for the
   *   feature-extraction pipeline. When omitted, `initialize()` falls back to
   *   `config.embedding_model`.
   */
  constructor(modelPath, { embeddingModel = null } = {}) {
    // Pipeline's constructor destructures an options object; passing a bare
    // string leaves `task` undefined. There is no single Transformers.js model
    // behind this pipeline, so `model` is explicitly null.
    super({ task: 'job-classification', model: null });
    this.modelPath = modelPath;
    this.embeddingModel = embeddingModel;
    // True only once every resource has finished loading.
    this.initialized = false;
    // In-flight (or completed) initialization promise, shared by all callers.
    this.initializing = null;
  }

  /**
   * Fetches `url` and parses the body as JSON.
   *
   * @param {string} url
   * @returns {Promise<any>} The parsed JSON payload.
   * @throws {Error} If the response status is not in the 2xx range.
   */
  static async fetchJson(url) {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
    }
    return response.json();
  }

  /**
   * Loads all resources exactly once. Concurrent and repeated calls share the
   * same promise; a failed attempt is forgotten so a later call can retry.
   *
   * @returns {Promise<void>}
   */
  initialize() {
    if (this.initializing === null) {
      this.initializing = this.loadResources().catch((err) => {
        this.initializing = null;
        throw err;
      });
    }
    return this.initializing;
  }

  /**
   * Performs the actual loading for `initialize()`.
   *
   * @returns {Promise<void>}
   * @throws {Error} If a fetch fails or no embedding model id is available.
   */
  async loadResources() {
    // config.json and abbrev.json are independent, so fetch them in parallel.
    const [config, abbreviations] = await Promise.all([
      JobClassificationPipeline.fetchJson(`${this.modelPath}/config.json`),
      JobClassificationPipeline.fetchJson(`${this.modelPath}/abbrev.json`),
    ]);
    this.config = config;
    this.abbreviations = abbreviations;

    const crosswalkEntries = await Promise.all(
      Object.entries(this.config.known_crosswalks ?? {}).map(async ([key, path]) => {
        console.log(`... loading ${key} crosswalk ...`);
        return [key, await JobClassificationPipeline.fetchJson(path)];
      }),
    );
    this.crosswalks = Object.fromEntries(crosswalkEntries);

    // NOTE(review): `embedding_model` is an assumed config key; rename it to
    // match whatever config.json actually uses for the encoder's model id.
    const embeddingModel = this.embeddingModel ?? this.config.embedding_model;
    if (!embeddingModel) {
      throw new Error(
        'No embedding model specified: pass { embeddingModel } to the constructor or set "embedding_model" in config.json',
      );
    }
    // 'feature-extraction' is the Transformers.js task that produces embeddings.
    this.embeddingPipeline = await pipeline('feature-extraction', embeddingModel);

    this.initialized = true;
  }

  /**
   * Entry point invoked when the pipeline instance is called as a function.
   *
   * @param {object | object[]} jobDescription - One job record or an array of
   *   them (e.g. `{ JobTitle, JobTask, soc1980 }`).
   * @returns {Promise<{ preprocessed: string[], embeddings: any }>} The
   *   preprocessed text and its embeddings. Classification is not implemented.
   */
  async _call(jobDescription) {
    await this.initialize();
    console.log(this.config);
    const preprocessed = this.preprocess(jobDescription);
    console.log(preprocessed);
    // NOTE(review): mean pooling is assumed to match the SentenceTransformer
    // used at training time; confirm it, and whether `normalize: true` is
    // also needed, against the training code.
    const embeddings = await this.embeddingPipeline(preprocessed, { pooling: 'mean' });
    this.crosswalk(jobDescription);
    // TODO: feed `embeddings` and the crosswalked codes to the ONNX classifier.
    return { preprocessed, embeddings };
  }

  /**
   * Placeholder for mapping coded inputs through the loaded crosswalks.
   * Currently it only normalizes the input to an array and logs the crosswalks.
   *
   * @param {object | object[]} jobDescription
   * @returns {void}
   */
  crosswalk(jobDescription) {
    // Not used yet; kept so the real implementation has the normalized input.
    const jobs = Array.isArray(jobDescription) ? jobDescription : [jobDescription];
    console.log(this.crosswalks);
  }

  /**
   * Turns job records into the text passed to the embedding model.
   *
   * @param {object | object[]} jobDescription
   * @returns {string[]} One string per job; currently just `JobTitle`
   *   (empty string when missing). Abbreviation expansion and concatenation
   *   with the second text field are not implemented here.
   */
  preprocess(jobDescription) {
    console.log(jobDescription);
    const jobs = Array.isArray(jobDescription) ? jobDescription : [jobDescription];
    console.log(this.abbreviations);
    return jobs.map((job) => job.JobTitle ?? '');
  }
}
// Example usage: build the pipeline from a local model directory and classify
// one job record. The call returns a promise, so attach a rejection handler
// rather than leaving it floating.
const j27 = new JobClassificationPipeline('models/s3_jan27');
j27({ JobTitle: 'doctor', JobTask: 'see patients', soc1980: '261' }).catch((err) => {
  console.error('Job classification failed:', err);
});
Before I go too far down this rabbit hole, is this the wrong approach? Should I just create an ESM that wraps the huggingface encoder and my onnx model?