diff --git a/TODO.md b/TODO.md
index 86e8a7fe..bdb68061 100644
--- a/TODO.md
+++ b/TODO.md
@@ -53,3 +53,5 @@ Object detection using CenterNet or NanoDet models is not working when using WAS


+
+> const mod = (a, b) => tf.sub(a, tf.mul(tf.div(a, tf.scalar(b, 'int32')), tf.scalar(b, 'int32'))); // modulus op implemented in tf
diff --git a/demo/index.js b/demo/index.js
index 053ca929..068054c9 100644
--- a/demo/index.js
+++ b/demo/index.js
@@ -327,10 +327,10 @@ async function drawResults(input) {
   `;
  ui.framesDraw++;
  ui.lastFrame = human.now();
-  // if buffered, immediate loop but limit frame rate although it's going to run slower as JS is singlethreaded
  if (ui.buffered) {
    if (isLive(input)) {
-      ui.drawThread = requestAnimationFrame(() => drawResults(input));
+      // ui.drawThread = requestAnimationFrame(() => drawResults(input));
+      ui.drawThread = setTimeout(() => drawResults(input), 25);
    } else {
      cancelAnimationFrame(ui.drawThread);
      videoPause();
diff --git a/demo/typescript/index.html b/demo/typescript/index.html
index bbea62b8..a74a14bc 100644
--- a/demo/typescript/index.html
+++ b/demo/typescript/index.html
@@ -23,7 +23,7 @@
-
+    
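> Editor's note: the TODO.md entry above sketches an integer modulus built only from tfjs primitives. A minimal usage sketch follows; the example values and the `a` tensor name are illustrative, and it assumes `tf.div` truncates when both operands are int32 (for non-negative values `tf.floorDiv` behaves the same and can be swapped in if a backend returns fractional results).

```ts
import * as tf from '../../dist/tfjs.esm.js'; // same tfjs bundle the library uses

// helper exactly as written in the TODO entry above
const mod = (a, b) => tf.sub(a, tf.mul(tf.div(a, tf.scalar(b, 'int32')), tf.scalar(b, 'int32')));

// illustrative values; assumes non-negative int32 inputs and truncating int32 division
const a = tf.tensor1d([5, 9, 14], 'int32');
mod(a, 4).print(); // expected: [1, 1, 2]
```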
diff --git a/demo/typescript/index.js b/demo/typescript/index.js index c1f9e7f9..937fd57c 100644 --- a/demo/typescript/index.js +++ b/demo/typescript/index.js @@ -6,14 +6,13 @@ // demo/typescript/index.ts import Human from "../../dist/human.esm.js"; -var config = { - modelBasePath: "../../models", - backend: "humangl", - async: true +var humanConfig = { + modelBasePath: "../../models" }; -var human = new Human(config); -human.env.perfadd = false; -var result; +var human = new Human(humanConfig); +human.env["perfadd"] = false; +human.draw.options.font = 'small-caps 24px "Lato"'; +human.draw.options.lineHeight = 24; var dom = { video: document.getElementById("video"), canvas: document.getElementById("canvas"), @@ -21,20 +20,17 @@ var dom = { fps: document.getElementById("status"), perf: document.getElementById("performance") }; +var timestamp = { detect: 0, draw: 0, tensors: 0 }; var fps = { detect: 0, draw: 0 }; var log = (...msg) => { dom.log.innerText += msg.join(" ") + "\n"; console.log(...msg); }; -var status = (msg) => { - dom.fps.innerText = msg; -}; -var perf = (msg) => { - dom.perf.innerText = "performance: " + JSON.stringify(msg).replace(/"|{|}/g, "").replace(/,/g, " | "); -}; +var status = (msg) => dom.fps.innerText = msg; +var perf = (msg) => dom.perf.innerText = "tensors:" + human.tf.memory().numTensors + " | performance: " + JSON.stringify(msg).replace(/"|{|}/g, "").replace(/,/g, " | "); async function webCam() { status("starting webcam..."); - const options = { audio: false, video: { facingMode: "user", resizeMode: "crop-and-scale", width: { ideal: document.body.clientWidth } } }; + const options = { audio: false, video: { facingMode: "user", resizeMode: "none", width: { ideal: document.body.clientWidth } } }; const stream = await navigator.mediaDevices.getUserMedia(options); const ready = new Promise((resolve) => { dom.video.onloadeddata = () => resolve(true); @@ -57,34 +53,39 @@ async function webCam() { }; } async function detectionLoop() { - const t0 = human.now(); if (!dom.video.paused) { - result = await human.detect(dom.video); + await human.detect(dom.video); + const tensors = human.tf.memory().numTensors; + if (tensors - timestamp.tensors !== 0) + log("allocated tensors:", tensors - timestamp.tensors); + timestamp.tensors = tensors; } - const t1 = human.now(); - fps.detect = 1e3 / (t1 - t0); + const now = human.now(); + fps.detect = 1e3 / (now - timestamp.detect); + timestamp.detect = now; requestAnimationFrame(detectionLoop); } async function drawLoop() { - const t0 = human.now(); if (!dom.video.paused) { - const interpolated = await human.next(result); + const interpolated = await human.next(human.result); await human.draw.canvas(dom.video, dom.canvas); await human.draw.all(dom.canvas, interpolated); perf(interpolated.performance); } - const t1 = human.now(); - fps.draw = 1e3 / (t1 - t0); - status(dom.video.paused ? "paused" : `fps: ${fps.detect.toFixed(1).padStart(5, " ")} detect / ${fps.draw.toFixed(1).padStart(5, " ")} draw`); - requestAnimationFrame(drawLoop); + const now = human.now(); + fps.draw = 1e3 / (now - timestamp.draw); + timestamp.draw = now; + status(dom.video.paused ? 
"paused" : `fps: ${fps.detect.toFixed(1).padStart(5, " ")} detect | ${fps.draw.toFixed(1).padStart(5, " ")} draw`); + setTimeout(drawLoop, 30); } async function main() { - log("human version:", human.version, "tfjs:", human.tf.version_core); + log("human version:", human.version, "tfjs version:", human.tf.version_core); log("platform:", human.env.platform, "agent:", human.env.agent); status("loading..."); await human.load(); + log("backend:", human.tf.getBackend(), "| available:", human.env.backends); + log("loaded models:" + Object.values(human.models).filter((model) => model !== null).length); status("initializing..."); - log("backend:", human.tf.getBackend(), "available:", human.env.backends); await human.warmup(); await webCam(); await detectionLoop(); diff --git a/demo/typescript/index.ts b/demo/typescript/index.ts index ca1969f1..b80b2da9 100644 --- a/demo/typescript/index.ts +++ b/demo/typescript/index.ts @@ -11,46 +11,45 @@ import Human from '../../dist/human.esm.js'; // equivalent of @vladmandic/human -const config = { +const humanConfig = { // user configuration for human, used to fine-tune behavior modelBasePath: '../../models', - backend: 'humangl', - async: true, - // face: { enabled: true, detector: { rotation: true }, iris: { enabled: false }, description: { enabled: false }, emotion: { enabled: false } }, + // backend: 'humangl', + // async: true, + // face: { enabled: false, detector: { rotation: true }, iris: { enabled: false }, description: { enabled: false }, emotion: { enabled: false } }, // body: { enabled: false }, // hand: { enabled: false }, // object: { enabled: false }, // gesture: { enabled: true }, }; -const human = new Human(config); -human.env.perfadd = false; -let result; +const human = new Human(humanConfig); // create instance of human with overrides from user configuration -const dom = { +human.env['perfadd'] = false; // is performance data showing instant or total values +human.draw.options.font = 'small-caps 24px "Lato"'; // set font used to draw labels when using draw methods +human.draw.options.lineHeight = 24; + +const dom = { // grab instances of dom objects so we dont have to look them up later video: document.getElementById('video') as HTMLVideoElement, canvas: document.getElementById('canvas') as HTMLCanvasElement, log: document.getElementById('log') as HTMLPreElement, fps: document.getElementById('status') as HTMLPreElement, perf: document.getElementById('performance') as HTMLDivElement, }; +const timestamp = { detect: 0, draw: 0, tensors: 0 }; // holds information used to calculate performance and possible memory leaks +const fps = { detect: 0, draw: 0 }; // holds calculated fps information for both detect and screen refresh -const fps = { detect: 0, draw: 0 }; - -const log = (...msg) => { +const log = (...msg) => { // helper method to output messages dom.log.innerText += msg.join(' ') + '\n'; // eslint-disable-next-line no-console console.log(...msg); }; -const status = (msg) => { - dom.fps.innerText = msg; -}; -const perf = (msg) => { - dom.perf.innerText = 'performance: ' + JSON.stringify(msg).replace(/"|{|}/g, '').replace(/,/g, ' | '); -}; +const status = (msg) => dom.fps.innerText = msg; // print status element +const perf = (msg) => dom.perf.innerText = 'tensors:' + human.tf.memory().numTensors + ' | performance: ' + JSON.stringify(msg).replace(/"|{|}/g, '').replace(/,/g, ' | '); // print performance element -async function webCam() { +async function webCam() { // initialize webcam status('starting webcam...'); - const options = { audio: 
false, video: { facingMode: 'user', resizeMode: 'crop-and-scale', width: { ideal: document.body.clientWidth } } }; + // @ts-ignore resizeMode is not yet defined in tslib + const options: MediaStreamConstraints = { audio: false, video: { facingMode: 'user', resizeMode: 'none', width: { ideal: document.body.clientWidth } } }; const stream: MediaStream = await navigator.mediaDevices.getUserMedia(options); const ready = new Promise((resolve) => { dom.video.onloadeddata = () => resolve(true); }); dom.video.srcObject = stream; @@ -63,47 +62,53 @@ async function webCam() { const settings: MediaTrackSettings | string = track.getSettings ? track.getSettings() : ''; const constraints: MediaTrackConstraints | string = track.getConstraints ? track.getConstraints() : ''; log('video:', dom.video.videoWidth, dom.video.videoHeight, track.label, { stream, track, settings, constraints, capabilities }); - dom.canvas.onclick = () => { + dom.canvas.onclick = () => { // pause when clicked on screen and resume on next click if (dom.video.paused) dom.video.play(); else dom.video.pause(); }; } -async function detectionLoop() { - const t0 = human.now(); +async function detectionLoop() { // main detection loop if (!dom.video.paused) { - result = await human.detect(dom.video); + // console.log('profiling data:', await human.profile(dom.video)); + await human.detect(dom.video); // actual detection; were not capturing output in a local variable as it can also be reached via human.result + const tensors = human.tf.memory().numTensors; // check current tensor usage for memory leaks + if (tensors - timestamp.tensors !== 0) log('allocated tensors:', tensors - timestamp.tensors); // printed on start and each time there is a tensor leak + timestamp.tensors = tensors; } - const t1 = human.now(); - fps.detect = 1000 / (t1 - t0); - requestAnimationFrame(detectionLoop); + const now = human.now(); + fps.detect = 1000 / (now - timestamp.detect); + timestamp.detect = now; + requestAnimationFrame(detectionLoop); // start new frame immediately } -async function drawLoop() { - const t0 = human.now(); +async function drawLoop() { // main screen refresh loop if (!dom.video.paused) { - const interpolated = await human.next(result); - await human.draw.canvas(dom.video, dom.canvas); - await human.draw.all(dom.canvas, interpolated); - perf(interpolated.performance); + const interpolated = await human.next(human.result); // smoothen result using last-known results + await human.draw.canvas(dom.video, dom.canvas); // draw canvas to screen + await human.draw.all(dom.canvas, interpolated); // draw labels, boxes, lines, etc. + perf(interpolated.performance); // write performance data } - const t1 = human.now(); - fps.draw = 1000 / (t1 - t0); - status(dom.video.paused ? 'paused' : `fps: ${fps.detect.toFixed(1).padStart(5, ' ')} detect / ${fps.draw.toFixed(1).padStart(5, ' ')} draw`); - requestAnimationFrame(drawLoop); + const now = human.now(); + fps.draw = 1000 / (now - timestamp.draw); + timestamp.draw = now; + status(dom.video.paused ? 
'paused' : `fps: ${fps.detect.toFixed(1).padStart(5, ' ')} detect | ${fps.draw.toFixed(1).padStart(5, ' ')} draw`); // write status + // requestAnimationFrame(drawLoop); // refresh at screen refresh rate + setTimeout(drawLoop, 30); // use to slow down refresh from max refresh rate to target of 30 fps } -async function main() { - log('human version:', human.version, 'tfjs:', human.tf.version_core); +async function main() { // main entry point + log('human version:', human.version, 'tfjs version:', human.tf.version_core); log('platform:', human.env.platform, 'agent:', human.env.agent); status('loading...'); - await human.load(); + await human.load(); // preload all models + log('backend:', human.tf.getBackend(), '| available:', human.env.backends); + log('loaded models:' + Object.values(human.models).filter((model) => model !== null).length); status('initializing...'); - log('backend:', human.tf.getBackend(), 'available:', human.env.backends); - await human.warmup(); - await webCam(); - await detectionLoop(); - await drawLoop(); + await human.warmup(); // warmup function to initialize backend for future faster detection + await webCam(); // start webcam + await detectionLoop(); // start detection loop + await drawLoop(); // start draw loop } window.onload = main; diff --git a/src/config.ts b/src/config.ts index 811c55be..502b1ae8 100644 --- a/src/config.ts +++ b/src/config.ts @@ -339,7 +339,7 @@ const config: Config = { enabled: true, rotation: true, skipFrames: 99, - skipTime: 2000, + skipTime: 1000, minConfidence: 0.50, iouThreshold: 0.2, maxDetected: -1, @@ -358,7 +358,7 @@ const config: Config = { iouThreshold: 0.4, maxDetected: 10, skipFrames: 99, - skipTime: 1000, + skipTime: 2000, }, segmentation: { enabled: false, diff --git a/src/face/blazeface.ts b/src/face/blazeface.ts index 41e377b1..e843e9fa 100644 --- a/src/face/blazeface.ts +++ b/src/face/blazeface.ts @@ -9,6 +9,7 @@ import * as util from './facemeshutil'; import type { Config } from '../config'; import type { Tensor, GraphModel } from '../tfjs/types'; import { env } from '../util/env'; +import type { Point } from '../result'; const keypointsCount = 6; let model: GraphModel | null; @@ -34,63 +35,72 @@ export async function load(config: Config): Promise { } function decodeBounds(boxOutputs) { - const boxStarts = tf.slice(boxOutputs, [0, 1], [-1, 2]); - const centers = tf.add(boxStarts, anchors); - const boxSizes = tf.slice(boxOutputs, [0, 3], [-1, 2]); - const boxSizesNormalized = tf.div(boxSizes, inputSize); - const centersNormalized = tf.div(centers, inputSize); - const halfBoxSize = tf.div(boxSizesNormalized, 2); - const starts = tf.sub(centersNormalized, halfBoxSize); - const ends = tf.add(centersNormalized, halfBoxSize); - const startNormalized = tf.mul(starts, inputSize); - const endNormalized = tf.mul(ends, inputSize); - const concatAxis = 1; - return tf.concat2d([startNormalized, endNormalized], concatAxis); + const t: Record = {}; + t.boxStarts = tf.slice(boxOutputs, [0, 1], [-1, 2]); + t.centers = tf.add(t.boxStarts, anchors); + t.boxSizes = tf.slice(boxOutputs, [0, 3], [-1, 2]); + t.boxSizesNormalized = tf.div(t.boxSizes, inputSize); + t.centersNormalized = tf.div(t.centers, inputSize); + t.halfBoxSize = tf.div(t.boxSizesNormalized, 2); + t.starts = tf.sub(t.centersNormalized, t.halfBoxSize); + t.ends = tf.add(t.centersNormalized, t.halfBoxSize); + t.startNormalized = tf.mul(t.starts, inputSize); + t.endNormalized = tf.mul(t.ends, inputSize); + const boxes = tf.concat2d([t.startNormalized, t.endNormalized], 1); + 
Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); + return boxes; } export async function getBoxes(inputImage: Tensor, config: Config) { // sanity check on input if ((!inputImage) || (inputImage['isDisposedInternal']) || (inputImage.shape.length !== 4) || (inputImage.shape[1] < 1) || (inputImage.shape[2] < 1)) return { boxes: [] }; - const [batch, boxes, scores] = tf.tidy(() => { - const resizedImage = tf.image.resizeBilinear(inputImage, [inputSize, inputSize]); - const normalizedImage = tf.sub(tf.div(resizedImage, 127.5), 0.5); - const res = model?.execute(normalizedImage); - let batchOut; - if (Array.isArray(res)) { // are we using tfhub or pinto converted model? - const sorted = res.sort((a, b) => a.size - b.size); - const concat384 = tf.concat([sorted[0], sorted[2]], 2); // dim: 384, 1 + 16 - const concat512 = tf.concat([sorted[1], sorted[3]], 2); // dim: 512, 1 + 16 - const concat = tf.concat([concat512, concat384], 1); - batchOut = tf.squeeze(concat, 0); - } else { - batchOut = tf.squeeze(res); // when using tfhub model - } - const boxesOut = decodeBounds(batchOut); - const logits = tf.slice(batchOut, [0, 0], [-1, 1]); - const scoresOut = tf.squeeze(tf.sigmoid(logits)); // inside tf.tidy - return [batchOut, boxesOut, scoresOut]; - }); + const t: Record = {}; - const nmsTensor = await tf.image.nonMaxSuppressionAsync(boxes, scores, (config.face.detector?.maxDetected || 0), (config.face.detector?.iouThreshold || 0), (config.face.detector?.minConfidence || 0)); - const nms = await nmsTensor.array(); - tf.dispose(nmsTensor); - const annotatedBoxes: Array<{ box: { startPoint: Tensor, endPoint: Tensor }, landmarks: Tensor, anchor: [number, number] | undefined, confidence: number }> = []; - const scoresData = await scores.data(); + t.resized = tf.image.resizeBilinear(inputImage, [inputSize, inputSize]); + t.div = tf.div(t.resized, 127.5); + t.normalized = tf.sub(t.div, 0.5); + const res = model?.execute(t.normalized) as Tensor[]; + if (Array.isArray(res)) { // are we using tfhub or pinto converted model? 
+ const sorted = res.sort((a, b) => a.size - b.size); + t.concat384 = tf.concat([sorted[0], sorted[2]], 2); // dim: 384, 1 + 16 + t.concat512 = tf.concat([sorted[1], sorted[3]], 2); // dim: 512, 1 + 16 + t.concat = tf.concat([t.concat512, t.concat384], 1); + t.batch = tf.squeeze(t.concat, 0); + } else { + t.batch = tf.squeeze(res); // when using tfhub model + } + tf.dispose(res); + t.boxes = decodeBounds(t.batch); + t.logits = tf.slice(t.batch, [0, 0], [-1, 1]); + t.sigmoid = tf.sigmoid(t.logits); + t.scores = tf.squeeze(t.sigmoid); + + t.nms = await tf.image.nonMaxSuppressionAsync(t.boxes, t.scores, (config.face.detector?.maxDetected || 0), (config.face.detector?.iouThreshold || 0), (config.face.detector?.minConfidence || 0)); + const nms = await t.nms.array() as number[]; + const boxes: Array<{ box: { startPoint: Point, endPoint: Point }, landmarks: Point[], confidence: number }> = []; + const scores = await t.scores.data(); for (let i = 0; i < nms.length; i++) { - const confidence = scoresData[nms[i]]; + const confidence = scores[nms[i]]; if (confidence > (config.face.detector?.minConfidence || 0)) { - const boundingBox = tf.slice(boxes, [nms[i], 0], [1, -1]); - const landmarks = tf.tidy(() => tf.reshape(tf.squeeze(tf.slice(batch, [nms[i], keypointsCount - 1], [1, -1])), [keypointsCount, -1])); - annotatedBoxes.push({ box: util.createBox(boundingBox), landmarks, anchor: anchorsData[nms[i]], confidence }); - tf.dispose(boundingBox); + const b: Record = {}; + b.bbox = tf.slice(t.boxes, [nms[i], 0], [1, -1]); + b.slice = tf.slice(t.batch, [nms[i], keypointsCount - 1], [1, -1]); + b.squeeze = tf.squeeze(b.slice); + b.landmarks = tf.reshape(b.squeeze, [keypointsCount, -1]); + b.startPoint = tf.slice(b.bbox, [0, 0], [-1, 2]); + b.endPoint = tf.slice(b.bbox, [0, 2], [-1, 2]); + boxes.push({ + box: { + startPoint: (await b.startPoint.data()) as unknown as Point, + endPoint: (await b.endPoint.data()) as unknown as Point, + }, + landmarks: (await b.landmarks.array()) as Point[], + confidence, + }); + Object.keys(b).forEach((tensor) => tf.dispose(b[tensor])); } } - tf.dispose(batch); - tf.dispose(boxes); - tf.dispose(scores); - return { - boxes: annotatedBoxes, - scaleFactor: [inputImage.shape[2] / inputSize, inputImage.shape[1] / inputSize], - }; + Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); + return { boxes, scaleFactor: [inputImage.shape[2] / inputSize, inputImage.shape[1] / inputSize] }; } diff --git a/src/face/facemesh.ts b/src/face/facemesh.ts index a2557d9d..84c62073 100644 --- a/src/face/facemesh.ts +++ b/src/face/facemesh.ts @@ -37,14 +37,13 @@ export async function predict(input: Tensor, config: Config): Promise, + startPoint: possible.box.startPoint, + endPoint: possible.box.endPoint, + landmarks: possible.landmarks, confidence: possible.confidence, }; boxCache.push(util.squarifyBox(util.enlargeBox(util.scaleBoxCoordinates(box, possibleBoxes.scaleFactor), Math.sqrt(enlargeFact)))); } - possibleBoxes.boxes.forEach((prediction) => tf.dispose([prediction.box.startPoint, prediction.box.endPoint, prediction.landmarks])); skipped = 0; } else { skipped++; diff --git a/src/face/faceres.ts b/src/face/faceres.ts index 783919d2..606be75b 100644 --- a/src/face/faceres.ts +++ b/src/face/faceres.ts @@ -37,57 +37,50 @@ export async function load(config: Config): Promise { } export function enhance(input): Tensor { - const image = tf.tidy(() => { - // input received from detector is already normalized to 0..1 - // input is also assumed to be straightened - const tensor = input.image || 
input.tensor || input; - if (!(tensor instanceof tf.Tensor)) return null; - // do a tight crop of image and resize it to fit the model - if (!model?.inputs[0].shape) return null; // model has no shape so no point continuing - const crop = tf.image.resizeBilinear(tensor, [model.inputs[0].shape[2], model.inputs[0].shape[1]], false); - /* - const box = [[0.05, 0.15, 0.85, 0.85]]; // empyrical values for top, left, bottom, right - const crop = (tensor.shape.length === 3) - ? tf.image.cropAndResize(tf.expandDims(tensor, 0), box, [0], [model.inputs[0].shape[2], model.inputs[0].shape[1]]) // add batch dimension if missing - : tf.image.cropAndResize(tensor, box, [0], [model.inputs[0].shape[2], model.inputs[0].shape[1]]); - */ - /* - // just resize to fit the embedding model instead of cropping - const crop = tf.image.resizeBilinear(tensor, [model.inputs[0].shape[2], model.inputs[0].shape[1]], false); - */ + const tensor = (input.image || input.tensor || input) as Tensor; // input received from detector is already normalized to 0..1, input is also assumed to be straightened + if (!model?.inputs[0].shape) return tensor; // model has no shape so no point continuing + // do a tight crop of image and resize it to fit the model + const crop = tf.image.resizeBilinear(tensor, [model.inputs[0].shape[2], model.inputs[0].shape[1]], false); + /* + const box = [[0.05, 0.15, 0.85, 0.85]]; // empyrical values for top, left, bottom, right + const crop = (tensor.shape.length === 3) + ? tf.image.cropAndResize(tf.expandDims(tensor, 0), box, [0], [model.inputs[0].shape[2], model.inputs[0].shape[1]]) // add batch dimension if missing + : tf.image.cropAndResize(tensor, box, [0], [model.inputs[0].shape[2], model.inputs[0].shape[1]]); + */ + /* + // just resize to fit the embedding model instead of cropping + const crop = tf.image.resizeBilinear(tensor, [model.inputs[0].shape[2], model.inputs[0].shape[1]], false); + */ - /* - // convert to black&white to avoid colorization impact - const rgb = [0.2989, 0.5870, 0.1140]; // factors for red/green/blue colors when converting to grayscale: https://www.mathworks.com/help/matlab/ref/rgb2gray.html - const [red, green, blue] = tf.split(crop, 3, 3); - const redNorm = tf.mul(red, rgb[0]); - const greenNorm = tf.mul(green, rgb[1]); - const blueNorm = tf.mul(blue, rgb[2]); - const grayscale = tf.addN([redNorm, greenNorm, blueNorm]); - const merge = tf.stack([grayscale, grayscale, grayscale], 3).squeeze(4); - */ + /* + // convert to black&white to avoid colorization impact + const rgb = [0.2989, 0.5870, 0.1140]; // factors for red/green/blue colors when converting to grayscale: https://www.mathworks.com/help/matlab/ref/rgb2gray.html + const [red, green, blue] = tf.split(crop, 3, 3); + const redNorm = tf.mul(red, rgb[0]); + const greenNorm = tf.mul(green, rgb[1]); + const blueNorm = tf.mul(blue, rgb[2]); + const grayscale = tf.addN([redNorm, greenNorm, blueNorm]); + const merge = tf.stack([grayscale, grayscale, grayscale], 3).squeeze(4); + */ - /* - // increase image pseudo-contrast 100% - // (or do it per-channel so mean is done on each channel) - // (or calculate histogram and do it based on histogram) - const mean = merge.mean(); - const factor = 2; - const contrast = merge.sub(mean).mul(factor).add(mean); - */ + /* + // increase image pseudo-contrast 100% + // (or do it per-channel so mean is done on each channel) + // (or calculate histogram and do it based on histogram) + const mean = merge.mean(); + const factor = 2; + const contrast = merge.sub(mean).mul(factor).add(mean); + */ 
- /* - // normalize brightness from 0..1 - // silly way of creating pseudo-hdr of image - const darken = crop.sub(crop.min()); - const lighten = darken.div(darken.max()); - */ - - const norm = tf.mul(crop, 255); - - return norm; - }); - return image; + /* + // normalize brightness from 0..1 + // silly way of creating pseudo-hdr of image + const darken = crop.sub(crop.min()); + const lighten = darken.div(darken.max()); + */ + const norm = tf.mul(crop, 255); + tf.dispose(crop); + return norm; } export async function predict(image: Tensor, config: Config, idx, count) { diff --git a/src/face/iris.ts b/src/face/iris.ts index 85a845db..f3c7c857 100644 --- a/src/face/iris.ts +++ b/src/face/iris.ts @@ -126,7 +126,7 @@ export async function augmentIris(rawCoords, face, config, meshSize) { tf.dispose(rightEyeCrop); const eyePredictions = model.execute(combined) as Tensor; tf.dispose(combined); - const eyePredictionsData = await eyePredictions.data(); // inside tf.tidy + const eyePredictionsData = await eyePredictions.data(); tf.dispose(eyePredictions); const leftEyeData = eyePredictionsData.slice(0, irisLandmarks.numCoordinates * 3); const { rawCoords: leftEyeRawCoords, iris: leftIrisRawCoords } = getEyeCoords(leftEyeData, leftEyeBox, leftEyeBoxSize, true); diff --git a/src/gear/emotion.ts b/src/gear/emotion.ts index 0a4877b3..82e9dc4d 100644 --- a/src/gear/emotion.ts +++ b/src/gear/emotion.ts @@ -43,35 +43,27 @@ export async function predict(image: Tensor, config: Config, idx, count) { return new Promise(async (resolve) => { const obj: Array<{ score: number, emotion: string }> = []; if (config.face.emotion?.enabled) { + const t: Record = {}; const inputSize = model?.inputs[0].shape ? model.inputs[0].shape[2] : 0; - const resize = tf.image.resizeBilinear(image, [inputSize, inputSize], false); + t.resize = tf.image.resizeBilinear(image, [inputSize, inputSize], false); // const box = [[0.15, 0.15, 0.85, 0.85]]; // empyrical values for top, left, bottom, right // const resize = tf.image.cropAndResize(image, box, [0], [inputSize, inputSize]); - - const [red, green, blue] = tf.split(resize, 3, 3); - tf.dispose(resize); + [t.red, t.green, t.blue] = tf.split(t.resize, 3, 3); // weighted rgb to grayscale: https://www.mathworks.com/help/matlab/ref/rgb2gray.html - const redNorm = tf.mul(red, rgb[0]); - const greenNorm = tf.mul(green, rgb[1]); - const blueNorm = tf.mul(blue, rgb[2]); - tf.dispose(red); - tf.dispose(green); - tf.dispose(blue); - const grayscale = tf.addN([redNorm, greenNorm, blueNorm]); - tf.dispose(redNorm); - tf.dispose(greenNorm); - tf.dispose(blueNorm); - const normalize = tf.tidy(() => tf.mul(tf.sub(grayscale, 0.5), 2)); - tf.dispose(grayscale); - const emotionT = model?.execute(normalize) as Tensor; // result is already in range 0..1, no need for additional activation + t.redNorm = tf.mul(t.red, rgb[0]); + t.greenNorm = tf.mul(t.green, rgb[1]); + t.blueNorm = tf.mul(t.blue, rgb[2]); + t.grayscale = tf.addN([t.redNorm, t.greenNorm, t.blueNorm]); + t.grayscaleSub = tf.sub(t.grayscale, 0.5); + t.grayscaleMul = tf.mul(t.grayscaleSub, 2); + t.emotion = model?.execute(t.grayscaleMul) as Tensor; // result is already in range 0..1, no need for additional activation lastTime = now(); - const data = await emotionT.data(); - tf.dispose(emotionT); + const data = await t.emotion.data(); for (let i = 0; i < data.length; i++) { if (data[i] > (config.face.emotion?.minConfidence || 0)) obj.push({ score: Math.min(0.99, Math.trunc(100 * data[i]) / 100), emotion: annotations[i] }); } obj.sort((a, b) => 
b.score - a.score); - tf.dispose(normalize); + Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); } last[idx] = obj; lastCount = count; diff --git a/src/hand/handposedetector.ts b/src/hand/handposedetector.ts index c1b58ae0..a7c28db5 100644 --- a/src/hand/handposedetector.ts +++ b/src/hand/handposedetector.ts @@ -7,6 +7,7 @@ import * as tf from '../../dist/tfjs.esm.js'; import * as util from './handposeutil'; import * as anchors from './handposeanchors'; import type { Tensor, GraphModel } from '../tfjs/types'; +import type { Point } from '../result'; export class HandDetector { model: GraphModel; @@ -26,62 +27,64 @@ export class HandDetector { } normalizeBoxes(boxes) { - return tf.tidy(() => { - const boxOffsets = tf.slice(boxes, [0, 0], [-1, 2]); - const boxSizes = tf.slice(boxes, [0, 2], [-1, 2]); - const boxCenterPoints = tf.add(tf.div(boxOffsets, this.inputSizeTensor), this.anchorsTensor); - const halfBoxSizes = tf.div(boxSizes, this.doubleInputSizeTensor); - const startPoints = tf.mul(tf.sub(boxCenterPoints, halfBoxSizes), this.inputSizeTensor); - const endPoints = tf.mul(tf.add(boxCenterPoints, halfBoxSizes), this.inputSizeTensor); - return tf.concat2d([startPoints, endPoints], 1); - }); + const t: Record = {}; + t.boxOffsets = tf.slice(boxes, [0, 0], [-1, 2]); + t.boxSizes = tf.slice(boxes, [0, 2], [-1, 2]); + t.div = tf.div(t.boxOffsets, this.inputSizeTensor); + t.boxCenterPoints = tf.add(t.div, this.anchorsTensor); + t.halfBoxSizes = tf.div(t.boxSizes, this.doubleInputSizeTensor); + t.sub = tf.sub(t.boxCenterPoints, t.halfBoxSizes); + t.startPoints = tf.mul(t.sub, this.inputSizeTensor); + t.add = tf.add(t.boxCenterPoints, t.halfBoxSizes); + t.endPoints = tf.mul(t.add, this.inputSizeTensor); + const res = tf.concat2d([t.startPoints, t.endPoints], 1); + Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); + return res; } normalizeLandmarks(rawPalmLandmarks, index) { - return tf.tidy(() => { - const landmarks = tf.add(tf.div(tf.reshape(rawPalmLandmarks, [-1, 7, 2]), this.inputSizeTensor), this.anchors[index]); - return tf.mul(landmarks, this.inputSizeTensor); - }); + const t: Record = {}; + t.reshape = tf.reshape(rawPalmLandmarks, [-1, 7, 2]); + t.div = tf.div(t.reshape, this.inputSizeTensor); + t.landmarks = tf.add(t.div, this.anchors[index]); + const res = tf.mul(t.landmarks, this.inputSizeTensor); + Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); + return res; } - async getBoxes(input, config) { + async predict(input, config): Promise<{ startPoint: Point; endPoint: Point, palmLandmarks: Point[]; confidence: number }[]> { const t: Record = {}; - t.batched = this.model.execute(input) as Tensor; + t.resize = tf.image.resizeBilinear(input, [this.inputSize, this.inputSize]); + t.div = tf.div(t.resize, 127.5); + t.image = tf.sub(t.div, 1); + t.batched = this.model.execute(t.image) as Tensor; t.predictions = tf.squeeze(t.batched); - t.scores = tf.tidy(() => tf.squeeze(tf.sigmoid(tf.slice(t.predictions, [0, 0], [-1, 1])))); + t.slice = tf.slice(t.predictions, [0, 0], [-1, 1]); + t.sigmoid = tf.sigmoid(t.slice); + t.scores = tf.squeeze(t.sigmoid); const scores = await t.scores.data(); t.boxes = tf.slice(t.predictions, [0, 1], [-1, 4]); t.norm = this.normalizeBoxes(t.boxes); // box detection is flaky so we look for 3x boxes than we need results t.nms = await tf.image.nonMaxSuppressionAsync(t.norm, t.scores, 3 * config.hand.maxDetected, config.hand.iouThreshold, config.hand.minConfidence); const nms = await t.nms.array() as Array; - const hands: Array<{ box: Tensor, 
palmLandmarks: Tensor, confidence: number }> = []; + const hands: Array<{ startPoint: Point; endPoint: Point; palmLandmarks: Point[]; confidence: number }> = []; for (const index of nms) { - const palmBox = tf.slice(t.norm, [index, 0], [1, -1]); - const palmLandmarks = tf.tidy(() => tf.reshape(this.normalizeLandmarks(tf.slice(t.predictions, [index, 5], [1, 14]), index), [-1, 2])); - hands.push({ box: palmBox, palmLandmarks, confidence: scores[index] }); - } - for (const tensor of Object.keys(t)) tf.dispose(t[tensor]); // dispose all - return hands; - } - - async estimateHandBounds(input, config): Promise<{ startPoint: number[]; endPoint: number[]; palmLandmarks: number[]; confidence: number }[]> { - const inputHeight = input.shape[1]; - const inputWidth = input.shape[2]; - const image = tf.tidy(() => tf.sub(tf.div(tf.image.resizeBilinear(input, [this.inputSize, this.inputSize]), 127.5), 1)); - const predictions = await this.getBoxes(image, config); - tf.dispose(image); - const hands: Array<{ startPoint: number[]; endPoint: number[]; palmLandmarks: number[]; confidence: number }> = []; - if (!predictions || predictions.length === 0) return hands; - for (const prediction of predictions) { - const boxes = await prediction.box.data(); - const startPoint = boxes.slice(0, 2); - const endPoint = boxes.slice(2, 4); - const palmLandmarks = await prediction.palmLandmarks.array(); - tf.dispose(prediction.box); - tf.dispose(prediction.palmLandmarks); - hands.push(util.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks, confidence: prediction.confidence }, [inputWidth / this.inputSize, inputHeight / this.inputSize])); + const p: Record = {}; + p.box = tf.slice(t.norm, [index, 0], [1, -1]); + p.slice = tf.slice(t.predictions, [index, 5], [1, 14]); + p.norm = this.normalizeLandmarks(p.slice, index); + p.palmLandmarks = tf.reshape(p.norm, [-1, 2]); + const box = await p.box.data(); + const startPoint = box.slice(0, 2) as unknown as Point; + const endPoint = box.slice(2, 4) as unknown as Point; + const palmLandmarks = await p.palmLandmarks.array(); + const hand = { startPoint, endPoint, palmLandmarks, confidence: scores[index] }; + const scaled = util.scaleBoxCoordinates(hand, [input.shape[2] / this.inputSize, input.shape[1] / this.inputSize]); + hands.push(scaled); + Object.keys(p).forEach((tensor) => tf.dispose(p[tensor])); } + Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); return hands; } } diff --git a/src/hand/handposepipeline.ts b/src/hand/handposepipeline.ts index 2fb9fa39..4cf71336 100644 --- a/src/hand/handposepipeline.ts +++ b/src/hand/handposepipeline.ts @@ -9,6 +9,7 @@ import type * as detector from './handposedetector'; import type { Tensor, GraphModel } from '../tfjs/types'; import { env } from '../util/env'; import { now } from '../util/util'; +import type { Point } from '../result'; const palmBoxEnlargeFactor = 5; // default 3 const handBoxEnlargeFactor = 1.65; // default 1.65 @@ -21,7 +22,7 @@ export class HandPipeline { handDetector: detector.HandDetector; handPoseModel: GraphModel; inputSize: number; - storedBoxes: Array<{ startPoint: number[]; endPoint: number[]; palmLandmarks: number[]; confidence: number } | null>; + storedBoxes: Array<{ startPoint: Point; endPoint: Point; palmLandmarks: Point[]; confidence: number } | null>; skipped: number; detectedHands: number; @@ -93,7 +94,7 @@ export class HandPipeline { const skipTime = (config.hand.skipTime || 0) > (now() - lastTime); const skipFrame = this.skipped < (config.hand.skipFrames || 0); if (config.skipAllowed && 
skipTime && skipFrame) { - boxes = await this.handDetector.estimateHandBounds(image, config); + boxes = await this.handDetector.predict(image, config); this.skipped = 0; } if (config.skipAllowed) this.skipped++; @@ -105,7 +106,7 @@ export class HandPipeline { // for (const possible of boxes) this.storedBoxes.push(possible); if (this.storedBoxes.length > 0) useFreshBox = true; } - const hands: Array<{ landmarks: number[], confidence: number, boxConfidence: number, fingerConfidence: number, box: { topLeft: number[], bottomRight: number[] } }> = []; + const hands: Array<{ landmarks: Point[], confidence: number, boxConfidence: number, fingerConfidence: number, box: { topLeft: Point, bottomRight: Point } }> = []; // go through working set of boxes for (let i = 0; i < this.storedBoxes.length; i++) { diff --git a/src/hand/handposeutil.ts b/src/hand/handposeutil.ts index 847b2c99..f8f18890 100644 --- a/src/hand/handposeutil.ts +++ b/src/hand/handposeutil.ts @@ -1,4 +1,5 @@ import * as tf from '../../dist/tfjs.esm.js'; +import type { Point } from '../result'; export function getBoxSize(box) { return [ @@ -27,8 +28,8 @@ export function cutBoxFromImageAndResize(box, image, cropSize) { } export function scaleBoxCoordinates(box, factor) { - const startPoint = [box.startPoint[0] * factor[0], box.startPoint[1] * factor[1]]; - const endPoint = [box.endPoint[0] * factor[0], box.endPoint[1] * factor[1]]; + const startPoint = [box.startPoint[0] * factor[0], box.startPoint[1] * factor[1]] as Point; + const endPoint = [box.endPoint[0] * factor[0], box.endPoint[1] * factor[1]] as Point; const palmLandmarks = box.palmLandmarks.map((coord) => { const scaledCoord = [coord[0] * factor[0], coord[1] * factor[1]]; return scaledCoord; @@ -40,8 +41,8 @@ export function enlargeBox(box, factor = 1.5) { const center = getBoxCenter(box); const size = getBoxSize(box); const newHalfSize = [factor * size[0] / 2, factor * size[1] / 2]; - const startPoint = [center[0] - newHalfSize[0], center[1] - newHalfSize[1]]; - const endPoint = [center[0] + newHalfSize[0], center[1] + newHalfSize[1]]; + const startPoint = [center[0] - newHalfSize[0], center[1] - newHalfSize[1]] as Point; + const endPoint = [center[0] + newHalfSize[0], center[1] + newHalfSize[1]] as Point; return { startPoint, endPoint, palmLandmarks: box.palmLandmarks }; } @@ -50,8 +51,8 @@ export function squarifyBox(box) { const size = getBoxSize(box); const maxEdge = Math.max(...size); const halfSize = maxEdge / 2; - const startPoint = [centers[0] - halfSize, centers[1] - halfSize]; - const endPoint = [centers[0] + halfSize, centers[1] + halfSize]; + const startPoint = [centers[0] - halfSize, centers[1] - halfSize] as Point; + const endPoint = [centers[0] + halfSize, centers[1] + halfSize] as Point; return { startPoint, endPoint, palmLandmarks: box.palmLandmarks }; } @@ -61,8 +62,8 @@ export function shiftBox(box, shiftFactor) { box.endPoint[1] - box.startPoint[1], ]; const shiftVector = [boxSize[0] * shiftFactor[0], boxSize[1] * shiftFactor[1]]; - const startPoint = [box.startPoint[0] + shiftVector[0], box.startPoint[1] + shiftVector[1]]; - const endPoint = [box.endPoint[0] + shiftVector[0], box.endPoint[1] + shiftVector[1]]; + const startPoint = [box.startPoint[0] + shiftVector[0], box.startPoint[1] + shiftVector[1]] as Point; + const endPoint = [box.endPoint[0] + shiftVector[0], box.endPoint[1] + shiftVector[1]] as Point; return { startPoint, endPoint, palmLandmarks: box.palmLandmarks }; } diff --git a/src/hand/handtrack.ts b/src/hand/handtrack.ts index 
c3b89a6a..f3ee00db 100644
--- a/src/hand/handtrack.ts
+++ b/src/hand/handtrack.ts
@@ -24,7 +24,7 @@
 const inputSize = [[0, 0], [0, 0]];
 const classes = ['hand', 'fist', 'pinch', 'point', 'face', 'tip', 'pinchtip'];
 const faceIndex = 4;
-const boxExpandFact = 1.6;
+const boxExpandFact = 1.7;
 const maxDetectorResolution = 512;
 const detectorExpandFact = 1.4;
diff --git a/src/human.ts b/src/human.ts
index 0c0ea2b4..69665146 100644
--- a/src/human.ts
+++ b/src/human.ts
@@ -347,6 +347,26 @@ export class Human {
     return res;
   }

+  /** Run detect with tensorflow profiling
+   * - result object will contain total execution time information for top-20 kernels
+   * - actual detection object can be accessed via `human.result`
+   */
+  async profile(input: Input, userConfig?: Partial<Config>): Promise<Record<string, number>> {
+    const profile = await this.tf.profile(() => this.detect(input, userConfig));
+    const kernels = {};
+    for (const kernel of profile.kernels) { // sum kernel time values per kernel
+      if (kernels[kernel.name]) kernels[kernel.name] += kernel.kernelTimeMs;
+      else kernels[kernel.name] = kernel.kernelTimeMs;
+    }
+    const kernelArr: Array<{ name, ms }> = [];
+    Object.entries(kernels).forEach((key) => kernelArr.push({ name: key[0], ms: key[1] })); // convert to array
+    kernelArr.sort((a, b) => b.ms - a.ms); // sort
+    kernelArr.length = 20; // crop
+    const res: Record<string, number> = {};
+    for (const kernel of kernelArr) res[kernel.name] = kernel.ms; // create perf objects
+    return res;
+  }
+
   /** Main detection method
    * - Analyze configuration: {@link Config}
    * - Pre-process input: {@link Input}
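> Editor's note: a minimal sketch of how the new `human.profile()` helper and the demo's tensor-leak check might be combined. It assumes a browser page with a `video` element and the bundled `human.esm.js`; the `profileOnce` function name and log strings are illustrative, not part of the library.

```ts
import Human from '../../dist/human.esm.js';

const human = new Human({ modelBasePath: '../../models' });

async function profileOnce(video: HTMLVideoElement) {
  await human.load();   // preload models so profiling measures inference, not downloads
  await human.warmup(); // initialize backend before timing kernels
  const before = human.tf.memory().numTensors;
  const kernels = await human.profile(video);    // per-kernel execution time in ms, top-20
  console.log('slowest kernels:', kernels);
  console.log('detection result:', human.result); // result of the profiled detect() call
  const leaked = human.tf.memory().numTensors - before;
  if (leaked !== 0) console.log('allocated tensors:', leaked); // non-zero delta may indicate a leak
}
```

This mirrors what the updated typescript demo does each frame: it compares `tf.memory().numTensors` across calls and logs any delta, which is how the explicit-dispose refactoring in this PR is verified.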