From 11d3f76df0e844917aad6577a4544708ff76c5fa Mon Sep 17 00:00:00 2001
From: Vladimir Mandic
Date: Sun, 8 Nov 2020 09:56:02 -0500
Subject: [PATCH] update hand model

---
 config.js                |   1 +
 demo/browser.js          |  35 +++++++-----
 demo/menu.js             |   1 +
 package.json             |   8 +--
 src/hand/box.js          |   2 +-
 src/hand/handdetector.js |  43 +++++++--------
 src/hand/handpipeline.js | 116 +++++++++++++++++----------------------
 src/hand/handpose.js     |  12 ++--
 8 files changed, 107 insertions(+), 111 deletions(-)

diff --git a/config.js b/config.js
index 716e6766..c49c1999 100644
--- a/config.js
+++ b/config.js
@@ -113,6 +113,7 @@ export default {
     scoreThreshold: 0.8, // threshold for deciding when to remove boxes based on score in non-maximum suppression
     enlargeFactor: 1.65, // empiric tuning as skeleton prediction prefers hand box with some whitespace
     maxHands: 1, // maximum number of hands detected in the input, should be set to the minimum number for performance
+    landmarks: true, // detect hand landmarks or just the hand bounding box
     detector: {
       modelPath: '../models/handdetect.json',
     },
diff --git a/demo/browser.js b/demo/browser.js
index 3fa25bc7..a5e436e1 100644
--- a/demo/browser.js
+++ b/demo/browser.js
@@ -27,6 +27,10 @@ const ui = {
   maxFrames: 10,
   modelsPreload: true,
   modelsWarmup: true,
+  menuWidth: 0,
+  menuHeight: 0,
+  camera: {},
+  fps: [],
 };

 // global variables
@@ -34,8 +38,6 @@
 let menu;
 let menuFX;
 let worker;
 let timeStamp;
-let camera = {};
-const fps = [];

 // helper function: translates json to human readable string
 function str(...msg) {
@@ -62,17 +64,22 @@ const status = (msg) => {
 // draws processed results and starts processing of the next frame
 function drawResults(input, result, canvas) {
   // update fps data
-  fps.push(1000 / (performance.now() - timeStamp));
-  if (fps.length > ui.maxFrames) fps.shift();
+  const elapsed = performance.now() - timeStamp;
+  ui.fps.push(1000 / elapsed);
+  if (ui.fps.length > ui.maxFrames) ui.fps.shift();
   // enable for continuous performance monitoring
   // console.log(result.performance);
-  // eslint-disable-next-line no-use-before-define
-  if (input.srcObject) requestAnimationFrame(() => runHumanDetect(input, canvas)); // immediate loop before we even draw results
-
+  // immediate loop before we even draw results, but limit frame rate to 30
+  if (input.srcObject) {
+    // eslint-disable-next-line no-use-before-define
+    if (elapsed > 33) requestAnimationFrame(() => runHumanDetect(input, canvas));
+    // eslint-disable-next-line no-use-before-define
+    else setTimeout(() => runHumanDetect(input, canvas), 33 - elapsed);
+  }
   // draw fps chart
-  menu.updateChart('FPS', fps);
+  menu.updateChart('FPS', ui.fps);
   // draw image from video
   const ctx = canvas.getContext('2d');
   ctx.fillStyle = ui.baseBackground;
@@ -94,9 +101,9 @@ function drawResults(input, result, canvas) {
   const gpu = engine.backendInstance ? `gpu: ${(engine.backendInstance.numBytesInGPU ? engine.backendInstance.numBytesInGPU : 0).toLocaleString()} bytes` : '';
   const memory = `system: ${engine.state.numBytes.toLocaleString()} bytes ${gpu} | tensors: ${engine.state.numTensors.toLocaleString()}`;
   const processing = result.canvas ? `processing: ${result.canvas.width} x ${result.canvas.height}` : '';
-  const avg = Math.trunc(10 * fps.reduce((a, b) => a + b) / fps.length) / 10;
+  const avg = Math.trunc(10 * ui.fps.reduce((a, b) => a + b) / ui.fps.length) / 10;
   document.getElementById('log').innerText = `
-    video: ${camera.name} | facing: ${camera.facing} | resolution: ${camera.width} x ${camera.height} ${processing}
+    video: ${ui.camera.name} | facing: ${ui.camera.facing} | resolution: ${ui.camera.width} x ${ui.camera.height} ${processing}
     backend: ${human.tf.getBackend()} | ${memory}
     performance: ${str(result.performance)} FPS:${avg}
   `;
@@ -147,7 +154,7 @@ async function setupCamera() {
   const track = stream.getVideoTracks()[0];
   const settings = track.getSettings();
   log('camera constraints:', constraints, 'window:', { width: window.innerWidth, height: window.innerHeight }, 'settings:', settings, 'track:', track);
-  camera = { name: track.label, width: settings.width, height: settings.height, facing: settings.facingMode === 'user' ? 'front' : 'back' };
+  ui.camera = { name: track.label, width: settings.width, height: settings.height, facing: settings.facingMode === 'user' ? 'front' : 'back' };
   return new Promise((resolve) => {
     video.onloadeddata = async () => {
       video.width = video.videoWidth;
@@ -156,6 +163,8 @@ async function setupCamera() {
       canvas.height = video.height;
       canvas.style.width = canvas.width > canvas.height ? '100vw' : '';
       canvas.style.height = canvas.width > canvas.height ? '' : '100vh';
+      ui.menuWidth.input.setAttribute('value', video.width);
+      ui.menuHeight.input.setAttribute('value', video.height);
       // silly font resizing for paint-on-canvas since viewport can be zoomed
       const size = 14 + (6 * canvas.width / window.innerWidth);
       ui.baseFont = ui.baseFontProto.replace(/{size}/, `${size}px`);
@@ -351,8 +360,8 @@ function setupMenu() {
   menuFX.addHTML('');
   menuFX.addLabel('Image Processing');
   menuFX.addBool('Enabled', human.config.filter, 'enabled');
-  menuFX.addRange('Image width', human.config.filter, 'width', 0, 3840, 10, (val) => human.config.filter.width = parseInt(val));
-  menuFX.addRange('Image height', human.config.filter, 'height', 0, 2160, 10, (val) => human.config.filter.height = parseInt(val));
+  ui.menuWidth = menuFX.addRange('Image width', human.config.filter, 'width', 0, 3840, 10, (val) => human.config.filter.width = parseInt(val));
+  ui.menuHeight = menuFX.addRange('Image height', human.config.filter, 'height', 0, 2160, 10, (val) => human.config.filter.height = parseInt(val));
   menuFX.addRange('Brightness', human.config.filter, 'brightness', -1.0, 1.0, 0.05, (val) => human.config.filter.brightness = parseFloat(val));
   menuFX.addRange('Contrast', human.config.filter, 'contrast', -1.0, 1.0, 0.05, (val) => human.config.filter.contrast = parseFloat(val));
   menuFX.addRange('Sharpness', human.config.filter, 'sharpness', 0, 1.0, 0.05, (val) => human.config.filter.sharpness = parseFloat(val));
diff --git a/demo/menu.js b/demo/menu.js
index 2e68ef1a..a5fb0b51 100644
--- a/demo/menu.js
+++ b/demo/menu.js
@@ -219,6 +219,7 @@ class Menu {
       evt.target.setAttribute('value', evt.target.value);
       if (callback) callback(evt.target.value);
     });
+    el.input = el.children[0];
     return el;
   }
diff --git a/package.json b/package.json
index 8c502c60..2c1ce5e7 100644
--- a/package.json
+++ b/package.json
@@ -41,16 +41,16 @@
   "scripts": {
     "start": "node --trace-warnings --unhandled-rejections=strict --trace-uncaught --no-deprecation src/node.js",
     "lint": "eslint src/*.js demo/*.js",
-    "dev": "npm install && node --trace-warnings --unhandled-rejections=strict --trace-uncaught --no-deprecation dev-server.js",
+    "dev": "npm install && node --trace-warnings --unhandled-rejections=strict --trace-uncaught --no-deprecation dev-server/dev-server.js",
+    "changelog": "node dev-server/changelog.js",
     "build-iife": "esbuild --bundle --minify --platform=browser --sourcemap --target=es2018 --format=iife --external:fs --global-name=Human --metafile=dist/human.json --outfile=dist/human.js src/human.js",
     "build-esm-bundle": "esbuild --bundle --minify --platform=browser --sourcemap --target=es2018 --format=esm --external:fs --metafile=dist/human.esm.json --outfile=dist/human.esm.js src/human.js",
     "build-esm-nobundle": "esbuild --bundle --minify --platform=browser --sourcemap --target=es2018 --format=esm --external:@tensorflow --external:fs --metafile=dist/human.esm-nobundle.json --outfile=dist/human.esm-nobundle.js src/human.js",
     "build-node": "esbuild --bundle --minify --platform=node --sourcemap --target=es2018 --format=cjs --metafile=dist/human.node.json --outfile=dist/human.node.js src/human.js",
     "build-node-nobundle": "esbuild --bundle --minify --platform=node --sourcemap --target=es2018 --format=cjs --external:@tensorflow --metafile=dist/human.node.json --outfile=dist/human.node-nobundle.js src/human.js",
     "build-demo": "esbuild --bundle --log-level=error --platform=browser --sourcemap --target=es2018 --format=esm --external:fs --metafile=dist/demo-browser-index.json --outfile=dist/demo-browser-index.js demo/browser.js",
-    "build": "rimraf dist/* && npm run build-iife && npm run build-esm-bundle && npm run build-esm-nobundle && npm run build-node && npm run build-node-nobundle && npm run build-demo",
-    "update": "npm update --depth 20 --force && npm dedupe && npm prune && npm audit",
-    "changelog": "node changelog.js"
+    "build": "rimraf dist/* && npm run build-iife && npm run build-esm-bundle && npm run build-esm-nobundle && npm run build-node && npm run build-node-nobundle && npm run build-demo && npm run changelog",
+    "update": "npm update --depth 20 --force && npm dedupe && npm prune && npm audit"
   },
   "keywords": [
     "tensorflowjs",
diff --git a/src/hand/box.js b/src/hand/box.js
index 4078f732..e8e1916a 100644
--- a/src/hand/box.js
+++ b/src/hand/box.js
@@ -46,7 +46,7 @@ function scaleBoxCoordinates(box, factor) {
     const scaledCoord = [coord[0] * factor[0], coord[1] * factor[1]];
     return scaledCoord;
   });
-  return { startPoint, endPoint, palmLandmarks };
+  return { startPoint, endPoint, palmLandmarks, confidence: box.confidence };
 }

 function enlargeBox(box, factor = 1.5) {
   const center = getBoxCenter(box);
diff --git a/src/hand/handdetector.js b/src/hand/handdetector.js
index 6df3cace..66d57628 100644
--- a/src/hand/handdetector.js
+++ b/src/hand/handdetector.js
@@ -49,29 +49,28 @@ class HandDetector {
   async getBoxes(input, config) {
     const batched = this.model.predict(input);
     const predictions = batched.squeeze();
+    batched.dispose();
     const scores = tf.tidy(() => tf.sigmoid(tf.slice(predictions, [0, 0], [-1, 1])).squeeze());
-    // const scoresVal = scores.dataSync(); // scoresVal[boxIndex] is box confidence
+    const scoresVal = scores.dataSync();
     const rawBoxes = tf.slice(predictions, [0, 1], [-1, 4]);
     const boxes = this.normalizeBoxes(rawBoxes);
-    const boxesWithHandsT = await tf.image.nonMaxSuppressionAsync(boxes, scores, config.maxHands, config.iouThreshold, config.scoreThreshold);
-    const boxesWithHands = boxesWithHandsT.arraySync();
-    const toDispose = [
-      batched,
-      boxesWithHandsT,
-      predictions,
-      boxes,
-      rawBoxes,
-      scores,
-    ];
+    rawBoxes.dispose();
+    const filteredT = await tf.image.nonMaxSuppressionAsync(boxes, scores, config.maxHands, config.iouThreshold, config.scoreThreshold);
+    const filtered = filteredT.arraySync();
+    scores.dispose();
+    filteredT.dispose();
     const hands = [];
-    for (const boxIndex of boxesWithHands) {
-      const matchingBox = tf.slice(boxes, [boxIndex, 0], [1, -1]);
-      const rawPalmLandmarks = tf.slice(predictions, [boxIndex, 5], [1, 14]);
-      const palmLandmarks = tf.tidy(() => this.normalizeLandmarks(rawPalmLandmarks, boxIndex).reshape([-1, 2]));
-      rawPalmLandmarks.dispose();
-      hands.push({ box: matchingBox, palmLandmarks });
+    for (const boxIndex of filtered) {
+      if (scoresVal[boxIndex] >= config.minConfidence) {
+        const matchingBox = tf.slice(boxes, [boxIndex, 0], [1, -1]);
+        const rawPalmLandmarks = tf.slice(predictions, [boxIndex, 5], [1, 14]);
+        const palmLandmarks = tf.tidy(() => this.normalizeLandmarks(rawPalmLandmarks, boxIndex).reshape([-1, 2]));
+        rawPalmLandmarks.dispose();
+        hands.push({ box: matchingBox, palmLandmarks, confidence: scoresVal[boxIndex] });
+      }
     }
-    toDispose.forEach((tensor) => tensor.dispose());
+    predictions.dispose();
+    boxes.dispose();
     return hands;
   }
@@ -84,13 +83,13 @@
     if (!predictions || predictions.length === 0) return null;
     const hands = [];
     for (const prediction of predictions) {
-      const boundingBoxes = prediction.box.dataSync();
-      const startPoint = boundingBoxes.slice(0, 2);
-      const endPoint = boundingBoxes.slice(2, 4);
+      const boxes = prediction.box.dataSync();
+      const startPoint = boxes.slice(0, 2);
+      const endPoint = boxes.slice(2, 4);
       const palmLandmarks = prediction.palmLandmarks.arraySync();
       prediction.box.dispose();
       prediction.palmLandmarks.dispose();
-      hands.push(box.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks }, [inputWidth / config.inputSize, inputHeight / config.inputSize]));
+      hands.push(box.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks, confidence: prediction.confidence }, [inputWidth / config.inputSize, inputHeight / config.inputSize]));
     }
     return hands;
   }
diff --git a/src/hand/handpipeline.js b/src/hand/handpipeline.js
index 276463c4..0439eecb 100644
--- a/src/hand/handpipeline.js
+++ b/src/hand/handpipeline.js
@@ -19,7 +19,6 @@ const tf = require('@tensorflow/tfjs');
 const box = require('./box');
 const util = require('./util');

-const UPDATE_REGION_OF_INTEREST_IOU_THRESHOLD = 0.8;
 const PALM_BOX_SHIFT_VECTOR = [0, -0.4];
 const PALM_BOX_ENLARGE_FACTOR = 3;
 const HAND_BOX_SHIFT_VECTOR = [0, -0.1]; // move detected hand box by x,y to ease landmark detection
@@ -87,68 +86,75 @@ class HandPipeline {
   async estimateHands(image, config) {
     this.skipped++;
     let useFreshBox = false;
-    // run new detector every skipFrames
-    const boxes = (this.skipped > config.skipFrames)
-      ? await this.boxDetector.estimateHandBounds(image, config) : null;
+
+    // run new detector every skipFrames, or every frame if only box detection is requested
+    let boxes;
+    if ((this.skipped > config.skipFrames) || !config.landmarks) {
+      boxes = await this.boxDetector.estimateHandBounds(image, config);
+      this.skipped = 0;
+    }
+
     // if detector result count doesn't match current working set, use it to reset current working set
-    if (boxes && (boxes.length !== this.detectedHands) && (this.detectedHands !== config.maxHands)) {
-      // console.log(this.skipped, config.maxHands, this.detectedHands, this.storedBoxes.length, boxes.length);
+    if (boxes && (boxes.length > 0) && ((boxes.length !== this.detectedHands) && (this.detectedHands !== config.maxHands) || !config.landmarks)) {
       this.storedBoxes = [];
       this.detectedHands = 0;
       for (const possible of boxes) this.storedBoxes.push(possible);
       if (this.storedBoxes.length > 0) useFreshBox = true;
-      this.skipped = 0;
     }
     const hands = [];
+    // console.log(`skipped: ${this.skipped} max: ${config.maxHands} detected: ${this.detectedHands} stored: ${this.storedBoxes.length} new: ${boxes?.length}`);
+
+    // go through working set of boxes
     for (const i in this.storedBoxes) {
       const currentBox = this.storedBoxes[i];
       if (!currentBox) continue;
-      const angle = util.computeRotation(currentBox.palmLandmarks[PALM_LANDMARKS_INDEX_OF_PALM_BASE], currentBox.palmLandmarks[PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE]);
-      const palmCenter = box.getBoxCenter(currentBox);
-      const palmCenterNormalized = [palmCenter[0] / image.shape[2], palmCenter[1] / image.shape[1]];
-      const rotatedImage = tf.image.rotateWithOffset(image, angle, 0, palmCenterNormalized);
-      const rotationMatrix = util.buildRotationMatrix(-angle, palmCenter);
-      const newBox = useFreshBox ? this.getBoxForPalmLandmarks(currentBox.palmLandmarks, rotationMatrix) : currentBox;
-      const croppedInput = box.cutBoxFromImageAndResize(newBox, rotatedImage, [this.inputSize, this.inputSize]);
-      const handImage = croppedInput.div(255);
-      croppedInput.dispose();
-      rotatedImage.dispose();
-      const [confidence, keypoints] = await this.meshDetector.predict(handImage);
-      handImage.dispose();
-      const confidenceValue = confidence.dataSync()[0];
-      confidence.dispose();
-      if (confidenceValue >= config.minConfidence) {
-        const keypointsReshaped = tf.reshape(keypoints, [-1, 3]);
-        const rawCoords = keypointsReshaped.arraySync();
+      if (config.landmarks) {
+        const angle = util.computeRotation(currentBox.palmLandmarks[PALM_LANDMARKS_INDEX_OF_PALM_BASE], currentBox.palmLandmarks[PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE]);
+        const palmCenter = box.getBoxCenter(currentBox);
+        const palmCenterNormalized = [palmCenter[0] / image.shape[2], palmCenter[1] / image.shape[1]];
+        const rotatedImage = tf.image.rotateWithOffset(image, angle, 0, palmCenterNormalized);
+        const rotationMatrix = util.buildRotationMatrix(-angle, palmCenter);
+        const newBox = useFreshBox ? this.getBoxForPalmLandmarks(currentBox.palmLandmarks, rotationMatrix) : currentBox;
+        const croppedInput = box.cutBoxFromImageAndResize(newBox, rotatedImage, [this.inputSize, this.inputSize]);
+        const handImage = croppedInput.div(255);
+        croppedInput.dispose();
+        rotatedImage.dispose();
+        const [confidence, keypoints] = await this.meshDetector.predict(handImage);
+        handImage.dispose();
+        const confidenceValue = confidence.dataSync()[0];
+        confidence.dispose();
+        if (confidenceValue >= config.minConfidence) {
+          const keypointsReshaped = tf.reshape(keypoints, [-1, 3]);
+          const rawCoords = keypointsReshaped.arraySync();
+          keypoints.dispose();
+          keypointsReshaped.dispose();
+          const coords = this.transformRawCoords(rawCoords, newBox, angle, rotationMatrix);
+          const nextBoundingBox = this.getBoxForHandLandmarks(coords);
+          this.storedBoxes[i] = nextBoundingBox;
+          const result = {
+            landmarks: coords,
+            confidence: confidenceValue,
+            box: {
+              topLeft: nextBoundingBox.startPoint,
+              bottomRight: nextBoundingBox.endPoint,
+            },
+          };
+          hands.push(result);
+        } else {
+          this.storedBoxes[i] = null;
+        }
         keypoints.dispose();
-        keypointsReshaped.dispose();
-        const coords = this.transformRawCoords(rawCoords, newBox, angle, rotationMatrix);
-        const nextBoundingBox = this.getBoxForHandLandmarks(coords);
-        this.updateStoredBoxes(nextBoundingBox, i);
-        const result = {
-          landmarks: coords,
-          handInViewConfidence: confidenceValue,
-          boundingBox: {
-            topLeft: nextBoundingBox.startPoint,
-            bottomRight: nextBoundingBox.endPoint,
-          },
-        };
-        hands.push(result);
       } else {
-        this.updateStoredBoxes(null, i);
-        /*
+        const enlarged = box.enlargeBox(box.squarifyBox(box.shiftBox(currentBox, HAND_BOX_SHIFT_VECTOR)), HAND_BOX_ENLARGE_FACTOR);
         const result = {
-          handInViewConfidence: confidenceValue,
-          boundingBox: {
-            topLeft: currentBox.startPoint,
-            bottomRight: currentBox.endPoint,
+          confidence: currentBox.confidence,
+          box: {
+            topLeft: enlarged.startPoint,
+            bottomRight: enlarged.endPoint,
           },
         };
         hands.push(result);
-        */
       }
-      keypoints.dispose();
     }
     this.storedBoxes = this.storedBoxes.filter((a) => a !== null);
     this.detectedHands = hands.length;
@@ -163,26 +169,6 @@
     const endPoint = [Math.max(...xs), Math.max(...ys)];
     return { startPoint, endPoint };
   }
-
-  updateStoredBoxes(newBox, i) {
-    const previousBox = this.storedBoxes[i];
-    let iou = 0;
-    if (newBox && previousBox && previousBox.startPoint) {
-      const [boxStartX, boxStartY] = newBox.startPoint;
-      const [boxEndX, boxEndY] = newBox.endPoint;
-      const [previousBoxStartX, previousBoxStartY] = previousBox.startPoint;
-      const [previousBoxEndX, previousBoxEndY] = previousBox.endPoint;
-      const xStartMax = Math.max(boxStartX, previousBoxStartX);
-      const yStartMax = Math.max(boxStartY, previousBoxStartY);
-      const xEndMin = Math.min(boxEndX, previousBoxEndX);
-      const yEndMin = Math.min(boxEndY, previousBoxEndY);
-      const intersection = (xEndMin - xStartMax) * (yEndMin - yStartMax);
-      const boxArea = (boxEndX - boxStartX) * (boxEndY - boxStartY);
-      const previousBoxArea = (previousBoxEndX - previousBoxStartX) * (previousBoxEndY - boxStartY);
-      iou = intersection / (boxArea + previousBoxArea - intersection);
-    }
-    this.storedBoxes[i] = iou > UPDATE_REGION_OF_INTEREST_IOU_THRESHOLD ? previousBox : newBox;
-  }
 }

 exports.HandPipeline = HandPipeline;
diff --git a/src/hand/handpose.js b/src/hand/handpose.js
index 005c2b87..3e769d52 100644
--- a/src/hand/handpose.js
+++ b/src/hand/handpose.js
@@ -51,12 +51,12 @@ class HandPose {
       }
     }
     hands.push({
-      confidence: prediction.handInViewConfidence,
-      box: prediction.boundingBox ? [
-        prediction.boundingBox.topLeft[0],
-        prediction.boundingBox.topLeft[1],
-        prediction.boundingBox.bottomRight[0] - prediction.boundingBox.topLeft[0],
-        prediction.boundingBox.bottomRight[1] - prediction.boundingBox.topLeft[1],
+      confidence: prediction.confidence,
+      box: prediction.box ? [
+        prediction.box.topLeft[0],
+        prediction.box.topLeft[1],
+        prediction.box.bottomRight[0] - prediction.box.topLeft[0],
+        prediction.box.bottomRight[1] - prediction.box.topLeft[1],
       ] : 0,
       landmarks: prediction.landmarks,
       annotations,
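
--
Notes on the main patterns in this patch. The sketches below are illustrative reconstructions, not code from the repository; any helper names they introduce are hypothetical.

demo/browser.js no longer re-queues detection through requestAnimationFrame unconditionally; it caps the loop at roughly 30 FPS by comparing the elapsed time of the last pass against a ~33 ms budget. The same pattern in isolation (the budget value and the doWork body are stand-ins):

    const budget = 1000 / 30; // ~33 ms per pass gives a ~30 FPS cap
    function doWork() { /* detection and drawing happen here */ }
    function loop() {
      const start = performance.now();
      doWork();
      const elapsed = performance.now() - start;
      if (elapsed > budget) requestAnimationFrame(loop); // over budget: go again at the next paint
      else setTimeout(loop, budget - elapsed); // under budget: wait out the remainder first
    }
    loop();

This keeps a fast machine from burning cycles on redundant frames, while a slow one simply degrades to whatever rate it can sustain.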
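src/hand/handdetector.js replaces the deferred toDispose array with disposal at each tensor's last use, and reads the sigmoid scores back once so non-max-suppression survivors can additionally be gated on config.minConfidence. A condensed sketch of that shape, where `predictions` stands in for the squeezed model output whose columns are score, box, and palm landmarks per the slices in the diff:

    const tf = require('@tensorflow/tfjs');

    async function gatedBoxes(predictions, config) {
      const scores = tf.tidy(() => tf.sigmoid(tf.slice(predictions, [0, 0], [-1, 1])).squeeze());
      const scoresVal = scores.dataSync(); // single readback, reused per box below
      const boxes = tf.slice(predictions, [0, 1], [-1, 4]);
      const nmsT = await tf.image.nonMaxSuppressionAsync(boxes, scores, config.maxHands, config.iouThreshold, config.scoreThreshold);
      const nms = nmsT.arraySync();
      scores.dispose(); // disposed right after last use instead of collected for later
      nmsT.dispose();
      const hands = [];
      for (const i of nms) {
        // NMS already enforced scoreThreshold; minConfidence is a second, user-facing gate
        if (scoresVal[i] >= config.minConfidence) hands.push({ box: tf.slice(boxes, [i, 0], [1, -1]), confidence: scoresVal[i] });
      }
      boxes.dispose();
      return hands;
    }

Disposing at last use keeps the peak tensor count flat across the await; the old toDispose list held every intermediate alive until the end of the call.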
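src/hand/handpipeline.js ties the new config.landmarks flag into a detect-then-track loop: the palm detector runs only after skipFrames frames have passed (or on every frame in box-only mode), and between runs the landmark model keeps refreshing the cached storedBoxes. A simplified skeleton of that control flow, where detector() and refine() are hypothetical stand-ins for the palm detector and the landmark pass, and the working-set reset condition is abbreviated from the diff:

    class PipelineSketch {
      constructor(detector, refine) {
        this.detector = detector; // heavy palm detector, run sparsely
        this.refine = refine; // cheap per-box landmark pass, run every frame
        this.skipped = 0;
        this.storedBoxes = [];
      }

      async estimate(image, config) {
        this.skipped++;
        let boxes = null;
        if (this.skipped > config.skipFrames || !config.landmarks) {
          boxes = await this.detector(image, config);
          this.skipped = 0;
        }
        // adopt fresh detections when they disagree with the cached working set
        if (boxes && boxes.length > 0 && (boxes.length !== this.storedBoxes.length || !config.landmarks)) this.storedBoxes = [...boxes];
        if (config.landmarks) {
          // the landmark pass returns an updated box (or null below minConfidence),
          // so the working set tracks hands between detector runs and can shrink
          this.storedBoxes = (await Promise.all(this.storedBoxes.map((b) => this.refine(image, b)))).filter((b) => b !== null);
        }
        return this.storedBoxes;
      }
    }

The detector cost amortizes across skipFrames frames while the landmark pass keeps each box glued to a moving hand; in box-only mode the pipeline just returns the enlarged detector boxes directly.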