From e2bd9dbc345298f1de2c961aef774d7a25d87953 Mon Sep 17 00:00:00 2001 From: Vladimir Mandic Date: Sun, 11 Oct 2020 19:22:43 -0400 Subject: [PATCH] initial public commit --- .eslintrc.json | 54 +++++ .gitignore | 1 + README.md | 173 ++++++++++++++ demo/index.html | 25 +++ demo/index.js | 120 ++++++++++ package.json | 51 +++++ src/config.js | 58 +++++ src/facemesh/uvcoords.js | 470 +++++++++++++++++++++++++++++++++++++++ src/handpose/box.js | 65 ++++++ src/handpose/hand.js | 107 +++++++++ src/handpose/index.js | 93 ++++++++ src/image.js | 127 +++++++++++ src/index.js | 81 +++++++ 13 files changed, 1425 insertions(+) create mode 100644 .eslintrc.json create mode 100644 .gitignore create mode 100644 README.md create mode 100644 demo/index.html create mode 100644 demo/index.js create mode 100644 package.json create mode 100644 src/config.js create mode 100644 src/facemesh/uvcoords.js create mode 100644 src/handpose/box.js create mode 100644 src/handpose/hand.js create mode 100644 src/handpose/index.js create mode 100644 src/image.js create mode 100644 src/index.js diff --git a/.eslintrc.json b/.eslintrc.json new file mode 100644 index 00000000..e8937215 --- /dev/null +++ b/.eslintrc.json @@ -0,0 +1,54 @@ +{ + "globals": {}, + "env": { + "browser": true, + "commonjs": true, + "es6": true, + "node": true, + "jquery": true, + "es2020": true + }, + "parserOptions": { "ecmaVersion": 2020 }, + "plugins": [ ], + "extends": [ + "eslint:recommended", + "plugin:import/errors", + "plugin:import/warnings", + "plugin:node/recommended", + "plugin:promise/recommended", + "plugin:json/recommended-with-comments", + "airbnb-base" + ], + "ignorePatterns": [ "dist", "assets", "media", "models", "node_modules" ], + "rules": { + "max-len": [1, 275, 3], + "camelcase": "off", + "guard-for-in": "off", + "prefer-template":"off", + "import/extensions": "off", + "func-names": "off", + "no-await-in-loop": "off", + "no-bitwise": "off", + "no-case-declarations":"off", + "no-continue": "off", + 
"no-loop-func": "off", + "no-mixed-operators": "off", + "no-param-reassign":"off", + "no-plusplus": "off", + "dot-notation": "off", + "no-restricted-globals": "off", + "no-restricted-syntax": "off", + "no-underscore-dangle": "off", + "newline-per-chained-call": "off", + "node/no-unsupported-features/es-syntax": "off", + "node/shebang": "off", + "object-curly-newline": "off", + "prefer-destructuring": "off", + "promise/always-return": "off", + "promise/catch-or-return": "off", + "promise/no-nesting": "off", + "import/no-absolute-path": "off", + "no-regex-spaces": "off", + "radix": "off" + } +} \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..3c3629e6 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +node_modules diff --git a/README.md b/README.md new file mode 100644 index 00000000..d514a2ad --- /dev/null +++ b/README.md @@ -0,0 +1,173 @@ +# Human: 3D Face Detection, Body Pose, Hand & Finger Tracking, Iris Tracking and Age & Gender Prediction + +URL: + +*Suggestions are welcome!* + +## Credits + +This is an amalgamation of multiple existing models: + +- Face Detection: [**MediaPipe BlazeFace**](https://drive.google.com/file/d/1f39lSzU5Oq-j_OXgS67KfN5wNsoeAZ4V/view) +- Facial Spacial Geometry: [**MediaPipe FaceMesh**](https://drive.google.com/file/d/1VFC_wIpw4O7xBOiTgUldl79d9LA-LsnA/view) +- Eye Iris Details: [**MediaPipe Iris**](https://drive.google.com/file/d/1bsWbokp9AklH2ANjCfmjqEzzxO1CNbMu/view) +- Hand Detection & Skeleton: [**MediaPipe HandPose**](https://drive.google.com/file/d/1sv4sSb9BSNVZhLzxXJ0jBv9DqD-4jnAz/view) +- Body Pose Detection: [**PoseNet**](https://medium.com/tensorflow/real-time-human-pose-estimation-in-the-browser-with-tensorflow-js-7dd0bc881cd5) +- Age & Gender Prediction: [**SSR-Net**](https://github.com/shamangary/SSR-Net) + +## Install + +```shell +npm install @vladmandic/human +``` + +All pre-trained models are included in folder `/models` (25MB total) + +## Demo + +Demo is included in 
`/demo` + +## Requirements + +`Human` library is based on [TensorFlow/JS (TFJS)](https://js.tensorflow.org), but does not package it to allow for independent version management - import `tfjs` before importing `Human` + +## Usage + +`Human` library does not require special initialization. +All configuration is done in a single JSON object and all model weights will be dynamically loaded upon their first usage (and only then, `Human` will not load weights that it doesn't need according to configuration). + +There is only *ONE* method you need: + +```js +import * as tf from '@tensorflow/tfjs'; +import human from '@vladmandic/human'; + +// 'image': can be of any type of an image object: HTMLImage, HTMLVideo, HTMLMedia, Canvas, Tensor4D +// 'options': optional parameter used to override any options present in default configuration +const results = await human.detect(image, options?) +``` + +Additionally, `Human` library exposes two objects: + +```js +human.defaults // default configuration object +human.models // dynamically maintained object of any loaded models +``` + +## Configuration + +Below is output of `human.defaults` object +Any property can be overridden by passing user object during `human.detect()` +Note that user object and default configuration are merged using deep-merge, so you do not need to redefine entire configuration + +```js +human.defaults = { + face: { + enabled: true, + detector: { + modelPath: '/models/human/blazeface/model.json', + maxFaces: 10, + skipFrames: 5, + minConfidence: 0.8, + iouThreshold: 0.3, + scoreThreshold: 0.75, + }, + mesh: { + enabled: true, + modelPath: '/models/human/facemesh/model.json', + }, + iris: { + enabled: true, + modelPath: '/models/human/iris/model.json', + }, + age: { + enabled: true, + modelPath: '/models/human/ssrnet-imdb-age/model.json', + skipFrames: 5, + }, + gender: { + enabled: true, + modelPath: '/models/human/ssrnet-imdb-gender/model.json', + }, + }, + body: { + enabled: true, + modelPath: 
'/models/human/posenet/model.json', + maxDetections: 5, + scoreThreshold: 0.75, + nmsRadius: 20, + }, + hand: { + enabled: true, + skipFrames: 5, + minConfidence: 0.8, + iouThreshold: 0.3, + scoreThreshold: 0.75, + detector: { + anchors: '/models/human/handdetect/anchors.json', + modelPath: '/models/human/handdetect/model.json', + }, + skeleton: { + modelPath: '/models/human/handskeleton/model.json', + }, + }, +}; +``` + +Where: +- `enabled`: controls if specified module is enabled (note: module is not loaded until it is required) +- `modelPath`: path to specific pre-trained model weights +- `maxFaces`, `maxDetections`: how many faces or people are we trying to analyze. limiting number in busy scenes will result in higher performance +- `skipFrames`: how many frames to skip before re-running bounding box detection (e.g., face position does not move fast within a video, so it's ok to use previously detected face position and just run face geometry analysis) +- `minConfidence`: threshold for discarding a prediction +- `iouThreshold`: threshold for deciding whether boxes overlap too much in non-maximum suppression +- `scoreThreshold`: threshold for deciding when to remove boxes based on score in non-maximum suppression +- `nmsRadius`: radius for deciding points are too close in non-maximum suppression + +## Outputs + +Result of `human.detect()` is a single object that includes data for all enabled modules and all detected objects: + +```js +result = { + face: // + [ + { + confidence: // + box: // + mesh: // (468 base points & 10 iris points) + annotations: // (32 base annotated landmarks & 2 iris annotations) + iris: // (relative distance of iris to camera, multiply by focal length to get actual distance) + age: // (estimated age) + gender: // (male or female) + } + ], + body: // + [ + { + score: // , + keypoints: // (17 annotated landmarks) + } + ], + hand: // + [ + confidence: // , + box: // , + landmarks: // (21 points) + annotations: // ]> (5 annotated landmarks) 
+ ] +} +``` + +## Performance + +Of course, performance will vary depending on your hardware, but also on number of enabled modules as well as their parameters. +For example, on a low-end nVidia GTX1050 it can perform face detection at 50+ FPS, but drop to <5 FPS if all modules are enabled. + +## Todo + +- Improve detection of smaller faces, add BlazeFace back model +- Create demo, host it on gitpages +- Implement draw helper functions +- Sample Images +- Rename human to human diff --git a/demo/index.html b/demo/index.html new file mode 100644 index 00000000..b8481df3 --- /dev/null +++ b/demo/index.html @@ -0,0 +1,25 @@ + + + + + + + +
+
+
+ + +
+
+
+
+
+ + + diff --git a/demo/index.js b/demo/index.js new file mode 100644 index 00000000..30c35746 --- /dev/null +++ b/demo/index.js @@ -0,0 +1,120 @@ +/* global tf, ScatterGL, dat */ + +import human from '../dist/human.esm.js'; + +const state = { + backend: 'webgl', + triangulateMesh: true, + renderPointcloud: true, + stop: false, + videoSize: 700, +}; +const options = { +}; + +let ctx; +let videoWidth; +let videoHeight; +let video; +let canvas; +let scatterGLHasInitialized = false; +let scatterGL; + +async function renderPrediction() { + const predictions = await human.detect(video); + ctx.drawImage(video, 0, 0, videoWidth, videoHeight, 0, 0, canvas.width, canvas.height); + const div = document.getElementById('faces'); + div.innerHTML = ''; + for (const prediction of predictions) { + div.appendChild(prediction.canvas); + ctx.beginPath(); + ctx.rect(prediction.box[0], prediction.box[1], prediction.box[2], prediction.box[3]); + ctx.font = 'small-caps 1rem "Segoe UI"'; + ctx.fillText(`${prediction.gender} ${prediction.age}`, prediction.box[0] + 2, prediction.box[1] + 16, prediction.box[2]); + ctx.stroke(); + if (state.triangulateMesh) { + for (let i = 0; i < human.triangulation.length / 3; i++) { + const points = [human.triangulation[i * 3], human.triangulation[i * 3 + 1], human.triangulation[i * 3 + 2]].map((index) => prediction.mesh[index]); + const region = new Path2D(); + region.moveTo(points[0][0], points[0][1]); + for (let j = 1; i < points.length; j++) region.lineTo(points[j][0], points[j][1]); + region.closePath(); + ctx.stroke(region); + } + } else { + for (let i = 0; i < prediction.mesh.length; i++) { + const x = prediction.mesh[i][0]; + const y = prediction.mesh[i][1]; + ctx.beginPath(); + ctx.arc(x, y, 1 /* radius */, 0, 2 * Math.PI); + ctx.fill(); + } + } + if (state.renderPointcloud && scatterGL != null) { + const pointsData = predictions.map((pred) => pred.mesh.map((point) => ([-point[0], -point[1], -point[2]]))); + let flattenedPointsData = []; + for 
(let i = 0; i < pointsData.length; i++) { + flattenedPointsData = flattenedPointsData.concat(pointsData[i]); + } + const dataset = new ScatterGL.Dataset(flattenedPointsData); + if (!scatterGLHasInitialized) scatterGL.render(dataset); + else scatterGL.updateDataset(dataset); + scatterGLHasInitialized = true; + } + } + if (!state.stop) requestAnimationFrame(renderPrediction); +} + +function setupDatGui() { + const gui = new dat.GUI(); + gui.add(state, 'stop').onChange(() => { renderPrediction(); }); + gui.add(state, 'backend', ['webgl', 'cpu']).onChange((backend) => { tf.setBackend(backend); }); + gui.add(options, 'maxFaces', 1, 100, 1).onChange(() => { human.load(options); }); + gui.add(options, 'detectionConfidence', 0, 1, 0.05).onChange(() => { human.load(options); }); + gui.add(options, 'iouThreshold', 0, 1, 0.05).onChange(() => { human.load(options); }); + gui.add(options, 'scoreThreshold', 0, 1, 0.05).onChange(() => { human.load(options); }); + gui.add(state, 'triangulateMesh'); + gui.add(state, 'renderPointcloud').onChange((render) => { document.querySelector('#scatter-gl-container').style.display = render ? 
'inline-block' : 'none'; }); +} + +async function setupCamera() { + video = document.getElementById('video'); + const stream = await navigator.mediaDevices.getUserMedia({ + audio: false, + video: { facingMode: 'user', width: state.videoSize, height: state.videoSize }, + }); + video.srcObject = stream; + return new Promise((resolve) => { + video.onloadedmetadata = () => resolve(video); + }); +} + +async function main() { + await tf.setBackend(state.backend); + setupDatGui(); + await setupCamera(); + video.play(); + videoWidth = video.videoWidth; + videoHeight = video.videoHeight; + video.width = videoWidth; + video.height = videoHeight; + canvas = document.getElementById('output'); + canvas.width = videoWidth; + canvas.height = videoHeight; + const canvasContainer = document.querySelector('.canvas-wrapper'); + canvasContainer.style = `width: ${videoWidth}px; height: ${videoHeight}px`; + ctx = canvas.getContext('2d'); + // ctx.translate(canvas.width, 0); + // ctx.scale(-1, 1); + ctx.fillStyle = '#32EEDB'; + ctx.strokeStyle = '#32EEDB'; + ctx.lineWidth = 0.5; + human.load(options); + renderPrediction(); + if (state.renderPointcloud) { + document.querySelector('#scatter-gl-container').style = `width: ${state.videoSize}px; height: ${state.videoSize}px;`; + scatterGL = new ScatterGL(document.querySelector('#scatter-gl-container'), { rotateOnStart: false, selectEnabled: false }); + } +} + +main(); diff --git a/package.json b/package.json new file mode 100644 index 00000000..b5e5f1be --- /dev/null +++ b/package.json @@ -0,0 +1,51 @@ +{ + "name": "@vladmandic/human", + "version": "0.1.3", + "description": "human: 3D Face Detection, Iris Tracking and Age & Gender Prediction", + "sideEffects": false, + "main": "src/index.js", + "module": "dist/human.esm.js", + "browser": "dist/human.js", + "author": "Vladimir Mandic ", + "bugs": { + "url": "https://github.com/vladmandic/human/issues" + }, + "homepage": "https://github.com/vladmandic/human#readme", + "license": "MIT", + 
"engines": { + "node": ">=14.0.0" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/vladmandic/human.git" + }, + "dependencies": { + "@tensorflow/tfjs": "^2.6.0" + }, + "devDependencies": { + "esbuild": "^0.7.13", + "eslint": "^7.10.0", + "eslint-config-airbnb-base": "^14.2.0", + "eslint-plugin-import": "^2.22.1", + "eslint-plugin-json": "^2.1.2", + "eslint-plugin-node": "^11.1.0", + "eslint-plugin-promise": "^4.2.1", + "rimraf": "^3.0.2" + }, + "scripts": { + "build": "rimraf dist/ && npm run build-esm && npm run build-iife", + "build-esm": "esbuild --bundle --platform=browser --sourcemap --target=esnext --format=esm --external:@tensorflow --outfile=dist/human.esm.js src/index.js", + "build-iife": "esbuild --bundle --platform=browser --sourcemap --target=esnext --format=iife --minify --global-name=human --outfile=dist/human.js src/index.js" + }, + "keywords": [ + "face detection", + "detection", + "recognition", + "blazeface", + "facemesh", + "ssrnet", + "tensorflow", + "tensorflowjs", + "tfjs" + ] +} diff --git a/src/config.js b/src/config.js new file mode 100644 index 00000000..5833f72f --- /dev/null +++ b/src/config.js @@ -0,0 +1,58 @@ +export default { + face: { + enabled: true, // refers to detector, but since all other face modules rely on detector, it should be a global + detector: { + modelPath: '/models/blazeface/model.json', + inputSize: 128, // fixed value + maxFaces: 10, // maximum number of faces detected in the input, should be set to the minimum number for performance + skipFrames: 5, // how many frames to go without running the bounding box detector, only relevant if maxFaces > 1 + minConfidence: 0.8, // threshold for discarding a prediction + iouThreshold: 0.3, // threshold for deciding whether boxes overlap too much in non-maximum suppression, must be between [0, 1] + scoreThreshold: 0.75, // threshold for deciding when to remove boxes based on score in non-maximum suppression + }, + mesh: { + enabled: true, + modelPath: 
'/models/facemesh/model.json', + inputSize: 192, // fixed value + }, + iris: { + enabled: true, + modelPath: '/models/iris/model.json', + inputSize: 192, // fixed value + }, + age: { + enabled: true, + modelPath: '/models/ssrnet-age/imdb/model.json', + inputSize: 64, // fixed value + skipFrames: 5, + }, + gender: { + enabled: true, + modelPath: '/models/ssrnet-gender/imdb/model.json', + }, + }, + body: { + enabled: true, + modelPath: '/models/posenet/model.json', + inputResolution: 257, // fixed value + outputStride: 16, // fixed value + maxDetections: 5, + scoreThreshold: 0.75, + nmsRadius: 20, + }, + hand: { + enabled: true, + inputSize: 256, // fixed value + skipFrames: 5, + minConfidence: 0.8, + iouThreshold: 0.3, + scoreThreshold: 0.75, + detector: { + anchors: '/models/handdetect/anchors.json', + modelPath: '/models/handdetect/model.json', + }, + skeleton: { + modelPath: '/models/handskeleton/model.json', + }, + }, +}; diff --git a/src/facemesh/uvcoords.js b/src/facemesh/uvcoords.js new file mode 100644 index 00000000..0032bf2d --- /dev/null +++ b/src/facemesh/uvcoords.js @@ -0,0 +1,470 @@ +exportsdiff --git a/src/handpose/box.js b/src/handpose/box.js new file mode 100644 index 00000000..3450ca7b --- /dev/null +++ b/src/handpose/box.js @@ -0,0 +1,65 @@ +const tf = require('@tensorflow/tfjs'); + +function getBoxSize(box) { + return [ + Math.abs(box.endPoint[0] - box.startPoint[0]), + Math.abs(box.endPoint[1] - box.startPoint[1]), + ]; +} +exports.getBoxSize = getBoxSize; +function getBoxCenter(box) { + return [ + box.startPoint[0] + (box.endPoint[0] - box.startPoint[0]) / 2, + box.startPoint[1] + (box.endPoint[1] - box.startPoint[1]) / 2, + ]; +} +exports.getBoxCenter = getBoxCenter; +function cutBoxFromImageAndResize(box, image, cropSize) { + const h = image.shape[1]; + const w = image.shape[2]; + const boxes = [[ + box.startPoint[1] / h, box.startPoint[0] / w, box.endPoint[1] / h, + box.endPoint[0] / w, + ]]; + return tf.image.cropAndResize(image, boxes, 
[0], cropSize); +} +exports.cutBoxFromImageAndResize = cutBoxFromImageAndResize; +function scaleBoxCoordinates(box, factor) { + const startPoint = [box.startPoint[0] * factor[0], box.startPoint[1] * factor[1]]; + const endPoint = [box.endPoint[0] * factor[0], box.endPoint[1] * factor[1]]; + const palmLandmarks = box.palmLandmarks.map((coord) => { + const scaledCoord = [coord[0] * factor[0], coord[1] * factor[1]]; + return scaledCoord; + }); + return { startPoint, endPoint, palmLandmarks }; +} +exports.scaleBoxCoordinates = scaleBoxCoordinates; +function enlargeBox(box, factor = 1.5) { + const center = getBoxCenter(box); + const size = getBoxSize(box); + const newHalfSize = [factor * size[0] / 2, factor * size[1] / 2]; + const startPoint = [center[0] - newHalfSize[0], center[1] - newHalfSize[1]]; + const endPoint = [center[0] + newHalfSize[0], center[1] + newHalfSize[1]]; + return { startPoint, endPoint, palmLandmarks: box.palmLandmarks }; +} +exports.enlargeBox = enlargeBox; +function squarifyBox(box) { + const centers = getBoxCenter(box); + const size = getBoxSize(box); + const maxEdge = Math.max(...size); + const halfSize = maxEdge / 2; + const startPoint = [centers[0] - halfSize, centers[1] - halfSize]; + const endPoint = [centers[0] + halfSize, centers[1] + halfSize]; + return { startPoint, endPoint, palmLandmarks: box.palmLandmarks }; +} +exports.squarifyBox = squarifyBox; +function shiftBox(box, shiftFactor) { + const boxSize = [ + box.endPoint[0] - box.startPoint[0], box.endPoint[1] - box.startPoint[1], + ]; + const shiftVector = [boxSize[0] * shiftFactor[0], boxSize[1] * shiftFactor[1]]; + const startPoint = [box.startPoint[0] + shiftVector[0], box.startPoint[1] + shiftVector[1]]; + const endPoint = [box.endPoint[0] + shiftVector[0], box.endPoint[1] + shiftVector[1]]; + return { startPoint, endPoint, palmLandmarks: box.palmLandmarks }; +} +exports.shiftBox = shiftBox; diff --git a/src/handpose/hand.js b/src/handpose/hand.js new file mode 100644 index 
00000000..8194bf55 --- /dev/null +++ b/src/handpose/hand.js @@ -0,0 +1,107 @@ +const tf = require('@tensorflow/tfjs'); +const bounding = require('./box'); + +class HandDetector { + constructor(model, width, height, anchors, iouThreshold, scoreThreshold) { + this.model = model; + this.width = width; + this.height = height; + this.iouThreshold = iouThreshold; + this.scoreThreshold = scoreThreshold; + this.anchors = anchors.map((anchor) => [anchor.x_center, anchor.y_center]); + this.anchorsTensor = tf.tensor2d(this.anchors); + this.inputSizeTensor = tf.tensor1d([width, height]); + this.doubleInputSizeTensor = tf.tensor1d([width * 2, height * 2]); + } + + normalizeBoxes(boxes) { + return tf.tidy(() => { + const boxOffsets = tf.slice(boxes, [0, 0], [-1, 2]); + const boxSizes = tf.slice(boxes, [0, 2], [-1, 2]); + const boxCenterPoints = tf.add(tf.div(boxOffsets, this.inputSizeTensor), this.anchorsTensor); + const halfBoxSizes = tf.div(boxSizes, this.doubleInputSizeTensor); + const startPoints = tf.mul(tf.sub(boxCenterPoints, halfBoxSizes), this.inputSizeTensor); + const endPoints = tf.mul(tf.add(boxCenterPoints, halfBoxSizes), this.inputSizeTensor); + return tf.concat2d([startPoints, endPoints], 1); + }); + } + + normalizeLandmarks(rawPalmLandmarks, index) { + return tf.tidy(() => { + const landmarks = tf.add(tf.div(rawPalmLandmarks.reshape([-1, 7, 2]), this.inputSizeTensor), this.anchors[index]); + return tf.mul(landmarks, this.inputSizeTensor); + }); + } + + async getBoundingBoxes(input) { + const normalizedInput = tf.tidy(() => tf.mul(tf.sub(input, 0.5), 2)); + let batchedPrediction; + if (tf.getBackend() === 'webgl') { + // Currently tfjs-core does not pack depthwiseConv because it fails for + // very large inputs (https://github.com/tensorflow/tfjs/issues/1652). 
+ // TODO(annxingyuan): call tf.enablePackedDepthwiseConv when available + // (https://github.com/tensorflow/tfjs/issues/2821) + const savedWebglPackDepthwiseConvFlag = tf.env().get('WEBGL_PACK_DEPTHWISECONV'); + tf.env().set('WEBGL_PACK_DEPTHWISECONV', true); + // The model returns a tensor with the following shape: + // [1 (batch), 2944 (anchor points), 19 (data for each anchor)] + batchedPrediction = this.model.predict(normalizedInput); + tf.env().set('WEBGL_PACK_DEPTHWISECONV', savedWebglPackDepthwiseConvFlag); + } else { + batchedPrediction = this.model.predict(normalizedInput); + } + const prediction = batchedPrediction.squeeze(); + // Regression score for each anchor point. + const scores = tf.tidy(() => tf.sigmoid(tf.slice(prediction, [0, 0], [-1, 1])).squeeze()); + // Bounding box for each anchor point. + const rawBoxes = tf.slice(prediction, [0, 1], [-1, 4]); + const boxes = this.normalizeBoxes(rawBoxes); + const boxesWithHandsTensor = await tf.image.nonMaxSuppressionAsync(boxes, scores, 1, this.iouThreshold, this.scoreThreshold); + const boxesWithHands = await boxesWithHandsTensor.array(); + const toDispose = [ + normalizedInput, batchedPrediction, boxesWithHandsTensor, prediction, + boxes, rawBoxes, scores, + ]; + if (boxesWithHands.length === 0) { + toDispose.forEach((tensor) => tensor.dispose()); + return null; + } + const boxIndex = boxesWithHands[0]; + const matchingBox = tf.slice(boxes, [boxIndex, 0], [1, -1]); + const rawPalmLandmarks = tf.slice(prediction, [boxIndex, 5], [1, 14]); + const palmLandmarks = tf.tidy(() => this.normalizeLandmarks(rawPalmLandmarks, boxIndex).reshape([ + -1, 2, + ])); + toDispose.push(rawPalmLandmarks); + toDispose.forEach((tensor) => tensor.dispose()); + return { boxes: matchingBox, palmLandmarks }; + } + + /** + * Returns a Box identifying the bounding box of a hand within the image. + * Returns null if there is no hand in the image. + * + * @param input The image to classify. 
+ */ + async estimateHandBounds(input) { + const inputHeight = input.shape[1]; + const inputWidth = input.shape[2]; + const image = tf.tidy(() => input.resizeBilinear([this.width, this.height]).div(255)); + const prediction = await this.getBoundingBoxes(image); + if (prediction === null) { + image.dispose(); + return null; + } + // Calling arraySync on both boxes and palmLandmarks because the tensors are + // very small so it's not worth calling await array(). + const boundingBoxes = prediction.boxes.arraySync(); + const startPoint = boundingBoxes[0].slice(0, 2); + const endPoint = boundingBoxes[0].slice(2, 4); + const palmLandmarks = prediction.palmLandmarks.arraySync(); + image.dispose(); + prediction.boxes.dispose(); + prediction.palmLandmarks.dispose(); + return bounding.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks }, [inputWidth / this.width, inputHeight / this.height]); + } +} +exports.HandDetector = HandDetector; diff --git a/src/handpose/index.js b/src/handpose/index.js new file mode 100644 index 00000000..0be8a58c --- /dev/null +++ b/src/handpose/index.js @@ -0,0 +1,93 @@ +const tf = require('@tensorflow/tfjs'); +const hand = require('./hand'); +const keypoints = require('./keypoints'); +const pipe = require('./pipeline'); + +// Load the bounding box detector model. +async function loadHandDetectorModel(url) { + return tf.loadGraphModel(url, { fromTFHub: url.includes('tfhub.dev') }); +} + +// Load the mesh detector model. +async function loadHandPoseModel(url) { + return tf.loadGraphModel(url, { fromTFHub: url.includes('tfhub.dev') }); +} + +// In single shot detector pipelines, the output space is discretized into a set +// of bounding boxes, each of which is assigned a score during prediction. The +// anchors define the coordinates of these boxes. +async function loadAnchors(url) { + return tf.util + .fetch(url) + .then((d) => d.json()); +} + +/** + * Load handpose. 
+ * + * @param config The hand configuration object; besides `inputSize`, `detector.anchors`, `detector.modelPath` and `skeleton.modelPath` it has the following properties: + * - `skipFrames` How many frames to go without running the bounding + * box detector (5 in the default configuration). Set to a lower value if you want a safety + * net in case the mesh detector produces consistently flawed predictions. + * - `minConfidence` Threshold for discarding a prediction (0.8 in the default + * configuration). + * - `iouThreshold` A float representing the threshold for deciding whether + * boxes overlap too much in non-maximum suppression. Must be between [0, 1]. + * 0.3 in the default configuration. + * - `scoreThreshold` A threshold for deciding when to remove boxes based + * on score in non-maximum suppression. 0.75 in the default configuration. + */ +async function load(config) { + const [ANCHORS, handDetectorModel, handPoseModel] = await Promise.all([ + loadAnchors(config.detector.anchors), + loadHandDetectorModel(config.detector.modelPath), + loadHandPoseModel(config.skeleton.modelPath), + ]); + const detector = new hand.HandDetector(handDetectorModel, config.inputSize, config.inputSize, ANCHORS, config.iouThreshold, config.scoreThreshold); + const pipeline = new pipe.HandPipeline(detector, handPoseModel, config.inputSize, config.inputSize, config.skipFrames, config.minConfidence); + // eslint-disable-next-line no-use-before-define + const handpose = new HandPose(pipeline); + return handpose; +} +exports.load = load; + +class HandPose { + constructor(pipeline) { + this.pipeline = pipeline; + } + + static getAnnotations() { + return keypoints.MESH_ANNOTATIONS; + } + + /** + * Finds hands in the input image. + * + * @param input The image to classify. Can be a tensor, DOM element image, + * video, or canvas. + * @param config Configuration object that is passed on unchanged to + * `this.pipeline.estimateHand()` together with the prepared image tensor. 
+ */ + async estimateHands(input, config) { + const image = tf.tidy(() => { + if (!(input instanceof tf.Tensor)) { + input = tf.browser.fromPixels(input); + } + return input.toFloat().expandDims(0); + }); + const prediction = await this.pipeline.estimateHand(image, config); + image.dispose(); + if (!prediction) return []; + const annotations = {}; + for (const key of Object.keys(keypoints.MESH_ANNOTATIONS)) { + annotations[key] = keypoints.MESH_ANNOTATIONS[key].map((index) => prediction.landmarks[index]); + } + return [{ + confidence: prediction.confidence || 0, + box: prediction.box ? [prediction.box.topLeft[0], prediction.box.topLeft[1], prediction.box.bottomRight[0] - prediction.box.topLeft[0], prediction.box.bottomRight[1] - prediction.box.topLeft[1]] : 0, + landmarks: prediction.landmarks, + annotations, + }]; + } +} +exports.HandPose = HandPose; diff --git a/src/image.js b/src/image.js new file mode 100644 index 00000000..e0a485c4 --- /dev/null +++ b/src/image.js @@ -0,0 +1,127 @@ +const defaultFont = 'small-caps 1rem "Segoe UI"'; + +function clear(canvas) { + if (canvas) canvas.getContext('2d').clearRect(0, 0, canvas.width, canvas.height); +} + +function crop(image, x, y, width, height, { color = 'white', title = null, font = null }) { + const canvas = new OffscreenCanvas(width, height); + const ctx = canvas.getContext('2d'); + ctx.drawImage(image, x, y, width, height, 0, 0, canvas.width, canvas.height); + ctx.fillStyle = color; + ctx.font = font || defaultFont; + if (title) ctx.fillText(title, 2, 16, canvas.width - 4); + return canvas; +} + +function point({ canvas = null, x = 0, y = 0, color = 'white', radius = 2, title = null, font = null }) { + if (!canvas) return; + const ctx = canvas.getContext('2d'); + ctx.fillStyle = color; + ctx.beginPath(); + ctx.arc(x, y, radius, 0, 2 * Math.PI); + ctx.fill(); + ctx.font = font || defaultFont; + if (title) ctx.fillText(title, x + 10, y + 4); +} + +function rect({ canvas = null, x = 0, y = 0, width = 0, height = 0, 
radius = 8, lineWidth = 2, color = 'white', title = null, font = null }) { + if (!canvas) return; + const ctx = canvas.getContext('2d'); + ctx.lineWidth = lineWidth; + ctx.beginPath(); + ctx.moveTo(x + radius, y); + ctx.lineTo(x + width - radius, y); + ctx.quadraticCurveTo(x + width, y, x + width, y + radius); + ctx.lineTo(x + width, y + height - radius); + ctx.quadraticCurveTo(x + width, y + height, x + width - radius, y + height); + ctx.lineTo(x + radius, y + height); + ctx.quadraticCurveTo(x, y + height, x, y + height - radius); + ctx.lineTo(x, y + radius); + ctx.quadraticCurveTo(x, y, x + radius, y); + ctx.closePath(); + ctx.strokeStyle = color; + ctx.stroke(); + ctx.lineWidth = 1; + ctx.fillStyle = color; + ctx.font = font || defaultFont; + if (title) ctx.fillText(title, x + 4, y + 16); +} + +function line({ points = [], canvas = null, lineWidth = 2, color = 'white', title = null, font = null }) { + if (!canvas) return; + if (points.length < 2) return; + const ctx = canvas.getContext('2d'); + ctx.lineWidth = lineWidth; + ctx.beginPath(); + ctx.moveTo(points[0][0], points[0][1]); + for (const pt of points) ctx.lineTo(pt[0], pt[1]); + ctx.strokeStyle = color; + ctx.fillStyle = color; + ctx.stroke(); + ctx.lineWidth = 1; + ctx.font = font || defaultFont; + if (title) ctx.fillText(title, points[0][0] + 4, points[0][1] + 16); +} + +function spline({ points = [], canvas = null, tension = 0.5, lineWidth = 2, color = 'white', title = null, font = null }) { + if (!canvas) return; + if (points.length < 2) return; + const va = (arr, i, j) => [arr[2 * j] - arr[2 * i], arr[2 * j + 1] - arr[2 * i + 1]]; + const distance = (arr, i, j) => Math.sqrt(((arr[2 * i] - arr[2 * j]) ** 2) + ((arr[2 * i + 1] - arr[2 * j + 1]) ** 2)); + // eslint-disable-next-line no-unused-vars + function ctlpts(x1, y1, x2, y2, x3, y3) { + // eslint-disable-next-line prefer-rest-params + const v = va(arguments, 0, 2); + // eslint-disable-next-line prefer-rest-params + const d01 = distance(arguments, 
0, 1); + // eslint-disable-next-line prefer-rest-params + const d12 = distance(arguments, 1, 2); + const d012 = d01 + d12; + return [ + x2 - v[0] * tension * d01 / d012, y2 - v[1] * tension * d01 / d012, + x2 + v[0] * tension * d12 / d012, y2 + v[1] * tension * d12 / d012, + ]; + } + const pts = []; + for (const pt of points) { + pts.push(pt[0]); + pts.push(pt[1]); + } + let cps = []; + for (let i = 0; i < pts.length - 2; i += 1) { + cps = cps.concat(ctlpts(pts[2 * i + 0], pts[2 * i + 1], pts[2 * i + 2], pts[2 * i + 3], pts[2 * i + 4], pts[2 * i + 5])); + } + const ctx = canvas.getContext('2d'); + ctx.lineWidth = lineWidth; + ctx.strokeStyle = color; + ctx.fillStyle = color; + if (points.length === 2) { + ctx.beginPath(); + ctx.moveTo(pts[0], pts[1]); + ctx.lineTo(pts[2], pts[3]); + } else { + ctx.beginPath(); + ctx.moveTo(pts[0], pts[1]); + // first segment is a quadratic + ctx.quadraticCurveTo(cps[0], cps[1], pts[2], pts[3]); + // for all middle points, connect with bezier + let i; + for (i = 2; i < ((pts.length / 2) - 1); i += 1) { + ctx.bezierCurveTo(cps[(2 * (i - 1) - 1) * 2], cps[(2 * (i - 1) - 1) * 2 + 1], cps[(2 * (i - 1)) * 2], cps[(2 * (i - 1)) * 2 + 1], pts[i * 2], pts[i * 2 + 1]); + } + // last segment is a quadratic + ctx.quadraticCurveTo(cps[(2 * (i - 1) - 1) * 2], cps[(2 * (i - 1) - 1) * 2 + 1], pts[i * 2], pts[i * 2 + 1]); + } + ctx.stroke(); + ctx.lineWidth = 1; + ctx.font = font || defaultFont; + if (title) ctx.fillText(title, points[0][0] + 4, points[0][1] + 16); +} + +exports.crop = crop; +exports.rect = rect; +exports.point = point; +exports.line = line; +exports.spline = spline; +exports.clear = clear; diff --git a/src/index.js b/src/index.js new file mode 100644 index 00000000..05b7e5ba --- /dev/null +++ b/src/index.js @@ -0,0 +1,81 @@ +const facemesh = require('./facemesh/index.js'); +const ssrnet = require('./ssrnet/index.js'); +const posenet = require('./posenet/index.js'); +const handpose = require('./handpose/index.js'); +// const image 
= require('./image.js'); +// const triangulation = require('./triangulation.js').default; +const defaults = require('./config.js').default; + +const models = { + facemesh: null, + blazeface: null, + ssrnet: null, + iris: null, +}; + +function mergeDeep(...objects) { + const isObject = (obj) => obj && typeof obj === 'object'; + return objects.reduce((prev, obj) => { + Object.keys(obj).forEach((key) => { + const pVal = prev[key]; + const oVal = obj[key]; + if (Array.isArray(pVal) && Array.isArray(oVal)) { + prev[key] = pVal.concat(...oVal); + } else if (isObject(pVal) && isObject(oVal)) { + prev[key] = mergeDeep(pVal, oVal); + } else { + prev[key] = oVal; + } + }); + return prev; + }, {}); +} + +async function detect(input, userConfig) { + const config = mergeDeep(defaults, userConfig); + + // run posenet + let poseRes = []; + if (config.body.enabled) { + if (!models.posenet) models.posenet = await posenet.load(config.body); + poseRes = await models.posenet.estimateMultiplePoses(input, config.body); + } + + // run handpose + let handRes = []; + if (config.hand.enabled) { + if (!models.handpose) models.handpose = await handpose.load(config.hand); + handRes = await models.handpose.estimateHands(input, config.hand); + } + + // run facemesh, includes blazeface and iris + const faceRes = []; + if (config.face.enabled) { + if (!models.facemesh) models.facemesh = await facemesh.load(config.face); + const faces = await models.facemesh.estimateFaces(input, config.face); + for (const face of faces) { + // run ssr-net age & gender, inherits face from blazeface + const ssrdata = (config.face.age.enabled || config.face.gender.enabled) ? await ssrnet.predict(face.image, config) : {}; + // iris: array[ bottom, left, top, right, center ] + const iris = (face.annotations.leftEyeIris && face.annotations.rightEyeIris) + ? 
Math.max(face.annotations.leftEyeIris[3][0] - face.annotations.leftEyeIris[1][0], face.annotations.rightEyeIris[3][0] - face.annotations.rightEyeIris[1][0]) + : 0; + faceRes.push({ + confidence: face.confidence, + box: face.box, + mesh: face.mesh, + annotations: face.annotations, + age: ssrdata.age, + gender: ssrdata.gender, + iris: (iris !== 0) ? Math.trunc(100 * 11.7 / iris) / 100 : 0, + }); + } + } + + // combine results + return { face: faceRes, body: poseRes, hand: handRes }; +} + +exports.detect = detect; +exports.defaults = defaults; +exports.models = models;