From cd35d733d91cc3676d095102d008fce7f6ae2ec2 Mon Sep 17 00:00:00 2001
From: Vladimir Mandic <mandic00@live.com>
Date: Thu, 14 Oct 2021 12:26:59 -0400
Subject: [PATCH] enhanced movenet postprocessing

---
 CHANGELOG.md              |   1 +
 demo/index.js             |   2 +-
 package.json              |   6 +--
 src/body/movenet.ts       |  61 ++++++++++++----------
 src/body/movenetcoords.ts |  18 +++++--
 src/body/movenetfix.ts    | 107 ++++++++++++++++++++++++++++++++++++++
 src/config.ts             |   2 +-
 7 files changed, 161 insertions(+), 36 deletions(-)
 create mode 100644 src/body/movenetfix.ts
diff --git a/CHANGELOG.md b/CHANGELOG.md
index dc121b73..d20eda97 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,7 @@
   
 ### **HEAD -> main** 2021/10/13 mandic00@live.com
 
+- use transferable buffer for worker messages
 - add optional anti-spoofing module
 - add node-match advanced example using worker thread pool
 - package updates
diff --git a/demo/index.js b/demo/index.js
index e7ed7e20..7bd6c0ac 100644
--- a/demo/index.js
+++ b/demo/index.js
@@ -32,7 +32,7 @@ let human;
 
 let userConfig = {
   // face: { enabled: false },
-  // body: { enabled: false },
+  // body: { enabled: true },
   // hand: { enabled: false },
   /*
   warmup: 'none',
diff --git a/package.json b/package.json
index 7ac29164..c20a9a2a 100644
--- a/package.json
+++ b/package.json
@@ -66,15 +66,15 @@
     "@tensorflow/tfjs-layers": "^3.9.0",
     "@tensorflow/tfjs-node": "^3.9.0",
     "@tensorflow/tfjs-node-gpu": "^3.9.0",
-    "@types/node": "^16.10.5",
+    "@types/node": "^16.10.9",
     "@typescript-eslint/eslint-plugin": "^5.0.0",
     "@typescript-eslint/parser": "^5.0.0",
     "@vladmandic/build": "^0.6.0",
     "@vladmandic/pilogger": "^0.3.3",
     "canvas": "^2.8.0",
     "dayjs": "^1.10.7",
-    "esbuild": "^0.13.5",
-    "eslint": "8.0.0",
+    "esbuild": "^0.13.6",
+    "eslint": "8.0.1",
     "eslint-config-airbnb-base": "^14.2.1",
     "eslint-plugin-import": "^2.25.2",
     "eslint-plugin-json": "^3.1.0",
diff --git a/src/body/movenet.ts b/src/body/movenet.ts
index 3b06077c..30998c44 100644
--- a/src/body/movenet.ts
+++ b/src/body/movenet.ts
@@ -8,6 +8,7 @@ import { log, join } from '../util/util';
 import * as box from '../util/box';
 import * as tf from '../../dist/tfjs.esm.js';
 import * as coords from './movenetcoords';
+import * as fix from './movenetfix';
 import type { BodyKeypoint, BodyResult, Box, Point } from '../result';
 import type { GraphModel, Tensor } from '../tfjs/types';
 import type { Config } from '../config';
@@ -16,19 +17,17 @@ import { env } from '../util/env';
 
 let model: GraphModel | null;
 let inputSize = 0;
-const boxExpandFact = 1.5; // increase to 150%
+let skipped = Number.MAX_SAFE_INTEGER;
+// const boxExpandFact = 1.5; // increase to 150%
 
 const cache: {
-  boxes: Array<Box>,
+  boxes: Array<Box>, // unused
   bodies: Array<BodyResult>;
 } = {
   boxes: [],
   bodies: [],
 };
 
-let skipped = Number.MAX_SAFE_INTEGER;
-const keypoints: Array<BodyKeypoint> = [];
-
 export async function load(config: Config): Promise<GraphModel> {
   if (env.initial) model = null;
   if (!model) {
@@ -42,23 +41,9 @@ export async function load(config: Config): Promise<GraphModel> {
   return model;
 }
 
-function fixSides() { // model sometimes mixes up left vs right keypoints so we fix them
-  for (const pair of coords.pairs) {
-    let left = keypoints.find((kp) => kp.part === pair[0]);
-    let right = keypoints.find((kp) => kp.part === pair[1]);
-    if (left && right) {
-      if (left.position[0] > right.position[0]) {
-        const tmp = left;
-        left = right;
-        right = tmp;
-      }
-    }
-  }
-}
-
 async function parseSinglePose(res, config, image, inputBox) {
   const kpt = res[0][0];
-  keypoints.length = 0;
+  const keypoints: Array<BodyKeypoint> = [];
   let score = 0;
   for (let id = 0; id < kpt.length; id++) {
     score = kpt[id][2];
@@ -78,7 +63,6 @@ async function parseSinglePose(res, config, image, inputBox) {
       });
     }
   }
-  fixSides();
   score = keypoints.reduce((prev, curr) => (curr.score > prev ? curr.score : prev), 0);
   const bodies: Array<BodyResult> = [];
   const newBox = box.calc(keypoints.map((pt) => pt.position), [image.shape[2], image.shape[1]]);
@@ -92,7 +76,9 @@ async function parseSinglePose(res, config, image, inputBox) {
     }
     annotations[name] = pt;
   }
-  bodies.push({ id: 0, score, box: newBox.box, boxRaw: newBox.boxRaw, keypoints, annotations });
+  const body: BodyResult = { id: 0, score, box: newBox.box, boxRaw: newBox.boxRaw, keypoints, annotations };
+  fix.bodyParts(body);
+  bodies.push(body);
   return bodies;
 }
 
@@ -102,7 +88,7 @@ async function parseMultiPose(res, config, image, inputBox) {
     const kpt = res[0][id];
     const totalScore = Math.round(100 * kpt[51 + 4]) / 100;
     if (totalScore > config.body.minConfidence) {
-      keypoints.length = 0;
+      const keypoints: Array<BodyKeypoint> = [];
       for (let i = 0; i < 17; i++) {
         const score = kpt[3 * i + 2];
         if (score > config.body.minConfidence) {
@@ -118,7 +104,6 @@ async function parseMultiPose(res, config, image, inputBox) {
           });
         }
       }
-      fixSides();
       const newBox = box.calc(keypoints.map((pt) => pt.position), [image.shape[2], image.shape[1]]);
       // movenet-multipose has built-in box details
       // const boxRaw: Box = [kpt[51 + 1], kpt[51 + 0], kpt[51 + 3] - kpt[51 + 1], kpt[51 + 2] - kpt[51 + 0]];
@@ -133,7 +118,9 @@ async function parseMultiPose(res, config, image, inputBox) {
         }
         annotations[name] = pt;
       }
-      bodies.push({ id, score: totalScore, box: newBox.box, boxRaw: newBox.boxRaw, keypoints: [...keypoints], annotations });
+      const body: BodyResult = { id, score: totalScore, box: newBox.box, boxRaw: newBox.boxRaw, keypoints: [...keypoints], annotations };
+      fix.bodyParts(body);
+      bodies.push(body);
     }
   }
   bodies.sort((a, b) => b.score - a.score);
@@ -158,11 +145,14 @@ export async function predict(input: Tensor, config: Config): Promise<BodyResult
   return new Promise(async (resolve) => {
     const t: Record<string, Tensor> = {};
     skipped = 0;
+    // run detection on squared input and cached boxes
+    /*
     cache.bodies = []; // reset bodies result
     if (cache.boxes.length >= (config.body.maxDetected || 0)) { // if we have enough cached boxes run detection using cache
       for (let i = 0; i < cache.boxes.length; i++) { // run detection based on cached boxes
         t.crop = tf.image.cropAndResize(input, [cache.boxes[i]], [0], [inputSize, inputSize], 'bilinear');
         t.cast = tf.cast(t.crop, 'int32');
+        // t.input = prepareImage(input);
         t.res = await model?.predict(t.cast) as Tensor;
         const res = await t.res.array();
         const newBodies = (t.res.shape[2] === 17) ? await parseSinglePose(res, config, input, cache.boxes[i]) : await parseMultiPose(res, config, input, cache.boxes[i]);
@@ -171,11 +161,11 @@ export async function predict(input: Tensor, config: Config): Promise<BodyResult
       }
     }
     if (cache.bodies.length !== config.body.maxDetected) { // did not find enough bodies based on cached boxes so run detection on full frame
-      t.resized = tf.image.resizeBilinear(input, [inputSize, inputSize], false);
-      t.cast = tf.cast(t.resized, 'int32');
-      t.res = await model?.predict(t.cast) as Tensor;
+      t.input = prepareImage(input);
+      t.res = await model?.predict(t.input) as Tensor;
       const res = await t.res.array();
       cache.bodies = (t.res.shape[2] === 17) ? await parseSinglePose(res, config, input, [0, 0, 1, 1]) : await parseMultiPose(res, config, input, [0, 0, 1, 1]);
+      for (const body of cache.bodies) rescaleBody(body, [input.shape[2] || 1, input.shape[1] || 1]);
       Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
     }
     cache.boxes.length = 0; // reset cache
@@ -186,6 +176,21 @@ export async function predict(input: Tensor, config: Config): Promise<BodyResult
         cache.boxes.push(cropBox);
       }
     }
+    */
+
+    // run detection on squared input and no cached boxes
+    t.input = fix.padInput(input, inputSize);
+    t.res = await model?.predict(t.input) as Tensor;
+    const res = await t.res.array();
+    cache.bodies = (t.res.shape[2] === 17)
+      ? await parseSinglePose(res, config, input, [0, 0, 1, 1])
+      : await parseMultiPose(res, config, input, [0, 0, 1, 1]);
+    for (const body of cache.bodies) {
+      fix.rescaleBody(body, [input.shape[2] || 1, input.shape[1] || 1]);
+      fix.jitter(body.keypoints);
+    }
+    Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
+
     resolve(cache.bodies);
   });
 }
diff --git a/src/body/movenetcoords.ts b/src/body/movenetcoords.ts
index ac49985a..0aa1075c 100644
--- a/src/body/movenetcoords.ts
+++ b/src/body/movenetcoords.ts
@@ -1,4 +1,4 @@
-export const kpt: Array<string> = [
+export const kpt: Array<string> = [ // used to create part labels
   'nose',
   'leftEye',
   'rightEye',
@@ -18,7 +18,7 @@ export const kpt: Array<string> = [
   'rightAnkle',
 ];
 
-export const pairs: Array<string[]> = [
+export const horizontal: Array<string[]> = [ // used to fix left vs right
   ['leftEye', 'rightEye'],
   ['leftEar', 'rightEar'],
   ['leftShoulder', 'rightShoulder'],
@@ -29,7 +29,19 @@ export const pairs: Array<string[]> = [
   ['leftAnkle', 'rightAnkle'],
 ];
 
-export const connected: Record<string, string[]> = {
+export const vertical: Array<string[]> = [ // [lower, higher] part-name pairs; used to remove the lower keypoint when its vertical position is improbable relative to the higher one
+  ['leftKnee', 'leftShoulder'],
+  ['rightKnee', 'rightShoulder'],
+  ['leftAnkle', 'leftKnee'],
+  ['rightAnkle', 'rightKnee'],
+];
+
+export const relative: Array<string[][]> = [ // [[left, right], [leftReference, rightReference]] part names; used to match a left/right pair against the horizontal position of its reference pair
+  [['leftHip', 'rightHip'], ['leftShoulder', 'rightShoulder']],
+  [['leftElbow', 'rightElbow'], ['leftShoulder', 'rightShoulder']],
+];
+
+export const connected: Record<string, string[]> = { // used to create body outline in annotations
   leftLeg: ['leftHip', 'leftKnee', 'leftAnkle'],
   rightLeg: ['rightHip', 'rightKnee', 'rightAnkle'],
   torso: ['leftShoulder', 'rightShoulder', 'rightHip', 'leftHip', 'leftShoulder'],
diff --git a/src/body/movenetfix.ts b/src/body/movenetfix.ts
new file mode 100644
index 00000000..b09768e7
--- /dev/null
+++ b/src/body/movenetfix.ts
@@ -0,0 +1,107 @@
+import type { BodyKeypoint, BodyResult } from '../result';
+import * as box from '../util/box';
+import * as coords from './movenetcoords';
+import * as tf from '../../dist/tfjs.esm.js';
+import type { Tensor } from '../tfjs/types';
+
+const maxJitter = 0.005; // default allowed jitter is within 0.5%
+
+const cache: {
+  keypoints: Array<BodyKeypoint>,
+  padding: [number, number][];
+} = {
+  keypoints: [],
+  padding: [[0, 0], [0, 0], [0, 0], [0, 0]],
+};
+
+export function bodyParts(body: BodyResult) { // model sometimes mixes up left vs right keypoints so we fix them; runs three passes that modify body.keypoints in place
+  for (const pair of coords.horizontal) { // pass 1: fix body parts left vs right; each pair is [leftPart, rightPart]
+    const left = body.keypoints.findIndex((kp) => kp.part === pair[0]); // -1 when no keypoint carries this part label
+    const right = body.keypoints.findIndex((kp) => kp.part === pair[1]);
+    if (body.keypoints[left] && body.keypoints[right]) { // both present (index -1 yields undefined)
+      if (body.keypoints[left].position[0] < body.keypoints[right].position[0]) { // left keypoint has the smaller x
+        const tmp = body.keypoints[left]; // NOTE(review): whole keypoint objects are exchanged, so each keeps its own part label and only array order changes - confirm reordering rather than relabeling is intended
+        body.keypoints[left] = body.keypoints[right];
+        body.keypoints[right] = tmp;
+      }
+    }
+  }
+  for (const pair of coords.vertical) { // pass 2: remove body parts with improbable vertical position; each pair is [lower, higher]
+    const lower = body.keypoints.findIndex((kp) => (kp && kp.part === pair[0]));
+    const higher = body.keypoints.findIndex((kp) => (kp && kp.part === pair[1]));
+    if (body.keypoints[lower] && body.keypoints[higher]) {
+      if (body.keypoints[lower].position[1] < body.keypoints[higher].position[1]) { // expected-lower part has the smaller y (presumably y grows downward - TODO confirm)
+        body.keypoints.splice(lower, 1); // drop the lower keypoint; entries after it shift down by one index
+      }
+    }
+  }
+  for (const [pair, compare] of coords.relative) { // pass 3: rearrange body parts according to their relative position; pair is [left, right], compare is the reference [left, right]
+    const left = body.keypoints.findIndex((kp) => (kp && kp.part === pair[0]));
+    const right = body.keypoints.findIndex((kp) => (kp && kp.part === pair[1]));
+    const leftTo = body.keypoints.findIndex((kp) => (kp && kp.part === compare[0]));
+    const rightTo = body.keypoints.findIndex((kp) => (kp && kp.part === compare[1]));
+    if (!body.keypoints[leftTo] || !body.keypoints[rightTo]) continue; // only if we have both compare points
+    const distanceLeft = body.keypoints[left] ? [ // horizontal distance from the left point to [same-side, opposite-side] reference
+      Math.abs(body.keypoints[leftTo].position[0] - body.keypoints[left].position[0]),
+      Math.abs(body.keypoints[rightTo].position[0] - body.keypoints[left].position[0]),
+    ] : [0, 0]; // left point missing so it can never trigger a flip by itself
+    const distanceRight = body.keypoints[right] ? [ // horizontal distance from the right point to [same-side, opposite-side] reference
+      Math.abs(body.keypoints[rightTo].position[0] - body.keypoints[right].position[0]),
+      Math.abs(body.keypoints[leftTo].position[0] - body.keypoints[right].position[0]),
+    ] : [0, 0]; // right point missing so it can never trigger a flip by itself
+    if (distanceLeft[0] > distanceLeft[1] || distanceRight[0] > distanceRight[1]) { // should flip keypoints: at least one point is horizontally nearer the opposite-side reference
+      const tmp = body.keypoints[left]; // NOTE(review): if left or right is -1 this swap stores undefined at the found index and assigns the keypoint to property -1 - confirm this is intended
+      body.keypoints[left] = body.keypoints[right];
+      body.keypoints[right] = tmp;
+    }
+  }
+}
+
+export function jitter(keypoints: Array<BodyKeypoint>): Array<BodyKeypoint> { // suppress keypoint movement smaller than maxJitter against the module-level cache; mutates and returns the given array
+  for (let i = 0; i < keypoints.length; i++) {
+    if (keypoints[i] && cache.keypoints[i] && keypoints[i].part === cache.keypoints[i].part) { // cache is indexed by array slot, so only compare when the slot still holds the same body part
+      const diff = [Math.abs(keypoints[i].positionRaw[0] - cache.keypoints[i].positionRaw[0]), Math.abs(keypoints[i].positionRaw[1] - cache.keypoints[i].positionRaw[1])];
+      if (diff[0] < maxJitter && diff[1] < maxJitter) {
+        keypoints[i] = cache.keypoints[i]; // below jitter so replace keypoint
+      } else {
+        cache.keypoints[i] = keypoints[i]; // above jitter so update cache
+      }
+    } else {
+      cache.keypoints[i] = keypoints[i]; // no comparable cached keypoint in this slot (missing or different part) so store the current one
+    }
+  }
+  return keypoints;
+}
+
+export function padInput(input: Tensor, inputSize: number): Tensor { // pad the shorter image side to make input square, resize to inputSize x inputSize and cast to int32; caller owns and disposes the result
+  cache.padding = [[0, 0], [0, 0], [0, 0], [0, 0]]; // reset first so rescaleBody never reuses padding from an earlier frame
+  if (!input.shape || !input.shape[1] || !input.shape[2]) return tf.clone(input); // return a copy: caller disposes the returned tensor and must not dispose its own input
+  const height = input.shape[1];
+  const width = input.shape[2];
+  const padHeight = width > height ? Math.trunc((width - height) / 2) : 0; // rows added before and after when image is wider than tall
+  const padWidth = height > width ? Math.trunc((height - width) / 2) : 0; // columns added before and after when image is taller than wide
+  cache.padding = [[0, 0], [padHeight, padHeight], [padWidth, padWidth], [0, 0]]; // [batch, height, width, channels]: don't touch batch or rgb channels
+  const t: Record<string, Tensor> = {};
+  t.pad = tf.pad(input, cache.padding);
+  t.resize = tf.image.resizeBilinear(t.pad, [inputSize, inputSize]);
+  const final = tf.cast(t.resize, 'int32');
+  Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); // release intermediates, final is kept for the caller
+  return final;
+}
+
+export function rescaleBody(body: BodyResult, outputSize: [number, number]): BodyResult { // map keypoints from the padded frame back to outputSize ([width, height]) using the cached padding, then recompute the body box
+  const [width, height] = outputSize;
+  const [padTop, padBottom] = cache.padding[1]; // padding recorded for the height axis
+  const [padLeft, padRight] = cache.padding[2]; // padding recorded for the width axis
+  body.keypoints = body.keypoints.filter((kpt) => kpt && kpt.position); // filter invalid keypoints
+  for (const kpt of body.keypoints) {
+    const x = kpt.position[0] * (width + padLeft + padRight) / width - padLeft;
+    const y = kpt.position[1] * (height + padTop + padBottom) / height - padTop;
+    kpt.position = [x, y];
+    kpt.positionRaw = [x / width, y / height]; // normalized by output size
+  }
+  const bounds = box.calc(body.keypoints.map((pt) => pt.position), outputSize);
+  body.box = bounds.box;
+  body.boxRaw = bounds.boxRaw;
+  return body;
+}
diff --git a/src/config.ts b/src/config.ts
index 4f40262a..dc4381f9 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -426,7 +426,7 @@ const config: Config = {
                              // should be set to the minimum number for performance
                              // only valid for posenet and movenet-multipose as other models detects single pose
                              // set to -1 to autodetect based on number of detected faces
-    minConfidence: 0.2,      // threshold for discarding a prediction
+    minConfidence: 0.3,      // threshold for discarding a prediction
     skipFrames: 1,           // how many max frames to go without re-running the detector
                              // only used when cacheSensitivity is not zero
 },