@@ -10,8 +10,7 @@ import type { GraphModel, Tensor } from '../tfjs/types';
 import type { Config } from '../config';
 import * as coords from './blazeposecoords';
-import * as detect from './blazeposedetector';
-interface DetectedBox { box: Box, boxRaw: Box, score: number }
+import * as box from '../util/box';
 
 const env = { initial: true };
 // const models: [GraphModel | null, GraphModel | null] = [null, null];
@@ -24,7 +23,7 @@ const outputNodes: { detector: string[], landmarks: string[] } = {
 };
 
 let cache: BodyResult | null = null;
-let lastBox: Box | undefined;
+let cropBox: Box | undefined;
 let padding: [number, number][] = [[0, 0], [0, 0], [0, 0], [0, 0]];
 let lastTime = 0;
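
For context, a standalone sketch (not part of the diff) of how the padding state above is consumed: tf.pad takes one [before, after] pair per tensor dimension, so for a [batch, height, width, channels] image tensor, padding only the height axis looks like this. The '@tensorflow/tfjs' import path is an assumption; the repo bundles its own tfjs build.

import * as tf from '@tensorflow/tfjs';

const img = tf.zeros([1, 704, 1280, 3]); // batch of one 704x1280 rgb frame
const pad: [number, number][] = [[0, 0], [8, 8], [0, 0], [0, 0]]; // 8px above and below, nothing else
const padded = tf.pad(img, pad);
console.log(padded.shape); // [1, 720, 1280, 3]
tf.dispose([img, padded]);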
@@ -63,50 +62,43 @@ export async function load(config: Config): Promise<[GraphModel | null, GraphMod
   return [models.detector, models.landmarks];
 }
 
-function calculateBoxes(keypoints: Array<BodyKeypoint>, outputSize: [number, number]): { keypointsBox: Box, keypointsBoxRaw: Box } {
-  const x = keypoints.map((a) => a.position[0]);
-  const y = keypoints.map((a) => a.position[1]);
-  const keypointsBox: Box = [Math.min(...x), Math.min(...y), Math.max(...x) - Math.min(...x), Math.max(...y) - Math.min(...y)];
-  const keypointsBoxRaw: Box = [keypointsBox[0] / outputSize[0], keypointsBox[1] / outputSize[1], keypointsBox[2] / outputSize[0], keypointsBox[3] / outputSize[1]];
-  return { keypointsBox, keypointsBoxRaw };
-}
-
-async function prepareImage(input: Tensor, size: number, box?: Box): Promise<Tensor> {
+async function prepareImage(input: Tensor, size: number): Promise<Tensor> {
   const t: Record<string, Tensor> = {};
   if (!input.shape || !input.shape[1] || !input.shape[2]) return input;
   let final: Tensor;
+  if (cropBox) {
+    t.cropped = tf.image.cropAndResize(input, [cropBox], [0], [input.shape[1], input.shape[2]]); // if we have cached box use it to crop input
+  }
   if (input.shape[1] !== input.shape[2]) { // only pad if width different than height
-    const height: [number, number] = box
-      ? [Math.trunc(input.shape[1] * box[1]), Math.trunc(input.shape[1] * (box[1] + box[3]))]
-      : [input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0, input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0];
-    const width: [number, number] = box
-      ? [Math.trunc(input.shape[2] * box[0]), Math.trunc(input.shape[2] * (box[0] + box[2]))]
-      : [input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0, input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0];
+    const height: [number, number] = [
+      input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0,
+      input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0,
+    ];
+    const width: [number, number] = [
+      input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0,
+      input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0,
+    ];
     padding = [
       [0, 0], // dont touch batch
       height, // height before&after
       width, // width before&after
       [0, 0], // dont touch rbg
     ];
-    if (box) {
-      t.resize = tf.image.cropAndResize(input, [box], [0], [size, size]);
-    } else {
-      t.pad = tf.pad(input, padding);
-      t.resize = tf.image.resizeBilinear(t.pad, [size, size]);
-    }
+    t.pad = tf.pad(t.cropped || input, padding); // use cropped box if it exists
+    t.resize = tf.image.resizeBilinear(t.pad, [size, size]);
     final = tf.div(t.resize, constants.tf255);
   } else if (input.shape[1] !== size) { // if input needs resizing
-    t.resize = tf.image.resizeBilinear(input, [size, size]);
+    t.resize = tf.image.resizeBilinear(t.cropped || input, [size, size]);
    final = tf.div(t.resize, constants.tf255);
  } else { // if input is already in a correct resolution just normalize it
-    final = tf.div(input, constants.tf255);
+    final = tf.div(t.cropped || input, constants.tf255);
  }
  Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
  return final;
 }
 
 function rescaleKeypoints(keypoints: Array<BodyKeypoint>, outputSize: [number, number]): Array<BodyKeypoint> {
-  for (const kpt of keypoints) {
+  for (const kpt of keypoints) { // first rescale due to padding
     kpt.position = [
       Math.trunc(kpt.position[0] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0] - padding[2][0]),
       Math.trunc(kpt.position[1] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1] - padding[1][0]),
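
For context, a standalone sketch (not part of the diff) of the tf.image.cropAndResize call used in prepareImage above: boxes are normalized [y1, x1, y2, x2] per batch index, so a unit-sized box simply repositions the frame, and coordinates outside 0..1 are filled by extrapolation. The import path is an assumption.

import * as tf from '@tensorflow/tfjs';

const frame = tf.zeros([1, 720, 1280, 3]) as tf.Tensor4D;
const cached = [0.1, 0.2, 1.1, 1.2]; // hypothetical cached box: shift down 10% and right 20%
const cropped = tf.image.cropAndResize(frame, [cached], [0], [720, 1280]); // back to original size
console.log(cropped.shape); // [1, 720, 1280, 3] - same shape, recentered content
tf.dispose([frame, cropped]);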
@@ -114,20 +106,21 @@ function rescaleKeypoints(keypoints: Array<BodyKeypoint>, outputSize: [number, n
     ];
     kpt.positionRaw = [kpt.position[0] / outputSize[0], kpt.position[1] / outputSize[1], kpt.position[2] as number];
   }
-  return keypoints;
-}
-
-function rescaleBoxes(boxes: Array<DetectedBox>, outputSize: [number, number]): Array<DetectedBox> {
-  for (const box of boxes) {
-    box.box = [
-      Math.trunc(box.box[0] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0]),
-      Math.trunc(box.box[1] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1]),
-      Math.trunc(box.box[2] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0]),
-      Math.trunc(box.box[3] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1]),
-    ];
-    box.boxRaw = [box.box[0] / outputSize[0], box.box[1] / outputSize[1], box.box[2] / outputSize[0], box.box[3] / outputSize[1]];
+  if (cropBox) { // second rescale due to cropping
+    for (const kpt of keypoints) {
+      kpt.positionRaw = [
+        kpt.positionRaw[0] + cropBox[1], // correct offset due to crop
+        kpt.positionRaw[1] + cropBox[0], // correct offset due to crop
+        kpt.positionRaw[2] as number,
+      ];
+      kpt.position = [
+        Math.trunc(kpt.positionRaw[0] * outputSize[0]),
+        Math.trunc(kpt.positionRaw[1] * outputSize[1]),
+        kpt.positionRaw[2] as number,
+      ];
+    }
   }
-  return boxes;
+  return keypoints;
 }
 
 async function detectLandmarks(input: Tensor, config: Config, outputSize: [number, number]): Promise<BodyResult | null> {
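
A worked example (not part of the diff) of the inverse-padding arithmetic in rescaleKeypoints, with toy numbers: a 1280x720 frame padded by 280px above and below becomes a 1280x1280 square, and a keypoint reported at the square's vertical center must map back to the frame's center.

const outputSize: [number, number] = [1280, 720]; // [width, height] of the original frame
const padding = [[0, 0], [280, 280], [0, 0], [0, 0]]; // [batch, height, width, channels]
const y = 360; // keypoint y in output space, here the vertical center
const rescaledY = Math.trunc(y * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1] - padding[1][0]);
console.log(rescaledY); // 360 * 1280 / 720 - 280 = 360, center stays centered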
@@ -155,22 +148,38 @@
   }
   if (poseScore < (config.body.minConfidence || 0)) return null;
   const keypoints: Array<BodyKeypoint> = rescaleKeypoints(keypointsRelative, outputSize); // keypoints were relative to input image which is padded
-  const boxes = calculateBoxes(keypoints, [outputSize[0], outputSize[1]]); // now find boxes based on rescaled keypoints
+  const kpts = keypoints.map((k) => k.position);
+  const boxes = box.calc(kpts, [outputSize[0], outputSize[1]]); // now find boxes based on rescaled keypoints
   const annotations: Record<string, Point[][]> = {};
   for (const [name, indexes] of Object.entries(coords.connected)) {
     const pt: Array<Point[]> = [];
     for (let i = 0; i < indexes.length - 1; i++) {
       const pt0 = keypoints.find((kpt) => kpt.part === indexes[i]);
       const pt1 = keypoints.find((kpt) => kpt.part === indexes[i + 1]);
       // if (pt0 && pt1 && pt0.score > (config.body.minConfidence || 0) && pt1.score > (config.body.minConfidence || 0)) pt.push([pt0.position, pt1.position]);
       if (pt0 && pt1) pt.push([pt0.position, pt1.position]);
     }
     annotations[name] = pt;
   }
-  const body = { id: 0, score: Math.trunc(100 * poseScore) / 100, box: boxes.keypointsBox, boxRaw: boxes.keypointsBoxRaw, keypoints, annotations };
+  const body = { id: 0, score: Math.trunc(100 * poseScore) / 100, box: boxes.box, boxRaw: boxes.boxRaw, keypoints, annotations };
   return body;
 }
 
+/*
+interface DetectedBox { box: Box, boxRaw: Box, score: number }
+
+function rescaleBoxes(boxes: Array<DetectedBox>, outputSize: [number, number]): Array<DetectedBox> {
+  for (const b of boxes) {
+    b.box = [
+      Math.trunc(b.box[0] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0]),
+      Math.trunc(b.box[1] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1]),
+      Math.trunc(b.box[2] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0]),
+      Math.trunc(b.box[3] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1]),
+    ];
+    b.boxRaw = [b.box[0] / outputSize[0], b.box[1] / outputSize[1], b.box[2] / outputSize[0], b.box[3] / outputSize[1]];
+  }
+  return boxes;
+}
+
 async function detectBoxes(input: Tensor, config: Config, outputSize: [number, number]) {
   const t: Record<string, Tensor> = {};
   t.res = models.detector?.execute(input, ['Identity']) as Tensor; //
@@ -183,6 +192,7 @@ async function detectBoxes(input: Tensor, config: Config, outputSize: [number, n
   Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
   return boxes;
 }
+*/
 
 export async function predict(input: Tensor, config: Config): Promise<BodyResult[]> {
   const outputSize: [number, number] = [input.shape[2] || 0, input.shape[1] || 0];
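
The box.calc helper itself is not shown in this diff; judging from the calculateBoxes implementation it replaces, it presumably computes an axis-aligned enclosing box in pixels plus the same box normalized to 0..1, roughly:

type Point = [number, number, number?];
type Box = [number, number, number, number]; // [x, y, width, height]

function calc(pts: Point[], outputSize: [number, number]): { box: Box, boxRaw: Box } {
  const x = pts.map((pt) => pt[0]);
  const y = pts.map((pt) => pt[1]);
  const box: Box = [Math.min(...x), Math.min(...y), Math.max(...x) - Math.min(...x), Math.max(...y) - Math.min(...y)];
  const boxRaw: Box = [box[0] / outputSize[0], box[1] / outputSize[1], box[2] / outputSize[0], box[3] / outputSize[1]];
  return { box, boxRaw };
}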
@@ -192,33 +202,31 @@ export async function predict(input: Tensor, config: Config): Promise<BodyResult
     skipped++;
   } else {
     const t: Record<string, Tensor> = {};
-    /*
-    if (config.body['detector'] && config.body['detector']['enabled']) {
-      t.detector = await prepareImage(input, 224);
-      const boxes = await detectBoxes(t.detector, config, outputSize);
-      if (boxes && boxes.length === 1) {
-        t.landmarks = await prepareImage(input, 256, boxes[0].box); // padded and resized according to detector
-        cache = await detectLandmarks(t.landmarks, config, outputSize);
-      }
-      if (cache) cache.score = boxes[0].score;
-    } else {
-      t.landmarks = await prepareImage(input, 256, lastBox); // padded and resized
-      cache = await detectLandmarks(t.landmarks, config, outputSize);
-      /*
-      lastBox = undefined;
-      if (cache?.box) {
-        const cx = cache.boxRaw[0] + (cache.boxRaw[2] / 2);
-        const cy = cache.boxRaw[1] + (cache.boxRaw[3] / 2);
-        let size = cache.boxRaw[2] > cache.boxRaw[3] ? cache.boxRaw[2] : cache.boxRaw[3];
-        size = (size * 1.2) / 2; // enlarge and half it
-        lastBox = [cx - size, cy - size, 2 * size, 2 * size];
-      }
-      */
-    }
-    */
+    t.landmarks = await prepareImage(input, 256); // padded and resized
+    cache = await detectLandmarks(t.landmarks, config, outputSize);
+    /*
+    cropBox = [0, 0, 1, 1]; // reset crop coordinates
+    if (cache?.boxRaw && config.skipAllowed) {
+      const cx = (2.0 * cache.boxRaw[0] + cache.boxRaw[2]) / 2;
+      const cy = (2.0 * cache.boxRaw[1] + cache.boxRaw[3]) / 2;
+      let size = cache.boxRaw[2] > cache.boxRaw[3] ? cache.boxRaw[2] : cache.boxRaw[3];
+      size = (size * 1.0) / 2; // enlarge and half it
+      if (cx > 0.1 && cx < 0.9 && cy > 0.1 && cy < 0.9 && size > 0.1) { // only update if box is sane
+        const y = 0; // cy - size;
+        const x = cx - size;
+        cropBox = [y, x, y + 1, x + 1]; // [y0,x0,y1,x1] used for cropping but width/height are not yet implemented so we only reposition image to center of body
+      }
+    }
+    */
     Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
+    // if (cache && boxes.length > 0) cache.box = boxes[0].box;
     lastTime = now();
     skipped = 0;
   }
-  if (cache) return [cache];
-  return [];
+  return cache ? [cache] : [];
 }
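
A worked example (not part of the diff) of the commented-out crop-box update with toy numbers: a body box at [0.2, 0.1, 0.6, 0.8] is centered at (0.5, 0.5) with half-size 0.4, so x becomes 0.1 and the unit-height crop box for the next frame would be [0, 0.1, 1, 1.1].

const boxRaw = [0.2, 0.1, 0.6, 0.8]; // hypothetical [x, y, width, height], normalized
const cx = (2.0 * boxRaw[0] + boxRaw[2]) / 2; // 0.5, box center x
const cy = (2.0 * boxRaw[1] + boxRaw[3]) / 2; // 0.5, box center y
let size = boxRaw[2] > boxRaw[3] ? boxRaw[2] : boxRaw[3]; // 0.8, larger dimension
size = (size * 1.0) / 2; // 0.4, half of it
const sane = cx > 0.1 && cx < 0.9 && cy > 0.1 && cy < 0.9 && size > 0.1; // true
const nextCrop = [0, cx - size, 1, cx - size + 1]; // [y0, x0, y1, x1] = [0, 0.1, 1, 1.1]
console.log({ sane, nextCrop });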