mirror of https://github.com/vladmandic/human
major precision improvements to movenet and handtrack
parent 6efedef077
commit 3925d6d426

15 CHANGELOG.md
@@ -9,11 +9,20 @@

## Changelog

### **HEAD -> main** 2021/10/10 mandic00@live.com

### **origin/main** 2021/10/08 mandic00@live.com

- demo default config cleanup
- improve gaze and face angle visualizations in draw

### **release 2.3.1** 2021/10/06 mandic00@live.com

### **2.3.1** 2021/10/06 mandic00@live.com

### **origin/main** 2021/10/06 mandic00@live.com

- workaround for chrome offscreencanvas bug
- fix backend conflict in webworker
- add blazepose v2 and add annotations to body results
- fix backend order initialization

@@ -31,15 +31,6 @@ import jsonView from './helpers/jsonview.js';
let human;

let userConfig = {
- cacheSensitivity: 0,
- hand: { enabled: true },
- body: { enabled: false },
- face: { enabled: false },
- /*
- hand: { enabled: false, maxDetected: 1, skipFrames: 0 },
- body: { enabled: false },
- face: { enabled: false },
- */
/*
warmup: 'none',
backend: 'humangl',

@@ -118,6 +109,7 @@ const ui = {
lastFrame: 0, // time of last frame processing
viewportSet: false, // internal, has custom viewport been set
background: null, // holds instance of segmentation background image
+ transferCanvas: null, // canvas used to transfer data to and from worker

// webrtc
useWebRTC: false, // use webrtc as camera source instead of local webcam

@@ -318,7 +310,7 @@ async function drawResults(input) {
const fps = avgDetect > 0 ? `FPS process:${avgDetect} refresh:${avgDraw}` : '';
const backend = result.backend || human.tf.getBackend();
const gpu = engine.backendInstance ? `gpu: ${(engine.backendInstance.numBytesInGPU ? engine.backendInstance.numBytesInGPU : 0).toLocaleString()} bytes` : '';
- const memory = result.tensors || `system: ${engine.state.numBytes.toLocaleString()} bytes ${gpu} | tensors: ${engine.state.numTensors.toLocaleString()}`;
+ const memory = result.tensors ? `tensors: ${result.tensors.toLocaleString()} in worker` : `system: ${engine.state.numBytes.toLocaleString()} bytes ${gpu} | tensors: ${engine.state.numTensors.toLocaleString()}`;
document.getElementById('log').innerHTML = `
video: ${ui.camera.name} | facing: ${ui.camera.facing} | screen: ${window.innerWidth} x ${window.innerHeight} camera: ${ui.camera.width} x ${ui.camera.height} ${processing}<br>
backend: ${backend} | ${memory}<br>

@@ -469,13 +461,17 @@ function webWorker(input, image, canvas, timestamp) {
if (document.getElementById('gl-bench')) document.getElementById('gl-bench').style.display = ui.bench ? 'block' : 'none';
lastDetectedResult = msg.data.result;

- if (msg.data.image) {
- lastDetectedResult.canvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(msg.data.width, msg.data.height) : document.createElement('canvas');
- lastDetectedResult.canvas.width = msg.data.width;
- lastDetectedResult.canvas.height = msg.data.height;
+ if (msg.data.image) { // we don't really need the canvas since we draw from video
+ /*
+ if (!lastDetectedResult.canvas || lastDetectedResult.canvas.width !== msg.data.width || lastDetectedResult.canvas.height !== msg.data.height) {
+ lastDetectedResult.canvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(msg.data.width, msg.data.height) : document.createElement('canvas');
+ lastDetectedResult.canvas.width = msg.data.width;
+ lastDetectedResult.canvas.height = msg.data.height;
+ }
const ctx = lastDetectedResult.canvas.getContext('2d');
const imageData = new ImageData(new Uint8ClampedArray(msg.data.image), msg.data.width, msg.data.height);
ctx.putImageData(imageData, 0, 0);
+ */
}

ui.framesDetect++;

@@ -508,10 +504,12 @@ function runHumanDetect(input, canvas, timestamp) {
if (ui.hintsThread) clearInterval(ui.hintsThread);
if (ui.useWorker && human.env.offscreen) {
// get image data from video as we cannot send html objects to webworker
- const offscreen = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(canvas.width, canvas.height) : document.createElement('canvas');
- offscreen.width = canvas.width;
- offscreen.height = canvas.height;
- const ctx = offscreen.getContext('2d');
+ if (!ui.transferCanvas || ui.transferCanvas.width !== canvas.width || ui.transferCanvas.height !== canvas.height) {
+ ui.transferCanvas = document.createElement('canvas');
+ ui.transferCanvas.width = canvas.width;
+ ui.transferCanvas.height = canvas.height;
+ }
+ const ctx = ui.transferCanvas.getContext('2d');
ctx.drawImage(input, 0, 0, canvas.width, canvas.height);
const data = ctx.getImageData(0, 0, canvas.width, canvas.height);
// perform detection in worker

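The rewritten block reuses one transfer canvas across frames instead of allocating a new OffscreenCanvas for every frame, so the only per-frame work is drawImage plus getImageData. A minimal sketch of the same pattern, assuming a `<video>` source and a hypothetical `worker` (names are illustrative, not the demo's exact API):

```ts
// Sketch: reuse a single canvas to turn video frames into transferable ImageData for a worker.
let transferCanvas: HTMLCanvasElement | null = null;

function sendFrame(video: HTMLVideoElement, worker: Worker): void {
  if (!transferCanvas || transferCanvas.width !== video.videoWidth || transferCanvas.height !== video.videoHeight) {
    transferCanvas = document.createElement('canvas'); // recreate only when the source size changes
    transferCanvas.width = video.videoWidth;
    transferCanvas.height = video.videoHeight;
  }
  const ctx = transferCanvas.getContext('2d') as CanvasRenderingContext2D;
  ctx.drawImage(video, 0, 0, transferCanvas.width, transferCanvas.height);
  const data = ctx.getImageData(0, 0, transferCanvas.width, transferCanvas.height);
  // transfer the pixel buffer instead of copying it; the main thread loses access to it afterwards
  worker.postMessage({ image: data.data.buffer, width: data.width, height: data.height }, [data.data.buffer]);
}
```
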
@@ -1,6 +1,6 @@
{
"name": "@vladmandic/human",
- "version": "2.3.1",
+ "version": "2.3.2",
"description": "Human: AI-powered 3D Face Detection & Rotation Tracking, Face Description & Recognition, Body Pose Tracking, 3D Hand & Finger Tracking, Iris Analysis, Age & Gender & Emotion Prediction, Gesture Recognition",
"sideEffects": false,
"main": "dist/human.node.js",

@@ -74,7 +74,6 @@
"canvas": "^2.8.0",
"dayjs": "^1.10.7",
"esbuild": "^0.13.4",
- "eslint": "^7.32.0",
"eslint-config-airbnb-base": "^14.2.1",
"eslint-plugin-import": "^2.24.2",
"eslint-plugin-json": "^3.1.0",

@@ -86,5 +85,8 @@
"tslib": "^2.3.1",
"typedoc": "0.22.5",
"typescript": "4.4.3"
},
+ "dependencies": {
+ "eslint": "7.32.0"
+ }
}

@@ -420,12 +420,12 @@ const config: Config = {
rotation: true, // use best-guess rotated hand image or just box with rotation as-is
// false means higher performance, but incorrect finger mapping if hand is inverted
// only valid for `handdetect` variation
- skipFrames: 1, // how many max frames to go without re-running the hand bounding box detector
+ skipFrames: 2, // how many max frames to go without re-running the hand bounding box detector
// only used when cacheSensitivity is not zero
// e.g., if model is running at 25 FPS, we can re-use existing bounding
// box for updated hand skeleton analysis as the hand
// hasn't moved much in a short time (10 * 1/25 = 0.4 sec)
- minConfidence: 0.55, // threshold for discarding a prediction
+ minConfidence: 0.50, // threshold for discarding a prediction
iouThreshold: 0.2, // amount of overlap between two detected objects before one object is removed
maxDetected: -1, // maximum number of hands detected in the input
// should be set to the minimum number for performance

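For context, these options are passed when constructing a Human instance; the option names above come straight from the diff, while the surrounding values below are illustrative only. A minimal sketch, assuming the package's default export (as the test files in this commit use it):

```ts
import Human from '@vladmandic/human';

const human = new Human({
  modelBasePath: 'https://vladmandic.github.io/human/models/',
  hand: {
    enabled: true,
    maxDetected: 1,      // keep this at the minimum needed for best performance
    skipFrames: 2,       // reuse the cached bounding box for up to 2 frames
    minConfidence: 0.50, // discard hand predictions scoring below this threshold
    iouThreshold: 0.2,   // overlap above which one of two detections is dropped
  },
});

async function run(video: HTMLVideoElement) {
  const result = await human.detect(video); // video, image, canvas and tensor inputs are accepted
  console.log(result.hand);
}
```
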
@@ -83,7 +83,7 @@ export async function predict(input: Tensor, config: Config): Promise<FaceResult
((box.startPoint[1] + box.endPoint[1])) / 2 + ((box.endPoint[1] + box.startPoint[1]) * pt[1] / blazeface.size()),
]);
face.meshRaw = face.mesh.map((pt) => [pt[0] / (input.shape[2] || 0), pt[1] / (input.shape[1] || 0), (pt[2] || 0) / inputSize]);
- for (const key of Object.keys(coords.blazeFaceLandmarks)) face.annotations[key] = [face.mesh[coords.blazeFaceLandmarks[key]]]; // add annotations
+ for (const key of Object.keys(coords.blazeFaceLandmarks)) face.annotations[key] = [face.mesh[coords.blazeFaceLandmarks[key] as number]]; // add annotations
} else if (!model) { // mesh enabled, but not loaded
if (config.debug) log('face mesh detection requested, but model is not loaded');
} else { // mesh enabled

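The `meshRaw` line in this hunk keeps a resolution-independent copy of the mesh: x and y are divided by the input tensor's width and height (NHWC shape indices 2 and 1), and z by the model input size. A small sketch of that normalization (function name is illustrative):

```ts
type Point = [number, number, number];

// Normalize mesh keypoints: x,y relative to the input image, z relative to the model input size.
function normalizeMesh(mesh: Point[], inputShape: number[], modelInputSize: number): Point[] {
  const [, height, width] = inputShape; // NHWC layout: [1, height, width, 3]
  return mesh.map((pt) => [pt[0] / width, pt[1] / height, (pt[2] || 0) / modelInputSize]);
}
```
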
@@ -3,7 +3,7 @@
* See `facemesh.ts` for entry point
*/

- export const meshAnnotations = {
+ export const meshAnnotations: Record<string, number[]> = {
silhouette: [
10, 338, 297, 332, 284, 251, 389, 356, 454, 323, 361, 288,
397, 365, 379, 378, 400, 377, 152, 148, 176, 149, 150, 136,

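The only change in this and the following hunks is adding explicit types to the exported lookup tables. Typing them as `Record<...>` (or as typed arrays) lets callers index them with a computed string key under strict TypeScript settings, which is what `facemesh.ts` does above with `coords.blazeFaceLandmarks[key]`. A small illustrative snippet (not from the repo):

```ts
// Without an index signature, table[key] with a runtime string key is an implicit-any error under strict mode.
const annotations: Record<string, number[]> = { leftCheek: [425], rightCheek: [205] };

function pickPoints(mesh: [number, number, number][], key: string): [number, number, number][] {
  return (annotations[key] || []).map((idx) => mesh[idx]); // computed-key access is now well typed
}
```
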
@@ -42,13 +42,13 @@ export const meshAnnotations = {
leftCheek: [425],
};

- export const meshLandmarks = {
+ export const meshLandmarks: Record<string, number | number[]> = {
count: 468,
mouth: 13,
symmetryLine: [13, meshAnnotations['midwayBetweenEyes'][0]],
};

- export const blazeFaceLandmarks = {
+ export const blazeFaceLandmarks: Record<string, number | number[]> = {
leftEye: 0,
rightEye: 1,
nose: 2,

@@ -58,7 +58,7 @@ export const blazeFaceLandmarks = {
symmetryLine: [3, 2],
};

- export const MESH_TO_IRIS_INDICES_MAP = [ // A mapping from facemesh model keypoints to iris model keypoints.
+ export const MESH_TO_IRIS_INDICES_MAP: Array<{ key: string, indices: number[] }> = [ // A mapping from facemesh model keypoints to iris model keypoints.
{ key: 'EyeUpper0', indices: [9, 10, 11, 12, 13, 14, 15] },
{ key: 'EyeUpper1', indices: [25, 26, 27, 28, 29, 30, 31] },
{ key: 'EyeUpper2', indices: [41, 42, 43, 44, 45, 46, 47] },

@@ -70,7 +70,7 @@ export const MESH_TO_IRIS_INDICES_MAP = [ // A mapping from facemesh model keypo
// { key: 'EyebrowLower', indices: [48, 49, 50, 51, 52, 53] },
];

- export const UV468 = [
+ export const UV468: [number, number][] = [
[0.499976992607117, 0.652534008026123],
[0.500025987625122, 0.547487020492554],
[0.499974012374878, 0.602371990680695],

@@ -541,7 +541,7 @@ export const UV468 = [
[0.723330020904541, 0.363372981548309],
];

- export const TRI468 = [
+ export const TRI468: Array<number> = [
127, 34, 139, 11, 0, 37, 232, 231, 120, 72, 37, 39, 128, 121, 47, 232, 121, 128, 104, 69, 67, 175, 171, 148, 157, 154, 155, 118, 50, 101, 73, 39, 40, 9,
151, 108, 48, 115, 131, 194, 204, 211, 74, 40, 185, 80, 42, 183, 40, 92, 186, 230, 229, 118, 202, 212, 214, 83, 18, 17, 76, 61, 146, 160, 29, 30, 56,
157, 173, 106, 204, 194, 135, 214, 192, 203, 165, 98, 21, 71, 68, 51, 45, 4, 144, 24, 23, 77, 146, 91, 205, 50, 187, 201, 200, 18, 91, 106, 182, 90, 91,

@@ -627,7 +627,7 @@ export const TRI468 = [
259, 443, 259, 260, 444, 260, 467, 445, 309, 459, 250, 305, 289, 290, 305, 290, 460, 401, 376, 435, 309, 250, 392, 376, 411, 433, 453, 341, 464, 357,
453, 465, 343, 357, 412, 437, 343, 399, 344, 360, 440, 420, 437, 456, 360, 420, 363, 361, 401, 288, 265, 372, 353, 390, 339, 249, 339, 448, 255];

- export const TRI68 = [0, 1, 36, 0, 36, 17, 1, 2, 41, 1, 41, 36, 2, 3, 31, 2, 31, 41, 3, 4, 48, 3, 48, 31, 4, 5, 48, 5, 6, 48, 6, 7, 59, 6, 59, 48, 7, 8, 58, 7, 58, 59,
+ export const TRI68: Array<number> = [0, 1, 36, 0, 36, 17, 1, 2, 41, 1, 41, 36, 2, 3, 31, 2, 31, 41, 3, 4, 48, 3, 48, 31, 4, 5, 48, 5, 6, 48, 6, 7, 59, 6, 59, 48, 7, 8, 58, 7, 58, 59,
8, 9, 56, 8, 56, 57, 8, 57, 58, 9, 10, 55, 9, 55, 56, 10, 11, 54, 10, 54, 55, 11, 12, 54, 12, 13, 54, 13, 14, 35, 13, 35, 54, 14, 15, 46, 14, 46, 35, 15, 16,
45, 15, 45, 46, 16, 26, 45, 17, 36, 18, 18, 37, 19, 18, 36, 37, 19, 38, 20, 19, 37, 38, 20, 39, 21, 20, 38, 39, 21, 39, 27, 22, 42, 23, 22, 27, 42, 23, 43, 24,
23, 42, 43, 24, 44, 25, 24, 43, 44, 25, 45, 26, 25, 44, 45, 27, 39, 28, 27, 28, 42, 28, 39, 29, 28, 29, 42, 29, 31, 30, 29, 30, 35, 29, 40, 31, 29, 35, 47, 29,

@@ -636,7 +636,7 @@ export const TRI68 = [0, 1, 36, 0, 36, 17, 1, 2, 41, 1, 41, 36, 2, 3, 31, 2, 31,
48, 59, 60, 49, 61, 50, 49, 60, 61, 50, 62, 51, 50, 61, 62, 51, 62, 52, 52, 63, 53, 52, 62, 63, 53, 64, 54, 53, 63, 64, 54, 64, 55, 55, 65, 56, 55, 64, 65, 56,
66, 57, 56, 65, 66, 57, 66, 58, 58, 67, 59, 58, 66, 67, 59, 67, 60, 60, 67, 61, 61, 66, 62, 61, 67, 66, 62, 66, 63, 63, 65, 64, 63, 66, 65, 21, 27, 22];

- export const TRI33 = [
+ export const TRI33: Array<number> = [
/* eyes */ 0, 8, 7, 7, 8, 1, 2, 10, 9, 9, 10, 3,
/* brows */ 17, 0, 18, 18, 0, 7, 18, 7, 19, 19, 7, 1, 19, 1, 11, 19, 11, 20, 21, 3, 22, 21, 9, 3, 20, 9, 21, 20, 2, 9, 20, 11, 2,
/* 4head */ 23, 17, 18, 25, 21, 22, 24, 19, 20, 24, 18, 19, 24, 20, 21, 24, 23, 18, 24, 21, 25,

@@ -647,9 +647,9 @@ export const TRI33 = [
/* cont */ 26, 30, 5, 27, 6, 31, 0, 28, 26, 3, 27, 29, 17, 28, 0, 3, 29, 22, 23, 28, 17, 22, 29, 25, 28, 30, 26, 27, 31, 29,
];

- export const TRI7 = [0, 4, 1, 2, 4, 3, 4, 5, 6];
+ export const TRI7: Array<number> = [0, 4, 1, 2, 4, 3, 4, 5, 6];

- export const VTX68 = [
+ export const VTX68: Array<number> = [
/* cont */ 127, 234, 132, 58, 172, 150, 149, 148, 152, 377, 378, 379, 397, 288, 361, 454, 356,
/* brows */ 70, 63, 105, 66, 107, 336, 296, 334, 293, 300,
/* nose */ 168, 6, 195, 4, 98, 97, 2, 326, 327,

@@ -658,9 +658,9 @@ export const VTX68 = [
/* mouth */ 78, 81, 13, 311, 308, 402, 14, 178,
];

- export const VTX33 = [33, 133, 362, 263, 1, 62, 308, 159, 145, 386, 374, 6, 102, 331, 2, 13, 14, 70, 105, 107, 336, 334, 300, 54, 10, 284, 50, 280, 234, 454, 58, 288, 152];
+ export const VTX33: Array<number> = [33, 133, 362, 263, 1, 62, 308, 159, 145, 386, 374, 6, 102, 331, 2, 13, 14, 70, 105, 107, 336, 334, 300, 54, 10, 284, 50, 280, 234, 454, 58, 288, 152];

- export const VTX7 = [33, 133, 362, 263, 1, 78, 308];
+ export const VTX7: Array<number> = [33, 133, 362, 263, 1, 78, 308];

export const UV68 = VTX68.map((x) => UV468[x]);

@@ -152,8 +152,8 @@ export function transformRawCoords(rawCoords, box, angle, rotationMatrix, inputS
}

export function correctFaceRotation(box, input, inputSize) {
- const [indexOfMouth, indexOfForehead] = (box.landmarks.length >= coords.meshLandmarks.count) ? coords.meshLandmarks.symmetryLine : coords.blazeFaceLandmarks.symmetryLine;
- const angle: number = computeRotation(box.landmarks[indexOfMouth], box.landmarks[indexOfForehead]);
+ const symmetryLine = (box.landmarks.length >= coords.meshLandmarks.count) ? coords.meshLandmarks.symmetryLine : coords.blazeFaceLandmarks.symmetryLine;
+ const angle: number = computeRotation(box.landmarks[symmetryLine[0]], box.landmarks[symmetryLine[1]]);
const faceCenter: Point = getBoxCenter({ startPoint: box.startPoint, endPoint: box.endPoint });
const faceCenterNormalized: Point = [faceCenter[0] / input.shape[2], faceCenter[1] / input.shape[1]];
const rotated = tf.image.rotateWithOffset(input, angle, 0, faceCenterNormalized); // rotateWithOffset is not defined for tfjs-node

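The rewrite indexes the symmetry-line pair directly instead of destructuring it into mouth/forehead names, which sidesteps the typing problem when the pair comes from a `Record<string, number | number[]>`. The roll angle itself is just the angle of the line between those two landmarks; a minimal sketch of one way to compute it (an assumed formulation, not the repo's `computeRotation`):

```ts
type Point = [number, number, number?];

// Roll angle of the mouth-to-forehead line: 0 when the face is upright,
// positive when the forehead leans to the right of the mouth (image y axis points down).
function rollAngle(mouth: Point, forehead: Point): number {
  const dx = forehead[0] - mouth[0];
  const dy = forehead[1] - mouth[1];
  return Math.atan2(dx, -dy); // radians
}
```
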
@@ -22,10 +22,11 @@ const modelOutputNodes = ['StatefulPartitionedCall/Postprocessor/Slice', 'Statef
const inputSize = [[0, 0], [0, 0]];

const classes = ['hand', 'fist', 'pinch', 'point', 'face', 'tip', 'pinchtip'];
+ const faceIndex = 4;

const boxExpandFact = 1.6;
const maxDetectorResolution = 512;
- const detectorExpandFact = 1.2;
+ const detectorExpandFact = 1.4;

let skipped = 0;
let outputSize: [number, number] = [0, 0];

@@ -104,10 +105,11 @@ async function detectHands(input: Tensor, config: Config): Promise<HandDetectRes
[t.rawScores, t.rawBoxes] = await models[0].executeAsync(t.cast, modelOutputNodes) as Tensor[];
t.boxes = tf.squeeze(t.rawBoxes, [0, 2]);
t.scores = tf.squeeze(t.rawScores, [0]);
- const classScores = tf.unstack(t.scores, 1); // unstack scores based on classes
- classScores.splice(4, 1); // remove faces
+ const classScores: Array<Tensor> = tf.unstack(t.scores, 1); // unstack scores based on classes
+ tf.dispose(classScores[faceIndex]);
+ classScores.splice(faceIndex, 1); // remove faces
t.filtered = tf.stack(classScores, 1); // restack
- tf.dispose(...classScores);
+ tf.dispose(classScores);
t.max = tf.max(t.filtered, 1); // max overall score
t.argmax = tf.argMax(t.filtered, 1); // class index of max overall score
let id = 0;

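The new lines type the unstacked per-class tensors, dispose of the dropped face-class tensor explicitly, and pass the remaining array to `tf.dispose` directly rather than spreading it. A self-contained sketch of the same filtering pattern (shapes and names are illustrative):

```ts
import * as tf from '@tensorflow/tfjs';

// scores: [numBoxes, numClasses]; drop one class column, then compute per-box best score and class.
function dropClass(scores: tf.Tensor2D, dropIndex: number) {
  const perClass: tf.Tensor[] = tf.unstack(scores, 1); // one [numBoxes] tensor per class
  tf.dispose(perClass[dropIndex]);                     // release the column we are about to drop
  perClass.splice(dropIndex, 1);
  const filtered = tf.stack(perClass, 1);              // [numBoxes, numClasses - 1]
  tf.dispose(perClass);                                // tf.dispose accepts an array of tensors
  const max = tf.max(filtered, 1);                     // best remaining score per box
  const argmax = tf.argMax(filtered, 1);               // index of the winning class per box
  return { filtered, max, argmax };
}
```
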
@@ -117,12 +119,13 @@
const classNum = await t.argmax.data();
for (const nmsIndex of Array.from(nms)) { // generates results for each class
const boxSlice = tf.slice(t.boxes, nmsIndex, 1);
- const boxData = await boxSlice.data();
+ const boxYX = await boxSlice.data();
tf.dispose(boxSlice);
- const boxSquareSize = Math.max(boxData[3] - boxData[1], boxData[2] - boxData[0]);
- const boxRaw: Box = box.scale([boxData[1], boxData[0], boxSquareSize, boxSquareSize], detectorExpandFact); // for raw box we use squared and expanded box
+ // const boxSquareSize = Math.max(boxData[3] - boxData[1], boxData[2] - boxData[0]);
+ const boxData: Box = [boxYX[1], boxYX[0], boxYX[3] - boxYX[1], boxYX[2] - boxYX[0]]; // yx box reshaped to standard box
+ const boxRaw: Box = box.scale(boxData, detectorExpandFact);
const boxCrop: Box = box.crop(boxRaw); // crop box is based on raw box
- const boxFull: Box = [Math.trunc(boxData[1] * outputSize[0]), Math.trunc(boxData[0] * outputSize[1]), Math.trunc((boxData[3] - boxData[1]) * outputSize[0]), Math.trunc((boxData[2] - boxData[0]) * outputSize[1])]; // for box we keep original scaled values
+ const boxFull: Box = [Math.trunc(boxData[0] * outputSize[0]), Math.trunc(boxData[1] * outputSize[1]), Math.trunc(boxData[2] * outputSize[0]), Math.trunc(boxData[3] * outputSize[1])];
const score = scores[nmsIndex];
const label = classes[classNum[nmsIndex]];
const hand: HandDetectResult = { id: id++, score, box: boxFull, boxRaw, boxCrop, label };

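The detector emits normalized boxes in [y1, x1, y2, x2] order; the rewritten lines reshape that into a standard [x, y, width, height] box once and reuse it for both the raw and the pixel-space box. A standalone sketch of that conversion (the module's `box.scale`/`box.crop` helpers are not reproduced here):

```ts
type Box = [number, number, number, number]; // [x, y, width, height]

// Reshape a detector box from normalized [y1, x1, y2, x2] into normalized [x, y, width, height].
function yxToBox(yx: Float32Array | number[]): Box {
  return [yx[1], yx[0], yx[3] - yx[1], yx[2] - yx[0]];
}

// Scale a normalized box into pixel coordinates of the output image (width, height).
function toPixels(b: Box, outputSize: [number, number]): Box {
  return [
    Math.trunc(b[0] * outputSize[0]),
    Math.trunc(b[1] * outputSize[1]),
    Math.trunc(b[2] * outputSize[0]),
    Math.trunc(b[3] * outputSize[1]),
  ];
}
```
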
@@ -161,10 +164,9 @@ async function detectFingers(input: Tensor, h: HandDetectResult, config: Config)
const coordsData: Point[] = await t.reshaped.array() as Point[];
const coordsRaw: Point[] = coordsData.map((kpt) => [kpt[0] / inputSize[1][1], kpt[1] / inputSize[1][0], (kpt[2] || 0)]);
const coordsNorm: Point[] = coordsRaw.map((kpt) => [kpt[0] * h.boxRaw[2], kpt[1] * h.boxRaw[3], (kpt[2] || 0)]);
- console.log(outputSize, h.box);
hand.keypoints = (coordsNorm).map((kpt) => [
- outputSize[0] * kpt[0] + h.box[0],
- outputSize[1] * kpt[1] + h.box[1],
+ outputSize[0] * (kpt[0] + h.boxRaw[0]),
+ outputSize[1] * (kpt[1] + h.boxRaw[1]),
(kpt[2] || 0),
]);
// hand.box = box.scale(h.box, 1 / detectorExpandFact); // scale box down for visual appeal

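This looks like the handtrack part of the precision improvement in the commit message: a keypoint normalized to the crop is now offset by the raw (normalized) box origin before being scaled to output pixels, instead of being scaled first and shifted by the pixel-space box. A sketch of the corrected mapping (names are illustrative):

```ts
type Point = [number, number, number];
type Box = [number, number, number, number]; // normalized [x, y, width, height]

// kpt is normalized to the expanded crop; boxRaw is that crop in image-normalized units.
function cropToImage(kpt: Point, boxRaw: Box, outputSize: [number, number]): Point {
  return [
    outputSize[0] * (kpt[0] * boxRaw[2] + boxRaw[0]), // into crop space, offset by crop origin, then to pixels
    outputSize[1] * (kpt[1] * boxRaw[3] + boxRaw[1]),
    kpt[2] || 0,
  ];
}
```
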
@@ -179,13 +181,11 @@ async function detectFingers(input: Tensor, h: HandDetectResult, config: Config)
return hand;
}

- let n = 0;
export async function predict(input: Tensor, config: Config): Promise<HandResult[]> {
- n++;
/** handtrack caching
* 1. if skipFrame, return cached results
- * 2. if there are cached results, even if not sure they are enough, continue anyhow for 10x skipFrames
- * 3. eventually rerun detector to generate new cached boxes and reset skipped
+ * 2. if there are cached results, even if not sure they are enough, continue anyhow for 5x skipFrames
+ * 3. if not skipFrame, or eventually, rerun detector to generate new cached boxes and reset skipped
* 4. generate cached boxes based on detected keypoints
*/
if (!models[0] || !models[1] || !models[0]?.inputs[0].shape || !models[1]?.inputs[0].shape) return []; // something is wrong with the model

@@ -193,34 +193,14 @@ export async function predict(input: Tensor, config: Config): Promise<HandResult

skipped++; // increment skip frames
if (config.skipFrame && (skipped <= (config.hand.skipFrames || 0))) {
- console.log(n, 'SKIP', { results: cache.hands.length });
return cache.hands; // return cached results without running anything
}
return new Promise(async (resolve) => {
- console.log(n, 'DETECT', { skipped, hands: cache.hands.length, boxes: cache.boxes.length });
- // this is logically consistent but insufficiently efficient
- /*
- skipped = 0;
- if (cache.boxes.length >= (config.hand.maxDetected || 0)) {
- cache.hands = await Promise.all(cache.boxes.map((handBox) => detectFingers(input, handBox, config))); // if we have enough cached boxes run detection using cache
- } else {
- cache.hands = []; // reset hands
- }

- if (cache.hands.length !== config.hand.maxDetected) { // did not find enough hands based on cached boxes so run detection on full frame
- cache.boxes = await detectHands(input, config);
- if (config.skipFrame && skipped < 5 * (config.hand.skipFrames || 0) && cache.hands.length > 0) { // we have some cached results, not sure if they are enough, so we continue a bit longer
- cache.hands = await Promise.all(cache.boxes.map((handBox) => detectFingers(input, handBox, config)));
- }
- */

if (config.skipFrame && skipped <= 10 * (config.hand.skipFrames || 0) && cache.hands.length > 0) { // we have some cached results, not sure if they are enough, so we continue for up to 10x skipFrames
cache.hands = await Promise.all(cache.boxes.map((handBox) => detectFingers(input, handBox, config)));
- console.log(n, 'HANDS', { hands: cache.hands.length });
} else {
cache.boxes = await detectHands(input, config);
- console.log(n, 'BOXES', { hands: cache.boxes.length });
cache.hands = await Promise.all(cache.boxes.map((handBox) => detectFingers(input, handBox, config)));
- console.log(n, 'HANDS', { hands: cache.hands.length });
skipped = 0;
}

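After the cleanup, the predict body keeps a three-tier caching strategy: return cached results while within skipFrames, rerun only the finger model on cached boxes for up to 10x skipFrames, and otherwise rerun the full detector and reset the counter. A condensed, self-contained sketch of that control flow; the module's real state and model calls are stood in by declarations here:

```ts
type Box = [number, number, number, number];
type DetectBox = { box: Box };
type Hand = { keypoints: number[][] };

let skipped = 0;
const cache: { boxes: DetectBox[], hands: Hand[] } = { boxes: [], hands: [] };

// stand-ins for the module's detector and finger/landmark models
declare function detectHands(input: unknown, config: unknown): Promise<DetectBox[]>;
declare function detectFingers(input: unknown, box: DetectBox, config: unknown): Promise<Hand>;

async function predictSketch(input: unknown, config: { skipFrame: boolean, hand: { skipFrames: number } }): Promise<Hand[]> {
  skipped++;
  if (config.skipFrame && skipped <= config.hand.skipFrames) return cache.hands; // tier 1: reuse results as-is
  if (config.skipFrame && skipped <= 10 * config.hand.skipFrames && cache.hands.length > 0) {
    // tier 2: cached boxes are stale but usable, rerun only the finger model on them
    cache.hands = await Promise.all(cache.boxes.map((b) => detectFingers(input, b, config)));
  } else {
    // tier 3: rerun the full detector, then the finger model, and reset the counter
    cache.boxes = await detectHands(input, config);
    cache.hands = await Promise.all(cache.boxes.map((b) => detectFingers(input, b, config)));
    skipped = 0;
  }
  return cache.hands;
}
```
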
@@ -236,7 +216,6 @@ export async function predict(input: Tensor, config: Config): Promise<HandResult
cache.boxes.push({ ...oldCache[i], box: boxScale, boxRaw: boxScaleRaw, boxCrop });
}
}
- console.log(n, 'CACHED', { hands: cache.boxes.length });
}
resolve(cache.hands);
});

@@ -15,6 +15,7 @@ const maxSize = 2048;
// internal temp canvases
let inCanvas: HTMLCanvasElement | OffscreenCanvas | null = null; // use global variable to avoid recreating canvas on each frame
let outCanvas: HTMLCanvasElement | OffscreenCanvas | null = null; // use global variable to avoid recreating canvas on each frame
+ let tmpCanvas: HTMLCanvasElement | OffscreenCanvas | null = null; // use global variable to avoid recreating canvas on each frame
// @ts-ignore // imagefx is js module that should be converted to a class
let fx: fxImage.GLImageFilter | null; // instance of imagefx

@@ -72,9 +73,13 @@ export function process(input: Input, config: Config, getTensor: boolean = true)
}
if (input instanceof tf.Tensor) {
// if input is tensor, use as-is
- if ((input)['isDisposedInternal']) throw new Error('input tensor is disposed');
- else if (!input.shape || input.shape.length !== 4 || input.shape[0] !== 1 || input.shape[3] !== 3) throw new Error(`input tensor shape must be [1, height, width, 3] and instead was ${input.shape}`);
- else return { tensor: tf.clone(input), canvas: (config.filter.return ? outCanvas : null) };
+ if ((input)['isDisposedInternal']) {
+ throw new Error('input tensor is disposed');
+ } else if (!(input as Tensor).shape || (input as Tensor).shape.length !== 4 || (input as Tensor).shape[0] !== 1 || (input as Tensor).shape[3] !== 3) {
+ throw new Error(`input tensor shape must be [1, height, width, 3] and instead was ${input['shape']}`);
+ } else {
+ return { tensor: tf.clone(input), canvas: (config.filter.return ? outCanvas : null) };
+ }
} else {
// check if resizing will be needed
if (typeof input['readyState'] !== 'undefined' && input['readyState'] <= 2) {

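The restructured branch makes the accepted tensor contract explicit: one RGB image in NHWC layout, i.e. shape [1, height, width, 3], and not already disposed. A minimal sketch of the same check, assuming tfjs:

```ts
import * as tf from '@tensorflow/tfjs';

// Accept only a [1, height, width, 3] tensor and hand back a clone the caller owns.
function validateInput(input: tf.Tensor): tf.Tensor {
  if (input.isDisposed) throw new Error('input tensor is disposed');
  if (input.shape.length !== 4 || input.shape[0] !== 1 || input.shape[3] !== 3) {
    throw new Error(`input tensor shape must be [1, height, width, 3] and instead was ${input.shape}`);
  }
  return tf.clone(input); // original tensor stays untouched
}
```
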
@@ -114,10 +119,10 @@ export function process(input: Input, config: Config, getTensor: boolean = true)
if (config.filter.flip && typeof inCtx.translate !== 'undefined') {
inCtx.translate(originalWidth, 0);
inCtx.scale(-1, 1);
- inCtx.drawImage(input as CanvasImageSource, 0, 0, originalWidth, originalHeight, 0, 0, inCanvas?.width, inCanvas?.height);
+ inCtx.drawImage(input as OffscreenCanvas, 0, 0, originalWidth, originalHeight, 0, 0, inCanvas?.width, inCanvas?.height);
inCtx.setTransform(1, 0, 0, 1, 0, 0); // resets transforms to defaults
} else {
- inCtx.drawImage(input as CanvasImageSource, 0, 0, originalWidth, originalHeight, 0, 0, inCanvas?.width, inCanvas?.height);
+ inCtx.drawImage(input as OffscreenCanvas, 0, 0, originalWidth, originalHeight, 0, 0, inCanvas?.width, inCanvas?.height);
}
}

@@ -160,23 +165,24 @@ export function process(input: Input, config: Config, getTensor: boolean = true)
pixels = tf.browser ? tf.browser.fromPixels(input) : null;
} else {
depth = input['data'].length / input['height'] / input['width'];
- // const arr = Uint8Array.from(input['data']);
- const arr = new Uint8Array(input['data']['buffer']);
+ const arr = Uint8Array.from(input['data']);
+ // const arr = new Uint8Array(input['data']['buffer']);
pixels = tf.tensor(arr, [input['height'], input['width'], depth], 'float32');
}
} else {
+ if (!tmpCanvas || (outCanvas.width !== tmpCanvas.width) || (outCanvas?.height !== tmpCanvas?.height)) tmpCanvas = canvas(outCanvas.width, outCanvas.height); // init output canvas
if (tf.browser && env.browser) {
if (config.backend === 'webgl' || config.backend === 'humangl' || config.backend === 'webgpu') {
pixels = tf.browser.fromPixels(outCanvas); // safe to reuse since both backend and context are gl based
} else {
- const tempCanvas = copy(outCanvas); // cannot use output canvas as it already has gl context so we need one more temporary canvas
- pixels = tf.browser.fromPixels(tempCanvas);
+ tmpCanvas = copy(outCanvas); // cannot use output canvas as it already has gl context so we need one more temporary canvas
+ pixels = tf.browser.fromPixels(tmpCanvas);
}
} else {
const tempCanvas = copy(outCanvas); // cannot use output canvas as it already has gl context so we need one more temporary canvas
const tempCtx = tempCanvas.getContext('2d') as CanvasRenderingContext2D;
const tempData = tempCtx.getImageData(0, 0, targetWidth, targetHeight);
- depth = input['data'].length / targetWidth / targetHeight;
+ depth = tempData.data.length / targetWidth / targetHeight;
const arr = new Uint8Array(tempData.data.buffer);
pixels = tf.tensor(arr, [targetWidth, targetHeight, depth]);
}

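When `tf.browser.fromPixels` cannot be used on the processed canvas (non-browser or non-GL situations, as handled above), the pixels have to be read through a 2D context and wrapped into a tensor manually. A minimal sketch of that path, assuming tfjs and a canvas that exposes a 2D context:

```ts
import * as tf from '@tensorflow/tfjs';

// Read pixels from a 2D canvas and build an RGBA tensor of shape [height, width, 4].
function canvasToTensor(source: HTMLCanvasElement): tf.Tensor3D {
  const ctx = source.getContext('2d') as CanvasRenderingContext2D;
  const imageData = ctx.getImageData(0, 0, source.width, source.height);
  const depth = imageData.data.length / source.width / source.height; // 4 for RGBA
  const arr = new Uint8Array(imageData.data.buffer); // zero-copy view over the pixel buffer
  return tf.tensor3d(arr, [source.height, source.width, depth], 'int32');
}
```
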
@@ -192,7 +192,7 @@ async function test(Human, inputConfig) {
else log('state', 'passed: warmup face result match');
config.warmup = 'body';
res = await testWarmup(human, 'default');
- if (!res || res?.face?.length !== 1 || res?.body?.length !== 0 || res?.hand?.length !== 1 || res?.gesture?.length !== 4) log('error', 'failed: warmup body result mismatch', res?.face?.length, res?.body?.length, res?.hand?.length, res?.gesture?.length);
+ if (!res || res?.face?.length !== 1 || res?.body?.length !== 1 || res?.hand?.length !== 1 || res?.gesture?.length !== 5) log('error', 'failed: warmup body result mismatch', res?.face?.length, res?.body?.length, res?.hand?.length, res?.gesture?.length);
else log('state', 'passed: warmup body result match');

// test default config async

@@ -3,6 +3,7 @@ const Human = require('../dist/human.node-gpu.js').default;
const test = require('./test-main.js').test;

const config = {
+ cacheSensitivity: 0,
modelBasePath: 'file://models/',
backend: 'tensorflow',
debug: false,

@@ -10,6 +10,7 @@ Human.env.Canvas = Canvas; // requires monkey-patch as wasm does not have tf.bro
Human.env.Image = Image; // requires monkey-patch as wasm does not have tf.browser namespace

const config = {
+ cacheSensitivity: 0,
modelBasePath: 'https://vladmandic.github.io/human/models/',
// modelBasePath: 'http://localhost:10030/models/',
backend: 'wasm',

@@ -3,6 +3,7 @@ const Human = require('../dist/human.node.js').default;
const test = require('./test-main.js').test;

const config = {
+ cacheSensitivity: 0,
modelBasePath: 'file://models/',
backend: 'tensorflow',
debug: false,