From d94aa0362ca78d370775efe374038fe6319c5def Mon Sep 17 00:00:00 2001
From: Vladimir Mandic
Date: Wed, 4 Nov 2020 10:18:22 -0500
Subject: [PATCH] implemented simple gesture recognition

---
 README.md                    |  18 +++++-
 config.js                    |   3 +
 demo/browser.js              |  34 ++++++++---
 demo/index.html              |   2 +-
 package.json                 |   2 +-
 src/gesture.js               |  45 ++++++++++++++
 src/handpose/handpipeline.js |   4 +-
 src/human.js                 | 115 +++++++++--------------------
 src/image.js                 |  79 ++++++++++++++++++++++++
 9 files changed, 202 insertions(+), 100 deletions(-)
 create mode 100644 src/gesture.js
 create mode 100644 src/image.js

diff --git a/README.md b/README.md
index 003e8e6e..419a498e 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Human Library

-## 3D Face Detection, Body Pose, Hand & Finger Tracking, Iris Tracking, Age & Gender Prediction & Emotion Prediction
+## 3D Face Detection, Body Pose, Hand & Finger Tracking, Iris Tracking, Age & Gender Prediction, Emotion Prediction & Gesture Recognition

 - [**Documentation**](https://github.com/vladmandic/human#readme)
 - [**Code Repository**](https://github.com/vladmandic/human)
@@ -361,6 +361,11 @@ config = {
       modelPath: '../models/handskeleton/model.json',
     },
   },
+  gesture: {
+    enabled: true, // enable simple gesture recognition
+    // takes processed data and based on geometry detects simple gestures
+    // easily expandable via code, see `src/gesture.js`
+  },
 };
 ```

@@ -408,10 +413,17 @@ result = {
       emotion, // 'angry', 'discust', 'fear', 'happy', 'sad', 'surpise', 'neutral'
     }
   ],
+  gesture: // object containing parsed gestures
+  {
+    face, // array of face gestures, e.g. 'facing left'
+    body, // array of body gestures, e.g. 'raise left hand'
+    hand, // array of hand gestures, e.g. 'indexfinger forward thumb up'
+  },
   performance: { // performance data of last execution for each module, measured in milliseconds
     backend, // time to initialize tf backend, valid only during backend startup
     load, // time to load models, valid only during model load
     image, // time for image processing
+    gesture, // gesture analysis time
     body, // model time
     hand, // model time
     face, // model time
@@ -484,6 +496,7 @@ For example, it can perform multiple face detections at 60+ FPS, but drops to ~1

 - Enabled all: 15 FPS
 - Image filters: 80 FPS (standalone)
+- Gesture: 80 FPS (standalone)
 - Face Detect: 80 FPS (standalone)
 - Face Geometry: 30 FPS (includes face detect)
 - Face Iris: 30 FPS (includes face detect and face geometry)
@@ -495,8 +508,9 @@ For example, it can perform multiple face detections at 60+ FPS, but drops to ~1

 ### Performance per module on a **smartphone** with Snapdragon 855 on a FullHD input:

-- Enabled all: 3 FPS
+- Enabled all: 5 FPS
 - Image filters: 30 FPS (standalone)
+- Gesture: 30 FPS (standalone)
 - Face Detect: 20 FPS (standalone)
 - Face Geometry: 10 FPS (includes face detect)
 - Face Iris: 5 FPS (includes face detect and face geometry)
diff --git a/config.js b/config.js
index faa09e72..0f181fc0 100644
--- a/config.js
+++ b/config.js
@@ -41,6 +41,9 @@ export default {
     polaroid: false, // image polaroid camera effect
     pixelate: 0, // range: 0 (no pixelate) to N (number of pixels to pixelate)
   },
+  gesture: {
+    enabled: true, // enable simple gesture recognition
+  },
   face: {
     enabled: true, // controls if specified module is enabled
     // face.enabled is required for all face models: detector, mesh, iris, age, gender, emotion
diff --git a/demo/browser.js b/demo/browser.js
index cdcd1b4f..10a51731 100644
--- a/demo/browser.js
+++ b/demo/browser.js
@@ -25,6 +25,8 @@ const ui = {
   useDepth: true,
   console: true,
   maxFrames: 10,
+  modelsPreload: true,
+  modelsWarmup: true,
 };

 // configuration overrides
@@ -62,6 +64,7 @@ const config = {
   },
   body: {
enabled: true, maxDetections: 10, scoreThreshold: 0.7, nmsRadius: 20 }, hand: { enabled: true, skipFrames: 10, minConfidence: 0.5, iouThreshold: 0.3, scoreThreshold: 0.7 }, + gesture: { enabled: true }, }; // global variables @@ -123,15 +126,17 @@ function drawResults(input, result, canvas) { draw.face(result.face, canvas, ui, human.facemesh.triangulation); draw.body(result.body, canvas, ui); draw.hand(result.hand, canvas, ui); + draw.gesture(result.gesture, canvas, ui); // update log const engine = human.tf.engine(); const gpu = engine.backendInstance ? `gpu: ${(engine.backendInstance.numBytesInGPU ? engine.backendInstance.numBytesInGPU : 0).toLocaleString()} bytes` : ''; const memory = `system: ${engine.state.numBytes.toLocaleString()} bytes ${gpu} | tensors: ${engine.state.numTensors.toLocaleString()}`; const processing = result.canvas ? `processing: ${result.canvas.width} x ${result.canvas.height}` : ''; + const avg = Math.trunc(10 * fps.reduce((a, b) => a + b) / fps.length) / 10; document.getElementById('log').innerText = ` video: ${camera.name} | facing: ${camera.facing} | resolution: ${camera.width} x ${camera.height} ${processing} backend: ${human.tf.getBackend()} | ${memory} | object size: ${(str(result)).length.toLocaleString()} bytes - performance: ${str(result.performance)} + performance: ${str(result.performance)} FPS:${avg} `; } @@ -159,7 +164,13 @@ async function setupCamera() { try { stream = await navigator.mediaDevices.getUserMedia({ audio: false, - video: { facingMode: (ui.facing ? 'user' : 'environment'), width: window.innerWidth, height: window.innerHeight, resizeMode: 'none' }, + video: { + facingMode: (ui.facing ? 'user' : 'environment'), + width: window.innerWidth, + height: window.innerHeight, + resizeMode: 'none', + contrast: 75, + }, }); } catch (err) { output.innerText += '\nCamera permission denied'; @@ -267,7 +278,7 @@ async function detectVideo() { document.getElementById('canvas').style.display = 'block'; const video = document.getElementById('video'); const canvas = document.getElementById('canvas'); - ui.baseFont = ui.baseFontProto.replace(/{size}/, '1.2rem'); + ui.baseFont = ui.baseFontProto.replace(/{size}/, '1.3rem'); ui.baseLineHeight = ui.baseLineHeightProto; if ((video.srcObject !== null) && !video.paused) { document.getElementById('play').style.display = 'block'; @@ -286,7 +297,7 @@ async function detectVideo() { async function detectSampleImages() { document.getElementById('play').style.display = 'none'; config.videoOptimized = false; - ui.baseFont = ui.baseFontProto.replace(/{size}/, `${1.2 * ui.columns}rem`); + ui.baseFont = ui.baseFontProto.replace(/{size}/, `${1.3 * ui.columns}rem`); ui.baseLineHeight = ui.baseLineHeightProto * ui.columns; document.getElementById('canvas').style.display = 'none'; document.getElementById('samples-container').style.display = 'block'; @@ -318,6 +329,7 @@ function setupMenu() { menu.addBool('Face Emotion', config.face.emotion, 'enabled'); menu.addBool('Body Pose', config.body, 'enabled'); menu.addBool('Hand Pose', config.hand, 'enabled'); + menu.addBool('Gesture Analysis', config.gesture, 'enabled'); menu.addHTML('
'); menu.addLabel('Model Parameters'); @@ -383,11 +395,15 @@ async function main() { setupMenu(); document.getElementById('log').innerText = `Human: version ${human.version} TensorFlow/JS: version ${human.tf.version_core}`; // this is not required, just pre-warms the library - status('loading'); - await human.load(); - status('initializing'); - const warmup = new ImageData(50, 50); - await human.detect(warmup); + if (ui.modelsPreload) { + status('loading'); + await human.load(); + } + if (ui.modelsWarmup) { + status('initializing'); + const warmup = new ImageData(50, 50); + await human.detect(warmup); + } status('human: ready'); document.getElementById('loader').style.display = 'none'; document.getElementById('play').style.display = 'block'; diff --git a/demo/index.html b/demo/index.html index 69a14bbe..0315a2ec 100644 --- a/demo/index.html +++ b/demo/index.html @@ -27,7 +27,7 @@ .status { position: absolute; width: 100vw; top: 100px; text-align: center; font-size: 4rem; font-weight: 100; text-shadow: 2px 2px darkslategrey; } .thumbnail { margin: 8px; box-shadow: 0 0 4px 4px dimgrey; } .thumbnail:hover { box-shadow: 0 0 8px 8px dimgrey; filter: grayscale(1); } - .log { position: fixed; bottom: 0; } + .log { position: fixed; bottom: 0; margin: 0.4rem; } .samples-container { display: flex; flex-wrap: wrap; } .video { display: none; } .canvas { margin: 0 auto; width: 100%; } diff --git a/package.json b/package.json index cd20b9c9..220cf0f4 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "@vladmandic/human", "version": "0.6.6", - "description": "human: 3D Face Detection, Iris Tracking and Age & Gender Prediction", + "description": "human: 3D Face Detection, Body Pose, Hand & Finger Tracking, Iris Tracking, Age & Gender Prediction, Emotion Prediction & Gesture Recognition", "sideEffects": false, "main": "dist/human.node.js", "module": "dist/human.esm.js", diff --git a/src/gesture.js b/src/gesture.js new file mode 100644 index 00000000..5531975b --- /dev/null +++ b/src/gesture.js @@ -0,0 +1,45 @@ +exports.body = (res) => { + if (!res) return []; + const gestures = []; + for (const pose of res) { + // raising hands + const leftWrist = pose.keypoints.find((a) => (a.part === 'leftWrist')); + const rightWrist = pose.keypoints.find((a) => (a.part === 'rightWrist')); + const nose = pose.keypoints.find((a) => (a.part === 'nose')); + if (nose && leftWrist && rightWrist && (leftWrist.position.y < nose.position.y) && (rightWrist.position.y < nose.position.y)) gestures.push('i give up'); + else if (nose && leftWrist && (leftWrist.position.y < nose.position.y)) gestures.push('raise left hand'); + else if (nose && rightWrist && (rightWrist.position.y < nose.position.y)) gestures.push('raise right hand'); + + // leaning + const leftShoulder = pose.keypoints.find((a) => (a.part === 'leftShoulder')); + const rightShoulder = pose.keypoints.find((a) => (a.part === 'rightShoulder')); + if (leftShoulder && rightShoulder) gestures.push(`leaning ${(leftShoulder.position.y > rightShoulder.position.y) ? 'left' : 'right'}`); + } + return gestures; +}; + +exports.face = (res) => { + if (!res) return []; + const gestures = []; + for (const face of res) { + if (face.annotations['rightCheek'] && face.annotations['leftCheek'] && (face.annotations['rightCheek'].length > 0) && (face.annotations['leftCheek'].length > 0)) { + gestures.push(`facing ${((face.annotations['rightCheek'][0][2] > 0) || (face.annotations['leftCheek'][0][2] < 0)) ? 
'right' : 'left'}`);
+    }
+  }
+  return gestures;
+};
+
+exports.hand = (res) => {
+  if (!res) return [];
+  const gestures = [];
+  for (const hand of res) {
+    const fingers = [];
+    for (const [finger, pos] of Object.entries(hand['annotations'])) {
+      if (finger !== 'palmBase') fingers.push({ name: finger.toLowerCase(), position: pos[0] }); // get tip of each finger
+    }
+    const closest = fingers.reduce((best, a) => (best.position[2] < a.position[2] ? best : a));
+    const highest = fingers.reduce((best, a) => (best.position[1] < a.position[1] ? best : a));
+    gestures.push(`${closest.name} forward ${highest.name} up`);
+  }
+  return gestures;
+};
diff --git a/src/handpose/handpipeline.js b/src/handpose/handpipeline.js
index 90617376..146d040d 100644
--- a/src/handpose/handpipeline.js
+++ b/src/handpose/handpipeline.js
@@ -22,8 +22,8 @@ const util = require('./util');
 const UPDATE_REGION_OF_INTEREST_IOU_THRESHOLD = 0.8;
 const PALM_BOX_SHIFT_VECTOR = [0, -0.4];
 const PALM_BOX_ENLARGE_FACTOR = 3;
-const HAND_BOX_SHIFT_VECTOR = [0, -0.1];
-const HAND_BOX_ENLARGE_FACTOR = 1.65;
+const HAND_BOX_SHIFT_VECTOR = [0, -0.1]; // move detected hand box by x,y to ease landmark detection
+const HAND_BOX_ENLARGE_FACTOR = 1.65; // enlarge hand box by this factor for landmark detection; model default is 1.65
 const PALM_LANDMARK_IDS = [0, 5, 9, 13, 17, 1, 2];
 const PALM_LANDMARKS_INDEX_OF_PALM_BASE = 0;
 const PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE = 2;
diff --git a/src/human.js b/src/human.js
index 6fd0d409..8e97df57 100644
--- a/src/human.js
+++ b/src/human.js
@@ -4,7 +4,8 @@ const ssrnet = require('./ssrnet/ssrnet.js');
 const emotion = require('./emotion/emotion.js');
 const posenet = require('./posenet/posenet.js');
 const handpose = require('./handpose/handpose.js');
-const fxImage = require('./imagefx.js');
+const gesture = require('./gesture.js');
+const image = require('./image.js');
 const profile = require('./profile.js');
 const defaults = require('../config.js').default;
 const app = require('../package.json');
@@ -52,9 +53,6 @@ class Human {
     this.analyzeMemoryLeaks = false;
     this.checkSanity = false;
     this.firstRun = true;
-    // internal temp canvases
-    this.inCanvas = null;
-    this.outCanvas = null;
     // object that contains all initialized models
     this.models = {
       facemesh: null,
@@ -94,6 +92,7 @@ class Human {
     if (leaked !== 0) this.log(...msg, leaked);
   }

+  // quick sanity check on inputs
   sanity(input) {
     if (!this.checkSanity) return null;
     if (!input) return 'input is not defined';
@@ -108,10 +107,12 @@
     return null;
   }

+  // preload models, not explicitly required as it's done automatically on first use
   async load(userConfig) {
     if (userConfig) this.config = mergeDeep(defaults, userConfig);

     if (this.firstRun) {
+      this.checkBackend(true);
       this.log(`version: ${this.version} TensorFlow/JS version: ${tf.version_core}`);
       this.log('configuration:', this.config);
       this.log('flags:', tf.ENV.flags);
@@ -144,8 +145,9 @@
     }
   }

-  async checkBackend() {
-    if (tf.getBackend() !== this.config.backend) {
+  // check if backend needs initialization if it changed
+  async checkBackend(force) {
+    if (force || (tf.getBackend() !== this.config.backend)) {
       this.state = 'backend';
       /* force backend reload
       if (this.config.backend in tf.engine().registry) {
         const backendFactory = tf.findBackendFactory(this.config.backend);
         tf.removeBackend(this.config.backend);
         tf.registerBackend(this.config.backend, backendFactory);
       } else {
         this.log('Backend not registered:', this.config.backend);
       }
       */
-      this.log('Setting backend:', this.config.backend);
+      this.log('setting backend:', this.config.backend);
       await tf.setBackend(this.config.backend);
       tf.enableProdMode();
       /* debug mode is really too much @@
-167,84 +169,20 @@ class Human { this.log('Changing WebGL: WEBGL_DELETE_TEXTURE_THRESHOLD:', this.config.deallocate); tf.ENV.set('WEBGL_DELETE_TEXTURE_THRESHOLD', this.config.deallocate ? 0 : -1); } - tf.ENV.set('WEBGL_CPU_FORWARD', true); + // tf.ENV.set('WEBGL_CPU_FORWARD', true); + // tf.ENV.set('WEBGL_FORCE_F16_TEXTURES', true); + // tf.ENV.set('WEBGL_PACK_DEPTHWISECONV', true); await tf.ready(); } } - tfImage(input) { - let tensor; - if (input instanceof tf.Tensor) { - tensor = tf.clone(input); - } else { - const originalWidth = input.naturalWidth || input.videoWidth || input.width || (input.shape && (input.shape[1] > 0)); - const originalHeight = input.naturalHeight || input.videoHeight || input.height || (input.shape && (input.shape[2] > 0)); - let targetWidth = originalWidth; - let targetHeight = originalHeight; - if (this.config.filter.width > 0) targetWidth = this.config.filter.width; - else if (this.config.filter.height > 0) targetWidth = originalWidth * (this.config.filter.height / originalHeight); - if (this.config.filter.height > 0) targetHeight = this.config.filter.height; - else if (this.config.filter.width > 0) targetHeight = originalHeight * (this.config.filter.width / originalWidth); - if (!this.inCanvas || (this.inCanvas.width !== targetWidth) || (this.inCanvas.height !== targetHeight)) { - this.inCanvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(targetWidth, targetHeight) : document.createElement('canvas'); - if (this.inCanvas.width !== targetWidth) this.inCanvas.width = targetWidth; - if (this.inCanvas.height !== targetHeight) this.inCanvas.height = targetHeight; - } - const ctx = this.inCanvas.getContext('2d'); - if (input instanceof ImageData) ctx.putImageData(input, 0, 0); - else ctx.drawImage(input, 0, 0, originalWidth, originalHeight, 0, 0, this.inCanvas.width, this.inCanvas.height); - if (this.config.filter.enabled) { - if (!this.fx || !this.outCanvas || (this.inCanvas.width !== this.outCanvas.width) || (this.inCanvas.height !== this.outCanvas.height)) { - this.outCanvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(this.inCanvas.width, this.inCanvas.height) : document.createElement('canvas'); - if (this.outCanvas.width !== this.inCanvas.width) this.outCanvas.width = this.inCanvas.width; - if (this.outCanvas.height !== this.inCanvas.height) this.outCanvas.height = this.inCanvas.height; - this.fx = (tf.ENV.flags.IS_BROWSER && (typeof document !== 'undefined')) ? 
new fxImage.Canvas({ canvas: this.outCanvas }) : null; - } - this.fx.reset(); - this.fx.addFilter('brightness', this.config.filter.brightness); // must have at least one filter enabled - if (this.config.filter.contrast !== 0) this.fx.addFilter('contrast', this.config.filter.contrast); - if (this.config.filter.sharpness !== 0) this.fx.addFilter('sharpen', this.config.filter.sharpness); - if (this.config.filter.blur !== 0) this.fx.addFilter('blur', this.config.filter.blur); - if (this.config.filter.saturation !== 0) this.fx.addFilter('saturation', this.config.filter.saturation); - if (this.config.filter.hue !== 0) this.fx.addFilter('hue', this.config.filter.hue); - if (this.config.filter.negative) this.fx.addFilter('negative'); - if (this.config.filter.sepia) this.fx.addFilter('sepia'); - if (this.config.filter.vintage) this.fx.addFilter('brownie'); - if (this.config.filter.sepia) this.fx.addFilter('sepia'); - if (this.config.filter.kodachrome) this.fx.addFilter('kodachrome'); - if (this.config.filter.technicolor) this.fx.addFilter('technicolor'); - if (this.config.filter.polaroid) this.fx.addFilter('polaroid'); - if (this.config.filter.pixelate !== 0) this.fx.addFilter('pixelate', this.config.filter.pixelate); - this.fx.apply(this.inCanvas); - } - if (!this.outCanvas) this.outCanvas = this.inCanvas; - let pixels; - if ((this.config.backend === 'webgl') || (this.outCanvas instanceof ImageData)) { - // tf kernel-optimized method to get imagedata, also if input is imagedata, just use it - pixels = tf.browser.fromPixels(this.outCanvas); - } else { - // cpu and wasm kernel does not implement efficient fromPixels method nor we can use canvas as-is, so we do a silly one more canvas - const tempCanvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(targetWidth, targetHeight) : document.createElement('canvas'); - tempCanvas.width = targetWidth; - tempCanvas.height = targetHeight; - const tempCtx = tempCanvas.getContext('2d'); - tempCtx.drawImage(this.outCanvas, 0, 0); - const data = tempCtx.getImageData(0, 0, targetWidth, targetHeight); - pixels = tf.browser.fromPixels(data); - } - const casted = pixels.toFloat(); - tensor = casted.expandDims(0); - pixels.dispose(); - casted.dispose(); - } - return { tensor, canvas: this.config.filter.return ? 
this.outCanvas : null };
-  }
+  // main detect function
   async detect(input, userConfig = {}) {
     this.state = 'config';
     const perf = {};
     let timeStamp;

+    // update configuration
     this.config = mergeDeep(defaults, userConfig);
     if (!this.config.videoOptimized) this.config = mergeDeep(this.config, override);
@@ -256,6 +194,7 @@ class Human {
       return { error };
     }

+    // detection happens inside a promise
     // eslint-disable-next-line no-async-promise-executor
     return new Promise(async (resolve) => {
       let poseRes;
@@ -281,9 +220,8 @@ class Human {
       this.analyze('Start Detect:');

       timeStamp = now();
-      const image = this.tfImage(input);
+      const process = image.process(input, this.config);
       perf.image = Math.trunc(now() - timeStamp);
-      const imageTensor = image.tensor;

       // run facemesh, includes blazeface and iris
       const faceRes = [];
@@ -291,7 +229,7 @@ class Human {
         this.state = 'run:face';
         timeStamp = now();
         this.analyze('Start FaceMesh:');
-        const faces = await this.models.facemesh.estimateFaces(imageTensor, this.config.face);
+        const faces = await this.models.facemesh.estimateFaces(process.tensor, this.config.face);
         perf.face = Math.trunc(now() - timeStamp);
         for (const face of faces) {
           // if something went wrong, skip the face
@@ -334,38 +272,45 @@ class Human {
       // run posenet
       if (this.config.async) {
-        poseRes = this.config.body.enabled ? this.models.posenet.estimatePoses(imageTensor, this.config.body) : [];
+        poseRes = this.config.body.enabled ? this.models.posenet.estimatePoses(process.tensor, this.config.body) : [];
       } else {
         this.state = 'run:body';
         timeStamp = now();
         this.analyze('Start PoseNet');
-        poseRes = this.config.body.enabled ? await this.models.posenet.estimatePoses(imageTensor, this.config.body) : [];
+        poseRes = this.config.body.enabled ? await this.models.posenet.estimatePoses(process.tensor, this.config.body) : [];
         this.analyze('End PoseNet:');
         perf.body = Math.trunc(now() - timeStamp);
       }

       // run handpose
       if (this.config.async) {
-        handRes = this.config.hand.enabled ? this.models.handpose.estimateHands(imageTensor, this.config.hand) : [];
+        handRes = this.config.hand.enabled ? this.models.handpose.estimateHands(process.tensor, this.config.hand) : [];
       } else {
         this.state = 'run:hand';
         timeStamp = now();
         this.analyze('Start HandPose:');
-        handRes = this.config.hand.enabled ? await this.models.handpose.estimateHands(imageTensor, this.config.hand) : [];
+        handRes = this.config.hand.enabled ?
await this.models.handpose.estimateHands(process.tensor, this.config.hand) : []; this.analyze('End HandPose:'); perf.hand = Math.trunc(now() - timeStamp); } if (this.config.async) [poseRes, handRes] = await Promise.all([poseRes, handRes]); - imageTensor.dispose(); + process.tensor.dispose(); this.state = 'idle'; if (this.config.scoped) tf.engine().endScope(); this.analyze('End Scope:'); + let gestureRes = []; + if (this.config.gesture.enabled) { + timeStamp = now(); + gestureRes = { body: gesture.body(poseRes), hand: gesture.hand(handRes), face: gesture.face(faceRes) }; + perf.gesture = Math.trunc(now() - timeStamp); + } + perf.total = Math.trunc(now() - timeStart); - resolve({ face: faceRes, body: poseRes, hand: handRes, performance: perf, canvas: image.canvas }); + resolve({ face: faceRes, body: poseRes, hand: handRes, gesture: gestureRes, performance: perf, canvas: process.canvas }); }); } } diff --git a/src/image.js b/src/image.js new file mode 100644 index 00000000..364916f8 --- /dev/null +++ b/src/image.js @@ -0,0 +1,79 @@ +const tf = require('@tensorflow/tfjs'); +const fxImage = require('./imagefx.js'); + +// internal temp canvases +let inCanvas = null; +let outCanvas = null; + +// process input image and return tensor +// input can be tensor, imagedata, htmlimageelement, htmlvideoelement +// input is resized and run through imagefx filter +function process(input, config) { + let tensor; + if (input instanceof tf.Tensor) { + tensor = tf.clone(input); + } else { + const originalWidth = input.naturalWidth || input.videoWidth || input.width || (input.shape && (input.shape[1] > 0)); + const originalHeight = input.naturalHeight || input.videoHeight || input.height || (input.shape && (input.shape[2] > 0)); + let targetWidth = originalWidth; + let targetHeight = originalHeight; + if (config.filter.width > 0) targetWidth = config.filter.width; + else if (config.filter.height > 0) targetWidth = originalWidth * (config.filter.height / originalHeight); + if (config.filter.height > 0) targetHeight = config.filter.height; + else if (config.filter.width > 0) targetHeight = originalHeight * (config.filter.width / originalWidth); + if (!inCanvas || (inCanvas.width !== targetWidth) || (inCanvas.height !== targetHeight)) { + inCanvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(targetWidth, targetHeight) : document.createElement('canvas'); + if (inCanvas.width !== targetWidth) inCanvas.width = targetWidth; + if (inCanvas.height !== targetHeight) inCanvas.height = targetHeight; + } + const ctx = inCanvas.getContext('2d'); + if (input instanceof ImageData) ctx.putImageData(input, 0, 0); + else ctx.drawImage(input, 0, 0, originalWidth, originalHeight, 0, 0, inCanvas.width, inCanvas.height); + if (config.filter.enabled) { + if (!this.fx || !outCanvas || (inCanvas.width !== outCanvas.width) || (inCanvas.height !== outCanvas.height)) { + outCanvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(inCanvas.width, inCanvas.height) : document.createElement('canvas'); + if (outCanvas.width !== inCanvas.width) outCanvas.width = inCanvas.width; + if (outCanvas.height !== inCanvas.height) outCanvas.height = inCanvas.height; + this.fx = (tf.ENV.flags.IS_BROWSER && (typeof document !== 'undefined')) ? 
new fxImage.Canvas({ canvas: outCanvas }) : null;
+      }
+      this.fx.reset();
+      this.fx.addFilter('brightness', config.filter.brightness); // must have at least one filter enabled
+      if (config.filter.contrast !== 0) this.fx.addFilter('contrast', config.filter.contrast);
+      if (config.filter.sharpness !== 0) this.fx.addFilter('sharpen', config.filter.sharpness);
+      if (config.filter.blur !== 0) this.fx.addFilter('blur', config.filter.blur);
+      if (config.filter.saturation !== 0) this.fx.addFilter('saturation', config.filter.saturation);
+      if (config.filter.hue !== 0) this.fx.addFilter('hue', config.filter.hue);
+      if (config.filter.negative) this.fx.addFilter('negative');
+      if (config.filter.sepia) this.fx.addFilter('sepia');
+      if (config.filter.vintage) this.fx.addFilter('brownie');
+      if (config.filter.kodachrome) this.fx.addFilter('kodachrome');
+      if (config.filter.technicolor) this.fx.addFilter('technicolor');
+      if (config.filter.polaroid) this.fx.addFilter('polaroid');
+      if (config.filter.pixelate !== 0) this.fx.addFilter('pixelate', config.filter.pixelate);
+      this.fx.apply(inCanvas);
+    }
+    if (!outCanvas) outCanvas = inCanvas;
+    let pixels;
+    if ((config.backend === 'webgl') || (outCanvas instanceof ImageData)) {
+      // tf kernel-optimized method to get image data; if data is already ImageData, use it directly
+      pixels = tf.browser.fromPixels(outCanvas);
+    } else {
+      // cpu and wasm kernels do not implement an efficient fromPixels method and cannot use the canvas as-is, so draw it through one more temporary canvas
+      const tempCanvas = (typeof OffscreenCanvas !== 'undefined') ? new OffscreenCanvas(targetWidth, targetHeight) : document.createElement('canvas');
+      tempCanvas.width = targetWidth;
+      tempCanvas.height = targetHeight;
+      const tempCtx = tempCanvas.getContext('2d');
+      tempCtx.drawImage(outCanvas, 0, 0);
+      const data = tempCtx.getImageData(0, 0, targetWidth, targetHeight);
+      pixels = tf.browser.fromPixels(data);
+    }
+    const casted = pixels.toFloat();
+    tensor = casted.expandDims(0);
+    pixels.dispose();
+    casted.dispose();
+  }
+  return { tensor, canvas: config.filter.return ? outCanvas : null };
+}
+
+exports.process = process;
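Note on the resize logic in the new `image.process()` above: when only one of `config.filter.width` / `config.filter.height` is set, the other dimension follows the input aspect ratio. A minimal sketch of that arithmetic, not part of the patch, using a hypothetical 1920x1080 input for illustration:

```js
// sketch of the target-size calculation performed inside image.process()
// assumes a hypothetical 1920x1080 input and config.filter = { width: 0, height: 720 }
const originalWidth = 1920;
const originalHeight = 1080;
const filter = { width: 0, height: 720 };
let targetWidth = originalWidth;
let targetHeight = originalHeight;
if (filter.width > 0) targetWidth = filter.width;
else if (filter.height > 0) targetWidth = originalWidth * (filter.height / originalHeight); // 1920 * (720 / 1080) = 1280
if (filter.height > 0) targetHeight = filter.height;
else if (filter.width > 0) targetHeight = originalHeight * (filter.width / originalWidth);
console.log(targetWidth, targetHeight); // 1280 720
```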
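For reference, a minimal usage sketch (not part of the patch) showing how the new gesture results surface through `detect()`; it assumes `human` is an initialized library instance and `video` is a playing video element, set up as in `demo/browser.js`:

```js
// minimal sketch: enable gesture recognition and read the parsed gestures
// `human` and `video` are assumed to exist; gesture.enabled already defaults to true in config.js
async function logGestures() {
  const result = await human.detect(video, { gesture: { enabled: true } });
  // each category is an array of plain-text gesture descriptions
  for (const g of result.gesture.face) console.log('face:', g); // e.g. 'facing left'
  for (const g of result.gesture.body) console.log('body:', g); // e.g. 'raise left hand' or 'leaning right'
  for (const g of result.gesture.hand) console.log('hand:', g); // e.g. 'indexfinger forward thumb up'
  console.log('gesture analysis time (ms):', result.performance.gesture);
}
```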
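The config comment above notes that gestures are "easily expandable via code". As an illustration only, a hypothetical extra rule could be appended inside the existing loop of `exports.face` in `src/gesture.js`, reusing the same `[x, y, z]` cheek landmarks; the pixel threshold and the left/right naming are assumptions, not part of the patch:

```js
// hypothetical addition inside the for-loop of exports.face in src/gesture.js
// pushes a 'head tilted' gesture when one cheek landmark sits noticeably higher (smaller y) than the other
const left = face.annotations['leftCheek'];
const right = face.annotations['rightCheek'];
if (left && right && (left.length > 0) && (right.length > 0)) {
  const tilt = left[0][1] - right[0][1]; // y difference between cheek landmarks, in pixels
  if (Math.abs(tilt) > 10) gestures.push(`head tilted ${tilt < 0 ? 'left' : 'right'}`); // 10px threshold is arbitrary
}
```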