diff --git a/README.md b/README.md
index ad254f8f..a739f0b6 100644
--- a/README.md
+++ b/README.md
@@ -268,7 +268,6 @@ config = {
       inputSize: 64,        // fixed value
       minConfidence: 0.5,   // threshold for discarding a prediction
       skipFrames: 10,       // how many frames to go without re-running the detector, only used for video inputs
-      useGrayscale: true,   // convert image to grayscale before prediction or use highest channel
       modelPath: '../models/emotion/model.json',
     },
   },
diff --git a/config.js b/config.js
index 505d942a..a4fe4057 100644
--- a/config.js
+++ b/config.js
@@ -51,7 +51,6 @@ export default {
       inputSize: 64,        // fixed value
       minConfidence: 0.5,   // threshold for discarding a prediction
       skipFrames: 10,       // how many frames to go without re-running the detector
-      useGrayscale: true,   // convert image to grayscale before prediction or use highest channel
       modelPath: '../models/emotion/model.json',
     },
   },
diff --git a/src/emotion/emotion.js b/src/emotion/emotion.js
index 89d36ca9..3c47c43c 100644
--- a/src/emotion/emotion.js
+++ b/src/emotion/emotion.js
@@ -6,16 +6,6 @@ let last = [];
 let frame = 0;
 const multiplier = 1.5;
 
-function getImage(image, size) {
-  const tensor = tf.tidy(() => {
-    const buffer = tf.browser.fromPixels(image, 1);
-    const resize = tf.image.resizeBilinear(buffer, [size, size]);
-    const expand = tf.cast(tf.expandDims(resize, 0), 'float32');
-    return expand;
-  });
-  return tensor;
-}
-
 async function load(config) {
   if (!models.emotion) models.emotion = await tf.loadGraphModel(config.face.emotion.modelPath);
   return models.emotion;
@@ -27,25 +17,23 @@ async function predict(image, config) {
     return last;
   }
   frame = 0;
-  const enhance = tf.tidy(() => {
-    if (image instanceof tf.Tensor) {
-      const resize = tf.image.resizeBilinear(image, [config.face.emotion.inputSize, config.face.emotion.inputSize], false);
-      const [r, g, b] = tf.split(resize, 3, 3);
-      if (config.face.emotion.useGrayscale) {
-        // weighted rgb to grayscale: https://www.mathworks.com/help/matlab/ref/rgb2gray.html
-        const r1 = tf.mul(r, [0.2989]);
-        const g1 = tf.mul(g, [0.5870]);
-        const b1 = tf.mul(b, [0.1140]);
-        const grayscale = tf.addN([r1, g1, b1]);
-        return grayscale;
-      }
-      return g;
-    }
-    return getImage(image, config.face.emotion.inputSize);
-  });
+  const resize = tf.image.resizeBilinear(image, [config.face.emotion.inputSize, config.face.emotion.inputSize], false);
+  const [red, green, blue] = tf.split(resize, 3, 3);
+  resize.dispose();
+  // weighted rgb to grayscale: https://www.mathworks.com/help/matlab/ref/rgb2gray.html
+  const redNorm = tf.mul(red, [0.2989]);
+  const greenNorm = tf.mul(green, [0.5870]);
+  const blueNorm = tf.mul(blue, [0.1140]);
+  red.dispose();
+  green.dispose();
+  blue.dispose();
+  const grayscale = tf.addN([redNorm, greenNorm, blueNorm]);
+  redNorm.dispose();
+  greenNorm.dispose();
+  blueNorm.dispose();
   const obj = [];
   if (config.face.emotion.enabled) {
-    const emotionT = await models.emotion.predict(enhance);
+    const emotionT = await models.emotion.predict(grayscale);
     const data = await emotionT.data();
     for (let i = 0; i < data.length; i++) {
       if (multiplier * data[i] > config.face.emotion.minConfidence) obj.push({ score: Math.min(0.99, Math.trunc(100 * multiplier * data[i]) / 100), emotion: annotations[i] });
@@ -53,7 +41,7 @@ async function predict(image, config) {
     obj.sort((a, b) => b.score - a.score);
     tf.dispose(emotionT);
   }
-  tf.dispose(enhance);
+  tf.dispose(grayscale);
   last = obj;
   return obj;
 }
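For reference, the rewritten predict() in src/emotion/emotion.js keeps the same weighted rgb-to-grayscale math as the removed useGrayscale path, just with explicit dispose() calls instead of tf.tidy(). A minimal sketch of that channel math, assuming @tensorflow/tfjs is loaded as tf and rgb is a [1, height, width, 3] float tensor (the function name and the tf.tidy wrapper here are illustrative, not part of the patch):

function rgbToGrayscale(rgb) {
  return tf.tidy(() => {
    // split the channel axis into separate r, g, b planes
    const [r, g, b] = tf.split(rgb, 3, 3);
    // ITU-R BT.601 luma weights, the same constants the patch uses
    return tf.addN([r.mul(0.2989), g.mul(0.5870), b.mul(0.1140)]);
  });
}

The result has shape [1, height, width, 1], matching what models.emotion.predict() now receives in place of the output of the removed getImage() helper.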
diff --git a/src/handpose/handdetector.js b/src/handpose/handdetector.js
index fe0f5710..73b1524d 100644
--- a/src/handpose/handdetector.js
+++ b/src/handpose/handdetector.js
@@ -32,9 +32,9 @@ class HandDetector {
   }
 
   async getBoundingBoxes(input) {
-    const normalizedInput = tf.tidy(() => tf.mul(tf.sub(input, 0.5), 2));
-    const batchedPrediction = this.model.predict(normalizedInput);
+    const batchedPrediction = this.model.predict(input);
     const prediction = batchedPrediction.squeeze();
+    console.log(prediction);
     // Regression score for each anchor point.
     const scores = tf.tidy(() => tf.sigmoid(tf.slice(prediction, [0, 0], [-1, 1])).squeeze());
     // Bounding box for each anchor point.
@@ -42,11 +42,7 @@ class HandDetector {
     const boxes = this.normalizeBoxes(rawBoxes);
     const boxesWithHandsTensor = await tf.image.nonMaxSuppressionAsync(boxes, scores, this.maxHands, this.iouThreshold, this.scoreThreshold);
     const boxesWithHands = await boxesWithHandsTensor.array();
-    const toDispose = [normalizedInput, batchedPrediction, boxesWithHandsTensor, prediction, boxes, rawBoxes, scores];
-    // if (boxesWithHands.length === 0) {
-    //   toDispose.forEach((tensor) => tensor.dispose());
-    //   return null;
-    // }
+    const toDispose = [batchedPrediction, boxesWithHandsTensor, prediction, boxes, rawBoxes, scores];
     const detectedHands = tf.tidy(() => {
       const detectedBoxes = [];
       for (const i in boxesWithHands) {
@@ -69,12 +65,18 @@ class HandDetector {
    * @param input The image to classify.
    */
   async estimateHandBounds(input, config) {
-    const inputHeight = input.shape[1];
-    const inputWidth = input.shape[2];
+    // const inputHeight = input.shape[2];
+    // const inputWidth = input.shape[1];
     this.iouThreshold = config.iouThreshold;
     this.scoreThreshold = config.scoreThreshold;
     this.maxHands = config.maxHands;
-    const image = tf.tidy(() => input.resizeBilinear([this.width, this.height]).div(255));
+    const resized = input.resizeBilinear([this.width, this.height]);
+    const divided = resized.div(255);
+    const normalized = divided.sub(0.5);
+    const image = normalized.mul(2.0);
+    resized.dispose();
+    divided.dispose();
+    normalized.dispose();
     const predictions = await this.getBoundingBoxes(image);
     image.dispose();
     if (!predictions || (predictions.length === 0)) return null;
@@ -87,7 +89,7 @@ class HandDetector {
       const palmLandmarks = await prediction.palmLandmarks.array();
       prediction.boxes.dispose();
       prediction.palmLandmarks.dispose();
-      hands.push(bounding.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks }, [inputWidth / this.width, inputHeight / this.height]));
+      hands.push(bounding.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks }, [input.shape[2] / this.width, input.shape[1] / this.height]));
     }
     return hands;
   }
diff --git a/src/handpose/handpose.js b/src/handpose/handpose.js
index 1326cde6..94f45b8d 100644
--- a/src/handpose/handpose.js
+++ b/src/handpose/handpose.js
@@ -12,14 +12,7 @@ class HandPose {
     this.skipFrames = config.skipFrames;
     this.detectionConfidence = config.minConfidence;
     this.maxHands = config.maxHands;
-    const image = tf.tidy(() => {
-      if (!(input instanceof tf.Tensor)) {
-        input = tf.browser.fromPixels(input);
-      }
-      return input.toFloat().expandDims(0);
-    });
-    const predictions = await this.pipeline.estimateHands(image, config);
-    image.dispose();
+    const predictions = await this.pipeline.estimateHands(input, config);
     const hands = [];
     if (!predictions) return hands;
     for (const prediction of predictions) {
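The hand detector preprocessing that was previously split between getBoundingBoxes() and estimateHandBounds() is now done in one place: resize to the detector resolution, scale to [0, 1], then shift to [-1, 1], with each intermediate tensor disposed explicitly. A rough functional equivalent, assuming input is a [1, height, width, 3] tensor with 0-255 pixel values and width/height are the detector's fixed input size (variable names are illustrative, not part of the patch):

const resized = input.resizeBilinear([width, height]);
// (x / 255 - 0.5) * 2 maps 0 -> -1, 127.5 -> 0, 255 -> 1
const image = tf.tidy(() => resized.div(255).sub(0.5).mul(2.0));
resized.dispose();

Detected boxes are then mapped back to the source resolution via scaleBoxCoordinates() using the factors [input.shape[2] / width, input.shape[1] / height]; shape[2] is the source width and shape[1] the source height because the tensor layout is [batch, height, width, channels].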
diff --git a/src/human.js b/src/human.js
index 4b1b9d31..5de59902 100644
--- a/src/human.js
+++ b/src/human.js
@@ -71,7 +71,9 @@ function mergeDeep(...objects) {
 
 function sanity(input) {
   if (!input) return 'input is not defined';
-  if (tf.ENV.flags.IS_BROWSER && (input instanceof ImageData || input instanceof HTMLImageElement || input instanceof HTMLCanvasElement || input instanceof HTMLVideoElement || input instanceof HTMLMediaElement)) {
+  if (!(input instanceof tf.Tensor)
+    || (tf.ENV.flags.IS_BROWSER
+      && (input instanceof ImageData || input instanceof HTMLImageElement || input instanceof HTMLCanvasElement || input instanceof HTMLVideoElement || input instanceof HTMLMediaElement))) {
     const width = input.naturalWidth || input.videoWidth || input.width || (input.shape && (input.shape[1] > 0));
     if (!width || (width === 0)) return 'input is empty';
   }
@@ -99,6 +101,20 @@ async function load(userConfig) {
   if (config.face.enabled && config.face.emotion.enabled && !models.emotion) models.emotion = await emotion.load(config);
 }
 
+function tfImage(input) {
+  let image;
+  if (input instanceof tf.Tensor) {
+    image = tf.clone(input);
+  } else {
+    const pixels = tf.browser.fromPixels(input);
+    const casted = pixels.toFloat();
+    image = casted.expandDims(0);
+    pixels.dispose();
+    casted.dispose();
+  }
+  return image;
+}
+
 async function detect(input, userConfig = {}) {
   state = 'config';
   const perf = {};
@@ -151,11 +167,13 @@ async function detect(input, userConfig = {}) {
 
   analyze('Start Detect:');
 
+  const imageTensor = tfImage(input);
+
   // run posenet
   state = 'run:body';
   timeStamp = now();
   analyze('Start PoseNet');
-  const poseRes = config.body.enabled ? await models.posenet.estimatePoses(input, config.body) : [];
+  const poseRes = config.body.enabled ? await models.posenet.estimatePoses(imageTensor, config.body) : [];
   analyze('End PoseNet:');
   perf.body = Math.trunc(now() - timeStamp);
 
@@ -163,7 +181,7 @@
   state = 'run:hand';
   timeStamp = now();
   analyze('Start HandPose:');
-  const handRes = config.hand.enabled ? await models.handpose.estimateHands(input, config.hand) : [];
+  const handRes = config.hand.enabled ? await models.handpose.estimateHands(imageTensor, config.hand) : [];
   analyze('End HandPose:');
   perf.hand = Math.trunc(now() - timeStamp);
 
@@ -173,7 +191,7 @@
   state = 'run:face';
   timeStamp = now();
   analyze('Start FaceMesh:');
-  const faces = await models.facemesh.estimateFaces(input, config.face);
+  const faces = await models.facemesh.estimateFaces(imageTensor, config.face);
   perf.face = Math.trunc(now() - timeStamp);
   for (const face of faces) {
     // is something went wrong, skip the face
@@ -210,10 +228,11 @@
        emotion: emotionData,
        iris: (iris !== 0) ? Math.trunc(100 * 11.7 /* human iris size in mm */ / iris) / 100 : 0,
      });
+     analyze('End FaceMesh:');
    }
-   analyze('End FaceMesh:');
  }
 
+  imageTensor.dispose();
  state = 'idle';
  if (config.scoped) tf.engine().endScope();