diff --git a/config.js b/config.js index 7dde6b53..716e6766 100644 --- a/config.js +++ b/config.js @@ -109,10 +109,10 @@ export default { // if model is running st 25 FPS, we can re-use existing bounding box for updated hand skeleton analysis // as the hand probably hasn't moved much in short time (10 * 1/25 = 0.25 sec) minConfidence: 0.5, // threshold for discarding a prediction - iouThreshold: 0.2, // threshold for deciding whether boxes overlap too much in non-maximum suppression - scoreThreshold: 0.5, // threshold for deciding when to remove boxes based on score in non-maximum suppression + iouThreshold: 0.1, // threshold for deciding whether boxes overlap too much in non-maximum suppression + scoreThreshold: 0.8, // threshold for deciding when to remove boxes based on score in non-maximum suppression enlargeFactor: 1.65, // empiric tuning as skeleton prediction prefers hand box with some whitespace - maxHands: 10, // maximum number of hands detected in the input, should be set to the minimum number for performance + maxHands: 1, // maximum number of hands detected in the input, should be set to the minimum number for performance detector: { modelPath: '../models/handdetect.json', }, diff --git a/src/hand/handdetector.js b/src/hand/handdetector.js index 6da76af9..6df3cace 100644 --- a/src/hand/handdetector.js +++ b/src/hand/handdetector.js @@ -46,33 +46,30 @@ class HandDetector { }); } - async getBoundingBoxes(input, config) { - const batchedPrediction = this.model.predict(input); - const prediction = batchedPrediction.squeeze(); - const scores = tf.tidy(() => tf.sigmoid(tf.slice(prediction, [0, 0], [-1, 1])).squeeze()); - const rawBoxes = tf.slice(prediction, [0, 1], [-1, 4]); + async getBoxes(input, config) { + const batched = this.model.predict(input); + const predictions = batched.squeeze(); + const scores = tf.tidy(() => tf.sigmoid(tf.slice(predictions, [0, 0], [-1, 1])).squeeze()); + // const scoresVal = scores.dataSync(); // scoresVal[boxIndex] is box confidence + const rawBoxes = tf.slice(predictions, [0, 1], [-1, 4]); const boxes = this.normalizeBoxes(rawBoxes); - const boxesWithHandsTensor = await tf.image.nonMaxSuppressionAsync(boxes, scores, config.maxHands, config.iouThreshold, config.scoreThreshold); - const boxesWithHands = boxesWithHandsTensor.arraySync(); + const boxesWithHandsT = await tf.image.nonMaxSuppressionAsync(boxes, scores, config.maxHands, config.iouThreshold, config.scoreThreshold); + const boxesWithHands = boxesWithHandsT.arraySync(); const toDispose = [ - batchedPrediction, - boxesWithHandsTensor, - prediction, + batched, + boxesWithHandsT, + predictions, boxes, rawBoxes, scores, ]; - if (boxesWithHands.length === 0) { - toDispose.forEach((tensor) => tensor.dispose()); - return null; - } const hands = []; for (const boxIndex of boxesWithHands) { const matchingBox = tf.slice(boxes, [boxIndex, 0], [1, -1]); - const rawPalmLandmarks = tf.slice(prediction, [boxIndex, 5], [1, 14]); + const rawPalmLandmarks = tf.slice(predictions, [boxIndex, 5], [1, 14]); const palmLandmarks = tf.tidy(() => this.normalizeLandmarks(rawPalmLandmarks, boxIndex).reshape([-1, 2])); rawPalmLandmarks.dispose(); - hands.push({ boxes: matchingBox, palmLandmarks }); + hands.push({ box: matchingBox, palmLandmarks }); } toDispose.forEach((tensor) => tensor.dispose()); return hands; @@ -82,16 +79,16 @@ class HandDetector { const inputHeight = input.shape[1]; const inputWidth = input.shape[2]; const image = tf.tidy(() => input.resizeBilinear([config.inputSize, config.inputSize]).div(127.5).sub(1)); - const predictions = await this.getBoundingBoxes(image, config); + const predictions = await this.getBoxes(image, config); image.dispose(); if (!predictions || predictions.length === 0) return null; const hands = []; for (const prediction of predictions) { - const boundingBoxes = prediction.boxes.dataSync(); + const boundingBoxes = prediction.box.dataSync(); const startPoint = boundingBoxes.slice(0, 2); const endPoint = boundingBoxes.slice(2, 4); const palmLandmarks = prediction.palmLandmarks.arraySync(); - prediction.boxes.dispose(); + prediction.box.dispose(); prediction.palmLandmarks.dispose(); hands.push(box.scaleBoxCoordinates({ startPoint, endPoint, palmLandmarks }, [inputWidth / config.inputSize, inputHeight / config.inputSize])); } diff --git a/src/hand/handpipeline.js b/src/hand/handpipeline.js index fe81bcb5..276463c4 100644 --- a/src/hand/handpipeline.js +++ b/src/hand/handpipeline.js @@ -30,12 +30,11 @@ const PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE = 2; class HandPipeline { constructor(boundingBoxDetector, meshDetector, inputSize) { - this.boundingBoxDetector = boundingBoxDetector; + this.boxDetector = boundingBoxDetector; this.meshDetector = meshDetector; this.inputSize = inputSize; - this.regionsOfInterest = []; - this.runsWithoutHandDetector = 0; - this.skipFrames = 0; + this.storedBoxes = []; + this.skipped = 0; this.detectedHands = 0; } @@ -86,30 +85,24 @@ class HandPipeline { } async estimateHands(image, config) { - this.skipFrames = config.skipFrames; - // don't need box detection if we have sufficient number of boxes - let useFreshBox = (this.runsWithoutHandDetector > this.skipFrames) || (this.detectedHands !== this.regionsOfInterest.length); - let boundingBoxPredictions; - // but every skipFrames check if detect boxes number changed - if (useFreshBox) boundingBoxPredictions = await this.boundingBoxDetector.estimateHandBounds(image, config); - // if there are new boxes and number of boxes doesn't match use new boxes, but not if maxhands is fixed to 1 - if (config.maxHands > 1 && boundingBoxPredictions && boundingBoxPredictions.length > 0 && boundingBoxPredictions.length !== this.detectedHands) useFreshBox = true; - if (useFreshBox) { - this.regionsOfInterest = []; - if (!boundingBoxPredictions || boundingBoxPredictions.length === 0) { - this.detectedHands = 0; - return null; - } - for (const boundingBoxPrediction of boundingBoxPredictions) { - this.regionsOfInterest.push(boundingBoxPrediction); - } - this.runsWithoutHandDetector = 0; - } else { - this.runsWithoutHandDetector++; + this.skipped++; + let useFreshBox = false; + // run new detector every skipFrames + const boxes = (this.skipped > config.skipFrames) + ? await this.boxDetector.estimateHandBounds(image, config) : null; + // if detector result count doesn't match current working set, use it to reset current working set + if (boxes && (boxes.length !== this.detectedHands) && (this.detectedHands !== config.maxHands)) { + // console.log(this.skipped, config.maxHands, this.detectedHands, this.storedBoxes.length, boxes.length); + this.storedBoxes = []; + this.detectedHands = 0; + for (const possible of boxes) this.storedBoxes.push(possible); + if (this.storedBoxes.length > 0) useFreshBox = true; + this.skipped = 0; } const hands = []; - for (const i in this.regionsOfInterest) { - const currentBox = this.regionsOfInterest[i]; + // go through working set of boxes + for (const i in this.storedBoxes) { + const currentBox = this.storedBoxes[i]; if (!currentBox) continue; const angle = util.computeRotation(currentBox.palmLandmarks[PALM_LANDMARKS_INDEX_OF_PALM_BASE], currentBox.palmLandmarks[PALM_LANDMARKS_INDEX_OF_MIDDLE_FINGER_BASE]); const palmCenter = box.getBoxCenter(currentBox); @@ -121,8 +114,7 @@ class HandPipeline { const handImage = croppedInput.div(255); croppedInput.dispose(); rotatedImage.dispose(); - const prediction = this.meshDetector.predict(handImage); - const [confidence, keypoints] = prediction; + const [confidence, keypoints] = await this.meshDetector.predict(handImage); handImage.dispose(); const confidenceValue = confidence.dataSync()[0]; confidence.dispose(); @@ -133,7 +125,7 @@ class HandPipeline { keypointsReshaped.dispose(); const coords = this.transformRawCoords(rawCoords, newBox, angle, rotationMatrix); const nextBoundingBox = this.getBoxForHandLandmarks(coords); - this.updateRegionsOfInterest(nextBoundingBox, i); + this.updateStoredBoxes(nextBoundingBox, i); const result = { landmarks: coords, handInViewConfidence: confidenceValue, @@ -144,7 +136,7 @@ class HandPipeline { }; hands.push(result); } else { - this.updateRegionsOfInterest(null, i); + this.updateStoredBoxes(null, i); /* const result = { handInViewConfidence: confidenceValue, @@ -158,7 +150,7 @@ class HandPipeline { } keypoints.dispose(); } - this.regionsOfInterest = this.regionsOfInterest.filter((a) => a !== null); + this.storedBoxes = this.storedBoxes.filter((a) => a !== null); this.detectedHands = hands.length; return hands; } @@ -172,8 +164,8 @@ class HandPipeline { return { startPoint, endPoint }; } - updateRegionsOfInterest(newBox, i) { - const previousBox = this.regionsOfInterest[i]; + updateStoredBoxes(newBox, i) { + const previousBox = this.storedBoxes[i]; let iou = 0; if (newBox && previousBox && previousBox.startPoint) { const [boxStartX, boxStartY] = newBox.startPoint; @@ -189,7 +181,7 @@ class HandPipeline { const previousBoxArea = (previousBoxEndX - previousBoxStartX) * (previousBoxEndY - boxStartY); iou = intersection / (boxArea + previousBoxArea - intersection); } - this.regionsOfInterest[i] = iou > UPDATE_REGION_OF_INTEREST_IOU_THRESHOLD ? previousBox : newBox; + this.storedBoxes[i] = iou > UPDATE_REGION_OF_INTEREST_IOU_THRESHOLD ? previousBox : newBox; } }