From cf304bc51427fc8b8236a9046003349cc41c5bcc Mon Sep 17 00:00:00 2001
From: Vladimir Mandic <mandic00@live.com>
Date: Mon, 22 Nov 2021 14:33:40 -0500
Subject: [PATCH] prototype blazepose detector

---
 CHANGELOG.md                  |   3 +-
 demo/typescript/index.ts      |   4 +-
 src/body/blazepose.ts         | 168 +++++++++++++++++++++++-----------
 src/body/blazeposedetector.ts | 111 ++++++++++++++++++++++
 src/config.ts                 |  17 ++--
 5 files changed, 242 insertions(+), 61 deletions(-)
 create mode 100644 src/body/blazeposedetector.ts

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 99d6fff3..ebe93c65 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,8 +9,9 @@

 ## Changelog

-### **HEAD -> main** 2021/11/19 mandic00@live.com
+### **HEAD -> main** 2021/11/21 mandic00@live.com

+- add body 3d interpolation
 - edit blazepose keypoints
 - new build process

diff --git a/demo/typescript/index.ts b/demo/typescript/index.ts
index 68369f4d..a58ff182 100644
--- a/demo/typescript/index.ts
+++ b/demo/typescript/index.ts
@@ -7,9 +7,9 @@
  * @license MIT
  */

-import { Human } from '../../dist/human.esm.js'; // equivalent of @vladmandic/Human
+import { Human, Config } from '../../dist/human.esm.js'; // equivalent of @vladmandic/Human

-const humanConfig = { // user configuration for human, used to fine-tune behavior
+const humanConfig: Partial<Config> = { // user configuration for human, used to fine-tune behavior
   // backend: 'webgpu' as 'webgpu',
   // async: true,
   modelBasePath: '../../models',

diff --git a/src/body/blazepose.ts b/src/body/blazepose.ts
index 8bba9f1a..17d98e5a 100644
--- a/src/body/blazepose.ts
+++ b/src/body/blazepose.ts
@@ -9,48 +9,58 @@ import type { BodyKeypoint, BodyResult, Box, Point } from '../result';
 import type { GraphModel, Tensor } from '../tfjs/types';
 import type { Config } from '../config';
 import * as coords from './blazeposecoords';
+import * as detect from './blazeposedetector';
+
+interface DetectedBox { box: Box, boxRaw: Box, score: number }

 const env = { initial: true };
-const models: [GraphModel | null, GraphModel | null] = [null, null];
-const inputSize = [[0, 0], [0, 0]];
+// const models: [GraphModel | null, GraphModel | null] = [null, null];
+const models: { detector: GraphModel | null, landmarks: GraphModel | null } = { detector: null, landmarks: null };
+const inputSize: { detector: [number, number], landmarks: [number, number] } = { detector: [224, 224], landmarks: [256, 256] };
 let skipped = Number.MAX_SAFE_INTEGER;
-let outputNodes: string[]; // different for lite/full/heavy
+const outputNodes: { detector: string[], landmarks: string[] } = {
+  landmarks: ['ld_3d', 'activation_segmentation', 'activation_heatmap', 'world_3d', 'output_poseflag'],
+  detector: [],
+};
+
 let cache: BodyResult | null = null;
+let lastBox: Box | undefined;
 let padding: [number, number][] = [[0, 0], [0, 0], [0, 0], [0, 0]];
 let lastTime = 0;

+const sigmoid = (x) => (1 - (1 / (1 + Math.exp(x))));
+
 export async function loadDetect(config: Config): Promise<GraphModel> {
-  if (env.initial) models[0] = null;
-  if (!models[0] && config.body.detector?.modelPath || '') {
-    models[0] = await tf.loadGraphModel(join(config.modelBasePath, config.body.detector?.modelPath || '')) as unknown as GraphModel;
-    const inputs = Object.values(models[0].modelSignature['inputs']);
-    inputSize[0][0] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[1].size) : 0;
-    inputSize[0][1] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[2].size) : 0;
-    if (!models[0] || !models[0]['modelUrl']) log('load model failed:', config.body.detector?.modelPath);
-    else if (config.debug) log('load model:', models[0]['modelUrl']);
-  } else if (config.debug && models[0]) log('cached model:', models[0]['modelUrl']);
-  return models[0] as GraphModel;
+  if (env.initial) models.detector = null;
+  if (!models.detector && config.body['detector'] && config.body['detector']['modelPath'] || '') {
+    models.detector = await tf.loadGraphModel(join(config.modelBasePath, config.body['detector']['modelPath'] || '')) as unknown as GraphModel;
+    const inputs = Object.values(models.detector.modelSignature['inputs']);
+    inputSize.detector[0] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[1].size) : 0;
+    inputSize.detector[1] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[2].size) : 0;
+    if (!models.detector || !models.detector['modelUrl']) log('load model failed:', config.body['detector']['modelPath']);
+    else if (config.debug) log('load model:', models.detector['modelUrl']);
+  } else if (config.debug && models.detector) log('cached model:', models.detector['modelUrl']);
+  await detect.createAnchors();
+  return models.detector as GraphModel;
 }

 export async function loadPose(config: Config): Promise<GraphModel> {
-  if (env.initial) models[1] = null;
-  if (!models[1]) {
-    models[1] = await tf.loadGraphModel(join(config.modelBasePath, config.body.modelPath || '')) as unknown as GraphModel;
-    const inputs = Object.values(models[1].modelSignature['inputs']);
-    inputSize[1][0] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[1].size) : 0;
-    inputSize[1][1] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[2].size) : 0;
-    if (config.body.modelPath?.includes('lite')) outputNodes = ['ld_3d', 'output_segmentation', 'output_heatmap', 'world_3d', 'output_poseflag'];
-    else outputNodes = ['Identity', 'Identity_2', 'Identity_3', 'Identity_4', 'Identity_1']; // v2 from pinto full and heavy
-    if (!models[1] || !models[1]['modelUrl']) log('load model failed:', config.body.modelPath);
-    else if (config.debug) log('load model:', models[1]['modelUrl']);
-  } else if (config.debug) log('cached model:', models[1]['modelUrl']);
-  return models[1];
+  if (env.initial) models.landmarks = null;
+  if (!models.landmarks) {
+    models.landmarks = await tf.loadGraphModel(join(config.modelBasePath, config.body.modelPath || '')) as unknown as GraphModel;
+    const inputs = Object.values(models.landmarks.modelSignature['inputs']);
+    inputSize.landmarks[0] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[1].size) : 0;
+    inputSize.landmarks[1] = Array.isArray(inputs) ? parseInt(inputs[0].tensorShape.dim[2].size) : 0;
+    if (!models.landmarks || !models.landmarks['modelUrl']) log('load model failed:', config.body.modelPath);
+    else if (config.debug) log('load model:', models.landmarks['modelUrl']);
+  } else if (config.debug) log('cached model:', models.landmarks['modelUrl']);
+  return models.landmarks;
 }

 export async function load(config: Config): Promise<[GraphModel | null, GraphModel | null]> {
-  if (!models[0]) await loadDetect(config);
-  if (!models[1]) await loadPose(config);
-  return models;
+  if (!models.detector) await loadDetect(config);
+  if (!models.landmarks) await loadPose(config);
+  return [models.detector, models.landmarks];
 }

 function calculateBoxes(keypoints: Array<BodyKeypoint>, outputSize: [number, number]): { keypointsBox: Box, keypointsBoxRaw: Box } {
@@ -61,22 +71,32 @@
   return { keypointsBox, keypointsBoxRaw };
 }

-async function prepareImage(input: Tensor): Promise<Tensor> {
+async function prepareImage(input: Tensor, size: number, box?: Box): Promise<Tensor> {
   const t: Record<string, Tensor> = {};
   if (!input.shape || !input.shape[1] || !input.shape[2]) return input;
   let final: Tensor;
   if (input.shape[1] !== input.shape[2]) { // only pad if width different than height
+    const height: [number, number] = box
+      ? [Math.trunc(input.shape[1] * box[1]), Math.trunc(input.shape[1] * (box[1] + box[3]))]
+      : [input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0, input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0];
+    const width: [number, number] = box
+      ? [Math.trunc(input.shape[2] * box[0]), Math.trunc(input.shape[2] * (box[0] + box[2]))]
+      : [input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0, input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0];
     padding = [
       [0, 0], // don't touch batch
-      [input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0, input.shape[2] > input.shape[1] ? Math.trunc((input.shape[2] - input.shape[1]) / 2) : 0], // height before&after
-      [input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0, input.shape[1] > input.shape[2] ? Math.trunc((input.shape[1] - input.shape[2]) / 2) : 0], // width before&after
+      height, // height before&after
+      width, // width before&after
       [0, 0], // don't touch rgb
     ];
-    t.pad = tf.pad(input, padding);
-    t.resize = tf.image.resizeBilinear(t.pad, [inputSize[1][0], inputSize[1][1]]);
+    if (box) {
+      t.resize = tf.image.cropAndResize(input, [box], [0], [size, size]);
+    } else {
+      t.pad = tf.pad(input, padding);
+      t.resize = tf.image.resizeBilinear(t.pad, [size, size]);
+    }
     final = tf.div(t.resize, constants.tf255);
-  } else if (input.shape[1] !== inputSize[1][0]) { // if input needs resizing
-    t.resize = tf.image.resizeBilinear(input, [inputSize[1][0], inputSize[1][1]]);
+  } else if (input.shape[1] !== size) { // if input needs resizing
+    t.resize = tf.image.resizeBilinear(input, [size, size]);
     final = tf.div(t.resize, constants.tf255);
   } else { // if input is already in a correct resolution just normalize it
     final = tf.div(input, constants.tf255);
@@ -88,47 +108,54 @@ function rescaleKeypoints(keypoints: Array<BodyKeypoint>, outputSize: [number, number]): Array<BodyKeypoint> {
   for (const kpt of keypoints) {
     kpt.position = [
-      kpt.position[0] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0] - padding[2][0],
-      kpt.position[1] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1] - padding[1][0],
+      Math.trunc(kpt.position[0] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0] - padding[2][0]),
+      Math.trunc(kpt.position[1] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1] - padding[1][0]),
       kpt.position[2] as number,
     ];
-    kpt.positionRaw = [
-      kpt.position[0] / outputSize[0], kpt.position[1] / outputSize[1], kpt.position[2] as number,
-    ];
+    kpt.positionRaw = [kpt.position[0] / outputSize[0], kpt.position[1] / outputSize[1], kpt.position[2] as number];
   }
   return keypoints;
 }

-const sigmoid = (x) => (1 - (1 / (1 + Math.exp(x))));
+function rescaleBoxes(boxes: Array<DetectedBox>, outputSize: [number, number]): Array<DetectedBox> {
+  for (const box of boxes) {
+    box.box = [
+      Math.trunc(box.box[0] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0]),
+      Math.trunc(box.box[1] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1]),
+      Math.trunc(box.box[2] * (outputSize[0] + padding[2][0] + padding[2][1]) / outputSize[0]),
+      Math.trunc(box.box[3] * (outputSize[1] + padding[1][0] + padding[1][1]) / outputSize[1]),
+    ];
+    box.boxRaw = [box.box[0] / outputSize[0], box.box[1] / outputSize[1], box.box[2] / outputSize[0], box.box[3] / outputSize[1]];
+  }
+  return boxes;
+}

-async function detectParts(input: Tensor, config: Config, outputSize: [number, number]): Promise<BodyResult | null> {
-  const t: Record<string, Tensor> = {};
-  t.input = await prepareImage(input);
+async function detectLandmarks(input: Tensor, config: Config, outputSize: [number, number]): Promise<BodyResult | null> {
   /**
    * t.ld: 39 keypoints [x,y,z,score,presence] normalized to input size
    * t.segmentation:
    * t.heatmap:
    * t.world: 39 keypoints [x,y,z] normalized to -1..1
    * t.poseflag: body score
-   */
-  [t.ld/* 1,195(39*5) */, t.segmentation/* 1,256,256,1 */, t.heatmap/* 1,64,64,39 */, t.world/* 1,117(39*3) */, t.poseflag/* 1,1 */] = models[1]?.execute(t.input, outputNodes) as Tensor[]; // run model
-  const poseScoreRaw = (await t.poseflag.data())[0];
-  const poseScore = Math.max(0, (poseScoreRaw - 0.8) / (1 - 0.8)); // blow up score variance 5x
+   */
+  const t: Record<string, Tensor> = {};
+  [t.ld/* 1,195(39*5) */, t.segmentation/* 1,256,256,1 */, t.heatmap/* 1,64,64,39 */, t.world/* 1,117(39*3) */, t.poseflag/* 1,1 */] = models.landmarks?.execute(input, outputNodes.landmarks) as Tensor[]; // run model
+  const poseScore = (await t.poseflag.data())[0];
   const points = await t.ld.data();
+  Object.keys(t).forEach((tensor) => tf.dispose(t[tensor])); // don't need tensors after this
   const keypointsRelative: Array<BodyKeypoint> = [];
   const depth = 5; // each point has x,y,z,visibility,presence
   for (let i = 0; i < points.length / depth; i++) {
     const score = sigmoid(points[depth * i + 3]);
     const presence = sigmoid(points[depth * i + 4]);
     const adjScore = Math.trunc(100 * score * presence * poseScore) / 100;
-    const positionRaw: Point = [points[depth * i + 0] / inputSize[1][0], points[depth * i + 1] / inputSize[1][1], points[depth * i + 2] + 0];
+    const positionRaw: Point = [points[depth * i + 0] / inputSize.landmarks[0], points[depth * i + 1] / inputSize.landmarks[1], points[depth * i + 2] + 0];
     const position: Point = [Math.trunc(outputSize[0] * positionRaw[0]), Math.trunc(outputSize[1] * positionRaw[1]), positionRaw[2] as number];
     keypointsRelative.push({ part: coords.kpt[i], positionRaw, position, score: adjScore });
   }
   if (poseScore < (config.body.minConfidence || 0)) return null;
-  const keypoints: Array<BodyKeypoint> = rescaleKeypoints(keypointsRelative, outputSize); // keypoints were relative to input image which is cropped
+  const keypoints: Array<BodyKeypoint> = rescaleKeypoints(keypointsRelative, outputSize); // keypoints were relative to input image which is padded
   const boxes = calculateBoxes(keypoints, [outputSize[0], outputSize[1]]); // now find boxes based on rescaled keypoints
-  Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
   const annotations: Record<string, Point[][]> = {};
   for (const [name, indexes] of Object.entries(coords.connected)) {
     const pt: Array<Point[]> = [];
@@ -144,6 +171,19 @@
   return body;
 }

+async function detectBoxes(input: Tensor, config: Config, outputSize: [number, number]) {
+  const t: Record<string, Tensor> = {};
+  t.res = models.detector?.execute(input, ['Identity']) as Tensor; //
+  t.logitsRaw = tf.slice(t.res, [0, 0, 0], [1, -1, 1]);
+  t.boxesRaw = tf.slice(t.res, [0, 0, 1], [1, -1, -1]);
+  t.logits = tf.squeeze(t.logitsRaw);
+  t.boxes = tf.squeeze(t.boxesRaw);
+  const boxes = await detect.decode(t.boxes, t.logits, config, outputSize);
+  rescaleBoxes(boxes, outputSize);
+  Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
+  return boxes;
+}
+
 export async function predict(input: Tensor, config: Config): Promise<BodyResult[]> {
   const outputSize: [number, number] = [input.shape[2] || 0, input.shape[1] || 0];
   const skipTime = (config.body.skipTime || 0) > (now() - lastTime);
@@ -151,7 +191,31 @@
-    cache = await detectParts(input, config, outputSize);
+    const t: Record<string, Tensor> = {};
+    if (config.body['detector'] && config.body['detector']['enabled']) {
+      t.detector = await prepareImage(input, 224);
+      const boxes = await detectBoxes(t.detector, config, outputSize);
+      if (boxes && boxes.length === 1) {
+        t.landmarks = await prepareImage(input, 256, boxes[0].box); // padded and resized according to detector
+        cache = await detectLandmarks(t.landmarks, config, outputSize);
+        if (cache) cache.score = boxes[0].score;
+      }
+    } else {
+      t.landmarks = await prepareImage(input, 256, lastBox); // padded and resized
+      cache = await detectLandmarks(t.landmarks, config, outputSize);
+      /*
+      lastBox = undefined;
+      if (cache?.box) {
+        const cx = cache.boxRaw[0] + (cache.boxRaw[2] / 2);
+        const cy = cache.boxRaw[1] + (cache.boxRaw[3] / 2);
+        let size = cache.boxRaw[2] > cache.boxRaw[3] ? cache.boxRaw[2] : cache.boxRaw[3];
+        size = (size * 1.2) / 2; // enlarge and halve it
+        lastBox = [cx - size, cy - size, 2 * size, 2 * size];
+      }
+      */
+    }
+    Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
+    // if (cache && boxes.length > 0) cache.box = boxes[0].box;
     lastTime = now();
     skipped = 0;
   }

diff --git a/src/body/blazeposedetector.ts b/src/body/blazeposedetector.ts
new file mode 100644
index 00000000..2662278c
--- /dev/null
+++ b/src/body/blazeposedetector.ts
@@ -0,0 +1,111 @@
+import * as tf from '../../dist/tfjs.esm.js';
+import type { Tensor } from '../tfjs/types';
+import type { Box } from '../result';
+import type { Config } from '../config';
+
+interface DetectedBox { box: Box, boxRaw: Box, score: number }
+
+const inputSize = 224;
+let anchorTensor: { x, y };
+const numLayers = 5;
+const strides = [8, 16, 32, 32, 32];
+
+export async function createAnchors() {
+  const anchors: Array<{ x: number, y: number }> = [];
+  let layerId = 0;
+  while (layerId < numLayers) {
+    let anchorCount = 0;
+    let lastSameStrideLayer = layerId;
+    while (lastSameStrideLayer < strides.length && strides[lastSameStrideLayer] === strides[layerId]) {
+      anchorCount += 2;
+      lastSameStrideLayer++;
+    }
+    const stride = strides[layerId];
+    const featureMapHeight = Math.ceil(inputSize / stride);
+    const featureMapWidth = Math.ceil(inputSize / stride);
+    for (let y = 0; y < featureMapHeight; ++y) {
+      for (let x = 0; x < featureMapWidth; ++x) {
+        for (let anchorId = 0; anchorId < anchorCount; ++anchorId) {
+          anchors.push({ x: (x + 0.5) / featureMapWidth, y: (y + 0.5) / featureMapHeight });
+        }
+      }
+    }
+    layerId = lastSameStrideLayer;
+  }
+  anchorTensor = { x: tf.tensor1d(anchors.map((a) => a.x)), y: tf.tensor1d(anchors.map((a) => a.y)) };
+}
+
+const cropFactor = [5.0, 5.0];
+function decodeBoxes(boxesTensor, anchor): Tensor {
+  return tf.tidy(() => {
+    const split = tf.split(boxesTensor, 12, 1); // first 4 values are box data [x,y,w,h], remaining 8 are 4 keypoints [x,y] each, for a total of 12
+    let xCenter = tf.squeeze(split[0]);
+    let yCenter = tf.squeeze(split[1]);
+    let width = tf.squeeze(split[2]);
+    let height = tf.squeeze(split[3]);
+    xCenter = tf.add(tf.div(xCenter, inputSize), anchor.x);
+    yCenter = tf.add(tf.div(yCenter, inputSize), anchor.y);
+    width = tf.mul(tf.div(width, inputSize), cropFactor[0]);
+    height = tf.mul(tf.div(height, inputSize), cropFactor[1]);
+    const xMin = tf.sub(xCenter, tf.div(width, 2));
+    const yMin = tf.sub(yCenter, tf.div(height, 2));
+    const boxes = tf.stack([xMin, yMin, width, height], 1);
+    return boxes;
+  });
+}
+
+export async function decode(boxesTensor: Tensor, logitsTensor: Tensor, config: Config, outputSize: [number, number]): Promise<DetectedBox[]> {
+  const t: Record<string, Tensor> = {};
+  t.boxes = decodeBoxes(boxesTensor, anchorTensor);
+  t.scores = tf.sigmoid(logitsTensor);
+  t.argmax = tf.argMax(t.scores);
+  const i = (await t.argmax.data())[0] as number;
+  const scores = await t.scores.data();
+  const detected: Array<{ box: Box, boxRaw: Box, score: number }> = [];
+  const minScore = (config.body['detector'] && config.body['detector']['minConfidence']) ? config.body['detector']['minConfidence'] : 0;
+  if (scores[i] >= minScore) {
+    const boxes = await t.boxes.array();
+    const boxRaw: Box = boxes[i];
+    const box: Box = [boxRaw[0] * outputSize[0], boxRaw[1] * outputSize[1], boxRaw[2] * outputSize[0], boxRaw[3] * outputSize[1]];
+    // console.log(box);
+    detected.push({ box, boxRaw, score: scores[i] });
+  }
+  /*
+  t.nms = await tf.image.nonMaxSuppressionAsync(t.boxes, t.scores, 1, config.body.detector?.minConfidence || 0.1, config.body.detector?.iouThreshold || 0.1);
+  const boxes = t.boxes.arraySync();
+  const scores = t.scores.dataSync();
+  const nms = t.nms.dataSync();
+  const detected: Array<DetectedBox> = [];
+  for (const i of Array.from(nms)) {
+    const boxRaw: Box = boxes[i];
+    const box: Box = [boxRaw[0] * outputSize[0], boxRaw[1] * outputSize[1], boxRaw[2] * outputSize[0], boxRaw[3] * outputSize[1]];
+    detected.push({ box, boxRaw, score: scores[i] });
+  }
+  */
+  Object.keys(t).forEach((tensor) => tf.dispose(t[tensor]));
+  return detected;
+}
+
+/*
+const humanConfig: Partial<Config> = {
+  warmup: 'full' as const,
+  modelBasePath: '../../models',
+  cacheSensitivity: 0,
+  filter: { enabled: false },
+  face: { enabled: false },
+  hand: { enabled: false },
+  object: { enabled: false },
+  gesture: { enabled: false },
+  body: {
+    enabled: true,
+    minConfidence: 0.1,
+    modelPath: 'blazepose/blazepose-full.json',
+    detector: {
+      enabled: false,
+      modelPath: 'blazepose/blazepose-detector.json',
+      minConfidence: 0.1,
+      iouThreshold: 0.1,
+    },
+  },
+};
+*/

diff --git a/src/config.ts b/src/config.ts
index 1a51084f..2ffa6b4c 100644
--- a/src/config.ts
+++ b/src/config.ts
@@ -78,11 +78,19 @@
   maxDetected: number,
   /** minimum confidence for a detected body before results are discarded */
   minConfidence: number,
-  /** detector used for body model before actual analysis */
+  /* experimental
+  /** experimental: detector used for body model before actual analysis
   detector?: {
-    /** path to optional body detector model json file */
-    modelPath: string
+    /** experimental: enable body detector before body landmarks
+    enabled: boolean,
+    /** experimental: path to optional body detector model json file
+    modelPath: string,
+    /** experimental: minimum confidence for a detected body before results are discarded
+    minConfidence: number,
+    /** experimental: minimum overlap between two detected bodies before one is discarded
+    iouThreshold: number
   },
+  */
 }

 /** Configures all hand detection specific options */
@@ -365,9 +373,6 @@
   body: {
     enabled: true,
     modelPath: 'movenet-lightning.json',
-    detector: {
-      modelPath: '',
-    },
     maxDetected: -1,
     minConfidence: 0.3,
    skipFrames: 1,
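
A note on createAnchors in src/body/blazeposedetector.ts: with the values hardcoded in this patch (inputSize = 224, strides = [8, 16, 32, 32, 32], and 2 anchors added per run of layers sharing a stride), the loop yields 28*28*2 + 14*14*2 + 7*7*6 = 2254 anchors, one per row of the detector output that detectBoxes slices into 1 score logit plus 12 box/keypoint values. A standalone sanity check of that arithmetic, using only values taken from the patch:

  const inputSize = 224;
  const strides = [8, 16, 32, 32, 32];
  let total = 0;
  let layerId = 0;
  while (layerId < strides.length) {
    let anchorCount = 0; // 2 anchors per consecutive layer with the same stride
    let last = layerId;
    while (last < strides.length && strides[last] === strides[layerId]) { anchorCount += 2; last++; }
    const cells = Math.ceil(inputSize / strides[layerId]) ** 2; // feature-map height x width
    total += cells * anchorCount;
    layerId = last;
  }
  console.log(total); // 1568 + 392 + 294 = 2254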
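
A worked example of the letterbox math shared by prepareImage and rescaleKeypoints, assuming a hypothetical 1280x720 input frame: height padding is (1280 - 720) / 2 = 280 pixels above and below, so padding becomes [[0, 0], [280, 280], [0, 0], [0, 0]], the padded 1280x1280 square is resized to 256x256, and rescaleKeypoints later inverts the same padding:

  const outputSize = [1280, 720];  // [width, height], as predict() builds it from input.shape
  const padHeight = [280, 280];    // padding[1] for this frame
  const raw = 0.5;                 // keypoint y normalized within the square model input
  const y = raw * outputSize[1];   // 360, scaled position before unpadding
  const unpadded = Math.trunc(y * (outputSize[1] + padHeight[0] + padHeight[1]) / outputSize[1] - padHeight[0]);
  console.log(unpadded);           // 360: the center of the square maps back to the vertical center of the 720px frame

Keypoints that land inside the padded bands come out negative or beyond 719, i.e. outside the original frame, which is the expected outcome for a letterboxed crop.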
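
To try the prototype end to end, a minimal sketch of a user configuration that turns the experimental detector stage on; the model paths and thresholds mirror the commented-out sample config in blazeposedetector.ts and are illustrative rather than shipped defaults. Since this patch deliberately comments the detector fields out of the typed BodyConfig (blazepose.ts reads them with bracket access for the same reason), the body object is passed through a cast:

  import { Human, Config } from '../../dist/human.esm.js'; // equivalent of @vladmandic/Human

  const humanConfig: Partial<Config> = {
    modelBasePath: '../../models',
    body: {
      enabled: true,
      minConfidence: 0.1,
      modelPath: 'blazepose/blazepose-full.json',
      detector: {
        enabled: true, // experimental: run detectBoxes before detectLandmarks
        modelPath: 'blazepose/blazepose-detector.json',
        minConfidence: 0.1,
        iouThreshold: 0.1,
      },
    } as unknown as Config['body'],
  };

  const human = new Human(humanConfig);
  // const result = await human.detect(imageOrVideoElement);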