// mind-ar: web augmented reality framework
import {Matrix, inverse} from 'ml-matrix';
import { applyModelViewProjectionTransform, buildModelViewProjectionTransform, computeScreenCoordiate} from './utils.js';
const TRACKING_THRESH = 5.0; // default
const K2_FACTOR = 4.0; // Question: should it be relative to the size of the screen instead of hardcoded?
const ICP_MAX_LOOP = 10;
const ICP_BREAK_LOOP_ERROR_THRESH = 0.1;
const ICP_BREAK_LOOP_ERROR_RATIO_THRESH = 0.99;
const ICP_BREAK_LOOP_ERROR_THRESH2 = 4.0;
// some temporary/intermediate variables used later. Declare them beforehand to reduce new object allocations
let mat = [[],[],[]];
let J_U_Xc = [[],[]]; // 2x3
let J_Xc_S = [[],[],[]]; // 3x6
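/**
 * Refine an estimated pose by minimizing the reprojection error of matched points.
 *
 * Usage sketch (hypothetical caller; argument shapes inferred from the code below):
 *
 *   const refined = refineEstimate({
 *     initialModelViewTransform, // 3x4 [R|t] pose estimate, e.g. from the previous frame
 *     projectionTransform,       // camera projection matrix
 *     worldCoords,               // [{x, y, z}, ...] feature points on the image target (z expected to be 0)
 *     screenCoords,              // [{x, y}, ...] where those points were observed in the camera frame
 *   });
 *   if (refined === null) {
 *     // refinement failed; the caller would typically treat the target as lost for this frame
 *   }
 *
 * Returns the refined 3x4 modelViewTransform, or null if the error never drops below TRACKING_THRESH.
 */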
const refineEstimate = ({initialModelViewTransform, projectionTransform, worldCoords, screenCoords}) => {
// Question: shall we normalize the screen coords as well?
// Question: do we need to normalize the scale as well, i.e. make coords from -1 to 1
//
// normalize world coords - reposition them to center of mass
// assume z coordinate is always zero (in our case, the image target is planar with z = 0)
let dx = 0;
let dy = 0;
for (let i = 0; i < worldCoords.length; i++) {
dx += worldCoords[i].x;
dy += worldCoords[i].y;
}
dx /= worldCoords.length;
dy /= worldCoords.length;
const normalizedWorldCoords = [];
for (let i = 0; i < worldCoords.length; i++) {
normalizedWorldCoords.push({x: worldCoords[i].x - dx, y: worldCoords[i].y - dy, z: worldCoords[i].z});
}
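// fold the centroid shift into the transform: diffModelViewTransform = initialModelViewTransform * translate(dx, dy, 0),
// so applying it to the re-centered world coords gives the same camera-space points as before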
const diffModelViewTransform = [[],[],[]];
for (let j = 0; j < 3; j++) {
for (let i = 0; i < 3; i++) {
diffModelViewTransform[j][i] = initialModelViewTransform[j][i];
}
}
diffModelViewTransform[0][3] = initialModelViewTransform[0][0] * dx + initialModelViewTransform[0][1] * dy + initialModelViewTransform[0][3];
diffModelViewTransform[1][3] = initialModelViewTransform[1][0] * dx + initialModelViewTransform[1][1] * dy + initialModelViewTransform[1][3];
diffModelViewTransform[2][3] = initialModelViewTransform[2][0] * dx + initialModelViewTransform[2][1] * dy + initialModelViewTransform[2][3];
// use the iterative closest point (ICP) algorithm to refine the modelViewTransform
const inlierProbs = [1.0, 0.8, 0.6, 0.4, 0.0];
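// run ICP once per entry, each pass assuming a larger fraction of outliers, and stop as soon as the reprojection error drops below TRACKING_THRESH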
let updatedModelViewTransform = diffModelViewTransform; // iteratively update this transform
let finalModelViewTransform = null;
for (let i = 0; i < inlierProbs.length; i++) {
const ret = _doICP({initialModelViewTransform: updatedModelViewTransform, projectionTransform, worldCoords: normalizedWorldCoords, screenCoords, inlierProb: inlierProbs[i]});
updatedModelViewTransform = ret.modelViewTransform;
//console.log("err", ret.err);
if (ret.err < TRACKING_THRESH) {
finalModelViewTransform = updatedModelViewTransform;
break;
}
}
if (finalModelViewTransform === null) return null;
// de-normalize
finalModelViewTransform[0][3] = finalModelViewTransform[0][3] - finalModelViewTransform[0][0] * dx - finalModelViewTransform[0][1] * dy;
finalModelViewTransform[1][3] = finalModelViewTransform[1][3] - finalModelViewTransform[1][0] * dx - finalModelViewTransform[1][1] * dy;
finalModelViewTransform[2][3] = finalModelViewTransform[2][3] - finalModelViewTransform[2][0] * dx - finalModelViewTransform[2][1] * dy;
return finalModelViewTransform;
}
// ICP iteration
// Question: can someone provide theoretical reference / mathematical proof for the following computations?
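// (What the loop below appears to do: each iteration is a Gauss-Newton step on the reprojection error.
// The 6-dof pose update dS is solved from the normal equations (J^T J) dS = J^T dU, and when inlierProb < 1
// the residuals are weighted with a Tukey-biweight-style redescending function so that points whose
// squared error exceeds K2 are dropped.)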
const _doICP = ({initialModelViewTransform, projectionTransform, worldCoords, screenCoords, inlierProb}) => {
const isRobustMode = inlierProb < 1;
let modelViewTransform = initialModelViewTransform;
let err0 = 0.0;
let err1 = 0.0;
let E = new Array(worldCoords.length);
let E2 = new Array(worldCoords.length);
let dxs = new Array(worldCoords.length);
let dys = new Array(worldCoords.length);
for (let l = 0; l <= ICP_MAX_LOOP; l++) {
const modelViewProjectionTransform = buildModelViewProjectionTransform(projectionTransform, modelViewTransform);
for (let n = 0; n < worldCoords.length; n++) {
const u = computeScreenCoordiate(modelViewProjectionTransform, worldCoords[n].x, worldCoords[n].y, worldCoords[n].z);
const dx = screenCoords[n].x - u.x;
const dy = screenCoords[n].y - u.y;
dxs[n] = dx;
dys[n] = dy;
E[n] = (dx * dx + dy * dy);
}
let K2; // robust mode only
err1 = 0.0;
if (isRobustMode) {
const inlierNum = Math.max(3, Math.floor(worldCoords.length * inlierProb) - 1);
for (let n = 0; n < worldCoords.length; n++) {
E2[n] = E[n];
}
E2.sort((a, b) => {return a-b;});
K2 = Math.max(E2[inlierNum] * K2_FACTOR, 16.0);
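// K2 is the squared-error cutoff for this pass; points beyond it contribute a constant K2/6 to the error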
for (let n = 0; n < worldCoords.length; n++) {
if (E2[n] > K2) err1 += K2 / 6.0;
else err1 += K2 / 6.0 * (1.0 - (1.0 - E2[n]/K2) * (1.0 - E2[n]/K2) * (1.0 - E2[n]/K2));
}
} else {
for (let n = 0; n < worldCoords.length; n++) {
err1 += E[n];
}
}
err1 /= worldCoords.length;
//console.log("icp loop", inlierProb, l, err1);
if (err1 < ICP_BREAK_LOOP_ERROR_THRESH) break;
//if (l > 0 && err1 < ICP_BREAK_LOOP_ERROR_THRESH2 && err1/err0 > ICP_BREAK_LOOP_ERROR_RATIO_THRESH) break;
if (l > 0 && err1/err0 > ICP_BREAK_LOOP_ERROR_RATIO_THRESH) break;
if (l === ICP_MAX_LOOP) break;
err0 = err1;
const dU = [];
const allJ_U_S = [];
for (let n = 0; n < worldCoords.length; n++) {
if (isRobustMode && E[n] > K2) {
continue;
}
const J_U_S = _getJ_U_S({modelViewProjectionTransform, modelViewTransform, projectionTransform, worldCoord: worldCoords[n]});
if (isRobustMode) {
const W = (1.0 - E[n]/K2)*(1.0 - E[n]/K2);
for (let j = 0; j < 2; j++) {
for (let i = 0; i < 6; i++) {
J_U_S[j][i] *= W;
}
}
dU.push([dxs[n] * W]);
dU.push([dys[n] * W]);
} else {
dU.push([dxs[n]]);
dU.push([dys[n]]);
}
for (let i = 0; i < J_U_S.length; i++) {
allJ_U_S.push(J_U_S[i]);
}
}
const dS = _getDeltaS({dU, J_U_S: allJ_U_S});
if (dS === null) break;
modelViewTransform = _updateModelViewTransform({modelViewTransform, dS});
}
return {modelViewTransform, err: err1};
}
const _updateModelViewTransform = ({modelViewTransform, dS}) => {
/**
 * dS has 6 parameters: the first three are rotation, the last three are translation.
 * The rotation is expressed in angle-axis form:
 * [dS[0], dS[1], dS[2]] is the axis of rotation, and its magnitude is the angle.
 */
let ra = dS[0] * dS[0] + dS[1] * dS[1] + dS[2] * dS[2];
let q0, q1, q2;
if( ra < 0.000001 ) {
q0 = 1.0;
q1 = 0.0;
q2 = 0.0;
ra = 0.0;
} else {
ra = Math.sqrt(ra);
q0 = dS[0] / ra;
q1 = dS[1] / ra;
q2 = dS[2] / ra;
}
const cra = Math.cos(ra);
const sra = Math.sin(ra);
const one_cra = 1.0 - cra;
// mat is [R|t], 3D rotation and translation
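// rotation part follows Rodrigues' formula: R = cos(ra)*I + sin(ra)*[q]x + (1 - cos(ra))*q*q^T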
mat[0][0] = q0*q0*one_cra + cra;
mat[0][1] = q0*q1*one_cra - q2*sra;
mat[0][2] = q0*q2*one_cra + q1*sra;
mat[0][3] = dS[3];
mat[1][0] = q1*q0*one_cra + q2*sra;
mat[1][1] = q1*q1*one_cra + cra;
mat[1][2] = q1*q2*one_cra - q0*sra;
mat[1][3] = dS[4];
mat[2][0] = q2*q0*one_cra - q1*sra;
mat[2][1] = q2*q1*one_cra + q0*sra;
mat[2][2] = q2*q2*one_cra + cra;
mat[2][3] = dS[5];
// the updated transform is the original transform x delta transform
const mat2 = [[],[],[]];
for (let j = 0; j < 3; j++ ) {
for (let i = 0; i < 4; i++ ) {
mat2[j][i] = modelViewTransform[j][0] * mat[0][i]
+ modelViewTransform[j][1] * mat[1][i]
+ modelViewTransform[j][2] * mat[2][i];
}
mat2[j][3] += modelViewTransform[j][3];
}
return mat2;
}
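// least-squares solve of dU ≈ J_U_S * dS via the normal equations: dS = (J^T J)^-1 J^T dU;
// returns null when J^T J cannot be inverted (too few or degenerate constraints)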
const _getDeltaS = ({dU, J_U_S}) => {
const J = new Matrix(J_U_S);
const U = new Matrix(dU);
const JT = J.transpose();
const JTJ = JT.mmul(J);
const JTU = JT.mmul(U);
let JTJInv;
try {
JTJInv = inverse(JTJ);
} catch (e) {
return null;
}
const S = JTJInv.mmul(JTU);
return S.to1DArray();
}
const _getJ_U_S = ({modelViewProjectionTransform, modelViewTransform, projectionTransform, worldCoord}) => {
const T = modelViewTransform;
const {x, y, z} = worldCoord;
const u = applyModelViewProjectionTransform(modelViewProjectionTransform, x, y, z);
const z2 = u.z * u.z;
// Question: This is the most confusing matrix to me. I've no idea how to derive this.
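// (A possible derivation: the screen point is (u.x/u.z, u.y/u.z) with u = projectionTransform * Xc, so by the
// quotient rule d(u.x/u.z)/dXc[k] = (projectionTransform[0][k] * u.z - projectionTransform[2][k] * u.x) / u.z^2,
// and similarly for the y component.)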
//J_U_Xc[0][0] = (projectionTransform[0][0] * u.z - projectionTransform[2][0] * u.x) / z2;
//J_U_Xc[0][1] = (projectionTransform[0][1] * u.z - projectionTransform[2][1] * u.x) / z2;
//J_U_Xc[0][2] = (projectionTransform[0][2] * u.z - projectionTransform[2][2] * u.x) / z2;
//J_U_Xc[1][0] = (projectionTransform[1][0] * u.z - projectionTransform[2][0] * u.y) / z2;
//J_U_Xc[1][1] = (projectionTransform[1][1] * u.z - projectionTransform[2][1] * u.y) / z2;
//J_U_Xc[1][2] = (projectionTransform[1][2] * u.z - projectionTransform[2][2] * u.y) / z2;
// The above is the original implementation, simplified to the below because projectionTransform[2][0] and projectionTransform[2][1] are zero
J_U_Xc[0][0] = (projectionTransform[0][0] * u.z) / z2;
J_U_Xc[0][1] = (projectionTransform[0][1] * u.z) / z2;
J_U_Xc[0][2] = (projectionTransform[0][2] * u.z - projectionTransform[2][2] * u.x) / z2;
J_U_Xc[1][0] = (projectionTransform[1][0] * u.z) / z2;
J_U_Xc[1][1] = (projectionTransform[1][1] * u.z) / z2;
J_U_Xc[1][2] = (projectionTransform[1][2] * u.z - projectionTransform[2][2] * u.y) / z2;
/*
J_Xc_S in full would look like this, but z is zero, so it simplifies to the assignments below:
[T[0][2] * y - T[0][1] * z, T[0][0] * z - T[0][2] * x, T[0][1] * x - T[0][0] * y, T[0][0], T[0][1], T[0][2]],
[T[1][2] * y - T[1][1] * z, T[1][0] * z - T[1][2] * x, T[1][1] * x - T[1][0] * y, T[1][0], T[1][1], T[1][2]],
[T[2][2] * y - T[2][1] * z, T[2][0] * z - T[2][2] * x, T[2][1] * x - T[2][0] * y, T[2][0], T[2][1], T[2][2]],
*/
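// (The first three columns appear to be d(Xc)/d(angle-axis) = T_rot * (-[X]x) for a small rotation; the last three are d(Xc)/d(translation) = T_rot.)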
J_Xc_S[0][0] = T[0][2] * y;
J_Xc_S[0][1] = -T[0][2] * x;
J_Xc_S[0][2] = T[0][1] * x - T[0][0] * y;
J_Xc_S[0][3] = T[0][0];
J_Xc_S[0][4] = T[0][1];
J_Xc_S[0][5] = T[0][2];
J_Xc_S[1][0] = T[1][2] * y;
J_Xc_S[1][1] = -T[1][2] * x;
J_Xc_S[1][2] = T[1][1] * x - T[1][0] * y;
J_Xc_S[1][3] = T[1][0];
J_Xc_S[1][4] = T[1][1];
J_Xc_S[1][5] = T[1][2];
J_Xc_S[2][0] = T[2][2] * y;
J_Xc_S[2][1] = -T[2][2] * x;
J_Xc_S[2][2] = T[2][1] * x - T[2][0] * y;
J_Xc_S[2][3] = T[2][0];
J_Xc_S[2][4] = T[2][1];
J_Xc_S[2][5] = T[2][2];
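// chain rule: J_U_S (2x6) = J_U_Xc (2x3) * J_Xc_S (3x6), the derivative of the screen point w.r.t. the 6-dof pose update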
const J_U_S = [[], []];
for (let j = 0; j < 2; j++) {
for (let i = 0; i < 6; i++) {
J_U_S[j][i] = 0.0;
for (let k = 0; k < 3; k++ ) {
J_U_S[j][i] += J_U_Xc[j][k] * J_Xc_S[k][i];
}
}
}
return J_U_S;
}
export {
refineEstimate
}