arquero
Version:
Query processing and transformation of array-backed data tables.
226 lines (198 loc) • 6.14 kB
JavaScript
import { indexLookup } from './join/lookup.js';
import { inferKeys, keyPredicate } from './util/join-keys.js';
import { parseValue } from './util/parse.js';
import { parse } from '../expression/parse.js';
import { all, not } from '../helpers/selection.js';
import { columnSet } from '../table/ColumnSet.js';
import { concat } from '../util/concat.js';
import { isArray } from '../util/is-array.js';
import { isString } from '../util/is-string.js';
import { toArray } from '../util/to-array.js';
import { toString } from '../util/to-string.js';
import { unroll } from '../util/unroll.js';
// Parse options with aggregate and window both disabled. Within this file it
// is only used as the base that OPT_R extends.
const OPT_L = { aggregate: false, window: false };
// Options handed to parseValue for the right table's output values: OPT_L plus
// index: 1 (presumably selects the second table of the join; confirm in parseValue).
const OPT_R = { ...OPT_L, index: 1 };
// Row index passed to the emitter for the missing side of an unmatched row
// when emitting left/right outer-join results.
const NONE = -Infinity;
/**
 * Cross join: delegates to join() with a predicate that accepts every
 * pair of rows.
 * @param table The left table.
 * @param other The right table.
 * @param values Output value specification, forwarded unchanged to join().
 * @param options Join options, copied before use. The `left` and `right`
 *  flags are always forced to true, overriding caller-supplied values.
 * @return Whatever join() returns.
 */
export function cross(table, other, values, options) {
  const acceptAll = () => true;
  // copy so the caller's options object is never mutated
  const crossOptions = Object.assign({}, options, { left: true, right: true });
  return join(table, other, acceptAll, values, crossOptions);
}
/**
 * Join two tables.
 * The join condition is first normalized by inferKeys(). An array result
 * is treated as a [leftKeys, rightKeys] pair and turned into a key
 * predicate; anything else is parsed as a join expression.
 * @param tableL The left table.
 * @param tableR The right table.
 * @param on The join condition (keys or expression).
 * @param values Output value specification. When falsy, it is inferred:
 *  for key joins via inferValues(), otherwise all columns of both tables.
 * @param options Join options; `options.suffix` is forwarded to
 *  parseValues() and the whole object to _join().
 * @return Whatever _join() returns.
 */
export function join(tableL, tableR, on, values, options = {}) {
  const keys = inferKeys(tableL, tableR, on);
  const optParse = { join: [tableL, tableR] };
  let predicate;
  let outputs = values;

  if (isArray(keys)) {
    const [onL, onR] = keys.map(toArray);
    predicate = keyPredicate(tableL, tableR, onL, onR);
    // no explicit values: infer output columns, suppressing duplicated key columns
    outputs = outputs || inferValues(tableL, onL, onR, options);
  } else {
    predicate = parse({ on: keys }, optParse).exprs[0];
    // no explicit values: include every column of both tables
    outputs = outputs || [all(), all()];
  }

  const suffix = options && options.suffix;
  const parsed = parseValues(tableL, tableR, outputs, optParse, suffix);
  return _join(tableL, tableR, predicate, parsed, options);
}
/**
 * Infer join output values for a key-based join, suppressing key columns
 * that would otherwise appear twice.
 * A key counts as shared when it is a string and the same string sits at
 * the same position in both key lists.
 * @param tableL The left table (its column names are read for full joins).
 * @param onL Left join keys.
 * @param onR Right join keys.
 * @param options Join options; only `left` and `right` are read.
 * @return A two-element [leftValues, rightValues] array.
 */
function inferValues(tableL, onL, onR, options) {
  const common = [];
  for (let i = 0; i < onL.length; ++i) {
    const key = onL[i];
    if (isString(key) && key === onR[i]) {
      common.push(key);
    }
  }
  const withoutShared = not(common);

  if (!(options.left && options.right)) {
    // drop the shared keys from whichever side is not being preserved
    return options.right ? [withoutShared, all()] : [all(), withoutShared];
  }

  // full join: merge each shared key column, preferring the left value and
  // falling back to the right one when the left is null or undefined
  const shared = new Set(common);
  const mergeOrKeep = name => {
    const ref = `[${toString(name)}]`;
    if (!shared.has(name)) {
      return name;
    }
    return { [name]: `(a, b) => a${ref} == null ? b${ref} : a${ref}` };
  };
  return [tableL.columnNames().map(mergeOrKeep), withoutShared];
}
/**
 * Parse the output value specification of a join into parallel lists of
 * output names and expressions.
 *
 * When `values` is an array, its entries are read positionally:
 *   values[0] - values taken from the left table (via parseValue),
 *   values[1] - values taken from the right table (via parseValue with OPT_R),
 *   values[2] - additional expressions (via parse).
 * An entry is only parsed when it is actually present in the array. A
 * missing entry contributes no columns. Any other `values` is handed to
 * parse() as a whole.
 *
 * Names produced by both the left and right entries are disambiguated by
 * appending a suffix on each side ('_1' / '_2' by default). An empty-string
 * suffix leaves that side's names untouched.
 *
 * @param tableL The left table.
 * @param tableR The right table.
 * @param values The output value specification.
 * @param optParse Options forwarded to parseValue / parse.
 * @param suffix Optional [leftSuffix, rightSuffix] pair.
 * @return For array input, an object with `names` and `exprs` arrays;
 *  otherwise whatever parse() returns.
 */
function parseValues(tableL, tableR, values, optParse, suffix = []) {
  if (!isArray(values)) {
    return parse(values, optParse);
  }

  // Compare against the length explicitly. A chain of `if (n--)` tests
  // drives the counter negative (and therefore truthy) for arrays shorter
  // than two entries, which would parse `undefined` entries.
  const n = values.length;
  const none = () => ({ names: [], exprs: [] });
  const vL = n > 0 ? parseValue('join', tableL, values[0], optParse) : none();
  const vR = n > 1 ? parseValue('join', tableR, values[1], OPT_R) : none();
  const vJ = n > 2 ? parse(values[2], optParse) : none();

  // handle name collisions between left and right outputs
  const namesL = new Set(vL.names);
  const rename = new Set();
  for (const name of vR.names) {
    if (namesL.has(name)) {
      rename.add(name);
    }
  }
  if (rename.size) {
    // a default parameter does not cover an explicit null
    const sfx = suffix || [];
    if (sfx[0] !== '') {
      rekey(vL.names, rename, sfx[0] || '_1');
    }
    if (sfx[1] !== '') {
      rekey(vR.names, rename, sfx[1] || '_2');
    }
  }

  return {
    names: vL.names.concat(vR.names, vJ.names),
    exprs: vL.exprs.concat(vR.exprs, vJ.exprs)
  };
}
/**
 * Append a suffix, in place, to every entry of `names` that is contained
 * in the `rename` set. Other entries are left as they are.
 * @param names Array of names, mutated in place.
 * @param rename Set of names that need the suffix.
 * @param suffix The suffix to append.
 */
function rekey(names, rename, suffix) {
  names.forEach((current, position, list) => {
    if (rename.has(current)) {
      list[position] = current + suffix;
    }
  });
}
/**
 * Build the row emitter for a join via unroll().
 * The generated body contains one statement per output column, of the form
 * `_k.push($k(i,a,j,b));`. Presumably unroll binds `_k` / `$k` to
 * columns[k] / getters[k]; confirm against unroll.
 * @param columns Output column arrays, one per output name.
 * @param getters Value expressions, aligned with `columns`.
 * @return Whatever unroll() produces. _join calls it as
 *  emit(leftRow, leftData, rightRow, rightData).
 */
function emitter(columns, getters) {
  const params = ['i', 'a', 'j', 'b'];
  // interpolating the array yields the comma-separated list "i,a,j,b"
  const pushColumn = (_, k) => `_${k}.push($${k}(${params}));`;
  const body = '{' + concat(columns, pushColumn) + '}';
  return unroll(params, body, columns, getters);
}
/**
 * Core join routine: pairs up rows of two tables and materializes the
 * output columns.
 * @param tableL The left table.
 * @param tableR The right table.
 * @param predicate Either an array (handled by hashJoin) or a function
 *  (handled by loopJoin, called with leftRow, leftData, rightRow, rightData).
 * @param values Object with parallel `names` (output column names) and
 *  `exprs` (value getters) arrays.
 * @param options Join options. A truthy `left` / `right` additionally emits
 *  the unmatched rows of that table, with NONE as the other side's row index.
 * @return The result of `cols.new(tableL)` for the populated column set.
 */
export function _join(tableL, tableR, predicate, { names, exprs }, options = {}) {
  // left side: data, row indices, and per-row "was matched" flags
  const dataL = tableL.data();
  const idxL = tableL.indices(false);
  const nL = idxL.length;
  const hitL = new Int32Array(nL);

  // right side: same layout
  const dataR = tableR.data();
  const idxR = tableR.indices(false);
  const nR = idxR.length;
  const hitR = new Int32Array(nR);

  // one empty output column and one getter per output name
  const ncols = names.length;
  const cols = columnSet();
  const columns = [];
  const getters = [];
  for (let c = 0; c < ncols; ++c) {
    columns.push(cols.add(names[c], []));
    getters.push(exprs[c]);
  }
  const emit = emitter(columns, getters);

  // array predicates take the hash join path, functions the nested loop
  const runJoin = isArray(predicate) ? hashJoin : loopJoin;
  runJoin(emit, predicate, dataL, dataR, idxL, idxR, hitL, hitR, nL, nR);

  // outer-join handling: emit left rows that never matched
  if (options.left) {
    for (let i = 0; i < nL; ++i) {
      if (hitL[i]) continue;
      emit(idxL[i], dataL, NONE, dataR);
    }
  }

  // outer-join handling: emit right rows that never matched
  if (options.right) {
    for (let j = 0; j < nR; ++j) {
      if (hitR[j]) continue;
      emit(NONE, dataL, idxR[j], dataR);
    }
  }

  return cols.new(tableL);
}
/**
 * Nested-loops join: tests every (left, right) row pair against the
 * predicate and emits the pairs that pass.
 * @param emit Called as emit(leftRow, leftData, rightRow, rightData) per match.
 * @param predicate Called with the same four arguments; a truthy result is a match.
 * @param dataL Left table data.
 * @param dataR Right table data.
 * @param idxL Left row indices.
 * @param idxR Right row indices.
 * @param hitL Per-position match flags for idxL; set to 1 on a match.
 * @param hitR Per-position match flags for idxR; set to 1 on a match.
 * @param nL Number of left rows to visit.
 * @param nR Number of right rows to visit.
 */
function loopJoin(emit, predicate, dataL, dataR, idxL, idxR, hitL, hitR, nL, nR) {
  for (let l = 0; l < nL; ++l) {
    const left = idxL[l];
    for (let r = 0; r < nR; ++r) {
      const right = idxR[r];
      if (!predicate(left, dataL, right, dataR)) {
        continue;
      }
      emit(left, dataL, right, dataR);
      // flags are keyed by position in the index arrays, not by row id
      hitL[l] = 1;
      hitR[r] = 1;
    }
  }
}
/**
 * Hash join on key functions: builds a lookup over one table with
 * indexLookup(), then scans the other table and probes the lookup.
 * The table with fewer rows is hashed; on a tie the right table is hashed.
 * @param emit Called as emit(leftRow, leftData, rightRow, rightData) per match.
 * @param keys A [keyL, keyR] pair of key functions, each called as key(row, data).
 * @param dataL Left table data.
 * @param dataR Right table data.
 * @param idxL Left row indices.
 * @param idxR Right row indices.
 * @param hitL Per-position match flags for idxL; set to 1 on a match.
 * @param hitR Per-position match flags for idxR; set to 1 on a match.
 * @param nL Number of left rows.
 * @param nR Number of right rows.
 */
function hashJoin(emit, [keyL, keyR], dataL, dataR, idxL, idxR, hitL, hitR, nL, nR) {
  // bundle each side so the scan/hash roles can be assigned in one step
  const sideL = [dataL, keyL, hitL, idxL];
  const sideR = [dataR, keyR, hitR, idxR];
  const scanLeft = nL >= nR;
  const [dataScan, keyScan, hitScan, idxScan] = scanLeft ? sideL : sideR;
  const [dataHash, keyHash, hitHash, idxHash] = scanLeft ? sideR : sideL;

  // when the right table is scanned, swap arguments back into
  // (left, right) order before emitting
  const emitScan = scanLeft ? emit : (i, a, j, b) => emit(j, b, i, a);

  // build the lookup table over the hashed side
  const lut = indexLookup(idxHash, dataHash, keyHash);

  // probe the lookup with every row of the scanned side
  const total = idxScan.length;
  for (let s = 0; s < total; ++s) {
    const rowScan = idxScan[s];
    const matches = lut.get(keyScan(rowScan, dataScan));
    if (!matches) {
      continue;
    }
    // entries are positions into idxHash, which also key the hit flags
    const count = matches.length;
    for (let k = 0; k < count; ++k) {
      const pos = matches[k];
      emitScan(rowScan, dataScan, idxHash[pos], dataHash);
      hitHash[pos] = 1;
    }
    hitScan[s] = 1;
  }
}