/*
 * vega-transforms
 * Version: (not recorded)
 * Data processing transforms for Vega dataflows.
 * 1,875 lines (1,824 loc) • 102 kB
 * JavaScript
 */
import { extend, identity, field, hasOwnProperty, extentIndex, inherits, array, accessorName, error, accessorFields, accessor, toNumber, merge, compare, truthy, extent, span, fastmap, isArray, key, ascending, peek, zero, constant } from 'vega-util';
import { tupleid, Transform, replace, ingest, stableCompare, Operator, derive, rederive } from 'vega-dataflow';
import { quartiles, bootstrapCI, bin, randomUniform, randomLogNormal, randomNormal, randomMixture, randomKDE, sampleCurve, dotbin, quantiles, random } from 'vega-statistics';
import { max, min, mean, median, range, bisector } from 'd3-array';
import { TIME_UNITS, timeBin, timeUnits, utcFloor, timeFloor, utcInterval, timeInterval } from 'vega-time';
/**
 * Builds a composite key function from a list of accessors.
 * The first accessor's result is converted with String(); every further
 * result is appended after a '|' separator.
 * @param {Array<function(object): *>} f - Accessors. f[0] is invoked
 *   unconditionally, so the array must not be empty.
 * @return {function(object): string} Function mapping a datum to its key.
 */
function multikey(f) {
  return function (x) {
    const count = f.length;
    let joined = String(f[0](x));
    let index = 1;
    while (index < count) {
      joined += '|' + f[index](x);
      index += 1;
    }
    return joined;
  };
}
/**
 * Selects a key function for a set of group-by accessors.
 * @param {Array<function(object): *>} [fields] - Group-by accessors.
 * @return {function(object): *} A function returning '' when there are no
 *   fields, the sole accessor itself when there is exactly one, and a
 *   multikey() composite otherwise.
 */
function groupkey(fields) {
  if (!fields || !fields.length) {
    return function () {
      return '';
    };
  }
  if (fields.length === 1) {
    return fields[0];
  }
  return multikey(fields);
}
/**
 * Determines the output field name for an aggregate measure.
 * @param {string} op - The aggregate operation name.
 * @param {string} [field] - The name of the aggregated field.
 * @param {string} [as] - An explicit output name; used as-is when truthy.
 * @return {string} `as` if provided, else `op` or `op_field`.
 */
function measureName(op, field, as) {
  if (as) {
    return as;
  }
  const suffix = field ? '_' + field : '';
  return op + suffix;
}
// Shared do-nothing callback used for aggregate hooks an op does not define.
const noop = function () {};

// Default hooks and sort index that measure() layers beneath each op definition.
const base_op = { init: noop, add: noop, rem: noop, idx: 0 };
/**
 * Aggregate operation definitions, keyed by op name. Each entry may supply:
 *  - init(m[, param]): set up state on the measure instance `m`
 *  - add(m, v, t) / rem(m, v, t): update state for an added / removed value
 *  - value(m): compute the current output value
 *  - req: names of other ops whose state this op reads
 *  - idx: sort index; ops run in ascending idx order
 * `m.valid` / `m.missing` are maintained by the measure instance itself and
 * `m.cell` is the aggregation cell (with `num`, and `data` when stored).
 */
const AggregateOps = {
  values: {
    init(m) { return (m.cell.store = true); },
    value(m) { return m.cell.data.values(); },
    idx: -1
  },
  count: {
    value(m) { return m.cell.num; }
  },
  __count__: {
    value(m) { return m.missing + m.valid; }
  },
  missing: {
    value(m) { return m.missing; }
  },
  valid: {
    value(m) { return m.valid; }
  },
  sum: {
    init(m) { return (m.sum = 0); },
    value(m) { return m.valid ? m.sum : undefined; },
    add(m, v) { return (m.sum += +v); },
    rem(m, v) { return (m.sum -= v); }
  },
  product: {
    init(m) { return (m.product = 1); },
    value(m) { return m.valid ? m.product : undefined; },
    add(m, v) { return (m.product *= v); },
    rem(m, v) { return (m.product /= v); }
  },
  mean: {
    init(m) { return (m.mean = 0); },
    value(m) { return m.valid ? m.mean : undefined; },
    add(m, v) {
      // mean_d is kept on m because variance reads it afterwards
      m.mean_d = v - m.mean;
      return (m.mean += m.mean_d / m.valid);
    },
    rem(m, v) {
      m.mean_d = v - m.mean;
      return (m.mean -= m.valid ? m.mean_d / m.valid : m.mean);
    }
  },
  average: {
    value(m) { return m.valid ? m.mean : undefined; },
    req: ['mean'],
    idx: 1
  },
  variance: {
    init(m) { return (m.dev = 0); },
    value(m) { return m.valid > 1 ? m.dev / (m.valid - 1) : undefined; },
    add(m, v) { return (m.dev += m.mean_d * (v - m.mean)); },
    rem(m, v) { return (m.dev -= m.mean_d * (v - m.mean)); },
    req: ['mean'],
    idx: 1
  },
  variancep: {
    value(m) { return m.valid > 1 ? m.dev / m.valid : undefined; },
    req: ['variance'],
    idx: 2
  },
  stdev: {
    value(m) { return m.valid > 1 ? Math.sqrt(m.dev / (m.valid - 1)) : undefined; },
    req: ['variance'],
    idx: 2
  },
  stdevp: {
    value(m) { return m.valid > 1 ? Math.sqrt(m.dev / m.valid) : undefined; },
    req: ['variance'],
    idx: 2
  },
  stderr: {
    value(m) { return m.valid > 1 ? Math.sqrt(m.dev / (m.valid * (m.valid - 1))) : undefined; },
    req: ['variance'],
    idx: 2
  },
  distinct: {
    value(m) { return m.cell.data.distinct(m.get); },
    req: ['values'],
    idx: 3
  },
  ci0: {
    value(m) { return m.cell.data.ci0(m.get); },
    req: ['values'],
    idx: 3
  },
  ci1: {
    value(m) { return m.cell.data.ci1(m.get); },
    req: ['values'],
    idx: 3
  },
  median: {
    value(m) { return m.cell.data.q2(m.get); },
    req: ['values'],
    idx: 3
  },
  q1: {
    value(m) { return m.cell.data.q1(m.get); },
    req: ['values'],
    idx: 3
  },
  q3: {
    value(m) { return m.cell.data.q3(m.get); },
    req: ['values'],
    idx: 3
  },
  min: {
    init(m) { return (m.min = undefined); },
    value(m) {
      // NaN marks a removed minimum: recompute from the stored tuples
      if (Number.isNaN(m.min)) {
        m.min = m.cell.data.min(m.get);
      }
      return m.min;
    },
    add(m, v) {
      if (v < m.min || m.min === undefined) m.min = v;
    },
    rem(m, v) {
      if (v <= m.min) m.min = NaN;
    },
    req: ['values'],
    idx: 4
  },
  max: {
    init(m) { return (m.max = undefined); },
    value(m) {
      // NaN marks a removed maximum: recompute from the stored tuples
      if (Number.isNaN(m.max)) {
        m.max = m.cell.data.max(m.get);
      }
      return m.max;
    },
    add(m, v) {
      if (v > m.max || m.max === undefined) m.max = v;
    },
    rem(m, v) {
      if (v >= m.max) m.max = NaN;
    },
    req: ['values'],
    idx: 4
  },
  argmin: {
    init(m) { return (m.argmin = undefined); },
    value(m) { return m.argmin || m.cell.data.argmin(m.get); },
    add(m, v, t) {
      if (v < m.min) m.argmin = t;
    },
    rem(m, v) {
      if (v <= m.min) m.argmin = undefined;
    },
    req: ['min', 'values'],
    idx: 3
  },
  argmax: {
    init(m) { return (m.argmax = undefined); },
    value(m) { return m.argmax || m.cell.data.argmax(m.get); },
    add(m, v, t) {
      if (v > m.max) m.argmax = t;
    },
    rem(m, v) {
      if (v >= m.max) m.argmax = undefined;
    },
    req: ['max', 'values'],
    idx: 3
  },
  exponential: {
    init(m, r) {
      m.exp = 0;
      m.exp_r = r;
    },
    value(m) { return m.valid ? m.exp * (1 - m.exp_r) / (1 - m.exp_r ** m.valid) : undefined; },
    add(m, v) { return (m.exp = m.exp_r * m.exp + v); },
    rem(m, v) { return (m.exp = (m.exp - v / m.exp_r ** (m.valid - 1)) / m.exp_r); }
  },
  exponentialb: {
    value(m) { return m.valid ? m.exp * (1 - m.exp_r) : undefined; },
    req: ['exponential'],
    idx: 1
  }
};

// Op names exposed as valid `ops` values; the internal '__count__' is excluded.
const ValidAggregateOps = Object.keys(AggregateOps).filter(opName => opName !== '__count__');
/**
 * Wraps an op definition in a factory that creates measure descriptors.
 * @param {string} key - The op name.
 * @param {object} value - The op definition (hooks, req, idx).
 * @return {function(string=, *=): object} Factory taking an output field
 *   name (defaults to `key`) and an optional aggregate parameter. The result
 *   layers `value` over `base_op` over the name/param/out header.
 */
function measure(key, value) {
  return function (out, aggregate_param) {
    const header = {
      name: key,
      aggregate_param: aggregate_param,
      out: out || key
    };
    return extend(header, base_op, value);
  };
}
// Replace every op definition (the public ops plus the internal '__count__')
// with its measure factory, so AggregateOps[name](out, param) yields a descriptor.
for (const opName of ValidAggregateOps.concat('__count__')) {
  AggregateOps[opName] = measure(opName, AggregateOps[opName]);
}
/**
 * Instantiates a measure descriptor for a named aggregate op.
 * @param {string} op - The aggregate op name (a key of AggregateOps).
 * @param {*} param - Optional aggregate parameter handed to the factory.
 * @param {string} name - The output field name.
 * @return {object} The descriptor returned by the op's factory.
 */
function createMeasure(op, param, name) {
  const registry = AggregateOps;
  return registry[op](name, param);
}
/**
 * Comparator ordering measure descriptors by ascending `idx`.
 * @param {{idx: number}} a
 * @param {{idx: number}} b
 * @return {number} Negative, zero or positive, as for Array.prototype.sort.
 */
function compareIndex(a, b) {
  const { idx: left } = a;
  const { idx: right } = b;
  return left - right;
}
/**
 * Expands a list of measure descriptors with every op they transitively
 * require (via `req`), then orders the result by `idx`.
 * @param {Array<object>} agg - The requested measure descriptors.
 * @return {Array<object>} Requested plus required descriptors, sorted with
 *   compareIndex. Required ops are created with no output name or parameter.
 */
function resolve(agg) {
  const map = {};
  for (const requested of agg) {
    map[requested.name] = requested;
  }

  // depth-first: register a missing requirement, then its own requirements
  function addRequirements(op) {
    if (!op.req) return;
    for (const key of op.req) {
      if (map[key]) continue;
      const dependency = AggregateOps[key]();
      map[key] = dependency;
      addRequirements(dependency);
    }
  }

  for (const requested of agg) {
    addRequirements(requested);
  }
  return Object.values(map).sort(compareIndex);
}
/**
 * Measure-instance method: resets the valid/missing counters and runs every
 * op's init hook. The aggregate parameter is passed as a second argument
 * only when it is neither null nor undefined.
 */
function init() {
  this.valid = 0;
  this.missing = 0;
  for (const op of this._ops) {
    if (op.aggregate_param == null) {
      op.init(this);
    } else {
      op.init(this, op.aggregate_param);
    }
  }
}
/**
 * Measure-instance method: accounts for an added value.
 * null, undefined and '' count as missing; NaN is ignored entirely; any
 * other value increments `valid` and is forwarded to every op's add hook.
 * @param {*} v - The field value.
 * @param {object} t - The source tuple.
 */
function add(v, t) {
  const isMissing = v == null || v === '';
  if (isMissing) {
    this.missing++;
    return;
  }
  if (Number.isNaN(v)) {
    return;
  }
  this.valid++;
  for (const op of this._ops) {
    op.add(this, v, t);
  }
}
/**
 * Measure-instance method: accounts for a removed value (mirror of add).
 * null, undefined and '' decrement `missing`; NaN is ignored; any other
 * value decrements `valid` and is forwarded to every op's rem hook.
 * @param {*} v - The field value.
 * @param {object} t - The source tuple.
 */
function rem(v, t) {
  const isMissing = v == null || v === '';
  if (isMissing) {
    this.missing--;
    return;
  }
  if (Number.isNaN(v)) {
    return;
  }
  this.valid--;
  for (const op of this._ops) {
    op.rem(this, v, t);
  }
}
/**
 * Measure-instance method: writes each requested op's current value onto a
 * tuple under the op's output name.
 * @param {object} t - The output tuple to update.
 * @return {object} The same tuple.
 */
function set(t) {
  for (const op of this._out) {
    t[op.out] = op.value(this);
  }
  return t;
}
/**
 * Compiles the measures requested for one field into a constructor whose
 * instances track aggregate state for a single cell.
 * @param {Array<object>} agg - Requested measure descriptors (not mutated).
 * @param {function(object): *} [field] - Field accessor; defaults to identity.
 * @return {function} Constructor taking a cell. Instances expose
 *   init/add/rem/set/get; the constructor carries a `fields` array with the
 *   output names in request order.
 */
function compileMeasures(agg, field) {
  const accessorFn = field || identity;
  const allOps = resolve(agg);                       // requested + required ops
  const outputs = agg.slice().sort(compareIndex);    // requested ops only

  function ctr(cell) {
    this._ops = allOps;
    this._out = outputs;
    this.cell = cell;
    this.init();
  }

  Object.assign(ctr.prototype, {
    init: init,
    add: add,
    rem: rem,
    set: set,
    get: accessorFn
  });
  ctr.fields = agg.map(op => op.out);
  return ctr;
}
/**
 * Stores the tuples of one aggregation cell and lazily computes statistics
 * (extent, quartiles, bootstrapped confidence interval) over them.
 *
 * Additions and removals are buffered; values() consolidates them and
 * invalidates all memoized statistics. Each statistic is memoized together
 * with the accessor it was computed for, so interleaved requests for
 * different fields (e.g. extent of field a, quartiles of field b, extent of
 * field b) never return a result computed for another field.
 *
 * @constructor
 * @param {string} [key] - Name of a field that uniquely identifies tuples.
 *   When omitted, tuples are identified by tupleid.
 */
function TupleStore(key) {
  this._key = key ? field(key) : tupleid;
  this.reset();
}
const prototype$1 = TupleStore.prototype;

/** Empties the store and drops every memoized statistic. */
prototype$1.reset = function () {
  this._add = [];
  this._rem = [];
  this._invalidate();
};

/** Drops memoized statistics (internal). */
prototype$1._invalidate = function () {
  this._ext = this._extGet = null;
  this._q = this._qGet = null;
  this._ci = this._ciGet = null;
};

/** Buffers a tuple as added. */
prototype$1.add = function (v) {
  this._add.push(v);
};

/** Buffers a tuple as removed; it is dropped on the next consolidation. */
prototype$1.rem = function (v) {
  this._rem.push(v);
};

/**
 * Applies pending removals and returns the current tuples, without touching
 * memoized statistics (internal; used by the statistics methods so that one
 * statistic does not evict another).
 * @return {Array<object>} The current tuples.
 */
prototype$1._consolidate = function () {
  if (this._rem.length === 0) return this._add;
  const key = this._key,
    removed = Object.create(null),
    kept = []; // grown on demand: unmatched removals must not produce a negative length
  // use unique key field to clear removed values
  for (const r of this._rem) {
    removed[key(r)] = 1;
  }
  for (const v of this._add) {
    const k = key(v);
    if (removed[k]) {
      // each removed key cancels a single stored occurrence
      removed[k] = 0;
    } else {
      kept.push(v);
    }
  }
  this._rem = [];
  return this._add = kept;
};

/**
 * Consolidates pending changes and returns the current tuples. Callers use
 * this as the invalidation point after a batch of add/rem calls: all
 * memoized statistics are discarded.
 * @return {Array<object>} The current tuples.
 */
prototype$1.values = function () {
  this._invalidate();
  return this._consolidate();
};

// memoizing statistics methods

/** Counts the distinct stringified values produced by `get`. */
prototype$1.distinct = function (get) {
  const v = this._consolidate(),
    map = {};
  let n = v.length,
    count = 0,
    s;
  while (--n >= 0) {
    s = get(v[n]) + '';
    if (!hasOwnProperty(map, s)) {
      map[s] = 1;
      ++count;
    }
  }
  return count;
};

/** Returns [minTuple, maxTuple] for accessor `get` (memoized per accessor). */
prototype$1.extent = function (get) {
  if (this._extGet !== get || !this._ext) {
    const v = this._consolidate(),
      i = extentIndex(v, get);
    this._ext = [v[i[0]], v[i[1]]];
    this._extGet = get;
  }
  return this._ext;
};
prototype$1.argmin = function (get) {
  return this.extent(get)[0] || {};
};
prototype$1.argmax = function (get) {
  return this.extent(get)[1] || {};
};
prototype$1.min = function (get) {
  const m = this.extent(get)[0];
  return m != null ? get(m) : undefined;
};
prototype$1.max = function (get) {
  const m = this.extent(get)[1];
  return m != null ? get(m) : undefined;
};

/** Returns the quartiles() result for accessor `get` (memoized per accessor). */
prototype$1.quartile = function (get) {
  if (this._qGet !== get || !this._q) {
    this._q = quartiles(this._consolidate(), get);
    this._qGet = get;
  }
  return this._q;
};
prototype$1.q1 = function (get) {
  return this.quartile(get)[0];
};
prototype$1.q2 = function (get) {
  return this.quartile(get)[1];
};
prototype$1.q3 = function (get) {
  return this.quartile(get)[2];
};

/**
 * Returns the bootstrapCI() result (1000 samples, alpha 0.05) for accessor
 * `get`. Memoized per accessor so ci0 and ci1 read the same bootstrap run.
 */
prototype$1.ci = function (get) {
  if (this._ciGet !== get || !this._ci) {
    this._ci = bootstrapCI(this._consolidate(), 1000, 0.05, get);
    this._ciGet = get;
  }
  return this._ci;
};
prototype$1.ci0 = function (get) {
  return this.ci(get)[0];
};
prototype$1.ci1 = function (get) {
  return this.ci(get)[1];
};
/**
 * Group-by aggregation operator.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {Array<function(object): *>} [params.groupby] - Accessors to group by.
 * @param {Array<function(object): *>} [params.fields] - Accessors to aggregate.
 * @param {Array<string>} [params.ops] - Names of the aggregation operations.
 * @param {Array<number>} [params.aggregate_params] - Optional parameters for
 *   the aggregation operations.
 * @param {Array<string>} [params.as] - Output field names for aggregated values.
 * @param {boolean} [params.cross=false] - If true, the full cross-product of
 *   groupby values is generated, including empty cells; the drop parameter is
 *   then ignored and empty cells are retained.
 * @param {boolean} [params.drop=true] - Whether empty cells should be removed.
 */
function Aggregate(params) {
  Transform.apply(this, [null, params]);
  Object.assign(this, {
    _adds: [],          // added output tuples
    _mods: [],          // modified output tuples
    _alen: 0,           // number of active added tuples
    _mlen: 0,           // number of active modified tuples
    _drop: true,        // remove empty aggregation cells?
    _cross: false,      // produce full cross-product of group-by values
    _dims: [],          // group-by dimension accessors
    _dnames: [],        // group-by dimension names
    _measures: [],      // collection of aggregation monoids
    _countOnly: false,  // true when only count aggregation is requested
    _counts: null,      // collection of count fields
    _prev: null,        // previous aggregation cells
    _inputs: null,      // dependent input tuple field names
    _outputs: null      // output tuple field names
  });
}
// Declarative description of the Aggregate transform and its parameters.
Aggregate.Definition = {
  type: 'Aggregate',
  metadata: { generates: true, changes: true },
  params: [
    { name: 'groupby', type: 'field', array: true },
    { name: 'ops', type: 'enum', array: true, values: ValidAggregateOps },
    { name: 'aggregate_params', type: 'number', null: true, array: true },
    { name: 'fields', type: 'field', null: true, array: true },
    { name: 'as', type: 'string', null: true, array: true },
    { name: 'drop', type: 'boolean', default: true },
    { name: 'cross', type: 'boolean', default: false },
    { name: 'key', type: 'field' }
  ]
};
// Prototype methods of the Aggregate transform. `this.value` holds the
// aggregation cells, keyed by the group key string.
inherits(Aggregate, Transform, {
// Processes one pulse: updates the cells from the input tuples and returns
// a forked pulse carrying the added / removed / modified output tuples.
transform(_, pulse) {
const aggr = this,
out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS),
mod = _.modified();
aggr.stamp = out.stamp;
// Cells already exist and either a parameter or an input field changed:
// keep the old cells in _prev, start over, and re-add every source tuple.
// (init is only re-run when a parameter changed.)
if (aggr.value && (mod || pulse.modified(aggr._inputs, true))) {
aggr._prev = aggr.value;
aggr.value = mod ? aggr.init(_) : Object.create(null);
pulse.visit(pulse.SOURCE, t => aggr.add(t));
} else {
// Otherwise update incrementally: removals first, then additions.
aggr.value = aggr.value || aggr.init(_);
pulse.visit(pulse.REM, t => aggr.rem(t));
pulse.visit(pulse.ADD, t => aggr.add(t));
}
// Indicate output fields and return aggregate tuples.
out.modifies(aggr._outputs);
// Should empty cells be dropped?
aggr._drop = _.drop !== false;
// If domain cross-product requested, generate empty cells as needed
// and ensure that empty cells are not dropped
if (_.cross && aggr._dims.length > 1) {
aggr._drop = false;
aggr.cross();
}
// When the pulse requests cleaning and empty cells are dropped, purge
// them from this.value after the pulse has run.
if (pulse.clean() && aggr._drop) {
out.clean(true).runAfter(() => this.clean());
}
return aggr.changes(out);
},
// Creates a cell for every combination of observed group-by values that
// does not have one yet. Values are gathered from both the previous and the
// current cells; combination keys are joined with '|'.
cross() {
const aggr = this,
curr = aggr.value,
dims = aggr._dnames,
vals = dims.map(() => ({})),
n = dims.length;
// collect all group-by domain values
function collect(cells) {
let key, i, t, v;
for (key in cells) {
t = cells[key].tuple;
for (i = 0; i < n; ++i) {
vals[i][v = t[dims[i]]] = v;
}
}
}
collect(aggr._prev);
collect(curr);
// iterate over key cross-product, create cells as needed
function generate(base, tuple, index) {
const name = dims[index],
v = vals[index++];
for (const k in v) {
const key = base ? base + '|' + k : k;
tuple[name] = v[k];
if (index < n) generate(key, tuple, index);else if (!curr[key]) aggr.cell(key, tuple);
}
}
generate('', {}, 0);
},
// (Re)configures the operator from its parameters: input/output field
// lists, group-by dimensions, the cell key function and the compiled
// measures. Returns a fresh, empty cell map.
init(_) {
// initialize input and output fields
const inputs = this._inputs = [],
outputs = this._outputs = [],
inputMap = {};
// record each field an accessor depends on, once
function inputVisit(get) {
const fields = array(accessorFields(get)),
n = fields.length;
let i = 0,
f;
for (; i < n; ++i) {
if (!inputMap[f = fields[i]]) {
inputMap[f] = 1;
inputs.push(f);
}
}
}
// initialize group-by dimensions
this._dims = array(_.groupby);
this._dnames = this._dims.map(d => {
const dname = accessorName(d);
inputVisit(d);
outputs.push(dname);
return dname;
});
// an explicit key accessor overrides the key derived from the dimensions
this.cellkey = _.key ? _.key : groupkey(this._dims);
// initialize aggregate measures
this._countOnly = true;
this._counts = [];
this._measures = [];
const fields = _.fields || [null],
ops = _.ops || ['count'],
aggregate_params = _.aggregate_params || [null],
as = _.as || [],
n = fields.length,
map = {};
let field, op, aggregate_param, m, mname, outname, i;
if (n !== ops.length) {
error('Unmatched number of fields and aggregate ops.');
}
for (i = 0; i < n; ++i) {
field = fields[i];
op = ops[i];
// NOTE(review): `||` also maps a parameter of 0 to null — confirm that
// 0 is never a meaningful aggregate parameter.
aggregate_param = aggregate_params[i] || null;
if (field == null && op !== 'count') {
error('Null aggregate field specified.');
}
mname = accessorName(field);
outname = measureName(op, mname, as[i]);
outputs.push(outname);
// counts are served from cell.num and need no measure instance
if (op === 'count') {
this._counts.push(outname);
continue;
}
// measures are grouped per field name so each field is read once per tuple
m = map[mname];
if (!m) {
inputVisit(field);
m = map[mname] = [];
m.field = field;
this._measures.push(m);
}
if (op !== 'count') this._countOnly = false;
m.push(createMeasure(op, aggregate_param, outname));
}
this._measures = this._measures.map(m => compileMeasures(m, m.field));
return Object.create(null); // aggregation cells (this.value)
},
// -- Cell Management -----
// default key function; replaced per instance by init()
cellkey: groupkey(),
// Returns the cell for a key, creating it if needed, and registers it at
// most once per stamp as added or modified for the current pulse.
cell(key, t) {
let cell = this.value[key];
if (!cell) {
cell = this.value[key] = this.newcell(key, t);
this._adds[this._alen++] = cell;
} else if (cell.num === 0 && this._drop && cell.stamp < this.stamp) {
// an empty cell that is being dropped is reported as an add when touched again
cell.stamp = this.stamp;
this._adds[this._alen++] = cell;
} else if (cell.stamp < this.stamp) {
cell.stamp = this.stamp;
this._mods[this._mlen++] = cell;
}
return cell;
},
// Builds a new cell: output tuple, one measure instance per aggregated
// field (unless count-only) and, if any measure set cell.store during
// construction, a TupleStore for the raw tuples.
newcell(key, t) {
const cell = {
key: key,
num: 0,
agg: null,
tuple: this.newtuple(t, this._prev && this._prev[key]),
stamp: this.stamp,
store: false
};
if (!this._countOnly) {
const measures = this._measures,
n = measures.length;
cell.agg = Array(n);
for (let i = 0; i < n; ++i) {
cell.agg[i] = new measures[i](cell);
}
}
if (cell.store) {
cell.data = new TupleStore();
}
return cell;
},
// Creates the output tuple holding the group-by values of t. If a previous
// cell p exists for the same key, its tuple is passed to replace();
// otherwise the new object is ingested.
newtuple(t, p) {
const names = this._dnames,
dims = this._dims,
n = dims.length,
x = {};
for (let i = 0; i < n; ++i) {
x[names[i]] = dims[i](t);
}
return p ? replace(p.tuple, x) : ingest(x);
},
// Deletes all cells that currently contain no tuples.
clean() {
const cells = this.value;
for (const key in cells) {
if (cells[key].num === 0) {
delete cells[key];
}
}
},
// -- Process Tuples -----
// Adds an input tuple to its cell and to each of the cell's measures.
add(t) {
const key = this.cellkey(t),
cell = this.cell(key, t);
cell.num += 1;
if (this._countOnly) return;
if (cell.store) cell.data.add(t);
const agg = cell.agg;
for (let i = 0, n = agg.length; i < n; ++i) {
agg[i].add(agg[i].get(t), t);
}
},
// Removes an input tuple from its cell and from each of the cell's measures.
rem(t) {
const key = this.cellkey(t),
cell = this.cell(key, t);
cell.num -= 1;
if (this._countOnly) return;
if (cell.store) cell.data.rem(t);
const agg = cell.agg;
for (let i = 0, n = agg.length; i < n; ++i) {
agg[i].rem(agg[i].get(t), t);
}
},
// Writes the current count and measure values onto the cell's output tuple
// and returns that tuple.
celltuple(cell) {
const tuple = cell.tuple,
counts = this._counts;
// consolidate stored values
if (cell.store) {
cell.data.values();
}
// update tuple properties
for (let i = 0, n = counts.length; i < n; ++i) {
tuple[counts[i]] = cell.num;
}
if (!this._countOnly) {
const agg = cell.agg;
for (let i = 0, n = agg.length; i < n; ++i) {
agg[i].set(tuple);
}
}
return tuple;
},
// Fills the output pulse from the cells registered during this pulse and
// resets the bookkeeping.
changes(out) {
const adds = this._adds,
mods = this._mods,
prev = this._prev,
drop = this._drop,
add = out.add,
rem = out.rem,
mod = out.mod;
let cell, key, i, n;
// after a rebuild, emit removals for the previous cells' tuples
// (empty ones are skipped when dropping)
if (prev) for (key in prev) {
cell = prev[key];
if (!drop || cell.num) rem.push(cell.tuple);
}
for (i = 0, n = this._alen; i < n; ++i) {
add.push(this.celltuple(adds[i]));
adds[i] = null; // for garbage collection
}
for (i = 0, n = this._mlen; i < n; ++i) {
cell = mods[i];
// a modified cell that became empty is reported as removed when dropping
(cell.num === 0 && drop ? rem : mod).push(this.celltuple(cell));
mods[i] = null; // for garbage collection
}
this._alen = this._mlen = 0; // reset list of active cells
this._prev = null;
return out;
}
});
// Small bias added before flooring a bin index, to offset floating point
// error (#1737).
const EPSILON$1 = 1e-14;

/**
 * Generates a binning function for discretizing data.
 * @constructor
 * @param {object} params - The parameters for this operator. The provided
 *   values should be valid options for the {@link bin} function.
 * @param {function(object): *} params.field - The data field to bin.
 */
function Bin(params) {
  Transform.apply(this, [null, params]);
}
// Declarative description of the Bin transform and its parameters.
Bin.Definition = {
  type: 'Bin',
  metadata: { modifies: true },
  params: [
    { name: 'field', type: 'field', required: true },
    { name: 'interval', type: 'boolean', default: true },
    { name: 'anchor', type: 'number' },
    { name: 'maxbins', type: 'number', default: 20 },
    { name: 'base', type: 'number', default: 10 },
    { name: 'divide', type: 'number', array: true, default: [5, 2] },
    { name: 'extent', type: 'number', array: true, length: 2, required: true },
    { name: 'span', type: 'number' },
    { name: 'step', type: 'number' },
    { name: 'steps', type: 'number', array: true },
    { name: 'minstep', type: 'number', default: 0 },
    { name: 'nice', type: 'boolean', default: true },
    { name: 'name', type: 'string' },
    { name: 'as', type: 'string', array: true, length: 2, default: ['bin0', 'bin1'] }
  ]
};
inherits(Bin, Transform, {
  /**
   * Writes bin boundaries onto the visited tuples.
   * With `interval` enabled (anything but false) both the lower (as[0]) and
   * upper (as[1]) boundary are written; otherwise only the lower one.
   * A parameter change reflows all source tuples; otherwise only added
   * tuples are visited, plus modified ones when the binned field changed.
   */
  transform(_, pulse) {
    const band = _.interval !== false;
    const bins = this._bins(_);
    const { start, step } = bins;
    const as = _.as || ['bin0', 'bin1'];
    const b0 = as[0];
    const b1 = as[1];

    let flag;
    if (_.modified()) {
      pulse = pulse.reflow(true);
      flag = pulse.SOURCE;
    } else if (pulse.modified(accessorFields(_.field))) {
      flag = pulse.ADD_MOD;
    } else {
      flag = pulse.ADD;
    }

    const writeLower = t => t[b0] = bins(t);
    const writeBoth = t => {
      const lower = bins(t);
      // minimum bin value (inclusive)
      t[b0] = lower;
      // maximum bin value (exclusive)
      // use convoluted math for better floating point agreement
      // see https://github.com/vega/vega/issues/830
      // infinite values propagate through this formula! #2227
      t[b1] = lower == null ? null : start + step * (1 + (lower - start) / step);
    };

    pulse.visit(flag, band ? writeBoth : writeLower);
    return pulse.modifies(band ? as : b0);
  },

  /**
   * Returns the binning accessor, rebuilding it only when there is none yet
   * or a parameter was modified. The accessor maps a tuple to the start of
   * its bin: null for null values, -Infinity / +Infinity for values below
   * `start` / above `stop`. It carries `start`, `stop` and `step` properties.
   */
  _bins(_) {
    if (this.value && !_.modified()) {
      return this.value;
    }

    const field = _.field;
    const bins = bin(_);
    const step = bins.step;
    let start = bins.start;
    let stop = start + Math.ceil((bins.stop - start) / step) * step;

    // shift the boundaries so that one of them coincides with the anchor
    const anchor = _.anchor;
    if (anchor != null) {
      const shift = anchor - (start + step * Math.floor((anchor - start) / step));
      start += shift;
      stop += shift;
    }

    const f = function (t) {
      const raw = toNumber(field(t));
      if (raw == null) return null;
      if (raw < start) return -Infinity;
      if (raw > stop) return +Infinity;
      // clamp so that a value equal to stop lands in the last bin
      const clamped = Math.max(start, Math.min(raw, stop - step));
      return start + step * Math.floor(EPSILON$1 + (clamped - start) / step);
    };
    f.start = start;
    f.stop = bins.stop;
    f.step = step;

    const fields = accessorFields(field);
    const name = _.name || 'bin_' + accessorName(field);
    return this.value = accessor(f, fields, name);
  }
});
/**
 * Maintains a list of tuples with buffered additions and removals.
 * @param {function(object): *} idFunc - Returns a tuple's unique id.
 * @param {Array<object>} [source] - Initial tuples (defaults to []).
 * @param {Array<object>} [input] - Initial pending additions (defaults to []).
 * @return {object} An object with:
 *   add(t)      - queue a tuple for addition;
 *   remove(t)   - mark a tuple (by id) for removal;
 *   size()      - length of the current data array, pending changes excluded;
 *   data(compare, resort) - apply pending removals, optionally re-sort, merge
 *                 in pending additions (sorted with `compare` when given,
 *                 appended otherwise) and return the data array.
 */
function SortedList (idFunc, source, input) {
  let current = source || [];
  let pending = input || [];
  let removed = {};
  let removedCount = 0;

  return {
    add(t) {
      return pending.push(t);
    },
    remove(t) {
      const id = idFunc(t);
      removedCount += 1;
      removed[id] = removedCount;
      return removedCount;
    },
    size() {
      return current.length;
    },
    data(compare, resort) {
      if (removedCount) {
        current = current.filter(t => !removed[idFunc(t)]);
        removed = {};
        removedCount = 0;
      }
      if (resort && compare) {
        current.sort(compare);
      }
      if (pending.length) {
        if (compare) {
          current = merge(compare, current, pending.sort(compare));
        } else {
          current = current.concat(pending);
        }
        pending = [];
      }
      return current;
    }
  };
}
/**
 * Collects all data tuples that pass through this operator.
 * The operator value starts out as an empty array.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(*,*): number} [params.sort] - An optional comparator
 *   function for additionally sorting the collected tuples.
 */
function Collect(params) {
  Transform.apply(this, [[], params]);
}
// Declarative description of the Collect transform and its parameters.
Collect.Definition = {
  type: 'Collect',
  metadata: { source: true },
  params: [
    { name: 'sort', type: 'compare' }
  ]
};
inherits(Collect, Transform, {
  /**
   * Folds the pulse's added and removed tuples into the collected array and
   * exposes that array as both the operator value and the output source.
   * The array is re-sorted when the pulse carries changes, or when a sort
   * comparator is set and either it or one of its fields was modified.
   */
  transform(_, pulse) {
    const out = pulse.fork(pulse.ALL);
    const pendingAdds = out.materialize(out.ADD).add;
    const list = SortedList(tupleid, this.value, pendingAdds);
    const sort = _.sort;

    let mod = pulse.changed();
    if (!mod) {
      mod = sort && (_.modified('sort') || pulse.modified(sort.fields));
    }

    out.visit(out.REM, list.remove);
    this.modified(mod);

    const collected = list.data(stableCompare(sort), mod);
    out.source = collected;
    this.value = collected;

    // propagate tree root if defined
    const upstream = pulse.source;
    if (upstream && upstream.root) {
      this.value.root = upstream.root;
    }
    return out;
  }
});
/**
 * Generates a comparator function.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {Array<string|function>} params.fields - The fields to compare.
 * @param {Array<string>} [params.orders] - The sort orders.
 *   Each entry should be one of "ascending" (default) or "descending".
 */
function Compare(params) {
  Operator.apply(this, [null, update$5, params]);
}
inherits(Compare, Operator);
/**
 * Update function of the Compare operator: keeps the current comparator
 * unless there is none yet or a parameter was modified, in which case a new
 * one is built from `fields` and `orders`.
 * @param {object} _ - The operator parameters.
 * @return {function} The comparator.
 */
function update$5(_) {
  if (this.value && !_.modified()) {
    return this.value;
  }
  return compare(_.fields, _.orders);
}
/**
 * Count regexp-defined pattern occurrences in a text field.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(object): *} params.field - An accessor for the text field.
 * @param {string} [params.pattern] - RegExp string defining the text pattern.
 * @param {string} [params.case] - One of 'lower', 'upper' or null (mixed) case.
 * @param {string} [params.stopwords] - RegExp string of words to ignore.
 */
function CountPattern(params) {
  Transform.apply(this, [null, params]);
}
// Declarative description of the CountPattern transform and its parameters.
CountPattern.Definition = {
  'type': 'CountPattern',
  'metadata': {
    'generates': true,
    'changes': true
  },
  'params': [{
    'name': 'field',
    'type': 'field',
    'required': true
  }, {
    'name': 'case',
    'type': 'enum',
    'values': ['upper', 'lower', 'mixed'],
    'default': 'mixed'
  }, {
    'name': 'pattern',
    'type': 'string',
    // Must agree with the fallback applied at runtime when no pattern is
    // given: word characters and apostrophes (not double quotes).
    'default': '[\\w\']+'
  }, {
    'name': 'stopwords',
    'type': 'string',
    'default': ''
  }, {
    'name': 'as',
    'type': 'string',
    'array': true,
    'length': 2,
    'default': ['text', 'count']
  }]
};
/**
 * Splits a text value into tokens.
 * @param {*} text - The text to tokenize. null / undefined yield no tokens;
 *   other non-string values are converted with String() instead of raising
 *   a TypeError.
 * @param {string} [tcase] - 'upper' or 'lower' to normalize case first; any
 *   other value leaves the text unchanged.
 * @param {RegExp} match - The token pattern, passed to String.prototype.match.
 * @return {Array<string>|null} The result of `match`, or null when there is
 *   no text or nothing matched.
 */
function tokenize(text, tcase, match) {
  if (text == null) {
    return null;
  }
  let str = typeof text === 'string' ? text : String(text);
  switch (tcase) {
    case 'upper':
      str = str.toUpperCase();
      break;
    case 'lower':
      str = str.toLowerCase();
      break;
  }
  return str.match(match);
}
inherits(CountPattern, Transform, {
  /**
   * Updates the per-token counts from the pulse and emits the resulting
   * output tuples. After a parameter or field change the counts are rebuilt
   * from all source tuples; otherwise added tuples increment and removed
   * tuples decrement the counts.
   */
  transform(_, pulse) {
    const init = this._parameterCheck(_, pulse);
    const counts = this._counts;
    const match = this._match;
    const stop = this._stop;
    const get = _.field;
    const as = _.as || ['text', 'count'];

    // builds a tuple visitor that feeds every non-stopword token to `update`
    const process = update => tuple => {
      const tokens = tokenize(get(tuple), _.case, match) || [];
      for (const token of tokens) {
        if (!stop.test(token)) update(token);
      }
    };
    const add = process(token => {
      counts[token] = 1 + (counts[token] || 0);
    });
    const rem = process(token => {
      counts[token] -= 1;
    });

    if (init) {
      pulse.visit(pulse.SOURCE, add);
    } else {
      pulse.visit(pulse.ADD, add);
      pulse.visit(pulse.REM, rem);
    }
    return this._finish(pulse, as); // generate output tuples
  },

  /**
   * (Re)compiles the stopword and token patterns when they are missing or
   * their parameters changed, and reports whether counting must restart.
   * @return {boolean} true if the counts were reset.
   */
  _parameterCheck(_, pulse) {
    let init = false;

    const stopStale = _.modified('stopwords') || !this._stop;
    if (stopStale) {
      this._stop = new RegExp('^' + (_.stopwords || '') + '$', 'i');
      init = true;
    }

    const matchStale = _.modified('pattern') || !this._match;
    if (matchStale) {
      this._match = new RegExp(_.pattern || '[\\w\']+', 'g');
      init = true;
    }

    const fieldStale = _.modified('field') || pulse.modified(_.field.fields);
    if (fieldStale) {
      init = true;
    }

    if (init) {
      this._counts = {};
    }
    return init;
  },

  /**
   * Reconciles the counts with the output tuples: new tokens are added,
   * tokens whose count reached zero are removed (and forgotten), and tokens
   * whose count changed are marked as modified.
   */
  _finish(pulse, as) {
    const counts = this._counts;
    const tuples = this._tuples || (this._tuples = {});
    const text = as[0];
    const count = as[1];
    const out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);

    for (const word in counts) {
      let tuple = tuples[word];
      const total = counts[word] || 0;
      if (!tuple && total) {
        tuple = ingest({});
        tuples[word] = tuple;
        tuple[text] = word;
        tuple[count] = total;
        out.add.push(tuple);
      } else if (total === 0) {
        if (tuple) out.rem.push(tuple);
        counts[word] = null;
        tuples[word] = null;
      } else if (tuple[count] !== total) {
        tuple[count] = total;
        out.mod.push(tuple);
      }
    }
    return out.modifies(as);
  }
});
/**
 * Perform a cross-product of a tuple stream with itself.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(object):boolean} [params.filter] - An optional filter
 *   function for selectively including tuples in the cross product.
 * @param {Array<string>} [params.as] - The names of the output fields.
 */
function Cross(params) {
  Transform.apply(this, [null, params]);
}
// Declarative description of the Cross transform and its parameters.
Cross.Definition = {
  type: 'Cross',
  metadata: { generates: true },
  params: [
    { name: 'filter', type: 'expr' },
    { name: 'as', type: 'string', array: true, length: 2, default: ['a', 'b'] }
  ]
};
inherits(Cross, Transform, {
  /**
   * Emits the cross-product of the source tuples. The product is rebuilt
   * (old tuples removed, new ones added) when there is no value yet, tuples
   * were added or removed, or `as` / `filter` changed; otherwise the
   * existing product tuples are reported as modified.
   */
  transform(_, pulse) {
    const out = pulse.fork(pulse.NO_SOURCE);
    const as = _.as || ['a', 'b'];
    const left = as[0];
    const right = as[1];
    const reset = !this.value
      || pulse.changed(pulse.ADD_REM)
      || _.modified('as')
      || _.modified('filter');
    const previous = this.value;

    if (!reset) {
      out.mod = previous;
    } else {
      if (previous) out.rem = previous;
      const input = pulse.materialize(pulse.SOURCE).source;
      const product = cross(input, left, right, _.filter || truthy);
      this.value = product;
      out.add = product;
    }

    out.source = this.value;
    return out.modifies(as);
  }
});
/**
 * Computes the filtered cross-product of an array with itself.
 * @param {Array<*>} input - The values to pair up.
 * @param {string} a - Property name for the left member of each pair.
 * @param {string} b - Property name for the right member of each pair.
 * @param {function(object): boolean} filter - Predicate on a candidate pair;
 *   only accepted pairs are ingested and returned.
 * @return {Array<object>} The ingested pairs, in row-major order.
 */
function cross(input, a, b, filter) {
  const pairs = [];
  const size = input.length;
  // one candidate object is reused until the filter accepts it
  let candidate = {};
  for (let i = 0; i < size; ++i) {
    const left = input[i];
    candidate[a] = left;
    for (let j = 0; j < size; ++j) {
      candidate[b] = input[j];
      if (!filter(candidate)) continue;
      pairs.push(ingest(candidate));
      candidate = {};
      candidate[a] = left;
    }
  }
  return pairs;
}
// Distribution constructors, keyed by the `function` name used in a
// distribution parameter object.
const Distributions = {
  kde: randomKDE,
  mixture: randomMixture,
  normal: randomNormal,
  lognormal: randomLogNormal,
  uniform: randomUniform
};

// Property names with special handling when parsing a distribution definition.
const DISTRIBUTIONS = 'distributions';
const FUNCTION = 'function';
const FIELD = 'field';
/**
 * Parse a parameter object for a probability distribution.
 * @param {object} def - The distribution parameter object.
 * @param {function():Array<object>} data - A method for requesting source
 *   data. Used for distributions (such as KDE) that require sample data
 *   points. It is only invoked if the definition has a 'field' but no 'from'
 *   entry. Typically this method returns backing source data for a Pulse.
 * @return {object} - The output distribution object.
 */
function parse(def, data) {
  const func = def[FUNCTION];
  if (!hasOwnProperty(Distributions, func)) {
    error('Unknown distribution function: ' + func);
  }

  const dist = Distributions[func]();
  for (const name in def) {
    switch (name) {
      case FIELD:
        // data field: extract the sample values
        dist.data((def.from || data()).map(def[name]));
        break;
      case DISTRIBUTIONS:
        // distribution mixture: recurse to parse each definition
        dist[name](def[name].map(_ => parse(_, data)));
        break;
      default:
        // otherwise, set the parameter if the distribution has such a setter
        if (typeof dist[name] === FUNCTION) {
          dist[name](def[name]);
        }
    }
  }
  return dist;
}
/**
 * Grid sample points for a probability density. Given a distribution and a
 * sampling extent, will generate points suitable for plotting either PDF
 * (probability density function) or CDF (cumulative distribution function)
 * curves.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {object} params.distribution - The probability distribution. This
 *   is an object parameter dependent on the distribution type.
 * @param {string} [params.method='pdf'] - The distribution method to sample.
 *   One of 'pdf' or 'cdf'.
 * @param {Array<number>} [params.extent] - The [min, max] extent over which
 *   to sample the distribution. Required in most cases, but can be omitted
 *   if the distribution (e.g., 'kde') supports a 'data' method that returns
 *   numerical sample points from which the extent can be deduced.
 * @param {number} [params.minsteps=25] - The minimum number of curve samples
 *   for plotting the density.
 * @param {number} [params.maxsteps=200] - The maximum number of curve samples
 *   for plotting the density.
 * @param {number} [params.steps] - The exact number of curve samples for
 *   plotting the density. If specified, overrides both minsteps and maxsteps
 *   to set an exact number of uniform samples. Useful in conjunction with a
 *   fixed extent to ensure consistent sample points for stacked densities.
 */
function Density(params) {
  Transform.apply(this, [null, params]);
}
// Parameter schemas of the basic distributions, keyed by their `function` name.
const distributions = [
  {
    key: { function: 'normal' },
    params: [
      { name: 'mean', type: 'number', default: 0 },
      { name: 'stdev', type: 'number', default: 1 }
    ]
  },
  {
    key: { function: 'lognormal' },
    params: [
      { name: 'mean', type: 'number', default: 0 },
      { name: 'stdev', type: 'number', default: 1 }
    ]
  },
  {
    key: { function: 'uniform' },
    params: [
      { name: 'min', type: 'number', default: 0 },
      { name: 'max', type: 'number', default: 1 }
    ]
  },
  {
    key: { function: 'kde' },
    params: [
      { name: 'field', type: 'field', required: true },
      { name: 'from', type: 'data' },
      { name: 'bandwidth', type: 'number', default: 0 }
    ]
  }
];

// Parameter schema of a weighted mixture of the basic distributions.
const mixture = {
  key: { function: 'mixture' },
  params: [
    { name: 'distributions', type: 'param', array: true, params: distributions },
    { name: 'weights', type: 'number', array: true }
  ]
};
// Declarative description of the Density transform and its parameters.
Density.Definition = {
  type: 'Density',
  metadata: { generates: true },
  params: [
    { name: 'extent', type: 'number', array: true, length: 2 },
    { name: 'steps', type: 'number' },
    { name: 'minsteps', type: 'number', default: 25 },
    { name: 'maxsteps', type: 'number', default: 200 },
    { name: 'method', type: 'string', default: 'pdf', values: ['pdf', 'cdf'] },
    { name: 'distribution', type: 'param', params: distributions.concat(mixture) },
    { name: 'as', type: 'string', array: true, default: ['value', 'density'] }
  ]
};
inherits(Density, Transform, {
  /**
   * Samples the configured distribution and emits one tuple per sample
   * point, with the sample position in as[0] and the sampled value in as[1].
   * Sampling is skipped when a value exists and neither the pulse nor the
   * parameters changed; otherwise the previous tuples are removed and
   * replaced by the new samples.
   */
  transform(_, pulse) {
    const out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS);
    if (this.value && !pulse.changed() && !_.modified()) {
      return out;
    }

    const dist = parse(_.distribution, source(pulse));
    const minsteps = _.steps || _.minsteps || 25;
    const maxsteps = _.steps || _.maxsteps || 200;

    const methodName = _.method || 'pdf';
    if (methodName !== 'pdf' && methodName !== 'cdf') {
      error('Invalid density method: ' + methodName);
    }
    if (!_.extent && !dist.data) {
      error('Missing density extent parameter.');
    }
    const sampler = dist[methodName];

    const as = _.as || ['value', 'density'];
    // without an explicit extent, fall back to the extent of the sample data
    const domain = _.extent || extent(dist.data());
    const points = sampleCurve(sampler, domain, minsteps, maxsteps);
    const values = points.map(point => {
      const tuple = {};
      tuple[as[0]] = point[0];
      tuple[as[1]] = point[1];
      return ingest(tuple);
    });

    if (this.value) out.rem = this.value;
    out.source = values;
    out.add = values;
    this.value = values;
    return out;
  }
});
/**
 * Creates a thunk that materializes and returns a pulse's source tuples.
 * Nothing is materialized until the returned function is called.
 * @param {object} pulse - The pulse to read from.
 * @return {function(): Array<object>} Function returning the source array.
 */
function source(pulse) {
  return function () {
    const materialized = pulse.materialize(pulse.SOURCE);
    return materialized.source;
  };
}
/**
 * Resolves output names for a list of field accessors: the alias at the
 * same position when it is truthy, else the accessor's field name.
 * @param {Array<function>} [fields] - The field accessors.
 * @param {Array<string>} as - Aliases, matched to fields by position.
 * @return {Array<string>|null} The names, or null if `fields` is falsy.
 */
function fieldNames(fields, as) {
  if (!fields) {
    return null;
  }
  return fields.map((accessorFn, position) => {
    const alias = as[position];
    return alias ? alias : accessorName(accessorFn);
  });
}
/**
 * Partitions data into groups of field values.
 * @param {Array<object>} data - The input tuples.
 * @param {Array<function(object): *>} [groupby] - Group-by accessors. When
 *   null or undefined, all values form a single group.
 * @param {function(object): *} field - Accessor for the value to collect.
 * @return {Array<Array<*>>} The groups, in order of first appearance. When
 *   grouping, each group array carries a `dims` property holding its
 *   group-by values.
 */
function partition$1(data, groupby, field) {
  if (groupby == null) {
    return [data.map(field)];
  }

  const groups = [];
  const lookup = {};
  for (const t of data) {
    // the array of group-by values doubles as the (stringified) lookup key
    const dims = groupby.map(f => f(t));
    let group = lookup[dims];
    if (!group) {
      group = [];
      lookup[dims] = group;
      group.dims = dims;
      groups.push(group);
    }
    group.push(field(t));
  }
  return groups;
}
// Default name of the output field written by DotBin.
const Output = 'bin';

/**
 * Dot density binning for dot plot construction.
 * Based on Leland Wilkinson, Dot Plots, The American Statistician, 1999.
 * https://www.cs.uic.edu/~wilkinson/Publications/dotplots.pdf
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(object): *} params.field - The value field to bin.
 * @param {Array<function(object): *>} [params.groupby] - An array of
 *   accessors to groupby.
 * @param {number} [params.step] - The step size (bin width) within which
 *   dots should be stacked. Defaults to 1/30 of the extent of the data *field*.
 * @param {boolean} [params.smooth=false] - A boolean flag indicating if dot
 *   density stacks should be smoothed to reduce variance.
 */
function DotBin(params) {
  Transform.apply(this, [null, params]);
}
DotBin.Definition = {
'type': 'DotBin',
'metadata': {
'modifies': true
},
'params': [{
'name': 'field',
'type': 'field',
'required': true
}, {
'name': 'groupby',
'type': 'field',
'array': true
}, {
'name': 'step',
'type': 'number'
}, {
'name': 'smooth',
'type': 'boolean',
'default': false
}, {
'name': 'as',
'type': 'string',
'default': Output
}]
};
/**
 * Computes the default dot bin step size: 1/30 of the span of the
 * field's extent over the data.
 * @param {Array<object>} data - The data tuples.
 * @param {function(object): *} field - Accessor for the binned value.
 * @return {number} The automatically derived step size.
 */
const autostep = (data, field) => {
  const range = extent(data, field);
  return span(range) / 30;
};
inherits(DotBin, Transform, {
  /**
   * Computes dot plot bin positions for every group of tuples and writes
   * them onto the tuples under the output field name. The operator value
   * is set to an object with the observed start, stop and step of the bins.
   * @param {object} _ - The operator parameters.
   * @param {Pulse} pulse - The input pulse.
   * @return {Pulse} The input pulse (unchanged on early exit, otherwise
   *   reflowed and flagged as modifying the output field).
   */
  transform(_, pulse) {
    // nothing to recompute if we already have a result and no inputs changed
    if (this.value && !_.modified() && !pulse.changed()) {
      return pulse;
    }

    const tuples = pulse.materialize(pulse.SOURCE).source;
    const groups = partition$1(pulse.source, _.groupby, identity);
    const smooth = _.smooth || false;
    const field = _.field;
    const step = _.step || autostep(tuples, field);
    const byValue = stableCompare((a, b) => field(a) - field(b));
    const as = _.as || Output;

    // running bounds over all bin positions, across all groups
    let lo = Infinity;
    let hi = -Infinity;

    groups.forEach(group => {
      // dotbin is handed the group's tuples ordered by field value
      group.sort(byValue);
      let slot = 0;
      for (const position of dotbin(group, step, smooth, field)) {
        if (position < lo) lo = position;
        if (position > hi) hi = position;
        group[slot][as] = position;
        slot += 1;
      }
    });

    this.value = {
      start: lo,
      stop: hi,
      step: step
    };
    return pulse.reflow(true).modifies(as);
  }
});
/**
* Wraps an expression function with access to external parameters.
* @constructor
* @param {object} params - The parameters for this operator.
* @param {function} params.expr - The expression function. The
* function should accept both a datum and a parameter object.
* This operator's value will be a new function that wraps the
* expression function with access to this operator's parameters.
*/
function Expression(params) {
// initial value is null; update$4 computes the wrapped accessor on evaluation
Operator.call(this, null, update$4, params);
// NOTE(review): presumably marks this operator as always modified so that
// dependents re-evaluate on every run; confirm against Operator#modified.
this.modified(true);
}
// Expression extends Operator without adding prototype members.
inherits(Expression, Operator);
/**
 * Update function for Expression operators.
 * Reuses the current value unless there is none yet or the 'expr'
 * parameter was modified; otherwise builds a new accessor that invokes the
 * expression with both the datum and the parameter object, carrying over
 * the expression's dependent fields and name.
 * @param {object} _ - The operator parameters.
 * @return {function} The wrapped expression accessor.
 */
function update$4(_) {
  const expr = _.expr;
  if (this.value && !_.modified('expr')) {
    return this.value;
  }
  const wrapped = datum => expr(datum, _);
  return accessor(wrapped, accessorFields(expr), accessorName(expr));
}
/**
* Computes extents (min/max) for a data field.
* The operator value is a two-element [min, max] array, which starts out
* as [undefined, undefined].
* @constructor
* @param {object} params - The parameters for this operator.
* @param {function(object): *} params.field - The field over which to compute extents.
*/
function Extent(params) {
Transform.call(this, [undefined, undefined], params);
}
// Parameter schema for the Extent transform: a single required field.
Extent.Definition = {
'type': 'Extent',
'metadata': {},
'params': [{
'name': 'field',
'type': 'field',
'required': true
}]
};
inherits(Extent, Transform, {
  /**
   * Updates the [min, max] extent of the field and stores it as the
   * operator value. If the pulse changed, the field's dependencies were
   * modified, or the field parameter itself changed, the extent is
   * recomputed over all source tuples; otherwise only added tuples are
   * folded into the existing extent. A non-finite result triggers a
   * dataflow warning and is stored as [undefined, undefined].
   * @param {object} _ - The operator parameters.
   * @param {Pulse} pulse - The input pulse.
   */
  transform(_, pulse) {
    const extent = this.value,
      field = _.field,
      mod = pulse.changed() || pulse.modified(field.fields) || _.modified('field');
    let min = extent[0],
      max = extent[1];

    // start from scratch on a full recompute or when no extent exists yet
    if (mod || min == null) {
      min = +Infinity;
      max = -Infinity;
    }

    pulse.visit(mod ? pulse.SOURCE : pulse.ADD, t => {
      const v = toNumber(field(t));
      if (v != null) {
        // NaNs will fail all comparisons!
        if (v < min) min = v;
        if (v > max) max = v;
      }
    });

    if (!Number.isFinite(min) || !Number.isFinite(max)) {
      // Only mention the field when the accessor actually has a name;
      // otherwise a missing name would be interpolated as the literal
      // text "undefined" or "null" in the warning message.
      const name = accessorName(field);
      const label = name ? ` for field "${name}"` : '';
      pulse.dataflow.warn(`Infinite extent${label}: [${min}, ${max}]`);
      min = max = undefined;
    }
    this.value = [min, max];
  }
});
/**
* Provides a bridge between a parent transform and a target subflow that
* consumes only a subset of the tuples that pass through the parent.
* @constructor
* @param {Pulse} pulse - A pulse to use as the value of this operator.
* @param {Transform} parent - The parent transform (typically a Facet instance).
*/
function Subflow(pulse, parent) {
Operator.call(this, pulse);
this.parent = parent;
// running tuple count: incremented by add(), decremented by rem()
this.count = 0;
}
inherits(Subflow, Operator, {
  /**
   * Connects this subflow to a downstream target transform, so that
   * pulses from the subflow are routed to it.
   * @param {Transform} target - A transform that receives the subflow of tuples.
   * @return {Subflow} This subflow, which also becomes the target's source.
   */
  connect(target) {
    this.detachSubflow = target.detachSubflow;
    this.targets().add(target);
    target.source = this;
    return this;
  },

  /**
   * Registers a tuple as added within the subflow pulse.
   * @param {Tuple} t - The tuple being added.
   */
  add(t) {
    this.count = this.count + 1;
    const pulse = this.value;
    pulse.add.push(t);
  },

  /**
   * Registers a tuple as removed within the subflow pulse.
   * @param {Tuple} t - The tuple being removed.
   */
  rem(t) {
    this.count = this.count - 1;
    const pulse = this.value;
    pulse.rem.push(t);
  },

  /**
   * Registers a tuple as modified within the subflow pulse.
   * @param {Tuple} t - The tuple being modified.
   */
  mod(t) {
    const pulse = this.value;
    pulse.mod.push(t);
  },

  /**
   * Resets the contained pulse from the given pulse, without source data.
   * @param {Pulse} pulse - The pulse to copy from.
   * @see Pulse.init
   */
  init(pulse) {
    const own = this.value;
    own.init(pulse, pulse.NO_SOURCE);
  },

  /**
   * Evaluates this operator. Overrides the default behavior: the
   * contained pulse value is handed back as-is.
   * @return {Pulse}
   */
  evaluate() {
    // assert: this.value.stamp === pulse.stamp
    const own = this.value;
    return own;
  }
});
/**
 * Splits a dataflow into a collection of subflows, one per key value.
 * @constructor
 * @param {object} params - The parameters for this operator.
 * @param {function(Dataflow, string): Operator} params.subflow - A function
 *   that generates a subflow of operators and returns its root operator.
 * @param {function(object): *} params.key - The key field to facet by.
 */
function Facet(params) {
  Transform.call(this, {}, params);

  // cache of previously calculated key values
  this._keys = fastmap();

  // The targets array holds the active subflows and doubles as the
  // listener list; iterating only the active prefix limits propagation
  // to the subflows that were actually updated.
  const targets = [];
  this._targets = targets;
  targets.active = 0;
  targets.forEach = visitor => {
    const limit = targets.active;
    let i = 0;
    while (i < limit) {
      visitor(targets[i], i, targets);
      i += 1;
    }
  };
}
inherits(Facet, Transform, {
/**
* Marks a subflow as active for the current run by appending it to the
* active prefix of the targets array.
* @param {Subflow} flow - The subflow to activate.
*/
activate(flow) {
this._targets[this._targets.active++] = flow;
},
/**
* Looks up, or lazily creates, the subflow for a key value.
* A newly created subflow receives a fork of the pulse without source
* data, is added to the dataflow, connected to the operator produced by
* the flow generator, and activated. An existing subflow whose pulse stamp
* is older than the current pulse is re-initialized and activated.
* @param {*} key - The facet key value.
* @param {function} flow - The subflow generator function.
* @param {Pulse} pulse - The current pulse.
* @param {*} [parent] - Optional parent value passed to the generator.
* @return {Subflow} The subflow for the key.
*/
// parent argument provided by PreFacet subclass
subflow(key, flow, pulse, parent) {
const flows = this.value;
let sf = hasOwnProperty(flows, key) && flows[key],
df,
p;
if (!sf) {
// prefer the explicit parent; otherwise use the tuple of the matching group entry
p = parent || (p = this._group[key]) && p.tuple;
df = pulse.dataflow;
sf = new Subflow(pulse.fork(pulse.NO_SOURCE), this);
df.add(sf).connect(flow(df, key, p));
flows[key] = sf;
this.activate(sf);
} else if (sf.value.stamp < pulse.stamp) {
// first touch of this subflow in the current run: reset its pulse
sf.init(pulse);
this.activate(sf);
}
return sf;
},
/**
* Removes subflows whose tuple count has dropped to zero, invoking their
* detachSubflow callback when one is present, and then rebuilds the list
* of active targets if anything was removed.
*/
clean() {
const flows = this.value;
let detached = 0;
for (const key in flows) {
if (flows[key].count === 0) {
const detach = flows[key].detachSubflow;
if (detach) detach();
delete flows[key];
++detached;
}
}
// remove inactive targets from the active targets array
if (detached) {
const active = this._targets.filter(sf => sf && sf.count > 0);
this.initTargets(active);
}
},
/**
* Resets the targets array. The given subflows (if any) are copied to the
* front, any following non-null entries are nulled out, and the active
* count is set to the number of subflows copied.
* @param {Array<Subflow>} [act] - The subflows to keep active.
*/
initTargets(act) {
const a = this._targets,
n = a.length,
m = act ? act.length : 0;
let i = 0;
for (; i < m; ++i) {
a[i] = act[i];
}
for (; i < n && a[i] != null; ++i) {
a[i] = null; // ensure old flows can be garbage collected
}
a.active = m;
},
/**
* Routes the tuples of the input pulse to their subflows by key value.
* The key of each tuple is cached by tuple id so that removed and
* modified tuples can be sent to the subflow they were last assigned to.
* @param {object} _ - The operator parameters.
* @param {Pulse} pulse - The input pulse.
* @return {Pulse} The input pulse.
*/
transform(_, pulse) {
const df = pulse.dataflow,
key = _.key,
flow = _.subflow,
cache = this._keys,
rekey = _.modified('key'),
subflow = key => this.subflow(key, flow, pulse);
this._group = _.group || {};
this.initTargets(); // reset list of active subflows
// removed tuples: route to the cached subflow, ignoring tuples never seen
pulse.visit(pulse.REM, t => {
const id = tupleid(t),
k = cache.get(id);
if (k !== undefined) {
cache.delete(id);
subflow(k).rem(t);
}
});
// added tuples: compute and cache the key, then route to its subflow
pulse.visit(pulse.ADD, t => {
const k = key(t);
cache.set(tupleid(t), k);
subflow(k).add(t);
});
if (rekey || pulse.modified(key.fields)) {
// keys may have changed: a tuple with a new key moves between subflows
pulse.visit(pulse.MOD, t => {
const id = tupleid(t),
k0 = cache.get(id),
k1 = key(t);
if (k0 === k1) {
subflow(k1).mod(t);
} else {
cache.set(id, k1);
subflow(k0).rem(t);
subflow(k1).add(t);
}
});
} else if (pulse.changed(pulse.MOD)) {
// keys are unchanged: forward modifications using the cached key
pulse.visit(pulse.MOD, t => {
subflow(cache.get(tupleid(t))).mod(t);
});
}
if (rekey) {
// the key accessor itself changed: re-check the reflow tuples as well
pulse.visit(pulse.REFLOW, t => {
const id = tupleid(t),
k0 = cache.get(id),
k1 = key(t);
if (k0 !== k1) {
cache.set(id, k1);
subflow(k0).rem(t);
subflow(k1).add(t);
}
});
}
// schedule cleanup after the run: empty subflows and stale cache entries
if (pulse.clean()) {
df.runAfter(() => {
this.clean();
cache.clean();
});
} else if (cache.empty > df.cleanThreshold) {
df.runAfter(cache.clean);
}
return pulse;
}
});
/**
* Generates one or more field accessor functions.
* If the 'name' parameter is an array, an array of field accessors
* will be created and the 'as' parameter will be ignored.
* @constructor
* @param {object} params - The parameters for this operator.
* @param {string|Array<string>} params.name - The field name(s) to access.
* @param {string} params.as - The accessor function name.
*/
function Field(params) {
// initial value is null; update$3 computes the accessor(s) on evaluation
Operator.call(this, null, update$3, params);
}
// Field extends Operator without adding prototype members.
inherits(Field, Operator);
/**
 * Update function for Field operators.
 * Reuses the current value unless there is none yet or a parameter was
 * modified. An array-valued name yields an array of accessors; otherwise
 * a single accessor is produced, named according to the 'as' parameter.
 * @param {object} _ - The operator parameters.
 * @return {function|Array<function>} The field accessor(s).
 */
function update$3(_) {
  if (this.value && !_.modified()) {
    return this.value;
  }
  if (isArray(_.name)) {
    return array(_.name).map(f => field(f));
  }
  return field(_.name, _.as);
}
/**
* Filters data tuples according to a predicate function.
* The operator value is a fastmap that records the ids of tuples which
* currently fail the predicate.
* @constructor
* @param {object} params - The parameters for this operator.
* @param {function(object): *} params.expr - The predicate expression function
* that determines a tuple's filter status. Truthy values pass the filter.
*/
function Filter(params) {
Transform.call(this, fastmap(), params);
}
// Parameter schema for the Filter transform: a single required expression.
// NOTE(review): the 'changes' metadata flag presumably signals that the
// transform alters which tuples flow downstream; confirm against the
// consumer of Definition objects.
Filter.Definition = {
'type': 'Filter',
'metadata': {
'changes': true
},
'params': [{
'name': 'expr',
'type': 'expr',
'required': true
}]
};
inherits(Filter, Transform, {
  /**
   * Applies the predicate to the tuples of the input pulse and produces a
   * forked pulse containing only the changes visible after filtering.
   * Ids of tuples that fail the predicate are kept in the operator value,
   * so that later removals and status flips can be reported correctly.
   * @param {object} _ - The operator parameters.
   * @param {Pulse} pulse - The input pulse.
   * @return {Pulse} The filtered output pulse.
   */
  transform(_, pulse) {
    const dataflow = pulse.dataflow;
    // ids of tuples that are currently filtered out
    const rejected = this.value;
    const output = pulse.fork();
    const { add, rem, mod } = output;
    const predicate = _.expr;
    let reportMods = true;

    // removals only propagate for tuples that had passed the filter
    pulse.visit(pulse.REM, tuple => {
      const id = tupleid(tuple);
      if (rejected.has(id)) {
        rejected.delete(id);
      } else {
        rem.push(tuple);
      }
    });

    // additions either pass through or are recorded as rejected
    pulse.visit(pulse.ADD, tuple => {
      if (!predicate(tuple, _)) {
        rejected.set(tupleid(tuple), 1);
        return;
      }
      add.push(tuple);
    });

    // re-test a tuple and emit add/rem/mod depending on its status flip
    const revisit = tuple => {
      const id = tupleid(tuple);
      const passes = predicate(tuple, _);
      const wasRejected = rejected.get(id);
      if (passes) {
        if (wasRejected) {
          // newly passing: surfaces downstream as an addition
          rejected.delete(id);
          add.push(tuple);
        } else if (reportMods) {
          mod.push(tuple);
        }
      } else if (!wasRejected) {
        // newly failing: surfaces downstream as a removal
        rejected.set(id, 1);
        rem.push(tuple);
      }
    };

    pulse.visit(pulse.MOD, revisit);

    if (_.modified()) {
      // parameters changed: re-test reflow tuples, but do not report
      // still-passing ones as modified
      reportMods = false;
      pulse.visit(pulse.REFLOW, revisit);
    }

    if (rejected.empty > dataflow.cleanThreshold) {
      dataflow.runAfter(rejected.clean);
    }
    return output;
  }
});
/**
* Flattens array-typed field values into new data objects.
* If multiple fields are specified, they are treated as parallel arrays,
* with output values included for each matching index (or null if missing).
* @constructor
* @param {object} params - The parameters for this operator.
* @param {Array<function(object): *>} params.fields - An array of field
* accessors for the tuple fields that should be flattened.
* @param {string} [params.index] - Optional output field name for index
* value. If unspecified, no index field is included in the output.
* @param {Array<string>} [params.as] - Output field names for flattened
* array fields. Any unspecified fields will use the field name provided
* by the fields accessors.
*/
function Flatten(params) {
// the operator value starts as an empty array of generated tuples
Transform.call(this, [], params);
}
// Parameter schema for the Flatten transform.
// NOTE(review): the 'generates' metadata flag presumably signals that the
// transform emits newly derived tuples; confirm against the consumer of
// Definition objects.
Flatten.Definition = {
'type': 'Flatten',
'metadata': {
'generates': true
},
'params': [{
'name': 'fields',
'type': 'field',
'array': true,
'required': true
}, {
'name': 'index',
'type': 'string'
}, {
'name': 'as',
'type': 'string',
'array': true
}]
};
inherits(Flatten, Transform, {
transform(_, pulse) {
const out = pulse.fork(pulse.NO_SOURCE),
fields = _.fields,
as = fieldNames(fields, _.as || []),
index = _.index || null,
m = as.length;
// remove any previous results
out.rem = this.value;
// generate flattened tuples
pulse.visit(pulse.SOURCE, t => {
const arrays = fields.map(f => f(t)),
maxlen = arrays.reduce((l, a) => Math.max(l, a.length), 0);
let i = 0,
j,
d,
v;
for (; i < maxlen; ++i) {
d = derive(t);
for (j = 0; j < m; ++j) {
d[as[j]] = (v = arrays[j][i]) == null ? null : v;
}
if (index) {
d[index] = i;
}
out.add.push(d);
}
});
this.value = out.source = out.add;
if