UNPKG

vega-transforms

Version:

Data processing transforms for Vega dataflows.

1,875 lines (1,824 loc) 102 kB
import { extend, identity, field, hasOwnProperty, extentIndex, inherits, array, accessorName, error, accessorFields, accessor, toNumber, merge, compare, truthy, extent, span, fastmap, isArray, key, ascending, peek, zero, constant } from 'vega-util'; import { tupleid, Transform, replace, ingest, stableCompare, Operator, derive, rederive } from 'vega-dataflow'; import { quartiles, bootstrapCI, bin, randomUniform, randomLogNormal, randomNormal, randomMixture, randomKDE, sampleCurve, dotbin, quantiles, random } from 'vega-statistics'; import { max, min, mean, median, range, bisector } from 'd3-array'; import { TIME_UNITS, timeBin, timeUnits, utcFloor, timeFloor, utcInterval, timeInterval } from 'vega-time'; function multikey(f) { return x => { const n = f.length; let i = 1, k = String(f[0](x)); for (; i < n; ++i) { k += '|' + f[i](x); } return k; }; } function groupkey(fields) { return !fields || !fields.length ? function () { return ''; } : fields.length === 1 ? fields[0] : multikey(fields); } function measureName(op, field, as) { return as || op + (!field ? '' : '_' + field); } const noop = () => {}; const base_op = { init: noop, add: noop, rem: noop, idx: 0 }; const AggregateOps = { values: { init: m => m.cell.store = true, value: m => m.cell.data.values(), idx: -1 }, count: { value: m => m.cell.num }, __count__: { value: m => m.missing + m.valid }, missing: { value: m => m.missing }, valid: { value: m => m.valid }, sum: { init: m => m.sum = 0, value: m => m.valid ? m.sum : undefined, add: (m, v) => m.sum += +v, rem: (m, v) => m.sum -= v }, product: { init: m => m.product = 1, value: m => m.valid ? m.product : undefined, add: (m, v) => m.product *= v, rem: (m, v) => m.product /= v }, mean: { init: m => m.mean = 0, value: m => m.valid ? m.mean : undefined, add: (m, v) => (m.mean_d = v - m.mean, m.mean += m.mean_d / m.valid), rem: (m, v) => (m.mean_d = v - m.mean, m.mean -= m.valid ? m.mean_d / m.valid : m.mean) }, average: { value: m => m.valid ? 
m.mean : undefined, req: ['mean'], idx: 1 }, variance: { init: m => m.dev = 0, value: m => m.valid > 1 ? m.dev / (m.valid - 1) : undefined, add: (m, v) => m.dev += m.mean_d * (v - m.mean), rem: (m, v) => m.dev -= m.mean_d * (v - m.mean), req: ['mean'], idx: 1 }, variancep: { value: m => m.valid > 1 ? m.dev / m.valid : undefined, req: ['variance'], idx: 2 }, stdev: { value: m => m.valid > 1 ? Math.sqrt(m.dev / (m.valid - 1)) : undefined, req: ['variance'], idx: 2 }, stdevp: { value: m => m.valid > 1 ? Math.sqrt(m.dev / m.valid) : undefined, req: ['variance'], idx: 2 }, stderr: { value: m => m.valid > 1 ? Math.sqrt(m.dev / (m.valid * (m.valid - 1))) : undefined, req: ['variance'], idx: 2 }, distinct: { value: m => m.cell.data.distinct(m.get), req: ['values'], idx: 3 }, ci0: { value: m => m.cell.data.ci0(m.get), req: ['values'], idx: 3 }, ci1: { value: m => m.cell.data.ci1(m.get), req: ['values'], idx: 3 }, median: { value: m => m.cell.data.q2(m.get), req: ['values'], idx: 3 }, q1: { value: m => m.cell.data.q1(m.get), req: ['values'], idx: 3 }, q3: { value: m => m.cell.data.q3(m.get), req: ['values'], idx: 3 }, min: { init: m => m.min = undefined, value: m => m.min = Number.isNaN(m.min) ? m.cell.data.min(m.get) : m.min, add: (m, v) => { if (v < m.min || m.min === undefined) m.min = v; }, rem: (m, v) => { if (v <= m.min) m.min = NaN; }, req: ['values'], idx: 4 }, max: { init: m => m.max = undefined, value: m => m.max = Number.isNaN(m.max) ? 
m.cell.data.max(m.get) : m.max, add: (m, v) => { if (v > m.max || m.max === undefined) m.max = v; }, rem: (m, v) => { if (v >= m.max) m.max = NaN; }, req: ['values'], idx: 4 }, argmin: { init: m => m.argmin = undefined, value: m => m.argmin || m.cell.data.argmin(m.get), add: (m, v, t) => { if (v < m.min) m.argmin = t; }, rem: (m, v) => { if (v <= m.min) m.argmin = undefined; }, req: ['min', 'values'], idx: 3 }, argmax: { init: m => m.argmax = undefined, value: m => m.argmax || m.cell.data.argmax(m.get), add: (m, v, t) => { if (v > m.max) m.argmax = t; }, rem: (m, v) => { if (v >= m.max) m.argmax = undefined; }, req: ['max', 'values'], idx: 3 }, exponential: { init: (m, r) => { m.exp = 0; m.exp_r = r; }, value: m => m.valid ? m.exp * (1 - m.exp_r) / (1 - m.exp_r ** m.valid) : undefined, add: (m, v) => m.exp = m.exp_r * m.exp + v, rem: (m, v) => m.exp = (m.exp - v / m.exp_r ** (m.valid - 1)) / m.exp_r }, exponentialb: { value: m => m.valid ? m.exp * (1 - m.exp_r) : undefined, req: ['exponential'], idx: 1 } }; const ValidAggregateOps = Object.keys(AggregateOps).filter(d => d !== '__count__'); function measure(key, value) { return (out, aggregate_param) => extend({ name: key, aggregate_param: aggregate_param, out: out || key }, base_op, value); } [...ValidAggregateOps, '__count__'].forEach(key => { AggregateOps[key] = measure(key, AggregateOps[key]); }); function createMeasure(op, param, name) { return AggregateOps[op](name, param); } function compareIndex(a, b) { return a.idx - b.idx; } function resolve(agg) { const map = {}; agg.forEach(a => map[a.name] = a); const getreqs = a => { if (!a.req) return; a.req.forEach(key => { if (!map[key]) getreqs(map[key] = AggregateOps[key]()); }); }; agg.forEach(getreqs); return Object.values(map).sort(compareIndex); } function init() { this.valid = 0; this.missing = 0; this._ops.forEach(op => op.aggregate_param == null ? 
op.init(this) : op.init(this, op.aggregate_param)); } function add(v, t) { if (v == null || v === '') { ++this.missing; return; } if (v !== v) return; ++this.valid; this._ops.forEach(op => op.add(this, v, t)); } function rem(v, t) { if (v == null || v === '') { --this.missing; return; } if (v !== v) return; --this.valid; this._ops.forEach(op => op.rem(this, v, t)); } function set(t) { this._out.forEach(op => t[op.out] = op.value(this)); return t; } function compileMeasures(agg, field) { const get = field || identity, ops = resolve(agg), out = agg.slice().sort(compareIndex); function ctr(cell) { this._ops = ops; this._out = out; this.cell = cell; this.init(); } ctr.prototype.init = init; ctr.prototype.add = add; ctr.prototype.rem = rem; ctr.prototype.set = set; ctr.prototype.get = get; ctr.fields = agg.map(op => op.out); return ctr; } function TupleStore(key) { this._key = key ? field(key) : tupleid; this.reset(); } const prototype$1 = TupleStore.prototype; prototype$1.reset = function () { this._add = []; this._rem = []; this._ext = null; this._get = null; this._q = null; }; prototype$1.add = function (v) { this._add.push(v); }; prototype$1.rem = function (v) { this._rem.push(v); }; prototype$1.values = function () { this._get = null; if (this._rem.length === 0) return this._add; const a = this._add, r = this._rem, k = this._key, n = a.length, m = r.length, x = Array(n - m), map = {}; let i, j, v; // use unique key field to clear removed values for (i = 0; i < m; ++i) { map[k(r[i])] = 1; } for (i = 0, j = 0; i < n; ++i) { if (map[k(v = a[i])]) { map[k(v)] = 0; } else { x[j++] = v; } } this._rem = []; return this._add = x; }; // memoizing statistics methods prototype$1.distinct = function (get) { const v = this.values(), map = {}; let n = v.length, count = 0, s; while (--n >= 0) { s = get(v[n]) + ''; if (!hasOwnProperty(map, s)) { map[s] = 1; ++count; } } return count; }; prototype$1.extent = function (get) { if (this._get !== get || !this._ext) { const v = 
this.values(), i = extentIndex(v, get); this._ext = [v[i[0]], v[i[1]]]; this._get = get; } return this._ext; }; prototype$1.argmin = function (get) { return this.extent(get)[0] || {}; }; prototype$1.argmax = function (get) { return this.extent(get)[1] || {}; }; prototype$1.min = function (get) { const m = this.extent(get)[0]; return m != null ? get(m) : undefined; }; prototype$1.max = function (get) { const m = this.extent(get)[1]; return m != null ? get(m) : undefined; }; prototype$1.quartile = function (get) { if (this._get !== get || !this._q) { this._q = quartiles(this.values(), get); this._get = get; } return this._q; }; prototype$1.q1 = function (get) { return this.quartile(get)[0]; }; prototype$1.q2 = function (get) { return this.quartile(get)[1]; }; prototype$1.q3 = function (get) { return this.quartile(get)[2]; }; prototype$1.ci = function (get) { if (this._get !== get || !this._ci) { this._ci = bootstrapCI(this.values(), 1000, 0.05, get); this._get = get; } return this._ci; }; prototype$1.ci0 = function (get) { return this.ci(get)[0]; }; prototype$1.ci1 = function (get) { return this.ci(get)[1]; }; /** * Group-by aggregation operator. * @constructor * @param {object} params - The parameters for this operator. * @param {Array<function(object): *>} [params.groupby] - An array of accessors to groupby. * @param {Array<function(object): *>} [params.fields] - An array of accessors to aggregate. * @param {Array<string>} [params.ops] - An array of strings indicating aggregation operations. * @param {Array<number>} [params.aggregate_params] - An optional array of parameters for aggregation operations. * @param {Array<string>} [params.as] - An array of output field names for aggregated values. * @param {boolean} [params.cross=false] - A flag indicating that the full * cross-product of groupby values should be generated, including empty cells. * If true, the drop parameter is ignored and empty cells are retained. 
* @param {boolean} [params.drop=true] - A flag indicating if empty cells should be removed.
 */
function Aggregate(params) {
  Transform.call(this, null, params);
  this._adds = []; // array of added output tuples
  this._mods = []; // array of modified output tuples
  this._alen = 0; // number of active added tuples
  this._mlen = 0; // number of active modified tuples
  this._drop = true; // should empty aggregation cells be removed
  this._cross = false; // produce full cross-product of group-by values
  this._dims = []; // group-by dimension accessors
  this._dnames = []; // group-by dimension names
  this._measures = []; // collection of aggregation monoids
  this._countOnly = false; // flag indicating only count aggregation
  this._counts = null; // collection of count fields
  this._prev = null; // previous aggregation cells
  this._inputs = null; // array of dependent input tuple field names
  this._outputs = null; // array of output tuple field names
}

// Parameter schema for the Aggregate transform.
Aggregate.Definition = {
  'type': 'Aggregate',
  'metadata': {
    'generates': true,
    'changes': true
  },
  'params': [{
    'name': 'groupby',
    'type': 'field',
    'array': true
  }, {
    'name': 'ops',
    'type': 'enum',
    'array': true,
    'values': ValidAggregateOps
  }, {
    'name': 'aggregate_params',
    'type': 'number',
    'null': true,
    'array': true
  }, {
    'name': 'fields',
    'type': 'field',
    'null': true,
    'array': true
  }, {
    'name': 'as',
    'type': 'string',
    'null': true,
    'array': true
  }, {
    'name': 'drop',
    'type': 'boolean',
    'default': true
  }, {
    'name': 'cross',
    'type': 'boolean',
    'default': false
  }, {
    'name': 'key',
    'type': 'field'
  }]
};
inherits(Aggregate, Transform, {
  /**
   * Processes a pulse. When cells already exist and either the parameters
   * or the input fields were modified, the current cells are moved to
   * _prev and all source tuples are re-added; otherwise the pulse's
   * removed and added tuples are applied to the existing cells.
   * @param {object} _ - The operator parameters.
   * @param {object} pulse - The input pulse.
   * @return {object} The output pulse, as populated by `changes`.
   */
  transform(_, pulse) {
    const aggr = this,
      out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS),
      mod = _.modified();
    aggr.stamp = out.stamp;
    if (aggr.value && (mod || pulse.modified(aggr._inputs, true))) {
      aggr._prev = aggr.value;
      // parameters changed: rebuild measures; otherwise just start with fresh cells
      aggr.value = mod ? aggr.init(_) : Object.create(null);
      pulse.visit(pulse.SOURCE, t => aggr.add(t));
    } else {
      aggr.value = aggr.value || aggr.init(_);
      pulse.visit(pulse.REM, t => aggr.rem(t));
      pulse.visit(pulse.ADD, t => aggr.add(t));
    }

    // Indicate output fields and return aggregate tuples.
    out.modifies(aggr._outputs);

    // Should empty cells be dropped?
    aggr._drop = _.drop !== false;

    // If domain cross-product requested, generate empty cells as needed
    // and ensure that empty cells are not dropped
    if (_.cross && aggr._dims.length > 1) {
      aggr._drop = false;
      aggr.cross();
    }
    if (pulse.clean() && aggr._drop) {
      out.clean(true).runAfter(() => this.clean());
    }
    return aggr.changes(out);
  },
  /**
   * Creates a cell for every combination of group-by values seen in the
   * previous and current cells that does not yet have one.
   */
  cross() {
    const aggr = this,
      curr = aggr.value,
      dims = aggr._dnames,
      vals = dims.map(() => ({})),
      n = dims.length;

    // collect all group-by domain values
    function collect(cells) {
      let key, i, t, v;
      for (key in cells) {
        t = cells[key].tuple;
        for (i = 0; i < n; ++i) {
          vals[i][v = t[dims[i]]] = v;
        }
      }
    }
    collect(aggr._prev);
    collect(curr);

    // iterate over key cross-product, create cells as needed
    function generate(base, tuple, index) {
      const name = dims[index],
        v = vals[index++];
      for (const k in v) {
        // keys are joined with '|', the separator multikey also uses
        const key = base ? base + '|' + k : k;
        tuple[name] = v[k];
        if (index < n) generate(key, tuple, index);
        else if (!curr[key]) aggr.cell(key, tuple);
      }
    }
    generate('', {}, 0);
  },
  /**
   * Configures dimensions, measures and input/output field lists from
   * the parameters.
   * @param {object} _ - The operator parameters.
   * @return {object} A new, empty map of aggregation cells.
   */
  init(_) {
    // initialize input and output fields
    const inputs = this._inputs = [],
      outputs = this._outputs = [],
      inputMap = {};
    // records each field an accessor depends on, once
    function inputVisit(get) {
      const fields = array(accessorFields(get)),
        n = fields.length;
      let i = 0,
        f;
      for (; i < n; ++i) {
        if (!inputMap[f = fields[i]]) {
          inputMap[f] = 1;
          inputs.push(f);
        }
      }
    }

    // initialize group-by dimensions
    this._dims = array(_.groupby);
    this._dnames = this._dims.map(d => {
      const dname = accessorName(d);
      inputVisit(d);
      outputs.push(dname);
      return dname;
    });
    this.cellkey = _.key ? _.key : groupkey(this._dims);

    // initialize aggregate measures
    this._countOnly = true;
    this._counts = [];
    this._measures = [];
    const fields = _.fields || [null],
      ops = _.ops || ['count'],
      aggregate_params = _.aggregate_params || [null],
      as = _.as || [],
      n = fields.length,
      map = {};
    let field, op, aggregate_param, m, mname, outname, i;
    if (n !== ops.length) {
      error('Unmatched number of fields and aggregate ops.');
    }
    for (i = 0; i < n; ++i) {
      field = fields[i];
      op = ops[i];
      // NOTE(review): any falsy parameter, including 0, becomes null here
      aggregate_param = aggregate_params[i] || null;
      if (field == null && op !== 'count') {
        error('Null aggregate field specified.');
      }
      mname = accessorName(field);
      outname = measureName(op, mname, as[i]);
      outputs.push(outname);
      // counts are read from cell.num, so they need no measure instance
      if (op === 'count') {
        this._counts.push(outname);
        continue;
      }
      // measures are grouped per input field name
      m = map[mname];
      if (!m) {
        inputVisit(field);
        m = map[mname] = [];
        m.field = field;
        this._measures.push(m);
      }
      if (op !== 'count') this._countOnly = false;
      m.push(createMeasure(op, aggregate_param, outname));
    }
    this._measures = this._measures.map(m => compileMeasures(m, m.field));
    return Object.create(null); // aggregation cells (this.value)
  },
  // -- Cell Management -----

  cellkey: groupkey(),
  /**
   * Returns the cell for a key, creating it if needed, and registers it
   * as added or modified the first time it is touched in the current stamp.
   * @param {string} key - The cell key.
   * @param {object} t - A tuple supplying the dimension values.
   * @return {object} The aggregation cell.
   */
  cell(key, t) {
    let cell = this.value[key];
    if (!cell) {
      cell = this.value[key] = this.newcell(key, t);
      this._adds[this._alen++] = cell;
    } else if (cell.num === 0 && this._drop && cell.stamp < this.stamp) {
      // an empty cell that is subject to dropping is reported as an add
      cell.stamp = this.stamp;
      this._adds[this._alen++] = cell;
    } else if (cell.stamp < this.stamp) {
      cell.stamp = this.stamp;
      this._mods[this._mlen++] = cell;
    }
    return cell;
  },
  /**
   * Creates a new cell and, unless only counts are requested, one measure
   * state per field.
   * @param {string} key - The cell key.
   * @param {object} t - A tuple supplying the dimension values.
   * @return {object} The new cell.
   */
  newcell(key, t) {
    const cell = {
      key: key,
      num: 0,
      agg: null,
      tuple: this.newtuple(t, this._prev && this._prev[key]),
      stamp: this.stamp,
      store: false
    };
    if (!this._countOnly) {
      const measures = this._measures,
        n = measures.length;
      cell.agg = Array(n);
      for (let i = 0; i < n; ++i) {
        cell.agg[i] = new measures[i](cell);
      }
    }
    // measure init hooks (e.g. the 'values' op) may have set cell.store
    if (cell.store) {
      cell.data = new TupleStore();
    }
    return cell;
  },
  /**
   * Builds the output tuple for a cell from the dimension values of t.
   * @param {object} t - A tuple supplying the dimension values.
   * @param {object} [p] - The previous cell for the same key; when given,
   *   its tuple is passed to `replace` instead of ingesting a new one.
   * @return {object} The output tuple.
   */
  newtuple(t, p) {
    const names = this._dnames,
      dims = this._dims,
      n = dims.length,
      x = {};
    for (let i = 0; i < n; ++i) {
      x[names[i]] = dims[i](t);
    }
    return p ? replace(p.tuple, x) : ingest(x);
  },
  /** Deletes all cells whose tuple count is zero. */
  clean() {
    const cells = this.value;
    for (const key in cells) {
      if (cells[key].num === 0) {
        delete cells[key];
      }
    }
  },
  // -- Process Tuples -----

  /** Adds a tuple to its cell and updates that cell's measures. */
  add(t) {
    const key = this.cellkey(t),
      cell = this.cell(key, t);
    cell.num += 1;
    if (this._countOnly) return;
    if (cell.store) cell.data.add(t);
    const agg = cell.agg;
    for (let i = 0, n = agg.length; i < n; ++i) {
      agg[i].add(agg[i].get(t), t);
    }
  },
  /** Removes a tuple from its cell and updates that cell's measures. */
  rem(t) {
    const key = this.cellkey(t),
      cell = this.cell(key, t);
    cell.num -= 1;
    if (this._countOnly) return;
    if (cell.store) cell.data.rem(t);
    const agg = cell.agg;
    for (let i = 0, n = agg.length; i < n; ++i) {
      agg[i].rem(agg[i].get(t), t);
    }
  },
  /**
   * Writes counts and measure values onto a cell's output tuple.
   * @param {object} cell - The aggregation cell.
   * @return {object} The updated output tuple.
   */
  celltuple(cell) {
    const tuple = cell.tuple,
      counts = this._counts;

    // consolidate stored values
    if (cell.store) {
      cell.data.values();
    }

    // update tuple properties
    for (let i = 0, n = counts.length; i < n; ++i) {
      tuple[counts[i]] = cell.num;
    }
    if (!this._countOnly) {
      const agg = cell.agg;
      for (let i = 0, n = agg.length; i < n; ++i) {
        agg[i].set(tuple);
      }
    }
    return tuple;
  },
  /**
   * Moves the pending added and modified cells into the output pulse and
   * emits removals for the cells held in _prev.
   * @param {object} out - The output pulse.
   * @return {object} The same output pulse.
   */
  changes(out) {
    const adds = this._adds,
      mods = this._mods,
      prev = this._prev,
      drop = this._drop,
      add = out.add,
      rem = out.rem,
      mod = out.mod;
    let cell, key, i, n;
    // previous cells are removed, except empty ones when dropping is enabled
    if (prev) for (key in prev) {
      cell = prev[key];
      if (!drop || cell.num) rem.push(cell.tuple);
    }
    for (i = 0, n = this._alen; i < n; ++i) {
      add.push(this.celltuple(adds[i]));
      adds[i] = null; // for garbage collection
    }
    for (i = 0, n = this._mlen; i < n; ++i) {
      cell = mods[i];
      (cell.num === 0 && drop ? rem : mod).push(this.celltuple(cell));
      mods[i] = null; // for garbage collection
    }
    this._alen = this._mlen = 0; // reset list of active cells
    this._prev = null;
    return out;
  }
});

// epsilon bias to offset floating point error (#1737)
const EPSILON$1 = 1e-14;

/**
 * Generates a binning function for discretizing data.
 * @constructor
 * @param {object} params - The parameters for this operator.
The * provided values should be valid options for the {@link bin} function. * @param {function(object): *} params.field - The data field to bin. */ function Bin(params) { Transform.call(this, null, params); } Bin.Definition = { 'type': 'Bin', 'metadata': { 'modifies': true }, 'params': [{ 'name': 'field', 'type': 'field', 'required': true }, { 'name': 'interval', 'type': 'boolean', 'default': true }, { 'name': 'anchor', 'type': 'number' }, { 'name': 'maxbins', 'type': 'number', 'default': 20 }, { 'name': 'base', 'type': 'number', 'default': 10 }, { 'name': 'divide', 'type': 'number', 'array': true, 'default': [5, 2] }, { 'name': 'extent', 'type': 'number', 'array': true, 'length': 2, 'required': true }, { 'name': 'span', 'type': 'number' }, { 'name': 'step', 'type': 'number' }, { 'name': 'steps', 'type': 'number', 'array': true }, { 'name': 'minstep', 'type': 'number', 'default': 0 }, { 'name': 'nice', 'type': 'boolean', 'default': true }, { 'name': 'name', 'type': 'string' }, { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['bin0', 'bin1'] }] }; inherits(Bin, Transform, { transform(_, pulse) { const band = _.interval !== false, bins = this._bins(_), start = bins.start, step = bins.step, as = _.as || ['bin0', 'bin1'], b0 = as[0], b1 = as[1]; let flag; if (_.modified()) { pulse = pulse.reflow(true); flag = pulse.SOURCE; } else { flag = pulse.modified(accessorFields(_.field)) ? pulse.ADD_MOD : pulse.ADD; } pulse.visit(flag, band ? t => { const v = bins(t); // minimum bin value (inclusive) t[b0] = v; // maximum bin value (exclusive) // use convoluted math for better floating point agreement // see https://github.com/vega/vega/issues/830 // infinite values propagate through this formula! #2227 t[b1] = v == null ? null : start + step * (1 + (v - start) / step); } : t => t[b0] = bins(t)); return pulse.modifies(band ? 
as : b0); }, _bins(_) { if (this.value && !_.modified()) { return this.value; } const field = _.field, bins = bin(_), step = bins.step; let start = bins.start, stop = start + Math.ceil((bins.stop - start) / step) * step, a, d; if ((a = _.anchor) != null) { d = a - (start + step * Math.floor((a - start) / step)); start += d; stop += d; } const f = function (t) { let v = toNumber(field(t)); return v == null ? null : v < start ? -Infinity : v > stop ? +Infinity : (v = Math.max(start, Math.min(v, stop - step)), start + step * Math.floor(EPSILON$1 + (v - start) / step)); }; f.start = start; f.stop = bins.stop; f.step = step; return this.value = accessor(f, accessorFields(field), _.name || 'bin_' + accessorName(field)); } }); function SortedList (idFunc, source, input) { const $ = idFunc; let data = source || [], add = input || [], rem = {}, cnt = 0; return { add: t => add.push(t), remove: t => rem[$(t)] = ++cnt, size: () => data.length, data: (compare, resort) => { if (cnt) { data = data.filter(t => !rem[$(t)]); rem = {}; cnt = 0; } if (resort && compare) { data.sort(compare); } if (add.length) { data = compare ? merge(compare, data, add.sort(compare)) : data.concat(add); add = []; } return data; } }; } /** * Collects all data tuples that pass through this operator. * @constructor * @param {object} params - The parameters for this operator. * @param {function(*,*): number} [params.sort] - An optional * comparator function for additionally sorting the collected tuples. 
*/ function Collect(params) { Transform.call(this, [], params); } Collect.Definition = { 'type': 'Collect', 'metadata': { 'source': true }, 'params': [{ 'name': 'sort', 'type': 'compare' }] }; inherits(Collect, Transform, { transform(_, pulse) { const out = pulse.fork(pulse.ALL), list = SortedList(tupleid, this.value, out.materialize(out.ADD).add), sort = _.sort, mod = pulse.changed() || sort && (_.modified('sort') || pulse.modified(sort.fields)); out.visit(out.REM, list.remove); this.modified(mod); this.value = out.source = list.data(stableCompare(sort), mod); // propagate tree root if defined if (pulse.source && pulse.source.root) { this.value.root = pulse.source.root; } return out; } }); /** * Generates a comparator function. * @constructor * @param {object} params - The parameters for this operator. * @param {Array<string|function>} params.fields - The fields to compare. * @param {Array<string>} [params.orders] - The sort orders. * Each entry should be one of "ascending" (default) or "descending". */ function Compare(params) { Operator.call(this, null, update$5, params); } inherits(Compare, Operator); function update$5(_) { return this.value && !_.modified() ? this.value : compare(_.fields, _.orders); } /** * Count regexp-defined pattern occurrences in a text field. * @constructor * @param {object} params - The parameters for this operator. * @param {function(object): *} params.field - An accessor for the text field. * @param {string} [params.pattern] - RegExp string defining the text pattern. * @param {string} [params.case] - One of 'lower', 'upper' or null (mixed) case. * @param {string} [params.stopwords] - RegExp string of words to ignore. 
*/ function CountPattern(params) { Transform.call(this, null, params); } CountPattern.Definition = { 'type': 'CountPattern', 'metadata': { 'generates': true, 'changes': true }, 'params': [{ 'name': 'field', 'type': 'field', 'required': true }, { 'name': 'case', 'type': 'enum', 'values': ['upper', 'lower', 'mixed'], 'default': 'mixed' }, { 'name': 'pattern', 'type': 'string', 'default': '[\\w"]+' }, { 'name': 'stopwords', 'type': 'string', 'default': '' }, { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['text', 'count'] }] }; function tokenize(text, tcase, match) { switch (tcase) { case 'upper': text = text.toUpperCase(); break; case 'lower': text = text.toLowerCase(); break; } return text.match(match); } inherits(CountPattern, Transform, { transform(_, pulse) { const process = update => tuple => { var tokens = tokenize(get(tuple), _.case, match) || [], t; for (var i = 0, n = tokens.length; i < n; ++i) { if (!stop.test(t = tokens[i])) update(t); } }; const init = this._parameterCheck(_, pulse), counts = this._counts, match = this._match, stop = this._stop, get = _.field, as = _.as || ['text', 'count'], add = process(t => counts[t] = 1 + (counts[t] || 0)), rem = process(t => counts[t] -= 1); if (init) { pulse.visit(pulse.SOURCE, add); } else { pulse.visit(pulse.ADD, add); pulse.visit(pulse.REM, rem); } return this._finish(pulse, as); // generate output tuples }, _parameterCheck(_, pulse) { let init = false; if (_.modified('stopwords') || !this._stop) { this._stop = new RegExp('^' + (_.stopwords || '') + '$', 'i'); init = true; } if (_.modified('pattern') || !this._match) { this._match = new RegExp(_.pattern || '[\\w\']+', 'g'); init = true; } if (_.modified('field') || pulse.modified(_.field.fields)) { init = true; } if (init) this._counts = {}; return init; }, _finish(pulse, as) { const counts = this._counts, tuples = this._tuples || (this._tuples = {}), text = as[0], count = as[1], out = pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS); let 
w, t, c; for (w in counts) { t = tuples[w]; c = counts[w] || 0; if (!t && c) { tuples[w] = t = ingest({}); t[text] = w; t[count] = c; out.add.push(t); } else if (c === 0) { if (t) out.rem.push(t); counts[w] = null; tuples[w] = null; } else if (t[count] !== c) { t[count] = c; out.mod.push(t); } } return out.modifies(as); } }); /** * Perform a cross-product of a tuple stream with itself. * @constructor * @param {object} params - The parameters for this operator. * @param {function(object):boolean} [params.filter] - An optional filter * function for selectively including tuples in the cross product. * @param {Array<string>} [params.as] - The names of the output fields. */ function Cross(params) { Transform.call(this, null, params); } Cross.Definition = { 'type': 'Cross', 'metadata': { 'generates': true }, 'params': [{ 'name': 'filter', 'type': 'expr' }, { 'name': 'as', 'type': 'string', 'array': true, 'length': 2, 'default': ['a', 'b'] }] }; inherits(Cross, Transform, { transform(_, pulse) { const out = pulse.fork(pulse.NO_SOURCE), as = _.as || ['a', 'b'], a = as[0], b = as[1], reset = !this.value || pulse.changed(pulse.ADD_REM) || _.modified('as') || _.modified('filter'); let data = this.value; if (reset) { if (data) out.rem = data; data = pulse.materialize(pulse.SOURCE).source; out.add = this.value = cross(data, a, b, _.filter || truthy); } else { out.mod = data; } out.source = this.value; return out.modifies(as); } }); function cross(input, a, b, filter) { var data = [], t = {}, n = input.length, i = 0, j, left; for (; i < n; ++i) { t[a] = left = input[i]; for (j = 0; j < n; ++j) { t[b] = input[j]; if (filter(t)) { data.push(ingest(t)); t = {}; t[a] = left; } } } return data; } const Distributions = { kde: randomKDE, mixture: randomMixture, normal: randomNormal, lognormal: randomLogNormal, uniform: randomUniform }; const DISTRIBUTIONS = 'distributions', FUNCTION = 'function', FIELD = 'field'; /** * Parse a parameter object for a probability distribution. 
* @param {object} def - The distribution parameter object. * @param {function():Array<object>} - A method for requesting * source data. Used for distributions (such as KDE) that * require sample data points. This method will only be * invoked if the 'from' parameter for a target data source * is not provided. Typically this method returns backing * source data for a Pulse object. * @return {object} - The output distribution object. */ function parse(def, data) { const func = def[FUNCTION]; if (!hasOwnProperty(Distributions, func)) { error('Unknown distribution function: ' + func); } const d = Distributions[func](); for (const name in def) { // if data field, extract values if (name === FIELD) { d.data((def.from || data()).map(def[name])); } // if distribution mixture, recurse to parse each definition else if (name === DISTRIBUTIONS) { d[name](def[name].map(_ => parse(_, data))); } // otherwise, simply set the parameter else if (typeof d[name] === FUNCTION) { d[name](def[name]); } } return d; } /** * Grid sample points for a probability density. Given a distribution and * a sampling extent, will generate points suitable for plotting either * PDF (probability density function) or CDF (cumulative distribution * function) curves. * @constructor * @param {object} params - The parameters for this operator. * @param {object} params.distribution - The probability distribution. This * is an object parameter dependent on the distribution type. * @param {string} [params.method='pdf'] - The distribution method to sample. * One of 'pdf' or 'cdf'. * @param {Array<number>} [params.extent] - The [min, max] extent over which * to sample the distribution. This argument is required in most cases, but * can be omitted if the distribution (e.g., 'kde') supports a 'data' method * that returns numerical sample points from which the extent can be deduced. * @param {number} [params.minsteps=25] - The minimum number of curve samples * for plotting the density. 
* @param {number} [params.maxsteps=200] - The maximum number of curve samples * for plotting the density. * @param {number} [params.steps] - The exact number of curve samples for * plotting the density. If specified, overrides both minsteps and maxsteps * to set an exact number of uniform samples. Useful in conjunction with * a fixed extent to ensure consistent sample points for stacked densities. */ function Density(params) { Transform.call(this, null, params); } const distributions = [{ 'key': { 'function': 'normal' }, 'params': [{ 'name': 'mean', 'type': 'number', 'default': 0 }, { 'name': 'stdev', 'type': 'number', 'default': 1 }] }, { 'key': { 'function': 'lognormal' }, 'params': [{ 'name': 'mean', 'type': 'number', 'default': 0 }, { 'name': 'stdev', 'type': 'number', 'default': 1 }] }, { 'key': { 'function': 'uniform' }, 'params': [{ 'name': 'min', 'type': 'number', 'default': 0 }, { 'name': 'max', 'type': 'number', 'default': 1 }] }, { 'key': { 'function': 'kde' }, 'params': [{ 'name': 'field', 'type': 'field', 'required': true }, { 'name': 'from', 'type': 'data' }, { 'name': 'bandwidth', 'type': 'number', 'default': 0 }] }]; const mixture = { 'key': { 'function': 'mixture' }, 'params': [{ 'name': 'distributions', 'type': 'param', 'array': true, 'params': distributions }, { 'name': 'weights', 'type': 'number', 'array': true }] }; Density.Definition = { 'type': 'Density', 'metadata': { 'generates': true }, 'params': [{ 'name': 'extent', 'type': 'number', 'array': true, 'length': 2 }, { 'name': 'steps', 'type': 'number' }, { 'name': 'minsteps', 'type': 'number', 'default': 25 }, { 'name': 'maxsteps', 'type': 'number', 'default': 200 }, { 'name': 'method', 'type': 'string', 'default': 'pdf', 'values': ['pdf', 'cdf'] }, { 'name': 'distribution', 'type': 'param', 'params': distributions.concat(mixture) }, { 'name': 'as', 'type': 'string', 'array': true, 'default': ['value', 'density'] }] }; inherits(Density, Transform, { transform(_, pulse) { const out = 
pulse.fork(pulse.NO_SOURCE | pulse.NO_FIELDS); if (!this.value || pulse.changed() || _.modified()) { const dist = parse(_.distribution, source(pulse)), minsteps = _.steps || _.minsteps || 25, maxsteps = _.steps || _.maxsteps || 200; let method = _.method || 'pdf'; if (method !== 'pdf' && method !== 'cdf') { error('Invalid density method: ' + method); } if (!_.extent && !dist.data) { error('Missing density extent parameter.'); } method = dist[method]; const as = _.as || ['value', 'density'], domain = _.extent || extent(dist.data()), values = sampleCurve(method, domain, minsteps, maxsteps).map(v => { const tuple = {}; tuple[as[0]] = v[0]; tuple[as[1]] = v[1]; return ingest(tuple); }); if (this.value) out.rem = this.value; this.value = out.add = out.source = values; } return out; } }); function source(pulse) { return () => pulse.materialize(pulse.SOURCE).source; } // use either provided alias or accessor field name function fieldNames(fields, as) { if (!fields) return null; return fields.map((f, i) => as[i] || accessorName(f)); } function partition$1(data, groupby, field) { const groups = [], get = f => f(t); let map, i, n, t, k, g; // partition data points into groups if (groupby == null) { groups.push(data.map(field)); } else { for (map = {}, i = 0, n = data.length; i < n; ++i) { t = data[i]; k = groupby.map(get); g = map[k]; if (!g) { map[k] = g = []; g.dims = k; groups.push(g); } g.push(field(t)); } } return groups; } const Output = 'bin'; /** * Dot density binning for dot plot construction. * Based on Leland Wilkinson, Dot Plots, The American Statistician, 1999. * https://www.cs.uic.edu/~wilkinson/Publications/dotplots.pdf * @constructor * @param {object} params - The parameters for this operator. * @param {function(object): *} params.field - The value field to bin. * @param {Array<function(object): *>} [params.groupby] - An array of accessors to groupby. * @param {number} [params.step] - The step size (bin width) within which dots should be * stacked. 
Defaults to 1/30 of the extent of the data *field*. * @param {boolean} [params.smooth=false] - A boolean flag indicating if dot density * stacks should be smoothed to reduce variance. */ function DotBin(params) { Transform.call(this, null, params); } DotBin.Definition = { 'type': 'DotBin', 'metadata': { 'modifies': true }, 'params': [{ 'name': 'field', 'type': 'field', 'required': true }, { 'name': 'groupby', 'type': 'field', 'array': true }, { 'name': 'step', 'type': 'number' }, { 'name': 'smooth', 'type': 'boolean', 'default': false }, { 'name': 'as', 'type': 'string', 'default': Output }] }; const autostep = (data, field) => span(extent(data, field)) / 30; inherits(DotBin, Transform, { transform(_, pulse) { if (this.value && !(_.modified() || pulse.changed())) { return pulse; // early exit } const source = pulse.materialize(pulse.SOURCE).source, groups = partition$1(pulse.source, _.groupby, identity), smooth = _.smooth || false, field = _.field, step = _.step || autostep(source, field), sort = stableCompare((a, b) => field(a) - field(b)), as = _.as || Output, n = groups.length; // compute dotplot bins per group let min = Infinity, max = -Infinity, i = 0, j; for (; i < n; ++i) { const g = groups[i].sort(sort); j = -1; for (const v of dotbin(g, step, smooth, field)) { if (v < min) min = v; if (v > max) max = v; g[++j][as] = v; } } this.value = { start: min, stop: max, step: step }; return pulse.reflow(true).modifies(as); } }); /** * Wraps an expression function with access to external parameters. * @constructor * @param {object} params - The parameters for this operator. * @param {function} params.expr - The expression function. The * function should accept both a datum and a parameter object. * This operator's value will be a new function that wraps the * expression function with access to this operator's parameters. 
*/ function Expression(params) { Operator.call(this, null, update$4, params); this.modified(true); } inherits(Expression, Operator); function update$4(_) { const expr = _.expr; return this.value && !_.modified('expr') ? this.value : accessor(datum => expr(datum, _), accessorFields(expr), accessorName(expr)); } /** * Computes extents (min/max) for a data field. * @constructor * @param {object} params - The parameters for this operator. * @param {function(object): *} params.field - The field over which to compute extends. */ function Extent(params) { Transform.call(this, [undefined, undefined], params); } Extent.Definition = { 'type': 'Extent', 'metadata': {}, 'params': [{ 'name': 'field', 'type': 'field', 'required': true }] }; inherits(Extent, Transform, { transform(_, pulse) { const extent = this.value, field = _.field, mod = pulse.changed() || pulse.modified(field.fields) || _.modified('field'); let min = extent[0], max = extent[1]; if (mod || min == null) { min = +Infinity; max = -Infinity; } pulse.visit(mod ? pulse.SOURCE : pulse.ADD, t => { const v = toNumber(field(t)); if (v != null) { // NaNs will fail all comparisons! if (v < min) min = v; if (v > max) max = v; } }); if (!Number.isFinite(min) || !Number.isFinite(max)) { let name = accessorName(field); if (name) name = ` for field "${name}"`; pulse.dataflow.warn(`Infinite extent${name}: [${min}, ${max}]`); min = max = undefined; } this.value = [min, max]; } }); /** * Provides a bridge between a parent transform and a target subflow that * consumes only a subset of the tuples that pass through the parent. * @constructor * @param {Pulse} pulse - A pulse to use as the value of this operator. * @param {Transform} parent - The parent transform (typically a Facet instance). */ function Subflow(pulse, parent) { Operator.call(this, pulse); this.parent = parent; this.count = 0; } inherits(Subflow, Operator, { /** * Routes pulses from this subflow to a target transform. 
* @param {Transform} target - A transform that receives the subflow of tuples. */ connect(target) { this.detachSubflow = target.detachSubflow; this.targets().add(target); return target.source = this; }, /** * Add an 'add' tuple to the subflow pulse. * @param {Tuple} t - The tuple being added. */ add(t) { this.count += 1; this.value.add.push(t); }, /** * Add a 'rem' tuple to the subflow pulse. * @param {Tuple} t - The tuple being removed. */ rem(t) { this.count -= 1; this.value.rem.push(t); }, /** * Add a 'mod' tuple to the subflow pulse. * @param {Tuple} t - The tuple being modified. */ mod(t) { this.value.mod.push(t); }, /** * Re-initialize this operator's pulse value. * @param {Pulse} pulse - The pulse to copy from. * @see Pulse.init */ init(pulse) { this.value.init(pulse, pulse.NO_SOURCE); }, /** * Evaluate this operator. This method overrides the * default behavior to simply return the contained pulse value. * @return {Pulse} */ evaluate() { // assert: this.value.stamp === pulse.stamp return this.value; } }); /** * Facets a dataflow into a set of subflows based on a key. * @constructor * @param {object} params - The parameters for this operator. * @param {function(Dataflow, string): Operator} params.subflow - A function * that generates a subflow of operators and returns its root operator. * @param {function(object): *} params.key - The key field to facet by. 
*/ function Facet(params) { Transform.call(this, {}, params); this._keys = fastmap(); // cache previously calculated key values // keep track of active subflows, use as targets array for listeners // this allows us to limit propagation to only updated subflows const a = this._targets = []; a.active = 0; a.forEach = f => { for (let i = 0, n = a.active; i < n; ++i) { f(a[i], i, a); } }; } inherits(Facet, Transform, { activate(flow) { this._targets[this._targets.active++] = flow; }, // parent argument provided by PreFacet subclass subflow(key, flow, pulse, parent) { const flows = this.value; let sf = hasOwnProperty(flows, key) && flows[key], df, p; if (!sf) { p = parent || (p = this._group[key]) && p.tuple; df = pulse.dataflow; sf = new Subflow(pulse.fork(pulse.NO_SOURCE), this); df.add(sf).connect(flow(df, key, p)); flows[key] = sf; this.activate(sf); } else if (sf.value.stamp < pulse.stamp) { sf.init(pulse); this.activate(sf); } return sf; }, clean() { const flows = this.value; let detached = 0; for (const key in flows) { if (flows[key].count === 0) { const detach = flows[key].detachSubflow; if (detach) detach(); delete flows[key]; ++detached; } } // remove inactive targets from the active targets array if (detached) { const active = this._targets.filter(sf => sf && sf.count > 0); this.initTargets(active); } }, initTargets(act) { const a = this._targets, n = a.length, m = act ? 
act.length : 0; let i = 0; for (; i < m; ++i) { a[i] = act[i]; } for (; i < n && a[i] != null; ++i) { a[i] = null; // ensure old flows can be garbage collected } a.active = m; }, transform(_, pulse) { const df = pulse.dataflow, key = _.key, flow = _.subflow, cache = this._keys, rekey = _.modified('key'), subflow = key => this.subflow(key, flow, pulse); this._group = _.group || {}; this.initTargets(); // reset list of active subflows pulse.visit(pulse.REM, t => { const id = tupleid(t), k = cache.get(id); if (k !== undefined) { cache.delete(id); subflow(k).rem(t); } }); pulse.visit(pulse.ADD, t => { const k = key(t); cache.set(tupleid(t), k); subflow(k).add(t); }); if (rekey || pulse.modified(key.fields)) { pulse.visit(pulse.MOD, t => { const id = tupleid(t), k0 = cache.get(id), k1 = key(t); if (k0 === k1) { subflow(k1).mod(t); } else { cache.set(id, k1); subflow(k0).rem(t); subflow(k1).add(t); } }); } else if (pulse.changed(pulse.MOD)) { pulse.visit(pulse.MOD, t => { subflow(cache.get(tupleid(t))).mod(t); }); } if (rekey) { pulse.visit(pulse.REFLOW, t => { const id = tupleid(t), k0 = cache.get(id), k1 = key(t); if (k0 !== k1) { cache.set(id, k1); subflow(k0).rem(t); subflow(k1).add(t); } }); } if (pulse.clean()) { df.runAfter(() => { this.clean(); cache.clean(); }); } else if (cache.empty > df.cleanThreshold) { df.runAfter(cache.clean); } return pulse; } }); /** * Generates one or more field accessor functions. * If the 'name' parameter is an array, an array of field accessors * will be created and the 'as' parameter will be ignored. * @constructor * @param {object} params - The parameters for this operator. * @param {string} params.name - The field name(s) to access. * @param {string} params.as - The accessor function name. */ function Field(params) { Operator.call(this, null, update$3, params); } inherits(Field, Operator); function update$3(_) { return this.value && !_.modified() ? this.value : isArray(_.name) ? 
array(_.name).map(f => field(f)) : field(_.name, _.as); } /** * Filters data tuples according to a predicate function. * @constructor * @param {object} params - The parameters for this operator. * @param {function(object): *} params.expr - The predicate expression function * that determines a tuple's filter status. Truthy values pass the filter. */ function Filter(params) { Transform.call(this, fastmap(), params); } Filter.Definition = { 'type': 'Filter', 'metadata': { 'changes': true }, 'params': [{ 'name': 'expr', 'type': 'expr', 'required': true }] }; inherits(Filter, Transform, { transform(_, pulse) { const df = pulse.dataflow, cache = this.value, // cache ids of filtered tuples output = pulse.fork(), add = output.add, rem = output.rem, mod = output.mod, test = _.expr; let isMod = true; pulse.visit(pulse.REM, t => { const id = tupleid(t); if (!cache.has(id)) rem.push(t);else cache.delete(id); }); pulse.visit(pulse.ADD, t => { if (test(t, _)) add.push(t);else cache.set(tupleid(t), 1); }); function revisit(t) { const id = tupleid(t), b = test(t, _), s = cache.get(id); if (b && s) { cache.delete(id); add.push(t); } else if (!b && !s) { cache.set(id, 1); rem.push(t); } else if (isMod && b && !s) { mod.push(t); } } pulse.visit(pulse.MOD, revisit); if (_.modified()) { isMod = false; pulse.visit(pulse.REFLOW, revisit); } if (cache.empty > df.cleanThreshold) df.runAfter(cache.clean); return output; } }); /** * Flattens array-typed field values into new data objects. * If multiple fields are specified, they are treated as parallel arrays, * with output values included for each matching index (or null if missing). * @constructor * @param {object} params - The parameters for this operator. * @param {Array<function(object): *>} params.fields - An array of field * accessors for the tuple fields that should be flattened. * @param {string} [params.index] - Optional output field name for index * value. If unspecified, no index field is included in the output. 
* @param {Array<string>} [params.as] - Output field names for flattened * array fields. Any unspecified fields will use the field name provided * by the fields accessors. */ function Flatten(params) { Transform.call(this, [], params); } Flatten.Definition = { 'type': 'Flatten', 'metadata': { 'generates': true }, 'params': [{ 'name': 'fields', 'type': 'field', 'array': true, 'required': true }, { 'name': 'index', 'type': 'string' }, { 'name': 'as', 'type': 'string', 'array': true }] }; inherits(Flatten, Transform, { transform(_, pulse) { const out = pulse.fork(pulse.NO_SOURCE), fields = _.fields, as = fieldNames(fields, _.as || []), index = _.index || null, m = as.length; // remove any previous results out.rem = this.value; // generate flattened tuples pulse.visit(pulse.SOURCE, t => { const arrays = fields.map(f => f(t)), maxlen = arrays.reduce((l, a) => Math.max(l, a.length), 0); let i = 0, j, d, v; for (; i < maxlen; ++i) { d = derive(t); for (j = 0; j < m; ++j) { d[as[j]] = (v = arrays[j][i]) == null ? null : v; } if (index) { d[index] = i; } out.add.push(d); } }); this.value = out.source = out.add; if