UNPKG

synopsize

Version:

Print out a synopsis of values from a stream of newline-delimited JSON objects

108 lines (107 loc) 3.65 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.synopsize = synopsize; function count(values) { const counts = new Map(); values.forEach(value => { const previous = counts.has(value) ? counts.get(value) : 0; counts.set(value, previous + 1); }); return counts; } function isEmpty(value) { return (value === undefined) || /^\s*$/.test(value); } /** Sort strings in locale-sensitive ascending order */ function compareStrings(a, b) { return a.localeCompare(b); } /** Sort numbers in ascending order */ function compareNumbers(a, b) { return a - b; } const knownTypes = [ { id: 'DATETIME', // DATETIME: '2016-01-18T01:45:53Z', '2016-01-18 15:10:20' regExp: /^[12]\d{3}(-?)[01]\d\1[0123]\d[T ][012]?\d:[0-5]\d(:[0-5]\d)?Z?$/, compareFunction: compareStrings, }, { id: 'DATE', // DATE: '2016-01-18', '20160118' (but not '2016-01-40', '2016-0118', or '201601-18') regExp: /^[12]\d{3}(-?)[01]\d\1[0123]\d$/, compareFunction: compareStrings, }, { // INTEGER is a subset of some DATE formats, so it must come after id: 'INTEGER', // INTEGER: '-100', '0', '99' (but not '-' or '9223372036854775808') regExp: /^-?\d{1,10}$/, compareFunction: compareNumbers, }, { // BIGINT is a superset of INTEGER, but we want to prefer INTEGER if possible id: 'BIGINT', // BIGINT: '-1000000000000000000', '0', or '9223372036854775808' (but not '-') regExp: /^-?\d{1,19}$/, compareFunction: compareNumbers, }, { // REAL is a subset of INTEGER, so it must come after id: 'REAL', // REAL: '-100.05', '20', '99.004' (but not '.') regExp: /^-?(\d+|\.\d+|\d+\.\d*)$/, compareFunction: compareNumbers, }, { id: 'TIME', // TIME: '23:54', '01:45', '4:90' (but not '2016-0118' or '201601-18') regExp: /^[012]?\d:[0-5]\d$/, compareFunction: compareStrings, }, ].map(({ id, compareFunction, regExp }) => { const test = (values) => values.every(value => regExp.test(String(value))); return { id, compareFunction, regExp, test }; }); const defaultType = { id: 'TEXT', regExp: /^.*$/, test: (values) => true, compareFunction: compareStrings, }; /** Iterate through knownTypes, testing each one in turn on {values}, returning the id (name) of the first one that matches. Thus, knownTypes should be ordered from more- to less-specific types. */ function inferType(values) { // knownType.test calls values.every(...), which is trivially true for the // empty array; so it's uninformative and we want to avoid that case. if (values.length > 0) { for (let knownType of knownTypes) { if (knownType.test(values)) { // return as soon as we find a match return knownType; } } } return defaultType; } /** @param {string[]} values - A column of values that are similar in some way. */ function synopsize(values) { const nonEmptyValues = values.filter(value => !isEmpty(String(value))); const type = inferType(nonEmptyValues); // TODO: maybe just extract the minimum / maximum, rather than sorting the whole thing? const sortedNonEmptyValues = nonEmptyValues.sort(type.compareFunction); return { typeName: type.id, values, nonEmptyValues, minimum: sortedNonEmptyValues[0], maximum: sortedNonEmptyValues[sortedNonEmptyValues.length - 1], counts: count(nonEmptyValues), }; }