UNPKG

compassql

Version:

CompassQL visualization query language

400 lines 15.8 kB
import dlBin_ from 'datalib/src/bins/bins';
import { inferAll } from 'datalib/src/import/type';
import { summary } from 'datalib/src/stats';
import { autoMaxBins } from 'vega-lite/build/src/bin';
import { containsTimeUnit, convert, TimeUnit, TIMEUNIT_PARTS } from 'vega-lite/build/src/timeunit';
import * as TYPE from 'vega-lite/build/src/type';
import { DEFAULT_QUERY_CONFIG } from './config';
import { isAutoCountQuery } from './query/encoding';
import { ExpandedType } from './query/expandedtype';
import { cmp, duplicate, extend, keys } from './util';
const dlBin = dlBin_;

/**
 * Find the earliest and latest valid dates stored under `name` in `data`.
 *
 * Null, undefined and unparsable values are skipped: `new Date(null)` is the
 * epoch, so letting nulls through would drag the minimum to 1970, and an
 * Invalid Date can never be replaced by `<` / `>` comparisons once it is the
 * running bound.
 *
 * @param data - array of row objects
 * @param name - field name to read from every row
 * @return `{min, max}` as Date objects. When no row holds a valid date, both
 *         are `new Date(data[0][name])`.
 */
function temporalExtent(data, name) {
  let minTime = Infinity;
  let maxTime = -Infinity;
  for (const dataEntry of data) {
    const value = dataEntry[name];
    if (value === null || value === undefined) {
      continue;
    }
    const time = new Date(value).getTime();
    if (Number.isNaN(time)) {
      continue;
    }
    if (time < minTime) {
      minTime = time;
    }
    if (time > maxTime) {
      maxTime = time;
    }
  }
  if (minTime === Infinity) {
    // No valid date at all: fall back to the first row's value as both bounds.
    return { min: new Date(data[0][name]), max: new Date(data[0][name]) };
  }
  return { min: new Date(minTime), max: new Date(maxTime) };
}

/**
 * Build a Schema object.
 *
 * @param data - a set of raw data in the same format that Vega-Lite / Vega takes
 * Basically, it's an array in the form of:
 *
 * [
 *   {a: 1, b:2},
 *   {a: 2, b:3},
 *   ...
 * ]
 * @param opt - query config overrides; merged on top of DEFAULT_QUERY_CONFIG
 * @param tableSchema - optional table schema; a field entry whose `name` matches
 *        an inferred field is merged onto that inferred field schema
 *
 * @return a Schema object
 */
export function build(data, opt = {}, tableSchema = { fields: [] }) {
  opt = extend({}, DEFAULT_QUERY_CONFIG, opt);

  // create profiles for each variable
  const summaries = summary(data);
  const types = inferAll(data); // inferAll does stronger type inference than summary

  // look-up table: field name -> field entry of the supplied table schema
  const tableSchemaFieldIndex = tableSchema.fields.reduce((m, field) => {
    m[field.name] = field;
    return m;
  }, {});

  const fieldSchemas = summaries.map(function (fieldProfile, index) {
    const name = fieldProfile.field;
    // In Table schema, 'date' doesn't include time so use 'datetime'
    const type = types[name] === 'date' ? PrimitiveType.DATETIME : types[name];
    const distinct = fieldProfile.distinct;
    let vlType;

    if (type === PrimitiveType.NUMBER) {
      vlType = TYPE.QUANTITATIVE;
    } else if (type === PrimitiveType.INTEGER) {
      // use ordinal or nominal when cardinality of integer type is relatively low
      // and the distinct values are less than an amount specified in options
      if (distinct < opt.numberNominalLimit && distinct / fieldProfile.count < opt.numberNominalProportion) {
        vlType = TYPE.NOMINAL;
      } else {
        vlType = TYPE.QUANTITATIVE;
      }
    } else if (type === PrimitiveType.DATETIME) {
      vlType = TYPE.TEMPORAL;
      // need to get correct min/max of date data because datalib's summary method does not
      // calculate this correctly for date types.
      const extent = temporalExtent(data, name);
      fieldProfile.min = extent.min;
      fieldProfile.max = extent.max;
    } else {
      vlType = TYPE.NOMINAL;
    }

    // A nominal field whose values are almost all distinct is treated as a key.
    if (vlType === TYPE.NOMINAL &&
      distinct / fieldProfile.count > opt.minPercentUniqueForKey &&
      fieldProfile.count > opt.minCardinalityForKey) {
      vlType = ExpandedType.KEY;
    }

    let fieldSchema = {
      name: name,
      // Need to keep original index for re-exporting TableSchema
      originalIndex: index,
      vlType: vlType,
      type: type,
      stats: fieldProfile,
      timeStats: {},
      binStats: {}
    };

    // extend field schema with table schema field - if present
    const orgFieldSchema = tableSchemaFieldIndex[fieldSchema.name];
    fieldSchema = extend(fieldSchema, orgFieldSchema);
    return fieldSchema;
  });

  // calculate preset bins for quantitative and temporal data
  for (const fieldSchema of fieldSchemas) {
    if (fieldSchema.vlType === TYPE.QUANTITATIVE) {
      for (const maxbins of opt.enum.binProps.maxbins) {
        fieldSchema.binStats[maxbins] = binSummary(maxbins, fieldSchema.stats);
      }
    } else if (fieldSchema.vlType === TYPE.TEMPORAL) {
      for (const unit of opt.enum.timeUnit) {
        if (unit !== undefined) {
          fieldSchema.timeStats[unit] = timeSummary(unit, fieldSchema.stats);
        }
      }
    }
  }

  const derivedTableSchema = Object.assign({}, tableSchema, { fields: fieldSchemas });
  return new Schema(derivedTableSchema);
}

// order the field schema when we construct a new Schema
// this orders the fields in the UI
const order = {
  nominal: 0,
  key: 1,
  ordinal: 2,
  temporal: 3,
  quantitative: 4
};

/**
 * Wraps a table schema whose `fields` are field schemas (as produced by
 * `build`) and offers look-ups by field name.
 */
export class Schema {
  /**
   * @param tableSchema - object with a `fields` array of field schemas.
   *        The array is sorted in place and every entry receives an `index`.
   */
  constructor(tableSchema) {
    this._tableSchema = tableSchema;

    tableSchema.fields.sort(function (a, b) {
      // first order by vlType: nominal < key < ordinal < temporal < quantitative
      if (order[a.vlType] < order[b.vlType]) {
        return -1;
      } else if (order[a.vlType] > order[b.vlType]) {
        return 1;
      } else {
        // then order by field (alphabetically)
        return a.name.localeCompare(b.name);
      }
    });

    // Add index for sorting
    tableSchema.fields.forEach((fieldSchema, index) => (fieldSchema.index = index));

    this._fieldSchemaIndex = tableSchema.fields.reduce((m, fieldSchema) => {
      m[fieldSchema.name] = fieldSchema;
      return m;
    }, {});
  }

  /** @return a list of the field names (for enumerating). */
  fieldNames() {
    return this._tableSchema.fields.map(fieldSchema => fieldSchema.name);
  }

  /** @return a list of FieldSchemas */
  get fieldSchemas() {
    return this._tableSchema.fields;
  }

  /** @return the field schema registered under `fieldName`, or undefined if there is none */
  fieldSchema(fieldName) {
    return this._fieldSchemaIndex[fieldName];
  }

  /** @return a copy of the table schema with its fields back in their original order */
  tableSchema() {
    // the fieldschemas are re-arranged
    // but this is not allowed in table schema.
    // so we will re-order based on original index.
    const tableSchema = duplicate(this._tableSchema);
    tableSchema.fields.sort((a, b) => a.originalIndex - b.originalIndex);
    return tableSchema;
  }

  /**
   * @return primitive type of the field if exist, otherwise return null
   */
  primitiveType(fieldName) {
    return this._fieldSchemaIndex[fieldName] ? this._fieldSchemaIndex[fieldName].type : null;
  }

  /**
   * @return vlType of measurement of the field if exist, otherwise return null
   */
  vlType(fieldName) {
    return this._fieldSchemaIndex[fieldName] ? this._fieldSchemaIndex[fieldName].vlType : null;
  }

  /**
   * @return cardinality of the field associated with encQ, null if it doesn't exist.
   * @param fieldQ - field query; `field`, `aggregate`, `autoCount`, `bin`,
   *        `channel` and `timeUnit` are read
   * @param augmentTimeUnitDomain - TimeUnit field domains will not be augmented if explicitly set to false.
   * @param excludeInvalid - when true, invalid entries (null / NaN / 'Invalid Date')
   *        are not counted for timeUnit and plain fields
   */
  cardinality(fieldQ, augmentTimeUnitDomain = true, excludeInvalid = false) {
    const fieldSchema = this._fieldSchemaIndex[fieldQ.field];

    if (fieldQ.aggregate || (isAutoCountQuery(fieldQ) && fieldQ.autoCount)) {
      return 1;
    } else if (fieldQ.bin) {
      if (!fieldSchema) {
        // unknown field: honour the documented "null if it doesn't exist"
        return null;
      }
      // encQ.bin will either be a boolean or a BinQuery
      let bin;
      if (typeof fieldQ.bin === 'boolean') {
        // autoMaxBins defaults to 10 if channel is Wildcard
        bin = {
          maxbins: autoMaxBins(fieldQ.channel)
        };
      } else if (fieldQ.bin === '?') {
        // NOTE(review): this object has no `maxbins`, so the look-up below uses `undefined`
        bin = {
          enum: [true, false]
        };
      } else {
        bin = fieldQ.bin;
      }
      const maxbins = bin.maxbins;
      if (!fieldSchema.binStats[maxbins]) {
        // need to calculate
        fieldSchema.binStats[maxbins] = binSummary(maxbins, fieldSchema.stats);
      }
      // don't need to worry about excludeInvalid here because invalid values
      // don't affect linearly binned field's cardinality
      return fieldSchema.binStats[maxbins].distinct;
    } else if (fieldQ.timeUnit) {
      if (augmentTimeUnitDomain) {
        switch (fieldQ.timeUnit) {
          // TODO: this should not always be the case once Vega-Lite supports
          // turning off domain augmenting (VL issue #1385)
          case TimeUnit.SECONDS:
            return 60;
          case TimeUnit.MINUTES:
            return 60;
          case TimeUnit.HOURS:
            return 24;
          case TimeUnit.DAY:
            return 7;
          case TimeUnit.DATE:
            return 31;
          case TimeUnit.MONTH:
            return 12;
          case TimeUnit.QUARTER:
            return 4;
          case TimeUnit.MILLISECONDS:
            return 1000;
        }
      }
      if (!fieldSchema) {
        // unknown field: honour the documented "null if it doesn't exist"
        return null;
      }
      const unit = fieldQ.timeUnit;
      let timeStats = fieldSchema.timeStats;
      // if there is no precomputed summary for this timeUnit, calculate one for
      // this call (it is not written back to the field schema)
      if (!timeStats || !timeStats[unit]) {
        timeStats = Object.assign({}, timeStats, {
          [unit]: timeSummary(fieldQ.timeUnit, fieldSchema.stats)
        });
      }
      if (excludeInvalid) {
        return timeStats[unit].distinct - invalidCount(timeStats[unit].unique, ['Invalid Date', null]);
      } else {
        return timeStats[unit].distinct;
      }
    } else {
      if (fieldSchema) {
        if (excludeInvalid) {
          return fieldSchema.stats.distinct - invalidCount(fieldSchema.stats.unique, [NaN, null]);
        } else {
          return fieldSchema.stats.distinct;
        }
      } else {
        return null;
      }
    }
  }

  /**
   * Given an EncodingQuery with a timeUnit, returns true if the date field
   * has multiple distinct values for all parts of the timeUnit. Returns undefined
   * if the timeUnit is undefined.
   * i.e.
   * ('yearmonth', [Jan 1 2000, Feb 2 2000] returns false)
   * ('yearmonth', [Jan 1 2000, Feb 2 2001] returns true)
   */
  timeUnitHasVariation(fieldQ) {
    if (!fieldQ.timeUnit) {
      return;
    }

    // if there is no variation in `date`, there should not be variation in `day`
    if (fieldQ.timeUnit === TimeUnit.DAY) {
      const dateEncQ = extend({}, fieldQ, { timeUnit: TimeUnit.DATE });
      if (this.cardinality(dateEncQ, false, true) <= 1) {
        return false;
      }
    }

    const fullTimeUnit = fieldQ.timeUnit;
    for (const timeUnitPart of TIMEUNIT_PARTS) {
      if (containsTimeUnit(fullTimeUnit, timeUnitPart)) {
        // Create a clone of encQ, but with singleTimeUnit
        const singleUnitEncQ = extend({}, fieldQ, { timeUnit: timeUnitPart });
        if (this.cardinality(singleUnitEncQ, false, true) <= 1) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * @param fieldQueryParts - object whose `field` names the field to look up
   * @return `[min, max]` for quantitative and datetime fields; otherwise the
   *         sorted unique values (or `ordinalDomain` for an ordinal field that has one)
   */
  domain(fieldQueryParts) {
    // TODO: differentiate for field with bin / timeUnit
    const fieldSchema = this._fieldSchemaIndex[fieldQueryParts.field];
    let domain = keys(fieldSchema.stats.unique);
    if (fieldSchema.vlType === TYPE.QUANTITATIVE) {
      // return [min, max], coerced into number types
      return [+fieldSchema.stats.min, +fieldSchema.stats.max];
    } else if (fieldSchema.type === PrimitiveType.DATETIME) {
      // return [min, max] dates
      return [fieldSchema.stats.min, fieldSchema.stats.max];
    } else if (fieldSchema.type === PrimitiveType.INTEGER || fieldSchema.type === PrimitiveType.NUMBER) {
      // coerce non-quantitative numerical data into number type
      domain = domain.map(x => +x);
      return domain.sort(cmp);
    } else if (fieldSchema.vlType === TYPE.ORDINAL && fieldSchema.ordinalDomain) {
      return fieldSchema.ordinalDomain;
    }

    return domain
      .map(x => {
        // Convert 'null' to null as it is encoded similarly in datalib.
        // This is wrong when it is a string 'null' but that rarely happens.
        return x === 'null' ? null : x;
      })
      .sort(cmp);
  }

  /**
   * @return a Summary corresponding to the field of the given EncodingQuery
   */
  stats(fieldQ) {
    // TODO: differentiate for field with bin / timeUnit vs without
    const fieldSchema = this._fieldSchemaIndex[fieldQ.field];
    return fieldSchema ? fieldSchema.stats : null;
  }
}

/**
 * @param maxbins - maximum number of bins handed to datalib's binning
 * @param summary - field summary; `min`, `max` and `unique` are read
 * @return a summary of the binning scheme determined from the given max number of bins
 */
function binSummary(maxbins, summary) {
  const bin = dlBin({
    min: summary.min,
    max: summary.max,
    maxbins: maxbins
  });

  // start with summary, pre-binning
  const result = extend({}, summary);
  result.unique = binUnique(bin, summary.unique);
  result.distinct = (bin.stop - bin.start) / bin.step;
  result.min = bin.start;
  result.max = bin.stop;
  return result;
}

/**
 * @return a modified version of the passed summary with unique and distinct set according to the timeunit.
 * Maps 'null' (string) keys to the null value and invalid dates to 'Invalid Date' in the unique dictionary.
 */
function timeSummary(timeunit, summary) {
  const result = extend({}, summary);
  const unique = {};

  keys(summary.unique).forEach(function (dateString) {
    // don't convert null value because the Date constructor will actually convert it to a date
    const date = dateString === 'null' ? null : new Date(dateString);
    // at this point, `date` is either the null value, a valid Date object,
    // or "Invalid Date" which is a Date
    let key;
    if (date === null) {
      key = null;
    } else if (Number.isNaN(date.getTime())) {
      key = 'Invalid Date';
    } else {
      key = (timeunit === TimeUnit.DAY ? date.getDay() : convert(timeunit, date)).toString();
    }
    unique[key] = (unique[key] || 0) + summary.unique[dateString];
  });

  result.unique = unique;
  result.distinct = keys(unique).length;
  return result;
}

/**
 * @param bin - binning scheme; `bin.value(x)` maps a number to its bucket
 * @param oldUnique - dictionary from (stringified) value to occurrence count
 * @return a new unique object based off of the old unique count and a binning scheme
 */
function binUnique(bin, oldUnique) {
  const newUnique = {};
  for (const value in oldUnique) {
    let bucket;
    // for...in yields string keys, so a null value shows up as the string 'null'
    if (value === 'null') {
      bucket = null;
    } else if (Number.isNaN(Number(value))) {
      bucket = NaN;
    } else {
      bucket = bin.value(Number(value));
    }
    newUnique[bucket] = (newUnique[bucket] || 0) + oldUnique[value];
  }
  return newUnique;
}

/**
 * @param unique - dictionary from value to occurrence count
 * @param list - values to look up as keys of `unique`
 * @return the number of items in list that occur as keys of unique (with a truthy count)
 */
function invalidCount(unique, list) {
  return list.reduce(function (prev, cur) {
    return unique[cur] ? prev + 1 : prev;
  }, 0);
}

export var PrimitiveType;
(function (PrimitiveType) {
  PrimitiveType[PrimitiveType["STRING"] = 'string'] = "STRING";
  PrimitiveType[PrimitiveType["NUMBER"] = 'number'] = "NUMBER";
  PrimitiveType[PrimitiveType["INTEGER"] = 'integer'] = "INTEGER";
  PrimitiveType[PrimitiveType["BOOLEAN"] = 'boolean'] = "BOOLEAN";
  PrimitiveType[PrimitiveType["DATETIME"] = 'datetime'] = "DATETIME";
})(PrimitiveType || (PrimitiveType = {}));
//# sourceMappingURL=schema.js.map