compassql
Version:
CompassQL visualization query language
485 lines (431 loc) • 15.9 kB
text/typescript
import dlBin_ from 'datalib/src/bins/bins';
import {inferAll} from 'datalib/src/import/type';
import {summary} from 'datalib/src/stats';
import {autoMaxBins} from 'vega-lite/build/src/bin';
import {Channel} from 'vega-lite/build/src/channel';
import {containsTimeUnit, convert, TimeUnit, TIMEUNIT_PARTS} from 'vega-lite/build/src/timeunit';
import * as TYPE from 'vega-lite/build/src/type';
import {DEFAULT_QUERY_CONFIG, QueryConfig} from './config';
import {BinQuery, EncodingQuery, FieldQuery, isAutoCountQuery} from './query/encoding';
import {ExpandedType} from './query/expandedtype';
import {cmp, duplicate, extend, keys} from './util';
// Local alias for the default export of 'datalib/src/bins/bins'; binSummary calls it to compute a binning scheme.
const dlBin = dlBin_;
/**
* Table Schema Field Descriptor interface.
* Describes a single field (column) of a table.
* see: https://specs.frictionlessdata.io/table-schema/
*/
export interface TableSchemaFieldDescriptor {
/** Name of the field. */
name: string;
/** A nicer human readable label or title for the field. */
title?: string;
/** Primitive type of the field: string, number, integer, boolean or datetime (see PrimitiveType). */
type: PrimitiveType;
/** A string specifying a format. */
format?: string;
/** A description for the field. */
description?: string;
}
/**
* Field Schema: a table-schema field descriptor extended with the Vega-Lite type,
* ordering indices and the summary statistics that Schema reads.
*/
export interface FieldSchema extends TableSchemaFieldDescriptor {
/** Vega-Lite (expanded) type of the field; the Schema constructor sorts fields by it. */
vlType?: ExpandedType;
/** Position of the field after the Schema constructor has sorted the fields. */
index?: number;
// Need to keep original index for re-exporting TableSchema
originalIndex?: number;
/** Summary profile of the raw (un-binned, un-timeunit-ed) field values. */
stats: DLFieldProfile;
/** Profiles of the field after binning, keyed by the `maxbins` setting. */
binStats?: {[maxbins: string]: DLFieldProfile};
/** Profiles of the field after applying a time unit, keyed by the time unit. */
timeStats?: {[timeUnit: string]: DLFieldProfile};
// array of valid input values (fields); Schema.domain returns it as-is for ordinal fields
ordinalDomain?: string[];
}
/**
* Table Schema: a list of field descriptors plus optional table-level properties.
* The type parameter F is the kind of field descriptor stored in `fields`.
* see: https://specs.frictionlessdata.io/table-schema/
*/
export interface TableSchema<F extends TableSchemaFieldDescriptor> {
/** Descriptors of the table's fields. */
fields: F[];
// NOTE(review): the three optional properties below are not read anywhere in this file;
// presumably they carry the meaning given by the Table Schema specification linked above.
missingValues?: string[];
primaryKey?: string | string[];
foreignKeys?: object[];
}
/**
 * Build a Schema object.
 *
 * Every field found in `data` is profiled, assigned a primitive type and a
 * Vega-Lite type, and (for quantitative / temporal fields) given preset
 * bin / time-unit statistics.
 *
 * @param data - a set of raw data in the same format that Vega-Lite / Vega takes
 * Basically, it's an array in the form of:
 *
 * [
 *   {a: 1, b:2},
 *   {a: 2, b:3},
 *   ...
 * ]
 *
 * @param opt - query config; it is merged over DEFAULT_QUERY_CONFIG.
 * @param tableSchema - optional table schema. A field descriptor whose `name`
 * matches an inferred field is merged over the inferred field schema, and the
 * remaining table-level properties are carried over to the resulting schema.
 *
 * @return a Schema object
 */
export function build(
  data: any,
  opt: QueryConfig = {},
  tableSchema: TableSchema<TableSchemaFieldDescriptor> = {fields: []}
): Schema {
  opt = extend({}, DEFAULT_QUERY_CONFIG, opt);

  // create profiles for each variable
  let summaries: DLFieldProfile[] = summary(data);
  let types = inferAll(data); // inferAll does stronger type inference than summary

  // index the user-supplied field descriptors by field name
  let tableSchemaFieldIndex = tableSchema.fields.reduce((m, field: TableSchemaFieldDescriptor) => {
    m[field.name] = field;
    return m;
  }, {});

  let fieldSchemas: FieldSchema[] = summaries.map(function(fieldProfile, index) {
    const name: string = fieldProfile.field;
    // In Table schema, 'date' doesn't include time so use 'datetime'
    const type: PrimitiveType = types[name] === 'date' ? PrimitiveType.DATETIME : (types[name] as any);
    let distinct: number = fieldProfile.distinct;
    let vlType: ExpandedType;

    if (type === PrimitiveType.NUMBER) {
      vlType = TYPE.QUANTITATIVE;
    } else if (type === PrimitiveType.INTEGER) {
      // use ordinal or nominal when cardinality of integer type is relatively low and the distinct values are less than an amount specified in options
      if (distinct < opt.numberNominalLimit && distinct / fieldProfile.count < opt.numberNominalProportion) {
        vlType = TYPE.NOMINAL;
      } else {
        vlType = TYPE.QUANTITATIVE;
      }
    } else if (type === PrimitiveType.DATETIME) {
      vlType = TYPE.TEMPORAL;
      // need to get correct min/max of date data because datalib's summary method does not
      // calculate this correctly for date types.
      //
      // Missing and unparseable values are skipped: `new Date(null)` is the Unix epoch and an
      // invalid date has a NaN timestamp, so including either would corrupt the extent.
      let minTime: number;
      let maxTime: number;
      for (const dataEntry of data) {
        const value = dataEntry[name];
        if (value === null || value === undefined) {
          continue;
        }
        const time = new Date(value).getTime();
        if (isNaN(time)) {
          continue;
        }
        if (minTime === undefined || time < minTime) {
          minTime = time;
        }
        if (maxTime === undefined || time > maxTime) {
          maxTime = time;
        }
      }
      // only override the profile when at least one valid date was seen
      if (minTime !== undefined) {
        fieldProfile.min = new Date(minTime);
        fieldProfile.max = new Date(maxTime);
      }
    } else {
      vlType = TYPE.NOMINAL;
    }

    // a nominal field whose values are (almost) all distinct is treated as a key
    if (
      vlType === TYPE.NOMINAL &&
      distinct / fieldProfile.count > opt.minPercentUniqueForKey &&
      fieldProfile.count > opt.minCardinalityForKey
    ) {
      vlType = ExpandedType.KEY;
    }

    let fieldSchema = {
      name: name,
      // Need to keep original index for re-exporting TableSchema
      originalIndex: index,
      vlType: vlType,
      type: type,
      stats: fieldProfile,
      timeStats: {} as {[timeUnit: string]: DLFieldProfile},
      binStats: {} as {[key: string]: DLFieldProfile}
    };

    // extend field schema with table schema field - if present
    const orgFieldSchema = tableSchemaFieldIndex[fieldSchema.name];
    fieldSchema = extend(fieldSchema, orgFieldSchema);

    return fieldSchema;
  });

  // calculate preset bins for quantitative and temporal data
  for (let fieldSchema of fieldSchemas) {
    if (fieldSchema.vlType === TYPE.QUANTITATIVE) {
      for (let maxbins of opt.enum.binProps.maxbins) {
        fieldSchema.binStats[maxbins] = binSummary(maxbins, fieldSchema.stats);
      }
    } else if (fieldSchema.vlType === TYPE.TEMPORAL) {
      for (let unit of opt.enum.timeUnit) {
        if (unit !== undefined) {
          fieldSchema.timeStats[unit] = timeSummary(unit, fieldSchema.stats);
        }
      }
    }
  }

  // keep every table-level property of the input schema, but replace its fields
  const derivedTableSchema: TableSchema<FieldSchema> = {
    ...tableSchema,
    fields: fieldSchemas
  };

  return new Schema(derivedTableSchema);
}
// order the field schema when we construct a new Schema
// this orders the fields in the UI
// Maps each vlType to its rank; the Schema constructor's comparator puts lower ranks first
// (nominal < key < ordinal < temporal < quantitative).
const order = {
nominal: 0,
key: 1,
ordinal: 2,
temporal: 3,
quantitative: 4
};
/**
 * A table schema whose fields are sorted for display and indexed by name,
 * with helpers for looking up types, cardinalities, domains and statistics.
 */
export class Schema {
  private _tableSchema: TableSchema<FieldSchema>;
  private _fieldSchemaIndex: {[field: string]: FieldSchema};

  /**
   * Note that `tableSchema.fields` is sorted in place and each field's
   * `index` is overwritten with its sorted position.
   */
  constructor(tableSchema: TableSchema<FieldSchema>) {
    this._tableSchema = tableSchema;

    tableSchema.fields.sort(function(a: FieldSchema, b: FieldSchema) {
      // first order by vlType rank: nominal < key < ordinal < temporal < quantitative
      if (order[a.vlType] < order[b.vlType]) {
        return -1;
      } else if (order[a.vlType] > order[b.vlType]) {
        return 1;
      } else {
        // then order by field (alphabetically)
        return a.name.localeCompare(b.name);
      }
    });

    // Add index for sorting
    tableSchema.fields.forEach((fieldSchema, index) => (fieldSchema.index = index));

    this._fieldSchemaIndex = tableSchema.fields.reduce((m, fieldSchema: FieldSchema) => {
      m[fieldSchema.name] = fieldSchema;
      return m;
    }, {});
  }

  /** @return a list of the field names (for enumerating). */
  public fieldNames() {
    return this._tableSchema.fields.map(fieldSchema => fieldSchema.name);
  }

  /** @return a list of FieldSchemas */
  public get fieldSchemas() {
    return this._tableSchema.fields;
  }

  /** @return the FieldSchema of the given field name, or undefined if there is no such field */
  public fieldSchema(fieldName: string) {
    return this._fieldSchemaIndex[fieldName];
  }

  /** @return a copy of the table schema with the fields back in their original order */
  public tableSchema() {
    // the fieldschemas are re-arranged
    // but this is not allowed in table schema.
    // so we will re-order based on original index.
    const tableSchema = duplicate(this._tableSchema);
    tableSchema.fields.sort((a, b) => a.originalIndex - b.originalIndex);
    return tableSchema;
  }

  /**
   * @return primitive type of the field if exist, otherwise return null
   */
  public primitiveType(fieldName: string) {
    return this._fieldSchemaIndex[fieldName] ? this._fieldSchemaIndex[fieldName].type : null;
  }

  /**
   * @return vlType of measurement of the field if exist, otherwise return null
   */
  public vlType(fieldName: string) {
    return this._fieldSchemaIndex[fieldName] ? this._fieldSchemaIndex[fieldName].vlType : null;
  }

  /** @return cardinality of the field associated with encQ, null if it doesn't exist.
   * Aggregated / auto-count queries always have cardinality 1, and an augmented
   * time unit has a fixed cardinality, regardless of whether the field exists.
   * @param augmentTimeUnitDomain - TimeUnit field domains will not be augmented if explicitly set to false.
   * @param excludeInvalid - if true, invalid values (null / NaN / 'Invalid Date') are not counted.
   */
  public cardinality(fieldQ: FieldQuery, augmentTimeUnitDomain: boolean = true, excludeInvalid: boolean = false) {
    const fieldSchema = this._fieldSchemaIndex[fieldQ.field as string];
    if (fieldQ.aggregate || (isAutoCountQuery(fieldQ) && fieldQ.autoCount)) {
      return 1;
    } else if (fieldQ.bin) {
      if (!fieldSchema) {
        // unknown field: honor the documented contract instead of throwing
        return null;
      }

      // encQ.bin will either be a boolean or a BinQuery
      let bin: BinQuery;
      if (typeof fieldQ.bin === 'boolean') {
        // autoMaxBins defaults to 10 if channel is Wildcard
        bin = {
          maxbins: autoMaxBins(fieldQ.channel as Channel)
        };
      } else if (fieldQ.bin === '?') {
        bin = {
          enum: [true, false]
        };
      } else {
        bin = fieldQ.bin;
      }
      const maxbins: any = bin.maxbins;
      // binStats is optional on FieldSchema, so create the cache on demand
      if (!fieldSchema.binStats) {
        fieldSchema.binStats = {};
      }
      if (!fieldSchema.binStats[maxbins]) {
        // need to calculate
        fieldSchema.binStats[maxbins] = binSummary(maxbins, fieldSchema.stats);
      }
      // don't need to worry about excludeInvalid here because invalid values don't affect linearly binned field's cardinality
      return fieldSchema.binStats[maxbins].distinct;
    } else if (fieldQ.timeUnit) {
      if (augmentTimeUnitDomain) {
        switch (fieldQ.timeUnit) {
          // TODO: this should not always be the case once Vega-Lite supports turning off domain augmenting (VL issue #1385)
          case TimeUnit.SECONDS:
            return 60;
          case TimeUnit.MINUTES:
            return 60;
          case TimeUnit.HOURS:
            return 24;
          case TimeUnit.DAY:
            return 7;
          case TimeUnit.DATE:
            return 31;
          case TimeUnit.MONTH:
            return 12;
          case TimeUnit.QUARTER:
            return 4;
          case TimeUnit.MILLISECONDS:
            return 1000;
        }
      }

      if (!fieldSchema) {
        // unknown field: honor the documented contract instead of throwing
        return null;
      }

      let unit = fieldQ.timeUnit as string;
      let timeStats = fieldSchema.timeStats;
      // if the cardinality for the timeUnit is not cached, calculate it
      // (the result is only kept in a local copy; the field schema is left untouched)
      if (!timeStats || !timeStats[unit]) {
        timeStats = {
          ...timeStats,
          [unit]: timeSummary(fieldQ.timeUnit as TimeUnit, fieldSchema.stats)
        };
      }

      if (excludeInvalid) {
        return timeStats[unit].distinct - invalidCount(timeStats[unit].unique, ['Invalid Date', null]);
      } else {
        return timeStats[unit].distinct;
      }
    } else {
      if (fieldSchema) {
        if (excludeInvalid) {
          return fieldSchema.stats.distinct - invalidCount(fieldSchema.stats.unique, [NaN, null]);
        } else {
          return fieldSchema.stats.distinct;
        }
      } else {
        return null;
      }
    }
  }

  /**
   * Given an EncodingQuery with a timeUnit, returns true if the date field
   * has multiple distinct values for all parts of the timeUnit. Returns undefined
   * if the timeUnit is undefined.
   * i.e.
   * ('yearmonth', [Jan 1 2000, Feb 2 2000] returns false)
   * ('yearmonth', [Jan 1 2000, Feb 2 2001] returns true)
   */
  public timeUnitHasVariation(fieldQ: FieldQuery): boolean {
    if (!fieldQ.timeUnit) {
      return;
    }

    // if there is no variation in `date`, there should not be variation in `day`
    if (fieldQ.timeUnit === TimeUnit.DAY) {
      const dateEncQ: EncodingQuery = extend({}, fieldQ, {timeUnit: TimeUnit.DATE});
      if (this.cardinality(dateEncQ, false, true) <= 1) {
        return false;
      }
    }

    let fullTimeUnit = fieldQ.timeUnit;
    for (let timeUnitPart of TIMEUNIT_PARTS) {
      if (containsTimeUnit(fullTimeUnit as TimeUnit, timeUnitPart)) {
        // Create a clone of encQ, but with singleTimeUnit
        const singleUnitEncQ = extend({}, fieldQ, {timeUnit: timeUnitPart});
        if (this.cardinality(singleUnitEncQ, false, true) <= 1) {
          return false;
        }
      }
    }
    return true;
  }

  /**
   * @return the domain of the given field:
   * [min, max] for quantitative and datetime fields, the sorted unique values
   * for other numeric fields, the declared `ordinalDomain` for ordinal fields
   * that have one, and the sorted unique values otherwise.
   */
  public domain(fieldQueryParts: {field: string}): any[] {
    // TODO: differentiate for field with bin / timeUnit
    const fieldSchema = this._fieldSchemaIndex[fieldQueryParts.field as string];
    let domain: any[] = keys(fieldSchema.stats.unique);
    if (fieldSchema.vlType === TYPE.QUANTITATIVE) {
      // return [min, max], coerced into number types
      return [+fieldSchema.stats.min, +fieldSchema.stats.max];
    } else if (fieldSchema.type === PrimitiveType.DATETIME) {
      // return [min, max] dates
      return [fieldSchema.stats.min, fieldSchema.stats.max];
    } else if (fieldSchema.type === PrimitiveType.INTEGER || fieldSchema.type === PrimitiveType.NUMBER) {
      // coerce non-quantitative numerical data into number type
      domain = domain.map(x => +x);
      return domain.sort(cmp);
    } else if (fieldSchema.vlType === TYPE.ORDINAL && fieldSchema.ordinalDomain) {
      return fieldSchema.ordinalDomain;
    }

    return domain
      .map(x => {
        // Convert 'null' to null as it is encoded similarly in datalib.
        // This is wrong when it is a string 'null' but that rarely happens.
        return x === 'null' ? null : x;
      })
      .sort(cmp);
  }

  /**
   * @return a Summary corresponding to the field of the given EncodingQuery,
   * or null if there is no such field
   */
  public stats(fieldQ: FieldQuery) {
    // TODO: differentiate for field with bin / timeUnit vs without
    const fieldSchema = this._fieldSchemaIndex[fieldQ.field as string];
    return fieldSchema ? fieldSchema.stats : null;
  }
}
/**
 * Derive the profile of a field after binning it into at most `maxbins` bins.
 *
 * @return a copy of `summary` whose unique / distinct / min / max describe the
 * binning scheme chosen for the field's [min, max] range
 */
function binSummary(maxbins: number, summary: DLFieldProfile): DLFieldProfile {
  const {min, max} = summary;
  const binning = dlBin({min, max, maxbins});

  // copy the pre-binning profile, then overwrite the bin-dependent statistics
  const binned = extend({}, summary);
  binned.unique = binUnique(binning, summary.unique);
  binned.distinct = (binning.stop - binning.start) / binning.step;
  binned.min = binning.start;
  binned.max = binning.stop;
  return binned;
}
/**
 * Re-group the unique values of a date field by a time unit.
 *
 * @return a copy of the passed summary whose `unique` counts are keyed by the
 * time-unit value and whose `distinct` is the number of such keys.
 * 'null' (string) keys are mapped to the null value and unparseable dates to
 * 'Invalid Date' in the unique dictionary.
 */
function timeSummary(timeunit: TimeUnit, summary: DLFieldProfile): DLFieldProfile {
  const profile = extend({}, summary);
  const counts: {[value: string]: number} = {};

  for (const dateString of keys(summary.unique)) {
    let bucket: string;
    if (dateString === 'null') {
      // never hand null to the Date constructor: it would be turned into a real date
      bucket = null;
    } else {
      const parsed = new Date(dateString);
      // an unparseable string still yields a Date object, but with a NaN timestamp
      bucket = isNaN(parsed.getTime())
        ? 'Invalid Date'
        : (timeunit === TimeUnit.DAY ? parsed.getDay() : convert(timeunit, parsed)).toString();
    }
    counts[bucket] = (counts[bucket] || 0) + summary.unique[dateString];
  }

  profile.unique = counts;
  profile.distinct = keys(counts).length;
  return profile;
}
/**
 * Re-group the counts of a unique-value dictionary by bin.
 *
 * @param bin - binning scheme; `bin.value(x)` must return the bucket of the number x
 * @param oldUnique - dictionary from (stringified) value to its count
 * @return a new unique object based off of the old unique count and a binning scheme.
 * Counts of the 'null' key are kept under 'null', counts of non-numeric keys are
 * collected under 'NaN', and all other counts are summed per bucket.
 */
function binUnique(bin: any, oldUnique: any) {
  const newUnique: {[bucket: string]: number} = {};
  for (const value in oldUnique) {
    let bucket: number;
    // object keys are always strings, so the null value shows up as the string 'null'
    if (value === 'null') {
      bucket = null;
    } else if (isNaN(Number(value))) {
      bucket = NaN;
    } else {
      bucket = bin.value(Number(value)) as number;
    }
    const key = String(bucket);
    newUnique[key] = (newUnique[key] || 0) + oldUnique[value];
  }
  return newUnique;
}
/**
 * Count how many entries of `list` are present in `unique`.
 * An entry only counts when its value in `unique` is truthy.
 *
 * @return the number of items in list that occur as keys of unique
 */
function invalidCount(unique: {}, list: any[]) {
  const counts = unique as {[key: string]: any};
  return list.filter(candidate => Boolean(counts[candidate])).length;
}
/**
* Primitive data types a field can have; used as the type of
* TableSchemaFieldDescriptor.type.
* NOTE(review): the `as any` casts look like the old workaround for giving enum
* members string values — confirm before converting this to a plain string enum,
* since that may change the emitted runtime object.
*/
export enum PrimitiveType {
STRING = 'string' as any,
NUMBER = 'number' as any,
INTEGER = 'integer' as any,
BOOLEAN = 'boolean' as any,
DATETIME = 'datetime' as any
}