kepler.gl
Version:
kepler.gl is a webgl based application to visualize large scale location data in the browser
691 lines (629 loc) • 21 kB
JavaScript
// Copyright (c) 2021 Uber Technologies, Inc.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
import {csvParseRows, csvFormatRows} from 'd3-dsv';
import {range} from 'd3-array';
import {console as globalConsole} from 'global/window';
import assert from 'assert';
import {Analyzer, DATA_TYPES as AnalyzerDATA_TYPES} from 'type-analyzer';
import normalize from '@mapbox/geojson-normalize';
import {ALL_FIELD_TYPES, DATASET_FORMATS} from 'constants/default-settings';
import {notNullorUndefined, parseFieldValue} from 'utils/data-utils';
import KeplerGlSchema from 'schemas';
import {GUIDES_FILE_FORMAT_DOC} from 'constants/user-guides';
import {isPlainObject, toArray} from 'utils/utils';
// Analyzer data types that this module maps onto kepler.gl field types
// (each one has a case in `analyzerTypeToFieldType`).
// Analyzer types not listed here end up in `IGNORE_DATA_TYPES`.
export const ACCEPTED_ANALYZER_TYPES = [
AnalyzerDATA_TYPES.DATE,
AnalyzerDATA_TYPES.TIME,
AnalyzerDATA_TYPES.DATETIME,
AnalyzerDATA_TYPES.NUMBER,
AnalyzerDATA_TYPES.INT,
AnalyzerDATA_TYPES.FLOAT,
AnalyzerDATA_TYPES.BOOLEAN,
AnalyzerDATA_TYPES.STRING,
AnalyzerDATA_TYPES.GEOMETRY,
AnalyzerDATA_TYPES.GEOMETRY_FROM_STRING,
AnalyzerDATA_TYPES.PAIR_GEOMETRY_FROM_STRING,
AnalyzerDATA_TYPES.ZIPCODE,
AnalyzerDATA_TYPES.ARRAY,
AnalyzerDATA_TYPES.OBJECT
];
// If a csv cell consists entirely of one of these values, it is parsed to null:
// '', 'null', 'NULL', 'Null', 'NaN', '/N'
// The empty alternatives at the end of the group are what match the empty string.
export const CSV_NULLS = /^(null|NULL|Null|NaN|\/N||)$/;
// Analyzer types excluded from type detection: every key of `AnalyzerDATA_TYPES`
// that is not listed in `ACCEPTED_ANALYZER_TYPES`.
// NOTE(review): this compares the *keys* of `AnalyzerDATA_TYPES` against a list built
// from its *values*, so it presumably relies on each key being equal to its value — confirm.
const IGNORE_DATA_TYPES = Object.keys(AnalyzerDATA_TYPES).filter(
type => !ACCEPTED_ANALYZER_TYPES.includes(type)
);
/**
 * Helpers, keyed by field type, for turning string cells into typed values.
 * - `valid(value, field)` tells whether `value` already has the parsed form
 * - `parse(value, field)` converts a string cell into the parsed form
 */
export const PARSE_FIELD_VALUE_FROM_STRING = {
  [ALL_FIELD_TYPES.boolean]: {
    valid: value => typeof value === 'boolean',
    parse: value => ['true', 'True', 'TRUE', '1'].includes(value)
  },
  [ALL_FIELD_TYPES.integer]: {
    valid: value => Number.parseInt(value, 10) === value,
    parse: value => Number.parseInt(value, 10)
  },
  [ALL_FIELD_TYPES.timestamp]: {
    // fields whose format is 'x' or 'X' hold numbers, every other format keeps the string
    valid: (value, {format}) => {
      const expectedType = format === 'x' || format === 'X' ? 'number' : 'string';
      return typeof value === expectedType;
    },
    parse: (value, {format}) => (format === 'x' || format === 'X' ? Number(value) : value)
  },
  [ALL_FIELD_TYPES.real]: {
    valid: value => Number.parseFloat(value) === value,
    // Note: strings that do not start with a number parse to NaN
    parse: parseFloat
  }
};
/**
 * Convert csv content into a data object `{fields: [], rows: []}`.
 * The data object can be wrapped in a `dataset` and passed to [`addDataToMap`](../actions/actions.md#adddatatomap).
 * The first row is expected to hold the column names. Null-like cells are replaced with `null`,
 * field types are detected from a sample of the rows, then every cell is parsed by field type.
 * NOTE: when `rawData` is an array of rows, those rows are edited in place.
 *
 * @param rawData raw csv string, or an array of rows (each row an array of cells)
 * @param header column names; only read when `rawData` is an array of rows. When it is not
 * an array, the first row of `rawData` is used as the header instead
 * @returns data object `{fields: [], rows: []}` can be passed to addDataToMaps
 * @throws {Error} when a csv string yields fewer than 2 rows, or when `rawData` is neither
 * a string nor a non-empty array
 * @type {typeof import('./data-processor').processCsvData}
 * @public
 * @example
 * import {processCsvData} from 'kepler.gl/processors';
 *
 * const testData = `gps_data.utc_timestamp,gps_data.lat,gps_data.lng,gps_data.types,epoch,has_result,id,time,begintrip_ts_utc,begintrip_ts_local,date
 * 2016-09-17 00:09:55,29.9900937,31.2590542,driver_analytics,1472688000000,False,1,2016-09-23T00:00:00.000Z,2016-10-01 09:41:39+00:00,2016-10-01 09:41:39+00:00,2016-09-23
 * 2016-09-17 00:10:56,29.9927699,31.2461142,driver_analytics,1472688000000,False,2,2016-09-23T00:00:00.000Z,2016-10-01 09:46:37+00:00,2016-10-01 16:46:37+00:00,2016-09-23
 * 2016-09-17 00:11:56,29.9907261,31.2312742,driver_analytics,1472688000000,False,3,2016-09-23T00:00:00.000Z,,,2016-09-23
 * 2016-09-17 00:12:58,29.9870074,31.2175827,driver_analytics,1472688000000,False,4,2016-09-23T00:00:00.000Z,,,2016-09-23`
 *
 * const dataset = {
 *   info: {id: 'test_data', label: 'My Csv'},
 *   data: processCsvData(testData)
 * };
 *
 * dispatch(addDataToMap({
 *   datasets: [dataset],
 *   options: {centerMap: true, readOnly: true}
 * }));
 */
export function processCsvData(rawData, header) {
  let headerRow;
  let rows;

  if (typeof rawData === 'string') {
    const csvRows = csvParseRows(rawData);
    const hasDataRows = Array.isArray(csvRows) && csvRows.length >= 2;
    if (!hasDataRows) {
      // a header alone (or nothing at all) is treated as an empty file
      throw new Error('process Csv Data Failed: CSV is empty');
    }
    headerRow = csvRows[0];
    rows = csvRows.slice(1);
  } else if (Array.isArray(rawData) && rawData.length > 0) {
    // without an explicit header array, the first row is assumed to be the header
    const firstRowIsHeader = !Array.isArray(header);
    headerRow = firstRowIsHeader ? rawData[0] : header;
    rows = firstRowIsHeader ? rawData.slice(1) : rawData;
  }

  if (!rows || !headerRow) {
    throw new Error('invalid input passed to processCsvData');
  }

  cleanUpFalsyCsvValue(rows);

  // type detection runs on a sample of non-null values, not on every data point
  const sample = getSampleForTypeAnalyze({fields: headerRow, rows});
  const fields = getFieldsFromData(sample, headerRow);

  return {fields, rows: parseRowsByFields(rows, fields)};
}
/**
 * Parse csv cells according to the analyzed field types, so that `'1'` -> `1`, `'True'` -> `true`.
 * Rows are edited in place.
 * @param {Array<Array>} rows
 * @param {Array<Object>} fields
 * @returns {Array<Array>} the same `rows` array that was passed in
 */
export function parseRowsByFields(rows, fields) {
  // position of the `_geojson` column, -1 when there is none
  const geojsonFieldIdx = fields.findIndex(({name}) => name === '_geojson');

  fields.forEach((field, fieldIdx) => {
    parseCsvRowsByFieldType(rows, geojsonFieldIdx, field, fieldIdx);
  });

  return rows;
}
/**
 * Build a sample of row objects used for field type analysis.
 * For every field, the first non-null values found in `rows` are collected (strings are
 * trimmed); when a column runs out of values the remaining sample slots are set to `null`.
 * The sample holds at most `sampleCount` entries and never more than `rows.length`.
 *
 * @type {typeof import('./data-processor').getSampleForTypeAnalyze}
 */
export function getSampleForTypeAnalyze({fields, rows, sampleCount = 50}) {
  const total = Math.min(sampleCount, rows.length);
  const sample = range(0, total, 1).map(() => ({}));

  fields.forEach((field, fieldIdx) => {
    // position of the next row to inspect for this column
    let rowCursor = 0;

    for (let slot = 0; slot < total; slot++) {
      // skip over rows without a usable value in this column
      while (rowCursor < rows.length && !notNullorUndefined(rows[rowCursor][fieldIdx])) {
        rowCursor++;
      }

      if (rowCursor >= rows.length) {
        // data pool depleted
        sample[slot][field] = null;
      } else {
        const value = rows[rowCursor][fieldIdx];
        sample[slot][field] = typeof value === 'string' ? value.trim() : value;
        rowCursor++;
      }
    }
  });

  return sample;
}
/**
 * Replace, in place, every string cell that matches `CSV_NULLS`
 * (`''`, `'null'`, `'NULL'`, `'Null'`, `'NaN'`, `'/N'`) with `null`.
 * Empty csv cells are parsed as `''`, and the type analyzer would otherwise
 * detect such a column as string.
 *
 * @param {Array<Array>} rows
 */
function cleanUpFalsyCsvValue(rows) {
  // TODO: create warning when `CSV_NULLS` is detected in the data
  rows.forEach(row => {
    row.forEach((cell, colIdx) => {
      if (typeof cell === 'string' && CSV_NULLS.test(cell)) {
        row[colIdx] = null;
      }
    });
  });
}
/**
 * Parse, in place, the cells of one column according to the type of its field.
 * Nothing happens when the field type has no entry in `PARSE_FIELD_VALUE_FROM_STRING`,
 * when the column holds no value at all, or when its first value is already parsed.
 * Cells that are `null` or `undefined` (e.g. rows shorter than the header) are left
 * untouched instead of being parsed into `NaN` / `false`.
 * When a geojson column exists, the parsed value is also written to the `properties`
 * of the feature stored in that column.
 *
 * @param rows rows to edit in place
 * @param geoFieldIdx index of the geojson column, `-1` when there is none
 * @param field field describing the column; `field.type` selects the parser
 * @param i index of the column to parse
 * @type {typeof import('./data-processor').parseCsvRowsByFieldType}
 */
export function parseCsvRowsByFieldType(rows, geoFieldIdx, field, i) {
  const parser = PARSE_FIELD_VALUE_FROM_STRING[field.type];
  if (!parser) {
    return;
  }

  // look at the first non-empty value to find out whether the column is already parsed
  const first = rows.find(r => notNullorUndefined(r[i]));
  if (!first || parser.valid(first[i], field)) {
    return;
  }

  rows.forEach(row => {
    // missing values stay missing: same emptiness test as the one used to pick `first`
    if (!notNullorUndefined(row[i])) {
      return;
    }

    row[i] = parser.parse(row[i], field);

    // keep the feature properties in sync with the parsed column value
    const feature = geoFieldIdx > -1 ? row[geoFieldIdx] : null;
    if (feature && feature.properties) {
      feature.properties[field.name] = row[i];
    }
  });
}
/**
 * Analyze field types from data in `string` format, e.g. uploaded csv.
 * Returns one field per entry of `fieldOrder`, each with `name`, `id`, `displayName`,
 * `format`, `fieldIdx` (position in `fieldOrder`), `type`, `analyzerType` and a `valueAccessor`.
 * Duplicated names in `fieldOrder` are made unique with `renameDuplicateFields`.
 *
 * @param data array of row object
 * @param fieldOrder array of field names as string
 * @returns formatted fields
 * @type {typeof import('./data-processor').getFieldsFromData}
 * @public
 * @example
 *
 * import {getFieldsFromData} from 'kepler.gl/processors';
 * const data = [{
 *   time: '2016-09-17 00:09:55',
 *   value: '4',
 *   surge: '1.2',
 *   isTrip: 'true',
 *   zeroOnes: '0'
 * }, {
 *   time: '2016-09-17 00:30:08',
 *   value: '3',
 *   surge: null,
 *   isTrip: 'false',
 *   zeroOnes: '1'
 * }, {
 *   time: null,
 *   value: '2',
 *   surge: '1.3',
 *   isTrip: null,
 *   zeroOnes: '1'
 * }];
 *
 * const fieldOrder = ['time', 'value', 'surge', 'isTrip', 'zeroOnes'];
 * const fields = getFieldsFromData(data, fieldOrder);
 * // fields (other keys omitted) = [
 * // {name: 'time', format: 'YYYY-M-D H:m:s', fieldIdx: 0, type: 'timestamp'},
 * // {name: 'value', format: '', fieldIdx: 1, type: 'integer'},
 * // {name: 'surge', format: '', fieldIdx: 2, type: 'real'},
 * // {name: 'isTrip', format: '', fieldIdx: 3, type: 'boolean'},
 * // {name: 'zeroOnes', format: '', fieldIdx: 4, type: 'integer'}];
 *
 */
export function getFieldsFromData(data, fieldOrder) {
  // custom analyzer rules pairing a column-name pattern with a data type
  // NOTE(review): presumably these force the data type of matching columns — confirm in type-analyzer
  const customRules = [
    {regex: /.*geojson|all_points/g, dataType: 'GEOMETRY'},
    {regex: /.*census/g, dataType: 'STRING'}
  ];
  const analyzerOptions = {ignoredDataTypes: IGNORE_DATA_TYPES};
  const metadata = Analyzer.computeColMeta(data, customRules, analyzerOptions);

  const {fieldByIndex} = renameDuplicateFields(fieldOrder);

  const result = fieldOrder.map((originalName, fieldIdx) => {
    // metadata is keyed by the original column name, the field gets the de-duplicated one
    const name = fieldByIndex[fieldIdx];
    const {type, format} = metadata.find(({key}) => key === originalName) || {};

    return {
      name,
      id: name,
      displayName: name,
      format,
      fieldIdx,
      type: analyzerTypeToFieldType(type),
      analyzerType: type,
      valueAccessor: dc => d => dc.valueAt(d.index, fieldIdx)
    };
  });

  // @ts-ignore
  return result;
}
/**
 * Make every name of a list of field names unique.
 * A name that was already used gets the suffix `-0`, `-1`, ... (the first one still free).
 *
 * @param {Array} fieldOrder field names, possibly containing duplicates
 * @returns {Object} `{allNames, fieldByIndex}`: the unique names in order, and a map
 * from the index in `fieldOrder` to the unique name
 */
export function renameDuplicateFields(fieldOrder) {
  const allNames = [];
  const fieldByIndex = {};

  fieldOrder.forEach((field, i) => {
    let uniqueName = field;
    // try `field-0`, `field-1`, ... until the name is not taken yet
    for (let suffix = 0; allNames.includes(uniqueName); suffix++) {
      uniqueName = `${field}-${suffix}`;
    }

    fieldByIndex[i] = uniqueName;
    allNames.push(uniqueName);
  });

  return {allNames, fieldByIndex};
}
/**
 * Convert a type-analyzer data type into the matching kepler.gl field type.
 * Analyzer types without a mapping fall back to string and log a warning.
 *
 * @param aType analyzer data type, one of `AnalyzerDATA_TYPES`
 * @returns corresponding type in `ALL_FIELD_TYPES`
 * @type {typeof import('./data-processor').analyzerTypeToFieldType}
 */
/* eslint-disable complexity */
export function analyzerTypeToFieldType(aType) {
  const T = AnalyzerDATA_TYPES;
  const isOneOf = (...candidates) => candidates.some(candidate => candidate === aType);

  if (aType === T.DATE) {
    return ALL_FIELD_TYPES.date;
  }
  if (isOneOf(T.TIME, T.DATETIME)) {
    return ALL_FIELD_TYPES.timestamp;
  }
  if (aType === T.FLOAT) {
    return ALL_FIELD_TYPES.real;
  }
  if (aType === T.INT) {
    return ALL_FIELD_TYPES.integer;
  }
  if (aType === T.BOOLEAN) {
    return ALL_FIELD_TYPES.boolean;
  }
  if (isOneOf(T.GEOMETRY, T.GEOMETRY_FROM_STRING, T.PAIR_GEOMETRY_FROM_STRING, T.ARRAY, T.OBJECT)) {
    // TODO: create a new data type for objects and arrays
    return ALL_FIELD_TYPES.geojson;
  }

  // TODO: unrecognized types
  // CURRENCY PERCENT NONE
  if (!isOneOf(T.NUMBER, T.STRING, T.ZIPCODE)) {
    globalConsole.warn(`Unsupported analyzer type: ${aType}`);
  }
  return ALL_FIELD_TYPES.string;
}
/* eslint-enable complexity */
/**
 * Process data where each row is an object, output can be passed to [`addDataToMap`](../actions/actions.md#adddatatomap)
 * The columns are the keys of the first row object; keys that only exist on later rows are ignored.
 * NOTE: This function may mutate input. Cell values are passed on by reference, so object values
 * can be edited by the downstream csv processing.
 * @param rawData an array of row object, each object should have the same number of keys
 * @returns dataset containing `fields` and `rows`, or `null` when `rawData` is not a non-empty array
 * @type {typeof import('./data-processor').processRowObject}
 * @public
 * @example
 * import {addDataToMap} from 'kepler.gl/actions';
 * import {processRowObject} from 'kepler.gl/processors';
 *
 * const data = [
 *  {lat: 31.27, lng: 127.56, value: 3},
 *  {lat: 31.22, lng: 126.26, value: 1}
 * ];
 *
 * dispatch(addDataToMap({
 *  datasets: {
 *    info: {label: 'My Data', id: 'my_data'},
 *    data: processRowObject(data)
 *  }
 * }));
 */
export function processRowObject(rawData) {
  if (!Array.isArray(rawData) || !rawData.length) {
    return null;
  }

  const keys = Object.keys(rawData[0]);
  const rows = rawData.map(d => keys.map(key => d[key]));

  // row objects can still contain values like `Null` or `NaN`; `processCsvData` replaces
  // them with null on these same rows, so no separate clean-up pass is needed here
  return processCsvData(rows, keys);
}
/**
 * Process GeoJSON [`FeatureCollection`](http://wiki.geojson.org/GeoJSON_draft_version_6#FeatureCollection),
 * output a data object with `{fields: [], rows: []}`.
 * The data object can be wrapped in a `dataset` and passed to [`addDataToMap`](../actions/actions.md#adddatatomap)
 * Each feature with a geometry becomes one row: the feature itself is stored in the `_geojson`
 * column and every property becomes a column. Features without a geometry are skipped.
 * NOTE: This function may mutate input. Properties missing on a feature are added with a `null`
 * value, and a feature without a `properties` object receives one when that happens.
 *
 * @param rawData raw geojson feature collection
 * @returns dataset containing `fields` and `rows`
 * @throws {Error} when the input cannot be normalized into a list of features
 * @type {typeof import('./data-processor').processGeojson}
 * @public
 * @example
 * import {addDataToMap} from 'kepler.gl/actions';
 * import {processGeojson} from 'kepler.gl/processors';
 *
 * const geojson = {
 *   "type" : "FeatureCollection",
 *   "features" : [{
 *     "type" : "Feature",
 *     "properties" : {
 *       "capacity" : "10",
 *       "type" : "U-Rack"
 *     },
 *     "geometry" : {
 *       "type" : "Point",
 *       "coordinates" : [ -71.073283, 42.417500 ]
 *     }
 *   }]
 * };
 *
 * dispatch(addDataToMap({
 *   datasets: {
 *     info: {
 *       label: 'Sample Taxi Trips in New York City',
 *       id: 'test_trip_data'
 *     },
 *     data: processGeojson(geojson)
 *   }
 * }));
 */
export function processGeojson(rawData) {
  const normalizedGeojson = normalize(rawData);

  if (!normalizedGeojson || !Array.isArray(normalizedGeojson.features)) {
    // fail to normalize geojson
    throw new Error(
      `Read File Failed: File is not a valid GeoJSON. Read more about [supported file format](${GUIDES_FILE_FORMAT_DOC})`
    );
  }

  // one row per feature that has a geometry: the feature goes to the `_geojson` column
  const allDataRows = normalizedGeojson.features
    .filter(feature => feature.geometry)
    .map(feature => ({
      _geojson: feature,
      ...(feature.properties || {})
    }));

  // collect every column name in first-seen order
  const fields = new Set();
  allDataRows.forEach(row => {
    Object.keys(row).forEach(key => fields.add(key));
  });

  // make sure each feature has exact same fields
  const hasOwn = Object.prototype.hasOwnProperty;
  allDataRows.forEach(row => {
    fields.forEach(field => {
      // own-property check: names such as `toString` exist on every object through the prototype
      if (hasOwn.call(row, field)) {
        return;
      }
      row[field] = null;
      // `properties` may be null or missing on a feature
      if (!row._geojson.properties) {
        row._geojson.properties = {};
      }
      row._geojson.properties[field] = null;
    });
  });

  return processRowObject(allDataRows);
}
/**
 * Export data to csv: a header line with the field display names (falling back to the
 * field names), followed by one line per row of the data container.
 * Every cell goes through `parseFieldValue` with the type of its field.
 * @param {import('utils/table-utils/data-container-interface').DataContainerInterface} dataContainer
 * @param {Array<Object>} fields `dataset.fields`
 * @returns {string} csv string
 */
export function formatCsv(dataContainer, fields) {
  const header = fields.map(({displayName, name}) => displayName || name);

  // NOTE(review): `parseFieldValue` presumably turns non-string values such as geojson
  // objects into strings — confirm in utils/data-utils
  const body = Array.from(dataContainer.rows(true), row =>
    row.map((value, colIdx) => parseFieldValue(value, fields[colIdx].type))
  );

  return csvFormatRows([header, ...body]);
}
/**
 * Validate input data `{fields, rows}` and repair what can be repaired.
 * - Returns `null` (and logs a warning) when `data` is not a plain object, or when
 *   `data.fields` / `data.rows` is not an array.
 * - Field entries that are not plain objects are replaced with `{}` and fields without a
 *   name are named `column_<index>`. This edits `data.fields` in place.
 * - When every field has a known `type`, an `analyzerType` and (timestamp fields only) a
 *   `format` that matches the analyzed rows, `{rows, fields}` is returned as is.
 * - Otherwise `type`, `format` and `analyzerType` are recalculated for all fields from a
 *   sample of the rows and returned on copies of the fields.
 *
 * @param data object with `fields` and `rows`
 * @returns `{fields, rows}`, or `null` when the input is unusable
 * @type {typeof import('./data-processor').validateInputData}
 */
export function validateInputData(data) {
  // Problems are logged as warnings. `assert(message)` would never fire here, because a
  // non-empty string passed as the asserted value is truthy.
  if (!isPlainObject(data)) {
    globalConsole.warn('addDataToMap Error: dataset.data cannot be null');
    return null;
  } else if (!Array.isArray(data.fields)) {
    globalConsole.warn('addDataToMap Error: expect dataset.data.fields to be an array');
    return null;
  } else if (!Array.isArray(data.rows)) {
    globalConsole.warn('addDataToMap Error: expect dataset.data.rows to be an array');
    return null;
  }

  const {fields, rows} = data;

  // Pass 1: every entry must be an object with a name. This runs for all fields, so that
  // the recalculation below never meets a non-object or unnamed field.
  fields.forEach((f, i) => {
    if (!isPlainObject(f)) {
      globalConsole.warn(`fields needs to be an array of object, but find ${typeof f}`);
      fields[i] = {};
    }
    if (!fields[i].name) {
      globalConsole.warn(`field.name is required but missing in ${JSON.stringify(fields[i])}`);
      // assign a name
      fields[i].name = `column_${i}`;
    }
  });

  // Pass 2: check that all fields have a usable type, analyzerType and time format
  const allValid = fields.every((f, i) => {
    if (!ALL_FIELD_TYPES[f.type]) {
      globalConsole.warn(`unknown field type ${f.type}`);
      return false;
    }

    if (!f.analyzerType) {
      globalConsole.warn('field missing analyzerType');
      return false;
    }

    // check time format is correct based on first 10 not empty element
    if (f.type === ALL_FIELD_TYPES.timestamp) {
      const sample = findNonEmptyRowsAtField(rows, i, 10).map(r => ({ts: r[i]}));
      const analyzedType = Analyzer.computeColMeta(sample)[0];
      return analyzedType && analyzedType.category === 'TIME' && analyzedType.format === f.format;
    }

    return true;
  });

  if (allValid) {
    return {rows, fields};
  }

  // if any field is invalid, recalculate type information for every field
  const fieldOrder = fields.map(f => f.name);
  const sampleData = getSampleForTypeAnalyze({fields: fieldOrder, rows});
  const meta = getFieldsFromData(sampleData, fieldOrder);

  const updatedFields = fields.map((f, i) => ({
    ...f,
    type: meta[i].type,
    format: meta[i].format,
    analyzerType: meta[i].analyzerType
  }));

  return {fields: updatedFields, rows};
}
/**
 * Collect, in order, the first rows whose cell at `fieldIdx` is neither null nor undefined.
 * @param {Array<Array>} rows
 * @param {number} fieldIdx index of the column to inspect
 * @param {number} total maximum number of rows to return
 * @returns {Array<Array>} at most `total` rows of `rows`
 */
function findNonEmptyRowsAtField(rows, fieldIdx, total) {
  const picked = [];

  for (let rowIdx = 0; picked.length < total && rowIdx < rows.length; rowIdx++) {
    const row = rows[rowIdx];
    if (notNullorUndefined(row[fieldIdx])) {
      picked.push(row);
    }
  }

  return picked;
}
/**
 * Process saved kepler.gl json so that it can be passed to [`addDataToMap`](../actions/actions.md#adddatatomap).
 * The json object should contain `datasets` and `config`; both are handed to `KeplerGlSchema.load`.
 * @param {Object} rawData
 * @param {Array} rawData.datasets
 * @param {Object} rawData.config
 * @returns {Object} datasets and config `{datasets: {}, config: {}}`, or `null` when `rawData` is falsy
 * @public
 * @example
 * import {addDataToMap} from 'kepler.gl/actions';
 * import {processKeplerglJSON} from 'kepler.gl/processors';
 *
 * dispatch(addDataToMap(processKeplerglJSON(keplerGlJson)));
 */
export function processKeplerglJSON(rawData) {
  if (!rawData) {
    return null;
  }

  const {datasets, config} = rawData;
  return KeplerGlSchema.load(datasets, config);
}
/**
 * Parse a single dataset or an array of datasets saved using kepler.gl schema.
 * @param {Object | Array<Object>} rawData
 * @returns the parsed datasets as an array when `rawData` is an array, otherwise the single
 * parsed dataset; `null` when `rawData` is falsy or parsing yields nothing
 */
export function processKeplerglDataset(rawData) {
  const results = rawData ? KeplerGlSchema.parseSavedData(toArray(rawData)) : null;

  if (!results) {
    return null;
  }

  // mirror the shape of the input: array in, array out
  return Array.isArray(rawData) ? results : results[0];
}
// Maps each dataset format in `DATASET_FORMATS` (row, geojson, csv, keplergl)
// to the function of this module that processes data of that format.
export const DATASET_HANDLERS = {
[DATASET_FORMATS.row]: processRowObject,
[DATASET_FORMATS.geojson]: processGeojson,
[DATASET_FORMATS.csv]: processCsvData,
[DATASET_FORMATS.keplergl]: processKeplerglDataset
};
// Processor functions of this module bundled into a single exported object.
export const Processors = {
processGeojson,
processCsvData,
processRowObject,
processKeplerglJSON,
processKeplerglDataset,
analyzerTypeToFieldType,
getFieldsFromData,
parseCsvRowsByFieldType,
formatCsv
};