UNPKG

arquero

Version:

Query processing and transformation of array-backed data tables.

201 lines (189 loc) 7.39 kB
import { ColumnTable } from '../table/ColumnTable.js';
import { isArray } from '../util/is-array.js';
import { isDigitString } from '../util/is-digit-string.js';
import { isISODateString } from '../util/is-iso-date-string.js';
import { isString } from '../util/is-string.js';
import { collectJSON } from './stream/collect.js';
import { COLUMNS, NDJSON } from './stream/constants.js';
import { lineFilterTransformer } from './stream/line-filter-stream.js';
import { parseJSONStream, parseJSONSync } from './stream/parse-json-rows.js';
import { textLineTransformer } from './stream/text-line-stream.js';
import { textStream } from './stream/text-stream.js';

/**
 * Options for JSON parsing.
 * @typedef {object} JSONParseOptions
 * @property {'columns' | 'rows' | 'ndjson' | null} [type] The format type.
 *  One of `'columns'` (for an object with named column arrays), `'rows'` (for
 *  an array of row objects), or `'ndjson'` for [newline-delimited JSON][1]
 *  rows. For `'ndjson'`, each line of text must contain a JSON row object
 *  (with no trailing comma) and string properties must not contain any
 *  newline characters. If no format type is specified, one of `'rows'` or
 *  `'columns'` is inferred from the structure of the parsed JSON.
 *
 *  [1]: https://github.com/ndjson/ndjson-spec
 * @property {boolean} [autoType=true] Flag controlling automatic type
 *  inference for column values. If false, date parsing for input JSON
 *  strings is disabled.
 * @property {Record<string, (value: any) => any>} [parse] Object of column
 *  parsing options. The object keys should be column names. The object values
 *  should be parsing functions that transform values upon input.
 * @property {string[]} [columns] An array of column names to include. JSON
 *  properties missing from this list are not included in the table.
 * @property {number} [skip=0] The number of lines to skip before reading data.
 *  Applicable to newline-delimited (NDJSON) data only.
 * @property {string} [comment] A string used to identify comment lines. Any
 *  lines that start with the comment pattern are skipped. Applicable to
 *  newline-delimited (NDJSON) data only.
 */

/**
 * Parse JavaScript Object Notation (JSON) data into a table.
 * By default, automatic type inference is performed
 * for input values; string values that match the ISO standard
 * date format are parsed into JavaScript Date objects. To disable this
 * behavior, set the autoType option to false. To perform custom parsing
 * of input column values, use the parse option.
 * @param {string} input The input JSON text.
 * @param {JSONParseOptions} options The JSON parse options.
 * @return {ColumnTable} A new table containing the parsed values.
 */
export function fromJSON(input, options = {}) {
  const { columns = undefined, type = undefined } = options;
  let data;
  if (type === NDJSON) {
    data = parseJSONSync(input, columns, transforms(options));
  } else {
    // non-string input is used as-is, i.e. treated as already-parsed JSON
    const json = isString(input) ? JSON.parse(input) : input;
    // with no explicit type, anything that is not an array is read as columns
    const asColumns = type === COLUMNS || (!type && !isArray(json));
    data = asColumns
      ? parseJSONColumns(json, columns)
      : parseJSONRows(json, columns);
  }
  return postprocessJSON(data, options);
}

/**
 * Parse a JavaScript Object Notation (JSON) stream into a table.
 * By default, automatic type inference is performed
 * for input values; string values that match the ISO standard
 * date format are parsed into JavaScript Date objects. To disable this
 * behavior, set the autoType option to false. To perform custom parsing
 * of input column values, use the parse option.
 * @param {ReadableStream<string>} input The input text stream.
 * @param {JSONParseOptions} options The JSON parse options.
 * @return {Promise<ColumnTable>} A Promise to a new table containing the
 *  parsed values.
 */
export async function fromJSONStream(input, options = {}) {
  if (options.type === NDJSON) {
    // NDJSON is parsed directly from the stream
    const data = await parseJSONStream(
      input,
      options.columns,
      transforms(options)
    );
    return postprocessJSON(data, options);
  }
  // other formats: collect the stream first, then parse synchronously
  return fromJSON(await collectJSON(input), options);
}

/**
 * Load a JSON file from a URL or file path and return a Promise for an
 * Arquero table. The file is read as a text stream and parsed with
 * fromJSONStream. If no `type` option is given and the path ends
 * (case-insensitively) with a `.` followed by the NDJSON type name
 * (e.g., `.ndjson`), the newline-delimited JSON format is assumed.
 * @param {string} path The URL or file path to load.
 * @param {import('./types.js').LoadOptions & JSONParseOptions} [options]
 *  JSON parse options.
 * @return {Promise<ColumnTable>} A Promise to an Arquero table.
 * @example aq.loadJSON('data/table.json')
 */
export async function loadJSON(path, options) {
  const input = await textStream(path, options);
  // infer the NDJSON format from the file extension when no type is given
  if (!options?.type && path.toLowerCase().endsWith(`.${NDJSON}`)) {
    options = { ...options, type: NDJSON };
  }
  return fromJSONStream(input, options);
}

/**
 * Create the list of transformers passed to the NDJSON parsers: a text line
 * transformer followed by a line filter configured with the skip count and
 * comment string.
 * @param {JSONParseOptions} [options] The JSON parse options.
 * @return {any[]} The transformers, in application order.
 */
function transforms({ comment = undefined, skip = 0 } = {}) {
  return [
    textLineTransformer(),
    lineFilterTransformer(skip, comment)
  ];
}

/**
 * Extract column data from a column-oriented JSON object.
 * @param {import('../table/types.js').ColumnData} data Initial column data.
 * @param {string[]} [names] The column names to use. If provided, only these
 *  properties of the data are included; otherwise all keys of the data are
 *  used and the input object itself is returned as the column data.
 * @return {{
 *   columns: import('../table/types.js').ColumnData,
 *   names: string[]
 * }}
 */
function parseJSONColumns(data, names) {
  /** @type {import('../table/types.js').ColumnData} */
  let columns = data;
  if (names) {
    // keep only the requested columns, in the requested order
    columns = {};
    for (const name of names) {
      columns[name] = data[name];
    }
  } else {
    names = Object.keys(columns);
  }
  return { columns, names };
}

/**
 * Convert an array of row objects into column arrays.
 * @param {object[]} data The input row objects.
 * @param {string[]} [names] The column names to use. If not provided, the
 *  keys of the first row object are used; an empty row array then produces
 *  no columns.
 * @return {{
 *   columns: import('../table/types.js').ColumnData,
 *   names: string[]
 * }}
 */
function parseJSONRows(data, names) {
  // guard against empty input: Object.keys(undefined) throws a TypeError
  names ??= data.length > 0 ? Object.keys(data[0]) : [];
  const cols = names.map(() => []);
  const n = data.length;
  const m = names.length;
  for (let i = 0; i < n; ++i) {
    const obj = data[i];
    for (let j = 0; j < m; ++j) {
      cols[j].push(obj[names[j]]);
    }
  }

  /** @type {import('../table/types.js').ColumnData} */
  const columns = {};
  names.forEach((name, i) => columns[name] = cols[i]);
  return { columns, names };
}

/**
 * Post-process JavaScript Object Notation (JSON) data, performing type
 * inference as needed and returning a table instance. Parsed values are
 * written back into the given column arrays in place.
 * @param {{
 *   columns: import('../table/types.js').ColumnData,
 *   names: string[]
 * }} data The column data.
 * @param {JSONParseOptions} [options] The JSON parse options.
 * @return {ColumnTable} A new table containing the parsed values.
 */
function postprocessJSON(
  { columns, names },
  { autoType = true, parse = undefined } = {}
) {
  // parse values as necessary
  if (autoType || parse) {
    const parsers = parse || {};
    for (const name in columns) {
      const col = columns[name];
      const len = col.length;
      // Only own properties count as custom parsers. Otherwise a column
      // named e.g. "constructor" or "toString" would pick up the inherited
      // Object.prototype member and have its values corrupted.
      const hasParser = Object.prototype.hasOwnProperty.call(parsers, name)
        && parsers[name];
      if (hasParser) {
        // apply custom parser
        for (let i = 0; i < len; ++i) {
          col[i] = parsers[name](col[i]);
        }
      } else if (autoType) {
        // apply autoType parser: ISO date strings (but not all-digit
        // strings) are converted to Date objects
        for (let i = 0; i < len; ++i) {
          const val = col[i];
          if (isString(val) && isISODateString(val) && !isDigitString(val)) {
            col[i] = new Date(val);
          }
        }
      }
    }
  }

  return new ColumnTable(columns, names);
}