UNPKG

@flourish/sdk

Version:
373 lines (372 loc) 14.5 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.createInterpreter = void 0; exports.extractData = extractData; exports.getColumnTypesForData = getColumnTypesForData; exports.getRandomSeededSample = getRandomSeededSample; exports.mulberry32 = mulberry32; exports.trimTrailingEmptyRows = trimTrailingEmptyRows; exports.dropReturnCharacters = dropReturnCharacters; exports.tidyTable = tidyTable; exports.stripCommonFixes = stripCommonFixes; exports.transposeNestedArray = transposeNestedArray; exports.getSlicedData = getSlicedData; exports.interpretColumn = interpretColumn; exports.sortDataTables = sortDataTables; const interpreter_1 = require("@flourish/interpreter"); require("./polyfills"); var interpreter_2 = require("@flourish/interpreter"); Object.defineProperty(exports, "createInterpreter", { enumerable: true, get: function () { return interpreter_2.createInterpreter; } }); function getLatestDataTimestamps(data_table_timestamps) { const timestamps = {}; const date_values = data_table_timestamps.map(t => t.last_updated?.getTime()).filter(Boolean); if (date_values.length) { const latest_date = Math.max(...date_values); timestamps.last_updated = new Date(latest_date); } return timestamps; } function extractData(data_binding, data_by_id, column_types_by_id, template_data_bindings, timestamps, // data_rewriter should be a function that takes a AugmentedDataBinding and the parsed value and returns the modified value data_rewriter) { var columns = []; var data_table_ids = []; var num_rows = 0; var dataset = Object.assign([], { column_names: {}, metadata: {}, timestamps: {} }); var interpreters_by_id = {}; // It's possible that a data_binding is for a dataset that is not in a // template_data_binding. In that case, we just return an empty dataset // to avoid an exception below. 
if (!template_data_bindings) { return dataset; } function getInterpretationIds(data_table_id, column_index) { if (!interpreters_by_id[data_table_id]) { return {}; } var by_column_index = interpreters_by_id[data_table_id]; if (!by_column_index[column_index]) { return {}; } return by_column_index[column_index]; } function getInterpreter(data_table_id, column_index) { const interpretation_ids = getInterpretationIds(data_table_id, column_index); if ("type_id" in interpretation_ids) { return interpreter_1.createInterpreter.getInterpretation(interpretation_ids.type_id); } } for (var data_table_id in column_types_by_id) { var lookup = {}; var column_types = column_types_by_id[data_table_id]; if (!column_types) { continue; } for (let i = 0; i < column_types.length; i++) { const type_id = column_types[i].type_id; const of_id = column_types[i].output_format_id; const output_format_id = (!of_id || of_id === "auto") ? type_id : of_id; lookup[column_types[i].index] = { type_id, output_format_id }; } interpreters_by_id[data_table_id] = lookup; } for (var key in data_binding) { if (data_binding[key] === null) { continue; } if (data_binding[key].columns === undefined && data_binding[key].column === undefined) { continue; } var b = data_binding[key]; b.template_data_binding = template_data_bindings[key]; b.key = key; if (!(b.data_table_id in data_by_id)) { var data_by_id_keys = []; for (var k in data_by_id) { data_by_id_keys.push(k); } console.error("Data table id " + b.data_table_id + " not in " + JSON.stringify(data_by_id_keys)); continue; } const data_table = data_by_id[b.data_table_id]; if (data_table == null) { throw new Error(`[BUG] The data from the data table with ID ${b.data_table_id} was missing`); } if (data_table.length == 0) { console.warn("Empty data table"); continue; } if ("columns" in b && b.columns != null) { var column_count = data_table[0].length; b.columns = b.columns.filter(function (i) { return i < column_count; }); dataset.column_names[key] = 
b.columns.map(function (i) { return data_table[0][i]; }); dataset.metadata[key] = b.columns.map(function (i) { const interpretation_ids = getInterpretationIds(b.data_table_id, i); if ("type_id" in interpretation_ids) { const { type_id, output_format_id } = interpretation_ids; return { type: type_id.split("$")[0], type_id, output_format_id: output_format_id, }; } return null; }); } else if ("column" in b && b.column != null) { dataset.column_names[key] = data_table[0][b.column]; const interpretation_ids = getInterpretationIds(b.data_table_id, b.column); if ("type_id" in interpretation_ids) { const { type_id, output_format_id } = interpretation_ids; dataset.metadata[key] = { type: type_id.split("$")[0], type_id, output_format_id: output_format_id, }; } } else { throw new Error("Data binding includes no column(s) specification: " + JSON.stringify(b)); } if (data_table_ids.indexOf(b.data_table_id) == -1) { data_table_ids.push(b.data_table_id); num_rows = Math.max(num_rows, data_table.length - 1); } columns.push(b); } // gets the latest timestamp info across all data tables which are linked to by the bindings for this dataset // (this is typically only a single data table)\ dataset.timestamps = getLatestDataTimestamps(data_table_ids.map(id => timestamps.per_data_table[id])); function parse(b, column_index, string_value) { if (!b.template_data_binding?.data_type) { return string_value; } var interpreter = getInterpreter(b.data_table_id, column_index); if (interpreter && interpreter.type == "number") { string_value = stripCommonFixes(string_value); } var result = interpreter ? interpreter.parse(string_value) : string_value; // We require our marshalled data to be JSON-serialisable, // therefore we convert NaNs to null here. 
if (Number.isNaN(result)) { result = null; } return result; } for (var i = 0; i < num_rows; i++) { var o = {}; for (var j = 0; j < columns.length; j++) { b = columns[j]; const table = data_by_id[b.data_table_id]; if (table == null) { throw new Error(`[BUG] The data from the data table with ID ${b.data_table_id} was missing`); } if (i + 1 >= table.length) { continue; } if (b.key == null) { throw new Error(`BUG: 'key' was ${b.key} in ${b}`); } if ("columns" in b && b.columns != null) { const parsed_values = b.columns .filter(function (c) { return c < table[i + 1].length; }) .map(function (c) { return parse(b, c, table[i + 1][c]); }); o[b.key] = data_rewriter ? parsed_values.map(v => data_rewriter(b, v)) : parsed_values; } else if ("column" in b && b.column != null) { const parsed_value = b.column >= table[i + 1].length ? parse(b, b.column, "") : parse(b, b.column, table[i + 1][b.column]); o[b.key] = data_rewriter ? data_rewriter(b, parsed_value) : parsed_value; } } dataset.push(o); } return dataset; } function getColumnTypesForData(data) { return transposeNestedArray(data) .map(function (column, i) { const sliced_column = getSlicedData(column); const sample_size = 1000; let sample_data; if (sliced_column.length > (sample_size * 2)) { sample_data = getRandomSeededSample(sliced_column, sample_size); } else { sample_data = sliced_column; } const type_id = interpretColumn(sample_data)[0].id; return { type_id: type_id, index: i, output_format_id: type_id }; }); } // Returns a random seeded sample of column values based on the column length. // The sample is consistent and will update if the length of column changes. 
/**
 * Reduces `column` to `sample_size` values by removing pseudo-randomly chosen
 * entries. The generator is seeded with the column length, so the same
 * column always yields the same sample.
 *
 * Columns of at most `sample_size * 2` entries are returned untouched.
 *
 * NOTE: the array passed in is modified in place and is also the return
 * value; pass a copy if the original must be preserved. Each removal is a
 * splice, so the cost grows roughly quadratically with the column length.
 *
 * @param {any[]} column Values to sample from (mutated).
 * @param {number} sample_size Number of values to keep.
 * @returns {any[]} The same `column` array.
 */
function getRandomSeededSample(column, sample_size) {
    if (column.length <= sample_size * 2) {
        return column;
    }
    const rng = mulberry32(column.length);
    while (column.length > sample_size) {
        const random_index = Math.floor(rng() * column.length);
        column.splice(random_index, 1);
    }
    return column;
}

// Seeded RNG implementation taken from https://github.com/bryc/code/blob/master/jshash/PRNGs.md#mulberry32
/**
 * Creates a seeded pseudo-random number generator.
 *
 * @param {number} seed Initial state.
 * @returns {function(): number} Function returning the next value in [0, 1).
 */
function mulberry32(seed) {
    let a = seed;
    return function () {
        a |= 0;
        a = (a + 0x6D2B79F5) | 0;
        let t = Math.imul(a ^ (a >>> 15), 1 | a);
        t = (t + Math.imul(t ^ (t >>> 7), 61 | t)) ^ t;
        return ((t ^ (t >>> 14)) >>> 0) / 4294967296;
    };
}

/**
 * Removes empty rows from the end of a table, in place. Row 0 is never
 * removed.
 *
 * A row counts as empty when it is falsy, has no length, or is an array
 * whose cells are all null or "". (An undefined cell is not treated as
 * empty here.)
 *
 * @param {any[][]} data Table to trim (mutated).
 * @returns {any[][]} The same `data` array.
 */
function trimTrailingEmptyRows(data) {
    for (let i = data.length; i-- > 1;) {
        const row = data[i];
        const is_empty = !row
            || !row.length
            || (Array.isArray(row) && !row.some(function (col) { return col !== null && col !== ""; }));
        if (!is_empty) {
            break;
        }
        data.splice(i, 1);
    }
    return data;
}

/**
 * Undoes a paste artefact in every string cell of a table, in place.
 * Cells that are not strings are left untouched.
 *
 * @param {any[][]} data Table to clean (mutated).
 * @returns {any[][]} The same `data` array.
 */
function dropReturnCharacters(data) {
    for (const row of data) {
        for (let i = 0; i < row.length; i++) {
            // Tables can contain null or non-string cells (tidyTable exists
            // to stringify them), so only strings are rewritten.
            if (typeof row[i] !== "string") {
                continue;
            }
            // Due to a bug in HoT, pasting long lines from Excel can lead to the addition of
            // a newline character and a space *before* a space character.
            // This leads to a pattern of new line character followed by two spaces.
            // Here we identify that pattern and revert it.
            row[i] = row[i].replace(/(\r\n|\n|\r) {2}/g, " ");
        }
    }
    return data;
}

/**
 * Takes an array of arrays (typically tabular data) and rewrites
 * it so that:
 * - Any trailing empty rows are removed
 * - Any cell that was not a string is stringified
 * - Any leading or trailing whitespace of a cell is removed
 *
 * (The potentially modified table is returned to match the convention
 * used by functions this is replacing, although (TODO) I think it
 * would be more obvious that this function has side-effects if it
 * did not return the table and the calling code was changed.)
 *
 * @param {any[][]} data
 * @returns {string[][]}
 */
function tidyTable(data) {
    trimTrailingEmptyRows(data);
    for (const row of data) {
        for (let i = 0; i < row.length; i++) {
            let value = row[i];
            // Convert null or undefined values to the empty string
            if (value == null) {
                value = "";
            }
            // If the value is not a string, convert it to one
            if (typeof value !== "string") {
                value = "" + value;
            }
            if (typeof value !== "string") {
                throw new Error("BUG: somehow value was not a string");
            }
            // Now value is a definitely a string, strip any leading
            // or trailing whitespace.
            row[i] = value.trim();
        }
    }
    // Every cell has been replaced by a trimmed string at this point.
    return data;
}

// Spreadsheet error markers; cells equal to one of these are ignored when
// working out a column's type.
const ERROR_STRINGS = ["#DIV/0", "#N/A", "#NAME?", "#NULL!", "#NUM!", "#REF!", "#VALUE!", "#ERROR!"];

// Shared interpreter used by interpretColumn.
// NOTE(review): the meaning of these three settings is defined by
// @flourish/interpreter — confirm against its documentation before changing.
const interpreter = (0, interpreter_1.createInterpreter)().nMax(Infinity).nFailingValues(8).failureFraction(0.1);

/**
 * Removes currency, percent and ordinal symbols (€ £ $ ¥ % º) from a value.
 *
 * @param {any} str Value to clean. Falsy values are treated as "";
 *   other non-string values are converted to a string first.
 * @returns {string} The value without those symbols.
 */
function stripCommonFixes(str) {
    return String(str || "").replace(/[€£$¥%º]/g, "");
}

/**
 * Transposes an array of arrays, so rows become columns.
 *
 * The width of the result is taken from the first inner array; shorter
 * inner arrays contribute undefined and longer ones are truncated.
 *
 * @param {any[][]} nested_array
 * @returns {any[][]} A new array; the input is not modified.
 */
function transposeNestedArray(nested_array) {
    const n_inner = nested_array.length;
    const n_outer = n_inner > 0 ? nested_array[0].length : 0;
    const transposed_array = [];
    for (let i = 0; i < n_outer; i++) {
        const data = [];
        for (let j = 0; j < n_inner; j++) {
            data.push(nested_array[j][i]);
        }
        transposed_array.push(data);
    }
    return transposed_array;
}

/**
 * Returns a copy of `arr` with entries dropped from its start and end; the
 * number dropped depends on the length:
 *
 *   more than 100 entries: 10 from each end
 *   more than 50: 5 from each end
 *   more than 30: 4 from each end
 *   more than 20: 3 from each end
 *   more than 10: 2 from each end
 *   2 to 10: the first entry only
 *   0 or 1: nothing is dropped
 *
 * (Presumably this skips header and footer rows before type detection —
 * TODO confirm.)
 *
 * @param {any[]} arr
 * @returns {any[]} A new array; the input is not modified.
 */
function getSlicedData(arr) {
    const n = arr.length;
    if (n > 100) {
        return arr.slice(10, n - 10);
    }
    if (n > 50) {
        return arr.slice(5, n - 5);
    }
    if (n > 30) {
        return arr.slice(4, n - 4);
    }
    if (n > 20) {
        return arr.slice(3, n - 3);
    }
    if (n > 10) {
        return arr.slice(2, n - 2);
    }
    if (n > 1) {
        return arr.slice(1, n);
    }
    return arr.slice(0, 1);
}

/**
 * Runs the shared interpreter over the usable values of a column.
 *
 * Falsy values and spreadsheet error markers are discarded, and the symbols
 * removed by stripCommonFixes are stripped from what remains. Non-string
 * values are converted to strings rather than causing an exception.
 *
 * @param {any[]} arr Column values.
 * @returns {any} Whatever the interpreter returns for the cleaned values.
 */
function interpretColumn(arr) {
    const idata = arr
        .filter(function (d) { return d && !ERROR_STRINGS.includes(String(d).trim()); })
        .map(function (d) { return stripCommonFixes(d); });
    return interpreter(idata);
}

/**
 * Sorts data tables, in place, so that they follow the order in which their
 * names first appear in the template data bindings. Tables that are unnamed
 * or not mentioned in any binding go last, keeping their relative order.
 *
 * The table name of a binding is the part of its `column` (or `columns`)
 * string that precedes the first "::". String entries in `data_bindings`
 * and bindings without a string column spec are ignored.
 *
 * @param {Array<{name?: string}>} data_tables Tables to sort (mutated).
 * @param {Array<string|Object>} data_bindings Template data bindings.
 * @returns {Array<{name?: string}>|undefined} The sorted `data_tables`
 *   array, or undefined when either argument is missing or empty.
 */
function sortDataTables(data_tables, data_bindings) {
    // Sort data tables to match order in the template data bindings
    if (!data_bindings || !data_bindings.length) {
        return;
    }
    if (!data_tables || !data_tables.length) {
        return;
    }
    const table_names = [];
    data_bindings.forEach(function (data_binding) {
        if (typeof data_binding === "string") {
            return;
        }
        let column;
        if ("column" in data_binding) {
            column = data_binding.column;
        }
        else if ("columns" in data_binding) {
            column = data_binding.columns;
        }
        if (typeof column !== "string" || !column) {
            return;
        }
        // Everything up to (but excluding) the first "::".
        const table_name = column.match(/^(?:[^:]|:[^:])*/);
        if (table_name && table_names.indexOf(table_name[0]) == -1) {
            table_names.push(table_name[0]);
        }
    });
    // Position of a table in the binding order; Infinity when it has none.
    function rank(data_table) {
        const index = data_table.name ? table_names.indexOf(data_table.name) : -1;
        return index == -1 ? Infinity : index;
    }
    return data_tables.sort(function (dt1, dt2) {
        const i = rank(dt1);
        const j = rank(dt2);
        // A comparator has to report ties as 0; otherwise the resulting
        // order of equally ranked tables is implementation-defined.
        if (i === j) {
            return 0;
        }
        return i < j ? -1 : 1;
    });
}