UNPKG

@severo_tests/hyparquet

Version:

Parquet file parser for JavaScript

249 lines (226 loc) 7.95 kB
import { getMaxDefinitionLevel, isListLike, isMapLike } from './schema.js' /** * Reconstructs a complex nested structure from flat arrays of values and * definition and repetition levels, according to Dremel encoding. * * @param {any[]} output * @param {number[] | undefined} definitionLevels * @param {number[]} repetitionLevels * @param {DecodedArray} values * @param {SchemaTree[]} schemaPath * @returns {DecodedArray} */ export function assembleLists(output, definitionLevels, repetitionLevels, values, schemaPath) { const n = definitionLevels?.length || repetitionLevels.length if (!n) return values const maxDefinitionLevel = getMaxDefinitionLevel(schemaPath) const repetitionPath = schemaPath.map(({ element }) => element.repetition_type) let valueIndex = 0 // Track state of nested structures const containerStack = [output] let currentContainer = output let currentDepth = 0 // schema depth let currentDefLevel = 0 // list depth let currentRepLevel = 0 if (repetitionLevels[0]) { // continue previous row while (currentDepth < repetitionPath.length - 2 && currentRepLevel < repetitionLevels[0]) { currentDepth++ if (repetitionPath[currentDepth] !== 'REQUIRED') { // go into last list currentContainer = currentContainer.at(-1) containerStack.push(currentContainer) currentDefLevel++ } if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++ } } for (let i = 0; i < n; i++) { // assert(currentDefLevel === containerStack.length - 1) const def = definitionLevels?.length ? definitionLevels[i] : maxDefinitionLevel const rep = repetitionLevels[i] // Pop up to start of rep level while (currentDepth && (rep < currentRepLevel || repetitionPath[currentDepth] !== 'REPEATED')) { if (repetitionPath[currentDepth] !== 'REQUIRED') { containerStack.pop() currentDefLevel-- } if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel-- currentDepth-- } // @ts-expect-error won't be empty currentContainer = containerStack.at(-1) // Go deeper to end of definition level while ( (currentDepth < repetitionPath.length - 2 || repetitionPath[currentDepth + 1] === 'REPEATED') && (currentDefLevel < def || repetitionPath[currentDepth + 1] === 'REQUIRED') ) { currentDepth++ if (repetitionPath[currentDepth] !== 'REQUIRED') { /** @type {any[]} */ const newList = [] currentContainer.push(newList) currentContainer = newList containerStack.push(newList) currentDefLevel++ } if (repetitionPath[currentDepth] === 'REPEATED') currentRepLevel++ } // Add value or null based on definition level if (def === maxDefinitionLevel) { // assert(currentDepth === maxDefinitionLevel || currentDepth === repetitionPath.length - 2) currentContainer.push(values[valueIndex++]) } else if (currentDepth === repetitionPath.length - 2) { currentContainer.push(null) } else { currentContainer.push([]) } } // Handle edge cases for empty inputs or single-level data if (!output.length) { // return max definition level of nested lists for (let i = 0; i < maxDefinitionLevel; i++) { /** @type {any[]} */ const newList = [] currentContainer.push(newList) currentContainer = newList } } return output } /** * Assemble a nested structure from subcolumn data. * https://github.com/apache/parquet-format/blob/apache-parquet-format-2.10.0/LogicalTypes.md#nested-types * * @param {Map<string, DecodedArray>} subcolumnData * @param {SchemaTree} schema top-level schema element * @param {number} [depth] depth of nested structure */ export function assembleNested(subcolumnData, schema, depth = 0) { const path = schema.path.join('.') const optional = schema.element.repetition_type === 'OPTIONAL' const nextDepth = optional ? depth + 1 : depth if (isListLike(schema)) { let sublist = schema.children[0] let subDepth = nextDepth if (sublist.children.length === 1) { sublist = sublist.children[0] subDepth++ } assembleNested(subcolumnData, sublist, subDepth) const subcolumn = sublist.path.join('.') const values = subcolumnData.get(subcolumn) if (!values) throw new Error('parquet list column missing values') if (optional) flattenAtDepth(values, depth) subcolumnData.set(path, values) subcolumnData.delete(subcolumn) return } if (isMapLike(schema)) { const mapName = schema.children[0].element.name // Assemble keys and values assembleNested(subcolumnData, schema.children[0].children[0], nextDepth + 1) assembleNested(subcolumnData, schema.children[0].children[1], nextDepth + 1) const keys = subcolumnData.get(`${path}.${mapName}.key`) const values = subcolumnData.get(`${path}.${mapName}.value`) if (!keys) throw new Error('parquet map column missing keys') if (!values) throw new Error('parquet map column missing values') if (keys.length !== values.length) { throw new Error('parquet map column key/value length mismatch') } const out = assembleMaps(keys, values, nextDepth) if (optional) flattenAtDepth(out, depth) subcolumnData.delete(`${path}.${mapName}.key`) subcolumnData.delete(`${path}.${mapName}.value`) subcolumnData.set(path, out) return } // Struct-like column if (schema.children.length) { // construct a meta struct and then invert const invertDepth = schema.element.repetition_type === 'REQUIRED' ? depth : depth + 1 /** @type {Record<string, any>} */ const struct = {} for (const child of schema.children) { assembleNested(subcolumnData, child, invertDepth) const childData = subcolumnData.get(child.path.join('.')) if (!childData) throw new Error('parquet struct missing child data') struct[child.element.name] = childData } // remove children for (const child of schema.children) { subcolumnData.delete(child.path.join('.')) } // invert struct by depth const inverted = invertStruct(struct, invertDepth) if (optional) flattenAtDepth(inverted, depth) subcolumnData.set(path, inverted) } } /** * @import {DecodedArray, SchemaTree} from '../src/types.d.ts' * @param {DecodedArray} arr * @param {number} depth */ function flattenAtDepth(arr, depth) { for (let i = 0; i < arr.length; i++) { if (depth) { flattenAtDepth(arr[i], depth - 1) } else { arr[i] = arr[i][0] } } } /** * @param {DecodedArray} keys * @param {DecodedArray} values * @param {number} depth * @returns {any[]} */ function assembleMaps(keys, values, depth) { const out = [] for (let i = 0; i < keys.length; i++) { if (depth) { out.push(assembleMaps(keys[i], values[i], depth - 1)) // go deeper } else { if (keys[i]) { /** @type {Record<string, any>} */ const obj = {} for (let j = 0; j < keys[i].length; j++) { const value = values[i][j] obj[keys[i][j]] = value === undefined ? null : value } out.push(obj) } else { out.push(undefined) } } } return out } /** * Invert a struct-like object by depth. * * @param {Record<string, any[]>} struct * @param {number} depth * @returns {any[]} */ function invertStruct(struct, depth) { const keys = Object.keys(struct) const length = struct[keys[0]]?.length const out = [] for (let i = 0; i < length; i++) { /** @type {Record<string, any>} */ const obj = {} for (const key of keys) { if (struct[key].length !== length) throw new Error('parquet struct parsing error') obj[key] = struct[key][i] } if (depth) { out.push(invertStruct(obj, depth - 1)) // deeper } else { out.push(obj) } } return out }