UNPKG

hyparquet

Version:

Parquet file parser for JavaScript

185 lines (167 loc) 6.6 kB
import { columnsNeededForFilter, matchFilter } from './filter.js'
import { parquetMetadataAsync, parquetSchema } from './metadata.js'
import { parquetPlan, prefetchAsyncBuffer } from './plan.js'
import { assembleAsync, asyncGroupToRows, readRowGroup } from './rowgroup.js'
import { concat, flatten } from './utils.js'

/**
 * @import {AsyncRowGroup, DecodedArray, ParquetReadOptions, BaseParquetReadOptions} from '../src/types.js'
 */

/**
 * Read parquet data rows from a file-like object.
 * Reads the minimal number of row groups and columns to satisfy the request.
 *
 * Returns a void promise when complete.
 * Errors are thrown on the returned promise.
 * Data is returned in callbacks onComplete, onChunk, onPage, NOT the return promise.
 * See parquetReadObjects for a more convenient API.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {Promise<void>} resolves when all requested rows and columns are parsed, all errors are thrown here
 */
export async function parquetRead(options) {
  // fetch file metadata lazily if the caller did not supply it
  options.metadata ??= await parquetMetadataAsync(options.file, options)
  const { rowStart = 0, rowEnd, columns, onChunk, onComplete, rowFormat, filter, filterStrict = true } = options

  // filters match against column names, so object row format is mandatory
  if (filter && rowFormat !== 'object') {
    throw new Error('parquet filter requires rowFormat: "object"')
  }

  // every column referenced by the filter must exist in the schema
  const filterColumns = columnsNeededForFilter(filter)
  if (filterColumns.length) {
    const schemaNames = parquetSchema(options.metadata).children.map(child => child.element.name)
    const unknown = filterColumns.filter(name => !schemaNames.includes(name))
    if (unknown.length) {
      throw new Error(`parquet filter columns not found: ${unknown.join(', ')}`)
    }
  }

  // widen the column selection so filter-only columns are read too;
  // requiresProjection records that they must be stripped from output rows later
  let readColumns = columns
  let requiresProjection = false
  if (columns && filter) {
    const extra = filterColumns.filter(name => !columns.includes(name))
    if (extra.length) {
      readColumns = [...columns, ...extra]
      requiresProjection = true
    }
  }

  // start reading row groups with the (possibly widened) column list
  const readOptions = readColumns === columns ? options : { ...options, columns: readColumns }
  const asyncGroups = parquetReadAsync(readOptions)

  // no callbacks: skip assembly, but drain the reads so errors still surface here
  if (!onComplete && !onChunk) {
    for (const group of asyncGroups) {
      for (const column of group.asyncColumns) await column.data
    }
    return
  }

  // assemble struct columns
  const schemaTree = parquetSchema(options.metadata)
  const assembled = asyncGroups.map(group => assembleAsync(group, schemaTree, options.parsers))

  // stream chunks to onChunk as each column resolves (deliberately not awaited)
  if (onChunk) {
    for (const group of assembled) {
      for (const column of group.asyncColumns) {
        column.data.then(({ data, skipped }) => {
          let chunkStart = group.groupStart + skipped
          for (const columnData of data) {
            onChunk({
              columnName: column.pathInSchema[0],
              columnData,
              rowStart: chunkStart,
              rowEnd: chunkStart + columnData.length,
            })
            chunkStart += columnData.length
          }
        })
      }
    }
  }

  if (onComplete) {
    // loosen the types to avoid duplicate code
    /** @type {any[]} */
    const rows = []
    for (const group of assembled) {
      // clamp the requested global row range to this group's local indices
      const selectStart = Math.max(rowStart - group.groupStart, 0)
      const selectEnd = Math.min((rowEnd ?? Infinity) - group.groupStart, group.groupRows)

      // transpose column chunks into rows in the requested output format
      let groupData
      if (rowFormat === 'object') {
        groupData = await asyncGroupToRows(group, selectStart, selectEnd, readColumns, 'object')
      } else {
        groupData = await asyncGroupToRows(group, selectStart, selectEnd, columns, 'array')
      }

      if (filter) {
        // keep matching rows, then strip columns that were only read for the filter
        // eslint-disable-next-line no-extra-parens
        for (const row of /** @type {Record<string, any>[]} */ (groupData)) {
          if (!matchFilter(row, filter, filterStrict)) continue
          if (requiresProjection && columns) {
            for (const name of filterColumns) {
              if (!columns.includes(name)) delete row[name]
            }
          }
          rows.push(row)
        }
      } else {
        concat(rows, groupData)
      }
    }
    onComplete(rows)
  } else {
    // onChunk only: wait for every column so the returned promise resolves after reading
    for (const group of assembled) {
      for (const column of group.asyncColumns) await column.data
    }
  }
}

/**
 * Start reading all planned row groups, returning per-group async handles.
 *
 * @param {ParquetReadOptions} options read options
 * @returns {AsyncRowGroup[]}
 */
export function parquetReadAsync(options) {
  if (!options.metadata) throw new Error('parquet requires metadata')
  // TODO: validate options (start, end, columns, etc)

  // plan byte ranges and wrap the file in a prefetching buffer
  const plan = parquetPlan(options)
  options.file = prefetchAsyncBuffer(options.file, plan)

  // kick off a read for every planned row group
  return plan.groups.map(groupPlan => readRowGroup(options, plan, groupPlan))
}

/**
 * Reads a single column from a parquet file.
 *
 * @param {BaseParquetReadOptions} options
 * @returns {Promise<DecodedArray>}
 */
export async function parquetReadColumn(options) {
  if (options.columns?.length !== 1) {
    throw new Error('parquetReadColumn expected columns: [columnName]')
  }
  options.metadata ??= await parquetMetadataAsync(options.file, options)
  const asyncGroups = parquetReadAsync(options)

  // assemble struct columns
  const schemaTree = parquetSchema(options.metadata)
  const assembled = asyncGroups.map(group => assembleAsync(group, schemaTree, options.parsers))

  // flatten each row group's chunks, then flatten across groups
  /** @type {DecodedArray[]} */
  const pieces = []
  for (const group of assembled) {
    const { data } = await group.asyncColumns[0].data
    pieces.push(flatten(data))
  }
  return flatten(pieces)
}

/**
 * This is a helper function to read parquet row data as a promise.
 * It is a wrapper around the more configurable parquetRead function.
 *
 * @param {Omit<ParquetReadOptions, 'onComplete'>} options
 * @returns {Promise<Record<string, any>[]>} resolves when all requested rows and columns are parsed
 */
export function parquetReadObjects(options) {
  return new Promise((resolve, reject) => {
    parquetRead({
      ...options,
      rowFormat: 'object', // force object output
      onComplete: resolve,
    }).catch(reject)
  })
}