hyparquet

Parquet file parser for JavaScript

import { canSkipRowGroup } from './filter.js'
import { parquetSchema } from './metadata.js'
import { getPhysicalColumns } from './schema.js'

// Combine column chunks if less than 2mb
const runLimit = 1 << 21 // 2mb

/**
 * @import {AsyncBuffer, ByteRange, ChunkPlan, GroupPlan, ParquetReadOptions, QueryPlan} from '../src/types.js'
 */

/**
 * Plan which byte ranges to read to satisfy a read request.
 * Metadata must be non-null.
 *
 * @param {ParquetReadOptions} options
 * @returns {QueryPlan}
 */
export function parquetPlan({ metadata, rowStart = 0, rowEnd = Infinity, columns, filter, filterStrict = true, useOffsetIndex = false }) {
  if (!metadata) throw new Error('parquetPlan requires metadata')
  /** @type {GroupPlan[]} */
  const groups = []
  /** @type {ByteRange[]} */
  const fetches = []
  /** @type {ByteRange[]} */
  const indexes = []
  const physicalColumns = getPhysicalColumns(parquetSchema(metadata))

  // find which row groups to read
  let groupStart = 0 // first row index of the current group
  for (const rowGroup of metadata.row_groups) {
    const groupRows = Number(rowGroup.num_rows)
    const groupEnd = groupStart + groupRows
    // if row group overlaps with row range, add it to the plan
    if (groupRows > 0 && groupEnd > rowStart && groupStart < rowEnd && !canSkipRowGroup({ rowGroup, physicalColumns, filter, strict: filterStrict })) {
      /** @type {ChunkPlan[]} */
      const chunks = []
      let groupStartByte = Infinity
      let groupEndByte = -Infinity
      // loop through each column chunk
      for (const chunk of rowGroup.columns) {
        const meta = chunk.meta_data
        if (chunk.file_path) throw new Error('parquet file_path not supported')
        if (!meta) throw new Error('parquet column metadata is undefined')

        // add included column chunks to the plan
        if (!columns || columns.includes(meta.path_in_schema[0])) {
          // full column chunk
          const columnOffset = meta.dictionary_page_offset || meta.data_page_offset
          const startByte = Number(columnOffset)
          const endByte = Number(columnOffset + meta.total_compressed_size)
          // update group byte range
          if (startByte < groupStartByte) groupStartByte = startByte
          if (endByte > groupEndByte) groupEndByte = endByte
          if (useOffsetIndex && chunk.offset_index_offset && chunk.offset_index_length) {
            const offsetIndexStart = Number(chunk.offset_index_offset)
            chunks.push({
              columnMetadata: meta,
              offsetIndex: {
                startByte: offsetIndexStart,
                endByte: offsetIndexStart + chunk.offset_index_length,
              },
              bounds: { startByte, endByte },
            })
          } else {
            chunks.push({
              columnMetadata: meta,
              range: { startByte, endByte },
            })
          }
        }
      }

      const selectStart = Math.max(rowStart - groupStart, 0)
      const selectEnd = Math.min(rowEnd - groupStart, groupRows)
      groups.push({ chunks, rowGroup, groupStart, groupRows, selectStart, selectEnd })

      // combine runs of column chunks
      /** @type {ByteRange | undefined} */
      let run
      for (const chunk of chunks) {
        if ('offsetIndex' in chunk) {
          indexes.push(chunk.offsetIndex)
        } else {
          const { range } = chunk
          if (columns) {
            fetches.push(range)
          } else if (run && range.endByte - run.startByte <= runLimit) {
            // extend range
            run.endByte = range.endByte
          } else {
            // new range
            if (run) fetches.push(run)
            run = { ...range }
          }
        }
      }
      if (run) fetches.push(run)
    }
    groupStart = groupEnd
  }
  if (!isFinite(rowEnd)) rowEnd = groupStart

  fetches.push(...indexes)

  return { metadata, rowStart, rowEnd, columns, fetches, groups }
}
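// Example (illustrative, hypothetical numbers): for metadata describing a
// single 1000-row row group whose only column chunk occupies bytes 4..5000,
// a call like parquetPlan({ metadata, rowEnd: 100 }) would return roughly:
//
//   {
//     metadata,
//     rowStart: 0,
//     rowEnd: 100,
//     columns: undefined,
//     fetches: [{ startByte: 4, endByte: 5000 }],
//     groups: [{ chunks: [...], rowGroup, groupStart: 0, groupRows: 1000, selectStart: 0, selectEnd: 100 }],
//   }
//
// When no column projection is given, byte ranges of adjacent column chunks
// are merged into a single fetch as long as the combined run stays under
// runLimit (2 MiB).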
/**
 * Prefetch byte ranges from an AsyncBuffer.
 *
 * @param {AsyncBuffer} file
 * @param {QueryPlan} plan
 * @returns {AsyncBuffer}
 */
export function prefetchAsyncBuffer(file, { fetches }) {
  // fetch byte ranges from the file
  const promises = fetches.map(({ startByte, endByte }) => file.slice(startByte, endByte))
  return {
    byteLength: file.byteLength,
    slice(start, end = file.byteLength) {
      // find matching slice
      const index = fetches.findIndex(({ startByte, endByte }) => startByte <= start && end <= endByte)
      if (index < 0) {
        // fallback to direct read
        return file.slice(start, end)
      }
      if (fetches[index].startByte !== start || fetches[index].endByte !== end) {
        // slice a subrange of the prefetch
        const startOffset = start - fetches[index].startByte
        const endOffset = end - fetches[index].startByte
        if (promises[index] instanceof Promise) {
          return promises[index].then(buffer => buffer.slice(startOffset, endOffset))
        } else {
          return promises[index].slice(startOffset, endOffset)
        }
      } else {
        return promises[index]
      }
    },
  }
}
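Taken together, the two functions support a plan-then-prefetch pattern: compute the byte ranges once, start all fetches, and hand the resulting AsyncBuffer to the reader so its slice() calls resolve from memory rather than triggering new reads. The sketch below is illustrative rather than part of the file: readWithPrefetch is a hypothetical helper, the column names and import path are placeholders, and it assumes metadata was already parsed (for example with hyparquet's parquetMetadataAsync) from the same AsyncBuffer.

import { parquetPlan, prefetchAsyncBuffer } from './plan.js' // module path assumed

/**
 * Plan reads for the first 100 rows of two (placeholder) columns, start all
 * range fetches immediately, and return an AsyncBuffer whose slice() calls
 * are served from the prefetched ranges, falling back to direct reads for
 * any range outside the plan.
 *
 * @param {AsyncBuffer} file
 * @param {*} metadata parsed parquet footer metadata
 * @returns {AsyncBuffer}
 */
function readWithPrefetch(file, metadata) {
  const plan = parquetPlan({ metadata, rowStart: 0, rowEnd: 100, columns: ['id', 'name'] })
  return prefetchAsyncBuffer(file, plan)
}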