UNPKG

hyparquet-writer

Version:

Parquet file writer for JavaScript

84 lines 3.5 kB
/**
 * Insert a hash into a Split Block Bloom Filter (SBBF).
 *
 * Returns nothing, so the update is presumably applied to `blocks` in place
 * (the implementation is not part of this declaration file; confirm there).
 *
 * @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
 * @param {bigint} hash 64-bit xxhash of the parquet-plain-encoded value
 */
export function sbbfInsert(blocks: Uint32Array, hash: bigint): void;
/**
 * Test whether a hash might be present in a Split Block Bloom Filter.
 * False positives are possible; false negatives are not.
 *
 * @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
 * @param {bigint} hash 64-bit xxhash of the parquet-plain-encoded value
 * @returns {boolean} true if the hash may be present, false if it is definitely absent
 */
export function sbbfContains(blocks: Uint32Array, hash: bigint): boolean;
/**
 * Optimal SBBF size in bytes for a given number of distinct values and
 * target false-positive probability. Matches parquet-mr's BlockSplitBloomFilter:
 * derives bits from m = -8 * ndv / ln(1 - p^(1/8)), rounds up to a whole block,
 * and snaps to the next power of two below 1024 bits.
 *
 * NOTE(review): "next power of two below 1024 bits" is ambiguous (rounding that
 * applies only to sizes under 1024 bits, or something else) — confirm against
 * the implementation before relying on it.
 *
 * @param {number} ndv expected number of distinct values
 * @param {number} fpp target false positive probability, in (0, 1)
 * @returns {number} bloom filter size in bytes (multiple of 32)
 */
export function optimalNumBytes(ndv: number, fpp: number): number;
/**
 * Allocate a zeroed Split Block Bloom Filter sized for the given NDV and FPP.
 *
 * @param {number} ndv expected number of distinct values
 * @param {number} [fpp] target false positive probability, default 0.01
 * @returns {Uint32Array} blocks (numBytes / 4 uint32 words)
 */
export function createBloomFilter(ndv: number, fpp?: number): Uint32Array;
/**
 * Write a parquet bloom filter: BloomFilterHeader thrift struct followed by
 * the raw little-endian bytes of the SBBF blocks. Always uses BLOCK / XXHASH /
 * UNCOMPRESSED, the only variants parquet currently defines.
 *
 * @param {Writer} writer destination the header and block bytes are written to
 * @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
 */
export function writeBloomFilter(writer: Writer, blocks: Uint32Array): void;
/**
 * Write all pending bloom filters in a contiguous block and patch each chunk's
 * meta_data.bloom_filter_offset / bloom_filter_length so readers can find them.
 * Clustering matches parquet-mr's tail placement: a reader fetching the footer
 * region can pull every bloom in one range request.
 *
 * @param {Writer} writer destination the bloom filters are written to
 * @param {PageIndexes[]} pageIndexes per-chunk entries; presumably these carry
 *   the pending filters and the chunk metadata to patch — verify against the
 *   PageIndexes type, which is declared outside this file
 */
export function writeBlooms(writer: Writer, pageIndexes: PageIndexes[]): void;
/**
 * Collects distinct hashes of column values and finalizes them into an SBBF
 * sized for the actual distinct count. `finalize` returns `undefined` if any
 * non-null value was unhashable (the filter would have false negatives), if
 * no values were seen, or if the optimal size exceeds `maxBytes`.
 */
export class BloomBuilder {
    /**
     * @param {SchemaElement} element schema element of the column whose values
     *   are inserted (presumably used to pick the plain encoding — confirm)
     * @param {{ fpp?: number, maxBytes?: number }} [options] `fpp` is the target
     *   false positive probability; `maxBytes` is the size limit above which
     *   `finalize` gives up. Defaults are not visible in this declaration file.
     */
    constructor(element: SchemaElement, { fpp, maxBytes }?: { fpp?: number; maxBytes?: number; });
    /** Schema element passed to the constructor. */
    element: import("hyparquet/src/types.js").SchemaElement;
    /** Target false positive probability used when sizing the filter. */
    fpp: number;
    /** Largest filter size in bytes that `finalize` will produce. */
    maxBytes: number;
    /**
     * Distinct value hashes collected so far.
     * @type {Set<bigint>}
     */
    hashes: Set<bigint>;
    /** NOTE(review): presumably counts non-null values that could not be hashed — confirm. */
    skipped: number;
    /**
     * Record one column value for inclusion in the filter.
     * @param {any} value
     */
    insert(value: any): void;
    /**
     * Build the filter from the collected hashes.
     * @returns {Uint32Array | undefined} SBBF blocks, or `undefined` when no
     *   usable filter can be produced (see the class description)
     */
    finalize(): Uint32Array | undefined;
}
import type { Writer } from '../src/types.js';
import type { PageIndexes } from '../src/types.js';
import type { SchemaElement } from 'hyparquet';
//# sourceMappingURL=bloom.d.ts.map