hyparquet-writer
Version:
Parquet file writer for JavaScript
84 lines • 3.5 kB
TypeScript
/**
* Insert a hash into a Split Block Bloom Filter.
*
* The declaration returns nothing, so the insertion is presumably recorded by
* mutating `blocks` in place — confirm against the implementation.
*
* @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
* @param {bigint} hash 64-bit xxhash of the parquet-plain-encoded value
* @returns {void}
*/
export function sbbfInsert(blocks: Uint32Array, hash: bigint): void;
/**
* Test whether a hash might be present in a Split Block Bloom Filter.
* False positives are possible; false negatives are not. In other words,
* `false` means the hash is definitely absent, while `true` only means it
* may be present.
*
* @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
* @param {bigint} hash 64-bit xxhash of the parquet-plain-encoded value
* @returns {boolean} `true` if the hash may be present, `false` if it is definitely absent
*/
export function sbbfContains(blocks: Uint32Array, hash: bigint): boolean;
/**
* Optimal SBBF size in bytes for a given number of distinct values and
* target false-positive probability. Matches parquet-mr's BlockSplitBloomFilter:
* derives bits from m = -8 * ndv / ln(1 - p^(1/8)), rounds up to a whole block,
* and snaps to the next power of two below 1024 bits.
*
* Note that the formula above yields a size in bits, whereas the returned
* value is in bytes.
*
* NOTE(review): "snaps to the next power of two below 1024 bits" is ambiguous —
* it does not say whether 1024 bits acts as a minimum size, a maximum size, or
* a threshold under which power-of-two rounding applies. Confirm against the
* implementation before relying on the exact rounding behavior.
*
* @param {number} ndv expected number of distinct values
* @param {number} fpp target false positive probability, in (0, 1)
* @returns {number} bloom filter size in bytes (multiple of 32)
*/
export function optimalNumBytes(ndv: number, fpp: number): number;
/**
* Allocate a zeroed Split Block Bloom Filter sized for the given NDV and FPP.
*
* The result holds one uint32 word per 4 bytes of filter. Presumably the byte
* size comes from `optimalNumBytes(ndv, fpp)` — confirm against the
* implementation. If so, its documented multiple-of-32 size makes the array
* length a multiple of 8 words, matching the `8 * numBlocks` layout the
* insert/contains functions expect.
*
* @param {number} ndv expected number of distinct values
* @param {number} [fpp] target false positive probability, default 0.01
* @returns {Uint32Array} blocks (numBytes / 4 uint32 words)
*/
export function createBloomFilter(ndv: number, fpp?: number): Uint32Array;
/**
* Write a parquet bloom filter: BloomFilterHeader thrift struct followed by
* the raw little-endian bytes of the SBBF blocks. Always uses BLOCK / XXHASH /
* UNCOMPRESSED, the only variants parquet currently defines.
*
* In the parquet-format BloomFilterHeader these three correspond to the
* algorithm, hash and compression fields respectively.
*
* @param {Writer} writer destination the header and block bytes are written to
* @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
* @returns {void}
*/
export function writeBloomFilter(writer: Writer, blocks: Uint32Array): void;
/**
* Write all pending bloom filters in a contiguous block and patch each chunk's
* meta_data.bloom_filter_offset / bloom_filter_length so readers can find them.
* Clustering matches parquet-mr's tail placement: a reader fetching the footer
* region can pull every bloom in one range request.
*
* NOTE(review): the pending filters and the chunk metadata to patch are
* presumably both reached through the `pageIndexes` entries — the shape of
* `PageIndexes` is not visible here; confirm in '../src/types.js'.
*
* @param {Writer} writer destination the bloom filters are written to
* @param {PageIndexes[]} pageIndexes per-chunk records whose bloom offset/length get patched
* @returns {void}
*/
export function writeBlooms(writer: Writer, pageIndexes: PageIndexes[]): void;
/**
 * Collects distinct hashes of column values and finalizes them into an SBBF
 * sized for the actual distinct count. `finalize` returns `undefined` if any
 * non-null value was unhashable (the filter would have false negatives), if
 * no values were seen, or if the optimal size exceeds `maxBytes`.
 */
export class BloomBuilder {
    /**
     * @param {SchemaElement} element schema element of the column being filtered
     * @param {{ fpp?: number, maxBytes?: number }} [options] optional tuning;
     *   the defaults applied when a field is omitted are not visible in this
     *   declaration — check the implementation
     */
    constructor(element: SchemaElement, { fpp, maxBytes }?: {
        fpp?: number;
        maxBytes?: number;
    });
    /** Schema element passed to the constructor. */
    element: SchemaElement;
    /** Target false positive probability used when sizing the filter. */
    fpp: number;
    /** Size limit in bytes; `finalize` yields `undefined` when the optimal size exceeds it. */
    maxBytes: number;
    /**
     * Distinct value hashes collected so far.
     * @type {Set<bigint>}
     */
    hashes: Set<bigint>;
    /**
     * NOTE(review): presumably the count of non-null values that could not be
     * hashed (a non-zero count would make `finalize` return `undefined`) —
     * confirm against the implementation.
     */
    skipped: number;
    /**
     * Record one column value for inclusion in the filter.
     * @param {any} value
     */
    insert(value: any): void;
    /**
     * Build the filter from the collected hashes.
     * @returns {Uint32Array | undefined} the SBBF blocks, or `undefined` in the
     *   cases listed in the class description
     */
    finalize(): Uint32Array | undefined;
}
import type { Writer } from '../src/types.js';
import type { PageIndexes } from '../src/types.js';
import type { SchemaElement } from 'hyparquet';
//# sourceMappingURL=bloom.d.ts.map