hyparquet-writer
Version:
Parquet file writer for JavaScript
213 lines (196 loc) • 7.15 kB
JavaScript
// Split Block Bloom Filter (https://github.com/apache/parquet-format/blob/master/BloomFilter.md)
// A bloom filter is a sequence of 32-byte blocks. Each block holds 8 little-endian uint32 words.
// Insertion sets one bit per word, chosen by salting the low 32 bits of an xxhash64.
// Membership requires all 8 bits to be set; misses are exact, hits are probabilistic.
import { hashParquetValue } from 'hyparquet/src/bloom.js'
import { serializeTCompactProtocol } from './thrift.js'
/**
* @import {SchemaElement} from 'hyparquet'
* @import {PageIndexes, Writer} from '../src/types.js'
*/
// Eight odd 32-bit salt multipliers, one per word of a block.
const SALT = Uint32Array.of(
  0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d,
  0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31
)
// A block is 8 uint32 words.
const BYTES_PER_BLOCK = 8 * Uint32Array.BYTES_PER_ELEMENT
// Smallest filter: exactly one block.
const MIN_BYTES = BYTES_PER_BLOCK
// Largest filter: 128 MiB (parquet-mr default cap).
const MAX_BYTES = 128 * 1024 ** 2
/**
 * Map the high 32 bits of a 64-bit hash to a block index in [0, numBlocks)
 * using the multiply-shift reduction `(high32 * numBlocks) >> 32`.
 *
 * The hash is first normalized to its unsigned 64-bit form. A hash carried as
 * a signed int64 bigint (negative when the top bit is set) therefore maps to
 * the same block as its unsigned twin, instead of producing a negative index
 * that typed-array reads and writes would silently ignore.
 *
 * @param {bigint} hash 64-bit hash, signed or unsigned
 * @param {number} numBlocks number of blocks in the filter (integer)
 * @returns {number} block index in [0, numBlocks)
 */
function blockIndex(hash, numBlocks) {
  const high32 = BigInt.asUintN(64, hash) >> 32n
  return Number(high32 * BigInt(numBlocks) >> 32n)
}
/**
 * Build the 8-word mask for a hash. Word `i` has exactly one bit set, at
 * position `(low32 * SALT[i]) >>> 27`, i.e. the top 5 bits of the wrapping
 * 32-bit product of the hash's low 32 bits and the i-th salt.
 *
 * @param {bigint} hash
 * @returns {Uint32Array} 8 single-bit words
 */
function blockMask(hash) {
  // Low 32 bits of the hash, coerced to an int32 for Math.imul.
  const low32 = Number(hash & 0xffffffffn) | 0
  return Uint32Array.from(
    { length: 8 },
    (_, word) => 1 << (Math.imul(low32, SALT[word]) >>> 27)
  )
}
/**
 * Add a hash to a Split Block Bloom Filter by OR-ing its 8-word mask into the
 * block selected by the hash.
 *
 * @param {Uint32Array} blocks bloom filter words (8 * numBlocks long), mutated in place
 * @param {bigint} hash 64-bit xxhash of the parquet-plain-encoded value
 */
export function sbbfInsert(blocks, hash) {
  const numBlocks = blocks.length >> 3
  // First word of the selected block (8 words per block).
  const base = blockIndex(hash, numBlocks) << 3
  blockMask(hash).forEach((bit, word) => {
    blocks[base + word] |= bit
  })
}
/**
 * Probe a Split Block Bloom Filter for a hash. Returns true only when every
 * bit of the hash's 8-word mask is set in its block, so a `false` result is
 * definitive while a `true` result may be a false positive.
 *
 * @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
 * @param {bigint} hash 64-bit xxhash of the parquet-plain-encoded value
 * @returns {boolean}
 */
export function sbbfContains(blocks, hash) {
  const numBlocks = blocks.length >> 3
  // First word of the selected block (8 words per block).
  const base = blockIndex(hash, numBlocks) << 3
  return blockMask(hash).every((bit, word) => (blocks[base + word] & bit) !== 0)
}
/**
 * Round up to the nearest power of two that is >= n. Inputs <= 1 yield 1.
 *
 * Doubling is done by multiplication rather than `<<`: the shift operator
 * works on 32-bit integers, so for n > 2^30 it would wrap to a negative value
 * and then to 0, and the loop would never terminate.
 *
 * @param {number} n
 * @returns {number} smallest power of two >= n (1 when n <= 1)
 */
function nextPowerOfTwo(n) {
  let p = 1
  while (p < n) p *= 2
  return p
}
/**
 * Size, in bytes, of an SBBF for an expected number of distinct values and a
 * target false-positive probability.
 *
 * Steps:
 *  1. ideal bit count m = ceil(-8 * ndv / ln(1 - fpp^(1/8)));
 *  2. cap at MAX_BYTES (also used when m is not finite);
 *  3. round up to whole 32-byte blocks, with a floor of MIN_BYTES;
 *  4. sizes below 1024 bytes are snapped up to a power of two.
 *
 * @param {number} ndv expected number of distinct values
 * @param {number} fpp target false positive probability, in (0, 1)
 * @returns {number} bloom filter size in bytes (multiple of 32)
 * @throws {Error} if fpp is outside (0, 1) or ndv is negative or NaN
 */
export function optimalNumBytes(ndv, fpp) {
  if (!(fpp > 0 && fpp < 1)) throw new Error(`bloom filter fpp must be in (0, 1), got ${fpp}`)
  if (!(ndv >= 0)) throw new Error(`bloom filter ndv must be >= 0, got ${ndv}`)

  const maxBits = MAX_BYTES * 8
  const bitsPerBlock = BYTES_PER_BLOCK * 8

  const idealBits = Math.ceil(-8 * ndv / Math.log(1 - fpp ** (1 / 8)))
  // A tiny fpp makes the log term 0, giving -Infinity or NaN: treat as "too big".
  const withinCap = Number.isFinite(idealBits) && idealBits <= maxBits
  const cappedBits = withinCap ? idealBits : maxBits

  // Whole blocks, at least one.
  const blockCount = Math.ceil(cappedBits / bitsPerBlock)
  const numBytes = Math.max(blockCount * BYTES_PER_BLOCK, MIN_BYTES)

  // Power-of-two snap applies only to filters smaller than 1024 bytes.
  return numBytes < 1024 ? nextPowerOfTwo(numBytes) : numBytes
}
/**
 * Create an empty (all-zero) Split Block Bloom Filter whose size is chosen by
 * `optimalNumBytes` for the given NDV and FPP.
 *
 * @param {number} ndv expected number of distinct values
 * @param {number} [fpp] target false positive probability, default 0.01
 * @returns {Uint32Array} blocks (numBytes / 4 uint32 words)
 */
export function createBloomFilter(ndv, fpp = 0.01) {
  const byteLength = optimalNumBytes(ndv, fpp)
  const wordCount = byteLength >> 2 // 4 bytes per uint32 word
  return new Uint32Array(wordCount)
}
/**
 * Accumulates the distinct hashes of a column's values, then builds an SBBF
 * sized for the number of distinct hashes actually seen.
 *
 * `finalize` yields `undefined` (no filter) when:
 *  - any non-null value could not be hashed (a filter would give false negatives),
 *  - no hashable values were inserted, or
 *  - the computed size is larger than `maxBytes`.
 */
export class BloomBuilder {
  /**
   * @param {SchemaElement} element schema element passed to the hash function
   * @param {{ fpp?: number, maxBytes?: number }} [options]
   *   fpp defaults to 0.01, maxBytes defaults to 1 MiB
   */
  constructor(element, { fpp = 0.01, maxBytes = 1024 * 1024 } = {}) {
    this.element = element
    this.fpp = fpp
    this.maxBytes = maxBytes
    /** @type {Set<bigint>} */
    this.hashes = new Set()
    // Count of non-null values the hash function returned undefined for.
    this.skipped = 0
  }

  /**
   * Record one column value. Nulls are ignored; unhashable values are counted.
   *
   * @param {any} value
   */
  insert(value) {
    if (value == null) return // null or undefined
    const hash = hashParquetValue(value, this.element)
    if (hash !== undefined) {
      this.hashes.add(hash)
    } else {
      this.skipped += 1
    }
  }

  /**
   * Build the filter from the collected hashes.
   *
   * @returns {Uint32Array | undefined} filter words, or undefined if no filter should be written
   */
  finalize() {
    const { hashes, skipped, fpp, maxBytes } = this
    if (skipped > 0 || hashes.size === 0) return undefined
    const numBytes = optimalNumBytes(hashes.size, fpp)
    if (numBytes > maxBytes) return undefined
    const blocks = new Uint32Array(numBytes >> 2)
    hashes.forEach((hash) => sbbfInsert(blocks, hash))
    return blocks
  }
}
/**
 * Serialize one bloom filter: a thrift BloomFilterHeader (numBytes plus the
 * BLOCK algorithm / XXHASH hash / UNCOMPRESSED compression markers) followed
 * by every filter word, appended in order through `writer.appendUint32`.
 *
 * @param {Writer} writer
 * @param {Uint32Array} blocks bloom filter words (8 * numBlocks long)
 * @throws {Error} if the word count is not a multiple of 8
 */
export function writeBloomFilter(writer, blocks) {
  const wordCount = blocks.length
  if (wordCount % 8 !== 0) {
    throw new Error(`bloom filter block count must be a multiple of 8 uint32 words, got ${wordCount}`)
  }

  const emptyUnion = () => ({ field_1: {} })
  const header = {
    field_1: blocks.byteLength, // numBytes
    field_2: emptyUnion(), // algorithm: SplitBlockAlgorithm
    field_3: emptyUnion(), // hash: XxHash
    field_4: emptyUnion(), // compression: Uncompressed
  }
  serializeTCompactProtocol(writer, header)

  for (const word of blocks) {
    writer.appendUint32(word)
  }
}
/**
 * Write every pending bloom filter back to back, recording on each chunk's
 * meta_data where its filter starts (`bloom_filter_offset`, as a bigint) and
 * how many bytes it occupies (`bloom_filter_length`). Entries without a bloom
 * filter or without chunk meta_data are skipped.
 *
 * Keeping the filters contiguous lets a reader fetch all of them with a single
 * range request.
 *
 * @param {Writer} writer
 * @param {PageIndexes[]} pageIndexes
 */
export function writeBlooms(writer, pageIndexes) {
  for (const { chunk, bloomFilter } of pageIndexes) {
    // Only look at the chunk when there is a filter to write.
    const metadata = bloomFilter ? chunk.meta_data : undefined
    if (!metadata) continue

    const start = writer.offset
    writeBloomFilter(writer, bloomFilter)
    metadata.bloom_filter_offset = BigInt(start)
    metadata.bloom_filter_length = writer.offset - start
  }
}