UNPKG

@dsnp/parquetjs

Version:

fully asynchronous, pure JavaScript implementation of the Parquet file format

374 lines (373 loc) 15.6 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
const parquet_types_1 = __importDefault(require("../../gen-nodejs/parquet_types"));
const long_1 = __importDefault(require("long"));
const xxhasher_1 = __importDefault(require("./xxhasher"));
/**
 * @class SplitBlockBloomFilter
 *
 * @description Parquet spec implementation of Split Block Bloom Filtering.
 * Much of this code was pulled from the
 * [apache/parquet Java implementation](https://github.com/apache/parquet-mr)
 * See also
 * [Cache-, Hash- and Space-Efficient Bloom Filters](http://algo2.iti.kit.edu/documents/cacheefficientbloomfilters-jea.pdf)
 *
 * Defaults: false positive rate `DEFAULT_FALSE_POSITIVE_RATE` (0.001) and
 * `DEFAULT_DISTINCT_VALUES` (128 * 1024) expected distinct values.
 * NOTE(review): earlier documentation quoted a default size of "~16.8MB ... 128M rows";
 * that figure looks like the size produced at the upper bound, not at the defaults above.
 *
 * @constructor SplitBlockBloomFilter()
 * Once desired options are set, call **`init()`** to set up the filter array
 * @example This calculates and sets the optimal filter size based on the options:
 *  const filter = new SplitBlockBloomFilter()
 *    .setOptionNumDistinct(100000)
 *    .setOptionFalsePositiveRate(0.00001)
 *    .init()
 *
 * @example This uses the default values to initialize the filter:
 *  const filter = new SplitBlockBloomFilter().init()
 *
 * @example If you know what size bloom filter you want, do:
 *  const filter = new SplitBlockBloomFilter()
 *    .setOptionNumFilterBytes(128000)
 *    .init()
 */
class SplitBlockBloomFilter {
    // Per-word multipliers used by mask(); one entry per word of a block.
    static salt = [
        0x47b6137b, 0x44974d91, 0x8824ad5b, 0xa2b7289d, 0x705495c7, 0x2df1424b, 0x9efc4947, 0x5c6bfb31,
    ];
    // How many bits are in a single block:
    // - Blocks are UInt32 arrays
    // - There are 8 UInt32 words in each block.
    static WORDS_PER_BLOCK = 8;
    static WORD_SIZE = 32;
    static BITS_PER_BLOCK = SplitBlockBloomFilter.WORDS_PER_BLOCK * SplitBlockBloomFilter.WORD_SIZE;
    // Default number of blocks in a Split Block Bloom filter (SBBF)
    static NUMBER_OF_BLOCKS = 32;
    // The lower bound of SBBF size in bytes.
    // Currently this is 1024
    static LOWER_BOUND_BYTES = (SplitBlockBloomFilter.NUMBER_OF_BLOCKS * SplitBlockBloomFilter.BITS_PER_BLOCK) / 8;
    // The upper bound of SBBF size, set to default row group size in bytes.
    // Note that the subsequent requirements for an effective bloom filter on a row group this size would mean this
    // is unacceptably large for a lightweight client application.
    static UPPER_BOUND_BYTES = 128 * 1024 * 1024;
    static DEFAULT_FALSE_POSITIVE_RATE = 0.001;
    static DEFAULT_DISTINCT_VALUES = 128 * 1024;

    /**
     * @function initBlock
     * @description creates a single, zero-filled block
     * @return a new Uint32Array of WORDS_PER_BLOCK words, all 0
     */
    static initBlock() {
        // Typed arrays are zero-initialised on construction.
        return new Uint32Array(SplitBlockBloomFilter.WORDS_PER_BLOCK);
    }

    /**
     * @function from
     * @description initialize a SplitBlockBloomFilter for a single column row group
     * from the provided Buffer
     * @param buffer a NodeJs Buffer containing bloom filter data for a row group.
     * @param _rowCount unused; kept for interface compatibility
     * @return an initialized SplitBlockBloomFilter whose blocks are read from `buffer`
     * @throws Error if the buffer is empty
     * @throws RangeError if the buffer's byte length is not a multiple of 4
     */
    static from(buffer, _rowCount) {
        if (buffer.length === 0) {
            throw new Error('buffer is empty');
        }
        const bytesPerWord = Uint32Array.BYTES_PER_ELEMENT;
        if (buffer.byteLength % bytesPerWord !== 0) {
            throw new RangeError(`bloom filter buffer length must be a multiple of ${bytesPerWord} bytes`);
        }
        // A Buffer is frequently a view into a larger (pooled) ArrayBuffer, so the
        // view's own byteOffset/byteLength must be honoured; reading the whole
        // backing ArrayBuffer would interpret unrelated bytes as filter blocks.
        const wordCount = buffer.byteLength / bytesPerWord;
        const words = buffer.byteOffset % bytesPerWord === 0
            ? new Uint32Array(buffer.buffer, buffer.byteOffset, wordCount)
            // Uint32Array views require 4-byte alignment; copy the bytes when misaligned.
            : new Uint32Array(buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength));
        const wordsPerBlock = SplitBlockBloomFilter.WORDS_PER_BLOCK;
        const blocks = [];
        for (let start = 0; start < words.length; start += wordsPerBlock) {
            blocks.push(words.subarray(start, start + wordsPerBlock));
        }
        const sb = new SplitBlockBloomFilter();
        sb.splitBlockFilter = blocks;
        sb.numBlocks = blocks.length;
        // these will not be knowable when reading
        sb.numDistinctValues = 0;
        sb.desiredFalsePositiveRate = 0.0;
        return sb;
    }

    /**
     * @function getBlockIndex: get a block index to insert a hash value for
     * @param h the unsigned Long hash from which to derive a block index
     * @param z the number of blocks in the filter
     *
     * @return a number from 0 to z-1, inclusive
     */
    static getBlockIndex(h, z) {
        const zLong = long_1.default.fromNumber(z, true);
        const hTopBits = long_1.default.fromNumber(h.getHighBitsUnsigned(), true);
        // (top 32 bits of h * z) >> 32 maps the hash onto [0, z).
        return hTopBits.mul(zLong).shiftRightUnsigned(32).getLowBitsUnsigned();
    }

    /**
     * @function optimalNumOfBlocks
     *
     * @description Calculate optimal number of blocks, according to the number of distinct
     * values and false positive probability. Using a Bloom filter calculator, the upper bound is
     * far too large for client applications. Sourced from:
     * https://github.com/apache/parquet-mr/blob/5608695f5777de1eb0899d9075ec9411cfdf31d3/parquet-column/src/main/java/org/apache/parquet/column/values/bloomfilter/BlockSplitBloomFilter.java#L285
     *
     * @param numDistinct The number of distinct values.
     * @param falsePositiveRate The false positive rate, a number between 0 and 1 exclusive
     *
     * @return number: the number of blocks computed for the given numDistinct and falsePositiveRate.
     */
    static optimalNumOfBlocks(numDistinct, falsePositiveRate) {
        let m = (-8 * numDistinct) / Math.log(1 - Math.pow(falsePositiveRate, 1.0 / 8));
        // NOTE(review): `& ~X` clears the single bit X rather than rounding to a
        // multiple of X (that would be `& ~(X - 1)`). Left unchanged here because
        // altering it changes the size of the filters that get allocated.
        m = (m + SplitBlockBloomFilter.NUMBER_OF_BLOCKS - 1) & ~SplitBlockBloomFilter.NUMBER_OF_BLOCKS;
        // Handle overflow: the bitwise op above truncates to int32, so a huge m can wrap negative.
        const upperShiftL3 = SplitBlockBloomFilter.UPPER_BOUND_BYTES << 3;
        if (m > upperShiftL3 || m < 0) {
            m = upperShiftL3;
        }
        // Round numBits up (see NOTE(review) above regarding the mask used)
        m = (m + SplitBlockBloomFilter.BITS_PER_BLOCK - 1) & ~SplitBlockBloomFilter.BITS_PER_BLOCK;
        const lowerBoundShiftL3 = SplitBlockBloomFilter.LOWER_BOUND_BYTES << 3;
        if (m < lowerBoundShiftL3) {
            m = lowerBoundShiftL3;
        }
        return Math.ceil(m / SplitBlockBloomFilter.BITS_PER_BLOCK);
    }

    /**
     * @function mask
     * @description generate a mask block for a bloom filter block: exactly one bit
     * is set in each of the block's words.
     * @param hashValue: the Long hash value to generate the mask from (its low 32 bits are used)
     * @private
     *
     * @return mask Block
     */
    static mask(hashValue) {
        const result = SplitBlockBloomFilter.initBlock();
        const lowBits = hashValue.getLowBitsUnsigned();
        for (let i = 0; i < result.length; i++) {
            // Math.imul performs an exact 32-bit wrapping multiply. A plain `*` can
            // exceed 2^53 and be rounded as a double before truncation, which can
            // corrupt the top five bits that select the bit position.
            const y = Math.imul(lowBits, SplitBlockBloomFilter.salt[i]);
            result[i] = 1 << (y >>> 27);
        }
        return result;
    }

    /**
     * @function blockInsert
     * @description insert a hash into a Bloom filter Block
     * @param b: the block to set bits in: is changed
     * @param hashValue: the hash value to insert into b
     * @private
     *
     * @return void
     */
    static blockInsert(b, hashValue) {
        const masked = SplitBlockBloomFilter.mask(hashValue);
        for (let i = 0; i < masked.length; i++) {
            // Set every mask bit of this word in one operation.
            b[i] |= masked[i];
        }
    }

    /**
     * @function blockCheck
     * @description check if a hashValue exists for this filter
     * @param b: the block to check for inclusion
     * @param hashValue: the hash to check for should be long
     * @private
     *
     * @return true if hashed item is __probably__ in the data set represented by this filter
     * @return false if it is __definitely not__ in the data set.
     */
    static blockCheck(b, hashValue) {
        const masked = SplitBlockBloomFilter.mask(hashValue);
        for (let i = 0; i < masked.length; i++) {
            // Non-zero means at least one mask bit is missing from the block word.
            if ((masked[i] & ~b[i]) !== 0) {
                return false;
            }
        }
        return true;
    }

    /**
     * Instance
     */
    splitBlockFilter = [];
    desiredFalsePositiveRate = SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE;
    numBlocks = 0;
    numDistinctValues = SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES;
    hashStrategy = new parquet_types_1.default.BloomFilterHash(new parquet_types_1.default.XxHash());
    hasher = new xxhasher_1.default();

    /** @return true once the filter holds at least one block */
    isInitialized() {
        return this.splitBlockFilter.length > 0;
    }

    /** @return the configured false positive rate */
    getFalsePositiveRate() {
        return this.desiredFalsePositiveRate;
    }

    /** @return the configured number of expected distinct values */
    getNumDistinct() {
        return this.numDistinctValues;
    }

    /** @return the number of blocks currently held by the filter */
    getNumFilterBlocks() {
        return this.splitBlockFilter.length;
    }

    /** @return the array of blocks backing this filter */
    getFilter() {
        return this.splitBlockFilter;
    }

    /**
     * @function getNumFilterBytes
     * @description return the actual number of filter bytes set; if the option to numBytes
     * was called, this value will be returned. If the options for preferred FPR
     * and/or numDistinct were called, this function returns the calculated value.
     * @return number of bytes, derived from numBlocks
     */
    getNumFilterBytes() {
        return (this.numBlocks * SplitBlockBloomFilter.BITS_PER_BLOCK) >>> 3;
    }

    /**
     * @function setOptionFalsePositiveRate
     * @description set the desired false positive percentage for this Bloom filter.
     * defaults to SplitBlockBloomFilter.DEFAULT_FALSE_POSITIVE_RATE
     * This function does nothing if the filter has already been allocated.
     * @param proportion: number, between 0.0 and 1.0, exclusive
     * @return this, for chaining
     */
    setOptionFalsePositiveRate(proportion) {
        if (this.isInitialized()) {
            console.error('filter already initialized. options may no longer be changed.');
            return this;
        }
        // Phrased as a negated range test so that NaN is rejected as well.
        if (!(proportion > 0.0 && proportion < 1.0)) {
            console.error('falsePositiveProbability. Must be < 1.0 and > 0.0');
            return this;
        }
        this.desiredFalsePositiveRate = proportion;
        return this;
    }

    /**
     * @function setOptionNumDistinct
     * @description set the number of expected distinct values for the filter.
     * this should generally be <= to the row group size. Defaults to
     * SplitBlockBloomFilter.DEFAULT_DISTINCT_VALUES
     * This function does nothing if the filter has already been allocated.
     * @param numDistinct: number, greater than 0 and at most UPPER_BOUND_BYTES
     * @return this, for chaining
     */
    setOptionNumDistinct(numDistinct) {
        if (this.isInitialized()) {
            console.error('filter already initialized. options may no longer be changed.');
            return this;
        }
        // Phrased as a negated range test so that NaN is rejected as well.
        if (!(numDistinct > 0 && numDistinct <= SplitBlockBloomFilter.UPPER_BOUND_BYTES)) {
            console.error(`invalid numDistinct. Must be > 0 and < ${SplitBlockBloomFilter.UPPER_BOUND_BYTES}`);
            return this;
        }
        this.numDistinctValues = numDistinct;
        return this;
    }

    /**
     * @function nextPwr2
     * @description return the smallest power of 2 that is >= v
     * see https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
     * NOTE: cannot use values > 2**31. We are not expecting these values internally,
     * so this works as intended.
     * @param v: number to round up
     * @return number: v rounded up to a power of 2
     */
    static nextPwr2(v) {
        v--;
        v |= v >> 1;
        v |= v >> 2;
        v |= v >> 4;
        v |= v >> 8;
        v |= v >> 16;
        v++;
        return v;
    }

    /**
     * @function setOptionNumFilterBytes
     * @description set the bytes for this Bloom filter. Set this if you don't want an
     * optimal value calculated for you. Rounds up to nearest power of 2
     * This function does nothing if the filter has already been allocated.
     * @param numBytes: number, the desired size in bytes, from LOWER_BOUND_BYTES
     * to UPPER_BOUND_BYTES inclusive
     * @return this, for chaining
     */
    setOptionNumFilterBytes(numBytes) {
        if (this.isInitialized()) {
            console.error('filter already initialized. options may no longer be changed.');
            return this;
        }
        // Phrased as a negated range test so that NaN is rejected as well.
        if (!(numBytes >= SplitBlockBloomFilter.LOWER_BOUND_BYTES && numBytes <= SplitBlockBloomFilter.UPPER_BOUND_BYTES)) {
            console.error(`invalid numBits. Must be > ${SplitBlockBloomFilter.LOWER_BOUND_BYTES} and < ${SplitBlockBloomFilter.UPPER_BOUND_BYTES}`);
            return this;
        }
        // numBlocks = Bytes * 8b/Byte * 1Block/256b
        this.numBlocks = (SplitBlockBloomFilter.nextPwr2(numBytes) * 8) / SplitBlockBloomFilter.BITS_PER_BLOCK;
        return this;
    }

    /**
     * @function init
     * @description initialize the Bloom filter using the options previously provided.
     * If numBlocks has not been calculated and set via setOptionNumFilterBytes, we calculate
     * the filter size based on number of distinct values and
     * percent false positive rate. See setOptionNumDistinct and setOptionFalsePositiveRate
     *
     * Repeated calls to init do nothing to avoid multiple memory allocations or
     * accidental loss of filters.
     * @return this, for chaining
     * @throws Error if the hash strategy is not XXHASH
     */
    init() {
        if (this.isInitialized()) {
            console.error('filter already initialized.');
            return this;
        }
        if (!Object.prototype.hasOwnProperty.call(this.hashStrategy, 'XXHASH')) {
            throw new Error('unsupported hash strategy');
        }
        if (this.numBlocks === 0) {
            // NOTE(review): optimalNumOfBlocks already returns a block count, so the
            // extra `>>> 3` yields one eighth of it. Left unchanged because it
            // determines the size of filters produced with default options.
            this.numBlocks = SplitBlockBloomFilter.optimalNumOfBlocks(this.numDistinctValues, this.desiredFalsePositiveRate) >>> 3;
        }
        // Each slot needs its OWN block. Array.prototype.fill(obj) would store the
        // same Uint32Array in every slot, collapsing the filter into a single block.
        this.splitBlockFilter = Array.from({ length: this.numBlocks }, () => SplitBlockBloomFilter.initBlock());
        return this;
    }

    /**
     * @function hash
     * @description hash a value with the configured hasher
     * @param value: the value to hash
     * @return Promise of an unsigned Long parsed from the hasher's hexadecimal output
     * @throws Error if the hash strategy is not XXHASH
     */
    async hash(value) {
        if (!Object.prototype.hasOwnProperty.call(this.hashStrategy, 'XXHASH')) {
            throw new Error('unsupported hash strategy');
        }
        const hashed = await this.hasher.hash64(value);
        return long_1.default.fromString(hashed, true, 16);
    }

    /**
     * @function insertHash
     * @description add an already-computed hash to this filter
     * @param hashValue: an unsigned Long
     * @return void
     * @throws Error if hashValue is not unsigned or the filter is not initialized
     */
    insertHash(hashValue) {
        if (!hashValue.unsigned) {
            throw new Error('hashValue must be an unsigned Long');
        }
        if (!this.isInitialized()) {
            throw new Error('filter has not been initialized. call init() first');
        }
        const i = SplitBlockBloomFilter.getBlockIndex(hashValue, this.splitBlockFilter.length);
        SplitBlockBloomFilter.blockInsert(this.splitBlockFilter[i], hashValue);
    }

    /**
     * @function insert
     * @description hash a value and add it to this filter
     * @param value: the value to add; it is passed to hash() before insertion
     * @return Promise<void>
     * @throws Error if the filter is not initialized
     */
    async insert(value) {
        if (!this.isInitialized()) {
            throw new Error('filter has not been initialized. call init() first');
        }
        this.insertHash(await this.hash(value));
    }

    /**
     * @function checkHash
     * @description check whether an already-computed hash may be present in this filter
     * @param hashValue: an unsigned Long
     * @return true if the hash is __probably__ present, false if it is __definitely not__
     * @throws Error if hashValue is not unsigned or the filter is not initialized
     */
    checkHash(hashValue) {
        if (!hashValue.unsigned) {
            throw new Error('hashValue must be an unsigned Long');
        }
        if (!this.isInitialized()) {
            throw new Error('filter has not been initialized');
        }
        const i = SplitBlockBloomFilter.getBlockIndex(hashValue, this.splitBlockFilter.length);
        return SplitBlockBloomFilter.blockCheck(this.splitBlockFilter[i], hashValue);
    }

    /**
     * @function check
     * @description hash a value and check whether it may be present in this filter
     * @param value: the value to check for; it is passed to hash() before the lookup
     * @return Promise resolving to true if the hashed item is __probably__ in the data set
     * represented by this filter, false if it is __definitely not__ in the data set.
     * @throws Error if the filter is not initialized
     */
    async check(value) {
        if (!this.isInitialized()) {
            throw new Error('filter has not been initialized');
        }
        return this.checkHash(await this.hash(value));
    }
}
exports.default = SplitBlockBloomFilter;