UNPKG

parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

261 lines (231 loc) 9.18 kB
// tslint:disable: prefer-array-literal
// tslint:disable: no-parameter-reassignment

// Snappy block-format compressor (port of the reference C++ compressor).
// Input is compressed in independent fragments of at most BLOCK_SIZE bytes.
const BLOCK_LOG = 16;
const BLOCK_SIZE = 1 << BLOCK_LOG; // 64 KiB per fragment

// Minimum tail the matching loop leaves untouched so 4-byte reads/hashes
// never run past the end of the fragment.
const INPUT_MARGIN = 15;

// Hash tables are cached per size in `globalHashTables` and reused across
// calls (indexed by bit width, 0..14). Uint16Array suffices because stored
// values are offsets within one fragment (< BLOCK_SIZE).
const MAX_HASH_TABLE_BITS = 14;
const globalHashTables: Uint16Array[] = new Array(MAX_HASH_TABLE_BITS + 1);

/**
 * Hashes the 4 bytes at `pos` into a table index.
 * Reads the bytes as a little-endian 32-bit value, multiplies by the
 * Snappy magic constant 0x1e35a7bd, and keeps the top (32 - shift) bits.
 */
function hash32(array: Buffer, pos: number, shift: number) {
  return (((
    array[pos] +
    (array[pos + 1] << 8) +
    (array[pos + 2] << 16) +
    (array[pos + 3] << 24)
  ) * 0x1e35a7bd) & 0xffffffff) >>> shift;
}

/** True when the 4 bytes at `pos1` equal the 4 bytes at `pos2`. */
function equals32(array: Buffer, pos1: number, pos2: number) {
  return array[pos1] === array[pos2] &&
    array[pos1 + 1] === array[pos2 + 1] &&
    array[pos1 + 2] === array[pos2 + 2] &&
    array[pos1 + 3] === array[pos2 + 3];
}

/** Byte-by-byte copy of `length` bytes from one buffer to another. */
function copyBytes(fromArray: Buffer, fromPos: number, toArray: Buffer, toPos: number, length: number) {
  for (let i = 0; i < length; i++) {
    toArray[toPos + i] = fromArray[fromPos + i];
  }
}

/**
 * Emits a literal element: a tag encoding the length, followed by the raw
 * bytes input[ip .. ip+len). Returns the new output position.
 *
 * Tag layout (element type 00 in the two low bits):
 * - len <= 60:  length - 1 stored directly in the tag's upper 6 bits.
 * - len < 256:  tag value 60 in the upper bits, 1 extra length byte.
 * - otherwise:  tag value 61, 2 extra little-endian length bytes
 *               (sufficient here since len < BLOCK_SIZE = 65536).
 */
function emitLiteral(input: Buffer, ip: number, len: number, output: Buffer, op: number) {
  if (len <= 60) {
    // Fits in tag byte
    output[op] = (len - 1) << 2;
    op += 1;
  } else if (len < 256) {
    output[op] = 60 << 2;
    output[op + 1] = len - 1;
    op += 2;
  } else {
    output[op] = 61 << 2;
    output[op + 1] = (len - 1) & 0xff;
    output[op + 2] = (len - 1) >>> 8;
    op += 3;
  }
  copyBytes(input, ip, output, op, len);
  return op + len;
}

/**
 * Emits a single copy element of length 4..64.
 * - Short form (type 01, 2 bytes): length 4..11 with an offset < 2048.
 *   The 3 high offset bits go in the top of the tag byte, the low 8 bits
 *   in the second byte.
 * - Long form (type 10, 3 bytes): any length up to 64, 16-bit offset.
 * Returns the new output position.
 */
function emitCopyAtMost64(output: Buffer, op: number, offset: number, len: number) {
  if (len < 12 && offset < 2048) {
    output[op] = 1 + ((len - 4) << 2) + ((offset >>> 8) << 5);
    output[op + 1] = offset & 0xff;
    return op + 2;
  } else {
    // The decoder only reads the bytes written here; the output buffer is
    // required to have slack (see maxCompressedLength), so no overrun.
    output[op] = 2 + ((len - 1) << 2);
    output[op + 1] = offset & 0xff;
    output[op + 2] = offset >>> 8;
    return op + 3;
  }
}

/**
 * Emits a copy of arbitrary length `len` (>= 4) as one or more copy
 * elements of at most 64 bytes each. While more than 67 bytes remain,
 * emit 64-byte copies; a leftover of 65..67 is split as 60 + remainder
 * so the final piece is always at least 4 bytes (the minimum copy length).
 */
function emitCopy(output: Buffer, op: number, offset: number, len: number) {
  // Emit 64-byte copies while keeping at least 4 bytes for the last piece.
  while (len >= 68) {
    op = emitCopyAtMost64(output, op, offset, 64);
    len -= 64;
  }
  // A leftover of 65..67 would leave a tail < 4; peel off 60 first.
  if (len > 64) {
    op = emitCopyAtMost64(output, op, offset, 60);
    len -= 60;
  }
  // Emit remainder (4..64 bytes).
  op = emitCopyAtMost64(output, op, offset, len);
  return op;
}

/**
 * Writes `value` as a little-endian base-128 varint at output[op].
 * Each byte carries 7 payload bits; the high bit marks continuation.
 * Returns the new output position. Emits a single 0x00 byte for value 0.
 */
function putVarint(value: number, output: Buffer, op: number) {
  do {
    output[op] = value & 0x7f;
    value = value >>> 7;
    if (value > 0) {
      output[op] += 0x80;
    }
    op += 1;
  } while (value > 0);
  return op;
}

/**
 * Picks the hash-table bit width for a fragment of `inputSize` bytes:
 * the largest `bits` in [8, MAX_HASH_TABLE_BITS] such that the table is
 * not larger than needed (2^bits <= inputSize, roughly one slot per byte).
 */
function hashBits(inputSize: number) {
  let bits = 9;
  while ((1 << bits) <= inputSize && bits <= MAX_HASH_TABLE_BITS) {
    bits += 1;
  }
  bits--;
  return bits;
}

/**
 * Compresses one fragment input[baseIp .. baseIp+inputSize) into `output`
 * starting at `baseOp`, and returns the new output position.
 *
 * Fragments shorter than INPUT_MARGIN are emitted as a single literal.
 * Otherwise this is the reference Snappy greedy matcher: a hash table maps
 * 4-byte sequences to their most recent position in the fragment; on a hit
 * the pending bytes are emitted as a literal followed by one or more copies.
 */
function compressFragment(input: Buffer, baseIp: number, inputSize: number, output: Buffer, baseOp: number) {
  if (inputSize < INPUT_MARGIN) {
    return emitLiteral(input, baseIp, inputSize, output, baseOp);
  }

  const hashTableBits = hashBits(inputSize);
  const shift = 32 - hashTableBits;
  if (typeof globalHashTables[hashTableBits] === 'undefined') {
    globalHashTables[hashTableBits] = new Uint16Array(1 << hashTableBits);
  }

  // Reuse the cached table; it must be zeroed because slot value 0 means
  // "candidate at baseIp" (a stale entry would alias into this fragment).
  const table = globalHashTables[hashTableBits];
  for (let i = 0; i < table.length; i++) {
    table[i] = 0;
  }

  const ipEnd = baseIp + inputSize;
  // Matching stops INPUT_MARGIN bytes early so hash32/equals32 never read
  // past ipEnd; the tail is flushed as a literal after the loop.
  const ipLimit = ipEnd - INPUT_MARGIN;

  // "ip" is the input pointer, "op" is the output pointer.
  let ip = baseIp + 1;
  let op = baseOp;

  // Bytes in [nextEmit, ip) are pending and will be emitted as a literal
  // (or [nextEmit, ipEnd) after the main loop).
  let nextEmit = baseIp;
  let nextHash = hash32(input, ip, shift);

  // Each iteration emits one literal and then one or more copies; when the
  // input is nearly exhausted we `break loop` to the trailing-literal code.
  loop: while (ip < ipEnd) {
    // Step 1: scan forward for a 4-byte match.
    //
    // Heuristic match skipping: after 32 bytes with no match, probe every
    // other byte; after 32 more, every third, etc. (`skip >>> 5` is the
    // current stride). This costs a little density on compressible data
    // but lets incompressible data (e.g. JPEG) be skipped over quickly.
    let skip = 32;
    let nextIp = ip;
    let candidate = 0;
    do {
      ip = nextIp;
      const inc = skip >>> 5;
      skip += inc;
      nextIp = ip + inc;
      if (nextIp > ipLimit) break loop;
      const hash = nextHash;
      nextHash = hash32(input, nextIp, shift);
      // Read the previous occupant before overwriting with our position;
      // table values are fragment-relative offsets.
      candidate = baseIp + table[hash];
      table[hash] = ip - baseIp;
    } while (!equals32(input, ip, candidate));

    // Step 2: a 4-byte match exists at ip. First flush the unmatched
    // bytes [nextEmit, ip) as a literal.
    op = emitLiteral(input, nextEmit, ip - nextEmit, output, op);

    // Step 3: emit the copy, then keep emitting copies for as long as the
    // byte right after the consumed input also matches. Leaving this loop
    // normally means the next element is a literal, handled by the next
    // outer iteration; running low on input exits via `break loop`.
    do {
      // Extend the guaranteed 4-byte match as far as it goes.
      let matched = 4;
      while (ip + matched < ipEnd && input[ip + matched] === input[candidate + matched]) {
        matched += 1;
      }
      op = emitCopy(output, op, ip - candidate, matched);
      ip += matched;
      nextEmit = ip;
      if (ip >= ipLimit) break loop;

      // Look for a match starting at the new ip. To improve compression,
      // also index the position one byte back (ip - 1).
      const prevHash = hash32(input, ip - 1, shift);
      table[prevHash] = ip - 1 - baseIp;
      const curHash = hash32(input, ip, shift);
      candidate = baseIp + table[curHash];
      table[curHash] = ip - baseIp;
    } while (equals32(input, ip, candidate));

    ip += 1;
    nextHash = hash32(input, ip, shift);
  }

  // Emit the remaining bytes as a literal.
  if (nextEmit < ipEnd) {
    op = emitLiteral(input, nextEmit, ipEnd - nextEmit, output, op);
  }
  return op;
}

/**
 * Upper bound on the compressed size of `input`: 32 + n + n/6, matching
 * the reference implementation's MaxCompressedLength. Size the output
 * buffer for compressToBuffer with this.
 */
export function maxCompressedLength(input: Buffer) {
  const sourceLen = input.length;
  return 32 + sourceLen + Math.floor(sourceLen / 6);
}

/**
 * Compresses `input` into `output` in Snappy block format: a varint of the
 * uncompressed length followed by the compressed fragments (each at most
 * BLOCK_SIZE bytes of input). Returns the number of bytes written.
 *
 * NOTE(review): no bounds checking is done on `output` — the caller is
 * expected to allocate at least maxCompressedLength(input) bytes; verify
 * at call sites.
 */
export function compressToBuffer(input: Buffer, output: Buffer) {
  const length = input.length;
  let pos = 0;
  let fragmentSize: number;
  let outPos = putVarint(length, output, 0);
  while (pos < length) {
    fragmentSize = Math.min(length - pos, BLOCK_SIZE);
    outPos = compressFragment(input, pos, fragmentSize, output, outPos);
    pos += fragmentSize;
  }
  return outPos;
}