hyparquet-writer
Version:
Parquet file writer for JavaScript
122 lines (103 loc) • 3.15 kB
JavaScript
import { ByteWriter } from './bytewriter.js'
/**
 * Encode values twice, once as RLE runs and once as a single bit-packed run,
 * and append whichever encoding is smaller to the writer.
 * On a tie the bit-packed encoding is used.
 *
 * @import {DecodedArray} from 'hyparquet'
 * @import {Writer} from '../src/types.js'
 * @param {Writer} writer destination writer
 * @param {DecodedArray} values values to encode
 * @param {number} bitWidth bits per value
 * @returns {number} bytes written
 */
export function writeRleBitPackedHybrid(writer, values, bitWidth) {
  const startOffset = writer.offset

  // Encode into scratch writers so the two sizes can be compared
  const rleCandidate = new ByteWriter()
  writeRle(rleCandidate, values, bitWidth)

  const packedCandidate = new ByteWriter()
  writeBitPacked(packedCandidate, values, bitWidth)

  // RLE wins only when strictly smaller
  const smallest = rleCandidate.offset < packedCandidate.offset
    ? rleCandidate
    : packedCandidate
  writer.appendBuffer(smallest.getBuffer())

  return writer.offset - startOffset
}
/**
 * Write all values as one bit-packed run: a varint header followed by the
 * values packed least-significant-bit first, bitWidth bits per value.
 *
 * The run always covers whole groups of 8 values; a final partial group is
 * padded with zero values, so the payload is exactly numGroups * bitWidth bytes.
 *
 * @param {Writer} writer
 * @param {DecodedArray} values
 * @param {number} bitWidth bits per value, 0 to 32
 */
function writeBitPacked(writer, values, bitWidth) {
  // Number of 8-value groups
  const numGroups = Math.ceil(values.length / 8)

  // Header: low bit = 1 marks a bit-packed run, upper bits = number of groups
  writer.appendVarInt(numGroups << 1 | 1)

  // If bitWidth = 0, no data is actually needed
  if (bitWidth === 0 || values.length === 0) {
    return
  }

  // Shift counts wrap modulo 32 in JavaScript (1 << 32 === 1), so the
  // full-width mask has to be special-cased instead of computed by shifting.
  const mask = bitWidth >= 32 ? 0xFFFFFFFF : 2 ** bitWidth - 1
  const totalBytes = numGroups * bitWidth
  let bytesWritten = 0

  // The accumulator is a plain double rather than a 32-bit integer: it holds
  // at most 7 leftover bits plus one 32-bit value (39 bits), which stays exact.
  let buffer = 0
  let bitsUsed = 0 // how many bits are in 'buffer' so far

  for (let i = 0; i < values.length; i++) {
    // Keep the low bitWidth bits, reinterpreted as an unsigned integer
    const v = (values[i] & mask) >>> 0
    buffer += v * 2 ** bitsUsed
    bitsUsed += bitWidth
    // Flush full bytes
    while (bitsUsed >= 8) {
      writer.appendUint8(buffer % 256)
      buffer = Math.floor(buffer / 256)
      bitsUsed -= 8
      bytesWritten++
    }
  }

  // Flush the trailing partial byte (fewer than 8 bits, so buffer < 256)
  if (bitsUsed > 0) {
    writer.appendUint8(buffer)
    bytesWritten++
  }

  // Zero-pad so the final group is a full 8 values
  for (; bytesWritten < totalBytes; bytesWritten++) {
    writer.appendUint8(0)
  }
}
/**
 * Run-length encode the values: each maximal run of identical consecutive
 * values is written as a varint header (run length shifted left by one, low
 * bit clear) followed by the run's value in little-endian byte order.
 *
 * Nothing is written when values is empty.
 *
 * @param {Writer} writer
 * @param {DecodedArray} values
 * @param {number} bitWidth
 */
function writeRle(writer, values, bitWidth) {
  const total = values.length
  if (!total) return

  // bytes needed to hold one value of bitWidth bits
  const valueBytes = bitWidth + 7 >> 3

  let runStart = 0
  while (runStart < total) {
    const runValue = values[runStart]

    // advance to the first index that breaks the run
    let runEnd = runStart + 1
    while (runEnd < total && values[runEnd] === runValue) {
      runEnd++
    }

    // header: run length, with the low bit left as 0
    writer.appendVarInt((runEnd - runStart) << 1)

    // value bytes, least significant first
    for (let shift = 0; shift < valueBytes * 8; shift += 8) {
      writer.appendUint8(runValue >> shift & 0xff)
    }

    runStart = runEnd
  }
}