parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
156 lines (129 loc) • 4.45 kB
text/typescript
import varint = require('varint');
import { PrimitiveType } from '../declare';
import { CursorBuffer, ParquetCodecOptions } from './declare';
function encodeRunBitpacked(values: number[], opts: ParquetCodecOptions): Buffer {
for (let i = 0; i < values.length % 8; i++) {
values.push(0);
}
const buf = Buffer.alloc(Math.ceil(opts.bitWidth * (values.length / 8)));
for (let b = 0; b < opts.bitWidth * values.length; b++) {
if ((values[Math.floor(b / opts.bitWidth)] & (1 << b % opts.bitWidth)) > 0) {
buf[Math.floor(b / 8)] |= (1 << (b % 8));
}
}
return Buffer.concat([
Buffer.from(varint.encode(((values.length / 8) << 1) | 1)),
buf
]);
}
function encodeRunRepeated(value: number, count: number, opts: ParquetCodecOptions): Buffer {
const buf = Buffer.alloc(Math.ceil(opts.bitWidth / 8));
for (let i = 0; i < buf.length; i++) {
buf.writeUInt8(value & 0xff, i);
value >> 8;
}
return Buffer.concat([
Buffer.from(varint.encode(count << 1)),
buf
]);
}
export function encodeValues(type: PrimitiveType, values: any[], opts: ParquetCodecOptions): Buffer {
if (!('bitWidth' in opts)) {
throw new Error('bitWidth is required');
}
switch (type) {
case 'BOOLEAN':
case 'INT32':
case 'INT64':
// tslint:disable-next-line:no-parameter-reassignment
values = values.map(x => parseInt(x, 10));
break;
default:
throw new Error(`unsupported type: ${type}`);
}
let buf = Buffer.alloc(0);
let run = [];
let repeats = 0;
for (let i = 0; i < values.length; i++) {
// If we are at the beginning of a run and the next value is same we start
// collecting repeated values
if (repeats === 0 && run.length % 8 === 0 && values[i] === values[i + 1]) {
// If we have any data in runs we need to encode them
if (run.length) {
buf = Buffer.concat([buf, encodeRunBitpacked(run, opts)]);
run = [];
}
repeats = 1;
} else if (repeats > 0 && values[i] === values[i - 1]) {
repeats += 1;
} else {
// If values changes we need to post any previous repeated values
if (repeats) {
buf = Buffer.concat([buf, encodeRunRepeated(values[i - 1], repeats, opts)]);
repeats = 0;
}
run.push(values[i]);
}
}
if (repeats) {
buf = Buffer.concat([buf, encodeRunRepeated(values[values.length - 1], repeats, opts)]);
} else if (run.length) {
buf = Buffer.concat([buf, encodeRunBitpacked(run, opts)]);
}
if (opts.disableEnvelope) {
return buf;
}
const envelope = Buffer.alloc(buf.length + 4);
envelope.writeUInt32LE(buf.length, undefined);
buf.copy(envelope, 4);
return envelope;
}
function decodeRunBitpacked(cursor: CursorBuffer, count: number, opts: ParquetCodecOptions): number[] {
if (count % 8 !== 0) {
throw new Error('must be a multiple of 8');
}
// tslint:disable-next-line:prefer-array-literal
const values = new Array(count).fill(0);
for (let b = 0; b < opts.bitWidth * count; b++) {
if (cursor.buffer[cursor.offset + Math.floor(b / 8)] & (1 << (b % 8))) {
values[Math.floor(b / opts.bitWidth)] |= (1 << b % opts.bitWidth);
}
}
cursor.offset += opts.bitWidth * (count / 8);
return values;
}
function decodeRunRepeated(cursor: CursorBuffer, count: number, opts: ParquetCodecOptions): number[] {
let value = 0;
for (let i = 0; i < Math.ceil(opts.bitWidth / 8); i++) {
value << 8;
value += cursor.buffer[cursor.offset];
cursor.offset += 1;
}
// tslint:disable-next-line:prefer-array-literal
return new Array(count).fill(value);
}
export function decodeValues(type: PrimitiveType, cursor: CursorBuffer, count: number, opts: ParquetCodecOptions): number[] {
if (!('bitWidth' in opts)) {
throw new Error('bitWidth is required');
}
if (!opts.disableEnvelope) {
cursor.offset += 4;
}
let values: number[] = [];
while (values.length < count) {
const header = varint.decode(cursor.buffer, cursor.offset);
cursor.offset += varint.encodingLength(header);
if (header & 1) {
const count = (header >> 1) * 8;
values.push(...decodeRunBitpacked(cursor, count, opts));
} else {
const count = header >> 1;
values.push(...decodeRunRepeated(cursor, count, opts));
}
}
values = values.slice(0, count);
if (values.length !== count) {
throw new Error('invalid RLE encoding');
}
return values;
}