// node-parquet
// Version:
// Parquet is a columnar storage format
// 402 lines (373 loc) • 13.7 kB
// JavaScript
// Copyright 2017 Luca-SAS, licensed under the Apache License 2.0
// Pure javascript module to handle parquet format.
// from specifications at https://parquet.apache.org/documentation/latest/
;
var fs = require('fs');
var util = require('util');
var varint = require('varint');
var hexdump = require('hexdump-nodejs');
// Public API of this module: only the `Reader` constructor is exported.
// `Reader` is a function declaration located further down in this file, so it
// is hoisted and already defined when this assignment runs.
module.exports = {
Reader: Reader
};
// Thrift Compact Protocol type identifiers, keyed by type name.
var Type = {
  'stop': 0,
  'true': 1,
  'false': 2,
  'byte': 3,
  'int16': 4,
  'int32': 5,
  'int64': 6,
  'double': 7,
  'binary': 8,
  'list': 9,
  'set': 10,
  'map': 11,
  'struct': 12,
  'extended': 15
};
// Reverse lookup of `Type`: numeric type identifier -> type name.
// Built from `Type` so both tables always hold the same entries.
var TypeString = {};
Object.keys(Type).forEach(function (typeName) {
  TypeString[Type[typeName]] = typeName;
});
// Parquet file reader.
//
// Opens `path` read-only and parses the footer metadata synchronously; the
// result is exposed as `this.metadata`. May be called with or without `new`.
//
// path: file system path of the parquet file.
// Throws whatever `fs.openSync` or `parseMetadata` throw. When the metadata
// cannot be parsed the file descriptor is closed before the error propagates,
// so a failed construction does not leak a descriptor.
function Reader(path) {
  if (!(this instanceof Reader)) return new Reader(path);
  this._path = path;
  this._fd = fs.openSync(path, 'r');
  try {
    this.metadata = parseMetadata(this._fd);
  } catch (err) {
    try {
      fs.closeSync(this._fd);
    } catch (closeErr) {
      // Ignored on purpose: the parse error below is the one worth reporting.
    }
    this._fd = null;
    throw err;
  }
}

// Release the file descriptor held by this reader.
// Safe to call more than once; later calls are no-ops.
Reader.prototype.close = function () {
  if (this._fd === null || this._fd === undefined) return;
  var fd = this._fd;
  this._fd = null;
  fs.closeSync(fd);
};
// Decode a zigzag-encoded value into a signed integer.
//
// n: non-negative integer as produced by the varint decoder
//    (0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, ...).
// Returns the signed value.
//
// Plain arithmetic is used instead of bitwise operators because JavaScript
// bitwise operators truncate their operands to 32 bits, which corrupts int64
// values such as file offsets and sizes at or above 2^32. Results are exact
// as long as `n` is a safe integer.
function zz2num(n) {
  return (n % 2 === 0) ? n / 2 : -(n + 1) / 2;
}
// Return the upper four bits (bits 4..7) of `n` as a number in 0..15.
function lsb1(n) {
  var upperNibble = (n & 0xf0) >>> 4;
  return upperNibble;
}
// Thrift compact protocol value readers, indexed by compact type identifier.
//
// Every reader has the signature (buf, offset), where `offset` is the index
// of the first byte of the value (for lists: the list header byte), and
// returns an object holding the decoded `data`, a `type` name and `len`, the
// number of bytes consumed starting at `offset`. The stop reader returns
// {stop: true, len: 0} instead.
//
// There is no reader for type 7 (double), 10 (set) or 11 (map).
var readType = {
  // End-of-struct marker: carries no value.
  0: function readStop(buf, offset) {
    return {stop: true, len: 0};
  },
  // Booleans are encoded in the type nibble itself: no value bytes follow.
  1: function readTrue(buf, offset) {
    return {type: 'bool', data: true, len: 0};
  },
  2: function readFalse(buf, offset) {
    return {type: 'bool', data: false, len: 0};
  },
  // Single byte value, returned unsigned. The byte sits at `offset` itself,
  // like the first byte of every other value type.
  3: function readByte(buf, offset) {
    return {type: 'byte', data: buf[offset], len: 1};
  },
  // Integers are zigzag-encoded varints.
  4: function readInt16(buf, offset) {
    return {type: 'int16', data: zz2num(varint.decode(buf, offset)), len: varint.decode.bytes};
  },
  5: function readInt32(buf, offset) {
    return {type: 'int32', data: zz2num(varint.decode(buf, offset)), len: varint.decode.bytes};
  },
  6: function readInt64(buf, offset) {
    return {type: 'int64', data: zz2num(varint.decode(buf, offset)), len: varint.decode.bytes};
  },
  // Varint byte length followed by the bytes, returned as a string
  // (Buffer#toString() with its default encoding).
  8: function readBinary(buf, offset) {
    var size = varint.decode(buf, offset);
    var prefix = varint.decode.bytes;
    var start = offset + prefix;
    return {type: 'binary', data: buf.slice(start, start + size).toString(), len: prefix + size};
  },
  // List header: high nibble = element count (15 means a varint count
  // follows), low nibble = element type. Elements are decoded one after the
  // other with the reader of the element type.
  9: function readList(buf, offset) {
    var n = lsb1(buf[offset]);
    var len = 1;
    if (n === 15) {
      n = varint.decode(buf, offset + len);
      len += varint.decode.bytes;
    }
    var type = buf[offset] & 0xf;
    var res = {};
    for (var i = 0; i < n; i++) {
      res[i] = readType[type](buf, offset + len);
      len += res[i].len;
    }
    return {type: [TypeString[type]], num: n, data: res, len: len};
  },
  // Nested struct: a sequence of fields terminated by a stop byte.
  12: function readStruct(buf, offset) {
    return readObj(buf, offset, 'struct');
  }
};
// Read a Thrift compact protocol struct: a sequence of fields terminated by
// a stop byte (or by the end of the buffer).
//
// buf:    buffer holding the encoded data.
// offset: index of the first field header.
// rtype:  value stored as `type` on the returned object (callers in this
//         file pass 'struct' or nothing).
// Returns {type: rtype, data: {0: field, 1: field, ...}, len: bytesConsumed}.
// Fields are stored in encounter order; field ids are not retained.
// Throws a TypeError when a field uses a type `readType` has no reader for.
function readObj(buf, offset, rtype) {
  var res = {}, cur, header, type, reader, len = 0, i = 0;
  do {
    header = buf[offset + len];
    // Low nibble: value type. High nibble: field id delta.
    type = header & 0xf;
    len++;
    if (type !== 0 && (header >>> 4) === 0) {
      // Long form header: a zero delta means the field id follows as a
      // zigzag varint. The id itself is not used, so just step over it.
      while (buf[offset + len] & 0x80) len++;
      len++;
    }
    reader = readType[type];
    if (typeof reader !== 'function') {
      throw new TypeError('Unsupported Thrift compact type ' + type + ' at offset ' + (offset + len - 1));
    }
    cur = reader(buf, offset + len);
    if (!cur.stop) res[i++] = cur;
    len += cur.len;
  } while (!cur.stop && (offset + len) < buf.length);
  return {type: rtype, data: res, len: len};
}
// Enum name tables: the array index is the numeric enum value stored in the
// file, the entry is the symbolic name reported in decoded metadata.

// Physical types (parquet.thrift `Type`).
var parquetTypes = [
  'BOOLEAN', 'INT32', 'INT64', 'INT96', 'FLOAT', 'DOUBLE', 'BYTE_ARRAY', 'FIXED_LEN_BYTE_ARRAY'
];
// Converted types (parquet.thrift `ConvertedType`).
var convertedTypes = [
  'UTF8', 'MAP', 'MAP_KEY_VALUE', 'LIST', 'ENUM', 'DECIMAL', 'DATE',
  'TIME_MILLIS', 'TIME_MICROS', 'TIMESTAMP_MILLIS', 'TIMESTAMP_MICROS',
  'UINT_8', 'UINT_16', 'UINT_32', 'UINT_64', 'INT_8', 'INT_16', 'INT_32', 'INT_64',
  'JSON', 'BSON', 'INTERVAL'
];
// Encodings (parquet.thrift `Encoding`). BYTE_STREAM_SPLIT (9) is included so
// that files using it decode to a name rather than to `undefined`.
var parquetEncodings = [
  'PLAIN', 'GROUP_VAR_INT', 'PLAIN_DICTIONARY', 'RLE', 'BIT_PACKED',
  'DELTA_BINARY_PACKED', 'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY',
  'RLE_DICTIONARY', 'BYTE_STREAM_SPLIT'
];
// Schema of the parquet file metadata (footer) descriptor.
//
// Each entry describes one field, in the order the fields are expected in the
// encoded struct:
//   type:     Thrift type name as produced by `readType`; wrapped in an array
//             for lists (e.g. ['struct'] is a list of structs).
//   optional: the field may be absent from the input.
//   values:   enum names; the decoded integer is replaced by values[integer].
//   schema:   nested schema for structs and lists of structs.
//   check:    predicate called by `decode` with its context; returning a
//             falsy value makes `decode` skip this schema entry.
var parquetSchema = {
  version: {type: 'int32'},
  schema: {
    type: ['struct'], schema: {
      // Data type for this field. Not set if the current element is a non-leaf node (i.e the 1st)
      type: {type: 'int32', optional: true, values: parquetTypes, check: function (context) {
        if (context.path[0] === 'schema' && context.path[1] === 0 && !context.body.schema[0]) return false;
        return true;
      }},
      // if type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values,
      // otherwise, if specified, this is the maximum bit length to store any of the values.
      typeLength: {type: 'int32', optional: true, check: function (context) {
        var cur = context.body[context.path[0]][context.path[1]];
        return (cur && cur.type === 'FIXED_LEN_BYTE_ARRAY');
      }},
      fieldRepetitionType: {type: 'int32', optional: true, values: ['REQUIRED', 'OPTIONAL', 'REPEATED']},
      name: {type: 'binary'},
      numChildren: {type: 'int32', optional: true, check: function (context) {
        if (!context.body.schema[0]) return true;
        var cur = context.body[context.path[0]][context.path[1]];
        return (cur && (cur.type === 'INT64' || cur.type === 'INT32'));
      }},
      convertedType: {type: 'int32', optional: true, values: convertedTypes},
      // Used only for DECIMAL converted type
      scale: {type: 'int32', optional: true, check: function (context) {
        var cur = context.body[context.path[0]][context.path[1]];
        return (cur && cur.convertedType === 'DECIMAL');
      }},
      // Used only for DECIMAL converted type
      precision: {type: 'int32', optional: true, check: function (context) {
        var cur = context.body[context.path[0]][context.path[1]];
        return (cur && cur.convertedType === 'DECIMAL');
      }},
      fieldId: {type: 'int32', optional: true}
    }
  },
  numRows: {type: 'int64'},
  rowGroups: {
    type: ['struct'], schema: {
      columns: {
        type: ['struct'], schema: {
          filePath: {type: 'binary', optional: true},
          fileOffset: {type: 'int64'},
          metadata: {
            type: 'struct', optional: true, schema: {
              type: {type: 'int32', values: parquetTypes},
              encodings: {type: ['int32'], values: parquetEncodings},
              schemaPath: {type: ['binary']},
              compressionCodec: {
                type: 'int32',
                values: ['UNCOMPRESSED', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4', 'ZSTD', 'LZ4_RAW']
              },
              numValues: {type: 'int64'},
              uncompressedSize: {type: 'int64'},
              compressedSize: {type: 'int64'},
              // A list of key/value pairs in the format, hence ['struct'].
              keyValueMetadata: {
                type: ['struct'], optional: true, schema: {
                  key: {type: 'binary'},
                  value: {type: 'binary', optional: true}
                }
              },
              dataPageOffset: {type: 'int64'},
              indexPageOffset: {type: 'int64', optional: true},
              dictionaryPageOffset: {type: 'int64', optional: true},
              statistics: {
                type: 'struct', optional: true, schema: {
                  max: {type: 'binary', optional: true},
                  min: {type: 'binary', optional: true},
                  nullCount: {type: 'int64', optional: true},
                  distinctCount: {type: 'int64', optional: true},
                  maxValue: {type: 'binary', optional: true},
                  minValue: {type: 'binary', optional: true}
                }
              },
              encodingStats: {
                type: ['struct'], optional: true, schema: {
                  pageType: {type: 'int32', values: ['DATA_PAGE', 'INDEX_PAGE', 'DICTIONARY_PAGE', 'DATA_PAGE_V2']},
                  encoding: {type: 'int32', values: parquetEncodings},
                  count: {type: 'int32'}
                }
              }
            }
          }
        }
      },
      totalByteSize: {type: 'int64'},
      numRows: {type: 'int64'},
      // A nested schema is required here: `decode` walks it for every element.
      sortingColumns: {
        type: ['struct'], optional: true, schema: {
          columnIdx: {type: 'int32'},
          descending: {type: 'bool'},
          nullsFirst: {type: 'bool'}
        }
      }
    }
  },
  // File-level key/value metadata sits between the row groups and createdBy.
  keyValueMetadata: {
    type: ['struct'], optional: true, schema: {
      key: {type: 'binary'},
      value: {type: 'binary', optional: true}
    }
  },
  createdBy: {type: 'binary', optional: true},
  // Each element is a union whose only member is an empty struct.
  columnOrders: {
    type: ['struct'], optional: true, schema: {
      typeOrder: {type: 'struct', optional: true, schema: {}}
    }
  }
};
// Schema of the `Statistics` struct attached to pages and column chunks.
// All fields are optional. `maxValue` / `minValue` follow `distinctCount` in
// the format; declaring them keeps such values from being stored under
// `distinctCount`.
var statisticSchema = {
  max: {type: 'binary', optional: true},
  min: {type: 'binary', optional: true},
  nullCount: {type: 'int64', optional: true},
  distinctCount: {type: 'int64', optional: true},
  maxValue: {type: 'binary', optional: true},
  minValue: {type: 'binary', optional: true}
};
// Schema of a page header, in the same format as `parquetSchema`.
// The four type-specific sub-headers are all optional structs; their `check`
// predicates use the already decoded `type` field to tell them apart.
var pageHeaderSchema = {
  type: {type: 'int32', values: ['DATA_PAGE', 'INDEX_PAGE', 'DICTIONARY_PAGE', 'DATA_PAGE_V2']},
  uncompressedSize: {type: 'int32'},
  compressedSize: {type: 'int32'},
  crc: {type: 'int32', optional: true},
  // The rest of the header is determined by the above type
  dataPageHeader: {
    type: 'struct', optional: true, schema: {
      numValues: {type: 'int32'},
      encoding: {type: 'int32', values: parquetEncodings},
      definitionLevelEncoding: {type: 'int32', values: parquetEncodings},
      repetitionLevelEncoding: {type: 'int32', values: parquetEncodings},
      statistics: {type: 'struct', optional: true, schema: statisticSchema}
    }, check: function (context) {
      return context.body.type === 'DATA_PAGE';
    }
  },
  indexPageHeader: {
    // FIXME: to be completed. NOTE(review): the format definition appears to
    // declare this header without any field; confirm before relying on numValues.
    type: 'struct', optional: true, schema: {
      numValues: {type: 'int32'}
    }, check: function (context) {
      return context.body.type === 'INDEX_PAGE';
    }
  },
  dictionaryPageHeader: {
    type: 'struct', optional: true, schema: {
      numValues: {type: 'int32'},
      encoding: {type: 'int32', values: parquetEncodings},
      isSorted: {type: 'bool', optional: true}
    }, check: function (context) {
      return context.body.type === 'DICTIONARY_PAGE';
    }
  },
  dataPageV2Header: {
    type: 'struct', optional: true, schema: {
      numValues: {type: 'int32'},
      numNulls: {type: 'int32'},
      numRows: {type: 'int32'},
      encoding: {type: 'int32', values: parquetEncodings},
      definitionLevelsByteLength: {type: 'int32'},
      repetitionLevelsByteLength: {type: 'int32'},
      isCompressed: {type: 'bool', optional: true},
      statistics: {type: 'struct', optional: true, schema: statisticSchema}
    }, check: function (context) {
      return context.body.type === 'DATA_PAGE_V2';
    }
  }
};
// Tell whether two type descriptors denote the same type.
//
// A descriptor is either a type name ('int32') or a one-element array holding
// the element type of a list (['int32']). A list never equals a scalar, even
// when the scalar name starts with the list's element name.
//
// t1, t2: descriptors to compare.
// Returns true when both are lists with the same element type, or both are
// the same scalar type.
function typeEqual(t1, t2) {
  var isList1 = Array.isArray(t1);
  var isList2 = Array.isArray(t2);
  if (isList1 !== isList2) return false;
  return isList1 ? (t1[0] === t2[0]) : (t1 === t2);
}
// Map a raw struct produced by `readObj` onto named properties using a schema.
//
// Input fields carry no names, so they are matched to schema entries by
// position and type: for each input field the schema is scanned forward from
// the last matched entry, skipping optional entries whose type differs and
// entries whose `check(context)` returns a falsy value.
//
// fullSchema: schema object (see `parquetSchema`).
// input:      {data: {0: field, 1: field, ...}} as returned by `readObj`.
// context:    internal, passed on recursive calls. `body` is the top-level
//             output object and `path` the property/index path being decoded.
// Returns an object with one property per matched field. Enum fields
// (schema `values`) are translated to their names.
//
// A required schema entry whose type does not match is reported with
// console.log and used anyway (best effort). Input fields left over once the
// schema is exhausted are ignored, so metadata written with fields unknown
// to the schema does not overwrite the last known property or crash.
function decode(fullSchema, input, context) {
  var out = {}, i, j, name, obj, schema, matched;
  var schemaIndex = 0;
  var names = Object.keys(fullSchema);
  var ikeys = Object.keys(input.data);
  var pathIndex;
  if (!context) context = {body: out, path: []};
  for (i = 0; i < ikeys.length; i++) {
    obj = input.data[ikeys[i]];
    matched = false;
    while (schemaIndex < names.length) {
      name = names[schemaIndex++];
      schema = fullSchema[name];
      if (!typeEqual(obj.type, schema.type)) {
        if (schema.optional) continue;
        else console.log("##### ERROR SCHEMA");
      } else if (schema.check) {
        if (!schema.check(context)) continue;
      }
      matched = true;
      break;
    }
    // No schema entry left for this field: it is unknown to the schema.
    if (!matched) continue;
    context.path.push(name);
    if (Array.isArray(obj.type)) {
      out[name] = new Array(obj.num);
      pathIndex = context.path.push(0) - 1;
      for (j = 0; j < obj.num; j++) {
        if (obj.type[0] === 'struct') {
          // A schema entry without nested schema yields empty objects.
          out[name][j] = decode(schema.schema || {}, obj.data[j], context);
        } else {
          if (schema.values)
            out[name][j] = schema.values[obj.data[j].data];
          else
            out[name][j] = obj.data[j].data;
        }
        // The index is recorded after the element has been decoded.
        context.path[pathIndex] = j;
      }
      context.path.pop();
    } else if (obj.type === 'struct') {
      out[name] = decode(schema.schema || {}, obj, context);
    } else {
      if (schema.values)
        out[name] = schema.values[obj.data];
      else
        out[name] = obj.data;
    }
    context.path.pop();
  }
  return out;
}
// Read and decode the file metadata stored at the end of a parquet file.
//
// The last 8 bytes of the file are a 4-byte little-endian metadata length
// followed by the 4-byte magic 'PAR1'; the metadata itself sits immediately
// before them.
//
// fd: file descriptor opened for reading.
// Returns the metadata decoded with `parquetSchema`.
// Throws an Error when the file is too small, does not end with the magic
// number, declares a metadata length larger than the file, or cannot be read
// completely. These checks keep a corrupt or unrelated file from triggering
// an arbitrarily large allocation or a read at a negative position.
function parseMetadata(fd) {
  var FOOTER_SIZE = 8;
  var size = fs.fstatSync(fd).size;
  if (size < FOOTER_SIZE) {
    throw new Error('Invalid parquet file: too small (' + size + ' bytes)');
  }
  var footer = Buffer.alloc(FOOTER_SIZE);
  if (fs.readSync(fd, footer, 0, FOOTER_SIZE, size - FOOTER_SIZE) !== FOOTER_SIZE) {
    throw new Error('Invalid parquet file: unable to read the footer');
  }
  if (footer.toString('latin1', 4, FOOTER_SIZE) !== 'PAR1') {
    throw new Error('Invalid parquet file: bad magic number');
  }
  var mlen = footer.readUInt32LE(0);
  if (mlen > size - FOOTER_SIZE) {
    throw new Error('Invalid parquet file: metadata length ' + mlen + ' exceeds file size');
  }
  var mbuf = Buffer.alloc(mlen);
  if (fs.readSync(fd, mbuf, 0, mlen, size - FOOTER_SIZE - mlen) !== mlen) {
    throw new Error('Invalid parquet file: unable to read the metadata');
  }
  return decode(parquetSchema, readObj(mbuf, 0));
}
// Work-in-progress dump of one column chunk of the FIRST row group.
//
// col:  index into `this.metadata.rowGroups[0].columns`.
// skip: not used by the current implementation.
// rows: not used by the current implementation.
//
// Nothing is returned: every intermediate result is only written to the
// console (column metadata, hex dumps, decoded page headers, values).
Reader.prototype.readCol = function (col, skip, rows) {
var meta = this.metadata.rowGroups[0].columns[col];
console.log('col metadata:', meta);
// Debug aid: the whole file is loaded into memory and hex-dumped.
var stat = fs.statSync(this._path);
var fileSize = stat.size;
var debugBuf = Buffer.alloc(fileSize);
var n = fs.readSync(this._fd, debugBuf, 0, debugBuf.length, 0);
console.log(hexdump(debugBuf));
//var pageDataBuf =
// Read `uncompressedSize` bytes ending at the chunk's `fileOffset`.
// NOTE(review): presumably assumes an uncompressed chunk that ends exactly at
// fileOffset - confirm against the files this is used with.
var len = meta.metadata.uncompressedSize;
var start = meta.fileOffset - len;
var buf = Buffer.alloc(len);
n = fs.readSync(this._fd, buf, 0, len, start);
console.log(hexdump(buf));
console.log('obj:', readObj(buf, 0));
// Header of the first page in the buffer. Despite the variable name it is
// decoded with the generic page header schema, whatever the page type is.
var indexPageHeader = decode(pageHeaderSchema, readObj(buf, 0));
console.log('indexPageHeader:', indexPageHeader);
// todo: decode page values: if dictionary page preceding data page,
// start at datapage offset - value length (in the page).
// otherwise: buffer end - value length
var vstart = meta.metadata.dataPageOffset - start - indexPageHeader.uncompressedSize;
var values = [];
// Only INT32 columns are handled; `values` stays empty for any other type.
switch (meta.metadata.type) {
case 'INT32':
for (var i = 0; i < meta.metadata.numValues; i++) {
values[i] = buf.readInt32LE(vstart + i * 4);
}
break;
}
console.log('dict values:', values);
// Second page header, located at the data page offset within the buffer.
var dataPageHeader = decode(pageHeaderSchema, readObj(buf, meta.metadata.dataPageOffset - start));
console.log('dataPageHeader:', dataPageHeader);
// NOTE(review): pageHeaderSchema declares `numValues` only inside the nested
// page-type headers, so `dataPageHeader.numValues` looks like it is undefined
// here (bitWidth would be NaN) - confirm the intended property.
var bitWidth = Math.ceil(Math.log2(dataPageHeader.numValues));
var dlen = dataPageHeader.uncompressedSize - 1;
// rleDecode currently only logs what it finds; nothing is returned from it.
rleDecode(buf, len - dlen, dlen, bitWidth);
}
// Hybrid RLE / Bit Packed run inspector.
//
// Reads the varint run header at `offset` and logs what kind of run follows;
// no value is decoded or returned yet.
//
// buf:      buffer holding the encoded data.
// offset:   index of the run header.
// datalen:  number of bytes available from `offset`, header included.
// bitWidth: accepted but not used by the current implementation.
function rleDecode(buf, offset, datalen, bitWidth) {
  var runHeader = varint.decode(buf, offset);
  var headerBytes = varint.decode.bytes;
  // Lowest header bit selects the run kind, the remaining bits hold the count.
  var isBitPacked = (runHeader & 1) !== 0;
  var count = runHeader >>> 1;
  var remaining = datalen - headerBytes;
  if (isBitPacked) {
    // Bit packed runs count groups of 8 values.
    console.log('read ' + (count * 8) + ' bit packed values');
    return;
  }
  console.log('read ' + count + ' RLE values');
  console.log('remain ' + remaining + ' bytes');
}