/*
 * parquetjs-lite
 * Version:
 * A fully asynchronous, pure JavaScript implementation of the Parquet file format.
 * 183 lines (147 loc) • 3.9 kB
 * JavaScript
 */
;
const parquet_codec = require('./codec');
const parquet_compression = require('./compression');
const parquet_types = require('./types');
const parquet_util = require('./util');
const PARQUET_COLUMN_KEY_SEPARATOR = '.';
/**
* A parquet file schema
*/
class ParquetSchema {

  /**
   * Create a new schema from a JSON schema definition.
   *
   * Stores the raw definition on `schema`, the field tree produced by
   * `buildFields` on `fields`, and the flattened result of `listFields`
   * on `fieldList`.
   *
   * @param {Object} schema - JSON schema definition keyed by field name
   */
  constructor(schema) {
    this.schema = schema;
    this.fields = buildFields(schema);
    this.fieldList = listFields(this.fields);
  }

  /**
   * Retrieve a field definition.
   *
   * @param {string[]|string} path - field names from the root down to the
   *   wanted field, either as an array or as one comma-separated string
   * @returns {Object|undefined} the definition stored under the last path
   *   segment, or undefined when that last segment does not exist
   */
  findField(path) {
    // The path is only read, never modified, so no defensive copy is needed.
    const segments = Array.isArray(path) ? path : path.split(",");
    const lastIndex = segments.length - 1;

    let node = this.fields;
    for (let i = 0; i < lastIndex; i++) {
      node = node[segments[i]].fields;
    }

    return node[segments[lastIndex]];
  }

  /**
   * Retrieve a field definition and all the field's ancestors.
   *
   * The caller's `path` array is left untouched.
   *
   * @param {string[]|string} path - field names from the root down to the
   *   wanted field, either as an array or as one comma-separated string
   * @returns {Object[]} one entry per path segment, ordered from the
   *   outermost ancestor to the field itself
   */
  findFieldBranch(path) {
    const segments = Array.isArray(path) ? path : path.split(",");
    const lastIndex = segments.length - 1;

    const branch = [];
    let node = this.fields;
    for (let i = 0; i <= lastIndex; i++) {
      branch.push(node[segments[i]]);

      // Only descend while there are further segments to resolve.
      if (i < lastIndex) {
        node = node[segments[i]].fields;
      }
    }

    return branch;
  }

}
/**
 * Build the tree of field definitions for a JSON schema definition.
 *
 * Every entry of `schema` becomes one definition object keyed by its name.
 * Entries that carry a `fields` property are treated as nested groups and
 * are processed recursively; all other entries are validated against the
 * known logical types, encodings and compression methods.
 *
 * Note: missing `encoding` / `compression` options are filled in with
 * 'PLAIN' / 'UNCOMPRESSED' directly on the schema entry that was passed in.
 *
 * @param {Object} schema - field options keyed by field name
 * @param {number} [rLevelParentMax] - rLevelMax of the enclosing group;
 *   any falsy value is treated as 0
 * @param {number} [dLevelParentMax] - dLevelMax of the enclosing group;
 *   any falsy value is treated as 0
 * @param {string[]} [path] - names of the enclosing groups; any falsy value
 *   is treated as an empty path
 * @returns {Object} field definitions keyed by field name
 * @throws {string} when a field names an unknown type, encoding or
 *   compression method (plain strings are thrown, as before, so existing
 *   handlers keep seeing the same values)
 */
function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {
  const rLevelBase = rLevelParentMax || 0;
  const dLevelBase = dLevelParentMax || 0;
  const parentPath = path || [];

  const fieldList = {};

  for (const name in schema) {
    const opts = schema[name];

    /* field repetition type */
    const required = !opts.optional;
    const repeated = !!opts.repeated;
    let rLevelMax = rLevelBase;
    let dLevelMax = dLevelBase;

    let repetitionType = 'REQUIRED';
    if (!required) {
      repetitionType = 'OPTIONAL';
      ++dLevelMax;
    }

    if (repeated) {
      repetitionType = 'REPEATED';
      ++rLevelMax;

      // An optional field was already counted above; a repeated field adds
      // exactly one level to dLevelMax either way.
      if (required) {
        ++dLevelMax;
      }
    }

    const fieldPath = parentPath.concat([name]);

    /* nested field */
    if (opts.fields) {
      const group = {
        name: name,
        path: fieldPath,
        repetitionType: repetitionType,
        rLevelMax: rLevelMax,
        dLevelMax: dLevelMax,
        isNested: true,
        statistics: opts.statistics,
        fieldCount: Object.keys(opts.fields).length,
        fields: buildFields(opts.fields, rLevelMax, dLevelMax, fieldPath)
      };

      if (opts.type === 'LIST' || opts.type === 'MAP') {
        group.originalType = opts.type;
      }

      fieldList[name] = group;
      continue;
    }

    /* field type */
    const typeDef = parquet_types.PARQUET_LOGICAL_TYPES[opts.type];
    if (!typeDef) {
      throw 'invalid parquet type: ' + opts.type;
    }

    /* field encoding */
    if (!opts.encoding) {
      opts.encoding = 'PLAIN';
    }

    if (!(opts.encoding in parquet_codec)) {
      throw 'unsupported parquet encoding: ' + opts.encoding;
    }

    /* field compression */
    if (!opts.compression) {
      opts.compression = 'UNCOMPRESSED';
    }

    if (!(opts.compression in parquet_compression.PARQUET_COMPRESSION_METHODS)) {
      throw 'unsupported compression method: ' + opts.compression;
    }

    /* add to schema */
    fieldList[name] = {
      name: name,
      primitiveType: typeDef.primitiveType,
      originalType: typeDef.originalType,
      path: fieldPath,
      repetitionType: repetitionType,
      encoding: opts.encoding,
      statistics: opts.statistics,
      compression: opts.compression,
      typeLength: opts.typeLength || typeDef.typeLength,
      rLevelMax: rLevelMax,
      dLevelMax: dLevelMax
    };
  }

  return fieldList;
}
/**
 * Flatten a tree of field definitions into a single array.
 *
 * Each field is emitted before its own children, and children are emitted
 * before the field's next sibling.
 *
 * @param {Object} fields - field definitions keyed by field name
 * @returns {Object[]} every field definition reachable from `fields`
 */
function listFields(fields) {
  const flattened = [];

  // Walk one level of the tree, descending into nested groups as they appear.
  const visit = (group) => {
    for (const key in group) {
      const field = group[key];
      flattened.push(field);

      if (field.isNested) {
        visit(field.fields);
      }
    }
  };

  visit(fields);
  return flattened;
}
module.exports = { ParquetSchema };