@dobesv/parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
129 lines • 4.4 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetSchema = void 0;
const codec_1 = require("./codec");
const compression_1 = require("./compression");
const shred_1 = require("./shred");
const types_1 = require("./types");
/**
 * Parquet file schema: wraps a JSON schema definition together with the
 * field descriptors derived from it.
 */
class ParquetSchema {
    /**
     * Build a schema from a JSON schema definition.
     *
     * Stores the raw definition on `schema`, the descriptor tree produced by
     * `buildFields` on `fields`, and the flattened descriptors produced by
     * `listFields` on `fieldList`.
     */
    constructor(schema) {
        this.schema = schema;
        this.fields = buildFields(schema, 0, 0, []);
        this.fieldList = listFields(this.fields);
    }
    /**
     * Look up a field descriptor by path.
     *
     * @param path Array of segment names, or a comma-separated string.
     * @returns Whatever is reached by following `.fields[segment]` for each
     *          segment, starting from this schema.
     */
    findField(path) {
        const segments = Array.isArray(path) ? path : path.split(',');
        let current = this;
        segments.forEach((segment) => {
            current = current.fields[segment];
        });
        return current;
    }
    /**
     * Collect every descriptor visited while walking a path.
     *
     * @param path Array of segment names, or a comma-separated string.
     * @returns Array with one entry per segment, outermost first.
     */
    findFieldBranch(path) {
        const segments = Array.isArray(path) ? path : path.split(',');
        const visited = [];
        let current = this;
        for (let i = 0; i < segments.length; i++) {
            current = current.fields[segments[i]];
            visited.push(current);
        }
        return visited;
    }
    /**
     * Hand a record to the shredder from `./shred`, passing this schema and
     * the target buffer along.
     */
    shredRecord(record, buffer) {
        const shred = shred_1.shredRecord;
        shred(this, record, buffer);
    }
    /**
     * Ask the `./shred` module to materialize records from a buffer using
     * this schema, and return its result.
     */
    materializeRecords(buffer) {
        const materialize = shred_1.materializeRecords;
        return materialize(this, buffer);
    }
    /**
     * Apply one compression method to both the raw definition and the built
     * descriptor tree (via `setCompress`).
     *
     * @returns This schema, to allow chaining.
     */
    compress(type) {
        for (const tree of [this.schema, this.fields]) {
            setCompress(tree, type);
        }
        return this;
    }
}
exports.ParquetSchema = ParquetSchema;
/**
 * Recursively stamp a compression method onto every leaf of a schema tree.
 *
 * A node with a truthy `fields` property is treated as a group: only its
 * children are visited and the group itself is left untouched. Any other
 * node has its `compression` property overwritten in place.
 *
 * @param schema Object mapping names to schema nodes.
 * @param type Compression method to assign to each leaf.
 */
function setCompress(schema, type) {
    for (const key in schema) {
        const entry = schema[key];
        if (!entry.fields) {
            entry.compression = type;
            continue;
        }
        setCompress(entry.fields, type);
    }
}
/**
 * Recursively convert a JSON schema definition into field descriptors.
 *
 * @param schema Object mapping field names to their options.
 * @param rLevelParentMax Maximum repetition level of the enclosing field.
 * @param dLevelParentMax Maximum definition level of the enclosing field.
 * @param path Array of names leading to the enclosing field.
 * @returns Object mapping each field name to its descriptor. Nested fields
 *          carry `isNested`, `fieldCount` and a recursive `fields` map;
 *          leaf fields carry type, encoding and compression details.
 * @throws Error when a leaf field names an unknown type, encoding or
 *         compression method.
 */
function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {
    const fieldList = {};
    for (const name in schema) {
        const opts = schema[name];
        const { repeated = false, optional = false } = opts;
        // The flags are interpreted by truthiness here, exactly as for
        // repetitionType below, so a truthy non-boolean flag (e.g. 1 or 'yes')
        // raises the level by exactly one instead of producing NaN or a
        // larger jump through numeric coercion.
        // If this field is repeated, its rLevel is higher than its parent's.
        const rLevelMax = rLevelParentMax + (repeated ? 1 : 0);
        // If this field is optional or repeated, its dLevel is higher than its
        // parent's. For repeated fields, the dLevel is used to indicate there
        // are no values at the given rLevel.
        const dLevelMax = dLevelParentMax + (optional || repeated ? 1 : 0);
        const repetitionType = repeated
            ? 'REPEATED'
            : optional
                ? 'OPTIONAL'
                : 'REQUIRED';
        const cpath = path.concat([name]);
        /* nested field */
        if (opts.fields) {
            fieldList[name] = {
                name,
                path: cpath,
                key: cpath.join(),
                repetitionType,
                rLevelMax,
                dLevelMax,
                isNested: true,
                fieldCount: Object.keys(opts.fields).length,
                fields: buildFields(opts.fields, rLevelMax, dLevelMax, cpath),
            };
            continue;
        }
        const typeDef = types_1.PARQUET_LOGICAL_TYPES[opts.type];
        if (!typeDef) {
            throw new Error(`invalid parquet type: ${opts.type}`);
        }
        // The defaults are written back onto the caller's definition; this is
        // kept as-is because code outside this function may read them there.
        opts.encoding = opts.encoding || 'PLAIN';
        if (!(opts.encoding in codec_1.PARQUET_CODEC)) {
            throw new Error(`unsupported parquet encoding: ${opts.encoding}`);
        }
        opts.compression = opts.compression || 'UNCOMPRESSED';
        if (!(opts.compression in compression_1.PARQUET_COMPRESSION_METHODS)) {
            throw new Error(`unsupported compression method: ${opts.compression}`);
        }
        /* add to schema */
        fieldList[name] = {
            name,
            primitiveType: typeDef.primitiveType,
            originalType: typeDef.originalType,
            path: cpath,
            key: cpath.join(),
            repetitionType,
            encoding: opts.encoding,
            compression: opts.compression,
            typeLength: opts.typeLength || typeDef.typeLength,
            rLevelMax,
            dLevelMax,
        };
    }
    return fieldList;
}
/**
 * Flatten a tree of field descriptors into a single array.
 *
 * Each descriptor is emitted before its own children (pre-order); children
 * are taken from `fields` only when the descriptor's `isNested` is truthy.
 *
 * @param fields Object mapping names to field descriptors.
 * @returns New array containing every descriptor in the tree.
 */
function listFields(fields) {
    const flattened = [];
    for (const key in fields) {
        const entry = fields[key];
        flattened.push(entry);
        if (!entry.isNested) {
            continue;
        }
        for (const descendant of listFields(entry.fields)) {
            flattened.push(descendant);
        }
    }
    return flattened;
}
//# sourceMappingURL=schema.js.map