UNPKG

@dobesv/parquets

Version:

TypeScript implementation of the Parquet file format, based on parquet.js

129 lines 4.4 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetSchema = void 0;
const codec_1 = require("./codec");
const compression_1 = require("./compression");
const shred_1 = require("./shred");
const types_1 = require("./types");

/**
 * A parquet file schema.
 *
 * Instances expose:
 *  - `schema`:    the definition object passed to the constructor (kept by reference),
 *  - `fields`:    the tree of field descriptors built from it, keyed by field name,
 *  - `fieldList`: the same descriptors flattened in pre-order (a nested group
 *                 comes before its children).
 */
class ParquetSchema {
    /**
     * Create a new schema from a JSON schema definition.
     *
     * @param {Object} schema - map of field name to field options. The object is
     *   stored by reference and default `encoding` / `compression` values are
     *   written back onto its primitive field entries.
     * @throws {Error} if a primitive field has an unknown type, encoding or
     *   compression method.
     */
    constructor(schema) {
        this.schema = schema;
        this.fields = buildFields(schema, 0, 0, []);
        this.fieldList = listFields(this.fields);
    }

    /**
     * Look up a field descriptor by path.
     *
     * @param {string|string[]} path - array of field names, or the same names
     *   joined with ',' (the format of a descriptor's `key`).
     * @returns {Object|undefined} the descriptor; `undefined` when only the last
     *   segment is unknown. An unknown or non-nested intermediate segment raises
     *   a TypeError, because the walk dereferences `.fields` on each step.
     */
    findField(path) {
        return toSegments(path).reduce((field, segment) => field.fields[segment], this);
    }

    /**
     * Collect every descriptor visited while walking `path`, outermost first.
     *
     * @param {string|string[]} path - array of field names or a ','-joined string.
     * @returns {Object[]} one entry per path segment.
     */
    findFieldBranch(path) {
        const branch = [];
        // The walk starts at the schema itself, which exposes `fields` just like
        // a nested field descriptor does.
        let field = this;
        for (const segment of toSegments(path)) {
            field = field.fields[segment];
            branch.push(field);
        }
        return branch;
    }

    /**
     * Shred `record` into `buffer` by delegating to `shredRecord` from ./shred.
     */
    shredRecord(record, buffer) {
        (0, shred_1.shredRecord)(this, record, buffer);
    }

    /**
     * Delegate to `materializeRecords` from ./shred and return its result.
     */
    materializeRecords(buffer) {
        return (0, shred_1.materializeRecords)(this, buffer);
    }

    /**
     * Set the compression method of every primitive field, both on the original
     * definition (`this.schema`) and on the built descriptors (`this.fields`).
     *
     * @param {string} type - a key of PARQUET_COMPRESSION_METHODS.
     * @returns {ParquetSchema} this schema, for chaining.
     * @throws {Error} if `type` is not a supported compression method. The check
     *   runs before anything is modified, so a rejected call leaves the schema
     *   untouched, and it mirrors the validation applied at construction time.
     */
    compress(type) {
        assertSupportedCompression(type);
        setCompress(this.schema, type);
        setCompress(this.fields, type);
        return this;
    }
}
exports.ParquetSchema = ParquetSchema;

/**
 * Normalize a field path to an array of segments.
 */
function toSegments(path) {
    return Array.isArray(path) ? path : path.split(',');
}

/**
 * Own enumerable keys of `obj`, or an empty list for null/undefined.
 *
 * Used in place of `for...in` so that enumerable properties inherited through
 * the prototype chain are never mistaken for schema fields, while keeping
 * `for...in`'s tolerance of a missing map.
 */
function ownKeys(obj) {
    return obj == null ? [] : Object.keys(obj);
}

/**
 * Throw unless `type` is a key of PARQUET_COMPRESSION_METHODS.
 */
function assertSupportedCompression(type) {
    if (!(type in compression_1.PARQUET_COMPRESSION_METHODS)) {
        throw new Error(`unsupported compression method: ${type}`);
    }
}

/**
 * Recursively assign `type` as the compression of every leaf entry in `schema`.
 * Entries that carry a `fields` map are treated as groups and descended into;
 * every other entry gets its `compression` property overwritten.
 */
function setCompress(schema, type) {
    for (const name of ownKeys(schema)) {
        const node = schema[name];
        if (node.fields) {
            setCompress(node.fields, type);
        }
        else {
            node.compression = type;
        }
    }
}

/**
 * Build the descriptor tree for one level of a schema definition.
 *
 * @param {Object} schema - map of field name to field options.
 * @param {number} rLevelParentMax - max repetition level of the enclosing group.
 * @param {number} dLevelParentMax - max definition level of the enclosing group.
 * @param {string[]} path - names of the enclosing groups, outermost first.
 * @returns {Object} map of field name to descriptor.
 * @throws {Error} on an unknown type, encoding or compression method.
 */
function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {
    const fieldList = {};
    for (const name of ownKeys(schema)) {
        const opts = schema[name];
        // Coerce the flags to real booleans once, so the level arithmetic below
        // and the repetition type agree even when a caller passes a truthy
        // non-boolean (e.g. 1 or "true"), which unary `+` would turn into a
        // wrong increment or NaN.
        const isRepeated = Boolean(opts.repeated);
        const isOptional = Boolean(opts.optional);
        // If this field is repeated, its rLevel is higher than its parent
        const rLevelMax = rLevelParentMax + (isRepeated ? 1 : 0);
        // If this field is optional or repeated, its dLevel is higher than its parent
        // For repeated fields, the dLevel is used to indicate there are no values at the given rLevel
        const dLevelMax = dLevelParentMax + (isOptional || isRepeated ? 1 : 0);
        const repetitionType = isRepeated
            ? 'REPEATED'
            : isOptional
                ? 'OPTIONAL'
                : 'REQUIRED';
        const cpath = path.concat([name]);

        /* nested field */
        if (opts.fields) {
            fieldList[name] = {
                name,
                path: cpath,
                key: cpath.join(),
                repetitionType,
                rLevelMax,
                dLevelMax,
                isNested: true,
                fieldCount: Object.keys(opts.fields).length,
                fields: buildFields(opts.fields, rLevelMax, dLevelMax, cpath),
            };
            continue;
        }

        /* primitive field */
        const typeDef = types_1.PARQUET_LOGICAL_TYPES[opts.type];
        if (!typeDef) {
            throw new Error(`invalid parquet type: ${opts.type}`);
        }
        // Defaults are written back onto the caller's definition, as before.
        opts.encoding = opts.encoding || 'PLAIN';
        if (!(opts.encoding in codec_1.PARQUET_CODEC)) {
            throw new Error(`unsupported parquet encoding: ${opts.encoding}`);
        }
        opts.compression = opts.compression || 'UNCOMPRESSED';
        assertSupportedCompression(opts.compression);

        /* add to schema */
        fieldList[name] = {
            name,
            primitiveType: typeDef.primitiveType,
            originalType: typeDef.originalType,
            path: cpath,
            key: cpath.join(),
            repetitionType,
            encoding: opts.encoding,
            compression: opts.compression,
            typeLength: opts.typeLength || typeDef.typeLength,
            rLevelMax,
            dLevelMax,
        };
    }
    return fieldList;
}

/**
 * Flatten a descriptor tree into a list, in pre-order: each nested group is
 * followed immediately by its descendants.
 */
function listFields(fields) {
    const list = [];
    appendFields(fields, list);
    return list;
}

/**
 * Append the descriptors of `fields`, and recursively of their nested groups,
 * to `out`. Appending into one shared array avoids re-copying the accumulated
 * list for every nested group.
 */
function appendFields(fields, out) {
    for (const key of ownKeys(fields)) {
        const field = fields[key];
        out.push(field);
        if (field.isNested) {
            appendFields(field.fields, out);
        }
    }
}
//# sourceMappingURL=schema.js.map