UNPKG

@dsnp/parquetjs

Version:

fully asynchronous, pure JavaScript implementation of the Parquet file format

238 lines (237 loc) 8.79 kB
"use strict";
// Module-interop helpers. Their shape matches the CommonJS helpers emitted by
// the TypeScript compiler; they are kept verbatim so `__importStar` below
// behaves exactly as before.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetSchema = void 0;
const parquet_codec = __importStar(require("./codec"));
const parquet_compression = __importStar(require("./compression"));
const parquet_types = __importStar(require("./types"));
const jsonSchema_1 = require("./jsonSchema");

/**
 * A parquet file schema.
 *
 * Instances expose:
 *  - `schema`:    the schema definition object passed to the constructor
 *  - `fields`:    the field tree produced by `buildFields`, keyed by field name
 *  - `fieldList`: every field of that tree flattened into one array, each
 *                 parent listed before its nested children
 */
class ParquetSchema {
    schema;
    fields;
    fieldList;

    /**
     * Create a new schema from JSON Schema (json-schema.org).
     *
     * @param {object} jsonSchema - JSON Schema document; converted by the
     *   `fromJsonSchema` helper of `./jsonSchema`.
     * @returns {ParquetSchema}
     */
    static fromJsonSchema(jsonSchema) {
        const schema = (0, jsonSchema_1.fromJsonSchema)(jsonSchema);
        return new ParquetSchema(schema);
    }

    /**
     * Create a new schema from a schema definition object.
     *
     * @param {object} schema - map of field name to field options.
     * @throws {string} newline-joined list of validation messages when any
     *   field definition is invalid (see `buildFields`).
     */
    constructor(schema) {
        this.schema = schema;
        this.fields = buildFields(schema);
        this.fieldList = listFields(this.fields);
    }

    /**
     * Retrieve a field definition.
     *
     * @param {string|string[]} path - comma-separated string or array of field
     *   names leading to the field. An array argument is not modified.
     * @returns {object|undefined} the entry found under the last path segment.
     */
    findField(path) {
        // Always work on a private copy so the caller's array is left intact.
        const segments = typeof path === 'string' ? path.split(',') : path.slice(0);
        const lastIndex = segments.length - 1;
        let node = this.fields;
        for (let i = 0; i < lastIndex; i++) {
            const children = node[segments[i]]?.fields;
            // A segment that is missing or has no children does not descend;
            // lookup simply continues from the current level.
            if (isDefined(children)) {
                node = children;
            }
        }
        // For an empty path this indexes with `undefined`, as it always has.
        return node[segments[Math.max(lastIndex, 0)]];
    }

    /**
     * Retrieve a field definition and all the field's ancestors.
     *
     * @param {string|string[]} path - comma-separated string or array of field
     *   names leading to the field. An array argument is not modified.
     * @returns {object[]} one entry per path segment, outermost first.
     * @throws {TypeError} when a path segment does not resolve to a field.
     */
    findFieldBranch(path) {
        // Clone array input, as findField does. The previous implementation
        // shifted elements off the caller's array, leaving it empty on return.
        const segments = typeof path === 'string' ? path.split(',') : path.slice(0);
        const lastIndex = segments.length - 1;
        const branch = [];
        let node = this.fields;
        for (let i = 0; i < segments.length; i++) {
            const field = node[segments[i]];
            branch.push(field);
            const children = field.fields;
            // Only descend while there are further segments to resolve.
            if (i < lastIndex && isDefined(children)) {
                node = children;
            }
        }
        return branch;
    }
}
exports.ParquetSchema = ParquetSchema;

/**
 * Recursively convert a schema definition into a tree of field descriptors.
 *
 * Per field, `rLevelMax` is the parent's value plus one when the field is
 * `repeated`; `dLevelMax` is the parent's value plus one when the field is
 * `optional`, or when it is `repeated` and not optional.
 *
 * Note: defaults for `encoding` ('PLAIN'), `compression` ('UNCOMPRESSED') and,
 * for DECIMAL columns, `scale` (0) are written back into the passed option
 * objects.
 *
 * @param {object} schema - map of field name to field options.
 * @param {number} [rLevelParentMax] - parent's rLevelMax (falsy means 0).
 * @param {number} [dLevelParentMax] - parent's dLevelMax (falsy means 0).
 * @param {string[]} [path] - names of the enclosing fields (falsy means []).
 * @returns {object} map of field name to field descriptor.
 * @throws {string} all validation messages for this level joined with '\n'.
 */
function buildFields(schema, rLevelParentMax, dLevelParentMax, path) {
    const rLevelBase = rLevelParentMax || 0;
    const dLevelBase = dLevelParentMax || 0;
    const parentPath = path || [];
    const fieldList = {};
    let fieldErrors = [];

    for (const name in schema) {
        const opts = schema[name];

        /* field repetition type */
        const required = !opts.optional;
        const repeated = !!opts.repeated;
        let rLevelMax = rLevelBase;
        let dLevelMax = dLevelBase;
        let repetitionType = 'REQUIRED';
        if (!required) {
            repetitionType = 'OPTIONAL';
            ++dLevelMax;
        }
        if (repeated) {
            // REPEATED takes precedence over OPTIONAL when both are set.
            repetitionType = 'REPEATED';
            ++rLevelMax;
            if (required) {
                ++dLevelMax;
            }
        }

        const fieldPath = parentPath.concat(name);

        /* nested field */
        if (opts.fields) {
            fieldList[name] = {
                name: name,
                path: fieldPath,
                repetitionType: repetitionType,
                rLevelMax: rLevelMax,
                dLevelMax: dLevelMax,
                isNested: true,
                statistics: opts.statistics,
                fieldCount: Object.keys(opts.fields).length,
                fields: buildFields(opts.fields, rLevelMax, dLevelMax, fieldPath),
                logicalType: opts.logicalType,
            };
            if (opts.type === 'LIST' || opts.type === 'MAP') {
                fieldList[name].originalType = opts.type;
            }
            continue;
        }

        // Column label used in error messages. The parent path is an array, so
        // interpolating it yields its segments joined by commas.
        let nameWithPath = `${name}` || 'missing name';
        if (parentPath.length > 0) {
            nameWithPath = `${parentPath}.${nameWithPath}`;
        }

        const typeDef = opts.type
            ? parquet_types.getParquetTypeDataObject(opts.type, opts)
            : undefined;
        if (!typeDef) {
            fieldErrors.push(`Invalid parquet type: ${opts.type || 'missing type'}, for Column: ${nameWithPath}`);
            continue;
        }

        /* field encoding */
        if (!opts.encoding) {
            opts.encoding = 'PLAIN';
        }
        if (!(opts.encoding in parquet_codec)) {
            fieldErrors.push(`Unsupported parquet encoding: ${opts.encoding}, for Column: ${nameWithPath}`);
        }

        /* field compression */
        if (!opts.compression) {
            opts.compression = 'UNCOMPRESSED';
        }
        if (!(opts.compression in parquet_compression.PARQUET_COMPRESSION_METHODS)) {
            fieldErrors.push(`Unsupported compression method: ${opts.compression}, for Column: ${nameWithPath}`);
        }

        if (typeDef.originalType === 'DECIMAL') {
            // Default scale to 0 per https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#decimal
            if (typeof opts.scale === 'undefined') {
                opts.scale = 0;
            }
            fieldErrors = fieldErrors.concat(errorsForDecimalOpts(typeDef.originalType, typeDef.primitiveType, opts, nameWithPath));
        }

        /* add to schema */
        fieldList[name] = {
            name: name,
            primitiveType: typeDef.primitiveType,
            originalType: typeDef.originalType,
            logicalType: opts.logicalType,
            path: fieldPath,
            repetitionType: repetitionType,
            encoding: opts.encoding,
            statistics: opts.statistics,
            compression: opts.compression,
            precision: opts.precision,
            scale: opts.scale,
            typeLength: opts.typeLength || typeDef.typeLength,
            rLevelMax: rLevelMax,
            dLevelMax: dLevelMax,
        };
    }

    if (fieldErrors.length > 0) {
        // Intentionally thrown as a plain string: this keeps the thrown value
        // identical for existing callers that inspect or compare it.
        throw fieldErrors.join('\n');
    }
    return fieldList;
}

/**
 * Flatten a field tree into an array, listing each field before its nested
 * children.
 *
 * @param {object} fields - map of field name to field descriptor.
 * @returns {object[]} all descriptors found in the tree.
 */
function listFields(fields) {
    const list = [];
    for (const k in fields) {
        const field = fields[k];
        list.push(field);
        const nestedFields = field.fields;
        if (field.isNested && isDefined(nestedFields)) {
            for (const nested of listFields(nestedFields)) {
                list.push(nested);
            }
        }
    }
    return list;
}

/**
 * @param {*} val
 * @returns {boolean} true unless `val` is exactly `undefined`.
 */
function isDefined(val) {
    return val !== undefined;
}

/**
 * Validate the `precision` and `scale` options of a DECIMAL column.
 *
 * @param {string} type - the column's original type, used in messages.
 * @param {string} primitiveType - the column's primitive type.
 * @param {object} opts - field options; `precision` and `scale` are read.
 * @param {string} columnName - column label used in messages.
 * @returns {string[]} validation messages; empty when the options are valid.
 */
function errorsForDecimalOpts(type, primitiveType, opts, columnName) {
    const fieldErrors = [];

    // At most one precision message is reported.
    if (opts.precision === undefined || opts.precision < 1) {
        fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, precision is required and must be be greater than 0`);
    } else if (!Number.isInteger(opts.precision)) {
        fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, precision must be an integer`);
    } else if (primitiveType === 'INT64' && opts.precision > 18) {
        fieldErrors.push(`invalid schema for type: ${type} and primitive type: ${primitiveType} for Column: ${columnName}, can not handle precision over 18`);
    }

    // At most one scale message is reported.
    if (typeof opts.scale === 'undefined' || opts.scale < 0) {
        fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale is required to be 0 or greater`);
    } else if (!Number.isInteger(opts.scale)) {
        fieldErrors.push(`invalid schema for type: ${type}, for Column: ${columnName}, scale must be an integer`);
    } else if (opts.precision !== undefined && opts.scale > opts.precision) {
        fieldErrors.push(`invalid schema or precision for type: ${type}, for Column: ${columnName}, precision must be greater than or equal to scale`);
    }

    return fieldErrors;
}