UNPKG

@dsnp/parquetjs

Version:

fully asynchronous, pure JavaScript implementation of the Parquet file format

678 lines (677 loc) 28.3 kB
"use strict"; var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) { if (k2 === undefined) k2 = k; var desc = Object.getOwnPropertyDescriptor(m, k); if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) { desc = { enumerable: true, get: function() { return m[k]; } }; } Object.defineProperty(o, k2, desc); }) : (function(o, m, k, k2) { if (k2 === undefined) k2 = k; o[k2] = m[k]; })); var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) { Object.defineProperty(o, "default", { enumerable: true, value: v }); }) : function(o, v) { o["default"] = v; }); var __importStar = (this && this.__importStar) || (function () { var ownKeys = function(o) { ownKeys = Object.getOwnPropertyNames || function (o) { var ar = []; for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k; return ar; }; return ownKeys(o); }; return function (mod) { if (mod && mod.__esModule) return mod; var result = {}; if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]); __setModuleDefault(result, mod); return result; }; })(); var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0; const stream_1 = __importDefault(require("stream")); const parquet_types_1 = __importStar(require("../gen-nodejs/parquet_types")); const parquet_shredder = __importStar(require("./shred")); const parquet_util = __importStar(require("./util")); const parquet_codec = __importStar(require("./codec")); const parquet_compression = __importStar(require("./compression")); const parquet_types = __importStar(require("./types")); const bloomFilterWriter = __importStar(require("./bloomFilterIO/bloomFilterWriter")); const node_int64_1 = __importDefault(require("node-int64")); /** * Parquet File Magic String */ const PARQUET_MAGIC = 'PAR1'; /** * Parquet File Format Version */ const PARQUET_VERSION = 1; /** * Default Page and Row Group sizes */ const PARQUET_DEFAULT_PAGE_SIZE = 8192; const PARQUET_DEFAULT_ROW_GROUP_SIZE = 4096; /** * Repetition and Definition Level Encoding */ const PARQUET_RDLVL_TYPE = 'INT32'; const PARQUET_RDLVL_ENCODING = 'RLE'; /** * Write a parquet file to an output stream. The ParquetWriter will perform * buffering/batching for performance, so close() must be called after all rows * are written. */ class ParquetWriter { schema; envelopeWriter; rowBuffer; rowGroupSize; closed; userMetadata; /** * Convenience method to create a new buffered parquet writer that writes to * the specified file */ static async openFile(schema, path, opts) { const outputStream = await parquet_util.osopen(path, opts); return ParquetWriter.openStream(schema, outputStream, opts); } /** * Convenience method to create a new buffered parquet writer that writes to * the specified stream */ static async openStream(schema, outputStream, opts) { if (!opts) { opts = {}; } const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, opts); return new ParquetWriter(schema, envelopeWriter, opts); } /** * Create a new buffered parquet writer for a given envelope writer */ constructor(schema, envelopeWriter, opts) { this.schema = schema; this.envelopeWriter = envelopeWriter; this.rowBuffer = {}; this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE; this.closed = false; this.userMetadata = {}; try { envelopeWriter.writeHeader(); } catch (err) { envelopeWriter.close(); throw err; } } /** * Append a single row to the parquet file. Rows are buffered in memory until * rowGroupSize rows are in the buffer or close() is called */ async appendRow(row) { if (this.closed || this.envelopeWriter === null) { throw 'writer was closed'; } parquet_shredder.shredRecord(this.schema, row, this.rowBuffer); const options = { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters, }; if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { await encodePages(this.schema, this.rowBuffer, options); } if (this.rowBuffer.rowCount >= this.rowGroupSize) { await encodePages(this.schema, this.rowBuffer, options); await this.envelopeWriter.writeRowGroup(this.rowBuffer); this.rowBuffer = {}; } } /** * Finish writing the parquet file and commit the footer to disk. This method * MUST be called after you are finished adding rows. You must not call this * method twice on the same object or add any rows after the close() method has * been called */ async close(callback) { if (this.closed) { throw 'writer was closed'; } this.closed = true; if (this.envelopeWriter) { if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { await encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2, bloomFilters: this.envelopeWriter.bloomFilters, }); await this.envelopeWriter.writeRowGroup(this.rowBuffer); this.rowBuffer = {}; } await this.envelopeWriter.writeBloomFilters(); await this.envelopeWriter.writeIndex(); await this.envelopeWriter.writeFooter(this.userMetadata); await this.envelopeWriter.close(); this.envelopeWriter = null; } if (callback) { callback(); } } /** * Add key<>value metadata to the file */ setMetadata(key, value) { this.userMetadata[key.toString()] = value.toString(); } /** * Set the parquet row group size. This values controls the maximum number * of rows that are buffered in memory at any given time as well as the number * of rows that are co-located on disk. A higher value is generally better for * read-time I/O performance at the tradeoff of write-time memory usage. */ setRowGroupSize(cnt) { this.rowGroupSize = cnt; } /** * Set the parquet data page size. The data page size controls the maximum * number of column values that are written to disk as a consecutive array */ setPageSize(cnt) { this.envelopeWriter.setPageSize(cnt); } } exports.ParquetWriter = ParquetWriter; /** * Create a parquet file from a schema and a number of row groups. This class * performs direct, unbuffered writes to the underlying output stream and is * intended for advanced and internal users; the writeXXX methods must be * called in the correct order to produce a valid file. */ class ParquetEnvelopeWriter { schema; write; close; offset; rowCount; rowGroups; pageSize; useDataPageV2; pageIndex; bloomFilters; // TODO: OR filterCollection /** * Create a new parquet envelope writer that writes to the specified stream */ static async openStream(schema, outputStream, opts) { const writeFn = parquet_util.oswrite.bind(undefined, outputStream); const closeFn = parquet_util.osend.bind(undefined, outputStream); return new ParquetEnvelopeWriter(schema, writeFn, closeFn, new node_int64_1.default(0), opts); } constructor(schema, writeFn, closeFn, fileOffset, opts) { this.schema = schema; this.write = writeFn; this.close = closeFn; this.offset = fileOffset; this.rowCount = new node_int64_1.default(0); this.rowGroups = []; this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; this.useDataPageV2 = 'useDataPageV2' in opts ? opts.useDataPageV2 : true; this.pageIndex = opts.pageIndex; this.bloomFilters = {}; (opts.bloomFilters || []).forEach((bloomOption) => { this.bloomFilters[bloomOption.column] = bloomFilterWriter.createSBBF(bloomOption); }); } writeSection(buf) { this.offset.setValue(this.offset.valueOf() + buf.length); return this.write(buf); } /** * Encode the parquet file header */ writeHeader() { return this.writeSection(Buffer.from(PARQUET_MAGIC)); } /** * Encode a parquet row group. The records object should be created using the * shredRecord method */ async writeRowGroup(records) { const rgroup = await encodeRowGroup(this.schema, records, { baseOffset: this.offset, pageSize: this.pageSize, useDataPageV2: this.useDataPageV2, pageIndex: this.pageIndex, }); this.rowCount.setValue(this.rowCount.valueOf() + records.rowCount); this.rowGroups.push(rgroup.metadata); return this.writeSection(rgroup.body); } writeBloomFilters() { this.rowGroups.forEach((group) => { group.columns.forEach((column) => { if (!column.meta_data?.path_in_schema.length) { return; } const filterName = column.meta_data?.path_in_schema.join(','); if (!(filterName in this.bloomFilters)) { return; } const serializedBloomFilterData = bloomFilterWriter.getSerializedBloomFilterData(this.bloomFilters[filterName]); bloomFilterWriter.setFilterOffset(column, this.offset); this.writeSection(serializedBloomFilterData); }); }); } /** * Write the columnIndices and offsetIndices */ writeIndex() { this.schema.fieldList.forEach((c, i) => { this.rowGroups.forEach((group) => { const column = group.columns[i]; if (!column) return; if (column.meta_data?.columnIndex) { const columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); delete column.meta_data.columnIndex; column.column_index_offset = parquet_util.cloneInteger(this.offset); column.column_index_length = columnBody.length; this.writeSection(columnBody); } if (column.meta_data?.offsetIndex) { const offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); delete column.meta_data.offsetIndex; column.offset_index_offset = parquet_util.cloneInteger(this.offset); column.offset_index_length = offsetBody.length; this.writeSection(offsetBody); } }); }); } /** * Write the parquet file footer */ writeFooter(userMetadata) { if (!userMetadata) { userMetadata = {}; } if (this.schema.fieldList.length === 0) { throw 'cannot write parquet file with zero fieldList'; } return this.writeSection(encodeFooter(this.schema, this.rowCount, this.rowGroups, userMetadata)); } /** * Set the parquet data page size. The data page size controls the maximum * number of column values that are written to disk as a consecutive array */ setPageSize(cnt) { this.pageSize = cnt; } } exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter; /** * Create a parquet transform stream */ class ParquetTransformer extends stream_1.default.Transform { writer; constructor(schema, opts = {}) { super({ objectMode: true }); const writeProxy = (function (t) { return function (b) { t.push(b); }; })(this); this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeProxy, () => { /* void */ }, new node_int64_1.default(0), opts), opts); } _transform(row, _encoding, callback) { if (row) { this.writer.appendRow(row).then((data) => callback(null, data), (err) => { const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${row}`); fullErr.message = err; callback(fullErr); }); } else { callback(); } } _flush(callback) { this.writer.close().then((d) => callback(null, d), callback); } } exports.ParquetTransformer = ParquetTransformer; /** * Encode a consecutive array of data using one of the parquet encodings */ function encodeValues(type, encoding, values, opts) { if (!(encoding in parquet_codec)) { throw 'invalid encoding: ' + encoding; } return parquet_codec[encoding].encodeValues(type, values, opts); } function encodeStatisticsValue(value, column) { if (value === undefined) { return Buffer.alloc(0); } if (column.originalType) { value = parquet_types.toPrimitive(column.originalType, value, column); } if (column.primitiveType !== 'BYTE_ARRAY') { value = encodeValues(column.primitiveType, 'PLAIN', [value], column); } return value; } function encodeStatistics(statistics, column) { statistics = Object.assign({}, statistics); statistics.min_value = statistics.min_value === undefined ? null : encodeStatisticsValue(statistics.min_value, column); statistics.max_value = statistics.max_value === undefined ? null : encodeStatisticsValue(statistics.max_value, column); statistics.max = statistics.max_value; statistics.min = statistics.min_value; return new parquet_types_1.default.Statistics(statistics); } async function encodePages(schema, rowBuffer, opts) { // generic if (!rowBuffer.pageRowCount) { return; } for (const field of schema.fieldList) { if (field.isNested) { continue; } let page; const columnPath = field.path.join(','); const values = rowBuffer.columnData[columnPath]; if (opts.bloomFilters && columnPath in opts.bloomFilters) { const splitBlockBloomFilter = opts.bloomFilters[columnPath]; values.values.forEach((v) => splitBlockBloomFilter.insert(v)); } let statistics = {}; if (field.statistics !== false) { statistics = {}; [...values.distinct_values].forEach((v, i) => { if (i === 0 || v > statistics.max_value) { statistics.max_value = v; } if (i === 0 || v < statistics.min_value) { statistics.min_value = v; } }); statistics.null_count = new node_int64_1.default(values.dlevels.length - values.values.length); statistics.distinct_count = new node_int64_1.default(values.distinct_values.size); } if (opts.useDataPageV2) { page = await encodeDataPageV2(field, values.count, values.values, values.rlevels, values.dlevels, statistics); } else { page = await encodeDataPage(field, values.values || [], values.rlevels || [], values.dlevels || [], statistics); } const pages = rowBuffer.pages[field.path.join(',')]; const lastPage = pages[pages.length - 1]; const first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; pages.push({ page, statistics, first_row_index, distinct_values: values.distinct_values, num_values: values.dlevels.length, }); values.distinct_values = new Set(); values.values = []; values.rlevels = []; values.dlevels = []; values.count = 0; } rowBuffer.pageRowCount = 0; } /** * Encode a parquet data page */ async function encodeDataPage(column, values, rlevels, dlevels, statistics) { /* encode values */ const valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { bitWidth: column.typeLength, ...column, }); /* encode repetition and definition levels */ let rLevelsBuf = Buffer.alloc(0); if (column.rLevelMax > 0) { rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, rlevels, { bitWidth: parquet_util.getBitWidth(column.rLevelMax), }); } let dLevelsBuf = Buffer.alloc(0); if (column.dLevelMax > 0) { dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, dlevels, { bitWidth: parquet_util.getBitWidth(column.dLevelMax), }); } /* build page header */ const initialPageBody = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]); const pageBody = await parquet_compression.deflate(column.compression, initialPageBody); const pageHeader = new parquet_types_1.default.PageHeader(); pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE']; pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; pageHeader.compressed_page_size = pageBody.length; pageHeader.data_page_header = new parquet_types_1.default.DataPageHeader(); pageHeader.data_page_header.num_values = dlevels.length; if (column.statistics !== false) { pageHeader.data_page_header.statistics = encodeStatistics(statistics, column); } pageHeader.data_page_header.encoding = parquet_types_1.default.Encoding[column.encoding]; pageHeader.data_page_header.definition_level_encoding = parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; pageHeader.data_page_header.repetition_level_encoding = parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]; /* concat page header, repetition and definition levels and values */ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]); } /** * Encode a parquet data page (v2) */ async function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) { /* encode values */ const valuesBuf = encodeValues(column.primitiveType, column.encoding, values, { bitWidth: column.typeLength, ...column, }); const valuesBufCompressed = await parquet_compression.deflate(column.compression, valuesBuf); /* encode repetition and definition levels */ let rLevelsBuf = Buffer.alloc(0); if (column.rLevelMax > 0) { rLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, rlevels, { bitWidth: parquet_util.getBitWidth(column.rLevelMax), disableEnvelope: true, }); } let dLevelsBuf = Buffer.alloc(0); if (column.dLevelMax > 0) { dLevelsBuf = encodeValues(PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, dlevels, { bitWidth: parquet_util.getBitWidth(column.dLevelMax), disableEnvelope: true, }); } /* build page header */ const pageHeader = new parquet_types_1.default.PageHeader(); pageHeader.type = parquet_types_1.default.PageType['DATA_PAGE_V2']; pageHeader.data_page_header_v2 = new parquet_types_1.default.DataPageHeaderV2(); pageHeader.data_page_header_v2.num_values = dlevels.length; pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length; pageHeader.data_page_header_v2.num_rows = rowCount; if (column.statistics !== false) { pageHeader.data_page_header_v2.statistics = encodeStatistics(statistics, column); } pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; pageHeader.compressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; pageHeader.data_page_header_v2.encoding = parquet_types_1.default.Encoding[column.encoding]; pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length; pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length; pageHeader.data_page_header_v2.is_compressed = column.compression !== 'UNCOMPRESSED'; /* concat page header, repetition and definition levels and values */ return Buffer.concat([parquet_util.serializeThrift(pageHeader), rLevelsBuf, dLevelsBuf, valuesBufCompressed]); } /** * Encode an array of values into a parquet column chunk */ async function encodeColumnChunk(pages, opts) { const pagesBuf = Buffer.concat(pages.map((d) => d.page)); const num_values = pages.reduce((p, d) => p + d.num_values, 0); let offset = opts.baseOffset; /* prepare metadata header */ const metadata = new parquet_types_1.default.ColumnMetaData(); metadata.path_in_schema = opts.column.path; metadata.num_values = new node_int64_1.default(num_values); metadata.data_page_offset = new node_int64_1.default(opts.baseOffset); metadata.encodings = []; metadata.total_uncompressed_size = new node_int64_1.default(pagesBuf.length); metadata.total_compressed_size = new node_int64_1.default(pagesBuf.length); metadata.type = parquet_types_1.default.Type[opts.column.primitiveType]; metadata.codec = await parquet_types_1.default.CompressionCodec[opts.column.compression]; /* compile statistics ColumnIndex and OffsetIndex*/ const columnIndex = new parquet_types_1.default.ColumnIndex(); columnIndex.null_pages = []; columnIndex.max_values = []; columnIndex.min_values = []; // Default to unordered columnIndex.boundary_order = 0; const offsetIndex = new parquet_types_1.default.OffsetIndex(); offsetIndex.page_locations = []; /* prepare statistics */ const statistics = {}; const distinct_values = new Set(); statistics.null_count = new node_int64_1.default(0); statistics.distinct_count = new node_int64_1.default(0); /* loop through pages and update indices and statistics */ for (let i = 0; i < pages.length; i++) { const page = pages[i]; if (opts.column.statistics !== false) { if (page.statistics.max_value > statistics.max_value || i == 0) { statistics.max_value = page.statistics.max_value; } if (page.statistics.min_value < statistics.min_value || i == 0) { statistics.min_value = page.statistics.min_value; } statistics.null_count.setValue(statistics.null_count.valueOf() + (page.statistics.null_count?.valueOf() || 0)); page.distinct_values.forEach((value) => distinct_values.add(value)); // If the number of values and the count of nulls are the same, this is a null page columnIndex.null_pages.push(page.num_values === statistics.null_count.valueOf()); columnIndex.max_values.push(encodeStatisticsValue(page.statistics.max_value, opts.column)); columnIndex.min_values.push(encodeStatisticsValue(page.statistics.min_value, opts.column)); } const pageLocation = new parquet_types_1.default.PageLocation(); pageLocation.offset = new node_int64_1.default(offset); offset += page.page.length; pageLocation.compressed_page_size = page.page.length; pageLocation.first_row_index = new node_int64_1.default(page.first_row_index); offsetIndex.page_locations.push(pageLocation); } if (opts.pageIndex !== false) { metadata.offsetIndex = offsetIndex; } if (opts.column.statistics !== false) { statistics.distinct_count = new node_int64_1.default(distinct_values.size); metadata.statistics = encodeStatistics(statistics, opts.column); if (opts.pageIndex !== false) { metadata.columnIndex = columnIndex; } } /* list encodings */ metadata.encodings.push(parquet_types_1.default.Encoding[PARQUET_RDLVL_ENCODING]); metadata.encodings.push(parquet_types_1.default.Encoding[opts.column.encoding]); /* concat metadata header and data pages */ const metadataOffset = opts.baseOffset + pagesBuf.length; const body = Buffer.concat([pagesBuf, parquet_util.serializeThrift(metadata)]); return { body, metadata, metadataOffset }; } /** * Encode a list of column values into a parquet row group */ async function encodeRowGroup(schema, data, opts) { const metadata = new parquet_types_1.default.RowGroup(); metadata.num_rows = new node_int64_1.default(data.rowCount); metadata.columns = []; metadata.total_byte_size = new node_int64_1.default(0); let body = Buffer.alloc(0); for (const field of schema.fieldList) { if (field.isNested) { continue; } const cchunkData = await encodeColumnChunk(data.pages[field.path.join(',')], { column: field, baseOffset: opts.baseOffset.valueOf() + body.length, pageSize: opts.pageSize || 0, rowCount: data.rowCount || 0, useDataPageV2: opts.useDataPageV2 ?? true, pageIndex: opts.pageIndex ?? true, }); const cchunk = new parquet_types_1.default.ColumnChunk(); cchunk.file_offset = new node_int64_1.default(cchunkData.metadataOffset); cchunk.meta_data = cchunkData.metadata; metadata.columns.push(cchunk); metadata.total_byte_size = new node_int64_1.default(metadata.total_byte_size.valueOf() + cchunkData.body.length); body = Buffer.concat([body, cchunkData.body]); } return { body, metadata }; } /** * Encode a parquet file metadata footer */ function encodeFooter(schema, rowCount, rowGroups, userMetadata) { const metadata = new parquet_types_1.default.FileMetaData(); metadata.version = PARQUET_VERSION; metadata.created_by = '@dsnp/parquetjs'; metadata.num_rows = rowCount; metadata.row_groups = rowGroups; metadata.schema = []; metadata.key_value_metadata = []; for (const k in userMetadata) { const kv = new parquet_types_1.default.KeyValue(); kv.key = k; kv.value = userMetadata[k]; metadata.key_value_metadata.push(kv); } { const schemaRoot = new parquet_types_1.default.SchemaElement(); schemaRoot.name = 'root'; schemaRoot.num_children = Object.keys(schema.fields).length; metadata.schema.push(schemaRoot); } for (const field of schema.fieldList) { const schemaElem = new parquet_types_1.default.SchemaElement(); schemaElem.name = field.name; schemaElem.repetition_type = parquet_types_1.default.FieldRepetitionType[field.repetitionType]; if (field.isNested) { schemaElem.num_children = field.fieldCount; } else { schemaElem.type = parquet_types_1.default.Type[field.primitiveType]; } if (field.originalType) { schemaElem.converted_type = parquet_types_1.default.ConvertedType[field.originalType]; } // Support Decimal switch (schemaElem.converted_type) { case parquet_types_1.ConvertedType.DECIMAL: schemaElem.precision = field.precision; schemaElem.scale = field.scale || 0; break; } schemaElem.type_length = field.typeLength; metadata.schema.push(schemaElem); } const metadataEncoded = parquet_util.serializeThrift(metadata); const footerEncoded = Buffer.alloc(metadataEncoded.length + 8); metadataEncoded.copy(footerEncoded); footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length); footerEncoded.write(PARQUET_MAGIC, metadataEncoded.length + 4); return footerEncoded; }