UNPKG

parquetjs-lite

Version:

fully asynchronous, pure JavaScript implementation of the Parquet file format

752 lines (636 loc) 22.2 kB
'use strict'; const stream = require('stream'); const parquet_thrift = require('../gen-nodejs/parquet_types') const parquet_shredder = require('./shred') const parquet_util = require('./util') const parquet_codec = require('./codec') const parquet_compression = require('./compression') const parquet_types = require('./types'); const exportMetadata = require('./exportMetadata'); /** * Parquet File Magic String */ const PARQUET_MAGIC = 'PAR1'; /** * Parquet File Format Version */ const PARQUET_VERSION = 1; /** * Default Page and Row Group sizes */ const PARQUET_DEFAULT_PAGE_SIZE = 8192; const PARQUET_DEFAULT_ROW_GROUP_SIZE = 4096; /** * Repetition and Definition Level Encoding */ const PARQUET_RDLVL_TYPE = 'INT32'; const PARQUET_RDLVL_ENCODING = 'RLE'; /** * Write a parquet file to an output stream. The ParquetWriter will perform * buffering/batching for performance, so close() must be called after all rows * are written. */ class ParquetWriter { /** * Convenience method to create a new buffered parquet writer that writes to * the specified file */ static async openFile(schema, path, opts) { let outputStream = await parquet_util.osopen(path, opts); return ParquetWriter.openStream(schema, outputStream, opts); } /** * Convenience method to create a new buffered parquet writer that writes to * the specified stream */ static async openStream(schema, outputStream, opts) { if (!opts) { opts = {}; } let envelopeWriter = await ParquetEnvelopeWriter.openStream( schema, outputStream, opts); return new ParquetWriter(schema, envelopeWriter, opts); } /** * Create a new buffered parquet writer for a given envelope writer */ constructor(schema, envelopeWriter, opts) { this.schema = schema; this.envelopeWriter = envelopeWriter; this.rowBuffer = {}; this.rowGroupSize = opts.rowGroupSize || PARQUET_DEFAULT_ROW_GROUP_SIZE; this.closed = false; this.userMetadata = {}; try { envelopeWriter.writeHeader(); } catch (err) { envelopeWriter.close(); throw err; } } /** * Append a single row to the parquet file. Rows are buffered in memory until * rowGroupSize rows are in the buffer or close() is called */ async appendRow(row) { if (this.closed) { throw 'writer was closed'; } parquet_shredder.shredRecord(this.schema, row, this.rowBuffer); if (this.rowBuffer.pageRowCount >= this.envelopeWriter.pageSize) { encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2}); } if (this.rowBuffer.rowCount >= this.rowGroupSize) { encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2}); await this.envelopeWriter.writeRowGroup(this.rowBuffer); this.rowBuffer = {}; } } /** * Finish writing the parquet file and commit the footer to disk. This method * MUST be called after you are finished adding rows. You must not call this * method twice on the same object or add any rows after the close() method has * been called */ async close(callback) { if (this.closed) { throw 'writer was closed'; } this.closed = true; if (this.rowBuffer.rowCount > 0 || this.rowBuffer.rowCount >= this.rowGroupSize) { encodePages(this.schema, this.rowBuffer, { useDataPageV2: this.envelopeWriter.useDataPageV2}); await this.envelopeWriter.writeRowGroup(this.rowBuffer); this.rowBuffer = {}; } await this.envelopeWriter.writeIndex(); await this.envelopeWriter.writeFooter(this.userMetadata); await this.envelopeWriter.close(); if (callback) { callback(); } } /** * Add key<>value metadata to the file */ setMetadata(key, value) { this.userMetadata[key.toString()] = value.toString(); } /** * Set the parquet row group size. This values controls the maximum number * of rows that are buffered in memory at any given time as well as the number * of rows that are co-located on disk. A higher value is generally better for * read-time I/O performance at the tradeoff of write-time memory usage. */ setRowGroupSize(cnt) { this.rowGroupSize = cnt; } /** * Set the parquet data page size. The data page size controls the maximum * number of column values that are written to disk as a consecutive array */ setPageSize(cnt) { this.writer.setPageSize(cnt); } } /** * Create a parquet file from a schema and a number of row groups. This class * performs direct, unbuffered writes to the underlying output stream and is * intendend for advanced and internal users; the writeXXX methods must be * called in the correct order to produce a valid file. */ class ParquetEnvelopeWriter { /** * Create a new parquet envelope writer that writes to the specified stream */ static async openStream(schema, outputStream, opts) { let writeFn = parquet_util.oswrite.bind(undefined, outputStream); let closeFn = parquet_util.osend.bind(undefined, outputStream); return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts); } constructor(schema, writeFn, closeFn, fileOffset, opts) { this.schema = schema; this.write = writeFn, this.close = closeFn; this.offset = fileOffset; this.rowCount = 0; this.rowGroups = []; this.pageSize = opts.pageSize || PARQUET_DEFAULT_PAGE_SIZE; this.useDataPageV2 = ("useDataPageV2" in opts) ? opts.useDataPageV2 : true; this.pageIndex = opts.pageIndex; } writeSection(buf) { this.offset += buf.length; return this.write(buf); } /** * Encode the parquet file header */ writeHeader() { return this.writeSection(Buffer.from(PARQUET_MAGIC)); } /** * Encode a parquet row group. The records object should be created using the * shredRecord method */ writeRowGroup(records) { let rgroup = encodeRowGroup( this.schema, records, { baseOffset: this.offset, pageSize: this.pageSize, useDataPageV2: this.useDataPageV2, pageIndex: this.pageIndex }); this.rowCount += records.rowCount; this.rowGroups.push(rgroup.metadata); return this.writeSection(rgroup.body); } /** * Write the columnIndices and offsetIndices */ writeIndex(_rowGroups) { let rowGroups = _rowGroups || this.rowGroups; this.schema.fieldList.forEach( (c,i) => { rowGroups.forEach(group => { let column = group.columns[i]; if (!column) return; if (column.meta_data.columnIndex) { let columnBody = parquet_util.serializeThrift(column.meta_data.columnIndex); delete column.meta_data.columnIndex; column.column_index_offset = this.offset; column.column_index_length = columnBody.length; this.writeSection(columnBody); } if (column.meta_data.offsetIndex) { let offsetBody = parquet_util.serializeThrift(column.meta_data.offsetIndex); delete column.meta_data.offsetIndex; column.offset_index_offset = this.offset; column.offset_index_length = offsetBody.length; this.writeSection(offsetBody); } }); }); } /** * Write the parquet file footer */ writeFooter(userMetadata, schema, rowCount, rowGroups) { if (!userMetadata) { userMetadata = {}; } if (this.schema.fieldList.length === 0) { throw 'cannot write parquet file with zero fieldList'; } return this.writeSection( encodeFooter(this.schema, this.rowCount, this.rowGroups, userMetadata)); } exportMetadata(indent) { const metadata = getMetadata(this.schema, this.rowCount, this.rowGroups, {}); console.log('metadata',metadata) return exportMetadata(metadata, indent || 2); } /** * Set the parquet data page size. The data page size controls the maximum * number of column values that are written to disk as a consecutive array */ setPageSize(cnt) { this.pageSize = cnt; } } /** * Create a parquet transform stream */ class ParquetTransformer extends stream.Transform { constructor(schema, opts = {}) { super({ objectMode: true }); let writeProxy = (function(t) { return function(b) { t.push(b); } })(this); this.writer = new ParquetWriter( schema, new ParquetEnvelopeWriter(schema, writeProxy, function() {}, 0, opts), opts); } _transform(row, encoding, callback) { if (row) { this.writer.appendRow(row).then( data => callback(null, data), err => { const fullErr = new Error(`Error transforming to parquet: ${err.toString()} row:${JSON.stringify(row)}`); fullErr.origErr = err; callback(fullErr); } ); } else { callback(); } } _flush(callback) { this.writer.close(callback) .then(d => callback(null, d), callback); } } /** * Encode a consecutive array of data using one of the parquet encodings */ function encodeValues(type, encoding, values, opts) { if (!(encoding in parquet_codec)) { throw 'invalid encoding: ' + encoding; } return parquet_codec[encoding].encodeValues(type, values, opts); } function encodeStatisticsValue(value, column) { if (value === undefined) { return Buffer.alloc(0); } if (column.originalType) { value = parquet_types.toPrimitive(column.originalType,value); } if (column.primitiveType !== 'BYTE_ARRAY') { value = encodeValues(column.primitiveType,'PLAIN',[value],column); } return value; } function encodeStatistics(statistics,column) { statistics = Object.assign({},statistics); statistics.min_value = statistics.min_value === undefined ? null : encodeStatisticsValue(statistics.min_value, column); statistics.max_value = statistics.max_value === undefined ? null : encodeStatisticsValue(statistics.max_value, column); statistics.max = statistics.max_value; statistics.min = statistics.min_value; return new parquet_thrift.Statistics(statistics); } function encodePages(schema, rowBuffer, opts) { if (!rowBuffer.pageRowCount) { return; } for (let field of schema.fieldList) { if (field.isNested) { continue; } let page; const values = rowBuffer.columnData[field.path]; let statistics; if (field.statistics !== false) { statistics = {}; [...values.distinct_values].forEach( (v,i) => { if (i === 0 || v > statistics.max_value) { statistics.max_value = v; } if (i === 0 || v < statistics.min_value) { statistics.min_value = v; } }); statistics.null_count = values.dlevels.length - values.values.length; statistics.distinct_count = values.distinct_values.size; } if (opts.useDataPageV2) { page = encodeDataPageV2( field, values.count, values.values, values.rlevels, values.dlevels, statistics); } else { page = encodeDataPage( field, values.values, values.rlevels, values.dlevels, statistics); } let pages = rowBuffer.pages[field.path]; let lastPage = pages[pages.length-1]; let first_row_index = lastPage ? lastPage.first_row_index + lastPage.count : 0; pages.push({ page, statistics, first_row_index, distinct_values: values.distinct_values, num_values: values.dlevels.length }); values.distinct_values = new Set(); values.values = []; values.rlevels = []; values.dlevels = []; values.count = 0; } rowBuffer.pageRowCount = 0; } /** * Encode a parquet data page */ function encodeDataPage(column, values, rlevels, dlevels, statistics) { /* encode values */ let valuesBuf = encodeValues( column.primitiveType, column.encoding, values, { typeLength: column.typeLength, bitWidth: column.typeLength }); /* encode repetition and definition levels */ let rLevelsBuf = Buffer.alloc(0); if (column.rLevelMax > 0) { rLevelsBuf = encodeValues( PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, rlevels, { bitWidth: parquet_util.getBitWidth(column.rLevelMax) }); } let dLevelsBuf = Buffer.alloc(0); if (column.dLevelMax > 0) { dLevelsBuf = encodeValues( PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, dlevels, { bitWidth: parquet_util.getBitWidth(column.dLevelMax) }); } /* build page header */ let pageBody = Buffer.concat([rLevelsBuf, dLevelsBuf, valuesBuf]); pageBody = parquet_compression.deflate( column.compression, pageBody); let pageHeader = new parquet_thrift.PageHeader(); pageHeader.type = parquet_thrift.PageType['DATA_PAGE']; pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; pageHeader.compressed_page_size = pageBody.length pageHeader.data_page_header = new parquet_thrift.DataPageHeader(); pageHeader.data_page_header.num_values = dlevels.length; if (column.statistics !== false) { pageHeader.data_page_header.statistics = encodeStatistics(statistics, column); } pageHeader.data_page_header.encoding = parquet_thrift.Encoding[column.encoding]; pageHeader.data_page_header.definition_level_encoding = parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; pageHeader.data_page_header.repetition_level_encoding = parquet_thrift.Encoding[PARQUET_RDLVL_ENCODING]; /* concat page header, repetition and definition levels and values */ return Buffer.concat([parquet_util.serializeThrift(pageHeader), pageBody]); } /** * Encode a parquet data page (v2) */ function encodeDataPageV2(column, rowCount, values, rlevels, dlevels, statistics) { /* encode values */ let valuesBuf = encodeValues( column.primitiveType, column.encoding, values, { typeLength: column.typeLength, bitWidth: column.typeLength }); let valuesBufCompressed = parquet_compression.deflate( column.compression, valuesBuf); /* encode repetition and definition levels */ let rLevelsBuf = Buffer.alloc(0); if (column.rLevelMax > 0) { rLevelsBuf = encodeValues( PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, rlevels, { bitWidth: parquet_util.getBitWidth(column.rLevelMax), disableEnvelope: true }); } let dLevelsBuf = Buffer.alloc(0); if (column.dLevelMax > 0) { dLevelsBuf = encodeValues( PARQUET_RDLVL_TYPE, PARQUET_RDLVL_ENCODING, dlevels, { bitWidth: parquet_util.getBitWidth(column.dLevelMax), disableEnvelope: true }); } /* build page header */ let pageHeader = new parquet_thrift.PageHeader(); pageHeader.type = parquet_thrift.PageType['DATA_PAGE_V2']; pageHeader.data_page_header_v2 = new parquet_thrift.DataPageHeaderV2(); pageHeader.data_page_header_v2.num_values = dlevels.length; pageHeader.data_page_header_v2.num_nulls = dlevels.length - values.length; pageHeader.data_page_header_v2.num_rows = rowCount; if (column.statistics !== false) { pageHeader.data_page_header_v2.statistics = encodeStatistics(statistics, column); } pageHeader.uncompressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBuf.length; pageHeader.compressed_page_size = rLevelsBuf.length + dLevelsBuf.length + valuesBufCompressed.length; pageHeader.data_page_header_v2.encoding = parquet_thrift.Encoding[column.encoding]; pageHeader.data_page_header_v2.definition_levels_byte_length = dLevelsBuf.length; pageHeader.data_page_header_v2.repetition_levels_byte_length = rLevelsBuf.length; pageHeader.data_page_header_v2.is_compressed = column.compression !== 'UNCOMPRESSED'; /* concat page header, repetition and definition levels and values */ return Buffer.concat([ parquet_util.serializeThrift(pageHeader), rLevelsBuf, dLevelsBuf, valuesBufCompressed]); } /** * Encode an array of values into a parquet column chunk */ function encodeColumnChunk(pages, opts) { let pagesBuf = Buffer.concat(pages.map(d => d.page)); let num_values = pages.reduce((p,d) => p + d.num_values, 0); let offset = opts.baseOffset; /* prepare metadata header */ let metadata = new parquet_thrift.ColumnMetaData(); metadata.path_in_schema = opts.column.path; metadata.num_values = num_values; metadata.data_page_offset = opts.baseOffset; metadata.encodings = []; metadata.total_uncompressed_size = pagesBuf.length; metadata.total_compressed_size = pagesBuf.length; metadata.type = parquet_thrift.Type[opts.column.primitiveType]; metadata.codec = parquet_thrift.CompressionCodec[opts.column.compression]; /* compile statistics ColumnIndex and OffsetIndex*/ let columnIndex = new parquet_thrift.ColumnIndex(); columnIndex.max_values = []; columnIndex.min_values = []; let offsetIndex = new parquet_thrift.OffsetIndex(); offsetIndex.page_locations = []; /* prepare statistics */ let statistics = {}; let distinct_values = new Set(); statistics.null_count = 0; statistics.distinct_count = 0; /* loop through pages and update indices and statistics */ for (let i = 0; i < pages.length; i++) { let page = pages[i]; if (opts.column.statistics !== false) { if (page.statistics.max_value > statistics.max_value || i == 0) { statistics.max_value = page.statistics.max_value; } if (page.statistics.min_value < statistics.min_value || i == 0) { statistics.min_value = page.statistics.min_value; } statistics.null_count += page.statistics.null_count; page.distinct_values.forEach(value => distinct_values.add(value)); columnIndex.max_values.push( encodeStatisticsValue(page.statistics.max_value, opts.column) ); columnIndex.min_values.push( encodeStatisticsValue(page.statistics.min_value, opts.column) ); } let pageLocation = new parquet_thrift.PageLocation(); pageLocation.offset = offset; offset += page.page.length; pageLocation.compressed_page_size = page.page.length; pageLocation.first_row_index = page.first_row_index; offsetIndex.page_locations.push(pageLocation); } if (opts.pageIndex !== false) { metadata.offsetIndex = offsetIndex; } if (opts.column.statistics !== false) { statistics.distinct_count = distinct_values.size; metadata.statistics = encodeStatistics(statistics, opts.column); if (opts.pageIndex !== false) { metadata.columnIndex = columnIndex; } } /* list encodings */ let encodingsSet = {}; encodingsSet[PARQUET_RDLVL_ENCODING] = true; encodingsSet[opts.column.encoding] = true; for (let k in encodingsSet) { metadata.encodings.push(parquet_thrift.Encoding[k]); } /* concat metadata header and data pages */ let metadataOffset = opts.baseOffset + pagesBuf.length; let body = Buffer.concat([pagesBuf, parquet_util.serializeThrift(metadata)]); return { body, metadata, metadataOffset }; } /** * Encode a list of column values into a parquet row group */ function encodeRowGroup(schema, data, opts) { let metadata = new parquet_thrift.RowGroup(); metadata.num_rows = data.rowCount; metadata.columns = []; metadata.total_byte_size = 0; let body = Buffer.alloc(0); for (let field of schema.fieldList) { if (field.isNested) { continue; } let cchunkData = encodeColumnChunk( data.pages[field.path], { column: field, baseOffset: opts.baseOffset + body.length, pageSize: opts.pageSize, encoding: field.encoding, rowCount: data.rowCount, useDataPageV2: opts.useDataPageV2, pageIndex: opts.pageIndex }); let cchunk = new parquet_thrift.ColumnChunk(); cchunk.file_offset = cchunkData.metadataOffset; cchunk.meta_data = cchunkData.metadata; metadata.columns.push(cchunk); metadata.total_byte_size += cchunkData.body.length; body = Buffer.concat([body, cchunkData.body]); } return { body, metadata }; } function getMetadata(schema, rowCount, rowGroups, userMetadata) { let metadata = new parquet_thrift.FileMetaData(); metadata.version = PARQUET_VERSION; metadata.created_by = 'parquet.js'; metadata.num_rows = rowCount; metadata.row_groups = rowGroups; metadata.schema = []; metadata.key_value_metadata = []; for (let k in userMetadata) { let kv = new parquet_thrift.KeyValue() kv.key = k; kv.value = userMetadata[k]; metadata.key_value_metadata.push(kv); } { let schemaRoot = new parquet_thrift.SchemaElement(); schemaRoot.name = 'root'; schemaRoot.num_children = Object.keys(schema.fields).length; metadata.schema.push(schemaRoot); } for (let field of schema.fieldList) { let schemaElem = new parquet_thrift.SchemaElement(); schemaElem.name = field.name; schemaElem.repetition_type = parquet_thrift.FieldRepetitionType[field.repetitionType]; if (field.isNested) { schemaElem.num_children = field.fieldCount; } else { schemaElem.type = parquet_thrift.Type[field.primitiveType]; } if (field.originalType) { schemaElem.converted_type = parquet_thrift.ConvertedType[field.originalType]; } schemaElem.type_length = field.typeLength; metadata.schema.push(schemaElem); } return metadata; } /** * Encode a parquet file metadata footer */ function encodeFooter(schema, rowCount, rowGroups, userMetadata) { let metadata = getMetadata(schema, rowCount, rowGroups, userMetadata); let metadataEncoded = parquet_util.serializeThrift(metadata); let footerEncoded = Buffer.alloc(metadataEncoded.length + 8); metadataEncoded.copy(footerEncoded); footerEncoded.writeUInt32LE(metadataEncoded.length, metadataEncoded.length); footerEncoded.write(PARQUET_MAGIC, metadataEncoded.length + 4); return footerEncoded; } module.exports = { ParquetEnvelopeWriter, ParquetWriter, ParquetTransformer };