@dobesv/parquets
Version:
TypeScript implementation of the Parquet file format, based on parquet.js
265 lines • 9.73 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.ParquetTransformer = exports.ParquetEnvelopeWriter = exports.ParquetWriter = void 0;
const stream_1 = require("stream");
const encoding_1 = require("./encoding");
const Util = require("./util");
const shred_1 = require("./shred");
/**
 * Write a parquet file to an output stream. The ParquetWriter buffers rows in
 * memory and hands them to a ParquetEnvelopeWriter one row group at a time, so
 * close() must be called after all rows are written.
 */
class ParquetWriter {
    /**
     * Convenience method to create a new buffered parquet writer that writes to
     * the specified file
     *
     * @param schema schema of the rows that will be appended
     * @param path passed to Util.osopen to obtain the output stream
     * @param opts optional writer options; also forwarded to Util.osopen
     * @returns promise for the new ParquetWriter
     */
    static async openFile(schema, path, opts) {
        const outputStream = await Util.osopen(path, opts);
        return ParquetWriter.openStream(schema, outputStream, opts);
    }
    /**
     * Convenience method to create a new buffered parquet writer that writes to
     * the specified stream
     *
     * @param schema schema of the rows that will be appended
     * @param outputStream stream that receives the encoded file
     * @param opts optional writer options (rowGroupSize, plus the options
     * read by ParquetEnvelopeWriter)
     * @returns promise for the new ParquetWriter
     */
    static async openStream(schema, outputStream, opts) {
        const options = opts || {};
        const envelopeWriter = await ParquetEnvelopeWriter.openStream(schema, outputStream, options);
        return new ParquetWriter(schema, envelopeWriter, options);
    }
    /**
     * Create a new buffered parquet writer for a given envelope writer
     *
     * @param schema schema of the rows that will be appended
     * @param envelopeWriter object providing writeHeader, writeRowGroup,
     * writeFooter, close and setPageSize
     * @param opts optional; a falsy or missing rowGroupSize selects
     * PARQUET_DEFAULT_ROW_GROUP_SIZE
     */
    constructor(schema, envelopeWriter, opts) {
        // Accept a missing options object, as openStream() does
        const options = opts || {};
        this.schema = schema;
        this.envelopeWriter = envelopeWriter;
        this.rowBuffer = new shred_1.ParquetWriteBuffer(schema);
        this.rowGroupSize = options.rowGroupSize || encoding_1.PARQUET_DEFAULT_ROW_GROUP_SIZE;
        this.closed = false;
        this.headerWritten = false;
        this.userMetadata = {};
    }
    /**
     * Write the header if it was not already written. If writing the header
     * fails the envelope writer is closed and the write error is rethrown.
     */
    async ensureHeaderWritten() {
        if (this.headerWritten) {
            return;
        }
        // Set the flag before making the call so that a concurrent call while the header
        // is being written will not write the header a second time
        this.headerWritten = true;
        try {
            await this.envelopeWriter.writeHeader();
        }
        catch (err) {
            // Wait for the close so it cannot turn into an unhandled rejection
            await closeEnvelopeQuietly(this.envelopeWriter);
            throw err;
        }
    }
    /**
     * Append a single row to the parquet file. Rows are buffered in memory until
     * rowGroupSize rows are in the buffer or close() is called.
     *
     * Await each call before issuing the next one: the buffer is only replaced
     * after the row group write has completed.
     *
     * @param row record to shred into the current row buffer
     * @throws Error('writer was closed') once close() has been called
     */
    async appendRow(row) {
        if (this.closed) {
            throw new Error('writer was closed');
        }
        (0, shred_1.shredRecord)(this.schema, row, this.rowBuffer);
        if (this.rowBuffer.rowCount >= this.rowGroupSize) {
            await this.ensureHeaderWritten();
            await this.envelopeWriter.writeRowGroup(this.rowBuffer);
            this.rowBuffer = new shred_1.ParquetWriteBuffer(this.schema);
        }
    }
    /**
     * Finish writing the parquet file and commit the footer to disk. This method
     * MUST be called after you are finished adding rows. You must not call this
     * method twice on the same object or add any rows after the close() method has
     * been called
     *
     * @param callback optional function invoked without arguments after the
     * envelope writer has been closed successfully
     * @throws Error('writer was closed') when called a second time; any write
     * error is rethrown after the envelope writer has been closed
     */
    async close(callback) {
        if (this.closed) {
            throw new Error('writer was closed');
        }
        this.closed = true;
        // Make sure we have written the header even if the file is empty.
        // ensureHeaderWritten() closes the envelope writer itself on failure.
        await this.ensureHeaderWritten();
        try {
            // Only flush when rows are pending; an empty buffer yields no row group
            if (this.rowBuffer.rowCount > 0) {
                await this.envelopeWriter.writeRowGroup(this.rowBuffer);
                this.rowBuffer = new shred_1.ParquetWriteBuffer(this.schema);
            }
            await this.envelopeWriter.writeFooter(this.userMetadata);
        }
        catch (err) {
            // This writer can not be closed again, so release the output here
            await closeEnvelopeQuietly(this.envelopeWriter);
            throw err;
        }
        await this.envelopeWriter.close();
        this.envelopeWriter = null;
        if (callback) {
            callback();
        }
    }
    /**
     * Add key<>value metadata to the file. Both key and value are converted
     * with String().
     */
    setMetadata(key, value) {
        // TODO: value to be any, obj -> JSON
        this.userMetadata[String(key)] = String(value);
    }
    /**
     * Set the parquet row group size. This values controls the maximum number
     * of rows that are buffered in memory at any given time as well as the number
     * of rows that are co-located on disk. A higher value is generally better for
     * read-time I/O performance at the tradeoff of write-time memory usage.
     */
    setRowGroupSize(cnt) {
        this.rowGroupSize = cnt;
    }
    /**
     * Set the parquet data page size. The data page size controls the maximum
     * number of column values that are written to disk as a consecutive array
     */
    setPageSize(cnt) {
        this.envelopeWriter.setPageSize(cnt);
    }
}
/**
 * Close an envelope writer while an earlier error is being propagated. A
 * failure of the close itself is dropped so that the caller can rethrow the
 * error that caused the shutdown.
 */
async function closeEnvelopeQuietly(envelopeWriter) {
    try {
        await envelopeWriter.close();
    }
    catch (closeErr) {
        // Intentionally ignored: the original error is the one worth reporting
    }
}
exports.ParquetWriter = ParquetWriter;
/**
 * Create a parquet file from a schema and a number of row groups. This class
 * performs direct, unbuffered writes to the underlying output stream and is
 * intended for advanced and internal users; the writeXXX methods must be
 * called in the correct order to produce a valid file.
 */
class ParquetEnvelopeWriter {
    /**
     * Create a new parquet envelope writer that writes to the specified stream
     *
     * @param schema schema handed to the row group and footer encoders
     * @param outputStream stream bound as first argument of Util.oswrite and
     * Util.osclose
     * @param opts optional; see the constructor
     * @returns promise for the new ParquetEnvelopeWriter, starting at offset 0
     */
    static async openStream(schema, outputStream, opts) {
        const writeFn = Util.oswrite.bind(undefined, outputStream);
        const closeFn = Util.osclose.bind(undefined, outputStream);
        return new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts);
    }
    /**
     * @param schema schema handed to the row group and footer encoders
     * @param writeFn called with every encoded buffer; its result is returned
     * by the writeXXX methods
     * @param closeFn exposed as close()
     * @param fileOffset offset at which the first section will be written
     * @param opts optional; a falsy or missing pageSize selects
     * PARQUET_DEFAULT_PAGE_SIZE, a missing useDataPageV2 key selects false
     */
    constructor(schema, writeFn, closeFn, fileOffset, opts) {
        // Accept a missing options object instead of failing on property access
        const options = opts || {};
        this.schema = schema;
        this.write = writeFn;
        this.close = closeFn;
        this.offset = fileOffset;
        this.rowCount = 0;
        this.rowGroups = [];
        this.pageSize = options.pageSize || encoding_1.PARQUET_DEFAULT_PAGE_SIZE;
        this.useDataPageV2 = 'useDataPageV2' in options ? options.useDataPageV2 : false;
    }
    /**
     * Advance the tracked file offset by the buffer length and pass the buffer
     * to the write function. The offset is advanced before the write is issued.
     */
    writeSection(buf) {
        this.offset += buf.length;
        return this.write(buf);
    }
    /**
     * Encode the parquet file header
     */
    writeHeader() {
        return this.writeSection(Buffer.from(encoding_1.PARQUET_MAGIC));
    }
    /**
     * Encode a parquet row group. The records object should be created using the
     * shredRecord method. The row count and the row group metadata are recorded
     * for the footer before the body is written.
     */
    writeRowGroup(records) {
        const rowGroup = (0, encoding_1.encodeRowGroup)(this.schema, records, {
            baseOffset: this.offset,
            pageSize: this.pageSize,
            useDataPageV2: this.useDataPageV2,
        });
        this.rowCount += records.rowCount;
        this.rowGroups.push(rowGroup.metadata);
        return this.writeSection(rowGroup.body);
    }
    /**
     * Write the parquet file footer
     *
     * @param userMetadata optional key/value metadata; defaults to {}
     */
    writeFooter(userMetadata) {
        const metadata = userMetadata || {};
        return this.writeSection((0, encoding_1.encodeFooter)(this.schema, this.rowCount, this.rowGroups, metadata));
    }
    /**
     * Set the parquet data page size. The data page size controls the maximum
     * number of column values that are written to disk as a consecutive array
     */
    setPageSize(cnt) {
        this.pageSize = cnt;
    }
}
exports.ParquetEnvelopeWriter = ParquetEnvelopeWriter;
/**
 * Create a parquet transform stream. Rows written to the stream are appended
 * to an internal ParquetWriter; the encoded file sections are pushed to the
 * readable side.
 */
class ParquetTransformer extends stream_1.Transform {
    /**
     * @param schema schema of the rows written to the stream
     * @param opts options passed to both the ParquetWriter and the
     * ParquetEnvelopeWriter
     */
    constructor(schema, opts = {}) {
        super({ objectMode: true });
        // [resolve, reject] pairs of writes that are paused by backpressure
        this.waiting = [];
        const writeFn = (b) => {
            if (!this.push(b)) {
                // stop writing until the readable is ready again
                return new Promise((resolve, reject) => {
                    this.waiting.push([resolve, reject]);
                });
            }
            return Promise.resolve();
        };
        const closeFn = () => {
            // Signal the end of the readable side
            this.push(null);
            return Promise.resolve();
        };
        this.writer = new ParquetWriter(schema, new ParquetEnvelopeWriter(schema, writeFn, closeFn, 0, opts), opts);
    }
    // If I/O was delayed due to backpressure and then the stream is destroyed,
    // propagate an error back to the callee of the I/O operation(s)
    // tslint:disable-next-line:function-name
    _destroy(error, callback) {
        try {
            if (this.waiting.length) {
                const waiting = this.waiting;
                this.waiting = [];
                // Without an error the paused writes are simply released
                waiting.forEach(([resolve, reject]) => error ? reject(error) : resolve());
            }
            callback(null);
        }
        catch (err) {
            callback(err);
        }
    }
    // If we get backpressure we will delay returning from a call to write until
    // the next call to _read
    // tslint:disable-next-line:function-name
    _read(arg) {
        if (this.waiting.length) {
            const waiting = this.waiting;
            this.waiting = [];
            waiting.forEach(([resolve]) => resolve());
        }
        return super._read(arg);
    }
    // Append one row to the writer; falsy chunks are skipped. The callback
    // receives the error when appending fails.
    // tslint:disable-next-line:function-name
    _transform(row, encoding, callback) {
        if (row) {
            this.writer.appendRow(row).then(() => callback(), err => callback(err));
        }
        else {
            callback();
        }
    }
    // Close the writer when the writable side ends. A failure is reported
    // through the callback so the stream errors instead of never finishing
    // with an unhandled promise rejection.
    // tslint:disable-next-line:function-name
    _flush(callback) {
        this.writer.close().then(() => callback(), err => callback(err));
    }
}
exports.ParquetTransformer = ParquetTransformer;
//# sourceMappingURL=writer.js.map