UNPKG

cdap-avsc

Version:

This project is a clone of the mtth/avsc repo, with modifications required by CDAP.

589 lines (511 loc) 13.9 kB
/* jshint node: true */ // TODO: Add streams which prefix each record with its length. 'use strict'; /** * This module defines custom streams to write and read Avro files. * * In particular, the `Block{En,De}coder` streams are able to deal with Avro * container files. None of the streams below depend on the filesystem however, * this way they can also be used in the browser (for example to parse HTTP * responses). * */ var files = require('./files'), types = require('./types'), utils = require('./utils'), stream = require('stream'), util = require('util'), zlib = require('zlib'); // Type of Avro header. var HEADER_TYPE = types.createType({ namespace: 'org.apache.avro.file', name: 'Header', type: 'record', fields : [ {name: 'magic', type: {type: 'fixed', name: 'Magic', size: 4}}, {name: 'meta', type: {type: 'map', values: 'bytes'}}, {name: 'sync', type: {type: 'fixed', name: 'Sync', size: 16}} ] }); // Type of each block. var BLOCK_TYPE = types.createType({ namespace: 'org.apache.avro.file', name: 'Block', type: 'record', fields : [ {name: 'count', type: 'long'}, {name: 'data', type: 'bytes'}, {name: 'sync', type: {type: 'fixed', name: 'Sync', size: 16}} ] }); // Used to toBuffer each block, without having to copy all its data. var LONG_TYPE = types.createType('long'); // First 4 bytes of an Avro object container file. var MAGIC_BYTES = new Buffer('Obj\x01'); // Convenience. var f = util.format; var Tap = utils.Tap; /** * Duplex stream for decoding fragments. 
* */ function RawDecoder(schema, opts) { opts = opts || {}; var noDecode = !!opts.noDecode; stream.Duplex.call(this, { readableObjectMode: !noDecode, allowHalfOpen: false }); this._type = types.createType(files.load(schema)); this._tap = new Tap(new Buffer(0)); this._writeCb = null; this._needPush = false; this._readValue = createReader(noDecode, this._type); this._finished = false; this.on('finish', function () { this._finished = true; this._read(); }); } util.inherits(RawDecoder, stream.Duplex); RawDecoder.prototype._write = function (chunk, encoding, cb) { // Store the write callback and call it when we are done decoding all records // in this chunk. If we call it right away, we risk loading the entire input // in memory. We only need to store the latest callback since the stream API // guarantees that `_write` won't be called again until we call the previous. this._writeCb = cb; var tap = this._tap; tap.buf = Buffer.concat([tap.buf.slice(tap.pos), chunk]); tap.pos = 0; if (this._needPush) { this._needPush = false; this._read(); } }; RawDecoder.prototype._read = function () { this._needPush = false; var tap = this._tap; var pos = tap.pos; var val = this._readValue(tap); if (tap.isValid()) { this.push(val); } else if (!this._finished) { tap.pos = pos; this._needPush = true; if (this._writeCb) { // This should only ever be false on the first read, and only if it // happens before the first write. this._writeCb(); } } else { this.push(null); } }; /** * Duplex stream for decoding object container files. * */ function BlockDecoder(opts) { opts = opts || {}; var noDecode = !!opts.noDecode; stream.Duplex.call(this, { allowHalfOpen: true, // For async decompressors. 
readableObjectMode: !noDecode }); this._type = null; this._codecs = opts.codecs; this._parseHook = opts.parseHook; this._tap = new Tap(new Buffer(0)); this._blockTap = new Tap(new Buffer(0)); this._syncMarker = null; this._readValue = null; this._noDecode = noDecode; this._queue = new utils.OrderedQueue(); this._decompress = null; // Decompression function. this._index = 0; // Next block index. this._needPush = false; this._finished = false; this.on('finish', function () { this._finished = true; if (this._needPush) { this._read(); } }); } util.inherits(BlockDecoder, stream.Duplex); BlockDecoder.getDefaultCodecs = function () { return { 'null': function (buf, cb) { cb(null, buf); }, 'deflate': zlib.inflateRaw }; }; BlockDecoder.prototype._decodeHeader = function () { var tap = this._tap; if (tap.buf.length < MAGIC_BYTES.length) { // Wait until more data arrives. return false; } if (!MAGIC_BYTES.equals(tap.buf.slice(0, MAGIC_BYTES.length))) { this.emit('error', new Error('invalid magic bytes')); return false; } var header = HEADER_TYPE._read(tap); if (!tap.isValid()) { return false; } var codec = (header.meta['avro.codec'] || 'null').toString(); this._decompress = (this._codecs || BlockDecoder.getDefaultCodecs())[codec]; if (!this._decompress) { this.emit('error', new Error(f('unknown codec: %s', codec))); return; } try { var schema = JSON.parse(header.meta['avro.schema'].toString()); if (this._parseHook) { schema = this._parseHook(schema); } this._type = types.createType(schema); } catch (err) { this.emit('error', err); return; } this._readValue = createReader(this._noDecode, this._type); this._syncMarker = header.sync; this.emit('metadata', this._type, codec, header); return true; }; BlockDecoder.prototype._write = function (chunk, encoding, cb) { var tap = this._tap; tap.buf = Buffer.concat([tap.buf, chunk]); tap.pos = 0; if (!this._decodeHeader()) { process.nextTick(cb); return; } // We got the header, switch to block decoding mode. 
Also, call it directly // in case we already have all the data (in which case `_write` wouldn't get // called anymore). this._write = this._writeChunk; this._write(new Buffer(0), encoding, cb); }; BlockDecoder.prototype._writeChunk = function (chunk, encoding, cb) { var tap = this._tap; tap.buf = Buffer.concat([tap.buf.slice(tap.pos), chunk]); tap.pos = 0; var nBlocks = 1; var block; while ((block = tryReadBlock(tap))) { if (!this._syncMarker.equals(block.sync)) { this.emit('error', new Error('invalid sync marker')); return; } nBlocks++; this._decompress(block.data, this._createBlockCallback(chunkCb)); } chunkCb(); function chunkCb() { if (!--nBlocks) { cb(); } } }; BlockDecoder.prototype._createBlockCallback = function (cb) { var self = this; var index = this._index++; return function (err, data) { if (err) { self.emit('error', err); cb(); } else { self._queue.push(new BlockData(index, data, cb)); if (self._needPush) { self._read(); } } }; }; BlockDecoder.prototype._read = function () { this._needPush = false; var tap = this._blockTap; if (tap.pos >= tap.buf.length) { var data = this._queue.pop(); if (!data) { if (this._finished) { this.push(null); } else { this._needPush = true; } return; // Wait for more data. } data.cb(); tap.buf = data.buf; tap.pos = 0; } this.push(this._readValue(tap)); // The read is guaranteed valid. }; /** * Duplex stream for encoding. 
* */ function RawEncoder(schema, opts) { opts = opts || {}; stream.Transform.call(this, { writableObjectMode: true, allowHalfOpen: false }); this._type = types.createType(files.load(schema)); this._writeValue = function (tap, val) { try { this._type._write(tap, val); } catch (err) { this.emit('error', err); } }; this._tap = new Tap(new Buffer(opts.batchSize || 65536)); } util.inherits(RawEncoder, stream.Transform); RawEncoder.prototype._transform = function (val, encoding, cb) { var tap = this._tap; var buf = tap.buf; var pos = tap.pos; this._writeValue(tap, val); if (!tap.isValid()) { if (pos) { // Emit any valid data. this.push(copyBuffer(tap.buf, 0, pos)); } var len = tap.pos - pos; if (len > buf.length) { // Not enough space for last written object, need to resize. tap.buf = new Buffer(2 * len); } tap.pos = 0; this._writeValue(tap, val); // Rewrite last failed write. } cb(); }; RawEncoder.prototype._flush = function (cb) { var tap = this._tap; var pos = tap.pos; if (pos) { // This should only ever be false if nothing is written to the stream. this.push(tap.buf.slice(0, pos)); } cb(); }; /** * Duplex stream to write object container files. * * @param schema * @param opts {Object} * * + `blockSize`, uncompressed. * + `codec` * + `codecs` * + `noCheck` * + `omitHeader`, useful to append to an existing block file. * */ function BlockEncoder(schema, opts) { opts = opts || {}; stream.Duplex.call(this, { allowHalfOpen: true, // To support async compressors. writableObjectMode: true }); var obj, type; if (types.Type.isType(schema)) { type = schema; schema = undefined; } else { // Keep full schema to be able to write it to the header later. 
obj = files.load(schema); type = types.createType(obj); schema = JSON.stringify(obj); } this._schema = schema; this._type = type; this._writeValue = function (tap, val) { try { this._type._write(tap, val); } catch (err) { this.emit('error', err); } }; this._blockSize = opts.blockSize || 65536; this._tap = new Tap(new Buffer(this._blockSize)); this._codecs = opts.codecs; this._codec = opts.codec || 'null'; this._compress = null; this._omitHeader = opts.omitHeader || false; this._blockCount = 0; this._syncMarker = opts.syncMarker || new utils.Lcg().nextBuffer(16); this._queue = new utils.OrderedQueue(); this._pending = 0; this._finished = false; this._needPush = false; this.on('finish', function () { this._finished = true; if (this._blockCount) { this._flushChunk(); } }); } util.inherits(BlockEncoder, stream.Duplex); BlockEncoder.getDefaultCodecs = function () { return { 'null': function (buf, cb) { cb(null, buf); }, 'deflate': zlib.deflateRaw }; }; BlockEncoder.prototype._write = function (val, encoding, cb) { var codec = this._codec; this._compress = (this._codecs || BlockEncoder.getDefaultCodecs())[codec]; if (!this._compress) { this.emit('error', new Error(f('unsupported codec: %s', codec))); return; } if (!this._omitHeader) { var meta = { 'avro.schema': new Buffer(this._schema || this._type.getSchema()), 'avro.codec': new Buffer(this._codec) }; var Header = HEADER_TYPE.getRecordConstructor(); var header = new Header(MAGIC_BYTES, meta, this._syncMarker); this.push(header.toBuffer()); } this._write = this._writeChunk; this._write(val, encoding, cb); }; BlockEncoder.prototype._writeChunk = function (val, encoding, cb) { var tap = this._tap; var pos = tap.pos; var flushing = false; this._writeValue(tap, val); if (!tap.isValid()) { if (pos) { this._flushChunk(pos, cb); flushing = true; } var len = tap.pos - pos; if (len > this._blockSize) { // Not enough space for last written object, need to resize. 
this._blockSize = len * 2; } tap.buf = new Buffer(this._blockSize); tap.pos = 0; this._writeValue(tap, val); // Rewrite last failed write. } this._blockCount++; if (!flushing) { cb(); } }; BlockEncoder.prototype._flushChunk = function (pos, cb) { var tap = this._tap; pos = pos || tap.pos; this._compress(tap.buf.slice(0, pos), this._createBlockCallback(cb)); this._blockCount = 0; }; BlockEncoder.prototype._read = function () { var self = this; var data = this._queue.pop(); if (!data) { if (this._finished && !this._pending) { process.nextTick(function () { self.push(null); }); } else { this._needPush = true; } return; } this.push(LONG_TYPE.toBuffer(data.count, true)); this.push(LONG_TYPE.toBuffer(data.buf.length, true)); this.push(data.buf); this.push(this._syncMarker); if (!this._finished) { data.cb(); } }; BlockEncoder.prototype._createBlockCallback = function (cb) { var self = this; var index = this._index++; var count = this._blockCount; this._pending++; return function (err, data) { if (err) { self.emit('error', err); return; } self._pending--; self._queue.push(new BlockData(index, data, cb, count)); if (self._needPush) { self._needPush = false; self._read(); } }; }; // Helpers. /** * An indexed block. * * This can be used to preserve block order since compression and decompression * can cause some some blocks to be returned out of order. The count is only * used when encoding. * */ function BlockData(index, buf, cb, count) { this.index = index; this.buf = buf; this.cb = cb; this.count = count | 0; } /** * Maybe get a block. * */ function tryReadBlock(tap) { var pos = tap.pos; var block = BLOCK_TYPE._read(tap); if (!tap.isValid()) { tap.pos = pos; return null; } return block; } /** * Create bytes consumer, either reading or skipping records. 
* */ function createReader(noDecode, type) { if (noDecode) { return (function (skipper) { return function (tap) { var pos = tap.pos; skipper(tap); return tap.buf.slice(pos, tap.pos); }; })(type._skip); } else { return function (tap) { return type._read(tap); }; } } /** * Copy a buffer. * * This avoids having to create a slice of the original buffer. * */ function copyBuffer(buf, pos, len) { var copy = new Buffer(len); buf.copy(copy, 0, pos, pos + len); return copy; } module.exports = { HEADER_TYPE: HEADER_TYPE, // For tests. MAGIC_BYTES: MAGIC_BYTES, // Idem. streams: { BlockDecoder: BlockDecoder, BlockEncoder: BlockEncoder, RawDecoder: RawDecoder, RawEncoder: RawEncoder } };