
aws-kinesis-agg

Version: 4.0.4

Node.js module to simplify working with Amazon Kinesis Records using Protocol Buffers encoding

481 lines (411 loc) 14.9 kB
/*!
 * aws-kinesis-agg 4.0.4
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0
 */
'use strict';

const crypto = require('crypto');
const async = require('async');
const common = require('./common');

// calculate the maximum amount of data to accumulate before emitting to
// kinesis: 1MB - 16 bytes for the md5 checksum - the length of the magic number
const KINESIS_MAX_PAYLOAD_BYTES = (1024 * 1024) - 16 - Buffer.byteLength(common.magic);

function calculateVarIntSize(value) {
  if (value < 0) {
    throw new Error("Size values should not be negative.");
  } else if (value == 0) {
    return 1;
  }

  let numBitsNeeded = 0;

  // shift the value right one bit at a time until
  // there are no more '1' bits left...this counts
  // how many bits we need to represent the number
  while (value > 0) {
    numBitsNeeded++;
    value = value >> 1;
  }

  // varints only use 7 bits of each byte for the actual value
  let numVarintBytes = Math.trunc(numBitsNeeded / 7);
  if (numBitsNeeded % 7 > 0) {
    numVarintBytes += 1;
  }

  return numVarintBytes;
}

// return the index the key already has in the lookup table, or the index it
// would be assigned next (an existing index of 0 must not fall through to count)
function getPotentialIndex(lookup, key, count) {
  const it = lookup[key];
  return (it !== undefined) ? it : count;
}

function calculateRecordSize(self, record) {
  let messageSize = 0;

  // calculate the total new message size when aggregated into protobuf
  if (!self.partitionKeyTable.hasOwnProperty(record.partitionKey)) {
    // add the size of the partition key when encoded
    const pkLength = record.partitionKey.length;
    messageSize += 1; // message index + wire type for PK table
    messageSize += calculateVarIntSize(pkLength); // size of pk length value
    messageSize += pkLength; // actual pk length
  }

  if (record.explicitHashKey && !self.explicitHashKeyTable.hasOwnProperty(record.explicitHashKey)) {
    // add the size of the explicit hash key when encoded
    const ehkLength = record.explicitHashKey.length;
    messageSize += 1; // message index + wire type for EHK table
    messageSize += calculateVarIntSize(ehkLength); // size of ehk length value
    messageSize += ehkLength; // actual ehk length
  }

  /* compute the data record length */

  // add the sizes of the partition and hash key indexes
  let innerRecordSize = 1;
  innerRecordSize += calculateVarIntSize(getPotentialIndex(self.partitionKeyTable, record.partitionKey, self.partitionKeyCount));

  // explicit hash key field (this is optional)
  if (record.explicitHashKey) {
    innerRecordSize += 1;
    innerRecordSize += calculateVarIntSize(getPotentialIndex(self.explicitHashKeyTable, record.explicitHashKey, self.explicitHashKeyCount));
  }

  if (typeof (record.data) === 'string') {
    record.data = Buffer.from(record.data); // default utf8
  }

  const dataLength = Buffer.byteLength(record.data, 'binary');

  innerRecordSize += 1; // message index + wire type for record data
  innerRecordSize += calculateVarIntSize(dataLength); // size of data length value
  innerRecordSize += dataLength; // actual data length

  // data field
  messageSize += 1; // message index + wire type for record
  messageSize += calculateVarIntSize(innerRecordSize); // size of entire record length value
  messageSize += innerRecordSize; // actual entire record length

  return messageSize;
}

function validateRecord(record) {
  if (!record.data) {
    throw new Error('Record.Data field is mandatory');
  }
  if (!record.partitionKey) {
    throw new Error('record.partitionKey field is mandatory');
  }
}

function aggregateRecord(records) {
  if (common.debug) {
    console.log("Protobuf Aggregation of " + records.length + " records");
  }

  const partitionKeyTable = {};
  let partitionKeyCount = 0;
  const explicitHashKeyTable = {};
  let explicitHashKeyCount = 0;

  const putRecords = records.map(function (record) {
    // add the partition key and explicit hash key entries
    if (!partitionKeyTable.hasOwnProperty(record.partitionKey)) {
      partitionKeyTable[record.partitionKey] = partitionKeyCount;
      partitionKeyCount += 1;
    }
    if (record.explicitHashKey && !explicitHashKeyTable.hasOwnProperty(record.explicitHashKey)) {
      explicitHashKeyTable[record.explicitHashKey] = explicitHashKeyCount;
      explicitHashKeyCount += 1;
    }

    // add the AggregatedRecord object with partition and hash
    // key indexes
    return {
      "partition_key_index": partitionKeyTable[record.partitionKey],
      "explicit_hash_key_index": explicitHashKeyTable[record.explicitHashKey],
      data: record.data,
      tags: []
    };
  });

  // encode the data
  const protoData = common.AggregatedRecord.encode({
    "partition_key_table": Object.keys(partitionKeyTable),
    "explicit_hash_key_table": Object.keys(explicitHashKeyTable),
    "records": putRecords
  });

  if (common.debug) {
    console.log(JSON.stringify({
      "partition_key_table": Object.keys(partitionKeyTable),
      "explicit_hash_key_table": Object.keys(explicitHashKeyTable),
      records: putRecords.map((record) => record.data.toString('base64'))
    }));
  }

  const bufferData = protoData.finish();

  // get the md5 for the encoded data
  const md5 = crypto.createHash('md5');
  md5.update(bufferData);
  const checksum = md5.digest();

  // create the final object as a concatenation of the magic KPL number,
  // the encoded data records, and the md5 checksum
  const finalBuffer = Buffer.concat([common.magic, bufferData, checksum]);

  if (common.debug) {
    console.log("Checksum: " + checksum.toString('base64'));
    console.log("actual totalBytes=" + Buffer.byteLength(bufferData, 'binary'));
    console.log("final totalBytes=" + Buffer.byteLength(finalBuffer, 'binary'));
  }

  return finalBuffer;
}

function generateEncodedRecord(records) {
  if (common.debug) {
    console.log("generate " + records.length + " records.");
  }
  if (records.length == 0) {
    return;
  }

  // do our best to find a valid partition key to use
  let pk;
  for (let i = 0; i < records.length; i++) {
    if (records[i].partitionKey) {
      pk = records[i].partitionKey;
      break;
    }
  }

  // do our best to find a valid explicit hash key to use
  let ehk;
  for (let j = 0; j < records.length; j++) {
    if (records[j].explicitHashKey) {
      ehk = records[j].explicitHashKey;
      break;
    }
  }

  const encodedRecord = {
    partitionKey: pk,
    data: aggregateRecord(records)
  };

  // if we found an ExplicitHashKey, set it
  if (ehk !== undefined) {
    encodedRecord["ExplicitHashKey"] = ehk;
  }

  // return encoded record
  return encodedRecord;
}

// call onReadyCallback with the encoded record
function callOnReadyCallback(err, records, onReadyCallback) {
  if (onReadyCallback) {
    if (err) {
      onReadyCallback(err);
    } else {
      onReadyCallback(null, generateEncodedRecord(records));
    }
  }
}

/**
 * RecordAggregator builds an object which aggregates records up to a
 * maximum size of 1 MB.
 * @param {*} onReadyCallback
 */
function RecordAggregator(onReadyCallback) {
  this.totalBytes = 0;
  this.putRecords = [];
  this.partitionKeyTable = {};
  this.partitionKeyCount = 0;
  this.explicitHashKeyTable = {};
  this.explicitHashKeyCount = 0;
  this.onReadyCallback = onReadyCallback;
}
module.exports.RecordAggregator = RecordAggregator;

/**
 * Set onReadyCallback.
 * @param {*} onReadyCallback callback function
 * @returns current onReadyCallback function
 */
RecordAggregator.prototype.setOnReadyCallback = function (onReadyCallback) {
  if (onReadyCallback) {
    this.onReadyCallback = onReadyCallback;
  }
  return this.onReadyCallback;
};

/**
 * Reset this object to empty (all records currently in the object
 * will be lost).
 */
RecordAggregator.prototype.clearRecords = function () {
  this.totalBytes = 0;
  this.putRecords = [];
  this.partitionKeyTable = {};
  this.partitionKeyCount = 0;
  this.explicitHashKeyTable = {};
  this.explicitHashKeyCount = 0;
};

/**
 * Method to flush the current in-flight records.
 * @param {function} onReadyCallback optional onReadyCallback function
 */
RecordAggregator.prototype.flushBufferedRecords = function (onReadyCallback) {
  if (common.debug) {
    console.log("calculated totalBytes=" + this.totalBytes);
  }
  callOnReadyCallback(null, this.putRecords, onReadyCallback || this.onReadyCallback);
  this.clearRecords();
};

/**
 * Method to build an encoded record from all in-flight records; the
 * in-flight records are cleared afterwards.
 */
RecordAggregator.prototype.build = function () {
  const data = generateEncodedRecord(this.putRecords);
  this.clearRecords();
  return data;
};

/**
 * Method to return the number of in-flight records.
 */
RecordAggregator.prototype.length = function () {
  return this.putRecords.length;
};

/**
 * Method to check if a specific record will fit in the in-flight records array (1 MB max).
 * @param {*} record record to check
 */
RecordAggregator.prototype.checkIfUserRecordFits = function (record) {
  return !((this.totalBytes + this.calculateUserRecordSize(record)) > KINESIS_MAX_PAYLOAD_BYTES);
};

/**
 * Method to calculate a record's size without adding it to the in-flight records.
 * @param {*} record record to check
 */
RecordAggregator.prototype.calculateUserRecordSize = function (record) {
  validateRecord(record);
  return calculateRecordSize(this, record);
};

/**
 * Method to add a record to the in-flight records.
 * @param {*} record record to add
 */
RecordAggregator.prototype.addUserRecord = function (record) {
  validateRecord(record);
  let messageSize = calculateRecordSize(this, record);

  if (common.debug) {
    console.log("Current Pending Size: " + this.putRecords.length + " records, " + this.totalBytes + " bytes");
    console.log("Next: " + messageSize + " bytes");
  }

  // if the size of this record would push us over the limit,
  // then encode the current set
  if (messageSize > KINESIS_MAX_PAYLOAD_BYTES) {
    throw new Error('Input record (PK=' + record.partitionKey + ', EHK=' + record.explicitHashKey +
      ', SizeBytes=' + messageSize + ') is too large to fit inside a single Kinesis record.');
  } else if ((this.totalBytes + messageSize) > KINESIS_MAX_PAYLOAD_BYTES) {
    if (common.debug) {
      console.log("calculated totalBytes=" + this.totalBytes);
    }
    throw new Error("record won't fit");
  } else {
    // the current set of records is still within the kinesis
    // max payload size so increment inflight/total bytes
    this.putRecords.push(record);
    this.totalBytes += messageSize;
  }

  if (!this.partitionKeyTable.hasOwnProperty(record.partitionKey)) {
    // record the index the partition key gets in the lookup table
    this.partitionKeyTable[record.partitionKey] = this.partitionKeyCount;
    this.partitionKeyCount += 1;
  }

  if (record.explicitHashKey && !this.explicitHashKeyTable.hasOwnProperty(record.explicitHashKey)) {
    // record the index the explicit hash key gets in the lookup table
    this.explicitHashKeyTable[record.explicitHashKey] = this.explicitHashKeyCount;
    this.explicitHashKeyCount += 1;
  }
};

/**
 * Method to aggregate a set of records.
 * @param {*} records records to aggregate
 * @param {boolean} forceFlush if true, flush the buffered records at the end of processing
 * @param {function} onReadyCallback optional onReadyCallback function
 */
RecordAggregator.prototype.aggregateRecords = function (records, forceFlush, onReadyCallback) {
  const self = this;
  const _onReadyCallback = onReadyCallback || this.onReadyCallback;

  records.forEach(function (record) {
    // validate before sizing: calculateRecordSize would throw on a
    // record with missing data or partitionKey
    if (!record.data) {
      return callOnReadyCallback(new Error('Record.Data field is mandatory'), record, _onReadyCallback);
    }
    if (!record.partitionKey) {
      return callOnReadyCallback(new Error('record.partitionKey field is mandatory'), record, _onReadyCallback);
    }

    let messageSize = calculateRecordSize(self, record);

    if (common.debug) {
      console.log("Current Pending Size: " + self.putRecords.length + " records, " + self.totalBytes + " bytes");
      console.log("Next: " + messageSize + " bytes");
    }

    // if the size of this record would push us over the limit,
    // then encode the current set
    if (messageSize > KINESIS_MAX_PAYLOAD_BYTES) {
      callOnReadyCallback(new Error('Input record (PK=' + record.partitionKey + ', EHK=' + record.explicitHashKey +
        ', SizeBytes=' + messageSize + ') is too large to fit inside a single Kinesis record.'), null, _onReadyCallback);
    } else if ((self.totalBytes + messageSize) > KINESIS_MAX_PAYLOAD_BYTES) {
      if (common.debug) {
        console.log("calculated totalBytes=" + self.totalBytes);
      }
      callOnReadyCallback(null, self.putRecords, _onReadyCallback);
      self.clearRecords();

      // total size tracked is now the size of the current record
      self.totalBytes = calculateRecordSize(self, record);
      // current inflight becomes just this record
      self.putRecords = [record];
    } else {
      // the current set of records is still within the kinesis
      // max payload size so increment inflight/total bytes
      self.putRecords.push(record);
      self.totalBytes += messageSize;
    }

    if (!self.partitionKeyTable.hasOwnProperty(record.partitionKey)) {
      // record the index the partition key gets in the lookup table
      self.partitionKeyTable[record.partitionKey] = self.partitionKeyCount;
      self.partitionKeyCount += 1;
    }

    if (record.explicitHashKey && !self.explicitHashKeyTable.hasOwnProperty(record.explicitHashKey)) {
      // record the index the explicit hash key gets in the lookup table
      self.explicitHashKeyTable[record.explicitHashKey] = self.explicitHashKeyCount;
      self.explicitHashKeyCount += 1;
    }
  });

  if (forceFlush === true && self.putRecords.length > 0) {
    callOnReadyCallback(null, this.putRecords, _onReadyCallback);
    this.clearRecords();
  }
};

/**
 * Aggregate function: encodes the given records and feeds each encoded
 * record through a concurrency-limited queue.
 * @param {*} records collection of records to send
 * @param {function} encodedRecordHandler function (encodedRecord, callback) which processes each encoded record.
 *   `encodedRecord` is an object with data, partitionKey and [explicitHashKey] fields.
 * @param {function} afterPutAggregatedRecords called once all records are processed
 * @param {function} errorCallback called each time an error occurs
 * @param {number} [queueSize] maximum concurrency when processing encoded records (1 by default)
 */
module.exports.aggregate = (records, encodedRecordHandler, afterPutAggregatedRecords, errorCallback, queueSize = 1) => {
  const taskHandler = (params, done) => {
    encodedRecordHandler(params, (err, result) => {
      if (err) {
        errorCallback(err, result);
      }
      done();
    });
  };

  const aggregatorQueue = async.queue(taskHandler, queueSize);

  // when all tasks are done, call the afterPutAggregatedRecords callback
  aggregatorQueue.drain = () => {
    afterPutAggregatedRecords();
  };

  // aggregator callback
  const onReadyCallback = (error, encoded) => {
    if (error) {
      return errorCallback(error, encoded);
    }
    aggregatorQueue.push(encoded);
  };

  const aggregator = new RecordAggregator(onReadyCallback);
  aggregator.aggregateRecords(records, true);

  if (!aggregatorQueue.started) {
    errorCallback(new Error('No records'));
    afterPutAggregatedRecords();
  }
};
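
For orientation, here is a minimal usage sketch of the exported aggregate() helper; it is not part of the file above. The record contents and the handler body are hypothetical, and a real handler would typically pass the encoded record to Kinesis PutRecord, which is stubbed out with a console.log here.

// Hypothetical usage of module.exports.aggregate (sketch, not part of the library file).
const agg = require('aws-kinesis-agg');

// illustrative user records; string data is converted to a Buffer internally
const records = [
  { partitionKey: 'pk-1', data: Buffer.from('first') },
  { partitionKey: 'pk-1', data: Buffer.from('second') },
  { partitionKey: 'pk-2', data: 'third' }
];

agg.aggregate(
  records,
  // encodedRecordHandler: called once per aggregated record with
  // { partitionKey, data, [ExplicitHashKey] }; done() signals completion
  (encodedRecord, done) => {
    // a real application would call Kinesis PutRecord here
    console.log('aggregated ' + encodedRecord.data.length + ' bytes under PK=' + encodedRecord.partitionKey);
    done();
  },
  () => console.log('all records processed'),       // afterPutAggregatedRecords
  (err) => console.error('aggregation error:', err) // errorCallback
);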
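
Likewise, a sketch of driving RecordAggregator by hand with addUserRecord() and build(); the loop data is made up, and the checkIfUserRecordFits() guard mirrors the size check that aggregateRecords() performs internally.

// Hypothetical manual use of RecordAggregator (sketch, not part of the library file).
const { RecordAggregator } = require('aws-kinesis-agg');

const aggregator = new RecordAggregator();

for (let i = 0; i < 10; i++) {
  const record = { partitionKey: 'pk-' + (i % 2), data: Buffer.from('payload-' + i) };

  // if the next record would push the buffered set past the ~1 MB limit,
  // encode and clear what is buffered before adding it
  if (!aggregator.checkIfUserRecordFits(record)) {
    const flushed = aggregator.build();
    console.log('flushed ' + flushed.data.length + ' bytes');
  }
  aggregator.addUserRecord(record);
}

// encode whatever is still buffered; build() returns undefined when empty
const finalRecord = aggregator.build();
if (finalRecord) {
  console.log('final aggregate: ' + finalRecord.data.length + ' bytes, PK=' + finalRecord.partitionKey);
}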