@flightstream/utils-arrow

Advanced utilities for working with Arrow data and the Flight protocol in FlightStream for Node.js

504 lines (438 loc) 11.5 kB
import { EventEmitter } from 'events';
import pino from 'pino';

/**
 * Create a configured pino logger instance
 */
const createLogger = (options = {}) => {
  const env = process.env.NODE_ENV || 'development';
  const logLevel = process.env.LOG_LEVEL || 'info';
  const logFormat = process.env.LOG_FORMAT || (env === 'development' ? 'pretty' : 'json');
  const logSilent = process.env.LOG_SILENT === 'true';

  const config = {
    level: logLevel,
    silent: logSilent,
    timestamp: pino.stdTimeFunctions.isoTime,
    ...options
  };

  // Use pretty printing for development
  if (logFormat === 'pretty' && !logSilent) {
    config.transport = {
      target: 'pino-pretty',
      options: {
        colorize: true,
        translateTime: 'HH:MM:ss',
        ignore: 'pid,hostname'
      }
    };
  }

  return pino(config);
};

/**
 * Streaming Utilities for Data Processing
 *
 * This module provides common streaming patterns and utilities for processing
 * large datasets efficiently. It includes batching, backpressure handling,
 * error recovery, and memory management utilities.
 *
 * Key features:
 * 1. Batch processing with configurable sizes
 * 2. Backpressure handling for stream control
 * 3. Error recovery and resilience patterns
 * 4. Memory-efficient data streaming
 * 5. Generic stream transformations
 */

/**
 * Base Stream Processor
 *
 * Provides common streaming functionality that can be extended
 * by specific data source implementations.
 */
export class StreamProcessor extends EventEmitter {
  constructor(options = {}) {
    super();
    this.options = {
      batchSize: options.batchSize || 10000,
      maxConcurrency: options.maxConcurrency || 1,
      errorRetries: options.errorRetries || 3,
      backpressureThreshold: options.backpressureThreshold || 50000,
      ...options
    };

    this.isProcessing = false;
    this.totalProcessed = 0;
    this.errorCount = 0;
    this.currentBatch = [];
    this.pendingBatches = [];
  }

  /**
   * Start processing stream
   * @returns {Promise<void>}
   */
  async start() {
    if (this.isProcessing) {
      throw new Error('Stream processing already in progress');
    }

    this.isProcessing = true;
    this.totalProcessed = 0;
    this.errorCount = 0;

    this.emit('start');

    try {
      await this._process();
      this.emit('complete', {
        totalProcessed: this.totalProcessed,
        errorCount: this.errorCount
      });
    } catch (error) {
      this.emit('error', error);
      throw error;
    } finally {
      this.isProcessing = false;
    }
  }

  /**
   * Stop processing stream
   */
  stop() {
    if (this.isProcessing) {
      this.isProcessing = false;
      this.emit('stop');
    }
  }

  /**
   * Process data - to be implemented by subclasses
   * @private
   */
  async _process() {
    throw new Error('_process() must be implemented by subclass');
  }

  /**
   * Add item to current batch
   * @param {*} item - Item to add
   */
  addToBatch(item) {
    this.currentBatch.push(item);
    if (this.currentBatch.length >= this.options.batchSize) {
      this.flushBatch();
    }
  }

  /**
   * Flush current batch
   */
  flushBatch() {
    if (this.currentBatch.length > 0) {
      this.emit('batch', [...this.currentBatch]);
      this.totalProcessed += this.currentBatch.length;
      this.currentBatch = [];
    }
  }

  /**
   * Handle backpressure by pausing processing
   */
  async handleBackpressure() {
    if (this.pendingBatches.length > this.options.backpressureThreshold) {
      this.emit('backpressure', { pendingBatches: this.pendingBatches.length });

      // Wait for pending batches to clear
      while (
        this.pendingBatches.length > this.options.backpressureThreshold / 2 &&
        this.isProcessing
      ) {
        await new Promise(resolve => setTimeout(resolve, 100));
      }
    }
  }

  /**
   * Get processing statistics
   */
  getStats() {
    return {
      isProcessing: this.isProcessing,
      totalProcessed: this.totalProcessed,
      errorCount: this.errorCount,
      currentBatchSize: this.currentBatch.length,
      pendingBatches: this.pendingBatches.length,
      batchSize: this.options.batchSize
    };
  }
}
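/**
 * Example: extending StreamProcessor.
 *
 * A minimal sketch of a concrete subclass (the class name and sample data
 * are illustrative, not part of the published API). A subclass implements
 * _process(), feeds items through addToBatch(), and flushes the remainder;
 * consumers subscribe to the 'batch' events emitted above.
 *
 * @example
 * class ArrayProcessor extends StreamProcessor {
 *   constructor(rows, options = {}) {
 *     super(options);
 *     this.rows = rows;
 *   }
 *
 *   async _process() {
 *     for (const row of this.rows) {
 *       if (!this.isProcessing) break;    // honor stop()
 *       this.addToBatch(row);             // emits 'batch' every batchSize items
 *       await this.handleBackpressure();  // yield if consumers fall behind
 *     }
 *     this.flushBatch();                  // emit the final partial batch
 *   }
 * }
 *
 * const source = new ArrayProcessor(rows, { batchSize: 2500 });
 * source.on('batch', (batch) => console.log('received', batch.length, 'rows'));
 * await source.start();
 */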
/**
 * Batch Processor
 *
 * Processes data in configurable batches with memory management
 */
export class BatchProcessor extends StreamProcessor {
  constructor(processor, options = {}) {
    super(options);
    this.processor = processor;
    this.activeBatches = new Set();
    this.logger = createLogger({
      name: 'streaming',
      batch_size: this.options.batchSize,
      max_concurrency: this.options.maxConcurrency
    });
  }

  /**
   * Process a batch of items
   * @param {Array} batch - Batch to process
   * @returns {Promise<*>} Processing result
   */
  async processBatch(batch) {
    const batchId = Math.random().toString(36).substr(2, 9);
    this.activeBatches.add(batchId);

    try {
      this.emit('batch-start', { batchId, size: batch.length });

      const result = await this.processor(batch);

      this.emit('batch-complete', { batchId, size: batch.length, result });
      return result;
    } catch (error) {
      this.errorCount++;
      this.emit('batch-error', { batchId, size: batch.length, error });

      // Retry logic
      if (this.errorCount <= this.options.errorRetries) {
        this.logger.warn({
          batch_id: batchId,
          error_count: this.errorCount,
          max_retries: this.options.errorRetries,
          error: {
            message: error.message,
            stack: error.stack,
            name: error.name
          }
        }, 'Batch processing failed, retrying');
        return this.processBatch(batch);
      }

      throw error;
    } finally {
      this.activeBatches.delete(batchId);
    }
  }

  /**
   * Process multiple batches with concurrency control
   * @param {Array} batches - Array of batches to process
   * @returns {Promise<Array>} Array of results
   */
  async processBatches(batches) {
    const results = [];
    const semaphore = new Semaphore(this.options.maxConcurrency);

    const promises = batches.map(async (batch, index) => {
      await semaphore.acquire();
      try {
        const result = await this.processBatch(batch);
        results[index] = result;
        return result;
      } finally {
        semaphore.release();
      }
    });

    await Promise.all(promises);
    return results;
  }

  getStats() {
    return {
      ...super.getStats(),
      activeBatches: this.activeBatches.size
    };
  }
}

/**
 * Data Chunker
 *
 * Splits data streams into configurable chunks
 */
export class DataChunker {
  constructor(options = {}) {
    this.options = {
      chunkSize: options.chunkSize || 1000,
      overlap: options.overlap || 0,
      ...options
    };
  }

  /**
   * Split array into chunks
   * @param {Array} data - Data to chunk
   * @returns {Array} Array of chunks
   */
  chunk(data) {
    if (!Array.isArray(data)) {
      throw new Error('Data must be an array');
    }

    const chunks = [];
    const chunkSize = this.options.chunkSize;
    const overlap = this.options.overlap;
    const step = chunkSize - overlap;

    for (let i = 0; i < data.length; i += step) {
      const chunk = data.slice(i, i + chunkSize);
      if (chunk.length > 0) {
        chunks.push(chunk);
      }
    }

    return chunks;
  }

  /**
   * Create streaming chunker
   * @param {Function} onChunk - Callback for each chunk
   * @returns {Object} Chunker interface
   */
  createStreamingChunker(onChunk) {
    let buffer = [];

    return {
      add: (items) => {
        buffer.push(...items);
        while (buffer.length >= this.options.chunkSize) {
          const chunk = buffer.splice(0, this.options.chunkSize);
          onChunk(chunk);
        }
      },
      flush: () => {
        if (buffer.length > 0) {
          onChunk([...buffer]);
          buffer = [];
        }
      },
      getBufferSize: () => buffer.length
    };
  }
}
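/**
 * Example: chunking data and processing it with bounded concurrency.
 *
 * A minimal usage sketch (the processor callback and the rows array are
 * hypothetical). DataChunker splits the input, and BatchProcessor runs the
 * callback over the chunks, at most maxConcurrency at a time.
 *
 * @example
 * const chunker = new DataChunker({ chunkSize: 500 });
 * const processor = new BatchProcessor(
 *   async (batch) => batch.length,          // stand-in for real work
 *   { maxConcurrency: 4, errorRetries: 2 }
 * );
 *
 * processor.on('batch-error', ({ batchId, error }) => {
 *   console.error('batch failed', batchId, error.message);
 * });
 *
 * const results = await processor.processBatches(chunker.chunk(rows));
 * console.log(results.reduce((a, b) => a + b, 0), 'items processed');
 */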
/**
 * Stream Buffer
 *
 * Provides buffering for smooth data flow
 */
export class StreamBuffer extends EventEmitter {
  constructor(options = {}) {
    super();
    this.options = {
      maxSize: options.maxSize || 10000,
      lowWaterMark: options.lowWaterMark || 2000,
      highWaterMark: options.highWaterMark || 8000,
      ...options
    };

    this.buffer = [];
    this.isReading = false;
    this.isDraining = false;
  }

  /**
   * Write data to buffer
   * @param {*} data - Data to write
   * @returns {boolean} Whether more data can be written
   */
  write(data) {
    if (this.buffer.length >= this.options.maxSize) {
      this.emit('overflow', { bufferSize: this.buffer.length });
      return false;
    }

    this.buffer.push(data);

    if (this.buffer.length >= this.options.highWaterMark && !this.isDraining) {
      this.emit('drain-needed');
    }

    this.emit('data', data);
    return true;
  }

  /**
   * Read data from buffer
   * @param {number} count - Number of items to read
   * @returns {Array} Read items
   */
  read(count = 1) {
    const items = this.buffer.splice(0, count);

    if (this.buffer.length <= this.options.lowWaterMark && this.isDraining) {
      this.isDraining = false;
      this.emit('drained');
    }

    return items;
  }

  /**
   * Check if buffer needs draining
   * @returns {boolean}
   */
  needsDraining() {
    return this.buffer.length >= this.options.highWaterMark;
  }

  /**
   * Get buffer statistics
   * @returns {Object}
   */
  getStats() {
    return {
      size: this.buffer.length,
      maxSize: this.options.maxSize,
      utilizationPercent: (this.buffer.length / this.options.maxSize) * 100,
      needsDraining: this.needsDraining(),
      isDraining: this.isDraining
    };
  }
}

/**
 * Semaphore for concurrency control
 */
class Semaphore {
  constructor(count) {
    this.count = count;
    this.waiting = [];
  }

  async acquire() {
    if (this.count > 0) {
      this.count--;
      return;
    }

    return new Promise(resolve => {
      this.waiting.push(resolve);
    });
  }

  release() {
    if (this.waiting.length > 0) {
      const resolve = this.waiting.shift();
      resolve();
    } else {
      this.count++;
    }
  }
}

/**
 * Rate Limiter for controlling processing speed
 */
export class RateLimiter {
  constructor(options = {}) {
    this.options = {
      requestsPerSecond: options.requestsPerSecond || 100,
      burstSize: options.burstSize || 10,
      ...options
    };

    this.tokens = this.options.burstSize;
    this.lastRefill = Date.now();
  }

  /**
   * Check if operation is allowed
   * @returns {boolean}
   */
  tryAcquire() {
    this.refillTokens();

    if (this.tokens > 0) {
      this.tokens--;
      return true;
    }

    return false;
  }

  /**
   * Wait for permission to proceed
   * @returns {Promise<void>}
   */
  async acquire() {
    while (!this.tryAcquire()) {
      const waitTime = 1000 / this.options.requestsPerSecond;
      await new Promise(resolve => setTimeout(resolve, waitTime));
    }
  }

  /**
   * Refill token bucket
   * @private
   */
  refillTokens() {
    const now = Date.now();
    const timePassed = now - this.lastRefill;
    const tokensToAdd = Math.floor((timePassed / 1000) * this.options.requestsPerSecond);

    if (tokensToAdd > 0) {
      this.tokens = Math.min(this.options.burstSize, this.tokens + tokensToAdd);
      this.lastRefill = now;
    }
  }
}

export default {
  StreamProcessor,
  BatchProcessor,
  DataChunker,
  StreamBuffer,
  RateLimiter
};
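/**
 * Example: pairing StreamBuffer with RateLimiter to smooth a bursty producer.
 *
 * A minimal sketch (the incoming iterable and handle() consumer are
 * hypothetical). The producer writes into the buffer and stops when write()
 * reports the buffer is full; the consumer drains it at a rate capped by the
 * token bucket.
 *
 * @example
 * const buffer = new StreamBuffer({ maxSize: 10000, highWaterMark: 8000 });
 * const limiter = new RateLimiter({ requestsPerSecond: 50, burstSize: 10 });
 *
 * buffer.on('drain-needed', () => console.warn('buffer filling up', buffer.getStats()));
 *
 * // Producer side: back off when write() returns false.
 * for (const item of incoming) {
 *   if (!buffer.write(item)) break;
 * }
 *
 * // Consumer side: at most requestsPerSecond reads per second.
 * while (buffer.getStats().size > 0) {
 *   await limiter.acquire();
 *   const [item] = buffer.read(1);
 *   await handle(item);
 * }
 */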