ak-fetch
Production-ready HTTP client for bulk operations with connection pooling, exponential backoff, streaming, and comprehensive error handling
/**
* Stream processing utilities for efficient data handling
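* @example
* // End-to-end usage sketch (file path is illustrative): JSONL file -> batches of 100 -> console
* import { createReadStream } from 'fs';
* const processors = new StreamProcessors();
* createReadStream('./input.jsonl')
* .pipe(processors.createJSONLTransform())
* .pipe(processors.createBatchTransform(100))
* .on('data', batch => console.log(`got ${batch.length} records`));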
*/
import { Transform, Readable, Writable } from 'stream';
import { createWriteStream } from 'fs';
import { pipeline } from 'stream/promises';
import { MemoryError } from './errors.js';
class StreamProcessors {
constructor(options = {}) {
this.highWaterMark = options.highWaterMark || 16384;
this.maxMemoryUsage = options.maxMemoryUsage || 1024 * 1024 * 1024; // 1GB default
}
/**
* Transform JSONL to JSON objects
* @param {Object} options - Transform options
* @returns {Transform} Transform stream
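* @example
* // Usage sketch (file name is illustrative): parse newline-delimited JSON into objects
* import { createReadStream } from 'fs';
* const processors = new StreamProcessors();
* createReadStream('./data.jsonl')
* .pipe(processors.createJSONLTransform())
* .on('data', obj => console.log(obj));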
*/
createJSONLTransform(options = {}) {
let buffer = '';
const highWaterMark = options.highWaterMark || this.highWaterMark;
return new Transform({
readableObjectMode: true,
highWaterMark,
transform(chunk, encoding, callback) {
buffer += chunk.toString();
let lines = buffer.split('\n');
buffer = lines.pop() || ''; // Keep incomplete line
for (const line of lines) {
const trimmedLine = line.trim();
if (trimmedLine) {
try {
this.push(JSON.parse(trimmedLine));
} catch (error) {
// Report the parse failure through the callback so the stream errors cleanly instead of stalling
callback(new Error(`Invalid JSON line: ${trimmedLine}`));
return;
}
}
}
callback();
},
flush(callback) {
if (buffer.trim()) {
try {
this.push(JSON.parse(buffer));
} catch (error) {
callback(new Error(`Invalid JSON in buffer: ${buffer}`));
return;
}
}
callback();
}
});
}
/**
* Transform objects to JSONL format
* @param {Object} options - Transform options
* @returns {Transform} Transform stream
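* @example
* // Usage sketch: serialize a stream of objects to JSONL on stdout
* import { Readable } from 'stream';
* const processors = new StreamProcessors();
* Readable.from([{ a: 1 }, { b: 2 }])
* .pipe(processors.createJSONLStringifyTransform())
* .pipe(process.stdout); // {"a":1}\n{"b":2}\n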
*/
createJSONLStringifyTransform(options = {}) {
return new Transform({
writableObjectMode: true,
readableObjectMode: false,
transform(chunk, encoding, callback) {
try {
const jsonString = JSON.stringify(chunk) + '\n';
callback(null, jsonString);
} catch (error) {
callback(new Error(`Failed to stringify object: ${error.message}`));
}
}
});
}
/**
* Transform objects to CSV format
* @param {Object} options - Transform options
* @returns {Transform} Transform stream
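* @example
* // Usage sketch: convert objects to CSV; headers are detected from the first object
* import { Readable } from 'stream';
* const processors = new StreamProcessors();
* Readable.from([{ id: 1, name: 'a' }, { id: 2, name: 'b' }])
* .pipe(processors.createCSVTransform())
* .pipe(process.stdout); // id,name then "1","a" then "2","b"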
*/
createCSVTransform(options = {}) {
let isFirstRow = true;
let headers = options.headers || null;
return new Transform({
writableObjectMode: true,
readableObjectMode: false,
transform(chunk, encoding, callback) {
try {
// Emit the header row once; auto-detect headers from the first object if none were provided
if (isFirstRow) {
if (!headers) {
headers = Object.keys(chunk);
}
this.push(headers.join(',') + '\n');
}
// Ensure all nested objects are stringified
const processedChunk = { ...chunk };
for (const key in processedChunk) {
if (typeof processedChunk[key] === 'object' && processedChunk[key] !== null) {
processedChunk[key] = JSON.stringify(processedChunk[key]);
}
}
// Create CSV row
const row = headers.map(header => {
const value = processedChunk[header] ?? ''; // ?? preserves falsy values such as 0 and false
const stringValue = String(value).replace(/"/g, '""');
return `"${stringValue}"`;
}).join(',') + '\n';
isFirstRow = false;
callback(null, row);
} catch (error) {
callback(new Error(`Failed to create CSV row: ${error.message}`));
}
}
});
}
/**
* Batch objects into arrays
* @param {number} batchSize - Size of each batch
* @param {Object} options - Batch options
* @returns {Transform} Transform stream
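* @example
* // Usage sketch (items and handleBatch are illustrative): group items into arrays of 100
* import { Readable } from 'stream';
* const processors = new StreamProcessors();
* Readable.from(items)
* .pipe(processors.createBatchTransform(100))
* .on('data', batch => handleBatch(batch)); // batch is an array of up to 100 items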
*/
createBatchTransform(batchSize, options = {}) {
let batch = [];
const flushIncomplete = options.flushIncomplete !== false;
return new Transform({
objectMode: true,
transform(chunk, encoding, callback) {
batch.push(chunk);
if (batch.length >= batchSize) {
callback(null, batch);
batch = [];
} else {
callback();
}
},
flush(callback) {
if (batch.length > 0 && flushIncomplete) {
callback(null, batch);
} else {
callback();
}
}
});
}
/**
* Memory monitoring transform
* @param {Object} options - Memory options
* @returns {Transform} Transform stream
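* @example
* // Usage sketch (source/destination are illustrative): fail fast past 512 MB of heap, checking every 1000 items
* const monitor = new StreamProcessors().createMemoryMonitorTransform({
* maxMemory: 512 * 1024 * 1024,
* checkInterval: 1000
* });
* source.pipe(monitor).pipe(destination);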
*/
createMemoryMonitorTransform(options = {}) {
const maxMemory = options.maxMemory || this.maxMemoryUsage;
const checkInterval = options.checkInterval || 100;
let itemCount = 0;
return new Transform({
objectMode: true,
transform(chunk, encoding, callback) {
itemCount++;
if (itemCount % checkInterval === 0) {
const memUsage = process.memoryUsage();
if (memUsage.heapUsed > maxMemory) {
callback(new MemoryError(`Memory usage exceeded limit: ${memUsage.heapUsed} > ${maxMemory}`, {
memoryUsage: memUsage,
limit: maxMemory
}));
return;
}
}
callback(null, chunk);
}
});
}
/**
* Create a backpressure-aware transform
* @param {Function} transformFn - Transform function
* @param {Object} options - Transform options
* @returns {Transform} Transform stream
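* @example
* // Usage sketch (source/destination are illustrative): apply an async transform with at most 50 buffered items
* const enrich = new StreamProcessors().createBackpressureTransform(
* async item => ({ ...item, processedAt: Date.now() }),
* { maxBuffer: 50 }
* );
* source.pipe(enrich).pipe(destination);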
*/
createBackpressureTransform(transformFn, options = {}) {
const maxBuffer = options.maxBuffer || 100;
return new Transform({
objectMode: true,
// highWaterMark is what provides the backpressure: writes pause once maxBuffer items are queued
highWaterMark: maxBuffer,
transform(chunk, encoding, callback) {
try {
const result = transformFn(chunk);
if (result instanceof Promise) {
result
.then(data => callback(null, data))
.catch(error => callback(error));
} else {
callback(null, result);
}
} catch (error) {
callback(error);
}
}
});
}
/**
* Create a rate-limited transform
* @param {number} maxPerSecond - Maximum items per second
* @param {Object} options - Rate limit options
* @returns {Transform} Transform stream
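* @example
* // Usage sketch (source and postItem are illustrative): throttle to at most 10 items per second
* const throttle = new StreamProcessors().createRateLimitTransform(10);
* source.pipe(throttle).on('data', item => postItem(item));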
*/
createRateLimitTransform(maxPerSecond, options = {}) {
const intervalMs = 1000 / maxPerSecond;
let lastProcessTime = 0;
return new Transform({
objectMode: true,
async transform(chunk, encoding, callback) {
const now = Date.now();
const timeSinceLastProcess = now - lastProcessTime;
if (timeSinceLastProcess < intervalMs) {
const delay = intervalMs - timeSinceLastProcess;
await new Promise(resolve => setTimeout(resolve, delay));
}
lastProcessTime = Date.now();
callback(null, chunk);
}
});
}
/**
* Create a stream that writes to multiple outputs
* @param {Array} outputs - Array of writable streams
* @param {Object} options - Tee options
* @returns {Writable} Writable stream
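* @example
* // Usage sketch (source and log path are illustrative): mirror one stream to a file and stdout
* import { createWriteStream } from 'fs';
* const tee = new StreamProcessors().createTeeStream([
* createWriteStream('./copy.log'),
* process.stdout
* ]);
* source.pipe(tee);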
*/
createTeeStream(outputs, options = {}) {
return new Writable({
objectMode: options.objectMode || false,
write(chunk, encoding, callback) {
let completed = 0;
const totalOutputs = outputs.length;
if (totalOutputs === 0) {
callback();
return;
}
let hasError = false;
outputs.forEach(output => {
output.write(chunk, encoding, (error) => {
if (error && !hasError) {
hasError = true;
callback(error);
return;
}
completed++;
if (completed === totalOutputs && !hasError) {
callback();
}
});
});
}
});
}
/**
* Stream data to file with format selection
* @param {Readable} dataStream - Input stream of objects
* @param {string} filePath - Output file path
* @param {string} format - Output format (json, csv, ndjson)
* @param {Object} options - Stream options
* @returns {Promise<string>} File path when complete
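* @example
* // Usage sketch (results and path are illustrative): persist a stream of objects as newline-delimited JSON
* import { Readable } from 'stream';
* const processors = new StreamProcessors();
* await processors.streamToFile(Readable.from(results), './out.jsonl', 'jsonl');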
*/
async streamToFile(dataStream, filePath, format = 'json', options = {}) {
const writeStream = createWriteStream(filePath, { encoding: 'utf8' });
try {
switch (format.toLowerCase()) {
case 'json':
await this.streamJSON(dataStream, writeStream, options);
break;
case 'csv':
await this.streamCSV(dataStream, writeStream, options);
break;
case 'ndjson':
case 'jsonl':
await this.streamNDJSON(dataStream, writeStream, options);
break;
default:
throw new Error(`Unsupported format: ${format}`);
}
return filePath;
} catch (error) {
writeStream.destroy();
throw error;
}
}
/**
* Stream data as JSON array
* @param {Readable} dataStream - Input stream
* @param {Writable} writeStream - Output stream
* @param {Object} options - Options
*/
async streamJSON(dataStream, writeStream, options = {}) {
writeStream.write('[');
let isFirst = true;
const transformStream = new Transform({
objectMode: true,
transform(chunk, encoding, callback) {
const prefix = isFirst ? '' : ',';
isFirst = false;
callback(null, prefix + JSON.stringify(chunk, null, options.indent));
}
});
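// end: false leaves writeStream open so the closing ']' can be appended once the pipeline drains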
await pipeline(dataStream, transformStream, writeStream, { end: false });
writeStream.write(']');
// Attach listeners before ending, then wait for the write stream to finish flushing to disk
await new Promise((resolve, reject) => {
writeStream.once('finish', resolve);
writeStream.once('error', reject);
writeStream.end();
});
}
/**
* Stream data as CSV
* @param {Readable} dataStream - Input stream
* @param {Writable} writeStream - Output stream
* @param {Object} options - Options
*/
async streamCSV(dataStream, writeStream, options = {}) {
const csvTransform = this.createCSVTransform(options);
await pipeline(dataStream, csvTransform, writeStream);
}
/**
* Stream data as NDJSON
* @param {Readable} dataStream - Input stream
* @param {Writable} writeStream - Output stream
* @param {Object} options - Options
*/
async streamNDJSON(dataStream, writeStream, options = {}) {
const ndjsonTransform = this.createJSONLStringifyTransform(options);
await pipeline(dataStream, ndjsonTransform, writeStream);
}
/**
* Get unique keys from a stream of objects
* @param {Readable} dataStream - Input stream
* @returns {Promise<Array>} Array of unique keys
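* @example
* // Usage sketch: collect every distinct key across a stream of objects
* import { Readable } from 'stream';
* const keys = await new StreamProcessors().getUniqueKeys(
* Readable.from([{ a: 1 }, { b: 2, a: 3 }])
* ); // ['a', 'b']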
*/
async getUniqueKeys(dataStream) {
const keysSet = new Set();
const keyExtractor = new Writable({
objectMode: true,
write(chunk, encoding, callback) {
if (chunk && typeof chunk === 'object') {
Object.keys(chunk).forEach(key => keysSet.add(key));
}
callback();
}
});
await pipeline(dataStream, keyExtractor);
return Array.from(keysSet);
}
}
export default StreamProcessors;