UNPKG

@naturalcycles/nodejs-lib

Version:
358 lines (357 loc) 12.5 kB
import { Readable } from 'node:stream'; import { pipeline } from 'node:stream/promises'; import { createGzip, createUnzip, createZstdCompress, createZstdDecompress } from 'node:zlib'; import { createAbortableSignal } from '@naturalcycles/js-lib'; import { _passthroughPredicate } from '@naturalcycles/js-lib/types'; import { fs2 } from '../fs/fs2.js'; import { zip2 } from '../zip/zip2.js'; import { createReadStreamAsNDJson } from './ndjson/createReadStreamAsNDJson.js'; import { transformJsonParse } from './ndjson/transformJsonParse.js'; import { transformToNDJson } from './ndjson/transformToNDJson.js'; import { createReadableFromAsync } from './readable/createReadable.js'; import { PIPELINE_GRACEFUL_ABORT } from './stream.util.js'; import { transformChunk } from './transform/transformChunk.js'; import { transformFilterSync } from './transform/transformFilter.js'; import { transformFlatten, transformFlattenIfNeeded } from './transform/transformFlatten.js'; // oxlint-disable-next-line import/no-cycle -- intentional cycle import { transformFork } from './transform/transformFork.js'; import { transformLimit } from './transform/transformLimit.js'; import { transformLogProgress } from './transform/transformLogProgress.js'; import { transformMap } from './transform/transformMap.js'; import { transformMapSimple } from './transform/transformMapSimple.js'; import { transformMapSync } from './transform/transformMapSync.js'; import { transformOffset } from './transform/transformOffset.js'; import { transformSplitOnNewline } from './transform/transformSplit.js'; import { transformTap, transformTapSync } from './transform/transformTap.js'; import { transformThrottle } from './transform/transformThrottle.js'; import { transformThrottleByRSS } from './transform/transformThrottleByRSS.js'; import { transformWarmup } from './transform/transformWarmup.js'; import { writablePushToArray } from './writable/writablePushToArray.js'; import { writableVoid } from './writable/writableVoid.js'; 
/**
 * Fluent builder around Node's stream `pipeline()`.
 *
 * Collects a source Readable plus an ordered list of Transforms, then runs them
 * into a destination Writable via `run()` (or one of the `to*()` terminal methods).
 * Supports graceful abort through a shared AbortSignal: `transformLimit`/`transformMap`
 * abort with PIPELINE_GRACEFUL_ABORT, which `run()` catches and swallows.
 */
export class Pipeline {
  source;
  transforms = [];
  destination;
  readableLimit;
  objectMode;
  // Shared signal passed to limit/map transforms and to pipeline() itself,
  // so any of them can abort the whole chain.
  abortableSignal = createAbortableSignal();

  constructor(source, objectMode = true) {
    this.source = source;
    this.objectMode = objectMode;
  }

  static from(source) {
    return new Pipeline(source);
  }

  /**
   * Useful in cases when Readable is not immediately available,
   * but only available after an async operation is completed.
   * Implemented via a proxy Transform, which should be transparent.
   */
  static fromAsyncReadable(fn) {
    return new Pipeline(createReadableFromAsync(fn));
  }

  static fromWeb(webReadableStream) {
    return new Pipeline(Readable.fromWeb(webReadableStream));
  }

  /**
   * Technically same as `fromIterable` (since Array is Iterable),
   * but named a bit friendlier.
   */
  static fromArray(input) {
    return new Pipeline(Readable.from(input));
  }

  static fromIterable(input) {
    return new Pipeline(Readable.from(input));
  }

  static fromNDJsonFile(sourceFilePath) {
    // Important that createReadStreamAsNDJson function is used
    // (and not Pipeline set of individual transforms),
    // because createReadStreamAsNDJson returns a Readable,
    // hence it allows to apply .take(limit) on it
    // e.g like Pipeline.fromNDJsonFile().limitSource(limit)
    return new Pipeline(createReadStreamAsNDJson(sourceFilePath));
  }

  static fromFile(sourceFilePath) {
    return new Pipeline(
      fs2.createReadStream(sourceFilePath, {
        highWaterMark: 64 * 1024, // no observed speedup
      }),
      false, // binary (non-object) mode
    );
  }

  /**
   * Limits the source Readable, but using `.take(limit)` on it.
   * This is THE preferred way of limiting the source.
   */
  limitSource(limit) {
    this.readableLimit = limit;
    return this;
  }

  /**
   * If possible - STRONGLY PREFER applying `.take(limit)` on the source Readable,
   * as it's a clean graceful way of limiting the Readable. Example:
   *
   * Pipeline.from(myReadable.take(10))
   *
   * or
   *
   * Pipeline
   *   .from(myReadable)
   *   .limitSource(10)
   *
   * If applying `take` on Readable is not possible - use this method at your own risk.
   * Why warning?
   * The limit works by aborting the stream, and then catching the error - certainly
   * less clean than `.take()` on the source.
   */
  limit(limit) {
    if (!this.transforms.length) {
      // fixed typo: "Transfrom" -> "Transform"
      console.warn(
        `Pipeline.limit was used as a very first Transform - please use Pipeline.limitSource instead`,
      );
      this.limitSource(limit);
      return this;
    }
    this.transforms.push(
      transformLimit({
        limit,
        signal: this.abortableSignal,
      }),
    );
    return this;
  }

  chunk(chunkSize, opt) {
    this.transforms.push(transformChunk(chunkSize, opt));
    return this;
  }

  flatten() {
    this.transforms.push(transformFlatten());
    return this;
  }

  flattenIfNeeded() {
    this.transforms.push(transformFlattenIfNeeded());
    return this;
  }

  // TransformLogProgressOptions intentionally doesn't have <T> passed, as it's inconvenient in many cases
  logProgress(opt) {
    this.transforms.push(transformLogProgress(opt));
    return this;
  }

  map(mapper, opt) {
    this.transforms.push(
      transformMap(mapper, {
        ...opt,
        signal: this.abortableSignal,
      }),
    );
    return this;
  }

  mapSync(mapper, opt) {
    this.transforms.push(
      transformMapSync(mapper, {
        ...opt,
        signal: this.abortableSignal,
      }),
    );
    return this;
  }

  mapSimple(mapper, opt) {
    this.transforms.push(transformMapSimple(mapper, opt));
    return this;
  }

  filter(asyncPredicate, opt) {
    // Implemented as an identity map with an async predicate
    this.transforms.push(
      transformMap(v => v, {
        asyncPredicate,
        ...opt,
        signal: this.abortableSignal,
      }),
    );
    return this;
  }

  filterSync(predicate, opt) {
    this.transforms.push(transformFilterSync(predicate, opt));
    return this;
  }

  offset(opt) {
    this.transforms.push(transformOffset(opt));
    return this;
  }

  tap(fn, opt) {
    this.transforms.push(transformTap(fn, opt));
    return this;
  }

  tapSync(fn, opt) {
    this.transforms.push(transformTapSync(fn, opt));
    return this;
  }

  throttle(opt) {
    this.transforms.push(transformThrottle(opt));
    return this;
  }

  throttleByRSS(opt) {
    this.transforms.push(transformThrottleByRSS(opt));
    return this;
  }

  /**
   * @experimental to be removed after transformMap2 is stable
   */
  warmup(opt) {
    this.transforms.push(transformWarmup(opt));
    return this;
  }

  transform(transform) {
    this.transforms.push(transform);
    return this;
  }

  /**
   * Helper method to add multiple transforms at once.
   * Not type safe! Prefer using singular `transform()` multiple times for type safety.
   */
  transformMany(transforms) {
    this.transforms.push(...transforms);
    return this;
  }

  fork(fn, opt) {
    this.transforms.push(transformFork(fn, opt));
    return this;
  }

  /**
   * Utility method just to conveniently type-cast the current Pipeline type.
   * No runtime effect.
   */
  typeCastAs() {
    return this;
  }

  setObjectMode(objectMode) {
    this.objectMode = objectMode;
    return this;
  }

  /**
   * Transform the stream of Objects into a stream of JSON lines.
   * Technically, it goes into objectMode=false, so it's a binary stream at the end.
   */
  toNDJson() {
    this.transforms.push(transformToNDJson());
    this.objectMode = false;
    return this;
  }

  parseNDJson() {
    // It was said that transformJsonParse() separately is 10% or more slower than .map(line => JSON.parse(line))
    // So, we can investigate a speedup
    this.transforms.push(transformSplitOnNewline(), transformJsonParse());
    this.objectMode = true;
    return this;
  }

  splitOnNewline() {
    // Input: objectMode=false - binary stream
    // Output: objectMode=true - stream of Buffer objects (which are also strings?)
    this.transforms.push(transformSplitOnNewline());
    this.objectMode = true;
    return this;
  }

  parseJson() {
    // Input: objectMode=false - takes a stream of strings one by one
    // Output: objectMode=true - stream of json-parsed Objects
    this.transforms.push(transformJsonParse());
    this.objectMode = true;
    return this;
  }

  gzip(opt) {
    this.transforms.push(
      createGzip({
        // chunkSize: 64 * 1024, // no observed speedup
        ...opt,
      }),
    );
    this.objectMode = false;
    return this;
  }

  gunzip(opt) {
    this.transforms.push(
      createUnzip({
        chunkSize: 64 * 1024, // speedup from ~3200 to 3800 rps!
        ...opt,
      }),
    );
    this.objectMode = false;
    return this;
  }

  zstdCompress(level, // defaults to 3
  opt) {
    this.transforms.push(createZstdCompress(zip2.zstdLevelToOptions(level, opt)));
    this.objectMode = false;
    return this;
  }

  zstdDecompress(opt) {
    this.transforms.push(
      createZstdDecompress({
        chunkSize: 64 * 1024, // todo: test it
        ...opt,
      }),
    );
    this.objectMode = false;
    return this;
  }

  async toArray(opt) {
    const arr = [];
    this.destination = writablePushToArray(arr, opt);
    await this.run();
    return arr;
  }

  async toFile(outputFilePath) {
    fs2.ensureFile(outputFilePath);
    this.destination = fs2.createWriteStream(outputFilePath);
    await this.run();
  }

  /**
   * level corresponds to zstd compression level (if filename ends with .zst),
   * or gzip compression level (if filename ends with .gz).
   * Default levels are:
   * gzip: 6
   * zstd: 3 (optimized for throughput, not size, may be larger than gzip at its default level)
   */
  async toNDJsonFile(outputFilePath, level) {
    fs2.ensureFile(outputFilePath);
    this.transforms.push(transformToNDJson());
    if (outputFilePath.endsWith('.gz')) {
      this.transforms.push(
        createGzip({
          level,
          // chunkSize: 64 * 1024, // no observed speedup
        }),
      );
    } else if (outputFilePath.endsWith('.zst')) {
      this.transforms.push(createZstdCompress(zip2.zstdLevelToOptions(level)));
    }
    this.destination = fs2.createWriteStream(outputFilePath, {
      // highWaterMark: 64 * 1024, // no observed speedup
    });
    await this.run();
  }

  async to(destination) {
    this.destination = destination;
    await this.run();
  }

  async forEach(fn, opt = {}) {
    this.transforms.push(
      transformMap(fn, {
        predicate: opt.logEvery ? _passthroughPredicate : undefined, // for the logger to work
        ...opt,
        signal: this.abortableSignal,
      }),
    );
    if (opt.logEvery) {
      this.transforms.push(transformLogProgress(opt));
    }
    await this.run();
  }

  async forEachSync(fn, opt = {}) {
    this.transforms.push(
      transformMapSync(fn, {
        predicate: opt.logEvery ? _passthroughPredicate : undefined, // for the logger to work
        ...opt,
        signal: this.abortableSignal,
      }),
    );
    if (opt.logEvery) {
      this.transforms.push(transformLogProgress(opt));
    }
    await this.run();
  }

  async run() {
    // Default to a black-hole destination if none was set
    this.destination ||= writableVoid();
    let { source } = this;
    if (this.readableLimit) {
      source = source.take(this.readableLimit);
    }
    try {
      await pipeline([source, ...this.transforms, this.destination], {
        signal: this.abortableSignal,
      });
    }
    catch (err) {
      // Graceful abort (e.g. from transformLimit) is expected - swallow it
      if (err instanceof Error && err.cause?.message === PIPELINE_GRACEFUL_ABORT) {
        console.log('pipeline gracefully aborted'); // todo: this message may be removed later
        return;
      }
      throw err;
    }
  }
}