@flowlab/all
Version:
A cool library focusing on handling various flows
402 lines (359 loc) • 17.1 kB
text/typescript
// src/core/pipeline.ts
import { randomUUID } from 'node:crypto';
import pLimit from 'p-limit'; // Import p-limit
import { Logger } from 'pino';
// import { v4 as uuidv4 } from 'uuid'; // Use uuid for run IDs
import {
IExtractor,
ICleaner,
ITransformer,
ILoader,
DataSource,
PipelineContext,
IFlowLabNode,
IFlowLabRegistry,
BaseSourceConfig,
BaseTargetConfig,
SourceConfig,
TargetConfig,
FailedItemInfo,
ProcessingErrorStrategy,
FileTargetConfig,
} from './interfaces';
import { loadPipelineFromConfig as loadConfig } from '../config/loader';
import { FileDlqLoader } from '../loaders/fileDlqLoader';
import { createLogger } from '../utils/logger';
import { performRetry } from '../utils/retry';
import { createLoader as createComponentLoader } from './registry';
// Helper type for pipeline steps.
// An intermediate stage is either a cleaner (parameterised by TInput only) or a
// transformer (parameterised by TInput and TOutput). The type guards below tell
// the two apart at runtime by checking for a `clean` or `transform` function.
type PipelineStep<TInput, TOutput> = ICleaner<TInput> | ITransformer<TInput, TOutput>;
// Type guards
/**
 * Runtime check that a value exposes a callable `clean` member.
 *
 * @param step - Any value; `null` and `undefined` are accepted and yield `false`
 *   instead of throwing on the property access.
 * @returns `true` when `step.clean` is a function.
 */
function isCleaner<T>(step: unknown): step is ICleaner<T> {
  // `!= null` rejects both null and undefined before the property read.
  return step != null && typeof (step as { clean?: unknown }).clean === 'function';
}
/**
 * Runtime check that a value exposes a callable `transform` member.
 *
 * @param step - Any value; `null` and `undefined` are accepted and yield `false`
 *   instead of throwing on the property access.
 * @returns `true` when `step.transform` is a function.
 */
function isTransformer<TInput, TOutput>(step: unknown): step is ITransformer<TInput, TOutput> {
  // `!= null` rejects both null and undefined before the property read.
  return step != null && typeof (step as { transform?: unknown }).transform === 'function';
}
/**
 * Runtime check that a data source implements the async-iteration protocol,
 * i.e. has a callable `Symbol.asyncIterator` member.
 *
 * @param dataSource - The value returned by an extractor. A `null`/`undefined`
 *   value yields `false` rather than throwing on the symbol lookup.
 * @returns `true` when `dataSource[Symbol.asyncIterator]` is a function.
 */
function isAsyncIterable<T>(dataSource: DataSource<T>): dataSource is AsyncIterable<T> {
  const candidate: unknown = dataSource;
  return (
    candidate != null &&
    typeof (candidate as { [Symbol.asyncIterator]?: unknown })[Symbol.asyncIterator] === 'function'
  );
}
// MARK: - DataPipeline
export class DataPipeline<TStart, TEnd> implements IFlowLabNode {
// Node identifier, assigned once in the constructor.
readonly id: string; // Can be user-defined or generated
// Constant node-type tag for this class.
readonly type = '@flowlab/data:pipeline';
// Source of items; null until extract() is called.
private extractorInstance: IExtractor<TStart> | null = null;
// Cleaners and transformers in the order they were added via clean()/transform();
// run() applies them to each item in this order.
private steps: PipelineStep<any, any>[] = [];
// Destination for processed items; null until load() is called.
private loaderInstance: ILoader<TEnd> | null = null;
// Runtime options; defaults are set in the constructor and overridden via configure().
private config: {
// run() hands the pending batch to the loader once it holds at least this many items.
batchSize: number;
// Passed to performRetry together with retryDelay for the extract and load calls.
retries: number;
retryDelay: number;
// Base logger; run() derives a child logger from it.
logger: Logger;
// NOTE(review): declared and defaulted, but not read anywhere in this class as shown.
concurrency: number; // Add concurrency setting
// Consulted by handleProcessingError to decide between DLQ, log, skip and fail.
itemProcessingErrorStrategy: ProcessingErrorStrategy;
// Dead-letter loader created by configure() when a supported dlqTarget is given.
dlqLoaderInstance?: ILoader<FailedItemInfo>;
};
// Track types through the chain - using 'any' here for simplicity in the builder,
// but the run method ensures type flow based on step order. A more complex builder
// could maintain stricter compile-time types between steps.
// NOTE(review): in the visible code this field is only ever assigned null and never read.
private currentOutputType: any; // Internal helper
constructor(id?: string, options?: { logger?: Logger }) {
  // An omitted (or empty) id falls back to a generated `pipeline-<uuid>` value.
  // `randomUUID` from node:crypto is used because the `uuid` import in this file
  // is commented out, which left the previous `uuidv4()` call unresolved.
  this.id = id || `pipeline-${randomUUID()}`;
  // Defaults; every entry can be overridden later through configure().
  this.config = {
    batchSize: 100,
    retries: 3,
    retryDelay: 1000,
    logger: options?.logger || createLogger(),
    concurrency: 1, // Default to sequential processing
    itemProcessingErrorStrategy: 'fail', // Default to failing the pipeline on item error
    dlqLoaderInstance: undefined,
  };
  this.currentOutputType = null; // Placeholder
}
/**
* MARK: - Configure
* Configure pipeline options.
*/
configure(options: Partial<typeof this.config> & { dlqTarget?: TargetConfig }): this {
  // Merges `options` over the current configuration and, when `dlqTarget` is
  // given, tries to build a dead-letter-queue loader for it. Returns `this` so
  // calls can be chained.
  //
  // Strategy rules after the merge:
  //  - a DLQ loader was created and this call names no strategy -> 'dlq';
  //  - a DLQ loader was created but another strategy was named  -> kept, with a warning;
  //  - the DLQ loader could not be created and strategy is 'dlq' -> 'fail';
  //  - strategy is 'dlq' but no DLQ loader exists at all         -> 'fail'.
  const { dlqTarget, ...pipelineOptions } = options;
  this.config = { ...this.config, ...pipelineOptions };
  // Read the logger after the merge so a logger supplied in `options` is used below.
  const { logger } = this.config;
  const requestedStrategy = pipelineOptions.itemProcessingErrorStrategy;

  if (dlqTarget) {
    let dlqLoaderReady = false;
    try {
      // Only a file-backed DLQ is supported for now; other loader types could be
      // created through the component registry later.
      if (dlqTarget.type === 'file-dlq') {
        this.config.dlqLoaderInstance = new FileDlqLoader(dlqTarget as FileTargetConfig);
        dlqLoaderReady = true;
        logger.info(`Configured File DLQ Loader target: ${dlqTarget.path}`);
      } else {
        logger.warn(`Unsupported dlqTarget type: ${dlqTarget.type}. Only 'file-dlq' supported currently.`);
      }
    } catch (error) {
      logger.error({ err: error }, "Failed to instantiate DLQ loader. DLQ strategy disabled.");
    }

    if (dlqLoaderReady) {
      // Default to the DLQ strategy only when a loader really exists. Previously
      // this default was applied even after an unsupported target type, leaving
      // the strategy at 'dlq' with nothing to send failed items to.
      if (!requestedStrategy) {
        this.config.itemProcessingErrorStrategy = 'dlq';
      } else if (requestedStrategy !== 'dlq') {
        logger.warn(`DLQ target configured, but itemProcessingErrorStrategy is set to '${requestedStrategy}'. DLQ will not be used.`);
      }
    } else if (this.config.itemProcessingErrorStrategy === 'dlq') {
      this.config.itemProcessingErrorStrategy = 'fail';
      logger.warn("DLQ configured but loader couldn't be created. Defaulting item error strategy to 'fail'.");
    }
  } else if (this.config.itemProcessingErrorStrategy === 'dlq' && !this.config.dlqLoaderInstance) {
    // Check the configured loader, not just this call's arguments: a later
    // configure() call that only tweaks e.g. batchSize must not switch off a DLQ
    // that an earlier call set up.
    this.config.itemProcessingErrorStrategy = 'fail'; // Can't use DLQ strategy without a target
    logger.warn("itemProcessingErrorStrategy set to 'dlq' but no dlqTarget provided. Defaulting to 'fail'.");
  }
  return this;
}
/**
* MARK: - handleProcessingError
* Handle processing errors.
*/
private async handleProcessingError(
  error: Error,
  item: any,
  stepName: string,
  context: PipelineContext
): Promise<boolean> {
  // Resolves to true when the run may carry on with the next item and to false
  // when the caller should abort the pipeline.
  context.logger.error({ err: error, item, step: stepName }, `Error during pipeline step: ${stepName}`);

  const mode = this.config.itemProcessingErrorStrategy;
  const deadLetterLoader = this.config.dlqLoaderInstance;

  // The two "drop the item and move on" strategies differ only in log level.
  if (mode === 'log') {
    context.logger.warn(`Skipping item due to processing error in step ${stepName} (strategy: log).`);
    return true;
  }
  if (mode === 'skip') {
    context.logger.info(`Skipping item due to processing error in step ${stepName} (strategy: skip).`);
    return true;
  }

  // Anything that is not a usable DLQ setup ends the run: the 'fail' strategy
  // itself, and also 'dlq' when no loader instance is configured.
  if (mode !== 'dlq' || !deadLetterLoader) {
    context.logger.error(`Pipeline failed due to processing error in step ${stepName} (strategy: fail).`);
    return false;
  }

  // The whole original item is embedded in the record, so large items make large records.
  const deadLetter: FailedItemInfo = {
    pipelineId: this.id,
    runId: context.runId,
    timestamp: new Date().toISOString(),
    step: stepName,
    error: {
      message: error.message,
      stack: error.stack,
      name: error.name,
    },
    originalItem: item,
  };

  try {
    // loadBatch works on arrays, hence the single-element batch.
    await deadLetterLoader.loadBatch([deadLetter], context);
  } catch (dlqFailure) {
    context.logger.error({ err: dlqFailure }, `Failed to send item to DLQ! Falling back to 'fail' strategy for this run.`);
    return false;
  }
  context.logger.warn(`Item sent to DLQ due to processing error in step ${stepName}.`);
  return true;
}
/**
* MARK: - Extractor
* Set the data extractor.
*/
extract<TOutput>(extractor: IExtractor<TOutput>): DataPipeline<TOutput, TOutput> {
  // Replacing an existing extractor is permitted but flagged in the log.
  if (this.extractorInstance) {
    this.config.logger.warn({ pipelineId: this.id }, 'Extractor is already set. Overwriting.');
  }
  // The class generics cannot change on an existing instance, so the same object
  // is viewed through the new <TOutput, TOutput> signature and updated via that view.
  const retyped = this as unknown as DataPipeline<TOutput, TOutput>;
  retyped.extractorInstance = extractor;
  retyped.currentOutputType = null; // Reset output type marker
  return retyped;
}
/**
* MARK: - Cleaning
* Add a cleaning step. The output type remains the same.
*/
clean<TCurrent>(cleaner: ICleaner<TCurrent>): DataPipeline<TStart, TEnd> {
  // No compile-time compatibility check against the previous step is made here;
  // the cleaner is simply appended to the ordered step list.
  const pipeline: DataPipeline<TStart, TEnd> = this;
  pipeline.steps.push(cleaner);
  // A cleaner keeps the item type, so the pipeline's generics stay as they are.
  return pipeline;
}
/**
* MARK: - Transformation
* Add a transformation step. Changes the current data type.
* We need 'any' here in the builder pattern unless we use more complex generic chaining.
*/
transform<TCurrent, TNext>(transformer: ITransformer<TCurrent, TNext>): DataPipeline<TStart, any> {
  // After a transformer the end type is unknown to the builder until load() pins
  // it, so the same instance is handed back with TEnd widened to `any`.
  const widened = this as unknown as DataPipeline<TStart, any>;
  widened.steps.push(transformer);
  widened.currentOutputType = null; // Marks the tracked output type as changed
  return widened;
}
/**
* MARK: - Loading
* Set the data loader. This defines the final TEnd type for the pipeline.
*/
load<TCurrent>(loader: ILoader<TCurrent>): DataPipeline<TStart, TCurrent> {
  // Replacing an existing loader is permitted but flagged in the log.
  if (this.loaderInstance) {
    this.config.logger.warn({ pipelineId: this.id }, 'Loader is already set. Overwriting.');
  }
  // The loader's item type becomes the pipeline's TEnd; the same instance is
  // viewed through that signature and updated via the view.
  const retyped = this as unknown as DataPipeline<TStart, TCurrent>;
  retyped.loaderInstance = loader;
  return retyped;
}
/**
* MARK: - Execution
* Execute the pipeline.
* This is the core logic where data flows through steps.
*/
async run(): Promise<void> {
  // Flow: extract (with retries) -> push every item through the steps in order ->
  // collect results into batches -> load each batch (with retries).
  //
  // Throws when no extractor or loader is set, when extraction or loading still
  // fails after the configured retries, or when an item fails in a step and
  // handleProcessingError reports that the run must not continue.
  //
  // Items are processed one at a time; `config.concurrency` is not consulted here.
  const runId = randomUUID();
  const context: PipelineContext = {
    logger: this.config.logger.child({ pipelineId: this.id, runId }),
    runId,
    // Add any other initial context
  };
  context.logger.info('Pipeline run started.');

  // Capture both collaborators in guarded locals so the closures below need no
  // non-null assertions.
  const extractor = this.extractorInstance;
  const loader = this.loaderInstance;
  if (!extractor) throw new Error('Extractor is not defined for pipeline.');
  if (!loader) throw new Error('Loader is not defined for pipeline.');

  const { batchSize, retries, retryDelay } = this.config;
  let processedCount = 0;
  let loadedCount = 0;
  let errorCount = 0;
  let currentBatch: TEnd[] = []; // Batch for the loader

  try {
    const dataSource = await performRetry(
      () => extractor.extract(context),
      retries,
      retryDelay,
      context.logger,
      "Extractor"
    );

    // Runs one item through every step. Resolves to null when a step filtered the
    // item out (returned null/undefined) or when a step failed and the configured
    // error strategy allows the run to continue; rethrows the step error otherwise.
    const processItem = async (item: TStart): Promise<TEnd | null | undefined> => {
      let currentData: any = item; // Start with the extracted item
      for (const step of this.steps) {
        if (currentData === null || currentData === undefined) {
          return null; // Skip remaining steps if item was filtered out
        }
        try {
          if (isCleaner(step)) {
            currentData = await step.clean(currentData, context);
          } else if (isTransformer(step)) {
            currentData = await step.transform(currentData, context);
          }
        } catch (caught: unknown) {
          errorCount++;
          const stepError = caught instanceof Error ? caught : new Error(String(caught));
          // Delegate to the configured strategy (dlq / log / skip / fail). It logs
          // the failure itself, so nothing is logged here a second time.
          const shouldContinue = await this.handleProcessingError(
            stepError,
            item,
            step.constructor.name,
            context
          );
          if (!shouldContinue) throw stepError;
          return null;
        }
      }
      return currentData as TEnd; // Final type after all steps
    };

    // Hands the pending batch to the loader. A loader failure propagates to the
    // outer catch and fails the run; it is not treated as an item error.
    const flushBatch = async (): Promise<void> => {
      if (currentBatch.length === 0) return;
      const batch = currentBatch;
      context.logger.debug(`Loading batch of ${batch.length} items.`);
      await performRetry(
        () => loader.loadBatch(batch, context),
        retries,
        retryDelay,
        context.logger,
        "Loader"
      );
      loadedCount += batch.length;
      currentBatch = []; // Start a fresh batch only after a successful load
    };

    // Per-item work shared by the array and async-iterable paths.
    const handleItem = async (item: TStart): Promise<void> => {
      processedCount++;
      const result = await processItem(item);
      if (result !== null && result !== undefined) {
        currentBatch.push(result);
        if (currentBatch.length >= batchSize) {
          await flushBatch();
        }
      }
      if (processedCount % 1000 === 0) { // Log progress periodically
        context.logger.info(`Processed ${processedCount} items...`);
      }
    };

    // Process data (handles both arrays and async iterables)
    if (isAsyncIterable(dataSource)) {
      context.logger.info('Processing data as AsyncIterable (stream/batched extraction).');
      for await (const item of dataSource) {
        await handleItem(item);
      }
    } else { // Handle simple array input
      context.logger.info(`Processing data as an Array (${dataSource.length} items).`);
      for (const item of dataSource) {
        await handleItem(item);
      }
    }

    // Load any remaining items in the last batch
    await flushBatch();
    context.logger.info(
      `Pipeline run finished. Processed: ${processedCount}, Loaded: ${loadedCount}, Errors: ${errorCount}`
    );
  } catch (error: unknown) {
    context.logger.error({ err: error, runId }, 'Pipeline run failed.');
    throw error;
  }
}
// --- IFlowLabNode Implementation ---
/**
* MARK: - execute
* Basic execution for FlowLab integration. Assumes payload might contain
* overrides or context, but primarily runs the predefined pipeline.
*/
async execute(payload?: any, context?: any): Promise<void> {
  // The payload and context are only logged for now; they are not yet merged
  // into the pipeline's own context or configuration.
  const { logger } = this.config;
  const nodeId = this.id;
  logger.info({ payload, context }, `Executing pipeline node ${nodeId} via FlowLab.`);
  await this.run();
}
/**
* Static method to create a pipeline from a configuration object.
* This requires a way to instantiate components based on config strings.
* (Requires a registry or factory pattern - more complex setup)
*/
// MARK: - fromConfig
static fromConfig(config: /* PipelineConfig from interfaces.ts */ any, componentRegistry: any): DataPipeline<any, any> {
  // Placeholder: neither argument is used yet and every call throws.
  // Outstanding work:
  //   1. Parse config
  //   2. Instantiate Extractor based on config.source and registry
  //   3. Instantiate Steps based on config.steps and registry
  //   4. Instantiate Loader based on config.target and registry
  //   5. Configure pipeline options
  const notImplemented = new Error("fromConfig not yet implemented. Requires component registry.");
  throw notImplemented;
}
/**
* MARK: - register
* Registers the pipeline itself as a node.
* Individual components (extractors, loaders) could also be registered.
*/
register(registry: IFlowLabRegistry): void {
  // Hand this pipeline instance to the registry first, then record that in the log.
  const node: IFlowLabNode = this;
  registry.register(node);
  const { logger } = this.config;
  logger.info(`Registered pipeline node ${this.id} with FlowLab.`);
}
}