@flowlab/all
Version:
A cool library focusing on handling various flows
64 lines (58 loc) • 2.98 kB
text/typescript
// src/extractors/fileExtractor.ts
import * as fs from 'fs'; // Use sync fs for createReadStream
import * as readline from 'readline';
import { IExtractor, PipelineContext, DataSource, FileSourceConfig } from '../core/interfaces';
import { ComponentError } from '../core/errors';
/**
 * Streams records out of a local file, one record per line.
 *
 * `config.format` selects how each line becomes a record:
 * - `'json'`: every non-blank line is parsed as one JSON document (JSON Lines);
 *   lines that fail to parse are logged and skipped.
 * - `'csv'`: every line is split on commas into an array of strings. There is
 *   no quoting, escaping or header handling.
 * - any other value: the raw line is yielded as a string.
 */
export class FileExtractor implements IExtractor<string | object> {
  private readonly config: FileSourceConfig;

  /**
   * @param config - Source description; `path` and `format` must both be truthy.
   *   `encoding` is optional and falls back to `'utf-8'`.
   * @throws ComponentError when `path` or `format` is missing.
   */
  constructor(config: FileSourceConfig) {
    if (!config.path || !config.format) {
      throw new ComponentError('FileExtractor requires "path" and "format" in config.');
    }
    this.config = config;
  }

  /**
   * Logs what is about to be read and returns a lazy async iterable over the
   * file's records. The file is not opened until the caller starts iterating.
   *
   * @param context - Pipeline context; only `context.logger` is used.
   * @returns An async iterable yielding one record per input line.
   */
  async extract(context: PipelineContext): Promise<DataSource<string | object>> {
    context.logger.info(`Extracting data from file: ${this.config.path} (format: ${this.config.format})`);
    // An AsyncIterable keeps memory flat for large files.
    return this.extractAsStream(context);
  }

  /**
   * Reads the configured file line by line and yields one record per line.
   *
   * @param context - Pipeline context; only `context.logger` is used.
   * @throws ComponentError wrapping any failure raised while reading the file,
   *   including errors emitted by the underlying file stream (e.g. ENOENT).
   */
  private async *extractAsStream(context: PipelineContext): AsyncIterable<string | object> {
    // `||` (not `??`) on purpose: an empty-string encoding also falls back to utf-8.
    const fileStream = fs.createReadStream(this.config.path, { encoding: this.config.encoding || 'utf-8' });
    const rl = readline.createInterface({
      input: fileStream,
      crlfDelay: Infinity, // Treat \r\n as a single line break
    });

    // The readline async iterator does not reliably forward errors from its
    // input stream. Without a listener, a failing file stream would emit an
    // unhandled 'error' event instead of reaching the catch block below.
    // Record the error and close the interface so the loop ends and the
    // failure can be rethrown from inside the try block.
    const streamFailure: { error?: Error } = {};
    fileStream.on('error', (err: Error) => {
      streamFailure.error = err;
      rl.close();
    });

    let lineNum = 0;
    try {
      for await (const line of rl) {
        lineNum++;
        if (this.config.format === 'json') {
          // JSON Lines: one JSON document per line; blank lines are ignored.
          if (!line.trim()) {
            continue;
          }
          let record: string | object;
          try {
            record = JSON.parse(line);
          } catch (error) {
            context.logger.error({ err: error, line, lineNum, file: this.config.path }, `Skipping invalid JSON line`);
            continue;
          }
          // Yield outside the parse try/catch so an error thrown into the
          // generator by the consumer is not mistaken for a bad JSON line.
          yield record;
        } else if (this.config.format === 'csv') {
          // Naive comma split; use a real CSV parser (e.g. 'csv-parse') when
          // quoted fields or embedded commas matter.
          // TODO: Add header handling / object creation based on header
          yield line.split(',');
        } else {
          // Plain text: pass the line through untouched.
          yield line;
        }
      }
      if (streamFailure.error) {
        throw streamFailure.error;
      }
      context.logger.info(`Finished reading file ${this.config.path}, ${lineNum} lines processed.`);
    } catch (error: unknown) {
      context.logger.error({ err: error, file: this.config.path }, `Error reading file stream`);
      const cause = error instanceof Error ? error : new Error(String(error));
      throw new ComponentError(`Error reading file ${this.config.path}`, 'FileExtractor', cause);
    } finally {
      // Runs on completion, on error, and when the consumer stops iterating
      // early; make sure the file descriptor is released in every case.
      if (!fileStream.destroyed) {
        fileStream.destroy();
      }
    }
  }
}