UNPKG

bigbasealpha

Version:

Professional Grade Custom Database System - A sophisticated, dependency-free database with encryption, caching, indexing, and web dashboard

736 lines (606 loc) 20.7 kB
import { EventEmitter } from 'events'; import { promises as fs } from 'fs'; import { join, extname } from 'path'; import { createReadStream, createWriteStream } from 'fs'; import { pipeline } from 'stream/promises'; import { Transform } from 'stream'; /** * ETL & Data Pipeline Engine for BigBaseAlpha * Handles data extraction, transformation, and loading operations */ export class ETLEngine extends EventEmitter { constructor(config) { super(); this.config = config; this.basePath = config.path || './bigbase_data'; this.pipelinesPath = join(this.basePath, 'pipelines'); this.tempPath = join(this.basePath, 'temp'); // Pipeline management this.pipelines = new Map(); this.activePipelines = new Map(); this.scheduledJobs = new Map(); this.jobHistory = []; // Statistics this.stats = { totalPipelines: 0, totalJobs: 0, successfulJobs: 0, failedJobs: 0, totalRowsProcessed: 0, totalDataProcessed: 0 }; // Data transformation functions this.transformers = new Map(); this.validators = new Map(); this.extractors = new Map(); this.loaders = new Map(); this._initializeBuiltInComponents(); } /** * Initialize the ETL engine */ async init() { try { // Create necessary directories await this._ensureDirectories(); // Load existing pipelines await this._loadPipelines(); // Register built-in components this._registerBuiltInComponents(); // Start scheduler this._startScheduler(); console.log('✅ ETL Engine initialized'); this.emit('initialized'); } catch (error) { console.error('❌ ETL Engine initialization failed:', error); throw error; } } /** * Create a new data pipeline */ async createPipeline(config) { const pipeline = { id: config.id || this._generateId(), name: config.name, description: config.description || '', source: config.source, destination: config.destination, transformations: config.transformations || [], validations: config.validations || [], schedule: config.schedule || null, enabled: config.enabled !== false, created: new Date(), lastRun: null, stats: { totalRuns: 0, successfulRuns: 0, failedRuns: 0, totalRowsProcessed: 0, averageRunTime: 0 } }; this.pipelines.set(pipeline.id, pipeline); // Save pipeline configuration await this._savePipeline(pipeline); // Schedule if needed if (pipeline.schedule && pipeline.enabled) { this._schedulePipeline(pipeline); } this.stats.totalPipelines++; this.emit('pipelineCreated', pipeline); return pipeline; } /** * Execute a pipeline */ async executePipeline(pipelineId, options = {}) { const pipeline = this.pipelines.get(pipelineId); if (!pipeline) { throw new Error(`Pipeline '${pipelineId}' not found`); } if (this.activePipelines.has(pipelineId)) { throw new Error(`Pipeline '${pipelineId}' is already running`); } const execution = { id: this._generateId(), pipelineId, startTime: new Date(), endTime: null, status: 'running', rowsProcessed: 0, errors: [], warnings: [], progress: 0 }; this.activePipelines.set(pipelineId, execution); this.emit('pipelineStarted', execution); try { // Extract data console.log(`📤 Extracting data from: ${pipeline.source.type}`); const extractedData = await this._extractData(pipeline.source); // Transform data console.log(`🔄 Transforming data with ${pipeline.transformations.length} transformations`); const transformedData = await this._transformData(extractedData, pipeline.transformations); // Validate data if (pipeline.validations.length > 0) { console.log(`✅ Validating data with ${pipeline.validations.length} validators`); await this._validateData(transformedData, pipeline.validations); } // Load data console.log(`📥 Loading data to: ${pipeline.destination.type}`); await this._loadData(transformedData, pipeline.destination); // Update execution status execution.endTime = new Date(); execution.status = 'completed'; execution.rowsProcessed = transformedData.length; // Update pipeline stats pipeline.stats.totalRuns++; pipeline.stats.successfulRuns++; pipeline.stats.totalRowsProcessed += execution.rowsProcessed; pipeline.stats.averageRunTime = this._calculateAverageRunTime(pipeline); pipeline.lastRun = execution.endTime; // Update global stats this.stats.totalJobs++; this.stats.successfulJobs++; this.stats.totalRowsProcessed += execution.rowsProcessed; console.log(`✅ Pipeline '${pipeline.name}' completed successfully`); console.log(` Processed ${execution.rowsProcessed} rows in ${execution.endTime - execution.startTime}ms`); } catch (error) { execution.endTime = new Date(); execution.status = 'failed'; execution.errors.push({ message: error.message, stack: error.stack, timestamp: new Date() }); pipeline.stats.totalRuns++; pipeline.stats.failedRuns++; this.stats.totalJobs++; this.stats.failedJobs++; console.error(`❌ Pipeline '${pipeline.name}' failed:`, error.message); this.emit('pipelineError', { execution, error }); } finally { this.activePipelines.delete(pipelineId); this.jobHistory.unshift(execution); // Keep only last 100 executions if (this.jobHistory.length > 100) { this.jobHistory = this.jobHistory.slice(0, 100); } this.emit('pipelineCompleted', execution); await this._savePipeline(pipeline); } return execution; } /** * Extract data from various sources */ async _extractData(source) { const extractor = this.extractors.get(source.type); if (!extractor) { throw new Error(`No extractor found for source type: ${source.type}`); } return await extractor(source); } /** * Transform data using configured transformations */ async _transformData(data, transformations) { let result = [...data]; for (const transformation of transformations) { const transformer = this.transformers.get(transformation.type); if (!transformer) { console.warn(`Warning: Transformer '${transformation.type}' not found, skipping`); continue; } result = await transformer(result, transformation.config || {}); } return result; } /** * Validate data using configured validators */ async _validateData(data, validations) { for (const validation of validations) { const validator = this.validators.get(validation.type); if (!validator) { console.warn(`Warning: Validator '${validation.type}' not found, skipping`); continue; } const isValid = await validator(data, validation.config || {}); if (!isValid) { throw new Error(`Data validation failed: ${validation.type}`); } } } /** * Load data to various destinations */ async _loadData(data, destination) { const loader = this.loaders.get(destination.type); if (!loader) { throw new Error(`No loader found for destination type: ${destination.type}`); } return await loader(data, destination); } /** * Register built-in ETL components */ _registerBuiltInComponents() { // Extractors this.extractors.set('csv', this._extractFromCSV.bind(this)); this.extractors.set('json', this._extractFromJSON.bind(this)); this.extractors.set('collection', this._extractFromCollection.bind(this)); this.extractors.set('api', this._extractFromAPI.bind(this)); // Transformers this.transformers.set('map', this._transformMap.bind(this)); this.transformers.set('filter', this._transformFilter.bind(this)); this.transformers.set('aggregate', this._transformAggregate.bind(this)); this.transformers.set('join', this._transformJoin.bind(this)); this.transformers.set('normalize', this._transformNormalize.bind(this)); // Validators this.validators.set('schema', this._validateSchema.bind(this)); this.validators.set('uniqueness', this._validateUniqueness.bind(this)); this.validators.set('completeness', this._validateCompleteness.bind(this)); // Loaders this.loaders.set('csv', this._loadToCSV.bind(this)); this.loaders.set('json', this._loadToJSON.bind(this)); this.loaders.set('collection', this._loadToCollection.bind(this)); this.loaders.set('api', this._loadToAPI.bind(this)); } /** * Built-in Extractors */ async _extractFromCSV(source) { const data = []; const filePath = source.path; try { const content = await fs.readFile(filePath, 'utf8'); const lines = content.split('\n').filter(line => line.trim()); if (lines.length === 0) return data; const headers = lines[0].split(',').map(h => h.trim().replace(/"/g, '')); for (let i = 1; i < lines.length; i++) { const values = lines[i].split(',').map(v => v.trim().replace(/"/g, '')); const row = {}; headers.forEach((header, index) => { row[header] = values[index] || ''; }); data.push(row); } } catch (error) { throw new Error(`Failed to extract CSV data from ${filePath}: ${error.message}`); } return data; } async _extractFromJSON(source) { try { const content = await fs.readFile(source.path, 'utf8'); const data = JSON.parse(content); return Array.isArray(data) ? data : [data]; } catch (error) { throw new Error(`Failed to extract JSON data: ${error.message}`); } } async _extractFromCollection(source) { if (!this.database) { throw new Error('Database instance not available for collection extraction'); } try { const query = source.query || {}; return await this.database.find(source.collection, query, source.options || {}); } catch (error) { throw new Error(`Failed to extract from collection: ${error.message}`); } } async _extractFromAPI(source) { // API extraction would require HTTP client implementation throw new Error('API extraction not implemented yet'); } /** * Built-in Transformers */ async _transformMap(data, config) { const { mapping } = config; return data.map(row => { const newRow = {}; for (const [newField, oldField] of Object.entries(mapping)) { newRow[newField] = row[oldField]; } return newRow; }); } async _transformFilter(data, config) { const { condition } = config; return data.filter(row => { try { // Simple condition evaluation (can be enhanced) return eval(condition.replace(/\{\{(\w+)\}\}/g, 'row.$1')); } catch { return false; } }); } async _transformAggregate(data, config) { const { groupBy, aggregations } = config; const groups = {}; // Group data data.forEach(row => { const key = Array.isArray(groupBy) ? groupBy.map(field => row[field]).join('|') : row[groupBy]; if (!groups[key]) groups[key] = []; groups[key].push(row); }); // Apply aggregations const result = []; for (const [key, rows] of Object.entries(groups)) { const aggregated = {}; if (Array.isArray(groupBy)) { groupBy.forEach((field, index) => { aggregated[field] = key.split('|')[index]; }); } else { aggregated[groupBy] = key; } for (const [field, operation] of Object.entries(aggregations)) { switch (operation) { case 'sum': aggregated[field] = rows.reduce((sum, row) => sum + (Number(row[field]) || 0), 0); break; case 'avg': aggregated[field] = rows.reduce((sum, row) => sum + (Number(row[field]) || 0), 0) / rows.length; break; case 'count': aggregated[field] = rows.length; break; case 'min': aggregated[field] = Math.min(...rows.map(row => Number(row[field]) || 0)); break; case 'max': aggregated[field] = Math.max(...rows.map(row => Number(row[field]) || 0)); break; } } result.push(aggregated); } return result; } async _transformJoin(data, config) { // Join transformation implementation // This would require additional data source for joining return data; } async _transformNormalize(data, config) { const { fields } = config; return data.map(row => { const normalized = { ...row }; fields.forEach(field => { if (normalized[field]) { // Simple normalization - trim, lowercase normalized[field] = String(normalized[field]).trim().toLowerCase(); } }); return normalized; }); } /** * Built-in Validators */ async _validateSchema(data, config) { const { schema } = config; for (const row of data) { for (const [field, rules] of Object.entries(schema)) { if (rules.required && !(field in row)) { return false; } if (field in row && rules.type && typeof row[field] !== rules.type) { return false; } } } return true; } async _validateUniqueness(data, config) { const { fields } = config; const seen = new Set(); for (const row of data) { const key = fields.map(field => row[field]).join('|'); if (seen.has(key)) { return false; } seen.add(key); } return true; } async _validateCompleteness(data, config) { const { threshold = 0.9, requiredFields } = config; for (const field of requiredFields) { const filledRows = data.filter(row => row[field] && row[field] !== '').length; const completeness = filledRows / data.length; if (completeness < threshold) { return false; } } return true; } /** * Built-in Loaders */ async _loadToCSV(data, destination) { if (data.length === 0) return; const headers = Object.keys(data[0]); const csvContent = [ headers.join(','), ...data.map(row => headers.map(header => `"${row[header] || ''}"`).join(',')) ].join('\n'); await fs.writeFile(destination.path, csvContent, 'utf8'); } async _loadToJSON(data, destination) { await fs.writeFile(destination.path, JSON.stringify(data, null, 2), 'utf8'); } async _loadToCollection(data, destination) { if (!this.database) { throw new Error('Database instance not available for collection loading'); } for (const row of data) { await this.database.insert(destination.collection, row); } } async _loadToAPI(data, destination) { // API loading would require HTTP client implementation throw new Error('API loading not implemented yet'); } /** * Pipeline scheduling */ _schedulePipeline(pipeline) { if (this.scheduledJobs.has(pipeline.id)) { clearInterval(this.scheduledJobs.get(pipeline.id)); } const interval = this._parseSchedule(pipeline.schedule); if (interval) { const job = setInterval(async () => { try { await this.executePipeline(pipeline.id); } catch (error) { console.error(`Scheduled pipeline '${pipeline.name}' failed:`, error.message); } }, interval); this.scheduledJobs.set(pipeline.id, job); } } _parseSchedule(schedule) { const schedules = { 'hourly': 60 * 60 * 1000, 'daily': 24 * 60 * 60 * 1000, 'weekly': 7 * 24 * 60 * 60 * 1000, 'monthly': 30 * 24 * 60 * 60 * 1000 }; return schedules[schedule] || null; } /** * Utility methods */ getPipelines() { return Array.from(this.pipelines.values()); } getPipeline(id) { return this.pipelines.get(id); } getActivePipelines() { return Array.from(this.activePipelines.values()); } getJobHistory(limit = 50) { return this.jobHistory.slice(0, limit); } getStats() { return { ...this.stats, activePipelines: this.activePipelines.size, scheduledPipelines: this.scheduledJobs.size, totalPipelinesConfigured: this.pipelines.size }; } /** * Set database instance for collection operations */ setDatabase(database) { this.database = database; } /** * Cleanup and close */ async close() { // Clear all scheduled jobs for (const job of this.scheduledJobs.values()) { clearInterval(job); } this.scheduledJobs.clear(); // Cancel active pipelines this.activePipelines.clear(); console.log('✅ ETL Engine closed'); } // Private helper methods _generateId() { return `etl_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`; } async _ensureDirectories() { try { await fs.mkdir(this.pipelinesPath, { recursive: true }); await fs.mkdir(this.tempPath, { recursive: true }); } catch (error) { if (error.code !== 'EEXIST') throw error; } } async _loadPipelines() { try { const files = await fs.readdir(this.pipelinesPath); for (const file of files) { if (file.endsWith('.json')) { try { const content = await fs.readFile(join(this.pipelinesPath, file), 'utf8'); const pipeline = JSON.parse(content); this.pipelines.set(pipeline.id, pipeline); if (pipeline.schedule && pipeline.enabled) { this._schedulePipeline(pipeline); } } catch (error) { console.warn(`Failed to load pipeline from ${file}:`, error.message); } } } this.stats.totalPipelines = this.pipelines.size; } catch (error) { // Directory doesn't exist or is empty } } async _savePipeline(pipeline) { const filePath = join(this.pipelinesPath, `${pipeline.id}.json`); await fs.writeFile(filePath, JSON.stringify(pipeline, null, 2), 'utf8'); } _calculateAverageRunTime(pipeline) { // This would calculate based on job history return 0; // Placeholder } _startScheduler() { // Scheduler is already handled by individual pipeline scheduling console.log('📅 ETL Scheduler started'); } _initializeBuiltInComponents() { // Initialize component registries this.transformers.clear(); this.validators.clear(); this.extractors.clear(); this.loaders.clear(); } /** * Shutdown ETL Engine */ async shutdown() { console.log('🔄 Shutting down ETL Engine...'); // Stop all active pipelines for (const [id, pipeline] of this.activePipelines) { try { await this.stopPipeline(id); } catch (error) { console.error(`Error stopping pipeline ${id}:`, error.message); } } // Clear scheduled jobs for (const [id, job] of this.scheduledJobs) { if (job.timer) { clearInterval(job.timer); } } // Clear data structures this.pipelines.clear(); this.activePipelines.clear(); this.scheduledJobs.clear(); this.transformers.clear(); this.validators.clear(); this.extractors.clear(); this.loaders.clear(); console.log('✅ ETL Engine shutdown complete'); } } export default ETLEngine;