UNPKG

datapilot-cli

Version:

Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform

github.com/Mrassimo/datapilot

Mrassimo/datapilot

180 lines • 6.67 kB

JavaScript

"use strict"; /** * Simple Facade for Join Analysis - Phase 1 Only * Provides easy-to-use interface without overengineering */ Object.defineProperty(exports, "__esModule", { value: true }); exports.createJoinAnalyzer = createJoinAnalyzer; const join_analyzer_1 = require("./join-analyzer"); const join_formatter_1 = require("./join-formatter"); const memory_manager_1 = require("../../utils/memory-manager"); /** * Create a simple join analyzer with sensible defaults */ function createJoinAnalyzer(config = {}) { const analyzer = new join_analyzer_1.JoinAnalyzer({ maxTables: config.maxTables ?? 50, // Increased from 5 to 50 for scalability confidenceThreshold: config.confidenceThreshold ?? 0.7, enableFuzzyMatching: config.enableFuzzyMatching ?? true, enableSemanticAnalysis: config.enableSemanticAnalysis ?? true, enableTemporalJoins: false, // Phase 2 feature performanceMode: 'BALANCED', outputFormats: [{ type: 'MARKDOWN' }] }); const formatter = new join_formatter_1.JoinFormatter(); return { analyzer, formatter, // Scalable analyze method with batched processing async analyze(files) { if (files.length < 2) { throw new Error('Join analysis requires at least 2 files'); } // If files exceed the internal limit, process in batches if (files.length > 10) { return await processBatchedAnalysis(files, analyzer); } return await analyzer.analyzeJoins(files); }, // Simple format method format(result, format = 'markdown') { const outputFormat = { type: format.toUpperCase() }; return formatter.format(result, outputFormat); } }; } /** * Process files in batches to handle large datasets */ async function processBatchedAnalysis(files, analyzer) { const batchSize = 8; // Process files in batches of 8 const allCandidates = []; const allBusinessRules = []; const allRecommendations = []; let totalAnalysisTime = 0; let totalRows = 0; let highConfidenceJoins = 0; let potentialIssues = 0; // Process files in batches for (let i = 0; i < files.length; i += batchSize) { const batch = files.slice(i, i + batchSize); // Ensure we have at least 2 files in each batch for analysis if (batch.length < 2) { // For the last batch, include one file from the previous batch if (i > 0) { batch.unshift(files[i - 1]); } } try { const batchResult = await analyzer.analyzeJoins(batch); // Aggregate results allCandidates.push(...batchResult.candidates); allBusinessRules.push(...batchResult.businessRules); allRecommendations.push(...batchResult.recommendations); totalAnalysisTime += batchResult.summary.analysisTime; totalRows += batchResult.summary.totalRows; highConfidenceJoins += batchResult.summary.highConfidenceJoins; potentialIssues += batchResult.summary.potentialIssues; // Clean up memory after each batch memory_manager_1.globalMemoryManager.runCleanup(); } catch (error) { console.warn(`Warning: Batch ${i / batchSize + 1} failed: ${error.message}`); continue; } } // Remove duplicate candidates (same tables and columns) const uniqueCandidates = deduplicateCandidates(allCandidates); const uniqueBusinessRules = deduplicateBusinessRules(allBusinessRules); // Create aggregated dependency graph (simplified for batched processing) const dependencyGraph = { nodes: [], edges: [], cycles: [], depth: 0 }; // Create aggregated integrity report const integrityReport = { validJoins: uniqueCandidates, brokenRelationships: [], orphanedRecords: [], circularDependencies: [], recommendations: [] }; // Create aggregated performance analysis const performance = { overallComplexity: files.length > 20 ? 'HIGH' : files.length > 10 ? 'MEDIUM' : 'LOW', bottlenecks: [], optimizations: [{ category: 'ALGORITHM', description: 'Used batched processing for large file sets', expectedImprovement: '50-80% memory reduction', implementationComplexity: 'LOW' }], scalabilityAssessment: { currentCapacity: { rows: totalRows, sizeGB: totalRows * 0.001, // Rough estimate tables: files.length, complexity: 'BATCHED' }, projectedCapacity: { rows: totalRows * 10, sizeGB: totalRows * 0.01, tables: files.length * 10, complexity: 'HORIZONTAL' }, scalingStrategy: 'HORIZONTAL', recommendations: ['Consider database-based joins for very large datasets', 'Use indexing for frequently joined columns'] } }; // Create final aggregated result return { summary: { tablesAnalyzed: files.length, totalRows, joinCandidatesFound: uniqueCandidates.length, highConfidenceJoins, potentialIssues, analysisTime: totalAnalysisTime }, candidates: uniqueCandidates.sort((a, b) => b.confidence - a.confidence), dependencyGraph, integrityReport, businessRules: uniqueBusinessRules, temporalJoins: [], // Not implemented in batched processing yet recommendations: allRecommendations, performance }; } /** * Remove duplicate join candidates */ function deduplicateCandidates(candidates) { const seen = new Set(); return candidates.filter(candidate => { const key = `${candidate.leftTable.tableName}-${candidate.leftColumn}-${candidate.rightTable.tableName}-${candidate.rightColumn}`; if (seen.has(key)) { return false; } seen.add(key); return true; }); } /** * Remove duplicate business rules */ function deduplicateBusinessRules(rules) { const seen = new Set(); return rules.filter(rule => { const key = `${rule.name}-${rule.tables.sort().join('-')}`; if (seen.has(key)) { return false; } seen.add(key); return true; }); } //# sourceMappingURL=simple-facade.js.map