datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
180 lines • 6.67 kB
JavaScript
;
/**
* Simple Facade for Join Analysis - Phase 1 Only
* Provides easy-to-use interface without overengineering
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.createJoinAnalyzer = createJoinAnalyzer;
const join_analyzer_1 = require("./join-analyzer");
const join_formatter_1 = require("./join-formatter");
const memory_manager_1 = require("../../utils/memory-manager");
/**
* Create a simple join analyzer with sensible defaults
*/
function createJoinAnalyzer(config = {}) {
const analyzer = new join_analyzer_1.JoinAnalyzer({
maxTables: config.maxTables ?? 50, // Increased from 5 to 50 for scalability
confidenceThreshold: config.confidenceThreshold ?? 0.7,
enableFuzzyMatching: config.enableFuzzyMatching ?? true,
enableSemanticAnalysis: config.enableSemanticAnalysis ?? true,
enableTemporalJoins: false, // Phase 2 feature
performanceMode: 'BALANCED',
outputFormats: [{ type: 'MARKDOWN' }]
});
const formatter = new join_formatter_1.JoinFormatter();
return {
analyzer,
formatter,
// Scalable analyze method with batched processing
async analyze(files) {
if (files.length < 2) {
throw new Error('Join analysis requires at least 2 files');
}
// If files exceed the internal limit, process in batches
if (files.length > 10) {
return await processBatchedAnalysis(files, analyzer);
}
return await analyzer.analyzeJoins(files);
},
// Simple format method
format(result, format = 'markdown') {
const outputFormat = {
type: format.toUpperCase()
};
return formatter.format(result, outputFormat);
}
};
}
/**
* Process files in batches to handle large datasets
*/
async function processBatchedAnalysis(files, analyzer) {
const batchSize = 8; // Process files in batches of 8
const allCandidates = [];
const allBusinessRules = [];
const allRecommendations = [];
let totalAnalysisTime = 0;
let totalRows = 0;
let highConfidenceJoins = 0;
let potentialIssues = 0;
// Process files in batches
for (let i = 0; i < files.length; i += batchSize) {
const batch = files.slice(i, i + batchSize);
// Ensure we have at least 2 files in each batch for analysis
if (batch.length < 2) {
// For the last batch, include one file from the previous batch
if (i > 0) {
batch.unshift(files[i - 1]);
}
}
try {
const batchResult = await analyzer.analyzeJoins(batch);
// Aggregate results
allCandidates.push(...batchResult.candidates);
allBusinessRules.push(...batchResult.businessRules);
allRecommendations.push(...batchResult.recommendations);
totalAnalysisTime += batchResult.summary.analysisTime;
totalRows += batchResult.summary.totalRows;
highConfidenceJoins += batchResult.summary.highConfidenceJoins;
potentialIssues += batchResult.summary.potentialIssues;
// Clean up memory after each batch
memory_manager_1.globalMemoryManager.runCleanup();
}
catch (error) {
console.warn(`Warning: Batch ${i / batchSize + 1} failed: ${error.message}`);
continue;
}
}
// Remove duplicate candidates (same tables and columns)
const uniqueCandidates = deduplicateCandidates(allCandidates);
const uniqueBusinessRules = deduplicateBusinessRules(allBusinessRules);
// Create aggregated dependency graph (simplified for batched processing)
const dependencyGraph = {
nodes: [],
edges: [],
cycles: [],
depth: 0
};
// Create aggregated integrity report
const integrityReport = {
validJoins: uniqueCandidates,
brokenRelationships: [],
orphanedRecords: [],
circularDependencies: [],
recommendations: []
};
// Create aggregated performance analysis
const performance = {
overallComplexity: files.length > 20 ? 'HIGH' : files.length > 10 ? 'MEDIUM' : 'LOW',
bottlenecks: [],
optimizations: [{
category: 'ALGORITHM',
description: 'Used batched processing for large file sets',
expectedImprovement: '50-80% memory reduction',
implementationComplexity: 'LOW'
}],
scalabilityAssessment: {
currentCapacity: {
rows: totalRows,
sizeGB: totalRows * 0.001, // Rough estimate
tables: files.length,
complexity: 'BATCHED'
},
projectedCapacity: {
rows: totalRows * 10,
sizeGB: totalRows * 0.01,
tables: files.length * 10,
complexity: 'HORIZONTAL'
},
scalingStrategy: 'HORIZONTAL',
recommendations: ['Consider database-based joins for very large datasets', 'Use indexing for frequently joined columns']
}
};
// Create final aggregated result
return {
summary: {
tablesAnalyzed: files.length,
totalRows,
joinCandidatesFound: uniqueCandidates.length,
highConfidenceJoins,
potentialIssues,
analysisTime: totalAnalysisTime
},
candidates: uniqueCandidates.sort((a, b) => b.confidence - a.confidence),
dependencyGraph,
integrityReport,
businessRules: uniqueBusinessRules,
temporalJoins: [], // Not implemented in batched processing yet
recommendations: allRecommendations,
performance
};
}
/**
* Remove duplicate join candidates
*/
function deduplicateCandidates(candidates) {
const seen = new Set();
return candidates.filter(candidate => {
const key = `${candidate.leftTable.tableName}-${candidate.leftColumn}-${candidate.rightTable.tableName}-${candidate.rightColumn}`;
if (seen.has(key)) {
return false;
}
seen.add(key);
return true;
});
}
/**
* Remove duplicate business rules
*/
function deduplicateBusinessRules(rules) {
const seen = new Set();
return rules.filter(rule => {
const key = `${rule.name}-${rule.tables.sort().join('-')}`;
if (seen.has(key)) {
return false;
}
seen.add(key);
return true;
});
}
//# sourceMappingURL=simple-facade.js.map