datapilot-cli
Version:
Enterprise-grade streaming multi-format data analysis with comprehensive statistical insights and intelligent relationship detection - supports CSV, JSON, Excel, TSV, Parquet - memory-efficient, cross-platform
508 lines • 22.7 kB
JavaScript
;
/**
* Intelligent Chunking Engine
* Advanced chunk size adaptation based on data characteristics and system performance
*/
// CommonJS export wiring. Flags the module object with `__esModule`.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declares the class export slot; the real value is assigned after the class definition.
exports.IntelligentChunker = void 0;
// Function declarations are hoisted, so they can be exported before their definitions appear.
exports.getGlobalIntelligentChunker = getGlobalIntelligentChunker;
exports.shutdownGlobalIntelligentChunker = shutdownGlobalIntelligentChunker;
const events_1 = require("events");
const fs_1 = require("fs");
const memory_optimizer_1 = require("./memory-optimizer");
const logger_1 = require("../utils/logger");
/**
 * Intelligent chunk size adaptation.
 *
 * Picks a chunk size from a heuristic data-complexity score, coarse system
 * metrics, the memory optimizer's recommendation and a similarity-weighted
 * history of previously recorded runs.
 *
 * Events:
 * - `chunk-decision`: emitted with every decision made by calculateOptimalChunkSize().
 * - `performance-recorded`: emitted with every sample stored by recordPerformance().
 */
class IntelligentChunker extends events_1.EventEmitter {
    /** Resolved options: caller overrides merged over the defaults. */
    options;
    /** Recorded samples, oldest first, capped at options.maxLearningHistory. */
    performanceHistory = [];
    /** Latest decisions, oldest first, capped at options.performanceWindow. */
    recentDecisions = [];
    /** Named tuning factors; currently only reported through getLearningStats(). */
    learningModel = new Map();
    /** System metrics captured once at construction time. */
    systemBaseline;
    /**
     * @param {object} [options] Partial overrides for the defaults below.
     */
    constructor(options = {}) {
        super();
        this.options = {
            baseChunkSize: options.baseChunkSize || 64 * 1024, // 64KB
            minChunkSize: options.minChunkSize || 4 * 1024, // 4KB
            maxChunkSize: options.maxChunkSize || 16 * 1024 * 1024, // 16MB
            // `??` so an explicit sensitivity of 0 is honoured instead of being replaced.
            adaptationSensitivity: options.adaptationSensitivity ?? 0.2,
            performanceWindow: options.performanceWindow || 10,
            complexityWeights: {
                dataTypeComplexity: 0.25,
                encodingComplexity: 0.15,
                structuralComplexity: 0.20,
                contentComplexity: 0.15,
                memoryPressure: 0.15,
                ioPerformance: 0.10,
                ...options.complexityWeights
            },
            enableLearning: options.enableLearning ?? true,
            maxLearningHistory: options.maxLearningHistory || 1000
        };
        this.systemBaseline = this.getCurrentSystemMetrics();
        this.initializeLearningModel();
        logger_1.logger.info(`Intelligent chunker initialized with learning ${this.options.enableLearning ? 'enabled' : 'disabled'}`);
    }
    /**
     * Analyze data characteristics from a sample read at the start of a file.
     *
     * The sample is treated as comma-separated UTF-8 text; all figures are
     * heuristics derived from at most `sampleSize` bytes.
     *
     * @param {string} filePath File to sample.
     * @param {number} [sampleSize] Maximum number of bytes to read.
     * @returns {Promise<object>} fileSize, estimatedRows, averageLineLength, encoding,
     *   hasQuotedFields, hasEscapedFields, columnCount, dataTypes, nullDensity, compressionRatio.
     * @throws Propagates stat/open/read errors from the file system.
     */
    async analyzeDataCharacteristics(filePath, sampleSize = 64 * 1024) {
        const fileStats = await fs_1.promises.stat(filePath);
        const actualSampleSize = Math.min(sampleSize, fileStats.size);
        // Allocate before opening so a failed allocation cannot leak the file handle.
        const buffer = Buffer.alloc(actualSampleSize);
        const fileHandle = await fs_1.promises.open(filePath, 'r');
        try {
            const bytesRead = actualSampleSize > 0
                ? (await fileHandle.read(buffer, 0, actualSampleSize, 0)).bytesRead
                : 0;
            // A read may return fewer bytes than requested; ignore the unused, zero-filled tail.
            const sample = buffer.subarray(0, bytesRead);
            const sampleText = sample.toString('utf8');
            // Analyze structure
            const lines = sampleText.split(/\r?\n/);
            const nonEmptyLines = lines.filter(line => line.trim().length > 0);
            const totalLineLength = nonEmptyLines.reduce((sum, line) => sum + line.length, 0);
            // Guard the divisions: an empty sample must yield 0, not NaN/Infinity.
            const averageLineLength = nonEmptyLines.length > 0 ? totalLineLength / nonEmptyLines.length : 0;
            const estimatedRows = averageLineLength > 0 ? Math.floor(fileStats.size / averageLineLength) : 0;
            // Detect encoding complexity
            const encoding = this.detectEncoding(sample);
            // Analyze content complexity
            const hasQuotedFields = /["']/.test(sampleText);
            const hasEscapedFields = /\\["'\\]/.test(sampleText);
            const firstLine = nonEmptyLines[0] || '';
            const columnCount = this.estimateColumnCount(firstLine);
            // Estimate data types
            const dataTypes = this.estimateDataTypes(nonEmptyLines.slice(0, 10));
            // Calculate null density over the first 100 lines.
            // No `g` flag: a global regex keeps `lastIndex` between `.test()` calls and
            // would miss every second consecutive null token.
            const nullPattern = /^$|^null$|^na$|^n\/a$/i;
            let totalFields = 0;
            let nullFields = 0;
            for (const line of nonEmptyLines.slice(0, 100)) {
                const fields = line.split(',');
                totalFields += fields.length;
                nullFields += fields.filter(field => nullPattern.test(field.trim())).length;
            }
            const nullDensity = totalFields > 0 ? nullFields / totalFields : 0;
            // Estimate compression ratio (simple heuristic): distinct characters relative
            // to 256, capped at 1 so text with many distinct code points stays in [0, 1].
            const uniqueChars = new Set(sampleText).size;
            const compressionRatio = Math.min(1, uniqueChars / 256);
            return {
                fileSize: fileStats.size,
                estimatedRows,
                averageLineLength,
                encoding,
                hasQuotedFields,
                hasEscapedFields,
                columnCount,
                dataTypes,
                nullDensity,
                compressionRatio
            };
        }
        finally {
            await fileHandle.close();
        }
    }
    /**
     * Calculate optimal chunk size based on data characteristics and system state.
     *
     * Applies, in order: data-complexity factor, system-performance factor, the
     * memory optimizer's recommendation, the learning adjustment (only with more
     * than 10 history samples) and finally the min/max bounds.
     *
     * @param {object} dataCharacteristics Result shaped like analyzeDataCharacteristics().
     * @param {object} [currentSystemMetrics] Defaults to getCurrentSystemMetrics().
     * @returns {object} Decision with chunkSize, reasoning, confidence,
     *   adaptationFactors and expectedPerformance.
     */
    calculateOptimalChunkSize(dataCharacteristics, currentSystemMetrics) {
        const systemMetrics = currentSystemMetrics || this.getCurrentSystemMetrics();
        const reasoning = [];
        // Base chunk size
        let chunkSize = this.options.baseChunkSize;
        reasoning.push(`Starting with base chunk size: ${this.formatBytes(chunkSize)}`);
        // Data complexity analysis: more complex data gets smaller chunks.
        const dataComplexity = this.calculateDataComplexity(dataCharacteristics);
        const dataComplexityFactor = Math.max(0.3, Math.min(2.0, 1 / dataComplexity));
        chunkSize *= dataComplexityFactor;
        reasoning.push(`Data complexity factor: ${dataComplexityFactor.toFixed(2)} (complexity: ${dataComplexity.toFixed(2)})`);
        // System performance analysis
        const systemPerformanceFactor = this.calculateSystemPerformanceFactor(systemMetrics);
        chunkSize *= systemPerformanceFactor;
        reasoning.push(`System performance factor: ${systemPerformanceFactor.toFixed(2)}`);
        // Memory constraint analysis
        const memoryOptimizer = (0, memory_optimizer_1.getGlobalMemoryOptimizer)();
        const memoryRecommendation = memoryOptimizer.getAdaptiveChunkSize(chunkSize, dataComplexity);
        const memoryConstraintFactor = memoryRecommendation.recommendedSize / chunkSize;
        chunkSize = memoryRecommendation.recommendedSize;
        reasoning.push(`Memory constraint factor: ${memoryConstraintFactor.toFixed(2)} (${memoryRecommendation.reason})`);
        // Learning-based adjustment
        let learningAdjustment = 1.0;
        if (this.options.enableLearning && this.performanceHistory.length > 10) {
            learningAdjustment = this.calculateLearningAdjustment(dataCharacteristics, systemMetrics, chunkSize);
            chunkSize *= learningAdjustment;
            reasoning.push(`Learning adjustment: ${learningAdjustment.toFixed(2)} (from ${this.performanceHistory.length} historical samples)`);
        }
        // Apply bounds
        const originalChunkSize = chunkSize;
        chunkSize = Math.max(this.options.minChunkSize, Math.min(this.options.maxChunkSize, chunkSize));
        if (chunkSize !== originalChunkSize) {
            reasoning.push(`Applied bounds: ${this.formatBytes(originalChunkSize)} → ${this.formatBytes(chunkSize)}`);
        }
        // Calculate confidence based on data quality and learning history
        const confidence = this.calculateConfidence(dataCharacteristics, systemMetrics);
        // Predict expected performance
        const expectedPerformance = this.predictPerformance(chunkSize, dataCharacteristics, systemMetrics);
        const decision = {
            chunkSize: Math.round(chunkSize),
            reasoning,
            confidence,
            adaptationFactors: {
                dataComplexity: dataComplexityFactor,
                systemPerformance: systemPerformanceFactor,
                memoryConstraint: memoryConstraintFactor,
                learningAdjustment
            },
            expectedPerformance
        };
        // Store decision for learning, keeping only the configured window.
        this.recentDecisions.push(decision);
        if (this.recentDecisions.length > this.options.performanceWindow) {
            this.recentDecisions.shift();
        }
        this.emit('chunk-decision', decision);
        return decision;
    }
    /**
     * Record actual performance for learning. No-op when learning is disabled.
     *
     * @param {number} chunkSize Chunk size that was actually used.
     * @param {object} dataCharacteristics Characteristics of the processed data.
     * @param {object} actualPerformance Observed processingTime, memoryUsage,
     *   throughput and errorCount.
     */
    recordPerformance(chunkSize, dataCharacteristics, actualPerformance) {
        if (!this.options.enableLearning)
            return;
        const systemMetrics = this.getCurrentSystemMetrics();
        // Match against the NEWEST decision whose chunk size is within 10%: that is the
        // decision most likely to have produced the run being reported.
        const tolerance = chunkSize * 0.1;
        let matchingDecision;
        for (let index = this.recentDecisions.length - 1; index >= 0; index--) {
            const candidate = this.recentDecisions[index];
            if (Math.abs(candidate.chunkSize - chunkSize) < tolerance) {
                matchingDecision = candidate;
                break;
            }
        }
        const satisfaction = matchingDecision
            ? this.calculateSatisfaction(matchingDecision.expectedPerformance, actualPerformance)
            : 0.5; // Neutral score if no matching decision found
        const learningData = {
            chunkSize,
            dataCharacteristics,
            systemMetrics,
            actualPerformance,
            satisfaction
        };
        this.performanceHistory.push(learningData);
        // Maintain history size
        if (this.performanceHistory.length > this.options.maxLearningHistory) {
            this.performanceHistory.shift();
        }
        // Update learning model
        this.updateLearningModel(learningData);
        this.emit('performance-recorded', learningData);
    }
    /**
     * Calculate data complexity score: a weighted sum of the type, encoding,
     * structural and content sub-scores, clamped to [0.1, 5.0].
     */
    calculateDataComplexity(data) {
        const weights = this.options.complexityWeights;
        const weightedScores = [
            this.calculateTypeComplexity(data.dataTypes) * weights.dataTypeComplexity,
            this.calculateEncodingComplexity(data.encoding) * weights.encodingComplexity,
            this.calculateStructuralComplexity(data) * weights.structuralComplexity,
            this.calculateContentComplexity(data) * weights.contentComplexity
        ];
        const complexity = weightedScores.reduce((sum, score) => sum + score, 0);
        return Math.max(0.1, Math.min(5.0, complexity));
    }
    /**
     * Calculate system performance factor.
     * Higher factor for better performance, lower for worse; each sub-factor is
     * floored at 0.3 and the I/O ones are capped at 2.0.
     */
    calculateSystemPerformanceFactor(metrics) {
        const memoryFactor = Math.max(0.3, 1 - metrics.memoryPressure);
        const ioFactor = Math.max(0.3, Math.min(2.0, metrics.throughput / 50)); // Normalized to 50 MB/s baseline
        const latencyFactor = Math.max(0.3, Math.min(2.0, 100 / Math.max(1, metrics.ioLatency))); // Normalized to 100ms baseline
        return memoryFactor * ioFactor * latencyFactor;
    }
    /**
     * Calculate learning-based adjustment from similar historical cases.
     *
     * @returns {number} Multiplier for the proposed chunk size; 1.0 when no usable
     *   history exists (no similar cases, zero total weight, non-positive proposal).
     */
    calculateLearningAdjustment(dataCharacteristics, systemMetrics, proposedChunkSize) {
        if (!(proposedChunkSize > 0))
            return 1.0;
        const similarCases = this.findSimilarCases(dataCharacteristics, systemMetrics);
        if (similarCases.length === 0)
            return 1.0;
        const totalWeight = similarCases.reduce((sum, case_) => sum + case_.weight, 0);
        // All-zero weights (e.g. every similar run had satisfaction 0) carry no signal;
        // dividing by them would produce NaN.
        if (!(totalWeight > 0))
            return 1.0;
        // NOTE(review): satisfaction is applied here in addition to being part of the
        // weight; kept as-is, confirm this double weighting is intended.
        const weightedAdjustment = similarCases.reduce((sum, case_) => {
            const adjustment = case_.data.chunkSize / proposedChunkSize;
            return sum + (adjustment * case_.weight * case_.data.satisfaction);
        }, 0) / totalWeight;
        // Apply dampening to prevent wild swings
        const dampening = 0.3;
        return 1.0 + (weightedAdjustment - 1.0) * dampening;
    }
    /**
     * Find similar historical cases for learning.
     *
     * @returns {Array<object>} Up to `maxCases` entries `{ data, similarity, weight }`
     *   with similarity above 0.3, sorted by descending weight.
     */
    findSimilarCases(dataCharacteristics, systemMetrics, maxCases = 10) {
        const similarities = this.performanceHistory.map(historyData => {
            const dataSimilarity = this.calculateDataSimilarity(dataCharacteristics, historyData.dataCharacteristics);
            const systemSimilarity = this.calculateSystemSimilarity(systemMetrics, historyData.systemMetrics);
            const overallSimilarity = (dataSimilarity + systemSimilarity) / 2;
            return {
                data: historyData,
                similarity: overallSimilarity,
                weight: overallSimilarity * historyData.satisfaction
            };
        });
        return similarities
            .filter(item => item.similarity > 0.3) // Minimum similarity threshold
            .sort((a, b) => b.weight - a.weight)
            .slice(0, maxCases);
    }
    /**
     * Calculate data similarity between two data characteristics: the mean of the
     * file-size, row-count, column-count and type-set similarities.
     */
    calculateDataSimilarity(data1, data2) {
        // Zero, negative or non-finite sizes are treated as magnitude 0 so that empty
        // files compare as equal instead of producing NaN/Infinity.
        const magnitude = (value) => (Number.isFinite(value) && value > 0 ? Math.log10(value) : 0);
        const fileSizeSimilarity = 1 - Math.abs(magnitude(data1.fileSize) - magnitude(data2.fileSize)) / 3; // 3 orders of magnitude
        const rowSimilarity = 1 - Math.abs(magnitude(data1.estimatedRows) - magnitude(data2.estimatedRows)) / 3;
        const widestColumnCount = Math.max(data1.columnCount, data2.columnCount);
        const columnSimilarity = widestColumnCount > 0
            ? 1 - Math.abs(data1.columnCount - data2.columnCount) / widestColumnCount
            : 1;
        const typeSimilarity = this.calculateTypeSetSimilarity(data1.dataTypes, data2.dataTypes);
        return (fileSizeSimilarity + rowSimilarity + columnSimilarity + typeSimilarity) / 4;
    }
    /**
     * Calculate system similarity between two system metrics: the mean of the
     * memory-pressure, throughput and latency similarities.
     */
    calculateSystemSimilarity(metrics1, metrics2) {
        // Relative closeness of two values; two zeros count as identical (avoids 0/0).
        const relativeSimilarity = (a, b) => {
            const largest = Math.max(a, b);
            return largest > 0 ? 1 - Math.abs(a - b) / largest : 1;
        };
        const memoryPressureSimilarity = 1 - Math.abs(metrics1.memoryPressure - metrics2.memoryPressure);
        const throughputSimilarity = relativeSimilarity(metrics1.throughput, metrics2.throughput);
        const latencySimilarity = relativeSimilarity(metrics1.ioLatency, metrics2.ioLatency);
        return (memoryPressureSimilarity + throughputSimilarity + latencySimilarity) / 3;
    }
    /**
     * Calculate satisfaction score from expected vs actual performance.
     * Each ratio is capped at 2; every reported error removes 10% (floored at 0).
     */
    calculateSatisfaction(expected, actual) {
        const timeRatio = Math.min(2, expected.processingTime / Math.max(0.1, actual.processingTime));
        const memoryRatio = Math.min(2, expected.memoryUsage / Math.max(0.1, actual.memoryUsage));
        const throughputRatio = Math.min(2, actual.throughput / Math.max(0.1, expected.throughput));
        // A missing error count means "no errors" rather than poisoning the score with NaN.
        const errorCount = Number.isFinite(actual.errorCount) ? actual.errorCount : 0;
        const errorPenalty = Math.max(0, 1 - errorCount * 0.1);
        return (timeRatio + memoryRatio + throughputRatio) / 3 * errorPenalty;
    }
    /**
     * Average per-type complexity; unknown types count as 1.0.
     * Returns the neutral 1.0 when no types are known.
     */
    calculateTypeComplexity(dataTypes) {
        if (!dataTypes || dataTypes.length === 0)
            return 1.0;
        const complexityMap = {
            'string': 1.0,
            'integer': 0.8,
            'number': 0.9,
            'boolean': 0.6,
            'date': 1.2,
            'time': 1.3,
            'datetime': 1.4,
            'json': 2.0,
            'xml': 2.2,
            'binary': 1.8
        };
        const total = dataTypes.reduce((sum, type) => sum + (complexityMap[type] || 1.0), 0);
        return total / dataTypes.length;
    }
    /** Complexity score for an encoding name (case-insensitive); unknown names count as 1.0. */
    calculateEncodingComplexity(encoding) {
        const complexityMap = {
            'ascii': 0.5,
            'utf8': 1.0,
            'utf16': 1.3,
            'latin1': 0.8,
            'binary': 1.5
        };
        return complexityMap[encoding.toLowerCase()] || 1.0;
    }
    /** Structural score: 1.0 plus penalties for quoting, escaping, null density and wide rows. */
    calculateStructuralComplexity(data) {
        let complexity = 1.0;
        if (data.hasQuotedFields)
            complexity += 0.3;
        if (data.hasEscapedFields)
            complexity += 0.4;
        if (data.nullDensity > 0.1)
            complexity += data.nullDensity * 0.5;
        if (data.columnCount > 50)
            complexity += Math.log10(data.columnCount / 50) * 0.3;
        return complexity;
    }
    /** Content score: 1.0 plus penalties derived from compressionRatio and line length. */
    calculateContentComplexity(data) {
        let complexity = 1.0;
        complexity += (1 - data.compressionRatio) * 0.5; // Less compressible = more complex
        complexity += Math.min(1.0, data.averageLineLength / 1000) * 0.3; // Longer lines = more complex
        return complexity;
    }
    /**
     * Snapshot of system metrics. Only memoryPressure is measured (via the global
     * memory optimizer); the remaining values are fixed placeholders.
     */
    getCurrentSystemMetrics() {
        const memoryOptimizer = (0, memory_optimizer_1.getGlobalMemoryOptimizer)();
        const memoryPressure = memoryOptimizer.getMemoryPressure();
        return {
            memoryPressure,
            cpuUsage: 0.5, // Placeholder - would need actual CPU monitoring
            ioLatency: 10, // Placeholder - would need actual I/O monitoring
            throughput: 50, // Placeholder - would need actual throughput monitoring
            errorRate: 0.0
        };
    }
    /** Simple encoding detection: 'utf8' if the decoded text has any non-ASCII character, else 'ascii'. */
    detectEncoding(buffer) {
        const text = buffer.toString('utf8');
        const hasNonAscii = /[^\x00-\x7F]/.test(text);
        return hasNonAscii ? 'utf8' : 'ascii';
    }
    /** Simple CSV column estimation: naive comma split (quoted commas are not handled). */
    estimateColumnCount(line) {
        return line.split(',').length;
    }
    /**
     * Estimate one type per comma-separated field.
     *
     * Uses the first non-blank line that does not start with '#'. When every field
     * of that line looks like a string and another candidate line follows, the first
     * line is assumed to be a header row and the following line is used instead.
     *
     * @param {string[]} lines Sample lines.
     * @returns {string[]} 'integer' | 'number' | 'boolean' | 'date' | 'json' | 'string' per field.
     */
    estimateDataTypes(lines) {
        const candidateLines = lines.filter(line => line.trim() && !line.startsWith('#'));
        if (candidateLines.length === 0)
            return [];
        const inferFieldType = (field) => {
            const trimmed = field.trim().replace(/['"]/g, '');
            // Optional sign so negative values are not misclassified as strings.
            if (/^[+-]?\d+$/.test(trimmed))
                return 'integer';
            if (/^[+-]?\d*\.\d+$/.test(trimmed))
                return 'number';
            if (/^(true|false)$/i.test(trimmed))
                return 'boolean';
            if (/^\d{4}-\d{2}-\d{2}/.test(trimmed))
                return 'date';
            if (/^[{[]/.test(trimmed))
                return 'json';
            return 'string';
        };
        const inferLineTypes = (line) => line.split(',').map(inferFieldType);
        const firstLineTypes = inferLineTypes(candidateLines[0]);
        const looksLikeHeader = firstLineTypes.every(type => type === 'string');
        if (looksLikeHeader && candidateLines.length > 1) {
            return inferLineTypes(candidateLines[1]);
        }
        return firstLineTypes;
    }
    /** Jaccard similarity of two type lists treated as sets; two empty lists count as identical. */
    calculateTypeSetSimilarity(types1, types2) {
        const set1 = new Set(types1);
        const set2 = new Set(types2);
        const intersection = new Set([...set1].filter(x => set2.has(x)));
        const union = new Set([...set1, ...set2]);
        return union.size > 0 ? intersection.size / union.size : 1;
    }
    /** Confidence in [0.1, 1.0] derived from history size, system metrics and data oddities. */
    calculateConfidence(data, metrics) {
        let confidence = 0.5; // Base confidence
        // Higher confidence for more data
        if (this.performanceHistory.length > 10)
            confidence += 0.2;
        if (this.performanceHistory.length > 50)
            confidence += 0.1;
        // Higher confidence for stable system metrics
        if (metrics.memoryPressure < 0.6)
            confidence += 0.1;
        if (metrics.errorRate < 0.01)
            confidence += 0.1;
        // Lower confidence for unusual data characteristics
        if (data.nullDensity > 0.3)
            confidence -= 0.1;
        if (data.columnCount > 100)
            confidence -= 0.1;
        return Math.max(0.1, Math.min(1.0, confidence));
    }
    /**
     * Simple performance prediction model.
     *
     * @returns {{processingTime: number, memoryUsage: number, throughput: number}}
     */
    predictPerformance(chunkSize, data, metrics) {
        const baseProcessingTimePerByte = 0.000001; // 1 microsecond per byte
        const complexity = this.calculateDataComplexity(data);
        const processingTime = chunkSize * baseProcessingTimePerByte * complexity / Math.max(0.1, 1 - metrics.memoryPressure);
        const memoryUsage = chunkSize * 1.5; // Assume 50% overhead
        // A zero processing time (zero-sized chunk) would otherwise yield NaN.
        const throughput = processingTime > 0 ? chunkSize / processingTime / 1024 / 1024 : 0; // MB/s
        return {
            processingTime,
            memoryUsage,
            throughput
        };
    }
    /** Seed the learning model with its initial factors. */
    initializeLearningModel() {
        this.learningModel.set('small_file_factor', 1.5);
        this.learningModel.set('large_file_factor', 0.8);
        this.learningModel.set('complex_data_factor', 0.7);
        this.learningModel.set('simple_data_factor', 1.2);
    }
    /**
     * Simple learning model update: exponential moving average (90% old, 10% new)
     * of `satisfaction * 2`. Only the small-file (< 1MB) and complex-data (> 2.0)
     * factors are updated here.
     */
    updateLearningModel(learningData) {
        const dataSize = learningData.dataCharacteristics.fileSize;
        const complexity = this.calculateDataComplexity(learningData.dataCharacteristics);
        const blend = (key) => {
            const currentFactor = this.learningModel.get(key) || 1.0;
            this.learningModel.set(key, currentFactor * 0.9 + (learningData.satisfaction * 2) * 0.1);
        };
        if (dataSize < 1024 * 1024) { // Small file
            blend('small_file_factor');
        }
        if (complexity > 2.0) { // Complex data
            blend('complex_data_factor');
        }
    }
    /** Format a byte count using B/KB/MB/GB (one decimal for KB and above). */
    formatBytes(bytes) {
        const units = ['B', 'KB', 'MB', 'GB'];
        let size = bytes;
        let unitIndex = 0;
        while (size >= 1024 && unitIndex < units.length - 1) {
            size /= 1024;
            unitIndex++;
        }
        return `${size.toFixed(unitIndex > 0 ? 1 : 0)}${units[unitIndex]}`;
    }
    /**
     * Get learning statistics
     */
    getLearningStats() {
        const historySize = this.performanceHistory.length;
        const satisfactionTotal = this.performanceHistory.reduce((sum, data) => sum + data.satisfaction, 0);
        return {
            historySize,
            averageSatisfaction: historySize > 0 ? satisfactionTotal / historySize : 0,
            recentDecisions: this.recentDecisions.length,
            learningModel: Object.fromEntries(this.learningModel),
            confidenceDistribution: this.recentDecisions.map(d => d.confidence)
        };
    }
    /**
     * Reset learning data: clears history and decisions and re-seeds the model.
     */
    resetLearning() {
        this.performanceHistory = [];
        this.recentDecisions = [];
        this.initializeLearningModel();
        logger_1.logger.info('Intelligent chunker learning data reset');
    }
    /**
     * Export learning data for analysis (shallow copy of the history array).
     */
    exportLearningData() {
        return [...this.performanceHistory];
    }
    /**
     * Import learning data, keeping only the newest `maxLearningHistory` samples.
     *
     * @param {Array<object>} data Samples as returned by exportLearningData().
     * @throws {TypeError} When `data` is not an array.
     */
    importLearningData(data) {
        if (!Array.isArray(data)) {
            throw new TypeError('importLearningData expects an array of learning samples');
        }
        this.performanceHistory = data.slice(-this.options.maxLearningHistory);
        logger_1.logger.info(`Imported ${this.performanceHistory.length} learning samples`);
    }
}
exports.IntelligentChunker = IntelligentChunker;
/**
 * Global intelligent chunker instance.
 * Starts as null; read and written only by the two accessor functions that follow.
 */
let globalIntelligentChunker = null;
/**
 * Return the shared chunker, constructing it on first use.
 *
 * `options` is forwarded to the constructor only when the shared instance is
 * created; once one exists it is returned as-is and `options` is ignored.
 *
 * @param {object} [options] Constructor options for the first call.
 * @returns {IntelligentChunker} The shared instance.
 */
function getGlobalIntelligentChunker(options) {
    if (globalIntelligentChunker) {
        return globalIntelligentChunker;
    }
    const created = new IntelligentChunker(options);
    globalIntelligentChunker = created;
    return created;
}
/**
 * Drop the shared chunker so the next getGlobalIntelligentChunker() call
 * builds a fresh one. Does nothing when no shared instance exists.
 */
function shutdownGlobalIntelligentChunker() {
    if (!globalIntelligentChunker) {
        return;
    }
    globalIntelligentChunker = null;
}
//# sourceMappingURL=intelligent-chunker.js.map