/*
 * zon-format
 * Version:
 * ZON: The most token-efficient serialization format for LLMs - beats CSV, TOON, JSON, and all competitors
 * 175 lines (174 loc) • 5.96 kB
 * JavaScript
 */
;
/**
* Data Complexity Analyzer for Adaptive Encoding
*
* Analyzes data structures to determine optimal encoding strategies.
*/
// Flag this CommonJS module with `__esModule: true` (presumably so interop-aware
// loaders treat it as transpiled ES-module output — confirm against the build setup).
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declare both exported names as undefined; the real values are assigned
// after the class definition further down in this file.
exports.globalAnalyzer = exports.DataComplexityAnalyzer = void 0;
/**
 * Analyzes data complexity to guide encoding decisions.
 *
 * Walks a JSON-like value, collects structural statistics (nesting depth,
 * object key sets, array sizes) and maps them to one of the encoding
 * recommendations 'inline', 'json', 'table' or 'mixed'.
 */
class DataComplexityAnalyzer {
    /**
     * Analyzes a data structure and returns complexity metrics.
     *
     * @param {*} data - Data to analyze
     * @returns {object} The metrics produced by calculateMetrics merged with
     *   the `recommendation`, `confidence` and `reason` fields produced by
     *   getRecommendation
     */
    analyze(data) {
        const metrics = this.calculateMetrics(data);
        const recommendation = this.getRecommendation(metrics);
        return {
            ...metrics,
            ...recommendation
        };
    }

    /**
     * Calculates complexity metrics for data.
     *
     * @param {*} data - Data to measure
     * @returns {{nesting: number, irregularity: number, fieldCount: number,
     *   arraySize: number, arrayDensity: number, avgFieldsPerObject: number}}
     *   `nesting` is the deepest container level (the root container is 1,
     *   a primitive root yields 0); `fieldCount` is the number of distinct
     *   keys seen across all objects; `arraySize` is the length of the
     *   largest array; `arrayDensity` is arrays / (arrays + objects).
     */
    calculateMetrics(data) {
        const stats = {
            maxNesting: 0,
            allKeys: new Set(),
            keySets: [],
            largestArray: 0,
            arrayCount: 0,
            objectCount: 0,
            fieldCounts: []
        };
        this.traverse(data, 1, stats);
        const irregularity = this.calculateIrregularity(stats.keySets);
        const total = stats.arrayCount + stats.objectCount;
        const arrayDensity = total > 0 ? stats.arrayCount / total : 0;
        const avgFieldsPerObject = stats.fieldCounts.length > 0
            ? stats.fieldCounts.reduce((a, b) => a + b, 0) / stats.fieldCounts.length
            : 0;
        return {
            nesting: stats.maxNesting,
            irregularity,
            fieldCount: stats.allKeys.size,
            arraySize: stats.largestArray,
            arrayDensity,
            avgFieldsPerObject
        };
    }

    /**
     * Traverses data structure to collect statistics.
     *
     * Primitives and null are ignored. Arrays and objects update `stats` in
     * place and are descended into recursively.
     *
     * @param {*} data - Current value
     * @param {number} depth - Nesting level of `data` (the root is 1)
     * @param {object} stats - Mutable accumulator created by calculateMetrics
     * @param {Set<object>} [ancestors] - Containers on the path from the root
     *   to `data`; used only to detect reference cycles
     * @returns {void}
     */
    traverse(data, depth, stats, ancestors = new Set()) {
        if (typeof data !== 'object' || data === null) {
            return;
        }
        // A container that is already on the current path is a reference
        // cycle; descending into it again would recurse until the stack
        // overflows, so the back-reference is skipped.
        if (ancestors.has(data)) {
            return;
        }
        stats.maxNesting = Math.max(stats.maxNesting, depth);
        ancestors.add(data);
        if (Array.isArray(data)) {
            stats.arrayCount++;
            stats.largestArray = Math.max(stats.largestArray, data.length);
            for (const item of data) {
                this.traverse(item, depth + 1, stats, ancestors);
            }
        }
        else {
            stats.objectCount++;
            const keys = new Set(Object.keys(data));
            stats.keySets.push(keys);
            stats.fieldCounts.push(keys.size);
            keys.forEach(k => stats.allKeys.add(k));
            for (const value of Object.values(data)) {
                this.traverse(value, depth + 1, stats, ancestors);
            }
        }
        // Only the current path is tracked, so an object that is referenced
        // several times without forming a cycle is still counted each time.
        ancestors.delete(data);
    }

    /**
     * Computes the Jaccard similarity (|intersection| / |union|) of two key sets.
     *
     * @param {Set<string>} keys1 - First key set
     * @param {Set<string>} keys2 - Second key set
     * @returns {number} Value in [0, 1]; two empty sets are treated as identical (1)
     */
    jaccardSimilarity(keys1, keys2) {
        let shared = 0;
        for (const key of keys1) {
            if (keys2.has(key)) {
                shared++;
            }
        }
        const union = keys1.size + keys2.size - shared;
        return union > 0 ? shared / union : 1;
    }

    /**
     * Calculates schema irregularity score.
     * Higher score = more variation in object shapes.
     *
     * The score is 1 minus the average pairwise Jaccard similarity over all
     * pairs of key sets. Identical key sets are grouped first so the cost
     * depends on the number of distinct shapes rather than on the number of
     * objects; a large uniform table is a single group.
     *
     * @param {Array<Set<string>>} keySets - One key set per object encountered
     * @returns {number} 0 for fewer than two key sets or fully uniform
     *   shapes, up to 1 when no two objects share a key
     */
    calculateIrregularity(keySets) {
        if (keySets.length <= 1) {
            return 0;
        }
        // Group identical shapes. A JSON array of the sorted keys is an
        // unambiguous signature regardless of key order or key contents.
        const shapes = new Map();
        for (const keys of keySets) {
            const signature = JSON.stringify([...keys].sort());
            const shape = shapes.get(signature);
            if (shape) {
                shape.count++;
            }
            else {
                shapes.set(signature, { keys, count: 1 });
            }
        }
        const groups = [...shapes.values()];
        let totalOverlap = 0;
        for (let i = 0; i < groups.length; i++) {
            const first = groups[i];
            // Every pair inside one group has similarity 1.
            totalOverlap += (first.count * (first.count - 1)) / 2;
            for (let j = i + 1; j < groups.length; j++) {
                const second = groups[j];
                // Each cross-group pair contributes the same similarity.
                totalOverlap += first.count * second.count
                    * this.jaccardSimilarity(first.keys, second.keys);
            }
        }
        const comparisons = (keySets.length * (keySets.length - 1)) / 2;
        const avgSimilarity = totalOverlap / comparisons;
        return 1 - avgSimilarity;
    }

    /**
     * Determines encoding recommendation based on metrics.
     *
     * Rules are checked in order and the first match wins: deep nesting,
     * high irregularity, large uniform array, nested array-heavy structure,
     * then a low-confidence 'table' fallback.
     *
     * @param {object} metrics - Result of calculateMetrics
     * @returns {{recommendation: string, confidence: number, reason: string}}
     */
    getRecommendation(metrics) {
        if (metrics.nesting > 4) {
            return {
                recommendation: 'inline',
                confidence: 0.9,
                reason: `Deep nesting (${metrics.nesting} levels) favors inline format for readability`
            };
        }
        if (metrics.irregularity > 0.7) {
            return {
                recommendation: 'json',
                confidence: 0.85,
                reason: `High irregularity (${(metrics.irregularity * 100).toFixed(0)}%) makes table format inefficient`
            };
        }
        if (metrics.arraySize >= 3 && metrics.irregularity < 0.3) {
            return {
                recommendation: 'table',
                confidence: 0.95,
                reason: `Large uniform array (${metrics.arraySize} items, ${(metrics.irregularity * 100).toFixed(0)}% irregularity) is ideal for table format`
            };
        }
        if (metrics.nesting > 2 && metrics.arrayDensity > 0.3) {
            return {
                recommendation: 'mixed',
                confidence: 0.7,
                reason: 'Mixed structure with nested arrays benefits from hybrid approach'
            };
        }
        return {
            recommendation: 'table',
            confidence: 0.6,
            reason: 'Standard structure suitable for table format'
        };
    }

    /**
     * Checks if data is suitable for table encoding.
     *
     * @param {*} data - Data to analyze
     * @returns {boolean} True only when the recommendation is 'table' with
     *   confidence above 0.7, which excludes the 0.6 fallback recommendation
     */
    isSuitableForTable(data) {
        const analysis = this.analyze(data);
        return analysis.recommendation === 'table' && analysis.confidence > 0.7;
    }

    /**
     * Gets optimal complexity threshold for mode selection.
     *
     * The value is not consumed inside this class; callers decide how to
     * apply it.
     *
     * @param {string} mode - 'aggressive', 'conservative' or 'balanced'
     * @returns {number} 0.8, 0.4 or 0.6 respectively; any other mode yields 0.6
     */
    getComplexityThreshold(mode) {
        switch (mode) {
            case 'aggressive':
                return 0.8; // Only switch away from table for very irregular data
            case 'conservative':
                return 0.4; // More readily use inline/json formats
            case 'balanced':
            default:
                return 0.6;
        }
    }
}
// Publish the class so consumers can create their own analyzer instances.
exports.DataComplexityAnalyzer = DataComplexityAnalyzer;
/**
 * Global analyzer instance.
 *
 * A single DataComplexityAnalyzer constructed once when this module is loaded.
 */
exports.globalAnalyzer = new DataComplexityAnalyzer();