// zon-format
// Version:
// ZON: The most token-efficient serialization format for LLMs - beats CSV, TOON, JSON, and all competitors
// 717 lines (716 loc) • 26.1 kB
// JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.ZonEncoder = void 0;
exports.encode = encode;
exports.encodeLLM = encodeLLM;
const constants_1 = require("./constants");
const utils_1 = require("./utils");
const type_inference_1 = require("../schema/type-inference");
const versioning_1 = require("./versioning");
/**
 * Encodes data structures into ZON format v1.3.0.
 *
 * Arrays whose elements are all plain objects are written as tables (standard,
 * sparse, or dictionary-compressed); everything else is written either as
 * `key<sep>value` metadata lines or as an inline ZON node (`{...}` / `[...]`).
 */
class ZonEncoder {
    /**
     * @param anchorInterval - Stored as `anchor_interval`; no method of this class reads it
     * @param enableDictCompression - Allow dictionary-compressed tables for repetitive string columns
     * @param enableTypeCoercion - Run column type inference/coercion and emit booleans as `true`/`false`
     * @param disableTables - Skip table generation and always emit inline ZON nodes
     */
    constructor(anchorInterval = constants_1.DEFAULT_ANCHOR_INTERVAL, enableDictCompression = true, enableTypeCoercion = false, disableTables = false) {
        this.anchor_interval = anchorInterval;
        this.safe_str_re = /^[a-zA-Z0-9_\-\.]+$/;
        this.enableDictionaryCompression = enableDictCompression;
        this.enableTypeCoercion = enableTypeCoercion;
        this.disableTables = disableTables;
        this.typeInferrer = new type_inference_1.TypeInferrer();
    }
    /**
     * Encodes data to ZON format.
     * When disableTables is true, bypasses table generation and formats data directly.
     *
     * Every decision below is made on the value that is actually emitted (the
     * metadata-embedded copy when `options.embedMetadata` is set), so embedded
     * metadata is never dropped on a fallback path.
     *
     * @param data - Data to encode
     * @param options - Optional encoding options (`embedMetadata`, `version`, `schemaId`)
     * @returns ZON formatted string
     * @throws Error when the data contains a circular reference
     */
    encode(data, options) {
        let processedData = data;
        if (options && options.embedMetadata) {
            processedData = (0, versioning_1.embedVersion)(data, options.version || '1.3.0', options.schemaId);
        }
        if (this.disableTables) {
            return this._formatWithoutTables(processedData);
        }
        const [streams, metadata] = this._extractStreams(processedData);
        const hasMetadata = Boolean(metadata && Object.keys(metadata).length > 0);
        if (streams.size === 0 && !hasMetadata) {
            return this._formatWithoutTables(processedData);
        }
        // A root array of records with little key overlap is cheaper inline than as a table.
        if (Array.isArray(processedData) &&
            processedData.length > 0 &&
            processedData.every(item => this._isRecord(item)) &&
            this._calculateIrregularity(processedData) > 0.6) {
            return this._formatZonNode(processedData);
        }
        const output = [];
        if (hasMetadata) {
            output.push(...this._writeMetadata(metadata));
        }
        const streamEntries = Array.from(streams.entries()).sort((a, b) => a[0].localeCompare(b[0]));
        for (const [key, rows] of streamEntries) {
            if (output.length > 0) {
                // Blank line between sections.
                output.push("");
            }
            output.push(...this._writeTable(rows, key || "data"));
        }
        return output.join("\n");
    }
    /**
     * Formats a value without any table section.
     *
     * @param value - Value to format
     * @returns "" for an empty plain object, an inline ZON node for other objects/arrays,
     *          and `JSON.stringify(value)` for primitives
     */
    _formatWithoutTables(value) {
        if (typeof value === 'object' && value !== null) {
            if (this._isRecord(value) && Object.keys(value).length === 0) {
                return "";
            }
            return this._formatZonNode(value);
        }
        return JSON.stringify(value);
    }
    /**
     * Tells whether a value is a record: a non-null, non-array object that is not a Date.
     * Dates are excluded because they are scalar values, not key/value containers.
     *
     * @param value - Value to test
     * @returns True for record-like objects
     */
    _isRecord(value) {
        return typeof value === 'object' && value !== null && !Array.isArray(value) && !(value instanceof Date);
    }
    /**
     * Extracts all uniform arrays that should become tables.
     *
     * An array qualifies only when it is non-empty and EVERY element is a record;
     * checking just the first element would turn primitives or nulls further down
     * the array into rows of nulls.
     *
     * @param data - Input data
     * @returns Tuple of [streams Map, metadata]
     */
    _extractStreams(data) {
        const isTable = (v) => Array.isArray(v) && v.length > 0 && v.every(item => this._isRecord(item));
        const streams = new Map();
        if (Array.isArray(data)) {
            if (isTable(data)) {
                // A root-level table is stored under the empty key.
                streams.set('', data);
            }
            return [streams, {}];
        }
        const metadata = {};
        if (typeof data === 'object' && data !== null) {
            for (const [k, v] of Object.entries(data)) {
                if (isTable(v)) {
                    streams.set(k, v);
                }
                else {
                    metadata[k] = v;
                }
            }
        }
        return [streams, metadata];
    }
    /**
     * Writes metadata section in YAML-like format.
     * Keys are emitted in sorted order; nested values that render as `{...}` or
     * `[...]` are appended to the key without a separator.
     *
     * @param metadata - Metadata object
     * @returns Array of formatted lines
     */
    _writeMetadata(metadata) {
        return Object.keys(metadata).sort().map(key => {
            const val = metadata[key];
            if (typeof val === 'object' && val !== null) {
                const nested = this._formatZonNode(val);
                if (nested.startsWith('{') || nested.startsWith('[')) {
                    return `${key}${nested}`;
                }
                return `${key}${constants_1.META_SEPARATOR}${nested}`;
            }
            return `${key}${constants_1.META_SEPARATOR}${this._formatValue(val)}`;
        });
    }
    /**
     * Writes table data with adaptive encoding strategy.
     * Rows are flattened (up to 5 levels), optionally type-coerced, and then written
     * as a dictionary table, a sparse table, or a standard table.
     *
     * @param stream - Array of data objects
     * @param key - Table key name
     * @returns Array of formatted lines
     */
    _writeTable(stream, key) {
        if (!stream || stream.length === 0) {
            return [];
        }
        const flatStream = stream.map(row => this._flatten(row, '', '.', 5));
        const columnSet = new Set();
        for (const row of flatStream) {
            for (const name of Object.keys(row)) {
                columnSet.add(name);
            }
        }
        const cols = Array.from(columnSet).sort();
        if (this.enableTypeCoercion) {
            for (const col of cols) {
                const inferred = this.typeInferrer.inferColumnType(flatStream.map(row => row[col]));
                if (!inferred.coercible) {
                    continue;
                }
                for (const row of flatStream) {
                    if (col in row && row[col] !== undefined && row[col] !== null) {
                        row[col] = this.typeInferrer.coerce(row[col], inferred);
                    }
                }
            }
        }
        const dictionaries = this.enableDictionaryCompression ? this._detectDictionaries(flatStream, cols) : new Map();
        if (dictionaries.size > 0) {
            return this._writeDictionaryTable(flatStream, cols, dictionaries, stream.length, key);
        }
        // Columns present in fewer than 70% of rows are written as `name:value` extras.
        const columnStats = this._analyzeColumnSparsity(flatStream, cols);
        const coreColumns = columnStats.filter(c => c.presence >= 0.7).map(c => c.name);
        const optionalColumns = columnStats.filter(c => c.presence < 0.7).map(c => c.name);
        if (optionalColumns.length > 0) {
            return this._writeSparseTable(flatStream, coreColumns, optionalColumns, stream.length, key);
        }
        return this._writeStandardTable(flatStream, cols, stream.length, key);
    }
    /**
     * Builds the header line shared by every table flavour.
     * Named tables get `key<sep><marker>(rows)`, the default table (`data` or no key)
     * gets `<marker>rows`; both are followed by the separator and the column list.
     *
     * @param key - Table key
     * @param rowCount - Number of rows
     * @param cols - Column names listed in the header
     * @returns Header line
     */
    _tableHeader(key, rowCount, cols) {
        const prefix = key && key !== 'data'
            ? `${key}${constants_1.META_SEPARATOR}${constants_1.TABLE_MARKER}(${rowCount})`
            : `${constants_1.TABLE_MARKER}${rowCount}`;
        return `${prefix}${constants_1.META_SEPARATOR}${cols.join(',')}`;
    }
    /**
     * Writes standard table format.
     *
     * @param flatStream - Flattened data rows
     * @param cols - Column names
     * @param rowCount - Number of rows
     * @param key - Table key
     * @returns Array of formatted lines
     */
    _writeStandardTable(flatStream, cols, rowCount, key) {
        const lines = [this._tableHeader(key, rowCount, cols)];
        for (const row of flatStream) {
            // _formatValue renders missing and null cells as "null".
            lines.push(cols.map(col => this._formatValue(row[col])).join(','));
        }
        return lines;
    }
    /**
     * Writes sparse table format for semi-uniform data.
     * Core columns are positional; optional columns are appended as `name:value`
     * only for rows that define them.
     *
     * @param flatStream - Flattened data rows
     * @param coreColumns - Core column names
     * @param optionalColumns - Optional column names
     * @param rowCount - Number of rows
     * @param key - Table key
     * @returns Array of formatted lines
     */
    _writeSparseTable(flatStream, coreColumns, optionalColumns, rowCount, key) {
        const lines = [this._tableHeader(key, rowCount, coreColumns)];
        for (const row of flatStream) {
            const tokens = coreColumns.map(col => this._formatValue(row[col]));
            for (const col of optionalColumns) {
                if (col in row && row[col] !== undefined) {
                    tokens.push(`${col}:${this._formatValue(row[col])}`);
                }
            }
            lines.push(tokens.join(','));
        }
        return lines;
    }
    /**
     * Analyzes column presence across rows.
     * A cell counts as present when the key exists and the value is neither
     * undefined nor null.
     *
     * @param data - Array of data rows
     * @param cols - Column names
     * @returns Array of `{ name, presence }` where presence is the fraction of rows with a value
     */
    _analyzeColumnSparsity(data, cols) {
        return cols.map(col => {
            const presenceCount = data.filter(row => col in row && row[col] !== undefined && row[col] !== null).length;
            return {
                name: col,
                presence: presenceCount / data.length
            };
        });
    }
    /**
     * Detects dictionary compression opportunities for string columns.
     *
     * A column is eligible only when every row holds a string (any other cell would
     * have no dictionary index) and every distinct value can be written without
     * quoting (the dictionary line is a plain comma-joined list).
     *
     * @param data - Array of data rows
     * @param cols - Column names
     * @returns Map of column names to sorted unique value dictionaries
     */
    _detectDictionaries(data, cols) {
        const dictionaries = new Map();
        for (const col of cols) {
            const values = data.map(row => row[col]);
            if (values.length === 0 || !values.every(v => typeof v === 'string')) {
                continue;
            }
            const uniqueValues = Array.from(new Set(values));
            // Heuristic: Avoid dictionary for single unique value unless it's long (readability)
            if (uniqueValues.length === 1 && uniqueValues[0].length < 20) {
                continue;
            }
            if (!uniqueValues.every(v => this._formatValue(v) === v)) {
                continue;
            }
            // Rough size estimate: inline cost vs. dictionary definition plus per-row references.
            const valuesLength = uniqueValues.reduce((sum, v) => sum + v.length, 0);
            const avgLength = valuesLength / uniqueValues.length;
            const currentTokens = values.length * avgLength;
            const refCost = uniqueValues.length < 10 ? 1 : (uniqueValues.length < 100 ? 2 : 3);
            const definitionOverhead = col.length + 4 + valuesLength + (uniqueValues.length - 1);
            const dictTokens = valuesLength + (values.length * refCost) + definitionOverhead;
            const savings = (currentTokens - dictTokens) / currentTokens;
            const threshold = values.length < 20 ? 0.1 : 0.2;
            if (savings > threshold && uniqueValues.length < values.length / 2 && uniqueValues.length <= 50) {
                dictionaries.set(col, uniqueValues.sort());
            }
        }
        return dictionaries;
    }
    /**
     * Writes table with dictionary compression for string columns.
     * Emits one `col[n]:v1,v2,...` line per dictionary, then a table whose
     * dictionary columns come first and hold indexes into those lists.
     *
     * Dictionaries are expected to cover every value of their column (which
     * `_detectDictionaries` guarantees); a value missing from its dictionary is
     * written as index -1.
     *
     * @param flatStream - Flattened data rows
     * @param cols - All column names
     * @param dictionaries - Map of column names to dictionaries
     * @param rowCount - Number of rows
     * @param key - Table key name
     * @returns Array of formatted lines
     */
    _writeDictionaryTable(flatStream, cols, dictionaries, rowCount, key) {
        const lines = [];
        const lookups = new Map();
        for (const [col, values] of dictionaries) {
            lines.push(`${col}[${values.length}]:${values.join(',')}`);
            // Value -> first index map, built once so rows do not rescan the dictionary.
            const indexByValue = new Map();
            values.forEach((value, index) => {
                if (!indexByValue.has(value)) {
                    indexByValue.set(value, index);
                }
            });
            lookups.set(col, indexByValue);
        }
        const dictCols = Array.from(dictionaries.keys());
        const regularCols = cols.filter(c => !dictionaries.has(c));
        lines.push(this._tableHeader(key, rowCount, [...dictCols, ...regularCols]));
        for (const row of flatStream) {
            const tokens = dictCols.map(col => {
                const index = lookups.get(col).get(row[col]);
                return String(index === undefined ? -1 : index);
            });
            for (const col of regularCols) {
                tokens.push(this._formatValue(row[col]));
            }
            lines.push(tokens.join(','));
        }
        return lines;
    }
    /**
     * Analyzes columns for compression opportunities.
     *
     * @param data - Array of data rows
     * @param cols - Column names
     * @returns Object keyed by column with `is_sequential` (all-numeric column with a
     *          constant difference), `step` (that difference) and `has_repetition`
     */
    _analyzeColumns(data, cols) {
        const analysis = {};
        for (const col of cols) {
            const vals = data.map(d => d[col]);
            const result = {
                is_sequential: false,
                step: 1,
                has_repetition: false
            };
            if (vals.length > 1 && vals.every(v => typeof v === 'number')) {
                const diffs = new Set(vals.slice(1).map((n, i) => n - vals[i]));
                if (diffs.size === 1) {
                    result.is_sequential = true;
                    result.step = diffs.values().next().value;
                }
            }
            if (vals.length > 1) {
                try {
                    const unique = new Set(vals.map(v => JSON.stringify(v)));
                    result.has_repetition = unique.size < vals.length;
                }
                catch (e) {
                    // JSON.stringify throws on circular structures and BigInt;
                    // such columns are reported as having no repetition.
                }
            }
            analysis[col] = result;
        }
        return analysis;
    }
    /**
     * Calculates schema irregularity score for array of objects.
     * The score is 1 minus the average pairwise Jaccard similarity of the rows' key sets.
     *
     * @param data - Array of objects
     * @returns Irregularity score from 0.0 (uniform) to 1.0 (irregular)
     */
    _calculateIrregularity(data) {
        const keySets = data.map(item => new Set(Object.keys(item)));
        // Covers both "no rows" and "no keys anywhere".
        if (keySets.every(keys => keys.size === 0)) {
            return 0;
        }
        let totalOverlap = 0;
        let comparisons = 0;
        for (let i = 0; i < keySets.length; i++) {
            for (let j = i + 1; j < keySets.length; j++) {
                const left = keySets[i];
                const right = keySets[j];
                let shared = 0;
                left.forEach(k => {
                    if (right.has(k)) {
                        shared++;
                    }
                });
                const union = left.size + right.size - shared;
                totalOverlap += union > 0 ? shared / union : 1;
                comparisons++;
            }
        }
        if (comparisons === 0) {
            return 0;
        }
        return 1 - (totalOverlap / comparisons);
    }
    /**
     * Quotes string for CSV format (RFC 4180).
     *
     * @param s - String to quote
     * @returns The string wrapped in double quotes with inner quotes doubled
     */
    _csvQuote(s) {
        return `"${s.replace(/"/g, '""')}"`;
    }
    /**
     * Formats nested structures using ZON syntax.
     *
     * `visited` tracks only the objects on the current traversal path, so an object
     * that is referenced several times (without forming a cycle) is formatted each
     * time instead of being reported as circular.
     *
     * @param val - Value to format
     * @param visited - Set of objects currently being formatted, for circular reference detection
     * @returns Formatted string
     * @throws Error when a circular reference is found
     */
    _formatZonNode(val, visited = new WeakSet()) {
        // Primitives and Dates are scalars; Dates must not be treated as key-less objects.
        if (typeof val !== 'object' || val === null || val instanceof Date) {
            return this._formatValue(val);
        }
        if (visited.has(val)) {
            throw new Error('Circular reference detected');
        }
        visited.add(val);
        try {
            if (Array.isArray(val)) {
                if (val.length === 0) {
                    return "[]";
                }
                return "[" + val.map(item => this._formatZonNode(item, visited)).join(",") + "]";
            }
            const keys = Object.keys(val).sort();
            if (keys.length === 0) {
                return "{}";
            }
            const items = keys.map(k => {
                let kStr = String(k);
                if (/[,:\{\}\[\]"]/.test(kStr)) {
                    kStr = JSON.stringify(kStr);
                }
                const vStr = this._formatZonNode(val[k], visited);
                // Nested containers attach directly to the key; scalars use ':'.
                if (vStr.startsWith('{') || vStr.startsWith('[')) {
                    return `${kStr}${vStr}`;
                }
                return `${kStr}:${vStr}`;
            });
            return "{" + items.join(",") + "}";
        }
        finally {
            visited.delete(val);
        }
    }
    /**
     * Formats a value with minimal quoting.
     *
     * @param val - Value to format
     * @returns "null" for null/undefined, non-finite numbers and invalid Dates;
     *          "T"/"F" (or "true"/"false" with type coercion) for booleans;
     *          the ISO string for Dates; an inline node for objects/arrays;
     *          otherwise the string form, quoted when it would be misread
     */
    _formatValue(val) {
        if (val === null || val === undefined) {
            return "null";
        }
        if (typeof val === 'boolean') {
            if (this.enableTypeCoercion) {
                return val ? "true" : "false";
            }
            return val ? "T" : "F";
        }
        if (typeof val === 'number') {
            // String() of a finite non-integer always contains '.' or an exponent,
            // so no ".0" suffix is ever needed.
            return Number.isFinite(val) ? String(val) : "null";
        }
        if (val instanceof Date) {
            // toISOString() throws RangeError on an invalid Date.
            return Number.isNaN(val.getTime()) ? "null" : val.toISOString();
        }
        if (typeof val === 'object') {
            return this._formatZonNode(val);
        }
        const s = String(val);
        if (this._isISODate(s)) {
            return s;
        }
        if (this._needsTypeProtection(s) || this._needsQuotes(s)) {
            return (0, utils_1.quoteString)(s);
        }
        return s;
    }
    /**
     * Checks if string is an ISO 8601 date/datetime.
     * Accepts `YYYY-MM-DDTHH:MM:SS` with `Z` or a `±HH:MM` offset, `YYYY-MM-DD`, and `HH:MM:SS`.
     *
     * @param s - String to check
     * @returns True if ISO date format
     */
    _isISODate(s) {
        const patterns = [
            /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})$/,
            /^\d{4}-\d{2}-\d{2}$/,
            /^\d{2}:\d{2}:\d{2}$/
        ];
        return patterns.some(pattern => pattern.test(s));
    }
    /**
     * Determines if string needs type protection quoting, i.e. whether the bare
     * text would read as a boolean/null keyword, a reserved token, a number, or
     * would lose surrounding whitespace or control characters.
     *
     * @param s - String to check
     * @returns True if quoting needed
     */
    _needsTypeProtection(s) {
        if (['t', 'f', 'true', 'false', 'null', 'none', 'nil'].includes(s.toLowerCase())) {
            return true;
        }
        if ([constants_1.GAS_TOKEN, constants_1.LIQUID_TOKEN].includes(s)) {
            return true;
        }
        if (s.trim() !== s) {
            return true;
        }
        if (/[\x00-\x1f]/.test(s)) {
            return true;
        }
        if (/^-?\d+$/.test(s) || /^-?\d+\.\d+$/.test(s) || /^-?\d+(\.\d+)?e[+-]?\d+$/i.test(s)) {
            return true;
        }
        if (/^\d/.test(s) || /\d$/.test(s)) {
            // Catches remaining spellings that round-trip exactly through a number.
            const num = Number.parseFloat(s);
            if (!Number.isNaN(num) && String(num) === s) {
                return true;
            }
        }
        return false;
    }
    /**
     * Determines if string needs CSV quoting.
     *
     * @param s - String to check
     * @returns True if quoting needed
     */
    _needsQuotes(s) {
        if (!s) {
            return true;
        }
        if (['T', 'F', 'null', constants_1.GAS_TOKEN, constants_1.LIQUID_TOKEN].includes(s)) {
            return true;
        }
        if (/^-?\d+$/.test(s)) {
            return true;
        }
        // Anything with a numeric prefix (e.g. "12abc") is quoted as well.
        if (!Number.isNaN(Number.parseFloat(s))) {
            return true;
        }
        if (s.trim() !== s) {
            return true;
        }
        if (/[,\n\r\t"\[\]{};]/.test(s)) {
            return true;
        }
        // Single quotes are allowed in the middle of words, but not at the start
        // (because that would look like a quoted string to the decoder)
        if (s.startsWith("'")) {
            return true;
        }
        if (s.includes('//') || s.includes('/*')) {
            return true;
        }
        return false;
    }
    /**
     * Flattens nested dictionary with depth limit.
     *
     * Nested records are expanded into `parent<sep>child` keys while
     * `currentDepth < maxDepth`. Dates, arrays, primitives and EMPTY records are kept
     * as leaf values so that they are not silently dropped from the row.
     * `visited` tracks only the current traversal path, so repeated (non-circular)
     * references are allowed.
     *
     * @param d - Dictionary to flatten
     * @param parent - Parent key prefix
     * @param sep - Key separator
     * @param maxDepth - Maximum flattening depth
     * @param currentDepth - Current depth level
     * @param visited - Set of objects currently being flattened
     * @returns Flattened dictionary
     * @throws Error when a circular reference is found
     */
    _flatten(d, parent = '', sep = '.', maxDepth = 0, currentDepth = 0, visited = new WeakSet()) {
        if (!this._isRecord(d)) {
            return parent ? { [parent]: d } : {};
        }
        if (visited.has(d)) {
            throw new Error('Circular reference detected');
        }
        visited.add(d);
        try {
            const items = [];
            for (const [k, v] of Object.entries(d)) {
                const newKey = parent ? `${parent}${sep}${k}` : k;
                const expandable = this._isRecord(v) && currentDepth < maxDepth && Object.keys(v).length > 0;
                if (expandable) {
                    items.push(...Object.entries(this._flatten(v, newKey, sep, maxDepth, currentDepth + 1, visited)));
                }
                else {
                    items.push([newKey, v]);
                }
            }
            return Object.fromEntries(items);
        }
        finally {
            visited.delete(d);
        }
    }
}
exports.ZonEncoder = ZonEncoder;
/**
 * Convenience wrapper: builds a ZonEncoder from the given options and encodes
 * the data with it.
 *
 * @param data - Data to encode
 * @param options - Optional encoding options; `anchorInterval`, `enableDictCompression`,
 *                  `enableTypeCoercion` and `disableTables` configure the encoder, and the
 *                  whole object is also forwarded to `ZonEncoder#encode`
 * @returns ZON formatted string
 */
function encode(data, options) {
    // Missing options (or missing fields) become undefined so the constructor defaults apply.
    const settings = options === null || options === undefined ? {} : options;
    const zonEncoder = new ZonEncoder(
        settings.anchorInterval,
        settings.enableDictCompression,
        settings.enableTypeCoercion,
        settings.disableTables
    );
    return zonEncoder.encode(data, options);
}
const llm_optimizer_1 = require("../tools/llm-optimizer");
/**
 * Encodes data for LLM consumption.
 *
 * For the 'generation' and 'analysis' tasks, field order is first optimized with
 * LLMOptimizer: a root array is optimized directly, and for a root object every
 * array-valued property of a shallow copy is optimized. The result is encoded by
 * a ZonEncoder with dictionary compression and type coercion both enabled,
 * whatever the task.
 *
 * @param data - Data to encode
 * @param context - LLM context; only `context.task` is read here
 * @returns Optimized ZON string
 */
function encodeLLM(data, context) {
    const reorderFields = context.task === 'generation' || context.task === 'analysis';
    let payload = data;
    if (reorderFields) {
        const optimizer = new llm_optimizer_1.LLMOptimizer();
        if (Array.isArray(data)) {
            payload = optimizer.optimizeFieldOrder(data);
        }
        else if (typeof data === 'object' && data !== null) {
            // Work on a shallow copy so the caller's object is left untouched.
            const copy = { ...data };
            Object.keys(copy)
                .filter(name => Array.isArray(copy[name]))
                .forEach(name => {
                    copy[name] = optimizer.optimizeFieldOrder(copy[name]);
                });
            payload = copy;
        }
    }
    // Both flags are on for every task type.
    const useDictionaries = true;
    const coerceTypes = true;
    const zonEncoder = new ZonEncoder(constants_1.DEFAULT_ANCHOR_INTERVAL, useDictionaries, coerceTypes);
    return zonEncoder.encode(payload);
}