UNPKG

zon-format

Version:

ZON: The most token-efficient serialization format for LLMs - beats CSV, TOON, JSON, and all competitors

github.com/ZON-Format/zon-TS

ZON-Format/zon-TS

717 lines (716 loc) • 26.1 kB

JavaScript

"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.ZonEncoder = void 0; exports.encode = encode; exports.encodeLLM = encodeLLM; const constants_1 = require("./constants"); const utils_1 = require("./utils"); const type_inference_1 = require("../schema/type-inference"); const versioning_1 = require("./versioning"); /** * Encodes data structures into ZON format v1.3.0. */ class ZonEncoder { constructor(anchorInterval = constants_1.DEFAULT_ANCHOR_INTERVAL, enableDictCompression = true, enableTypeCoercion = false, disableTables = false) { this.anchor_interval = anchorInterval; this.safe_str_re = /^[a-zA-Z0-9_\-\.]+$/; this.enableDictionaryCompression = enableDictCompression; this.enableTypeCoercion = enableTypeCoercion; this.disableTables = disableTables; this.typeInferrer = new type_inference_1.TypeInferrer(); } /** * Encodes data to ZON format. * When disableTables is true, bypasses table generation and formats data directly. * * @param data - Data to encode * @param options - Optional encoding options * @returns ZON formatted string */ encode(data, options) { let processedData = data; if (options === null || options === void 0 ? void 0 : options.embedMetadata) { processedData = (0, versioning_1.embedVersion)(data, options.version || '1.3.0', options.schemaId); } if (this.disableTables) { if (typeof data === 'object' && data !== null) { if (!Array.isArray(data) && Object.keys(data).length === 0) { return ""; } return this._formatZonNode(processedData); } return JSON.stringify(processedData); } const [streams, metadata] = this._extractStreams(processedData); if (streams.size === 0 && (!metadata || Object.keys(metadata).length === 0)) { if (typeof data === 'object' && data !== null) { if (!Array.isArray(data) && Object.keys(data).length === 0) { return ""; } return this._formatZonNode(data); } return JSON.stringify(data); } if (Array.isArray(data) && data.length > 0 && data.every(item => typeof item === 'object' && !Array.isArray(item))) { const irregularityScore = this._calculateIrregularity(data); if (irregularityScore > 0.6) { return this._formatZonNode(data); } } const output = []; if (metadata && Object.keys(metadata).length > 0) { output.push(...this._writeMetadata(metadata)); } const streamEntries = Array.from(streams.entries()).sort((a, b) => { return a[0].localeCompare(b[0]); }); for (const [key, streamData] of streamEntries) { if (output.length > 0) { output.push(""); } const finalKey = key || "data"; output.push(...this._writeTable(streamData, finalKey)); } return output.join("\n"); } /** * Extracts all uniform arrays that should become tables. * * @param data - Input data * @returns Tuple of [streams Map, metadata] */ _extractStreams(data) { if (Array.isArray(data)) { if (data.length > 0 && typeof data[0] === 'object' && data[0] !== null && !Array.isArray(data[0])) { const streams = new Map(); streams.set('', data); return [streams, {}]; } return [new Map(), {}]; } if (typeof data === 'object' && data !== null) { const streams = new Map(); const metadata = {}; for (const [k, v] of Object.entries(data)) { if (Array.isArray(v) && v.length > 0) { if (typeof v[0] === 'object' && v[0] !== null && !Array.isArray(v[0])) { streams.set(k, v); } else { metadata[k] = v; } } else { metadata[k] = v; } } return [streams, metadata]; } return [new Map(), typeof data === 'object' ? data : {}]; } /** * Writes metadata section in YAML-like format. * * @param metadata - Metadata object * @returns Array of formatted lines */ _writeMetadata(metadata) { const lines = []; const sortedKeys = Object.keys(metadata).sort(); for (const key of sortedKeys) { const val = metadata[key]; if (typeof val === 'object' && val !== null) { const valStr = this._formatZonNode(val); if (valStr.startsWith('{') || valStr.startsWith('[')) { lines.push(`${key}${valStr}`); } else { lines.push(`${key}${constants_1.META_SEPARATOR}${valStr}`); } } else { const valStr = this._formatValue(val); lines.push(`${key}${constants_1.META_SEPARATOR}${valStr}`); } } return lines; } /** * Writes table data with adaptive encoding strategy. * * @param stream - Array of data objects * @param key - Table key name * @returns Array of formatted lines */ _writeTable(stream, key) { if (!stream || stream.length === 0) { return []; } const lines = []; const flatStream = stream.map(row => this._flatten(row, '', '.', 5)); const allKeysSet = new Set(); flatStream.forEach(d => Object.keys(d).forEach(k => allKeysSet.add(k))); let cols = Array.from(allKeysSet).sort(); if (this.enableTypeCoercion) { for (const col of cols) { const values = flatStream.map(row => row[col]); const inferred = this.typeInferrer.inferColumnType(values); if (inferred.coercible) { for (const row of flatStream) { if (col in row && row[col] !== undefined && row[col] !== null) { row[col] = this.typeInferrer.coerce(row[col], inferred); } } } } } const dictionaries = this.enableDictionaryCompression ? this._detectDictionaries(flatStream, cols) : new Map(); if (dictionaries.size > 0) { return this._writeDictionaryTable(flatStream, cols, dictionaries, stream.length, key); } const columnStats = this._analyzeColumnSparsity(flatStream, cols); const coreColumns = columnStats.filter(c => c.presence >= 0.7).map(c => c.name); const optionalColumns = columnStats.filter(c => c.presence < 0.7).map(c => c.name); const useSparseEncoding = optionalColumns.length > 0; if (useSparseEncoding) { return this._writeSparseTable(flatStream, coreColumns, optionalColumns, stream.length, key); } else { return this._writeStandardTable(flatStream, cols, stream.length, key); } } /** * Writes standard table format. * * @param flatStream - Flattened data rows * @param cols - Column names * @param rowCount - Number of rows * @param key - Table key * @returns Array of formatted lines */ _writeStandardTable(flatStream, cols, rowCount, key) { const lines = []; let header = ''; if (key && key !== 'data') { header = `${key}${constants_1.META_SEPARATOR}${constants_1.TABLE_MARKER}(${rowCount})`; } else { header = `${constants_1.TABLE_MARKER}${rowCount}`; } header += `${constants_1.META_SEPARATOR}${cols.join(',')}`; lines.push(header); for (const row of flatStream) { const tokens = []; for (const col of cols) { const val = row[col]; if (val === undefined || val === null) { tokens.push('null'); } else { tokens.push(this._formatValue(val)); } } lines.push(tokens.join(',')); } return lines; } /** * Writes sparse table format for semi-uniform data. * * @param flatStream - Flattened data rows * @param coreColumns - Core column names * @param optionalColumns - Optional column names * @param rowCount - Number of rows * @param key - Table key * @returns Array of formatted lines */ _writeSparseTable(flatStream, coreColumns, optionalColumns, rowCount, key) { const lines = []; let header = ''; if (key && key !== 'data') { header = `${key}${constants_1.META_SEPARATOR}${constants_1.TABLE_MARKER}(${rowCount})`; } else { header = `${constants_1.TABLE_MARKER}${rowCount}`; } header += `${constants_1.META_SEPARATOR}${coreColumns.join(',')}`; lines.push(header); for (const row of flatStream) { const tokens = []; for (const col of coreColumns) { tokens.push(this._formatValue(row[col])); } for (const col of optionalColumns) { if (col in row && row[col] !== undefined) { const val = this._formatValue(row[col]); tokens.push(`${col}:${val}`); } } lines.push(tokens.join(',')); } return lines; } /** * Analyzes column presence across rows. * * @param data - Array of data rows * @param cols - Column names * @returns Array of column statistics */ _analyzeColumnSparsity(data, cols) { return cols.map(col => { const presenceCount = data.filter(row => col in row && row[col] !== undefined && row[col] !== null).length; return { name: col, presence: presenceCount / data.length }; }); } /** * Detects dictionary compression opportunities for string columns. * * @param data - Array of data rows * @param cols - Column names * @returns Map of column names to unique value dictionaries */ _detectDictionaries(data, cols) { const dictionaries = new Map(); for (const col of cols) { const values = data.map(row => row[col]).filter(v => typeof v === 'string'); if (values.length < data.length * 0.8) continue; const uniqueValues = Array.from(new Set(values)); const repetitionRate = 1 - (uniqueValues.length / values.length); const avgLength = uniqueValues.reduce((sum, v) => sum + v.length, 0) / uniqueValues.length; const currentTokens = values.length * avgLength; const refCost = uniqueValues.length < 10 ? 1 : (uniqueValues.length < 100 ? 2 : 3); const valuesLength = uniqueValues.reduce((sum, v) => sum + v.length, 0); const definitionOverhead = col.length + 4 + valuesLength + (uniqueValues.length - 1); const dictTokens = valuesLength + (values.length * refCost) + definitionOverhead; const savings = (currentTokens - dictTokens) / currentTokens; const threshold = values.length < 20 ? 0.1 : 0.2; // Heuristic: Avoid dictionary for single unique value unless it's long (readability) if (uniqueValues.length === 1 && uniqueValues[0].length < 20) { continue; } if (savings > threshold && uniqueValues.length < values.length / 2 && uniqueValues.length <= 50) { dictionaries.set(col, uniqueValues.sort()); } } return dictionaries; } /** * Writes table with dictionary compression for string columns. * * @param flatStream - Flattened data rows * @param cols - All column names * @param dictionaries - Map of column names to dictionaries * @param rowCount - Number of rows * @param key - Table key name * @returns Array of formatted lines */ _writeDictionaryTable(flatStream, cols, dictionaries, rowCount, key) { const lines = []; for (const [col, values] of dictionaries) { lines.push(`${col}[${values.length}]:${values.join(',')}`); } const dictCols = Array.from(dictionaries.keys()); const regularCols = cols.filter(c => !dictionaries.has(c)); const allCols = [...dictCols, ...regularCols]; let header = ''; if (key && key !== 'data') { header = `${key}${constants_1.META_SEPARATOR}${constants_1.TABLE_MARKER}(${rowCount})`; } else { header = `${constants_1.TABLE_MARKER}${rowCount}`; } header += `${constants_1.META_SEPARATOR}${allCols.join(',')}`; lines.push(header); for (const row of flatStream) { const tokens = []; for (const col of dictCols) { const value = row[col]; const dict = dictionaries.get(col); const index = dict.indexOf(value); tokens.push(String(index)); } for (const col of regularCols) { const val = row[col]; if (val === undefined || val === null) { tokens.push('null'); } else { tokens.push(this._formatValue(val)); } } lines.push(tokens.join(',')); } return lines; } /** * Analyzes columns for compression opportunities. * * @param data - Array of data rows * @param cols - Column names * @returns Column analysis results */ _analyzeColumns(data, cols) { const analysis = {}; for (const col of cols) { const vals = data.map(d => d[col]); const result = { is_sequential: false, step: 1, has_repetition: false }; const nums = vals.filter(v => typeof v === 'number' && typeof v !== 'boolean'); if (nums.length === vals.length && vals.length > 1) { try { const diffs = nums.slice(1).map((n, i) => n - nums[i]); const uniqueDiffs = new Set(diffs); if (uniqueDiffs.size === 1) { result.is_sequential = true; result.step = Array.from(uniqueDiffs)[0]; } } catch (e) { } } if (vals.length > 1) { try { const unique = new Set(vals.map(v => JSON.stringify(v))); if (unique.size < vals.length) { result.has_repetition = true; } } catch (e) { } } analysis[col] = result; } return analysis; } /** * Calculates schema irregularity score for array of objects. * * @param data - Array of objects * @returns Irregularity score from 0.0 (uniform) to 1.0 (irregular) */ _calculateIrregularity(data) { if (data.length === 0) { return 0; } const allKeys = new Set(); const keySets = []; for (const item of data) { const keys = new Set(Object.keys(item)); keySets.push(keys); keys.forEach(k => allKeys.add(k)); } const totalKeys = allKeys.size; if (totalKeys === 0) { return 0; } let totalOverlap = 0; let comparisons = 0; for (let i = 0; i < keySets.length; i++) { for (let j = i + 1; j < keySets.length; j++) { const keys1 = keySets[i]; const keys2 = keySets[j]; let shared = 0; keys1.forEach(k => { if (keys2.has(k)) shared++; }); const union = keys1.size + keys2.size - shared; const similarity = union > 0 ? shared / union : 1; totalOverlap += similarity; comparisons++; } } if (comparisons === 0) { return 0; } const avgSimilarity = totalOverlap / comparisons; const irregularity = 1 - avgSimilarity; return irregularity; } /** * Quotes string for CSV format (RFC 4180). * * @param s - String to quote * @returns Quoted string */ _csvQuote(s) { const escaped = s.replace(/"/g, '""'); return `"${escaped}"`; } /** * Formats nested structures using ZON syntax. * * @param val - Value to format * @param visited - Set of visited objects for circular reference detection * @returns Formatted string */ _formatZonNode(val, visited = new WeakSet()) { if (typeof val === 'object' && val !== null) { if (visited.has(val)) { throw new Error('Circular reference detected'); } visited.add(val); } if (typeof val === 'object' && val !== null && !Array.isArray(val)) { const keys = Object.keys(val).sort(); if (keys.length === 0) { return "{}"; } const items = []; for (const k of keys) { let kStr = String(k); if (/[,:\{\}\[\]"]/.test(kStr)) { kStr = JSON.stringify(kStr); } const vStr = this._formatZonNode(val[k], visited); if (vStr.startsWith('{') || vStr.startsWith('[')) { items.push(`${kStr}${vStr}`); } else { items.push(`${kStr}:${vStr}`); } } return "{" + items.join(",") + "}"; } else if (Array.isArray(val)) { if (val.length === 0) { return "[]"; } return "[" + val.map(item => this._formatZonNode(item, visited)).join(",") + "]"; } return this._formatValue(val); } /** * Formats a value with minimal quoting. * * @param val - Value to format * @returns Formatted string */ _formatValue(val) { if (val === null || val === undefined) { return "null"; } if (typeof val === 'boolean') { if (this.enableTypeCoercion) { return val ? "true" : "false"; } return val ? "T" : "F"; } if (typeof val === 'number') { if (!Number.isFinite(val)) { return "null"; } // Removed scientific notation expansion as it was incorrect and unnecessary // ZON supports scientific notation natively if (Number.isInteger(val)) { return String(val); } let s = String(val); if (!s.includes('.') && !s.includes('e') && !s.includes('E')) { s += '.0'; } return s; } if (val instanceof Date) { return val.toISOString(); } if (Array.isArray(val) || (typeof val === 'object' && val !== null)) { return this._formatZonNode(val); } const s = String(val); if (this._isISODate(s)) { return s; } const needsTypeProtection = this._needsTypeProtection(s); if (needsTypeProtection) { return (0, utils_1.quoteString)(s); } if (this._needsQuotes(s)) { return (0, utils_1.quoteString)(s); } return s; } /** * Checks if string is an ISO 8601 date/datetime. * * @param s - String to check * @returns True if ISO date format */ _isISODate(s) { if (/^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(Z|[+-]\d{2}:\d{2})$/.test(s)) { return true; } if (/^\d{4}-\d{2}-\d{2}$/.test(s)) { return true; } if (/^\d{2}:\d{2}:\d{2}$/.test(s)) { return true; } return false; } /** * Determines if string needs type protection quoting. * * @param s - String to check * @returns True if quoting needed */ _needsTypeProtection(s) { const sLower = s.toLowerCase(); if (['t', 'f', 'true', 'false', 'null', 'none', 'nil'].includes(sLower)) { return true; } if ([constants_1.GAS_TOKEN, constants_1.LIQUID_TOKEN].includes(s)) { return true; } if (s.trim() !== s) { return true; } if (/[\x00-\x1f]/.test(s)) { return true; } if (/^-?\d+$/.test(s)) { return true; } if (/^-?\d+\.\d+$/.test(s)) { return true; } if (/^-?\d+(\.\d+)?e[+-]?\d+$/i.test(s)) { return true; } if (/^\d/.test(s) || /\d$/.test(s)) { const num = parseFloat(s); if (!isNaN(num) && String(num) === s) { return true; } } return false; } /** * Determines if string needs CSV quoting. * * @param s - String to check * @returns True if quoting needed */ _needsQuotes(s) { if (!s) { return true; } if (['T', 'F', 'null', constants_1.GAS_TOKEN, constants_1.LIQUID_TOKEN].includes(s)) { return true; } if (/^-?\d+$/.test(s)) { return true; } try { parseFloat(s); if (!isNaN(parseFloat(s))) { return true; } } catch (e) { } if (s.trim() !== s) { return true; } if (/[,\n\r\t"\[\]{};]/.test(s)) { return true; } // Single quotes are allowed in the middle of words, but not at the start // (because that would look like a quoted string to the decoder) if (s.startsWith("'")) { return true; } if (s.includes('//') || s.includes('/*')) { return true; } return false; } /** * Flattens nested dictionary with depth limit. * * @param d - Dictionary to flatten * @param parent - Parent key prefix * @param sep - Key separator * @param maxDepth - Maximum flattening depth * @param currentDepth - Current depth level * @param visited - Set of visited objects * @returns Flattened dictionary */ _flatten(d, parent = '', sep = '.', maxDepth = 0, currentDepth = 0, visited = new WeakSet()) { if (typeof d === 'object' && d !== null) { if (visited.has(d)) { throw new Error('Circular reference detected'); } visited.add(d); } if (typeof d !== 'object' || d === null || Array.isArray(d)) { return parent ? { [parent]: d } : {}; } const items = []; for (const [k, v] of Object.entries(d)) { const newKey = parent ? `${parent}${sep}${k}` : k; if (typeof v === 'object' && v !== null && !Array.isArray(v) && currentDepth < maxDepth) { const flattened = this._flatten(v, newKey, sep, maxDepth, currentDepth + 1, visited); items.push(...Object.entries(flattened)); } else { items.push([newKey, v]); } } return Object.fromEntries(items); } } exports.ZonEncoder = ZonEncoder; /** * Encodes data to ZON format v1.1.0. * * @param data - Data to encode * @param options - Optional encoding options * @returns ZON formatted string */ function encode(data, options) { const encoder = new ZonEncoder(options === null || options === void 0 ? void 0 : options.anchorInterval, options === null || options === void 0 ? void 0 : options.enableDictCompression, options === null || options === void 0 ? void 0 : options.enableTypeCoercion, options === null || options === void 0 ? void 0 : options.disableTables); return encoder.encode(data, options); } const llm_optimizer_1 = require("../tools/llm-optimizer"); /** * Encodes data optimized for LLM consumption. * Optimizes field order and compression based on LLM task type. * * @param data - Data to encode * @param context - LLM context including model and task type * @returns Optimized ZON string */ function encodeLLM(data, context) { let processedData = data; if (context.task === 'generation' || context.task === 'analysis') { const optimizer = new llm_optimizer_1.LLMOptimizer(); if (Array.isArray(data)) { processedData = optimizer.optimizeFieldOrder(data); } else if (typeof data === 'object' && data !== null) { const newData = { ...data }; for (const key of Object.keys(newData)) { if (Array.isArray(newData[key])) { newData[key] = optimizer.optimizeFieldOrder(newData[key]); } } processedData = newData; } } const enableDict = true; let enableTypeCoercion = true; if (context.task === 'retrieval') { enableTypeCoercion = true; } const encoder = new ZonEncoder(constants_1.DEFAULT_ANCHOR_INTERVAL, enableDict, enableTypeCoercion); return encoder.encode(processedData); }