UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

282 lines (281 loc) 10.9 kB
/**
 * JSON-aware Chunker
 *
 * Splits JSON documents based on structure (arrays, objects, keys).
 * Best for API responses, configuration files, and structured data.
 */
import { randomUUID } from "crypto";

/**
 * JSON-aware chunker implementation.
 *
 * Parses the input as JSON and walks the resulting structure, emitting
 * chunks whose pretty-printed form aims to stay within `maxSize` characters.
 * Input that is not valid JSON is returned as a single plain-text chunk.
 */
export class JSONChunker {
    strategy = "json";

    /**
     * Split a JSON document into chunks.
     *
     * @param {string} text - Raw JSON text. Empty/falsy input yields `[]`.
     * @param {object} [config] - Chunking options.
     * @param {number} [config.maxSize=1000] - Target maximum size (characters
     *   of 2-space-indented JSON) for a single chunk.
     * @param {number} [config.maxDepth=10] - Recursion depth past which a
     *   value is emitted whole instead of being split further.
     * @param {string[]} [config.splitKeys=[]] - Object keys whose values are
     *   always split out and processed on their own. When empty, every array
     *   and object is eligible for splitting.
     * @param {string[]} [config.preserveKeys=[]] - Keys whose values are never
     *   split, regardless of size.
     * @param {boolean} [config.includeJsonPath=true] - Also copy the JSON path
     *   into `metadata.custom.jsonPath` (only when the path is non-empty).
     * @param {boolean} [config.trimWhitespace=true] - Trim each chunk's text.
     * @param {object} [config.metadata={}] - Extra fields merged into each
     *   chunk's `metadata.custom`.
     * @returns {Promise<object[]>} Chunks with `id`, `text` and `metadata`.
     */
    async chunk(text, config) {
        const {
            maxSize = 1000,
            maxDepth = 10,
            splitKeys = [],
            preserveKeys = [],
            includeJsonPath = true,
            trimWhitespace = true,
            metadata = {},
        } = config || {};
        const documentId = randomUUID();
        const chunks = [];
        if (!text || text.length === 0) {
            return chunks;
        }

        // Parse JSON
        let jsonData;
        try {
            jsonData = JSON.parse(text);
        } catch {
            // Not valid JSON: fall back to a single plain-text chunk and
            // record the parse failure in the custom metadata.
            chunks.push({
                id: randomUUID(),
                text: trimWhitespace ? text.trim() : text,
                metadata: {
                    documentId,
                    chunkIndex: 0,
                    totalChunks: 1,
                    startPosition: 0,
                    endPosition: text.length,
                    documentType: "json",
                    custom: {
                        ...metadata,
                        parseError: "Invalid JSON",
                    },
                },
            });
            return chunks;
        }

        // Extract chunks from JSON structure
        const extractedChunks = this.extractChunks({
            data: jsonData,
            path: "",
            depth: 0,
            maxDepth,
            maxSize,
            splitKeys,
            preserveKeys,
            includeJsonPath,
        });

        // Convert to Chunk objects. Positions are offsets into the
        // concatenation of the emitted chunk texts, not into the input text.
        let chunkIndex = 0;
        let currentPosition = 0;
        for (const extracted of extractedChunks) {
            const chunkText = JSON.stringify(extracted.value, null, 2);
            const finalText = trimWhitespace ? chunkText.trim() : chunkText;
            if (finalText.length === 0) {
                continue;
            }
            const chunkMetadata = { ...metadata };
            if (includeJsonPath && extracted.path) {
                chunkMetadata.jsonPath = extracted.path;
            }
            chunks.push({
                id: randomUUID(),
                text: finalText,
                metadata: {
                    documentId,
                    chunkIndex,
                    startPosition: currentPosition,
                    endPosition: currentPosition + finalText.length,
                    documentType: "json",
                    jsonPath: extracted.path,
                    custom: chunkMetadata,
                },
            });
            chunkIndex++;
            currentPosition += finalText.length;
        }

        // Update total chunks count now that the final number is known
        for (const chunk of chunks) {
            chunk.metadata.totalChunks = chunks.length;
        }
        return chunks;
    }

    /**
     * Recursively extract chunks from JSON structure.
     *
     * @param {object} options
     * @param {*} options.data - Parsed JSON value to split.
     * @param {string} options.path - Path of `data` from the root ("" at root).
     * @param {number} options.depth - Current recursion depth.
     * @param {number} options.maxDepth - Depth past which `data` is kept whole.
     * @param {number} options.maxSize - Target maximum serialized size.
     * @param {string[]} options.splitKeys - Keys that are always split out.
     * @param {string[]} options.preserveKeys - Keys that are never split.
     * @param {boolean} [options.includeJsonPath] - Not used here; forwarded
     *   unchanged to recursive calls.
     * @returns {{ value: *, path: string }[]} Extracted values with paths.
     */
    extractChunks(options) {
        const { data, path, depth, maxDepth, maxSize, splitKeys, preserveKeys } = options;
        const results = [];
        const whole = () => {
            results.push({ value: data, path });
            return results;
        };

        // Check depth limit
        if (depth > maxDepth) {
            return whole();
        }

        // Check if this should be preserved as a unit
        const currentKey = path.split(".").pop() || "";
        if (preserveKeys.includes(currentKey)) {
            return whole();
        }

        // Check size - if small enough, keep as one chunk
        if (JSON.stringify(data, null, 2).length <= maxSize) {
            return whole();
        }

        // Descend one level, forwarding every other option untouched.
        const recurse = (value, childPath) =>
            this.extractChunks({
                ...options,
                data: value,
                path: childPath,
                depth: depth + 1,
            });

        if (Array.isArray(data)) {
            const shouldSplit =
                splitKeys.length === 0 || splitKeys.some((k) => path.endsWith(k));
            if (!shouldSplit) {
                // Keep array as one unit
                return whole();
            }

            // Pack consecutive elements into groups of at most ~maxSize.
            let group = [];
            let groupSize = 0;
            const flushGroup = (endIndex) => {
                if (group.length === 0) {
                    return;
                }
                results.push({
                    // A lone element is emitted unwrapped.
                    value: group.length === 1 ? group[0] : group,
                    path: `${path}[${endIndex - group.length}:${endIndex}]`,
                });
                group = [];
                groupSize = 0;
            };

            data.forEach((item, i) => {
                const itemSize = JSON.stringify(item, null, 2).length;
                if (groupSize + itemSize > maxSize) {
                    flushGroup(i);
                }
                if (itemSize > maxSize) {
                    // A single element that is too large is split recursively.
                    results.push(...recurse(item, `${path}[${i}]`));
                } else {
                    group.push(item);
                    groupSize += itemSize;
                }
            });
            // Don't forget the last group
            flushGroup(data.length);
        } else if (data !== null && typeof data === "object") {
            const keys = Object.keys(data);
            const childPath = (key) => (path ? `${path}.${key}` : key);
            const groupingApplies = keys.some(
                (k) => splitKeys.length === 0 || splitKeys.includes(k),
            );

            if (groupingApplies) {
                // Entries are collected as [key, value] pairs and materialised
                // with Object.fromEntries so that a JSON key literally named
                // "__proto__" stays an own property instead of being lost to
                // a prototype assignment.
                let entries = [];
                let entriesSize = 0;
                const flushEntries = () => {
                    if (entries.length === 0) {
                        return;
                    }
                    results.push({ value: Object.fromEntries(entries), path });
                    entries = [];
                    entriesSize = 0;
                };

                for (const key of keys) {
                    const value = data[key];
                    const isContainer = value !== null && typeof value === "object";
                    // An array/object that cannot fit in one chunk by itself
                    // must be split recursively, mirroring the array branch;
                    // otherwise nested data under an object key would never
                    // be chunked when splitKeys is empty.
                    const isOversizedContainer =
                        isContainer && JSON.stringify(value, null, 2).length > maxSize;

                    if (splitKeys.includes(key) || isOversizedContainer) {
                        flushEntries();
                        results.push(...recurse(value, childPath(key)));
                        continue;
                    }

                    const entrySize = JSON.stringify({ [key]: value }, null, 2).length;
                    if (entriesSize + entrySize > maxSize) {
                        flushEntries();
                    }
                    entries.push([key, value]);
                    entriesSize += entrySize;
                }
                // Don't forget the last object
                flushEntries();
            } else {
                // splitKeys is set but none of this object's keys match:
                // process each key individually.
                for (const key of keys) {
                    const value = data[key];
                    const keyPath = childPath(key);
                    if (JSON.stringify(value, null, 2).length > maxSize) {
                        results.push(...recurse(value, keyPath));
                    } else {
                        results.push({ value: { [key]: value }, path: keyPath });
                    }
                }
            }
        } else {
            // Primitive values cannot be split further
            return whole();
        }

        // An empty container larger than maxSize (e.g. "{}" with maxSize 1)
        // yields nothing above; emit it whole rather than dropping the data.
        if (results.length === 0) {
            return whole();
        }
        return results;
    }

    /**
     * Validate chunker configuration.
     *
     * @param {object} [config] - Configuration to check; a missing config is
     *   treated as empty, matching `chunk`.
     * @returns {{ valid: boolean, errors: string[], warnings: string[] }}
     */
    validateConfig(config) {
        const errors = [];
        const warnings = [];
        const jsonConfig = config || {};
        if (jsonConfig.maxSize !== undefined && jsonConfig.maxSize <= 0) {
            errors.push("maxSize must be greater than 0");
        }
        if (jsonConfig.maxDepth !== undefined && jsonConfig.maxDepth < 1) {
            errors.push("maxDepth must be at least 1");
        }
        if (jsonConfig.maxDepth !== undefined && jsonConfig.maxDepth > 100) {
            warnings.push("Very high maxDepth may cause performance issues");
        }
        return {
            valid: errors.length === 0,
            errors,
            warnings,
        };
    }
}