@caleblawson/rag

The Retrieval-Augmented Generation (RAG) module contains document processing and embedding utilities. This file implements RecursiveJsonTransformer, which recursively splits JSON data into size-bounded chunks while preserving its nested key structure.
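As a quick illustration, here is a minimal usage sketch. The import path is an assumption for illustration only; the actual export location of RecursiveJsonTransformer within the package may differ.

import { RecursiveJsonTransformer } from '@caleblawson/rag'; // hypothetical import path

// Split nested JSON into chunks whose serialized size stays under maxSize,
// preserving the original key paths in every chunk.
const transformer = new RecursiveJsonTransformer({ maxSize: 300 });

const chunks = transformer.splitJson({
  jsonData: {
    title: 'RAG pipeline notes',
    sections: [
      { heading: 'Ingestion', body: 'Load raw documents from disk or an API...' },
      { heading: 'Chunking', body: 'Split structured data into embeddable pieces...' },
    ],
  },
  convertLists: true, // arrays become index-keyed objects ({ '0': ..., '1': ... }) before splitting
});

// splitText returns JSON strings instead of objects; with ensureAscii: true,
// non-ASCII characters are escaped (e.g. 'café' becomes 'caf\u00e9').
const texts = transformer.splitText({ jsonData: { note: 'café' }, ensureAscii: true });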

import { Document } from '../schema';

export class RecursiveJsonTransformer {
  private maxSize: number;
  private minSize: number;

  constructor({ maxSize = 2000, minSize }: { maxSize: number; minSize?: number }) {
    this.maxSize = maxSize;
    this.minSize = minSize ?? Math.max(maxSize - 200, 50);
  }

  private static jsonSize(data: Record<string, any>): number {
    const seen = new WeakSet();
    function getStringifiableData(obj: any): any {
      if (obj === null || typeof obj !== 'object') {
        return obj;
      }
      if (seen.has(obj)) {
        return '[Circular]';
      }
      seen.add(obj);
      if (Array.isArray(obj)) {
        const safeArray = [];
        for (const item of obj) {
          safeArray.push(getStringifiableData(item));
        }
        return safeArray;
      }
      const safeObj: Record<string, any> = {};
      for (const key in obj) {
        if (Object.prototype.hasOwnProperty.call(obj, key)) {
          safeObj[key] = getStringifiableData(obj[key]);
        }
      }
      return safeObj;
    }
    const stringifiable = getStringifiableData(data);
    const jsonString = JSON.stringify(stringifiable);
    return jsonString.length;
  }

  /**
   * Transform JSON data while handling circular references
   */
  public transform(data: Record<string, any>): Record<string, any> {
    const size = RecursiveJsonTransformer.jsonSize(data);
    const seen = new WeakSet();
    function createSafeCopy(obj: any): any {
      if (obj === null || typeof obj !== 'object') {
        return obj;
      }
      if (seen.has(obj)) {
        return '[Circular]';
      }
      seen.add(obj);
      if (Array.isArray(obj)) {
        return obj.map(item => createSafeCopy(item));
      }
      const copy: Record<string, any> = {};
      for (const key in obj) {
        if (Object.prototype.hasOwnProperty.call(obj, key)) {
          copy[key] = createSafeCopy(obj[key]);
        }
      }
      return copy;
    }
    return {
      size,
      data: createSafeCopy(data),
    };
  }

  /**
   * Set a value in a nested dictionary based on the given path
   */
  private static setNestedDict(d: Record<string, any>, path: string[], value: any): void {
    let current = d;
    for (const key of path.slice(0, -1)) {
      current[key] = current[key] || {};
      current = current[key];
    }
    current[path[path.length - 1]!] = value;
  }

  /**
   * Convert lists in the JSON structure to dictionaries with index-based keys
   */
  private listToDictPreprocessing(data: any): any {
    if (data && typeof data === 'object') {
      if (Array.isArray(data)) {
        return Object.fromEntries(data.map((item, index) => [String(index), this.listToDictPreprocessing(item)]));
      }
      return Object.fromEntries(Object.entries(data).map(([k, v]) => [k, this.listToDictPreprocessing(v)]));
    }
    return data;
  }

  /**
   * Handles primitive values (strings, numbers, etc.) by either adding them to the current chunk
   * or creating new chunks if they don't fit
   */
  private handlePrimitiveValue(
    value: any,
    key: string,
    currentChunk: Record<string, any>,
    chunks: Record<string, any>[],
    fullPath: string[],
  ): { currentChunk: Record<string, any>; chunks: Record<string, any>[] } {
    const testValue = { [key]: value };
    if (RecursiveJsonTransformer.jsonSize(testValue) <= this.maxSize) {
      if (RecursiveJsonTransformer.jsonSize({ ...currentChunk, ...testValue }) <= this.maxSize) {
        return {
          currentChunk: { ...currentChunk, ...testValue },
          chunks,
        };
      } else {
        return {
          currentChunk: testValue,
          chunks: [...chunks, currentChunk],
        };
      }
    } else if (typeof value === 'string') {
      const stringChunks = this.splitLongString(value);
      const newChunks = stringChunks
        .map(chunk => {
          return this.createChunk(chunk, fullPath);
        })
        .filter(chunk => RecursiveJsonTransformer.jsonSize(chunk) <= this.maxSize);
      return {
        currentChunk,
        chunks: [...chunks, ...newChunks],
      };
    }
    const newChunk = this.createChunk(value, fullPath);
    return {
      currentChunk,
      chunks: RecursiveJsonTransformer.jsonSize(newChunk) <= this.maxSize ? [...chunks, newChunk] : chunks,
    };
  }

  /**
   * Creates a nested dictionary chunk from a value and path
   * e.g., path ['a', 'b'], value 'c' becomes { a: { b: 'c' } }
   */
  private createChunk(value: any, path: string[]): Record<string, any> {
    const chunk: Record<string, any> = {};
    RecursiveJsonTransformer.setNestedDict(chunk, path, value);
    return chunk.root ? chunk.root : chunk;
  }

  /**
   * Checks if value is within size limits
   */
  private isWithinSizeLimit(value: any, currentSize: number = 0): boolean {
    const size = RecursiveJsonTransformer.jsonSize(value);
    // If this is a new chunk (currentSize = 0), allow items smaller than maxSize
    // If adding to existing chunk, ensure we're above minSize before splitting
    return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize || currentSize < this.minSize;
  }

  /**
   * Splits arrays into chunks based on size limits
   * Handles nested objects by recursing into handleNestedObject
   */
  private handleArray(
    value: any[],
    key: string,
    currentPath: string[],
    depth: number,
    maxDepth: number,
  ): Record<string, any>[] {
    const path = currentPath.length ? [...currentPath, key] : ['root', key];

    // Try keeping array intact
    const chunk = this.createChunk(value, path);
    if (this.isWithinSizeLimit(chunk)) {
      return [chunk];
    }

    const chunks: Record<string, any>[] = [];
    let currentGroup: any[] = [];

    const saveCurrentGroup = () => {
      if (currentGroup.length > 0) {
        const groupChunk = this.createChunk(currentGroup, path);
        if (RecursiveJsonTransformer.jsonSize(groupChunk) >= this.minSize) {
          chunks.push(groupChunk);
          currentGroup = [];
        }
      }
    };

    for (const item of value) {
      // Try adding item to current group
      const testGroup = [...currentGroup, item];
      const testChunk = this.createChunk(testGroup, path);
      if (this.isWithinSizeLimit(testChunk)) {
        currentGroup = testGroup;
        continue;
      }

      // Current group is full
      saveCurrentGroup();

      // Handle the new item
      if (typeof item === 'object' && item !== null) {
        const singleItemArray = [item];
        const singleItemChunk = this.createChunk(singleItemArray, path);
        if (this.isWithinSizeLimit(singleItemChunk)) {
          currentGroup = singleItemArray;
        } else {
          const itemPath = [...path, String(chunks.length)];
          const nestedChunks = this.handleNestedObject(item, itemPath, depth + 1, maxDepth);
          chunks.push(...nestedChunks);
        }
      } else {
        currentGroup = [item];
      }
    }

    saveCurrentGroup();
    return chunks;
  }

  /**
   * Splits objects into chunks based on size limits
   * Handles nested arrays and objects by recursing into handleArray and handleNestedObject
   */
  private handleNestedObject(
    value: Record<string, any>,
    fullPath: string[],
    depth: number,
    maxDepth: number,
  ): Record<string, any>[] {
    const path = fullPath.length ? fullPath : ['root'];

    // Handle max depth
    if (depth > maxDepth) {
      console.warn(`Maximum depth of ${maxDepth} exceeded, flattening remaining structure`);
      return [this.createChunk(value, path)];
    }

    // Try keeping object intact
    const wholeChunk = this.createChunk(value, path);
    if (this.isWithinSizeLimit(wholeChunk)) {
      return [wholeChunk];
    }

    const chunks: Record<string, any>[] = [];
    let currentChunk: Record<string, any> = {};

    const saveCurrentChunk = () => {
      if (Object.keys(currentChunk).length > 0) {
        const objChunk = this.createChunk(currentChunk, path);
        if (RecursiveJsonTransformer.jsonSize(objChunk) >= this.minSize) {
          chunks.push(objChunk);
          currentChunk = {};
        }
      }
    };

    for (const [key, val] of Object.entries(value)) {
      if (val === undefined) continue;

      // Handle arrays separately
      if (Array.isArray(val)) {
        saveCurrentChunk();
        const arrayChunks = this.handleArray(val, key, path, depth, maxDepth);
        chunks.push(...arrayChunks);
        continue;
      }

      // Try adding to current chunk
      const testChunk = this.createChunk({ ...currentChunk, [key]: val }, path);
      if (this.isWithinSizeLimit(testChunk)) {
        currentChunk[key] = val;
        continue;
      }

      // Current chunk is full
      saveCurrentChunk();

      // Handle value that didn't fit
      if (typeof val === 'object' && val !== null) {
        const nestedChunks = this.handleNestedObject(val, [...path, key], depth + 1, maxDepth);
        chunks.push(...nestedChunks);
      } else {
        currentChunk = { [key]: val };
      }
    }

    saveCurrentChunk();
    return chunks;
  }

  /**
   * Splits long strings into smaller chunks at word boundaries
   * Ensures each chunk is within maxSize limit
   */
  private splitLongString(value: string): string[] {
    const chunks: string[] = [];
    let remaining = value;
    while (remaining.length > 0) {
      const overhead = 20;
      const chunkSize = Math.floor(this.maxSize - overhead);
      if (remaining.length <= chunkSize) {
        chunks.push(remaining);
        break;
      }
      const lastSpace = remaining.slice(0, chunkSize).lastIndexOf(' ');
      const splitAt = lastSpace > 0 ? lastSpace + 1 : chunkSize;
      chunks.push(remaining.slice(0, splitAt));
      remaining = remaining.slice(splitAt);
    }
    return chunks;
  }

  /**
   * Core chunking logic that processes JSON data recursively
   * Handles arrays, objects, and primitive values while maintaining structure
   */
  private jsonSplit({
    data,
    currentPath = [],
    chunks = [{}],
    depth = 0,
    maxDepth = 100,
  }: {
    data: Record<string, any>;
    currentPath?: string[];
    chunks?: Record<string, any>[];
    depth?: number;
    maxDepth?: number;
  }): Record<string, any>[] {
    if (!data || typeof data !== 'object') {
      return chunks;
    }

    if (depth > maxDepth) {
      console.warn(`Maximum depth of ${maxDepth} exceeded, flattening remaining structure`);
      RecursiveJsonTransformer.setNestedDict(chunks[chunks.length - 1] || {}, currentPath, data);
      return chunks;
    }

    let currentChunk = {};
    let accumulatedChunks = chunks;

    for (const [key, value] of Object.entries(data)) {
      const fullPath = [...currentPath, key];
      if (Array.isArray(value)) {
        const arrayChunks = this.handleArray(value, key, currentPath, depth, maxDepth);
        accumulatedChunks = [...accumulatedChunks, ...arrayChunks];
      } else if (typeof value === 'object' && value !== null) {
        const objectChunks = this.handleNestedObject(value, fullPath, depth, maxDepth);
        accumulatedChunks = [...accumulatedChunks, ...objectChunks];
      } else {
        const { currentChunk: newCurrentChunk, chunks: newChunks } = this.handlePrimitiveValue(
          value,
          key,
          currentChunk,
          accumulatedChunks,
          fullPath,
        );
        currentChunk = newCurrentChunk;
        accumulatedChunks = newChunks;
      }
    }

    if (Object.keys(currentChunk).length > 0) {
      accumulatedChunks = [...accumulatedChunks, currentChunk];
    }

    return accumulatedChunks.filter(chunk => Object.keys(chunk).length > 0);
  }

  /**
   * Splits JSON into a list of JSON chunks
   */
  splitJson({
    jsonData,
    convertLists = false,
  }: {
    jsonData: Record<string, any>;
    convertLists?: boolean;
  }): Record<string, any>[] {
    const processedData = convertLists ? this.listToDictPreprocessing(jsonData) : jsonData;
    const chunks = this.jsonSplit({ data: processedData });
    if (Object.keys(chunks[chunks.length - 1] || {}).length === 0) {
      chunks.pop();
    }
    return chunks;
  }

  /**
   * Converts Unicode characters to their escaped ASCII representation
   * e.g., 'café' becomes 'caf\u00e9'
   */
  private escapeNonAscii(obj: any): any {
    if (typeof obj === 'string') {
      return obj.replace(/[\u0080-\uffff]/g, char => {
        return `\\u${char.charCodeAt(0).toString(16).padStart(4, '0')}`;
      });
    }
    if (Array.isArray(obj)) {
      return obj.map(item => this.escapeNonAscii(item));
    }
    if (typeof obj === 'object' && obj !== null) {
      return Object.fromEntries(Object.entries(obj).map(([key, value]) => [key, this.escapeNonAscii(value)]));
    }
    return obj;
  }

  /**
   * Splits JSON into a list of JSON formatted strings
   */
  splitText({
    jsonData,
    convertLists = false,
    ensureAscii = true,
  }: {
    jsonData: Record<string, any>;
    convertLists?: boolean;
    ensureAscii?: boolean;
  }): string[] {
    const chunks = this.splitJson({ jsonData, convertLists });
    if (ensureAscii) {
      const escapedChunks = chunks.map(chunk => this.escapeNonAscii(chunk));
      return escapedChunks.map(chunk => JSON.stringify(chunk));
    }
    return chunks.map(chunk =>
      JSON.stringify(chunk, (key, value) => {
        // Convert escaped Unicode sequences back to actual characters
        // e.g., '\u00e9' -> 'é'
        if (typeof value === 'string') {
          return value.replace(/\\u[\da-f]{4}/gi, match => String.fromCharCode(parseInt(match.slice(2), 16)));
        }
        return value;
      }),
    );
  }

  /**
   * Create documents from a list of json objects
   */
  createDocuments({
    texts,
    convertLists = false,
    ensureAscii = true,
    metadatas,
  }: {
    texts: string[];
    convertLists?: boolean;
    ensureAscii?: boolean;
    metadatas?: Record<string, any>[];
  }): Document[] {
    const _metadatas = metadatas || Array(texts.length).fill({});
    const documents: Document[] = [];
    texts.forEach((text, i) => {
      const chunks = this.splitText({ jsonData: JSON.parse(text), convertLists, ensureAscii });
      chunks.forEach(chunk => {
        const metadata = { ...(_metadatas[i] || {}) };
        documents.push(
          new Document({
            text: chunk,
            metadata,
          }),
        );
      });
    });
    return documents;
  }

  /**
   * Re-splits a list of documents, carrying each document's metadata through to its chunks
   */
  transformDocuments({
    ensureAscii,
    documents,
    convertLists,
  }: {
    ensureAscii?: boolean;
    convertLists?: boolean;
    documents: Document[];
  }): Document[] {
    const texts: string[] = [];
    const metadatas: Record<string, any>[] = [];
    for (const doc of documents) {
      texts.push(doc.text);
      metadatas.push(doc.metadata);
    }
    return this.createDocuments({
      texts,
      metadatas,
      ensureAscii,
      convertLists,
    });
  }
}
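
Finally, a sketch of the Document-oriented entry points. This assumes only what the file itself uses: that Document from '../schema' accepts { text, metadata } and exposes text and metadata fields; the top-level import path is again a hypothetical.

import { RecursiveJsonTransformer } from '@caleblawson/rag'; // hypothetical import path

const transformer = new RecursiveJsonTransformer({ maxSize: 500 });

// createDocuments parses each JSON string, splits it, and wraps every chunk
// in a Document, copying the matching metadata entry onto each chunk.
const docs = transformer.createDocuments({
  texts: [JSON.stringify({ id: 1, body: 'a long record that may be split across chunks...' })],
  metadatas: [{ source: 'records.json' }],
});

// transformDocuments does the same for existing Documents: each input document's
// text (which must be a JSON string, as produced by splitText) is re-split and
// its metadata carried through to the resulting chunks.
const resplit = transformer.transformDocuments({ documents: docs, convertLists: false });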