@caleblawson/rag
The Retrieval-Augmented Generation (RAG) module contains document processing and embedding utilities. This file implements RecursiveJsonTransformer, which recursively splits large JSON structures into size-bounded chunks suitable for embedding.
import { Document } from '../schema';

export class RecursiveJsonTransformer {
  private maxSize: number;
  private minSize: number;

  // maxSize is optional in the options type because it has a default value
  constructor({ maxSize = 2000, minSize }: { maxSize?: number; minSize?: number }) {
    this.maxSize = maxSize;
    this.minSize = minSize ?? Math.max(maxSize - 200, 50);
  }
  private static jsonSize(data: Record<string, any>): number {
    const seen = new WeakSet();
    function getStringifiableData(obj: any): any {
      if (obj === null || typeof obj !== 'object') {
        return obj;
      }
      if (seen.has(obj)) {
        return '[Circular]';
      }
      seen.add(obj);
      if (Array.isArray(obj)) {
        const safeArray = [];
        for (const item of obj) {
          safeArray.push(getStringifiableData(item));
        }
        return safeArray;
      }
      const safeObj: Record<string, any> = {};
      for (const key in obj) {
        if (Object.prototype.hasOwnProperty.call(obj, key)) {
          safeObj[key] = getStringifiableData(obj[key]);
        }
      }
      return safeObj;
    }
    const stringifiable = getStringifiableData(data);
    const jsonString = JSON.stringify(stringifiable);
    return jsonString.length;
  }
  /**
   * Transform JSON data while handling circular references
   */
  public transform(data: Record<string, any>): Record<string, any> {
    const size = RecursiveJsonTransformer.jsonSize(data);
    const seen = new WeakSet();
    function createSafeCopy(obj: any): any {
      if (obj === null || typeof obj !== 'object') {
        return obj;
      }
      if (seen.has(obj)) {
        return '[Circular]';
      }
      seen.add(obj);
      if (Array.isArray(obj)) {
        return obj.map(item => createSafeCopy(item));
      }
      const copy: Record<string, any> = {};
      for (const key in obj) {
        if (Object.prototype.hasOwnProperty.call(obj, key)) {
          copy[key] = createSafeCopy(obj[key]);
        }
      }
      return copy;
    }
    return {
      size,
      data: createSafeCopy(data),
    };
  }
  /**
   * Set a value in a nested dictionary based on the given path
   */
  private static setNestedDict(d: Record<string, any>, path: string[], value: any): void {
    let current = d;
    for (const key of path.slice(0, -1)) {
      current[key] = current[key] || {};
      current = current[key];
    }
    current[path[path.length - 1]!] = value;
  }
  /**
   * Convert lists in the JSON structure to dictionaries with index-based keys
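   * e.g., ['a', ['b', 'c']] becomes { '0': 'a', '1': { '0': 'b', '1': 'c' } }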
   */
  private listToDictPreprocessing(data: any): any {
    if (data && typeof data === 'object') {
      if (Array.isArray(data)) {
        return Object.fromEntries(data.map((item, index) => [String(index), this.listToDictPreprocessing(item)]));
      }
      return Object.fromEntries(Object.entries(data).map(([k, v]) => [k, this.listToDictPreprocessing(v)]));
    }
    return data;
  }
  /**
   * Handles primitive values (strings, numbers, etc) by either adding them to the current chunk
   * or creating new chunks if they don't fit
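   * Strings that alone exceed maxSize are split at word boundaries via splitLongString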
   */
  private handlePrimitiveValue(
    value: any,
    key: string,
    currentChunk: Record<string, any>,
    chunks: Record<string, any>[],
    fullPath: string[],
  ): { currentChunk: Record<string, any>; chunks: Record<string, any>[] } {
    const testValue = { [key]: value };
    if (RecursiveJsonTransformer.jsonSize(testValue) <= this.maxSize) {
      if (RecursiveJsonTransformer.jsonSize({ ...currentChunk, ...testValue }) <= this.maxSize) {
        return {
          currentChunk: { ...currentChunk, ...testValue },
          chunks,
        };
      } else {
        return {
          currentChunk: testValue,
          chunks: [...chunks, currentChunk],
        };
      }
    } else if (typeof value === 'string') {
      const stringChunks = this.splitLongString(value);
      const newChunks = stringChunks
        .map(chunk => {
          return this.createChunk(chunk, fullPath);
        })
        // Note: pieces that still exceed maxSize after wrapping are silently discarded
        .filter(chunk => RecursiveJsonTransformer.jsonSize(chunk) <= this.maxSize);
      return {
        currentChunk,
        chunks: [...chunks, ...newChunks],
      };
    }
    // Non-string primitives cannot be split; oversize ones are dropped
    const newChunk = this.createChunk(value, fullPath);
    return {
      currentChunk,
      chunks: RecursiveJsonTransformer.jsonSize(newChunk) <= this.maxSize ? [...chunks, newChunk] : chunks,
    };
  }
  /**
   * Creates a nested dictionary chunk from a value and path
   * e.g., path ['a', 'b'], value 'c' becomes { a: { b: 'c' } }
   */
  private createChunk(value: any, path: string[]): Record<string, any> {
    const chunk: Record<string, any> = {};
    RecursiveJsonTransformer.setNestedDict(chunk, path, value);
    // Unwrap the synthetic 'root' key added when a path starts at the top level
    return chunk.root ? chunk.root : chunk;
  }
  /**
   * Checks if value is within size limits
   */
  private isWithinSizeLimit(value: any, currentSize: number = 0): boolean {
    const size = RecursiveJsonTransformer.jsonSize(value);
    // If this is a new chunk (currentSize = 0), allow items smaller than maxSize
    // If adding to existing chunk, ensure we're above minSize before splitting
    return currentSize === 0 ? size <= this.maxSize : size + currentSize <= this.maxSize || currentSize < this.minSize;
  }
  /**
   * Splits arrays into chunks based on size limits
   * Handles nested objects by recursing into handleNestedObject
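   * Items are grouped greedily until adding one more would push the group past maxSize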
   */
  private handleArray(
    value: any[],
    key: string,
    currentPath: string[],
    depth: number,
    maxDepth: number,
  ): Record<string, any>[] {
    const path = currentPath.length ? [...currentPath, key] : ['root', key];
    // Try keeping array intact
    const chunk = this.createChunk(value, path);
    if (this.isWithinSizeLimit(chunk)) {
      return [chunk];
    }
    const chunks: Record<string, any>[] = [];
    let currentGroup: any[] = [];
    const saveCurrentGroup = () => {
      if (currentGroup.length === 0) return;
      // Push the group even when it is below minSize: callers overwrite
      // currentGroup after this call, so skipping the push would drop items
      chunks.push(this.createChunk(currentGroup, path));
      currentGroup = [];
    };
    for (const item of value) {
      // Try adding item to current group
      const testGroup = [...currentGroup, item];
      const testChunk = this.createChunk(testGroup, path);
      if (this.isWithinSizeLimit(testChunk)) {
        currentGroup = testGroup;
        continue;
      }
      // Current group is full
      saveCurrentGroup();
      // Handle the new item
      if (typeof item === 'object' && item !== null) {
        const singleItemArray = [item];
        const singleItemChunk = this.createChunk(singleItemArray, path);
        if (this.isWithinSizeLimit(singleItemChunk)) {
          currentGroup = singleItemArray;
        } else {
          const itemPath = [...path, String(chunks.length)];
          const nestedChunks = this.handleNestedObject(item, itemPath, depth + 1, maxDepth);
          chunks.push(...nestedChunks);
        }
      } else {
        currentGroup = [item];
      }
    }
    saveCurrentGroup();
    return chunks;
  }
  /**
   * Splits objects into chunks based on size limits
   * Handles nested arrays and objects by recursing into handleArray and handleNestedObject
   */
  private handleNestedObject(
    value: Record<string, any>,
    fullPath: string[],
    depth: number,
    maxDepth: number,
  ): Record<string, any>[] {
    const path = fullPath.length ? fullPath : ['root'];
    // Handle max depth
    if (depth > maxDepth) {
      console.warn(`Maximum depth of ${maxDepth} exceeded, flattening remaining structure`);
      return [this.createChunk(value, path)];
    }
    // Try keeping object intact
    const wholeChunk = this.createChunk(value, path);
    if (this.isWithinSizeLimit(wholeChunk)) {
      return [wholeChunk];
    }
    const chunks: Record<string, any>[] = [];
    let currentChunk: Record<string, any> = {};
    const saveCurrentChunk = () => {
      if (Object.keys(currentChunk).length === 0) return;
      // Push even when the chunk is below minSize: callers reset currentChunk
      // after this call, so skipping the push would drop keys
      chunks.push(this.createChunk(currentChunk, path));
      currentChunk = {};
    };
    for (const [key, val] of Object.entries(value)) {
      if (val === undefined) continue;
      // Handle arrays separately
      if (Array.isArray(val)) {
        saveCurrentChunk();
        const arrayChunks = this.handleArray(val, key, path, depth, maxDepth);
        chunks.push(...arrayChunks);
        continue;
      }
      // Try adding to current chunk
      const testChunk = this.createChunk({ ...currentChunk, [key]: val }, path);
      if (this.isWithinSizeLimit(testChunk)) {
        currentChunk[key] = val;
        continue;
      }
      // Current chunk is full
      saveCurrentChunk();
      // Handle value that didn't fit
      if (typeof val === 'object' && val !== null) {
        const nestedChunks = this.handleNestedObject(val, [...path, key], depth + 1, maxDepth);
        chunks.push(...nestedChunks);
      } else {
        currentChunk = { [key]: val };
      }
    }
    saveCurrentChunk();
    return chunks;
  }
  /**
   * Splits long strings into smaller chunks at word boundaries
   * Ensures each chunk is within maxSize limit
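   * (the fixed 20-character overhead is presumably headroom for the JSON key/path wrapper added by createChunk)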
   */
  private splitLongString(value: string): string[] {
    const chunks: string[] = [];
    let remaining = value;
    while (remaining.length > 0) {
      const overhead = 20;
      const chunkSize = Math.floor(this.maxSize - overhead);
      if (remaining.length <= chunkSize) {
        chunks.push(remaining);
        break;
      }
      const lastSpace = remaining.slice(0, chunkSize).lastIndexOf(' ');
      const splitAt = lastSpace > 0 ? lastSpace + 1 : chunkSize;
      chunks.push(remaining.slice(0, splitAt));
      remaining = remaining.slice(splitAt);
    }
    return chunks;
  }
  /**
   * Core chunking logic that processes JSON data recursively
   * Handles arrays, objects, and primitive values while maintaining structure
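   * Arrays and nested objects are delegated to handleArray / handleNestedObject; primitives accumulate in a running chunk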
   */
  private jsonSplit({
    data,
    currentPath = [],
    chunks = [{}],
    depth = 0,
    maxDepth = 100,
  }: {
    data: Record<string, any>;
    currentPath?: string[];
    chunks?: Record<string, any>[];
    depth?: number;
    maxDepth?: number;
  }): Record<string, any>[] {
    if (!data || typeof data !== 'object') {
      return chunks;
    }
    if (depth > maxDepth) {
      console.warn(`Maximum depth of ${maxDepth} exceeded, flattening remaining structure`);
      RecursiveJsonTransformer.setNestedDict(chunks[chunks.length - 1] || {}, currentPath, data);
      return chunks;
    }
    let currentChunk = {};
    let accumulatedChunks = chunks;
    for (const [key, value] of Object.entries(data)) {
      const fullPath = [...currentPath, key];
      if (Array.isArray(value)) {
        const arrayChunks = this.handleArray(value, key, currentPath, depth, maxDepth);
        accumulatedChunks = [...accumulatedChunks, ...arrayChunks];
      } else if (typeof value === 'object' && value !== null) {
        const objectChunks = this.handleNestedObject(value, fullPath, depth, maxDepth);
        accumulatedChunks = [...accumulatedChunks, ...objectChunks];
      } else {
        const { currentChunk: newCurrentChunk, chunks: newChunks } = this.handlePrimitiveValue(
          value,
          key,
          currentChunk,
          accumulatedChunks,
          fullPath,
        );
        currentChunk = newCurrentChunk;
        accumulatedChunks = newChunks;
      }
    }
    if (Object.keys(currentChunk).length > 0) {
      accumulatedChunks = [...accumulatedChunks, currentChunk];
    }
    return accumulatedChunks.filter(chunk => Object.keys(chunk).length > 0);
  }
  /**
   * Splits JSON into a list of JSON chunks
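   * e.g., an oversize { a: 1, b: { c: 2 } } may come back as [{ a: 1 }, { b: { c: 2 } }]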
   */
  splitJson({
    jsonData,
    convertLists = false,
  }: {
    jsonData: Record<string, any>;
    convertLists?: boolean;
  }): Record<string, any>[] {
    const processedData = convertLists ? this.listToDictPreprocessing(jsonData) : jsonData;
    const chunks = this.jsonSplit({ data: processedData });
    if (Object.keys(chunks[chunks.length - 1] || {}).length === 0) {
      chunks.pop();
    }
    return chunks;
  }
  /**
   * Converts Unicode characters to their escaped ASCII representation
   * e.g., 'café' becomes 'caf\u00e9'
   */
  private escapeNonAscii(obj: any): any {
    if (typeof obj === 'string') {
      return obj.replace(/[\u0080-\uffff]/g, char => {
        return `\\u${char.charCodeAt(0).toString(16).padStart(4, '0')}`;
      });
    }
    if (Array.isArray(obj)) {
      return obj.map(item => this.escapeNonAscii(item));
    }
    if (typeof obj === 'object' && obj !== null) {
      return Object.fromEntries(Object.entries(obj).map(([key, value]) => [key, this.escapeNonAscii(value)]));
    }
    return obj;
  }
  /**
   * Splits JSON into a list of JSON formatted strings
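   * With ensureAscii (the default), non-ASCII characters are emitted as \uXXXX escape sequences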
   */
  splitText({
    jsonData,
    convertLists = false,
    ensureAscii = true,
  }: {
    jsonData: Record<string, any>;
    convertLists?: boolean;
    ensureAscii?: boolean;
  }): string[] {
    const chunks = this.splitJson({ jsonData, convertLists });
    if (ensureAscii) {
      const escapedChunks = chunks.map(chunk => this.escapeNonAscii(chunk));
      return escapedChunks.map(chunk => JSON.stringify(chunk));
    }
    return chunks.map(chunk =>
      JSON.stringify(chunk, (key, value) => {
        // Convert escaped Unicode sequences back to actual characters
        // e.g., '\u00e9' -> 'é'
        if (typeof value === 'string') {
          return value.replace(/\\u[\da-f]{4}/gi, match => String.fromCharCode(parseInt(match.slice(2), 16)));
        }
        return value;
      }),
    );
  }
  /**
   * Create documents from a list of JSON strings
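   * Each string is parsed with JSON.parse, split into chunks, and every chunk becomes its own Document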
   */
  createDocuments({
    texts,
    convertLists = false,
    ensureAscii = true,
    metadatas,
  }: {
    texts: string[];
    convertLists?: boolean;
    ensureAscii?: boolean;
    metadatas?: Record<string, any>[];
  }): Document[] {
    const _metadatas = metadatas || Array(texts.length).fill({});
    const documents: Document[] = [];
    texts.forEach((text, i) => {
      const chunks = this.splitText({ jsonData: JSON.parse(text), convertLists, ensureAscii });
      chunks.forEach(chunk => {
        // Spread so each document gets its own metadata object
        const metadata = { ...(_metadatas[i] || {}) };
        documents.push(
          new Document({
            text: chunk,
            metadata,
          }),
        );
      });
    });
    return documents;
  }
  /**
   * Re-chunks existing documents by re-splitting each document's JSON text
   */
  transformDocuments({
    ensureAscii,
    documents,
    convertLists,
  }: {
    ensureAscii?: boolean;
    convertLists?: boolean;
    documents: Document[];
  }): Document[] {
    const texts: string[] = [];
    const metadatas: Record<string, any>[] = [];
    for (const doc of documents) {
      texts.push(doc.text);
      metadatas.push(doc.metadata);
    }
    return this.createDocuments({
      texts,
      metadatas,
      ensureAscii,
      convertLists,
    });
  }
}
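
For reference, a minimal usage sketch. This is not part of the package source: the import path assumes the class is re-exported from the package root, and the maxSize value and sample data are illustrative.

import { RecursiveJsonTransformer } from '@caleblawson/rag'; // assumed root export

// Chunk a nested JSON payload into embedding-sized JSON strings
const transformer = new RecursiveJsonTransformer({ maxSize: 512 });

const chunks = transformer.splitText({
  jsonData: {
    title: 'RAG pipeline notes',
    sections: [{ heading: 'Chunking', body: 'Split large JSON into pieces...' }],
  },
  convertLists: true, // arrays become { '0': ..., '1': ... } so index keys survive chunking
});

// Each entry is a JSON string of at most maxSize characters, ready to embed
console.log(chunks);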