UNPKG

@ai2070/l0

Version:

L0: The Missing Reliability Substrate for AI

326 lines 12.3 kB
/** Rough characters-per-token ratio used for size/overlap arithmetic. */
const CHARS_PER_TOKEN = 4;

/** Separator that marks a paragraph boundary. */
const PARAGRAPH_BREAK = "\n\n";

/** Maximum distance (in characters) a chunk end may move to land on a paragraph break. */
const PARAGRAPH_SNAP_WINDOW = 100;

/**
 * Split a document into chunks using the strategy named in `options.strategy`.
 *
 * @param {string} document - Full text to split.
 * @param {object} options - Chunking options; forwarded untouched to the strategy.
 * @param {string} [options.strategy] - "token", "char", "paragraph" or "sentence".
 *   Any other value falls back to the token strategy.
 * @returns {object[]} Chunk records (see the individual strategies).
 */
export function chunkDocument(document, options) {
  switch (options.strategy) {
    case "char":
      return chunkByChars(document, options);
    case "paragraph":
      return chunkByParagraphs(document, options);
    case "sentence":
      return chunkBySentences(document, options);
    case "token":
    default:
      return chunkByTokens(document, options);
  }
}

/**
 * Chunk by an approximate token budget (one token per 4 characters).
 *
 * `options.estimateTokens` is only used to fill `tokenCount` on each chunk;
 * window sizing uses the fixed 4-characters-per-token approximation.
 *
 * @param {string} document - Full text to split.
 * @param {object} options
 * @param {number} options.size - Token budget per chunk. Non-positive or
 *   non-numeric values yield an empty result.
 * @param {number} [options.overlap] - Tokens shared between consecutive windows;
 *   a missing value behaves like 0.
 * @param {(text: string) => number} options.estimateTokens - Token estimator.
 * @param {boolean} [options.preserveParagraphs] - Move window ends onto a nearby
 *   blank line when one is within 100 characters.
 * @param {*} [options.metadata] - Copied by reference onto every chunk.
 * @returns {object[]} Chunks with `index`, `content` (trimmed), `startPos`,
 *   `endPos` (window bounds in `document`), `tokenCount`, `charCount`,
 *   `isFirst`, `isLast`, `totalChunks`, `metadata`.
 */
export function chunkByTokens(document, options) {
  const { size, overlap, estimateTokens, preserveParagraphs, metadata } = options;
  // A window that cannot grow would never advance the cursor.
  if (!(size > 0)) {
    return [];
  }
  const overlapChars = Math.floor(overlap * CHARS_PER_TOKEN);
  const chunks = [];
  let startPos = 0;
  while (startPos < document.length) {
    let endPos = startPos;
    let tokensSeen = 0;
    // One token is counted each time the cursor crosses an absolute multiple of 4.
    while (endPos < document.length && tokensSeen < size) {
      endPos++;
      if (endPos % CHARS_PER_TOKEN === 0) {
        tokensSeen++;
      }
    }
    if (preserveParagraphs) {
      endPos = snapToParagraphBreak(document, startPos, endPos);
    }
    pushWindow(chunks, document, startPos, endPos, estimateTokens, metadata);
    startPos = nextWindowStart(startPos, endPos, overlapChars);
  }
  return finalizeChunks(chunks);
}

/**
 * Chunk by a fixed character budget.
 *
 * @param {string} document - Full text to split.
 * @param {object} options
 * @param {number} options.size - Characters per window. Non-positive or
 *   non-numeric values yield an empty result.
 * @param {number} [options.overlap] - Characters shared between consecutive
 *   windows; a missing value behaves like 0.
 * @param {(text: string) => number} options.estimateTokens - Token estimator.
 * @param {boolean} [options.preserveParagraphs] - Move window ends onto a nearby
 *   blank line when one is within 100 characters.
 * @param {*} [options.metadata] - Copied by reference onto every chunk.
 * @returns {object[]} Chunks in document order (same shape as `chunkByTokens`).
 */
export function chunkByChars(document, options) {
  const { size, overlap, estimateTokens, preserveParagraphs, metadata } = options;
  if (!(size > 0)) {
    return [];
  }
  const chunks = [];
  let startPos = 0;
  while (startPos < document.length) {
    let endPos = Math.min(startPos + size, document.length);
    if (preserveParagraphs) {
      endPos = snapToParagraphBreak(document, startPos, endPos);
    }
    pushWindow(chunks, document, startPos, endPos, estimateTokens, metadata);
    startPos = nextWindowStart(startPos, endPos, overlap);
  }
  return finalizeChunks(chunks);
}

/**
 * Chunk on blank-line-separated paragraphs, packing as many whole paragraphs
 * as fit into `options.size` estimated tokens.
 *
 * Paragraphs are trimmed and re-joined with "\n\n". A paragraph whose own
 * estimate exceeds `size` is split with `chunkByChars` (no overlap).
 *
 * @param {string} document - Full text to split.
 * @param {object} options
 * @param {number} options.size - Token budget per chunk.
 * @param {number} [options.overlap] - Token budget of trailing paragraphs that
 *   are repeated at the start of the next chunk.
 * @param {(text: string) => number} options.estimateTokens - Token estimator.
 * @param {*} [options.metadata] - Copied by reference onto every chunk.
 * @returns {object[]} Chunks whose `startPos`/`endPos` are offsets in `document`.
 */
export function chunkByParagraphs(document, options) {
  const paragraphs = document
    .split(/\n\n+/)
    .map((piece) => piece.trim())
    .filter((piece) => piece.length > 0);
  return chunkSegments(document, paragraphs, PARAGRAPH_BREAK, options);
}

/**
 * Chunk on sentence boundaries (see `splitIntoSentences`), packing as many
 * whole sentences as fit into `options.size` estimated tokens.
 *
 * Sentences are re-joined with a single space. A sentence whose own estimate
 * exceeds `size` is split with `chunkByChars` (no overlap).
 *
 * @param {string} document - Full text to split.
 * @param {object} options - Same fields as `chunkByParagraphs`.
 * @returns {object[]} Chunks whose `startPos`/`endPos` are offsets in `document`.
 */
export function chunkBySentences(document, options) {
  return chunkSegments(document, splitIntoSentences(document), " ", options);
}

/**
 * Split text into trimmed sentences.
 *
 * A sentence ends at a run of `.`, `!` or `?` that is followed by whitespace
 * and an ASCII capital letter, or that sits at the very end of the text.
 * Any trailing text without terminal punctuation becomes the last sentence.
 *
 * @param {string} text
 * @returns {string[]} Non-empty sentences in order of appearance.
 */
export function splitIntoSentences(text) {
  const boundary = /[.!?]+[\s\n]+(?=[A-Z])|[.!?]+$/g;
  const sentences = [];
  let consumed = 0;
  for (const match of text.matchAll(boundary)) {
    const boundaryEnd = match.index + match[0].length;
    const sentence = text.slice(consumed, boundaryEnd).trim();
    if (sentence.length > 0) {
      sentences.push(sentence);
    }
    consumed = boundaryEnd;
  }
  if (consumed < text.length) {
    const tail = text.slice(consumed).trim();
    if (tail.length > 0) {
      sentences.push(tail);
    }
  }
  return sentences;
}

/** Build one chunk record; `isFirst`/`isLast`/`totalChunks` are completed by `finalizeChunks`. */
function createChunk(content, startPos, endPos, index, estimateTokens, metadata) {
  return {
    index,
    content,
    startPos,
    endPos,
    tokenCount: estimateTokens(content),
    charCount: content.length,
    isFirst: index === 0,
    isLast: false,
    totalChunks: 0,
    metadata,
  };
}

/** Append the trimmed window [startPos, endPos) as a chunk unless it is whitespace-only. */
function pushWindow(chunks, document, startPos, endPos, estimateTokens, metadata) {
  const content = document.slice(startPos, endPos).trim();
  if (content.length > 0) {
    chunks.push(createChunk(content, startPos, endPos, chunks.length, estimateTokens, metadata));
  }
}

/** Move a window end onto a paragraph break within the snap window, preferring the next break. */
function snapToParagraphBreak(document, startPos, endPos) {
  if (!(endPos < document.length)) {
    return endPos;
  }
  const nextBreak = document.indexOf(PARAGRAPH_BREAK, endPos);
  if (nextBreak !== -1 && nextBreak - endPos < PARAGRAPH_SNAP_WINDOW) {
    return nextBreak + PARAGRAPH_BREAK.length;
  }
  const prevBreak = document.lastIndexOf(PARAGRAPH_BREAK, endPos);
  if (prevBreak > startPos && endPos - prevBreak < PARAGRAPH_SNAP_WINDOW) {
    return prevBreak + PARAGRAPH_BREAK.length;
  }
  return endPos;
}

/**
 * Start of the next window: step back by the overlap, but only if that still
 * moves past the current window's start. Otherwise (overlap too large, or not
 * a number) continue right after the current window so the scan always advances.
 */
function nextWindowStart(startPos, endPos, overlapChars) {
  const overlapped = endPos - overlapChars;
  return overlapped > startPos ? overlapped : endPos;
}

/** Fill in `totalChunks`, `isFirst` and `isLast` once the chunk list is complete. */
function finalizeChunks(chunks) {
  const total = chunks.length;
  for (const chunk of chunks) {
    chunk.totalChunks = total;
    chunk.isFirst = chunk.index === 0;
    chunk.isLast = chunk.index === total - 1;
  }
  return chunks;
}

/**
 * Resolve the document offsets of ordered, non-overlapping pieces of text.
 * Each search starts where the previous piece ended, so repeated text maps to
 * the correct occurrence.
 */
function locateSegments(document, texts, searchFrom = 0) {
  const located = [];
  let cursor = searchFrom;
  for (const text of texts) {
    const found = document.indexOf(text, cursor);
    const start = found === -1 ? cursor : found;
    cursor = start + text.length;
    located.push({ text, start, end: cursor });
  }
  return located;
}

/** Shared packing loop for the paragraph and sentence strategies. */
function chunkSegments(document, segmentTexts, joiner, options) {
  const { size, overlap, estimateTokens, metadata } = options;
  const chunks = [];
  let pending = [];
  let pendingTokens = 0;

  const flushPending = () => {
    if (pending.length === 0) {
      return;
    }
    const content = pending.map((segment) => segment.text).join(joiner);
    const first = pending[0];
    const last = pending[pending.length - 1];
    chunks.push(createChunk(content, first.start, last.end, chunks.length, estimateTokens, metadata));
  };

  for (const segment of locateSegments(document, segmentTexts)) {
    const segmentTokens = estimateTokens(segment.text);

    if (segmentTokens > size) {
      // Too large to pack: emit what we have, then hard-split this segment.
      flushPending();
      pending = [];
      pendingTokens = 0;
      // NOTE: `size` is forwarded unchanged, so pieces are at most `size` characters.
      const pieces = chunkByChars(segment.text, { ...options, size, overlap: 0 });
      const positions = locateSegments(
        document,
        pieces.map((piece) => piece.content),
        segment.start,
      );
      pieces.forEach((piece, i) => {
        chunks.push({
          ...piece,
          index: chunks.length,
          startPos: positions[i].start,
          endPos: positions[i].end,
        });
      });
      continue;
    }

    if (pendingTokens + segmentTokens > size && pending.length > 0) {
      flushPending();
      // Carry trailing segments into the next chunk while they fit the overlap budget.
      const carried = [];
      let carriedTokens = 0;
      for (let j = pending.length - 1; j >= 0; j--) {
        const tokens = estimateTokens(pending[j].text);
        if (carriedTokens + tokens <= overlap) {
          carried.unshift(pending[j]);
          carriedTokens += tokens;
        } else {
          break;
        }
      }
      pending = carried;
      pendingTokens = carriedTokens;
    }

    pending.push(segment);
    pendingTokens += segmentTokens;
  }

  flushPending();
  return finalizeChunks(chunks);
}

/**
 * Heuristic token estimate: the mean of a character-based guess (4 chars per
 * token) and a word-based guess (1.3 tokens per word), rounded up.
 *
 * @param {string} text
 * @returns {number} Estimated token count; 0 for an empty string.
 */
export function estimateTokenCount(text) {
  const trimmed = text.trim();
  // Splitting "" would report one (empty) word, so count words on trimmed text only.
  const wordCount = trimmed.length === 0 ? 0 : trimmed.split(/\s+/).length;
  const charEstimate = Math.ceil(text.length / CHARS_PER_TOKEN);
  const wordEstimate = Math.ceil(wordCount * 1.3);
  return Math.ceil((charEstimate + wordEstimate) / 2);
}

/**
 * Text shared by two chunks according to their `startPos`/`endPos` ranges.
 *
 * @param {object} chunk1
 * @param {object} chunk2
 * @returns {string|null} The shorter of chunk1's tail and chunk2's head over the
 *   overlapping range, or `null` when the ranges do not intersect.
 */
export function getChunkOverlap(chunk1, chunk2) {
  const disjoint = chunk1.endPos <= chunk2.startPos || chunk2.endPos <= chunk1.startPos;
  if (disjoint) {
    return null;
  }
  const overlapStart = Math.max(chunk1.startPos, chunk2.startPos);
  const overlapEnd = Math.min(chunk1.endPos, chunk2.endPos);
  const tailOfFirst = chunk1.content.slice(-(chunk1.endPos - overlapStart));
  const headOfSecond = chunk2.content.slice(0, overlapEnd - chunk2.startPos);
  return tailOfFirst.length <= headOfSecond.length ? tailOfFirst : headOfSecond;
}

/**
 * Concatenate chunk contents back into one string.
 *
 * @param {object[]} chunks - Chunks in document order.
 * @param {boolean} [preserveOverlap=false] - When true, contents are joined with
 *   "\n\n" and nothing is removed. When false, text each chunk shares with its
 *   predecessor (per `getChunkOverlap`) is dropped and contents are joined
 *   without a separator.
 * @returns {string}
 */
export function mergeChunks(chunks, preserveOverlap = false) {
  if (chunks.length === 0) {
    return "";
  }
  if (chunks.length === 1) {
    return chunks[0].content;
  }
  if (preserveOverlap) {
    return chunks.map((chunk) => chunk.content).join("\n\n");
  }
  const parts = [chunks[0].content];
  for (let i = 1; i < chunks.length; i++) {
    const { content } = chunks[i];
    const shared = getChunkOverlap(chunks[i - 1], chunks[i]);
    const sharedAt = shared ? content.indexOf(shared) : -1;
    parts.push(sharedAt === -1 ? content : content.slice(sharedAt + shared.length));
  }
  return parts.join("");
}
//# sourceMappingURL=chunking.js.map