@ai2070/l0
Version:
L0: The Missing Reliability Substrate for AI
326 lines • 12.3 kB
JavaScript
/**
 * Chunk a document using the strategy named in `options.strategy`.
 *
 * Recognised strategies are "token", "char", "paragraph" and "sentence".
 * Any other value, including a missing one, falls back to token chunking.
 *
 * @param {string} document - Text to split.
 * @param {object} options - Chunking options; forwarded unchanged to the
 *   selected chunker.
 * @returns {Array<object>} Whatever the selected chunker returns.
 */
export function chunkDocument(document, options) {
  const { strategy } = options;
  // A Map (rather than a plain object) so that only the four listed keys
  // can ever match, regardless of what string the caller passes.
  const chunkers = new Map([
    ["token", chunkByTokens],
    ["char", chunkByChars],
    ["paragraph", chunkByParagraphs],
    ["sentence", chunkBySentences],
  ]);
  const chunker = chunkers.get(strategy) || chunkByTokens;
  return chunker(document, options);
}
/**
 * Split a document into windows of roughly `size` tokens, using a fixed
 * four-characters-per-token heuristic to place the window boundaries.
 *
 * `estimateTokens` is only used to fill in each chunk's `tokenCount`; it does
 * not influence where the boundaries fall.
 *
 * @param {string} document - Text to split.
 * @param {object} options
 * @param {number} options.size - Window size in tokens; must be > 0.
 * @param {number} [options.overlap] - Tokens shared between consecutive
 *   windows. Missing, NaN or negative values are treated as 0.
 * @param {(text: string) => number} options.estimateTokens - Token estimator
 *   applied to each chunk's trimmed content.
 * @param {boolean} [options.preserveParagraphs] - When truthy, a window end
 *   that lies within 100 characters of a blank line ("\n\n") is moved to just
 *   after that blank line.
 * @param {*} [options.metadata] - Copied onto every chunk as `metadata`.
 * @returns {Array<object>} Chunks with `index`, `content` (trimmed),
 *   `startPos`/`endPos` (window bounds before trimming), `tokenCount`,
 *   `charCount`, `isFirst`, `isLast`, `totalChunks` and `metadata`.
 * @throws {RangeError} If the document is non-empty and `size` is not a
 *   positive number (the loop could otherwise never advance).
 */
export function chunkByTokens(document, options) {
  const { size, overlap, estimateTokens, preserveParagraphs } = options;
  const chunks = [];
  if (document.length === 0) {
    return chunks;
  }
  if (!(size > 0)) {
    throw new RangeError("chunkByTokens: options.size must be a positive number");
  }
  // `|| 0` turns the NaN produced by a missing overlap into "no overlap".
  const overlapChars = Math.max(0, Math.floor(overlap * 4)) || 0;
  let startPos = 0;
  while (startPos < document.length) {
    // Grow the window one character at a time; one token is counted each
    // time the absolute position reaches a multiple of four.
    let endPos = startPos;
    let currentTokens = 0;
    while (endPos < document.length && currentTokens < size) {
      endPos++;
      if (endPos % 4 === 0) {
        currentTokens++;
      }
    }
    if (preserveParagraphs && endPos < document.length) {
      const nextNewline = document.indexOf("\n\n", endPos);
      const prevNewline = document.lastIndexOf("\n\n", endPos);
      if (nextNewline !== -1 && nextNewline - endPos < 100) {
        // Prefer extending forward to the next paragraph break.
        endPos = nextNewline + 2;
      }
      else if (prevNewline > startPos && endPos - prevNewline < 100) {
        // Otherwise fall back to the previous break inside this window.
        endPos = prevNewline + 2;
      }
    }
    const content = document.slice(startPos, endPos).trim();
    if (content.length > 0) {
      chunks.push({
        index: chunks.length,
        content,
        startPos,
        endPos,
        tokenCount: estimateTokens(content),
        charCount: content.length,
        isFirst: chunks.length === 0,
        isLast: endPos >= document.length,
        totalChunks: 0,
        metadata: options.metadata,
      });
    }
    // The window already reached the end of the document: stepping back by
    // the overlap would only re-emit text the last chunk already contains.
    if (endPos >= document.length) {
      break;
    }
    // Step back by the overlap, but always move strictly forward relative to
    // this window's start, even when no chunk was emitted for it.
    const nextStart = endPos - overlapChars;
    startPos = nextStart > startPos ? nextStart : endPos;
  }
  // Whitespace-only windows are skipped, so the final flags can only be
  // settled once the full list is known.
  for (const chunk of chunks) {
    chunk.totalChunks = chunks.length;
    chunk.isLast = chunk.index === chunks.length - 1;
  }
  return chunks;
}
/**
 * Split a document into windows of `size` characters.
 *
 * @param {string} document - Text to split.
 * @param {object} options
 * @param {number} options.size - Window size in characters; must be > 0.
 * @param {number} [options.overlap] - Characters shared between consecutive
 *   windows. Missing, NaN or negative values are treated as 0.
 * @param {(text: string) => number} options.estimateTokens - Token estimator
 *   applied to each chunk's trimmed content.
 * @param {boolean} [options.preserveParagraphs] - When truthy, a window end
 *   that lies within 100 characters of a blank line ("\n\n") is moved to just
 *   after that blank line.
 * @param {*} [options.metadata] - Copied onto every chunk as `metadata`.
 * @returns {Array<object>} Chunks with `index`, `content` (trimmed),
 *   `startPos`/`endPos` (window bounds before trimming), `tokenCount`,
 *   `charCount`, `isFirst`, `isLast`, `totalChunks` and `metadata`.
 * @throws {RangeError} If the document is non-empty and `size` is not a
 *   positive number (the loop could otherwise never advance).
 */
export function chunkByChars(document, options) {
  const { size, overlap, estimateTokens, preserveParagraphs } = options;
  const chunks = [];
  if (document.length === 0) {
    return chunks;
  }
  if (!(size > 0)) {
    throw new RangeError("chunkByChars: options.size must be a positive number");
  }
  // `|| 0` turns the NaN produced by a missing overlap into "no overlap".
  const overlapChars = Math.max(0, overlap) || 0;
  let startPos = 0;
  while (startPos < document.length) {
    let endPos = Math.min(startPos + size, document.length);
    if (preserveParagraphs && endPos < document.length) {
      const nextNewline = document.indexOf("\n\n", endPos);
      const prevNewline = document.lastIndexOf("\n\n", endPos);
      if (nextNewline !== -1 && nextNewline - endPos < 100) {
        // Prefer extending forward to the next paragraph break.
        endPos = nextNewline + 2;
      }
      else if (prevNewline > startPos && endPos - prevNewline < 100) {
        // Otherwise fall back to the previous break inside this window.
        endPos = prevNewline + 2;
      }
    }
    const content = document.slice(startPos, endPos).trim();
    if (content.length > 0) {
      chunks.push({
        index: chunks.length,
        content,
        startPos,
        endPos,
        tokenCount: estimateTokens(content),
        charCount: content.length,
        isFirst: chunks.length === 0,
        isLast: endPos >= document.length,
        totalChunks: 0,
        metadata: options.metadata,
      });
    }
    // The window already reached the end of the document: stepping back by
    // the overlap would only re-emit text the last chunk already contains.
    if (endPos >= document.length) {
      break;
    }
    // Step back by the overlap, but always move strictly forward relative to
    // this window's start, even when no chunk was emitted for it.
    const nextStart = endPos - overlapChars;
    startPos = nextStart > startPos ? nextStart : endPos;
  }
  // Whitespace-only windows are skipped, so the final flags can only be
  // settled once the full list is known.
  for (const chunk of chunks) {
    chunk.totalChunks = chunks.length;
    chunk.isLast = chunk.index === chunks.length - 1;
  }
  return chunks;
}
/**
 * Group paragraphs (text separated by one or more blank lines) into chunks
 * whose estimated token count stays within `size`.
 *
 * Paragraphs are trimmed and joined with "\n\n" inside a chunk. When a chunk
 * is closed, trailing paragraphs whose combined estimate fits in `overlap`
 * are repeated at the start of the next chunk. A single paragraph whose
 * estimate exceeds `size` is split on its own with `chunkByChars`.
 *
 * @param {string} document - Text to split.
 * @param {object} options
 * @param {number} options.size - Maximum estimated tokens per chunk.
 * @param {number} options.overlap - Token budget for repeated paragraphs.
 * @param {(text: string) => number} options.estimateTokens - Token estimator.
 * @param {*} [options.metadata] - Copied onto every chunk as `metadata`.
 * @returns {Array<object>} Chunks with `index`, `content`, `startPos`,
 *   `endPos`, `tokenCount`, `charCount`, `isFirst`, `isLast`, `totalChunks`
 *   and `metadata`.
 */
export function chunkByParagraphs(document, options) {
  const { size, overlap, estimateTokens } = options;
  const paragraphs = document.split(/\n\n+/).filter((p) => p.trim().length > 0);
  const chunks = [];
  let currentChunk = [];
  let currentSize = 0;
  let currentStartPos = 0;
  // Emit the paragraphs gathered so far as a single chunk.
  const flushCurrent = () => {
    const content = currentChunk.join("\n\n");
    chunks.push(createChunk(content, currentStartPos, document, chunks.length, estimateTokens, options.metadata));
  };
  for (const rawParagraph of paragraphs) {
    const para = rawParagraph.trim();
    const paraSize = estimateTokens(para);
    if (paraSize > size) {
      // Oversized paragraph: close the pending chunk, then split the
      // paragraph by itself.
      if (currentChunk.length > 0) {
        flushCurrent();
        currentChunk = [];
        currentSize = 0;
      }
      const located = document.indexOf(para, currentStartPos);
      const paraStart = located !== -1 ? located : currentStartPos;
      // NOTE(review): `size` is compared against token estimates above but is
      // handed to chunkByChars as a character count here — confirm intended.
      const paraChunks = chunkByChars(para, {
        ...options,
        size,
        overlap: 0,
      });
      // chunkByChars reports positions relative to the paragraph. Re-anchor
      // both ends in the full document, advancing the search cursor so that
      // pieces with identical text do not all map to the first occurrence.
      let searchFrom = paraStart;
      for (const pc of paraChunks) {
        const found = document.indexOf(pc.content, searchFrom);
        const startPos = found !== -1 ? found : searchFrom;
        const endPos = startPos + pc.content.length;
        chunks.push({
          ...pc,
          index: chunks.length,
          startPos,
          endPos,
        });
        searchFrom = endPos;
      }
      currentStartPos = paraStart + para.length;
      continue;
    }
    if (currentSize + paraSize > size && currentChunk.length > 0) {
      flushCurrent();
      // Walk backwards over the chunk just emitted, keeping as many trailing
      // paragraphs as fit in the overlap budget.
      const overlapParas = [];
      let overlapSize = 0;
      for (let j = currentChunk.length - 1; j >= 0; j--) {
        const p = currentChunk[j];
        const pSize = estimateTokens(p);
        if (overlapSize + pSize > overlap) {
          break;
        }
        overlapParas.unshift(p);
        overlapSize += pSize;
      }
      currentChunk = overlapParas;
      currentSize = overlapSize;
      // The next chunk starts at its first carried-over paragraph, or at the
      // incoming paragraph when nothing was carried over.
      currentStartPos = document.indexOf(currentChunk[0] || para, currentStartPos);
    }
    currentChunk.push(para);
    currentSize += paraSize;
  }
  if (currentChunk.length > 0) {
    flushCurrent();
  }
  // Flags copied from chunkByChars pieces are paragraph-local, so recompute
  // them for the whole list.
  for (const chunk of chunks) {
    chunk.totalChunks = chunks.length;
    chunk.isFirst = chunk.index === 0;
    chunk.isLast = chunk.index === chunks.length - 1;
  }
  return chunks;
}
/**
 * Group sentences (as produced by `splitIntoSentences`) into chunks whose
 * estimated token count stays within `size`.
 *
 * Sentences are joined with a single space inside a chunk. When a chunk is
 * closed, trailing sentences whose combined estimate fits in `overlap` are
 * repeated at the start of the next chunk. A single sentence whose estimate
 * exceeds `size` is split on its own with `chunkByChars`.
 *
 * @param {string} document - Text to split.
 * @param {object} options
 * @param {number} options.size - Maximum estimated tokens per chunk.
 * @param {number} options.overlap - Token budget for repeated sentences.
 * @param {(text: string) => number} options.estimateTokens - Token estimator.
 * @param {*} [options.metadata] - Copied onto every chunk as `metadata`.
 * @returns {Array<object>} Chunks with `index`, `content`, `startPos`,
 *   `endPos`, `tokenCount`, `charCount`, `isFirst`, `isLast`, `totalChunks`
 *   and `metadata`.
 */
export function chunkBySentences(document, options) {
  const { size, overlap, estimateTokens } = options;
  const sentences = splitIntoSentences(document);
  const chunks = [];
  let currentChunk = [];
  let currentSize = 0;
  let currentStartPos = 0;
  // Emit the sentences gathered so far as a single chunk.
  const flushCurrent = () => {
    const content = currentChunk.join(" ");
    chunks.push(createChunk(content, currentStartPos, document, chunks.length, estimateTokens, options.metadata));
  };
  for (const sentence of sentences) {
    const sentSize = estimateTokens(sentence);
    if (sentSize > size) {
      // Oversized sentence: close the pending chunk, then split the sentence
      // by itself.
      if (currentChunk.length > 0) {
        flushCurrent();
        currentChunk = [];
        currentSize = 0;
      }
      const located = document.indexOf(sentence, currentStartPos);
      const sentenceStart = located !== -1 ? located : currentStartPos;
      // NOTE(review): `size` is compared against token estimates above but is
      // handed to chunkByChars as a character count here — confirm intended.
      const sentChunks = chunkByChars(sentence, {
        ...options,
        size,
        overlap: 0,
      });
      // chunkByChars reports positions relative to the sentence. Re-anchor
      // both ends in the full document, advancing the search cursor so that
      // pieces with identical text do not all map to the first occurrence.
      let searchFrom = sentenceStart;
      for (const sc of sentChunks) {
        const found = document.indexOf(sc.content, searchFrom);
        const startPos = found !== -1 ? found : searchFrom;
        const endPos = startPos + sc.content.length;
        chunks.push({
          ...sc,
          index: chunks.length,
          startPos,
          endPos,
        });
        searchFrom = endPos;
      }
      currentStartPos = sentenceStart + sentence.length;
      continue;
    }
    if (currentSize + sentSize > size && currentChunk.length > 0) {
      flushCurrent();
      // Walk backwards over the chunk just emitted, keeping as many trailing
      // sentences as fit in the overlap budget.
      const overlapSents = [];
      let overlapSize = 0;
      for (let j = currentChunk.length - 1; j >= 0; j--) {
        const s = currentChunk[j];
        const sSize = estimateTokens(s);
        if (overlapSize + sSize > overlap) {
          break;
        }
        overlapSents.unshift(s);
        overlapSize += sSize;
      }
      currentChunk = overlapSents;
      currentSize = overlapSize;
      // The next chunk starts at its first carried-over sentence, or at the
      // incoming sentence when nothing was carried over.
      currentStartPos = document.indexOf(currentChunk[0] || sentence, currentStartPos);
    }
    currentChunk.push(sentence);
    currentSize += sentSize;
  }
  if (currentChunk.length > 0) {
    flushCurrent();
  }
  // Flags copied from chunkByChars pieces are sentence-local, so recompute
  // them for the whole list.
  for (const chunk of chunks) {
    chunk.totalChunks = chunks.length;
    chunk.isFirst = chunk.index === 0;
    chunk.isLast = chunk.index === chunks.length - 1;
  }
  return chunks;
}
/**
 * Break text into trimmed, non-empty sentences.
 *
 * A sentence ends at a run of `.`, `!` or `?` that is either followed by
 * whitespace and an uppercase ASCII letter, or sits at the very end of the
 * text. Anything left after the last such boundary becomes a final sentence.
 *
 * @param {string} text - Text to split.
 * @returns {string[]} Sentences in document order.
 */
export function splitIntoSentences(text) {
  const boundary = /[.!?]+[\s\n]+(?=[A-Z])|[.!?]+$/g;
  const found = [];
  let cursor = 0;
  for (let hit = boundary.exec(text); hit !== null; hit = boundary.exec(text)) {
    // Each sentence runs from the previous boundary through this match.
    const stop = hit.index + hit[0].length;
    const piece = text.slice(cursor, stop).trim();
    if (piece.length > 0) {
      found.push(piece);
    }
    cursor = stop;
  }
  if (cursor < text.length) {
    // Trailing text with no terminating punctuation.
    const tail = text.slice(cursor).trim();
    if (tail.length > 0) {
      found.push(tail);
    }
  }
  return found;
}
/**
 * Build a chunk record for `content`, locating it inside the full document.
 *
 * The content is searched for in `fullDocument` starting at `startPos`; when
 * it is not found, `startPos` itself is used as the chunk's start. `isLast`
 * and `totalChunks` are returned as placeholders (`false` and `0`).
 *
 * @param {string} content - Chunk text.
 * @param {number} startPos - Offset to start searching from, and the
 *   fallback start position.
 * @param {string} fullDocument - Document the chunk was taken from.
 * @param {number} index - Position of the chunk in the output list.
 * @param {(text: string) => number} estimateTokens - Token estimator.
 * @param {*} metadata - Stored on the chunk as-is.
 * @returns {object} The chunk record.
 */
function createChunk(content, startPos, fullDocument, index, estimateTokens, metadata) {
  const located = fullDocument.indexOf(content, startPos);
  const chunkStart = located === -1 ? startPos : located;
  return {
    index,
    content,
    startPos: chunkStart,
    endPos: chunkStart + content.length,
    tokenCount: estimateTokens(content),
    charCount: content.length,
    isFirst: index === 0,
    isLast: false,
    totalChunks: 0,
    metadata,
  };
}
/**
 * Estimate the number of tokens in a piece of text.
 *
 * Averages two heuristics: one token per four characters, and 1.3 tokens per
 * whitespace-separated word. Only non-empty words are counted, so an empty
 * string estimates to 0 and leading/trailing whitespace does not add words.
 *
 * @param {string} text - Text to measure.
 * @returns {number} Estimated token count (a non-negative integer).
 */
export function estimateTokenCount(text) {
  const charEstimate = Math.ceil(text.length / 4);
  // match() returns null when there are no non-whitespace runs at all.
  const words = text.match(/\S+/g);
  const wordCount = words === null ? 0 : words.length;
  const wordEstimate = Math.ceil(wordCount * 1.3);
  return Math.ceil((charEstimate + wordEstimate) / 2);
}
/**
 * Return the text two chunks share, based on their recorded positions.
 *
 * When the `[startPos, endPos)` ranges do not intersect, the result is
 * `null`. Otherwise two candidates are cut — the tail of `chunk1.content` and
 * the head of `chunk2.content`, each sized from the position arithmetic — and
 * the shorter one is returned (the tail wins a tie).
 *
 * @param {{startPos: number, endPos: number, content: string}} chunk1
 * @param {{startPos: number, endPos: number, content: string}} chunk2
 * @returns {string|null} The overlapping text, or `null` if the ranges are
 *   disjoint.
 */
export function getChunkOverlap(chunk1, chunk2) {
  const disjoint = chunk1.endPos <= chunk2.startPos || chunk2.endPos <= chunk1.startPos;
  if (disjoint) {
    return null;
  }
  const sharedStart = Math.max(chunk1.startPos, chunk2.startPos);
  const sharedEnd = Math.min(chunk1.endPos, chunk2.endPos);
  const tailLength = chunk1.endPos - sharedStart;
  const headLength = sharedEnd - chunk2.startPos;
  const tail = chunk1.content.slice(-tailLength);
  const head = chunk2.content.slice(0, headLength);
  return tail.length > head.length ? head : tail;
}
/**
 * Join chunk contents back into one string.
 *
 * - No chunks: returns "".
 * - One chunk: returns its content.
 * - `preserveOverlap` truthy: contents are joined with "\n\n", untouched.
 * - Otherwise: for each chunk after the first, the text it shares with its
 *   predecessor (per `getChunkOverlap`) is looked up in the chunk's content;
 *   if found, everything up to and including it is dropped. The pieces are
 *   then concatenated with no separator.
 *
 * @param {Array<{content: string, startPos: number, endPos: number}>} chunks
 * @param {boolean} [preserveOverlap=false] - Keep every chunk's full content.
 * @returns {string} The merged text.
 */
export function mergeChunks(chunks, preserveOverlap = false) {
  switch (chunks.length) {
    case 0:
      return "";
    case 1:
      return chunks[0].content;
    default:
      break;
  }
  if (preserveOverlap) {
    return chunks.map((c) => c.content).join("\n\n");
  }
  const pieces = [chunks[0].content];
  for (let idx = 1; idx < chunks.length; idx += 1) {
    const shared = getChunkOverlap(chunks[idx - 1], chunks[idx]);
    const { content } = chunks[idx];
    // -1 means "nothing to strip": either no overlap or it is not in content.
    const sharedAt = shared ? content.indexOf(shared) : -1;
    pieces.push(sharedAt === -1 ? content : content.slice(sharedAt + shared.length));
  }
  return pieces.join("");
}
//# sourceMappingURL=chunking.js.map