vespa-ts
Version:
A reusable TypeScript package for interacting with Vespa search engine with dependency injection support
109 lines • 4.52 kB
JavaScript
/**
 * Chunk the input text by paragraphs with optional overlap between
 * consecutive chunks. All sizes are measured in UTF-8 *bytes* (not
 * characters), since the downstream consumer (Vespa) enforces byte limits.
 *
 * Note: when `overlap > 0`, a chunk may exceed `maxChunkSize` by up to
 * `overlap` bytes, because the carried-over overlap paragraphs are kept in
 * addition to the new content.
 *
 * @param {string} text - The input text to be chunked.
 * @param {number} maxChunkSize - Maximum size of each chunk in UTF-8 bytes.
 * @param {number} overlap - Target number of overlapping bytes between chunks.
 * @returns {string[]} - An array of non-empty text chunks.
 */
export const chunkTextByParagraph = (text, maxChunkSize = 512, overlap = 128) => {
  // Byte length of a string when encoded as UTF-8.
  const getByteLength = (str) => Buffer.byteLength(str, "utf8");
  // Some PDFs may contain illegal code points like 0xF and 0x2; Vespa throws
  // an error when ingesting such strings, so strip them up front.
  const cleanText = (str) => {
    // Normalize carriage return and newline combinations to a single newline.
    const normalized = str.replace(/\r\n|\r/g, "\n");
    // Remove control characters (except newline \u000A) and Unicode
    // non-characters that Vespa rejects.
    return normalized.replace(/[\u0000-\u0008\u000B-\u000C\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF]/g, "");
  };
  // Clean the input text before processing.
  const cleanedText = cleanText(text);
  // Paragraphs are runs of text separated by one or more newlines.
  const paragraphs = cleanedText.split(/\n+/).filter((p) => p.length > 0);
  const chunks = [];
  let currentChunk = [];
  let currentLength = 0;
  // True while currentChunk holds ONLY overlap carried over from the
  // previously emitted chunk (i.e. no new content yet). Used to avoid
  // emitting duplicate, overlap-only chunks.
  let overlapOnly = false;
  // Finalize chunkArr into `chunks`, then seed currentChunk/currentLength
  // with the trailing paragraphs that fit inside the overlap byte budget.
  const addChunk = (chunkArr) => {
    // Guard: never emit an empty chunk (can happen when a single sentence
    // exceeds maxChunkSize and the sub-chunk is still empty).
    if (chunkArr.length === 0) {
      return;
    }
    chunks.push(chunkArr.join("\n"));
    if (overlap > 0) {
      let overlapBytes = 0;
      const overlapChunk = [];
      // Walk backwards, collecting whole paragraphs until the overlap
      // byte budget would be exceeded.
      for (let i = chunkArr.length - 1; i >= 0; i--) {
        const para = chunkArr[i] || "";
        const paraByteLength = getByteLength(para) + 1; // +1 for newline character
        if (overlapBytes + paraByteLength > overlap) {
          break;
        }
        overlapChunk.unshift(para);
        overlapBytes += paraByteLength;
      }
      currentChunk = overlapChunk;
      currentLength = overlapBytes;
      overlapOnly = overlapChunk.length > 0;
    }
    else {
      currentChunk = [];
      currentLength = 0;
      overlapOnly = false;
    }
  };
  for (const paragraph of paragraphs) {
    const paragraphByteLength = getByteLength(paragraph) + 1; // +1 for newline character
    if (paragraphByteLength > maxChunkSize) {
      // Oversized paragraph: flush any pending real content, then split
      // the paragraph into sentence-level sub-chunks. An overlap-only
      // pending chunk is NOT flushed — it would duplicate the previous
      // chunk's content with nothing new.
      if (currentLength > 0 && !overlapOnly) {
        addChunk(currentChunk);
      }
      const sentences = paragraph.split(/(?<=[.!?])\s+/);
      let subChunk = [];
      let subChunkLength = 0;
      for (const sentence of sentences) {
        const sentenceByteLength = getByteLength(sentence) + 1; // +1 for space or newline
        if (subChunkLength + sentenceByteLength > maxChunkSize) {
          addChunk(subChunk);
          subChunk = [];
          subChunkLength = 0;
        }
        // NOTE: a single sentence longer than maxChunkSize is still emitted
        // as one oversized chunk — it cannot be split at this granularity.
        subChunk.push(sentence);
        subChunkLength += sentenceByteLength;
      }
      if (subChunk.length > 0) {
        addChunk(subChunk);
      }
    }
    else if (currentLength + paragraphByteLength > maxChunkSize) {
      // Current chunk is full: finalize it, then append this paragraph to
      // the overlap that addChunk seeded. (BUGFIX: previously currentChunk
      // was overwritten with [paragraph] here, discarding the overlap.)
      addChunk(currentChunk);
      currentChunk.push(paragraph);
      currentLength += paragraphByteLength;
      overlapOnly = false;
    }
    else {
      // The paragraph fits: accumulate it into the current chunk.
      currentChunk.push(paragraph);
      currentLength += paragraphByteLength;
      overlapOnly = false;
    }
  }
  // Flush the trailing chunk, unless it is pure carried-over overlap
  // (which would duplicate the previous chunk's tail).
  if (currentChunk.length > 0 && !overlapOnly) {
    addChunk(currentChunk);
  }
  return chunks;
};
/**
 * Split a document body into indexed chunk records.
 *
 * Backward-compatible generalization: the previously hard-coded chunking
 * parameters (512-byte chunks, no overlap) are now defaulted parameters.
 *
 * @param {string} body - The document text to chunk.
 * @param {number} [maxChunkSize=512] - Maximum chunk size in UTF-8 bytes.
 * @param {number} [overlap=0] - Overlapping bytes between consecutive chunks.
 * @returns {{chunk: string, chunkIndex: number}[]} Chunks paired with their
 *   zero-based position in the document.
 */
export const chunkDocument = (body, maxChunkSize = 512, overlap = 0) => {
  return chunkTextByParagraph(body, maxChunkSize, overlap).map((chunk, chunkIndex) => ({
    chunk,
    chunkIndex,
  }));
};
//# sourceMappingURL=chunks.js.map