@elpassion/semantic-chunking

Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).

// ===========================
// == 🍱 semantic-chunking ==
// ==================================================================
// == Semantically create chunks from large texts                  ==
// == Useful for workflows involving large language models (LLMs)  ==
// ==================================================================
// == npm package: https://www.npmjs.com/package/semantic-chunking ==
// == github repo: https://github.com/jparkerweb/semantic-chunking ==
// ==================================================================

import { MarkdownTextSplitter } from "@langchain/textsplitters";
import { DEFAULT_CONFIG } from "./config.js";
import { computeAdvancedSimilarities, adjustThreshold } from "./similarityUtils.js";
import { createChunks, optimizeAndRebalanceChunks, applyPrefixToChunk } from "./chunkingUtils.js";
import { readFileSync } from "fs";

export { LocalEmbeddingModel, OpenAIEmbedding } from "./embeddingUtils.js";

const packageJson = JSON.parse(readFileSync(new URL("./package.json", import.meta.url)));
const VERSION = packageJson.version;

export async function printVersion() {
    const versionText = `-- semantic-chunking v${VERSION} --`;
    const lineLength = versionText.length;
    console.log(`\n${"-".repeat(lineLength)}\n${versionText}\n${"-".repeat(lineLength)}`);
}

const defaultConfig = {
    logging: DEFAULT_CONFIG.LOGGING,
    maxTokenSize: DEFAULT_CONFIG.MAX_TOKEN_SIZE,
    similarityThreshold: DEFAULT_CONFIG.SIMILARITY_THRESHOLD,
    dynamicThresholdLowerBound: DEFAULT_CONFIG.DYNAMIC_THRESHOLD_LOWER_BOUND,
    dynamicThresholdUpperBound: DEFAULT_CONFIG.DYNAMIC_THRESHOLD_UPPER_BOUND,
    numSimilaritySentencesLookahead: DEFAULT_CONFIG.NUM_SIMILARITY_SENTENCES_LOOKAHEAD,
    combineChunks: DEFAULT_CONFIG.COMBINE_CHUNKS,
    combineChunksSimilarityThreshold: DEFAULT_CONFIG.COMBINE_CHUNKS_SIMILARITY_THRESHOLD,
    returnEmbedding: DEFAULT_CONFIG.RETURN_EMBEDDING,
    returnTokenLength: DEFAULT_CONFIG.RETURN_TOKEN_LENGTH,
    chunkPrefix: DEFAULT_CONFIG.CHUNK_PREFIX,
    excludeChunkPrefixInResults: false,
};

// Helper function to parse text into markdown chunks
async function parseMarkdownChunks(text, splitter) {
    const chunks = await splitter.splitText(text);
    return chunks;
}

// ---------------------------
// -- Main chunkit function --
// ---------------------------
export async function chunkit(documents, model, config = {}) {
    const {
        logging,
        maxTokenSize,
        similarityThreshold,
        dynamicThresholdLowerBound,
        dynamicThresholdUpperBound,
        numSimilaritySentencesLookahead,
        combineChunks,
        combineChunksSimilarityThreshold,
        returnEmbedding,
        returnTokenLength,
        chunkPrefix,
        excludeChunkPrefixInResults,
    } = {
        ...defaultConfig,
        ...config,
    };

    // Create a markdown text splitter instance
    const markdownSplitter = new MarkdownTextSplitter({
        chunkSize: maxTokenSize / 2, // Large chunk size since we'll be doing our own chunking
        chunkOverlap: 0,
    });

    if (logging) {
        console.log("maxTokenSize", maxTokenSize);
        console.log("similarityThreshold", similarityThreshold);
        console.log("dynamicThresholdLowerBound", dynamicThresholdLowerBound);
        console.log("dynamicThresholdUpperBound", dynamicThresholdUpperBound);
        console.log("numSimilaritySentencesLookahead", numSimilaritySentencesLookahead);
        console.log("combineChunks", combineChunks);
        console.log("combineChunksSimilarityThreshold", combineChunksSimilarityThreshold);
        console.log("returnEmbedding", returnEmbedding);
        console.log("returnTokenLength", returnTokenLength);
        console.log("chunkPrefix", chunkPrefix);
        console.log("excludeChunkPrefixInResults", excludeChunkPrefixInResults);
    }

    if (logging) {
        printVersion();
    }

    // Input validation
    if (!Array.isArray(documents)) {
        throw new Error("Input must be an array of document objects");
    }
    if (!model || typeof model.createEmbedding !== "function") {
        throw new Error("A valid model instance must be provided");
    }

    const { modelName, dtype } = model.getModelInfo();

    // Process each document
    const allResults = await Promise.all(
        documents.map(async (doc) => {
            if (!doc.document_text) {
                throw new Error("Each document must have a document_text property");
            }

            // Normalize document text by converting single line breaks to spaces
            // but preserving multiple line breaks
            let normalizedText = doc.document_text.replace(/([^\n])\n([^\n])/g, "$1 $2");
            // Convert multiple spaces to single space
            normalizedText = normalizedText.replace(/\s{2,}/g, " ");
            doc.document_text = normalizedText;

            // Split the text into sentences
            const sentences = await parseMarkdownChunks(doc.document_text, markdownSplitter);

            // Compute similarities and create chunks
            const { similarities, average, variance } = await computeAdvancedSimilarities(sentences, {
                numSimilaritySentencesLookahead,
                logging,
                model, // Pass the model to similarity computation
            });

            // Dynamically adjust the similarity threshold based on variance and average
            let dynamicThreshold = similarityThreshold;
            if (average != null && variance != null) {
                dynamicThreshold = adjustThreshold(
                    average,
                    variance,
                    similarityThreshold,
                    dynamicThresholdLowerBound,
                    dynamicThresholdUpperBound
                );
            }

            // Create the initial chunks using the adjusted threshold
            const initialChunks = await createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging, model);

            // Log initial chunks if needed
            if (logging) {
                console.log("\n=============\ninitialChunks\n=============");
                initialChunks.forEach((chunk, index) => {
                    console.log("\n");
                    console.log(`--------------`);
                    console.log(`-- Chunk ${index + 1} --`);
                    console.log(`--------------`);
                    console.log(chunk.substring(0, 50) + "...");
                });
            }

            let finalChunks;

            // Combine similar chunks and balance sizes if requested
            if (combineChunks) {
                finalChunks = await optimizeAndRebalanceChunks(
                    initialChunks,
                    model, // Use model's tokenizer
                    maxTokenSize,
                    combineChunksSimilarityThreshold
                );
                if (logging) {
                    console.log("\n\n=============\ncombinedChunks\n=============");
                    finalChunks.forEach((chunk, index) => {
                        console.log("\n\n\n");
                        console.log("--------------------");
                        console.log("Chunk " + (index + 1));
                        console.log("--------------------");
                        console.log(chunk.substring(0, 50) + "...");
                    });
                }
            } else {
                finalChunks = initialChunks;
            }

            const documentName = doc.document_name || ""; // Normalize document_name
            const documentId = Date.now();
            const numberOfChunks = finalChunks.length;

            return Promise.all(
                finalChunks.map(async (chunk, index) => {
                    const prefixedChunk = applyPrefixToChunk(chunkPrefix, chunk);
                    const result = {
                        document_id: documentId,
                        document_name: documentName,
                        number_of_chunks: numberOfChunks,
                        chunk_number: index + 1,
                        model_name: modelName,
                        dtype: dtype,
                        text: prefixedChunk,
                    };

                    if (returnEmbedding) {
                        result.embedding = await model.createEmbedding(prefixedChunk);
                    }

                    if (returnTokenLength) {
                        try {
                            const encoded = await model.tokenize(prefixedChunk, {
                                padding: true,
                            });
                            if (encoded && encoded.size) {
                                result.token_length = encoded.size;
                            } else {
                                console.error("Tokenizer returned unexpected format:", encoded);
                                result.token_length = 0;
                            }
                        } catch (error) {
                            console.error("Error during tokenization:", error);
                            result.token_length = 0;
                        }
                    }

                    // Remove prefix if requested (after embedding calculation)
                    if (excludeChunkPrefixInResults && chunkPrefix && chunkPrefix.trim()) {
                        const prefixPattern = new RegExp(`^${chunkPrefix}:\\s*`);
                        result.text = result.text.replace(prefixPattern, "");
                    }

                    return result;
                })
            );
        })
    );

    // Flatten the results array since we're processing multiple documents
    return allResults.flat();
}
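
A minimal usage sketch of the chunkit export above. The document shape ({ document_name, document_text }), the config keys, and the result fields all come from the code; the import path assumes this file is the package entry point, and the LocalEmbeddingModel constructor options are an assumption, since they are defined in embeddingUtils.js, which is not shown here.

// Usage sketch (assumptions noted inline).
import { chunkit, LocalEmbeddingModel } from "@elpassion/semantic-chunking";

// ASSUMPTION: the constructor options for LocalEmbeddingModel live in
// embeddingUtils.js (not shown above). Any object exposing createEmbedding(),
// tokenize(), and getModelInfo() satisfies what chunkit() actually checks for.
const model = new LocalEmbeddingModel(/* model options */);

const documents = [
    {
        document_name: "example.md", // optional; defaults to ""
        document_text: "# Title\n\nSome long markdown text to be chunked...",
    },
];

// Any omitted option falls back to DEFAULT_CONFIG; the values here are illustrative.
const chunks = await chunkit(documents, model, {
    logging: false,
    maxTokenSize: 500,      // overrides DEFAULT_CONFIG.MAX_TOKEN_SIZE
    returnEmbedding: true,  // adds an `embedding` field to each chunk
    returnTokenLength: true // adds a `token_length` field to each chunk
});

// Each result contains document_id, document_name, number_of_chunks,
// chunk_number, model_name, dtype, and text (plus embedding / token_length
// when requested above).
console.log(chunks[0]);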