@elpassion/semantic-chunking
Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).
231 lines (204 loc) • 8.61 kB
JavaScript
// ===========================
// == 🍱 semantic-chunking ==
// ==================================================================
// == Semantically create chunks from large texts ==
// == Useful for workflows involving large language models (LLMs) ==
// ==================================================================
// == npm package: https://www.npmjs.com/package/semantic-chunking ==
// == github repo: https://github.com/jparkerweb/semantic-chunking ==
// ==================================================================
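// ==================================================================
// == Illustrative usage sketch (comments only, nothing runs here). ==
// == The LocalEmbeddingModel constructor options are assumptions; ==
// == only the export names and the chunkit(documents, model, config) ==
// == signature are taken from this file. ==
// ==================================================================
//
//   import { chunkit, LocalEmbeddingModel } from "semantic-chunking";
//
//   const model = new LocalEmbeddingModel(/* model options are an assumption */);
//   const myChunks = await chunkit(
//       [{ document_name: "guide.md", document_text: "# Title\n\nSome text..." }],
//       model,
//       { maxTokenSize: 500, returnEmbedding: false }
//   );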
import { MarkdownTextSplitter } from "@langchain/textsplitters";
import { DEFAULT_CONFIG } from "./config.js";
import { computeAdvancedSimilarities, adjustThreshold } from "./similarityUtils.js";
import { createChunks, optimizeAndRebalanceChunks, applyPrefixToChunk } from "./chunkingUtils.js";
import { readFileSync } from "fs";
export { LocalEmbeddingModel, OpenAIEmbedding } from "./embeddingUtils.js";
const packageJson = JSON.parse(readFileSync(new URL("./package.json", import.meta.url)));
const VERSION = packageJson.version;
export async function printVersion() {
const versionText = `-- semantic-chunking v${VERSION} --`;
const lineLength = versionText.length;
console.log(`\n${"-".repeat(lineLength)}\n${versionText}\n${"-".repeat(lineLength)}`);
}
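// Example banner printed by printVersion() (assuming VERSION is "1.0.0"):
//
//   ------------------------------
//   -- semantic-chunking v1.0.0 --
//   ------------------------------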
const defaultConfig = {
logging: DEFAULT_CONFIG.LOGGING,
maxTokenSize: DEFAULT_CONFIG.MAX_TOKEN_SIZE,
similarityThreshold: DEFAULT_CONFIG.SIMILARITY_THRESHOLD,
dynamicThresholdLowerBound: DEFAULT_CONFIG.DYNAMIC_THRESHOLD_LOWER_BOUND,
dynamicThresholdUpperBound: DEFAULT_CONFIG.DYNAMIC_THRESHOLD_UPPER_BOUND,
numSimilaritySentencesLookahead: DEFAULT_CONFIG.NUM_SIMILARITY_SENTENCES_LOOKAHEAD,
combineChunks: DEFAULT_CONFIG.COMBINE_CHUNKS,
combineChunksSimilarityThreshold: DEFAULT_CONFIG.COMBINE_CHUNKS_SIMILARITY_THRESHOLD,
returnEmbedding: DEFAULT_CONFIG.RETURN_EMBEDDING,
returnTokenLength: DEFAULT_CONFIG.RETURN_TOKEN_LENGTH,
chunkPrefix: DEFAULT_CONFIG.CHUNK_PREFIX,
excludeChunkPrefixInResults: false,
};
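// The config argument of chunkit() is shallow-merged over these defaults,
// so callers only pass the keys they want to change. Illustrative call
// (values are made up):
//
//   chunkit(documents, model, { maxTokenSize: 500, combineChunks: false });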
// Helper function to parse text into markdown chunks
async function parseMarkdownChunks(text, splitter) {
const chunks = await splitter.splitText(text);
return chunks;
}
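// Illustrative behaviour (the exact split points depend on @langchain/textsplitters):
// given markdown such as
//   "# Title\n\nFirst paragraph.\n\n## Section\n\nSecond paragraph."
// splitText() returns segments aligned to markdown structure (headings,
// paragraphs), each kept close to the splitter's chunkSize where possible.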
// ---------------------------
// -- Main chunkit function --
// ---------------------------
export async function chunkit(documents, model, config = {}) {
const {
logging,
maxTokenSize,
similarityThreshold,
dynamicThresholdLowerBound,
dynamicThresholdUpperBound,
numSimilaritySentencesLookahead,
combineChunks,
combineChunksSimilarityThreshold,
returnEmbedding,
returnTokenLength,
chunkPrefix,
excludeChunkPrefixInResults,
} = {
...defaultConfig,
...config,
};
// Create a markdown text splitter instance
const markdownSplitter = new MarkdownTextSplitter({
chunkSize: maxTokenSize / 2, // Coarse pre-split only; the semantic chunking below decides the final chunk boundaries
chunkOverlap: 0,
});
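// Note: the splitters in @langchain/textsplitters measure chunkSize in
// characters by default, whereas maxTokenSize is a token budget, so
// maxTokenSize / 2 is a coarse heuristic rather than an exact limit.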
if (logging) {
console.log("maxTokenSize", maxTokenSize);
console.log("similarityThreshold", similarityThreshold);
console.log("dynamicThresholdLowerBound", dynamicThresholdLowerBound);
console.log("dynamicThresholdUpperBound", dynamicThresholdUpperBound);
console.log("numSimilaritySentencesLookahead", numSimilaritySentencesLookahead);
console.log("combineChunks", combineChunks);
console.log("combineChunksSimilarityThreshold", combineChunksSimilarityThreshold);
console.log("returnEmbedding", returnEmbedding);
console.log("returnTokenLength", returnTokenLength);
console.log("chunkPrefix", chunkPrefix);
console.log("excludeChunkPrefixInResults", excludeChunkPrefixInResults);
printVersion();
}
// Input validation
if (!Array.isArray(documents)) {
throw new Error("Input must be an array of document objects");
}
if (!model || typeof model.createEmbedding !== "function") {
throw new Error("A valid model instance must be provided");
}
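// Expected input shapes, derived from how they are used below:
//
//   documents: [{ document_text: "...", document_name: "optional" }, ...]
//   model: {
//       createEmbedding(text),             // required
//       getModelInfo(),                    // -> { modelName, dtype }
//       tokenize(text, { padding: true })  // only used when returnTokenLength is true
//   }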
const { modelName, dtype } = model.getModelInfo();
// Process each document
const allResults = await Promise.all(
documents.map(async (doc) => {
if (!doc.document_text) {
throw new Error("Each document must have a document_text property");
}
// Normalize the document text by converting single line breaks to spaces
// while preserving blank lines (paragraph breaks) for the markdown splitter
let normalizedText = doc.document_text.replace(/([^\n])\n([^\n])/g, "$1 $2");
// Collapse runs of spaces and tabs into a single space (blank lines are kept)
normalizedText = normalizedText.replace(/[ \t]{2,}/g, " ");
doc.document_text = normalizedText;
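// Illustrative example of the normalization above:
//   input:  "Line one\nstill line one.\n\nNew paragraph."
//   output: "Line one still line one.\n\nNew paragraph."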
// Split the text into markdown-aware segments (treated as sentences by the steps below)
const sentences = await parseMarkdownChunks(doc.document_text, markdownSplitter);
// Compute similarities and create chunks
const { similarities, average, variance } = await computeAdvancedSimilarities(sentences, {
numSimilaritySentencesLookahead,
logging,
model, // Pass the model to similarity computation
});
// Dynamically adjust the similarity threshold based on variance and average
let dynamicThreshold = similarityThreshold;
if (average != null && variance != null) {
dynamicThreshold = adjustThreshold(
average,
variance,
similarityThreshold,
dynamicThresholdLowerBound,
dynamicThresholdUpperBound
);
}
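// The exact adjustment lives in similarityUtils.js; judging by the parameter
// names, the returned threshold is nudged according to how the similarity
// scores are distributed and kept within
// [dynamicThresholdLowerBound, dynamicThresholdUpperBound].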
// Create the initial chunks using the adjusted threshold
const initialChunks = await createChunks(sentences, similarities, maxTokenSize, dynamicThreshold, logging, model);
// Log initial chunks if needed
if (logging) {
console.log("\n=============\ninitialChunks\n=============");
initialChunks.forEach((chunk, index) => {
console.log("\n");
console.log(`--------------`);
console.log(`-- Chunk ${index + 1} --`);
console.log(`--------------`);
console.log(chunk.substring(0, 50) + "...");
});
}
let finalChunks;
// Combine similar chunks and balance sizes if requested
if (combineChunks) {
finalChunks = await optimizeAndRebalanceChunks(
initialChunks,
model, // Use model's tokenizer
maxTokenSize,
combineChunksSimilarityThreshold
);
if (logging) {
console.log("\n\n=============\ncombinedChunks\n=============");
finalChunks.forEach((chunk, index) => {
console.log("\n\n\n");
console.log("--------------------");
console.log("Chunk " + (index + 1));
console.log("--------------------");
console.log(chunk.substring(0, 50) + "...");
});
}
} else {
finalChunks = initialChunks;
}
const documentName = doc.document_name || ""; // Fall back to an empty string when no document_name is provided
const documentId = Date.now();
const numberOfChunks = finalChunks.length;
return Promise.all(
finalChunks.map(async (chunk, index) => {
const prefixedChunk = applyPrefixToChunk(chunkPrefix, chunk);
const result = {
document_id: documentId,
document_name: documentName,
number_of_chunks: numberOfChunks,
chunk_number: index + 1,
model_name: modelName,
dtype: dtype,
text: prefixedChunk,
};
if (returnEmbedding) {
result.embedding = await model.createEmbedding(prefixedChunk);
}
if (returnTokenLength) {
try {
const encoded = await model.tokenize(prefixedChunk, {
padding: true,
});
if (encoded && encoded.size) {
result.token_length = encoded.size;
} else {
console.error("Tokenizer returned unexpected format:", encoded);
result.token_length = 0;
}
} catch (error) {
console.error("Error during tokenization:", error);
result.token_length = 0;
}
}
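// Note: chunkPrefix is interpolated into the RegExp below without escaping,
// so a prefix containing regex metacharacters may not be stripped as expected.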
// Remove prefix if requested (after embedding calculation)
if (excludeChunkPrefixInResults && chunkPrefix && chunkPrefix.trim()) {
const prefixPattern = new RegExp(`^${chunkPrefix}:\\s*`);
result.text = result.text.replace(prefixPattern, "");
}
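// Illustrative shape of one returned chunk object (values are made up;
// embedding and token_length only appear when requested):
//
//   {
//       document_id: 1717171717171,
//       document_name: "guide.md",
//       number_of_chunks: 4,
//       chunk_number: 1,
//       model_name: "<embedding model name>",
//       dtype: "<model dtype>",
//       text: "First chunk text...",
//       embedding: [/* ... */],   // when returnEmbedding is true
//       token_length: 128         // when returnTokenLength is true
//   }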
return result;
})
);
})
);
// Flatten the results array since we're processing multiple documents
return allResults.flat();
}