// semantic-chunking
// Semantically create chunks from large texts. Useful for workflows involving large language models (LLMs).
import { tokenizer, createEmbedding } from './embeddingUtils.js';
import { cosineSimilarity } from './similarityUtils.js';
// -----------------------------------------------------------
// -- Function to create chunks of text based on similarity --
// -----------------------------------------------------------
export function createChunks(sentences, similarities, maxTokenSize, similarityThreshold, logging) {
    // Guard against empty input so the final flush never emits an "undefined" chunk
    if (!sentences || sentences.length === 0) {
        return [];
    }

    const chunks = [];
    let currentChunk = [sentences[0]];

    if (logging) {
        console.log('Initial sentence:', sentences[0]);
    }

    for (let i = 1; i < sentences.length; i++) {
        const nextSentence = sentences[i];

        // For cramit (when similarities is null), only check token size
        if (!similarities) {
            const currentChunkText = currentChunk.join(" ");
            const currentChunkSize = tokenizer(currentChunkText).input_ids.size;
            const nextSentenceTokenCount = tokenizer(nextSentence).input_ids.size;

            if (currentChunkSize + nextSentenceTokenCount <= maxTokenSize) {
                currentChunk.push(nextSentence);
            } else {
                chunks.push(currentChunkText);
                currentChunk = [nextSentence];
            }
            continue;
        }

        // Check similarity first for chunkit
        if (similarities[i - 1] >= similarityThreshold) {
            if (logging) {
                console.log(`Sentence ${i} passed similarity check (${similarities[i - 1]})`);
            }

            // Then check token size: a similar sentence still starts a
            // new chunk if adding it would exceed maxTokenSize
            const currentChunkText = currentChunk.join(" ");
            const currentChunkSize = tokenizer(currentChunkText).input_ids.size;
            const nextSentenceTokenCount = tokenizer(nextSentence).input_ids.size;

            if (currentChunkSize + nextSentenceTokenCount <= maxTokenSize) {
                currentChunk.push(nextSentence);
            } else {
                chunks.push(currentChunkText);
                currentChunk = [nextSentence];
            }
        } else {
            if (logging) {
                console.log(`Starting new chunk at sentence ${i}, similarity was ${similarities[i - 1]}`);
            }
            chunks.push(currentChunk.join(" "));
            currentChunk = [nextSentence];
        }
    }

    // Flush whatever remains in the final chunk
    if (currentChunk.length > 0) {
        chunks.push(currentChunk.join(" "));
    }

    return chunks;
}
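
// Usage sketch (illustrative only, not part of the module). The sentences
// and similarity scores below are hypothetical example data; similarities[i]
// is assumed to score sentences[i] against sentences[i + 1]:
//
//   const sentences = [
//       "The cat sat on the mat.",
//       "It purred contentedly.",
//       "Quarterly revenue grew by 12%.",
//   ];
//   const similarities = [0.82, 0.15];
//   const chunks = createChunks(sentences, similarities, 512, 0.5, false);
//   // => ["The cat sat on the mat. It purred contentedly.",
//   //     "Quarterly revenue grew by 12%."]
//
// Passing null for similarities ("cramit" mode) packs sentences purely by
// the maxTokenSize budget:
//
//   const packed = createChunks(sentences, null, 512, 0.5, false);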
// --------------------------------------------------------------
// -- Optimize and Rebalance Chunks (optionally use Similarity) --
// --------------------------------------------------------------
export async function optimizeAndRebalanceChunks(combinedChunks, tokenizer, maxTokenSize, combineChunksSimilarityThreshold = 0.5) {
    const optimizedChunks = [];
    let currentChunkText = "";
    let currentChunkTokenCount = 0;
    let currentEmbedding = null;

    for (let index = 0; index < combinedChunks.length; index++) {
        const chunk = combinedChunks[index];
        const chunkTokenCount = tokenizer(chunk).input_ids.size;

        // Try to merge into the current chunk: the combined text must fit
        // the token budget AND be semantically similar to what we have
        if (currentChunkText && (currentChunkTokenCount + chunkTokenCount <= maxTokenSize)) {
            const nextEmbedding = await createEmbedding(chunk);
            const similarity = currentEmbedding ? cosineSimilarity(currentEmbedding, nextEmbedding) : 0;

            if (similarity >= combineChunksSimilarityThreshold) {
                currentChunkText += " " + chunk;
                currentChunkTokenCount += chunkTokenCount;
                currentEmbedding = nextEmbedding;
                continue;
            }
        }

        // No current chunk, over budget, or not similar enough:
        // flush the current chunk and start a new one from this chunk
        if (currentChunkText) optimizedChunks.push(currentChunkText);
        currentChunkText = chunk;
        currentChunkTokenCount = chunkTokenCount;
        currentEmbedding = await createEmbedding(chunk);
    }

    // Flush the final chunk and drop any empty strings
    if (currentChunkText) optimizedChunks.push(currentChunkText);
    return optimizedChunks.filter(chunk => chunk);
}
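
// Usage sketch (illustrative only). Assumes the tokenizer passed in is the
// same transformers.js-style tokenizer imported above (token count exposed
// as `input_ids.size`) and that createEmbedding resolves to a numeric
// vector; the chunk texts are hypothetical. Note the function is async
// because it embeds each candidate chunk before deciding whether to merge:
//
//   const smallChunks = [
//       "Solar panels convert sunlight into electricity.",
//       "Photovoltaic cells are their core component.",
//       "The recipe calls for two cups of flour.",
//   ];
//   const rebalanced = await optimizeAndRebalanceChunks(smallChunks, tokenizer, 256, 0.5);
//   // The first two chunks merge only if they fit within 256 tokens AND
//   // their embeddings are at least 0.5 cosine-similar; the unrelated
//   // third chunk stays separate.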
// ------------------------------------------------
// -- Helper function to apply prefix to a chunk --
// ------------------------------------------------
export function applyPrefixToChunk(chunkPrefix, chunk) {
    // Only apply a non-empty, non-whitespace prefix
    if (chunkPrefix && chunkPrefix.trim()) {
        return `${chunkPrefix}: ${chunk}`;
    }
    return chunk;
}
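
// Usage sketch (illustrative): prefixes like "search_document" are common
// with prefix-trained embedding models (e.g. nomic-embed-text), but any
// string works; the examples below are hypothetical.
//
//   applyPrefixToChunk("search_document", "Solar panels convert sunlight.");
//   // => "search_document: Solar panels convert sunlight."
//   applyPrefixToChunk("", "unchanged text");
//   // => "unchanged text"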