UNPKG

mongodb-rag-core

Version:

Common elements used by MongoDB Chatbot Framework components.

74 lines 2.68 kB
"use strict"; var __importDefault = (this && this.__importDefault) || function (mod) { return (mod && mod.__esModule) ? mod : { "default": mod }; }; Object.defineProperty(exports, "__esModule", { value: true }); exports.chunkMd = void 0; const text_splitter_1 = require("langchain/text_splitter"); const gpt3_tokenizer_1 = __importDefault(require("gpt3-tokenizer")); const frontMatter_1 = require("../frontMatter"); const defaultMdChunkOptions = { maxChunkSize: 600, // max chunk size of 600 tokens gets avg ~400 tokens/chunk minChunkSize: 15, // chunks below this size are discarded, which improves search quality chunkOverlap: 0, tokenizer: new gpt3_tokenizer_1.default({ type: "gpt3" }), }; // separators modified from https://github.com/hwchase17/langchainjs/blob/d017e0dac9d84c9d58fd816698125ab0ae1c0826/langchain/src/text_splitter.ts#L566C5-L566C5 const separators = [ // First, try to split along Markdown headings (starting with level 2) "\n## ", "\n### ", "\n#### ", "\n##### ", "\n###### ", '\n\n<Tab name="', "\n\n<Tabs>\n\n", "<table>\n", "<tr>\n", "<th>\n", "<td>\n", "```\n\n", "\n\n***\n\n", "\n\n---\n\n", "\n\n___\n\n", "\n\n", "\n", " ", "", ]; const chunkMd = async function (page, optionsIn) { const options = { ...defaultMdChunkOptions, ...optionsIn }; const { tokenizer, maxChunkSize, minChunkSize, chunkOverlap, transform } = options; const splitter = new text_splitter_1.RecursiveCharacterTextSplitter({ chunkOverlap, chunkSize: maxChunkSize, lengthFunction: (text) => tokenizer.encode(text).bpe.length, separators, }); let chunks = await splitter.createDocuments([page.body]); if (minChunkSize) { chunks = chunks.filter((chunk) => tokenizer.encode(chunk.pageContent).bpe.length > minChunkSize); } return await Promise.all(chunks.map(async ({ pageContent }, chunkIndex) => { const preTransformChunk = { chunkIndex, sourceName: page.sourceName, url: page.url, text: pageContent, }; const transformedChunk = transform ? await transform(preTransformChunk, { page }) : preTransformChunk; const chunk = { ...transformedChunk, tokenCount: tokenizer.encode(transformedChunk.text).bpe.length, }; const { metadata } = (0, frontMatter_1.extractFrontMatter)(transformedChunk.text); if (metadata) { chunk["metadata"] = metadata; } return chunk; })); }; exports.chunkMd = chunkMd; //# sourceMappingURL=chunkMd.js.map