mongodb-rag-core
Version:
Common elements used by MongoDB Chatbot Framework components.
74 lines • 2.68 kB
JavaScript
;
/**
 * Interop helper for default imports of CommonJS modules.
 *
 * Reuses an `__importDefault` already present on `this` when there is one;
 * otherwise falls back to the local implementation below.
 *
 * @param mod - the value returned by `require(...)`.
 * @returns `mod` itself when it is flagged with a truthy `__esModule`,
 *   otherwise a wrapper object exposing `mod` under the `default` key.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        // Already shaped like an ES module namespace: hand it back untouched.
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunkMd = void 0;
const text_splitter_1 = require("langchain/text_splitter");
const gpt3_tokenizer_1 = __importDefault(require("gpt3-tokenizer"));
const frontMatter_1 = require("../frontMatter");
// Default options for chunkMd. Any key supplied in the caller's options object
// overrides the matching key here (the two objects are spread-merged in chunkMd).
const defaultMdChunkOptions = {
maxChunkSize: 600, // max chunk size of 600 tokens gets avg ~400 tokens/chunk
// Chunks whose token count is not strictly greater than this value are
// discarded (chunkMd filters with `> minChunkSize`), which improves search quality.
// A falsy value (e.g. 0) disables the filter entirely.
minChunkSize: 15,
// Passed to the text splitter as its chunkOverlap setting; 0 requests no overlap.
chunkOverlap: 0,
// Tokenizer used to measure text length; chunkMd reads `encode(text).bpe.length`.
tokenizer: new gpt3_tokenizer_1.default({ type: "gpt3" }),
};
// separators modified from https://github.com/hwchase17/langchainjs/blob/d017e0dac9d84c9d58fd816698125ab0ae1c0826/langchain/src/text_splitter.ts#L566C5-L566C5
// Candidate split points handed to RecursiveCharacterTextSplitter in chunkMd.
// Order matters: entries are listed from the coarsest structural boundary to
// the finest (presumably the splitter tries them in this order — confirm
// against the langchain version in use).
const separators = [
// First, try to split along Markdown headings (starting with level 2)
"\n## ",
"\n### ",
"\n#### ",
"\n##### ",
"\n###### ",
// Then along <Tab>/<Tabs> elements
'\n\n<Tab name="',
"\n\n<Tabs>\n\n",
// Then along HTML table structure: table, row, header cell, data cell
"<table>\n",
"<tr>\n",
"<th>\n",
"<td>\n",
// A code fence followed by a blank line (presumably the end of a code block)
"```\n\n",
// Markdown horizontal rules in their three spellings
"\n\n***\n\n",
"\n\n---\n\n",
"\n\n___\n\n",
// Finally fall back to blank lines, single newlines, spaces, and the empty string
"\n\n",
"\n",
" ",
"",
];
/**
 * Splits a page's Markdown body into token-bounded chunks.
 *
 * Steps:
 *  1. Merge `optionsIn` over `defaultMdChunkOptions`.
 *  2. Split `page.body` with a RecursiveCharacterTextSplitter that measures
 *     length in tokens (via `tokenizer.encode(text).bpe.length`).
 *  3. When `minChunkSize` is truthy, keep only pieces whose token count is
 *     strictly greater than `minChunkSize`.
 *  4. Turn each remaining piece into a chunk object, optionally passing it
 *     through `transform`, then attach `tokenCount` and any front matter
 *     metadata found in the (transformed) text.
 *
 * @param page - object supplying `body`, `sourceName` and `url`.
 * @param optionsIn - optional overrides for `maxChunkSize`, `minChunkSize`,
 *   `chunkOverlap`, `tokenizer` and `transform`.
 * @returns Promise resolving to the array of chunk objects. `chunkIndex` is
 *   the position of the chunk in the list that remains after size filtering.
 */
const chunkMd = async function (page, optionsIn) {
    const { tokenizer, maxChunkSize, minChunkSize, chunkOverlap, transform } = {
        ...defaultMdChunkOptions,
        ...optionsIn,
    };
    // Single definition of "length in tokens", shared by the splitter, the
    // minimum-size filter and the final tokenCount.
    const countTokens = (text) => tokenizer.encode(text).bpe.length;
    const splitter = new text_splitter_1.RecursiveCharacterTextSplitter({
        chunkOverlap,
        chunkSize: maxChunkSize,
        lengthFunction: countTokens,
        separators,
    });
    const documents = await splitter.createDocuments([page.body]);
    // A falsy minChunkSize (0, undefined, ...) switches the filter off.
    const keptDocuments = minChunkSize
        ? documents.filter((doc) => countTokens(doc.pageContent) > minChunkSize)
        : documents;
    // Builds the final chunk object for one split document.
    const buildChunk = async (doc, chunkIndex) => {
        const preTransformChunk = {
            chunkIndex,
            sourceName: page.sourceName,
            url: page.url,
            text: doc.pageContent,
        };
        let transformedChunk = preTransformChunk;
        if (transform) {
            transformedChunk = await transform(preTransformChunk, { page });
        }
        // Token count reflects the text after the transform has run.
        const chunk = {
            ...transformedChunk,
            tokenCount: countTokens(transformedChunk.text),
        };
        // Detach the function from its module object so it is invoked without
        // a `this` binding, as the compiled `(0, fn)(...)` form does.
        const extractFrontMatter = frontMatter_1.extractFrontMatter;
        const { metadata } = extractFrontMatter(transformedChunk.text);
        if (metadata) {
            chunk.metadata = metadata;
        }
        return chunk;
    };
    return await Promise.all(keptDocuments.map((doc, chunkIndex) => buildChunk(doc, chunkIndex)));
};
exports.chunkMd = chunkMd;
//# sourceMappingURL=chunkMd.js.map