UNPKG

mongodb-rag-core

Version:

Common elements used by MongoDB Chatbot Framework components.

74 lines 2.73 kB
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.pageFormatToLanguage = exports.isSupportedLanguage = exports.chunkCode = void 0;
const text_splitter_1 = require("langchain/text_splitter");
const gpt3_tokenizer_1 = __importDefault(require("gpt3-tokenizer"));

/**
 * Default options for `chunkCode`. Any of these can be overridden through
 * the `optionsIn` argument of `chunkCode`.
 */
const defaultCodeChunkOptions = {
    // max chunk size of 600 tokens gets avg ~400 tokens/chunk
    maxChunkSize: 600,
    // chunks below this size are discarded, which improves search quality.
    // A falsy value (such as the default 0) disables the filter entirely.
    minChunkSize: 0,
    chunkOverlap: 0,
    tokenizer: new gpt3_tokenizer_1.default({ type: "gpt3" }),
};

/**
 * Splits the body of a code page into chunks sized by token count.
 *
 * @param page - Page to chunk. This function reads `page.format`,
 *   `page.body`, `page.sourceName` and `page.url`.
 * @param optionsIn - Optional overrides merged over
 *   `defaultCodeChunkOptions`. May also carry a `transform` function, which
 *   is awaited once per chunk as `transform(chunk, { page })` and whose
 *   result replaces that chunk.
 * @returns A promise of an array of chunks, each with `chunkIndex`,
 *   `sourceName`, `url`, `text` (or whatever `transform` returned) and a
 *   `tokenCount` computed from the final `text`.
 * @throws {Error} If `page.format` does not map to a supported language.
 */
const chunkCode = async function (page, optionsIn) {
    const options = { ...defaultCodeChunkOptions, ...optionsIn };
    const { tokenizer, maxChunkSize, minChunkSize, chunkOverlap, transform } = options;

    const language = pageFormatToLanguage(page.format);
    if (!language) {
        throw new Error(`No language found for page format ${page.format}`);
    }

    // Measure length in tokens rather than characters so that chunkSize and
    // chunkOverlap are expressed in the tokenizer's units.
    const countTokens = (text) => tokenizer.encode(text).bpe.length;

    const splitter = text_splitter_1.RecursiveCharacterTextSplitter.fromLanguage(language, {
        chunkOverlap,
        chunkSize: maxChunkSize,
        lengthFunction: countTokens,
    });

    let chunks = await splitter.createDocuments([page.body]);
    if (minChunkSize) {
        // Strict comparison: a chunk of exactly `minChunkSize` tokens is
        // dropped as well.
        chunks = chunks.filter((chunk) => countTokens(chunk.pageContent) > minChunkSize);
    }

    return await Promise.all(chunks.map(async ({ pageContent }, chunkIndex) => {
        // chunkIndex is the position after filtering, so indexes stay dense.
        const preTransformChunk = {
            chunkIndex,
            sourceName: page.sourceName,
            url: page.url,
            text: pageContent,
        };
        const transformedChunk = transform
            ? await transform(preTransformChunk, { page })
            : preTransformChunk;
        // Count tokens after the transform, since it may have changed `text`.
        return {
            ...transformedChunk,
            tokenCount: countTokens(transformedChunk.text),
        };
    }));
};
exports.chunkCode = chunkCode;

/**
 * Maps a page format name to the language identifier passed to
 * `RecursiveCharacterTextSplitter.fromLanguage`.
 */
const supportedLanguageMap = {
    c: "cpp",
    cpp: "cpp",
    go: "go",
    java: "java",
    javascript: "js",
    typescript: "js",
    php: "php",
    python: "python",
    ruby: "ruby",
    rust: "rust",
    scala: "scala",
    swift: "swift",
    latex: "latex",
    html: "html",
};

/**
 * Reports whether `str` is one of the page formats listed in
 * `supportedLanguageMap`.
 *
 * Only own keys count: the `in` operator would also accept names inherited
 * from `Object.prototype` (for example "constructor" or "toString").
 *
 * @param str - Candidate page format.
 * @returns `true` if `str` is a key of `supportedLanguageMap`.
 */
function isSupportedLanguage(str) {
    return Object.prototype.hasOwnProperty.call(supportedLanguageMap, str);
}
exports.isSupportedLanguage = isSupportedLanguage;

/**
 * Looks up the text-splitter language for a page format.
 *
 * @param format - Page format, e.g. "typescript".
 * @returns The mapped language identifier, or `undefined` when the format is
 *   not supported (including names that exist only on `Object.prototype`).
 */
function pageFormatToLanguage(format) {
    return isSupportedLanguage(format) ? supportedLanguageMap[format] : undefined;
}
exports.pageFormatToLanguage = pageFormatToLanguage;
//# sourceMappingURL=chunkCode.js.map