mongodb-rag-core
Version:
Common elements used by MongoDB Chatbot Framework components.
74 lines • 2.73 kB
JavaScript
;
// Interop helper for default imports: reuses an existing helper found on
// `this` when there is one, otherwise defines it here. A module flagged with
// `__esModule` is returned unchanged; any other value is wrapped so that it
// can be read through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.pageFormatToLanguage = exports.isSupportedLanguage = exports.chunkCode = void 0;
const text_splitter_1 = require("langchain/text_splitter");
const gpt3_tokenizer_1 = __importDefault(require("gpt3-tokenizer"));
// Default options for chunkCode. Options supplied by the caller are spread
// over these, so any key the caller provides replaces the default below.
const defaultCodeChunkOptions = {
maxChunkSize: 600, // max chunk size of 600 tokens gets avg ~400 tokens/chunk
minChunkSize: 0, // when non-zero, only chunks with more tokens than this are kept; 0 skips the filter entirely
chunkOverlap: 0, // forwarded to the text splitter as its chunkOverlap setting
tokenizer: new gpt3_tokenizer_1.default({ type: "gpt3" }), // created once at module load; used to measure text length in tokens
};
/**
 * Split a source-code page into chunks whose size is measured in tokens.
 *
 * The page format is mapped to a splitter language, the page body is split
 * with a language-aware recursive character splitter, chunks that are too
 * small are optionally dropped, and each remaining chunk is annotated with
 * its index, source name, URL and token count.
 *
 * @param page - Page to split; `format`, `body`, `sourceName` and `url` are read.
 * @param optionsIn - Optional overrides for `defaultCodeChunkOptions`
 *   (`tokenizer`, `maxChunkSize`, `minChunkSize`, `chunkOverlap`, `transform`).
 * @returns Promise resolving to the array of chunk objects, in splitter order.
 * @throws {Error} When the page format has no corresponding splitter language.
 */
const chunkCode = async function (page, optionsIn) {
    const mergedOptions = { ...defaultCodeChunkOptions, ...optionsIn };
    const { tokenizer, maxChunkSize, minChunkSize, chunkOverlap, transform } = mergedOptions;
    // Single definition of "length in tokens", shared by the splitter, the
    // minimum-size filter and the final tokenCount field.
    const countTokens = (text) => tokenizer.encode(text).bpe.length;

    const language = pageFormatToLanguage(page.format);
    if (!language) {
        throw new Error(`No language found for page format ${page.format}`);
    }

    const splitter = text_splitter_1.RecursiveCharacterTextSplitter.fromLanguage(language, {
        chunkOverlap,
        chunkSize: maxChunkSize,
        lengthFunction: countTokens,
    });
    const allDocuments = await splitter.createDocuments([page.body]);

    // A falsy minChunkSize (including the default 0) disables filtering.
    const keptDocuments = minChunkSize
        ? allDocuments.filter((doc) => countTokens(doc.pageContent) > minChunkSize)
        : allDocuments;

    // chunkIndex is the position after filtering, not the splitter's original position.
    const buildChunk = async ({ pageContent }, chunkIndex) => {
        const rawChunk = {
            chunkIndex,
            sourceName: page.sourceName,
            url: page.url,
            text: pageContent,
        };
        let finalChunk = rawChunk;
        if (transform) {
            finalChunk = await transform(rawChunk, { page });
        }
        // Count tokens on the transformed text, since transform may have changed it.
        return {
            ...finalChunk,
            tokenCount: countTokens(finalChunk.text),
        };
    };
    return await Promise.all(keptDocuments.map(buildChunk));
};
exports.chunkCode = chunkCode;
// Maps a page format name to the language identifier handed to the text
// splitter. Several formats may share one splitter language.
const supportedLanguageMap = {
    c: "cpp",
    cpp: "cpp",
    go: "go",
    java: "java",
    javascript: "js",
    typescript: "js",
    php: "php",
    python: "python",
    ruby: "ruby",
    rust: "rust",
    scala: "scala",
    swift: "swift",
    latex: "latex",
    html: "html",
};
/**
 * Report whether `str` is one of the page formats listed in
 * `supportedLanguageMap`.
 *
 * @param {string} str - Candidate format name.
 * @returns {boolean} True only when `str` is an own key of the map.
 */
function isSupportedLanguage(str) {
    // The `in` operator also matches inherited keys, so names such as
    // "constructor" or "toString" would be reported as supported. Restrict the
    // check to the map's own keys.
    return (typeof str === "string" &&
        Object.prototype.hasOwnProperty.call(supportedLanguageMap, str));
}
exports.isSupportedLanguage = isSupportedLanguage;
/**
 * Translate a page format into the language identifier used for code
 * splitting.
 *
 * @param {string} format - Page format name, e.g. "javascript".
 * @returns {string | undefined} The mapped language, or `undefined` when the
 *   format is not an own key of `supportedLanguageMap`.
 */
function pageFormatToLanguage(format) {
    // A bare bracket lookup on a plain object also returns inherited members:
    // "constructor" would yield a function, which is truthy and would slip
    // past callers that only check for a missing language.
    if (!Object.prototype.hasOwnProperty.call(supportedLanguageMap, format)) {
        return undefined;
    }
    return supportedLanguageMap[format] ?? undefined;
}
exports.pageFormatToLanguage = pageFormatToLanguage;
//# sourceMappingURL=chunkCode.js.map