mongodb-rag-core
Version:
Common elements used by MongoDB Chatbot Framework components.
175 lines • 5.98 kB
JavaScript
"use strict";
// Module-interop helper: reuse one already present on `this` if there is one,
// otherwise define it. A module flagged `__esModule` is returned as-is; any
// other value is wrapped so it can be read through `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.chunkOpenApiSpecYaml = exports.defaultOpenApiSpecYamlChunkOptions = void 0;
const swagger_parser_1 = __importDefault(require("@apidevtools/swagger-parser"));
const text_splitter_1 = require("langchain/text_splitter");
const yaml_1 = __importDefault(require("yaml"));
const gpt3_tokenizer_1 = __importDefault(require("gpt3-tokenizer"));
const logger_1 = require("../logger");
const updateFrontMatter_1 = require("../frontMatter/updateFrontMatter");
/**
 * Default options used by `chunkOpenApiSpecYaml` when the caller does not
 * override them. Sizes are compared against token counts produced by
 * `tokenizer.encode(text).bpe.length`.
 */
exports.defaultOpenApiSpecYamlChunkOptions = {
// Chunk size handed to the text splitter (used when no `yamlChunkSize` is given).
maxChunkSize: 1250,
// Overlap between consecutive chunks, passed straight to the text splitter.
chunkOverlap: 0,
// Chunks of the non-path spec sections are kept only if they have MORE tokens than this.
minChunkSize: 15,
// Tokenizer used both to measure text for the splitter and to compute `tokenCount`.
tokenizer: new gpt3_tokenizer_1.default({ type: "gpt3" }),
};
/**
 * Chunk an OpenAPI specification, supplied as YAML in `page.body`, into
 * text chunks with front matter and metadata.
 *
 * Each entry under each path in `paths` is serialized to YAML on its own,
 * split with the OpenAPI-aware text splitter, and tagged with a
 * `resourceName` of the form `"<KEY> <path>"` (e.g. `"GET /users"`).
 * The remaining top-level sections (`info`, `security`, `servers`, `tags`,
 * `components`) are then serialized and split together; of those chunks only
 * the ones with more than `minChunkSize` tokens are kept.
 *
 * @param {object} page - Page to chunk. `body` holds the YAML spec; `url`,
 *   `sourceName` and (as a fallback API name) `title` are also read.
 * @param {object} [optionsIn] - Overrides merged over
 *   `defaultOpenApiSpecYamlChunkOptions`. The splitter's chunk size comes
 *   from `optionsIn.yamlChunkSize`; a `maxChunkSize` override is ignored.
 * @returns {Promise<object[]>} Chunks in emission order, each with `url`,
 *   `sourceName`, `text`, `tokenCount`, `metadata` and a sequential
 *   `chunkIndex` starting at 0.
 */
const chunkOpenApiSpecYaml = async function (page, optionsIn) {
    const options = {
        ...exports.defaultOpenApiSpecYamlChunkOptions,
        ...optionsIn,
        // Listed last on purpose: the chunk size for YAML is driven by
        // `yamlChunkSize`, not by a caller-supplied `maxChunkSize`.
        maxChunkSize: optionsIn?.yamlChunkSize ??
            exports.defaultOpenApiSpecYamlChunkOptions.maxChunkSize,
    };
    const { tokenizer, maxChunkSize, chunkOverlap } = options;
    const splitter = makeOpenApiSpecYamlTextSplitter({
        chunkOverlap,
        maxChunkSize,
        tokenizer,
    });
    const spec = await swagger_parser_1.default.parse(yaml_1.default.parse(page.body));
    // Normalize once so every chunk of the same spec carries the identical
    // API name; String() guards against non-string titles (e.g. `title: 123`).
    const apiName = String(spec?.info?.title ?? page.title ?? "").trim();
    const baseUrls = spec?.servers?.map((server) => server.url);
    const chunks = [];
    // Prepend front matter, count tokens, and append the finished chunk.
    const addChunk = (body, metadata) => {
        const text = (0, updateFrontMatter_1.updateFrontMatter)(body, metadata);
        chunks.push({
            url: page.url,
            sourceName: page.sourceName,
            text,
            tokenCount: tokenizer.encode(text).bpe.length,
            metadata,
            // Evaluated before the push, so indices run 0, 1, 2, ...
            chunkIndex: chunks.length,
        });
    };
    // Deal with paths
    // NOTE(review): every key of a path item is treated as an operation here,
    // which would include non-method keys such as `parameters` if present.
    for (const [path, actions] of Object.entries(spec.paths ?? {})) {
        for (const [action, methodBody] of Object.entries(actions ?? {})) {
            const resourceName = `${action.trim().toUpperCase()} ${path.trim()}`;
            if (!methodBody) {
                logger_1.logger.info(`Skipping ${resourceName} - no method body`);
                continue;
            }
            const method = {
                [`${path}`]: {
                    [`${action}`]: methodBody,
                },
            };
            const methodChunks = await splitter.splitText(yaml_1.default.stringify(method));
            for (const methodChunk of methodChunks) {
                addChunk(methodChunk.trim(), {
                    resourceName,
                    openApiSpec: true,
                    apiName,
                    baseUrls,
                    specTags: methodBody.tags ?? [],
                });
            }
        }
    }
    // deal with other parts of the spec to index besides paths
    const otherSpecInfoToKeep = {
        info: spec.info,
        security: spec.security,
        servers: spec.servers,
        tags: spec.tags,
        components: spec.components,
    };
    let otherChunks = await splitter.splitText(yaml_1.default.stringify(otherSpecInfoToKeep));
    if (options.minChunkSize !== undefined) {
        const { minChunkSize } = options;
        // Drop fragments too small to be useful (strictly-greater comparison).
        otherChunks = otherChunks.filter((chunk) => tokenizer.encode(chunk).bpe.length > minChunkSize);
    }
    for (const otherChunk of otherChunks) {
        addChunk(otherChunk, {
            openApiSpec: true,
            apiName,
            baseUrls,
        });
    }
    return chunks;
};
// Expose the chunking function on the CommonJS exports object.
exports.chunkOpenApiSpecYaml = chunkOpenApiSpecYaml;
/**
 * Build the recursive text splitter used for YAML-serialized OpenAPI content.
 *
 * Separators are tried in priority order: OpenAPI structural keys first
 * (`paths`, HTTP methods, `requestBody`, `responses`), then response status
 * codes, then `content` / `schema`, and finally generic whitespace fallbacks.
 * Text length is measured in tokens via `tokenizer`, so `maxChunkSize` is a
 * token budget rather than a character count.
 *
 * @param {object} params
 * @param {number} params.chunkOverlap - Overlap passed to the splitter.
 * @param {number} params.maxChunkSize - Passed to the splitter as `chunkSize`.
 * @param {object} params.tokenizer - Object whose `encode(text).bpe` is an
 *   array with one entry per token.
 * @returns {RecursiveCharacterTextSplitter} The configured splitter.
 */
function makeOpenApiSpecYamlTextSplitter({ chunkOverlap, maxChunkSize, tokenizer, }) {
    // A YAML key on its own line, e.g. "\nget:\n".
    // NOTE(review): these match keys only at the very start of a line;
    // presumably indented (nested) keys are not matched - TODO confirm.
    const keySeparator = (key) => `\n${key}:\n`;
    const structuralKeys = [
        "paths",
        "get",
        "post",
        "put",
        "delete",
        "patch",
        "head",
        "options",
        "connect",
        "trace",
        "requestBody",
        "responses",
    ];
    const statusCodes = [
        100, 101, 102,
        200, 201, 202, 204, 206,
        300, 301, 302, 303, 304, 307, 308,
        400, 401, 403, 404, 405, 406, 409, 410, 413, 415, 429,
        500, 501, 502, 503, 504, 505,
    ];
    const separators = [
        ...structuralKeys.map(keySeparator),
        ...statusCodes.map(keySeparator),
        // `content` previously lacked its trailing colon ("\ncontent\n"),
        // unlike every other key separator in this list.
        keySeparator("content"),
        keySeparator("schema"),
        // Generic fallbacks, coarsest to finest.
        "\n\n",
        "\n",
        " ",
        "",
    ];
    return new text_splitter_1.RecursiveCharacterTextSplitter({
        chunkOverlap,
        chunkSize: maxChunkSize,
        separators,
        lengthFunction: (text) => tokenizer.encode(text).bpe.length,
    });
}
//# sourceMappingURL=chunkOpenApiSpecYaml.js.map