docusaurus-plugin-llms-builder
A Docusaurus plugin for generating standardized LLM configuration files (llms.txt, llms-full.txt)
//#region rolldown:runtime
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
key = keys[i];
if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, {
get: ((k) => from[k]).bind(null, key),
enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
});
}
return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", {
value: mod,
enumerable: true
}) : target, mod));
//#endregion
const __docusaurus_utils = __toESM(require("@docusaurus/utils"));
const minimatch = __toESM(require("minimatch"));
const node_assert = __toESM(require("node:assert"));
const node_path = __toESM(require("node:path"));
const node_fs_promises = __toESM(require("node:fs/promises"));
const hookable = __toESM(require("hookable"));
const fuse_js = __toESM(require("fuse.js"));
const __kingsword_node_html_markdown = __toESM(require("@kingsword/node-html-markdown"));
const fast_xml_parser = __toESM(require("fast-xml-parser"));
//#region src/files.ts
const collectPatternsDocsFiles = async (siteDir, docsDir, ignorePatterns) => {
const allDocsFiles = [];
const fullDocsDir = node_path.default.join(siteDir, docsDir);
try {
await node_fs_promises.default.access(fullDocsDir);
const docFiles = await collectMarkdownFiles(siteDir, fullDocsDir, ignorePatterns);
allDocsFiles.push(...docFiles);
} catch (err) {
console.warn(`Failed to read docs directory: ${fullDocsDir}`, err);
}
return allDocsFiles;
};
/**
* Collect the Markdown files for every configured LLM session (docs and blog directories)
*
* @param siteDir - Absolute path to the Docusaurus site directory
* @param llmConfig - LLM configuration containing the session definitions
* @returns Sessions enriched with the list of matching docs file paths
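* @example
* // Illustrative llmConfig shape (field names inferred from usage in this file):
* // { sessions: [{ type: "docs", docsDir: "docs", patterns: { ignorePatterns: ["drafts/**"] } }] }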
*/
const collectLLMSessionFiles = async (siteDir, llmConfig) => {
const llmSessionFiles = [];
for (const session of llmConfig.sessions) {
const docsFiles = await collectPatternsDocsFiles(siteDir, session.docsDir, session.patterns?.ignorePatterns ?? []);
if (docsFiles.length === 0) {
console.warn("No docs files found: ", JSON.stringify(session));
continue;
}
llmSessionFiles.push({
...session,
docsFiles
});
}
return llmSessionFiles;
};
/**
* Filter and order a session's docs files according to its include/ignore/order patterns
*
* @param llmSessionFiles - Session data including the collected docs file paths
* @param pluginSiteConfig - Plugin site configuration (provides siteDir)
* @returns The session data with docsFiles filtered and reordered
*/
const processLLMSessionsFilesWithPatternFilters = async (llmSessionFiles, pluginSiteConfig) => {
const { patterns, docsFiles } = llmSessionFiles;
const { siteDir } = pluginSiteConfig;
let filteredFiles = docsFiles;
if (llmSessionFiles.type === "blog") {
const dateRegex = /(\d{4}-\d{2}-\d{2})/;
filteredFiles = filteredFiles.sort((a, b) => {
const dateA = a.match(dateRegex)?.[0] || "";
const dateB = b.match(dateRegex)?.[0] || "";
return dateB.localeCompare(dateA);
});
}
let filesToProcess = [];
if (patterns) {
if (Array.isArray(patterns.includePatterns) && patterns.includePatterns.length > 0) {
const includePatterns = patterns.includePatterns;
filteredFiles = filteredFiles.filter((file) => {
const relativePath = node_path.default.relative(siteDir, file);
return includePatterns.some((pattern) => (0, minimatch.minimatch)(relativePath, pattern, { matchBase: true }));
});
}
if (Array.isArray(patterns.ignorePatterns) && patterns.ignorePatterns.length > 0) {
const ignorePatterns = patterns.ignorePatterns;
filteredFiles = filteredFiles.filter((file) => {
const relativePath = node_path.default.relative(siteDir, file);
return !ignorePatterns.some((pattern) => (0, minimatch.minimatch)(relativePath, pattern, { matchBase: true }));
});
}
if (Array.isArray(patterns.orderPatterns)) {
const orderPatterns = patterns.orderPatterns;
const matchedFiles = new Set();
for (const pattern of orderPatterns) {
const matchingFiles = filteredFiles.filter((file) => {
const relativePath = node_path.default.relative(siteDir, file);
return (0, minimatch.minimatch)(relativePath, pattern, { matchBase: true }) && !matchedFiles.has(file);
});
for (const file of matchingFiles) {
filesToProcess.push(file);
matchedFiles.add(file);
}
}
const remainingFiles = filteredFiles.filter((file) => !matchedFiles.has(file));
filesToProcess.push(...remainingFiles);
} else if (patterns.orderPatterns) filesToProcess = filteredFiles.sort(patterns.orderPatterns);
else filesToProcess = filteredFiles;
} else filesToProcess = filteredFiles;
llmSessionFiles.docsFiles = filesToProcess;
return llmSessionFiles;
};
/**
* Check if a file should be ignored based on glob patterns
* @param baseDir - Base directory for relative paths
* @param filePath - Path to the file
* @param ignorePatterns - Glob patterns for files to ignore
* @returns Whether the file should be ignored
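* @example
* // Illustrative (hypothetical paths):
* // shouldIgnoreFile("/site", "/site/docs/a.test.md", ["*.test.md"]) // => true (matchBase matches the basename)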
*/
const shouldIgnoreFile = (baseDir, filePath, ignorePatterns = []) => {
if (ignorePatterns.length === 0) return false;
const relativePath = node_path.default.relative(baseDir, filePath);
return ignorePatterns.some((pattern) => (0, minimatch.minimatch)(relativePath, pattern, { matchBase: true }));
};
/**
* Recursively reads all Markdown files in a directory
* @param dir - Directory to scan
* @param baseDir - Base directory for relative paths
* @param ignorePatterns - Glob patterns for files to ignore
* @returns Array of file paths
*/
const collectMarkdownFiles = async (baseDir, dir, ignorePatterns) => {
const files = [];
const entries = await node_fs_promises.default.readdir(dir, { withFileTypes: true });
for (const entry of entries) {
const fullPath = node_path.default.join(dir, entry.name);
if (shouldIgnoreFile(baseDir, fullPath, ignorePatterns)) continue;
if (entry.isDirectory()) {
const subDirFiles = await collectMarkdownFiles(baseDir, fullPath, ignorePatterns);
files.push(...subDirFiles);
} else if (entry.name.endsWith(".md") || entry.name.endsWith(".mdx")) files.push(fullPath);
}
return files;
};
/**
* Get all docusaurus build files paths
* @param outDir - Docusaurus build output directory
* @returns Set of route-like paths for every generated HTML file
*/
const getAllDocusaurusBuildFilesPaths = async (outDir) => {
const existingPaths = new Set();
try {
const files = await node_fs_promises.default.readdir(outDir, { recursive: true });
for (const file of files) {
if (!file.endsWith(".html")) continue;
const fullPath = node_path.default.join(outDir, file.toString());
const stat = await node_fs_promises.default.stat(fullPath);
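// Normalize built HTML paths into route paths: "guide/index.html" -> "guide", the root "index.html" -> "/", other .html files kept as-is.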
if (!stat.isFile()) continue;
if (file.endsWith("index.html")) existingPaths.add(file === "index.html" ? "/" : file.replace("/index.html", ""));
else existingPaths.add(file);
}
} catch (error) {
console.warn("Error reading outDir directory:", error);
}
return existingPaths;
};
//#endregion
//#region src/hooks.ts
const createLlmsHooks = async (llmConfig, llmStdConfig, llmFullStdConfig) => {
const hooks = (0, hookable.createHooks)();
if (typeof llmConfig.hooks === "object") hooks.addHooks(llmConfig.hooks);
else if (typeof llmConfig.hooks === "function") await llmConfig.hooks(hooks);
const context = {
llmConfig: {
llmStdConfig,
llmFullStdConfig
},
hooks
};
return {
hooks,
context
};
};
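/**
* Illustrative hooks usage (shape inferred from this file): llmConfig.hooks may be a
* plain object of hook handlers or a function that receives the hookable instance.
* "generate:prepare" is the hook invoked from generateLLMsTxtFlow below.
*
* hooks: {
*   "generate:prepare": async (ctx) => {
*     // Mutate the generated configs before llms.txt / llms-full.txt are written.
*     ctx.llmConfig.llmStdConfig.title = "Custom title";
*   }
* }
*/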
//#endregion
//#region src/xml.ts
/**
* Parse sitemap.xml and extract URLs
* @param filePath - Path to the sitemap.xml file
* @returns Sorted list of page URLs, or null if the sitemap contains no entries
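* @example
* // Illustrative: for <url><loc>https://example.com/docs/intro</loc></url> entries,
* // resolves to ["https://example.com/docs/intro", ...] sorted by pathname.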
*/
const sitemapParser = async (filePath) => {
const parser = new fast_xml_parser.XMLParser({
isArray: (tagName) => "loc" === tagName,
removeNSPrefix: true
});
const data = parser.parse(await node_fs_promises.default.readFile(filePath, { encoding: "utf-8" }));
if (Array.isArray(data?.urlset?.url) && data.urlset.url.length > 0) {
const locUrls = data.urlset.url.map((url) => url.loc.pop());
return locUrls.sort((a, b) => {
const pathA = new URL(a).pathname;
const pathB = new URL(b).pathname;
return pathA.localeCompare(pathB);
});
}
return null;
};
/**
* Parse RSS XML file and extract title, description, and content from each item.
* @param filePath - Path to the RSS XML file.
* @returns An array of objects, each containing title, description, and content.
*/
const parseRssItems = async (filePath) => {
const xmlContent = await node_fs_promises.default.readFile(filePath, { encoding: "utf-8" });
const parser = new fast_xml_parser.XMLParser({
isArray: (tagName) => tagName === "item",
removeNSPrefix: true,
ignoreAttributes: true,
textNodeName: "#text"
});
const parsedData = parser.parse(xmlContent);
const items = parsedData?.rss?.channel?.item ?? parsedData?.feed?.entry;
if (!Array.isArray(items)) {
console.error("Could not find RSS items in the provided XML file.");
return [];
}
return items.map((item) => {
let content = item.encoded?.__CDATA || item.encoded || item.content || "";
if (content) {
const dom = (0, __kingsword_node_html_markdown.parse)(content);
content = __kingsword_node_html_markdown.NodeHtmlMarkdown.translate(dom.querySelector(".markdown")?.toString() ?? content);
}
return {
title: item.title?.__CDATA || item.title || "",
description: item.description?.__CDATA || item.description || item.summary || "",
content,
link: item.link || item.id || ""
};
});
};
/**
* Parse HTML content from a file
* @param filePath - Path to the HTML file
* @returns Parsed content
*/
const htmlParser = async (filePath) => {
try {
const htmlContent = await node_fs_promises.default.readFile(filePath, "utf8");
const dom = (0, __kingsword_node_html_markdown.parse)(htmlContent);
const title = dom.querySelector("title")?.textContent;
const metaDescription = dom.querySelector("meta[name=\"description\"]")?.getAttribute("content");
return {
title,
description: metaDescription,
content: __kingsword_node_html_markdown.NodeHtmlMarkdown.translate(dom.querySelector(".markdown")?.toString() ?? htmlContent)
};
} catch (error) {
console.warn(`Failed to parse MDX HTML content for file: ${filePath}`, error);
}
return null;
};
//#endregion
//#region src/parser.ts
/**
* Parse the title of a markdown file
*
* @param filePath - Path to the markdown file
* @param frontMatter - Front matter data extracted from the markdown file
* @param content - Raw markdown content
* @param contentTitle - Title extracted from the markdown content
* @returns The resolved title
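* @example
* // Fallback chain: contentTitle -> frontMatter.title -> first "# " heading -> prettified filename.
* // titleParser("getting-started.md", {}, "", undefined) // => "Getting Started"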
*/
const titleParser = (filePath, frontMatter, content, contentTitle) => {
if (contentTitle) return contentTitle;
if (typeof frontMatter.title === "string" && frontMatter.title) return frontMatter.title;
const headingMatch = content.match(/^#\s+(.*)/m);
if (headingMatch && headingMatch[1]) return headingMatch[1].trim();
return node_path.default.basename(filePath, node_path.default.extname(filePath)).replace(/-/g, " ").replace(/\b\w/g, (c) => c.toUpperCase());
};
/**
* Process a markdown file and extract its metadata and content
*
* @param filePath - Path to the markdown file
* @param removeContentTitle - If true, the matching title is removed from the returned content. At least one empty line is guaranteed between the content before and after it, but don't make further assumptions about the remaining whitespace.
* @param siteConfig - Docusaurus config
* @returns
*/
const markdownParser = async (filePath, removeContentTitle, siteConfig) => {
const fileContent = await node_fs_promises.default.readFile(filePath, "utf8");
return await (0, __docusaurus_utils.parseMarkdownFile)({
filePath,
fileContent,
parseFrontMatter: siteConfig.markdown.parseFrontMatter,
removeContentTitle
});
};
/**
* Parse a markdown file into its title, description, content, and resolved site link
* @param options - Parsing options (file path, site config, build file paths, URL prefixes)
* @returns Metadata object with title, description, summary, content, and link
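* @example
* // Resolves to { title, description, summary, content, link }, where link is an
* // absolute URL built from siteUrl and the resolved route path.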
*/
const markdownMetadataParser = async (options) => {
const { type, buildFilesPaths, filePath, removeContentTitle, siteConfig, baseDir, siteUrl, outDir, pathPrefix } = options;
const metadata = await markdownParser(filePath, removeContentTitle ?? false, siteConfig);
const normalizedPath = node_path.default.normalize(node_path.default.relative(baseDir, filePath));
let isMdx = false;
if (normalizedPath.endsWith(".mdx")) isMdx = true;
const linkPathBase = normalizedPath.replace(/\.mdx?$/, "");
const parts = linkPathBase.split("/");
const lastPart = parts[parts.length - 1];
const parentDir = parts[parts.length - 2];
let linkPath;
if (linkPathBase.endsWith("index")) linkPath = linkPathBase.replace(/\/index$/, "");
else if (lastPart === parentDir) linkPath = parts.slice(0, -1).join("/");
else linkPath = linkPathBase;
let finalLinkPath = linkPath;
if (metadata.frontMatter.slug) finalLinkPath = metadata.frontMatter.slug;
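// Blog posts: recent posts keep their date as a /YYYY/MM/DD/ URL segment; posts older than one year drop the date prefix entirely.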
if (type === "blog") {
const dateMatch = linkPath.match(/(\d{4})-(\d{2})-(\d{2})/);
if (dateMatch) {
const currentDate = new Date();
const postDate = new Date(parseInt(dateMatch[1]), parseInt(dateMatch[2]) - 1, parseInt(dateMatch[3]));
const oneYearAgo = new Date();
oneYearAgo.setFullYear(currentDate.getFullYear() - 1);
if (postDate < oneYearAgo) finalLinkPath = linkPath.replace(/\d{4}-\d{2}-\d{2}-/, "");
else finalLinkPath = linkPath.replace(/(\d{4})-(\d{2})-(\d{2})-/, "$1/$2/$3");
}
} else finalLinkPath = finalLinkPath.replace(/^(\d{1,2})-/, "");
let content = metadata.content;
if (finalLinkPath === "/") finalLinkPath = "";
else if (!buildFilesPaths.has(finalLinkPath)) finalLinkPath = findBestMatch(node_path.default.join(pathPrefix ?? "", finalLinkPath), buildFilesPaths);
const link = new URL(finalLinkPath, siteUrl).toString();
const title = titleParser(filePath, metadata.frontMatter, metadata.content, metadata.contentTitle);
let description = typeof metadata.frontMatter.description === "string" && metadata.frontMatter.description || metadata.excerpt || "";
if (isMdx) {
const htmlParserResult = await htmlParser(node_path.default.join(outDir, finalLinkPath, "index.html"));
if (htmlParserResult) {
if (!description) description = htmlParserResult.description ?? "";
content = htmlParserResult.content;
}
}
return {
title,
description,
summary: metadata.excerpt,
content,
link
};
};
/**
* Find the best matching string in an array using fuzzy search
* @param needle - The string to search for
* @param haystack - Array of strings to search in
* @param options - Optional Fuse.js options
* @returns The best matching string, or the needle itself if no match is found
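* @example
* // Illustrative: resolve a route against the set of built paths.
* // findBestMatch("docs/intro", new Set(["docs/intro", "blog"])) // => "docs/intro" (exact hit, no fuzzy search needed)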
*/
const findBestMatch = (needle, haystack, options = {
threshold: .6,
includeScore: true
}) => {
if (haystack.has(needle)) return needle;
const haystackArray = Array.from(haystack);
const fuse = new fuse_js.default(haystackArray, options);
const results = fuse.search(needle).sort((a, b) => {
if (a.score === b.score) return b.refIndex - a.refIndex;
return (a.score ?? 0) - (b.score ?? 0);
});
return results[0]?.item ?? needle;
};
//#endregion
//#region src/llmstxt.ts
const generateLLMStdConfig = async (stdConfig, buildFilesPaths, llmSessionFiles, pluginSiteConfig) => {
for await (const llmSessionFile of llmSessionFiles) {
const session = {
sessionName: llmSessionFile.sessionName ?? llmSessionFile.docsDir,
source: "normal",
items: []
};
for await (const filePath of llmSessionFile.docsFiles) {
const { title, description, link } = await markdownMetadataParser({
type: llmSessionFile.type,
buildFilesPaths,
filePath,
siteConfig: pluginSiteConfig.siteConfig,
baseDir: node_path.default.join(pluginSiteConfig.siteDir, llmSessionFile.docsDir),
siteUrl: pluginSiteConfig.siteUrl,
outDir: pluginSiteConfig.outDir,
pathPrefix: llmSessionFile.docsDir,
removeContentTitle: true
});
session.items.push({
title: title ?? "",
description: description ?? "",
link: link ?? ""
});
}
if (session.items.length > 0) stdConfig.sessions.push(session);
}
return stdConfig;
};
const generateLLMFullStdConfig = async (stdFullConfig, buildFilesPaths, llmSessionFiles, pluginSiteConfig) => {
for await (const llmSessionFile of llmSessionFiles) for await (const filePath of llmSessionFile.docsFiles) {
const { title, content, link } = await markdownMetadataParser({
type: llmSessionFile.type,
buildFilesPaths,
filePath,
siteConfig: pluginSiteConfig.siteConfig,
baseDir: node_path.default.join(pluginSiteConfig.siteDir, llmSessionFile.docsDir),
siteUrl: pluginSiteConfig.siteUrl,
outDir: pluginSiteConfig.outDir,
pathPrefix: llmSessionFile.docsDir,
removeContentTitle: true
});
stdFullConfig.sessions.push({
title: title ?? "",
content,
link
});
}
return stdFullConfig;
};
/**
* Standardize the content of llms.txt
* @param llmStdConfig - Standard llms.txt configuration with the collected sessions
* @param extraSession - Optional extra-links section appended after the table of contents
* @returns The llms.txt file content
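* @example
* // Output sketch:
* // # <title>
* //
* // > <description>
* //
* // ## Table of Contents
* //
* // ### <sessionName>
* //
* // - [<item.title>](<item.link>): <item.description>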
*/
const standardizeLLMsTxtContent = (llmStdConfig, extraSession) => {
const headerSection = [
`# ${llmStdConfig.title}`,
`> ${llmStdConfig.description}`,
llmStdConfig.summary
].filter(Boolean).join("\n\n");
const sessionsContent = `\n\n## Table of Contents` + llmStdConfig.sessions.map((session) => {
const sessionHeader = `\n\n### ${session.sessionName}\n\n`;
const sessionItems = session.items.map((item) => {
const baseLink = `- [${item.title}](${item.link})`;
return item.description ? `${baseLink}: ${item.description}` : `${baseLink}`;
}).join("\n");
return sessionHeader + sessionItems;
}).join("");
let extraContent = "";
if (extraSession) {
const extraLinksHeader = `\n\n### ${extraSession.sessionName}\n\n`;
extraContent = extraLinksHeader + extraSession.extraLinks.map((link) => {
const baseLink = `- [${link.title}](${link.link})`;
return link.description ? `${baseLink}: ${link.description}` : `${baseLink}`;
}).join("\n");
}
return headerSection + sessionsContent + extraContent;
};
/**
* Standardize the content of llms-full.txt
* @param llmFullStdConfig - Full-content configuration with the collected page entries
* @returns The llms-full.txt file content
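* @example
* // Output sketch: the header followed by one block per page:
* // ---
* // url: <link>
* // ---
* // # <title>
* // <content>
* // ---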
*/
const standardizeLLMsFullTxtContent = (llmFullStdConfig) => {
const headerSection = [
`# ${llmFullStdConfig.title}`,
`> ${llmFullStdConfig.description}`,
llmFullStdConfig.summary
].filter(Boolean).join("\n\n");
const sessionsContent = llmFullStdConfig.sessions.map((session) => {
const sessionHeader = `\n\n---\nurl: ${session.link}\n---\n${session.title ? "# " + session.title + "\n" : ""}`;
const sessionItems = `\n${session.content.trim()}\n`;
return sessionHeader + sessionItems + "\n---";
}).join("");
return headerSection + sessionsContent;
};
/**
* Generate llms.txt or llms-full.txt
* @param outDir - Build output directory
* @param filename - Output file name (e.g. llms.txt or llms-full.txt)
* @param content - File content to write
* @returns Promise that resolves once the file has been written
*/
const generateLLMsTxt = async (outDir, filename, content) => {
return (0, __docusaurus_utils.generate)(outDir, filename, content, true);
};
const initializeLLMConfigurations = (config) => {
return {
llmStdConfig: {
title: config.title ?? "",
description: config.description ?? "",
summary: config.summary ?? "",
sessions: []
},
llmFullStdConfig: {
title: config.title ?? "",
description: config.description ?? "",
summary: config.summary ?? "",
processedUrls: new Set(),
sessions: []
}
};
};
const processDocumentationSession = async (sessionFileData, siteConfig, standardConfig, fullContentConfig) => {
(0, node_assert.default)(sessionFileData.type === "docs", `Session ${sessionFileData.docsDir} is not a docs type`);
const sessionItem = {
sessionName: sessionFileData.sessionName ?? sessionFileData.docsDir,
source: "sitemap",
items: []
};
const { ignorePatterns, includePatterns, orderPatterns } = sessionFileData.patterns ?? {};
const sitemapPath = node_path.default.join(siteConfig.outDir, sessionFileData.sitemap);
const urlList = await sitemapParser(sitemapPath);
if (!urlList) return {
llmStdConfig: standardConfig,
llmFullStdConfig: fullContentConfig
};
let matchedUrls = [];
if (Array.isArray(orderPatterns)) {
for await (const orderPattern of orderPatterns) {
const matchedUrlsByPattern = urlList.filter((url) => (0, minimatch.minimatch)(url, orderPattern, { matchBase: true }));
matchedUrlsByPattern.forEach((url) => matchedUrls.push(url));
}
const unmatchedUrls = urlList.filter((url) => !matchedUrls.includes(url));
matchedUrls = matchedUrls.concat(unmatchedUrls);
} else if (orderPatterns) matchedUrls = urlList.sort(orderPatterns);
else matchedUrls = urlList;
for await (const pageUrl of matchedUrls) {
const htmlFilePath = decodeURIComponent(node_path.default.join(siteConfig.outDir, pageUrl.replace(siteConfig.siteUrl, ""), "index.html"));
if (ignorePatterns && ignorePatterns.some((pattern) => (0, minimatch.minimatch)(pageUrl, pattern, { matchBase: true }))) continue;
if (includePatterns && !includePatterns.some((pattern) => (0, minimatch.minimatch)(pageUrl, pattern, { matchBase: true }))) continue;
const htmlParserResult = await htmlParser(htmlFilePath);
const pageTitle = htmlParserResult?.title ?? "";
const content = htmlParserResult?.content ?? "";
sessionItem.items.push({
title: pageTitle,
link: pageUrl,
description: htmlParserResult?.description ?? ""
});
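// Deduplicate full-content entries: each URL contributes its full page content only once.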
if (fullContentConfig.processedUrls.has(pageUrl)) continue;
fullContentConfig.processedUrls.add(pageUrl);
fullContentConfig.sessions.push({
title: pageTitle,
link: pageUrl,
content
});
}
standardConfig.sessions.push(sessionItem);
return {
llmStdConfig: standardConfig,
llmFullStdConfig: fullContentConfig
};
};
const processBlogSession = async (sessionFileData, siteConfig, standardConfig, fullContentConfig) => {
(0, node_assert.default)(sessionFileData.type === "blog", `Session ${sessionFileData.docsDir} is not a blog type`);
const { ignorePatterns, includePatterns, orderPatterns } = sessionFileData.patterns ?? {};
const sessionItem = {
sessionName: sessionFileData.sessionName ?? sessionFileData.docsDir,
source: "rss",
items: []
};
const rssFilePath = node_path.default.join(siteConfig.outDir, sessionFileData.docsDir, sessionFileData.rss);
const blogEntries = await parseRssItems(rssFilePath);
let matchedRssFeedItems = [];
if (Array.isArray(orderPatterns)) {
for await (const orderPattern of orderPatterns) {
const matchedUrlsByPattern = blogEntries.filter((entry) => (0, minimatch.minimatch)(entry.link, orderPattern, { matchBase: true }));
matchedUrlsByPattern.forEach((rssFeedItem) => matchedRssFeedItems.push(rssFeedItem));
}
const unmatchedUrls = blogEntries.filter((entry) => !matchedRssFeedItems.includes(entry));
matchedRssFeedItems = matchedRssFeedItems.concat(unmatchedUrls);
} else if (orderPatterns) {
const entryMap = new Map(blogEntries.map((entry) => [entry.link, entry]));
matchedRssFeedItems = Array.from(entryMap.keys()).sort(orderPatterns).map((link) => entryMap.get(link)).filter(Boolean);
}
else matchedRssFeedItems = blogEntries;
for await (const blogEntry of matchedRssFeedItems) {
if (ignorePatterns && ignorePatterns.some((pattern) => (0, minimatch.minimatch)(blogEntry.link, pattern, { matchBase: true }))) continue;
if (includePatterns && !includePatterns.some((pattern) => (0, minimatch.minimatch)(blogEntry.link, pattern, { matchBase: true }))) continue;
sessionItem.items.push({
title: blogEntry.title,
description: blogEntry.description,
link: blogEntry.link
});
if (fullContentConfig.processedUrls.has(blogEntry.link)) continue;
fullContentConfig.processedUrls.add(blogEntry.link);
fullContentConfig.sessions.push({
title: blogEntry.title,
link: blogEntry.link,
content: blogEntry.content ?? ""
});
}
standardConfig.sessions.push(sessionItem);
return {
llmStdConfig: standardConfig,
llmFullStdConfig: fullContentConfig
};
};
const processGenericSession = async (sessionFileData, siteConfig, buildFilePaths, processedSessionFiles, standardConfig, fullContentConfig) => {
const processedSessionFile = await processLLMSessionsFilesWithPatternFilters(sessionFileData, siteConfig);
processedSessionFiles.push(processedSessionFile);
const updatedStandardConfig = await generateLLMStdConfig(standardConfig, buildFilePaths, processedSessionFiles, siteConfig);
const updatedFullContentConfig = await generateLLMFullStdConfig(fullContentConfig, buildFilePaths, processedSessionFiles, siteConfig);
return {
llmStdConfig: updatedStandardConfig,
llmFullStdConfig: updatedFullContentConfig
};
};
const generateOutputFiles = async (llmConfig, siteConfig, standardConfig, fullContentConfig) => {
const fileNamePrefix = llmConfig.infixName ? `llms-${llmConfig.infixName}` : "llms";
if (llmConfig.generateLLMsTxt) {
const standardContent = standardizeLLMsTxtContent(standardConfig, llmConfig.extraSession);
await generateLLMsTxt(siteConfig.outDir, `${fileNamePrefix}.txt`, standardContent);
}
if (llmConfig.generateLLMsFullTxt) {
const fullContent = standardizeLLMsFullTxtContent(fullContentConfig);
await generateLLMsTxt(siteConfig.outDir, `${fileNamePrefix}-full.txt`, fullContent);
}
};
const generateLLMsTxtFlow = async (context) => {
const { pluginSiteConfig: siteConfig, llmConfigs } = context;
const buildFilePaths = await getAllDocusaurusBuildFilesPaths(siteConfig.outDir);
for await (const currentLLMConfig of llmConfigs) {
if (!currentLLMConfig.generateLLMsTxt && !currentLLMConfig.generateLLMsFullTxt) continue;
const sessionFilesList = await collectLLMSessionFiles(siteConfig.siteDir, currentLLMConfig);
if (sessionFilesList.length === 0) {
console.warn("No session files found: ", JSON.stringify(currentLLMConfig));
continue;
}
let { llmStdConfig: currentStandardConfig, llmFullStdConfig: currentFullContentConfig } = initializeLLMConfigurations(currentLLMConfig);
for await (const sessionFileData of sessionFilesList) if (sessionFileData.type === "docs" && sessionFileData.sitemap) {
const { llmStdConfig, llmFullStdConfig } = await processDocumentationSession(sessionFileData, siteConfig, currentStandardConfig, currentFullContentConfig);
currentStandardConfig = llmStdConfig;
currentFullContentConfig = llmFullStdConfig;
} else if (sessionFileData.type === "blog" && sessionFileData.rss) {
const { llmStdConfig, llmFullStdConfig } = await processBlogSession(sessionFileData, siteConfig, currentStandardConfig, currentFullContentConfig);
currentStandardConfig = llmStdConfig;
currentFullContentConfig = llmFullStdConfig;
} else {
const processedSessionFiles = [];
const { llmStdConfig, llmFullStdConfig } = await processGenericSession(sessionFileData, siteConfig, buildFilePaths, processedSessionFiles, currentStandardConfig, currentFullContentConfig);
currentStandardConfig = llmStdConfig;
currentFullContentConfig = llmFullStdConfig;
}
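// Sitemap-sourced sessions act as a catch-all: drop links already listed by another session.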
currentStandardConfig.sessions = currentStandardConfig.sessions.map((session) => {
if (session.source === "sitemap") {
const otherSessionUrls = new Set(currentStandardConfig.sessions.filter((s) => s.sessionName !== session.sessionName).flatMap((s) => s.items.map((item) => item.link)));
session.items = session.items.filter((item) => !otherSessionUrls.has(item.link));
}
return session;
});
const { hooks, context: context$1 } = await createLlmsHooks(currentLLMConfig, currentStandardConfig, currentFullContentConfig);
await hooks.callHook("generate:prepare", context$1);
await generateOutputFiles(currentLLMConfig, siteConfig, context$1.llmConfig.llmStdConfig, context$1.llmConfig.llmFullStdConfig);
}
};
//#endregion
//#region src/index.ts
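/**
* Illustrative registration in docusaurus.config.js (option shape inferred from this bundle):
*
* plugins: [
*   ["docusaurus-plugin-llms-builder", {
*     version: "1.0.0",
*     llmConfigs: [{
*       title: "My Site",
*       description: "Documentation for My Site",
*       generateLLMsTxt: true,
*       generateLLMsFullTxt: true,
*       sessions: [{ type: "docs", docsDir: "docs" }]
*     }]
*   }]
* ]
*/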
function defineDocusaurusPlugins(context, options) {
const { siteConfig, siteDir, outDir, siteVersion } = context;
const { version, llmConfigs } = options;
const siteUrl = siteConfig.url + (siteConfig.baseUrl.endsWith("/") ? siteConfig.baseUrl.slice(0, -1) : siteConfig.baseUrl || "");
const pluginContext = {
pluginSiteConfig: {
version: version ?? siteVersion ?? "1.0.0",
outDir,
siteDir,
siteConfig,
siteUrl
},
llmConfigs
};
return {
name: "docusaurus-plugin-llms-builder",
async postBuild() {
await generateLLMsTxtFlow(pluginContext);
},
extendCli(cli) {
cli.command("llms").description("Generate llms.txt and llms-full.txt file by scanning all documentation files in the directory").action(async () => {
await generateLLMsTxtFlow(pluginContext);
});
}
};
}
//#endregion
module.exports = defineDocusaurusPlugins;