UNPKG

astro-llms-generate

Version:

Astro integration to automatically generate AI-friendly documentation files: /llms.txt, /llms-small.txt, and /llms-full.txt

378 lines (318 loc) 11.5 kB
import type { AstroConfig, AstroIntegration } from "astro";
import fs from "fs/promises";
import path from "path";
import { fileURLToPath } from "url";
import { JSDOM } from "jsdom";
import { SimpleMarkdown } from "./simple-markdown";

/** Data collected from one built HTML page. */
export interface PageData {
  /** Page pathname as reported by Astro's `astro:build:done` hook. */
  pathname: string;
  /** First `<h1>`, else `<title>`, else the last pathname segment, else "Untitled". */
  title: string;
  /** Content of `<meta name="description">`, when present. */
  description?: string;
  /** Markdown rendering of the page's `<main>` (or `<body>`) element. */
  content?: string;
  slug?: string;
  order?: number;
}

/** User-facing options of the integration. Every field is optional. */
export interface LlmsConfig {
  /** Heading of the generated files. Defaults to a title derived from `site`. */
  title?: string;
  /** Blockquote shown under the heading. Defaults to the package.json description. */
  description?: string;
  /**
   * Globs (`**`, `*`, `?`) matched against page pathnames; a page must match at
   * least one. An empty list includes every page.
   */
  includePatterns?: string[];
  /** Globs (`**`, `*`, `?`); a page matching any of them is skipped. */
  excludePatterns?: string[];
  /** Text placed between pages in llms-full.txt. */
  customSeparator?: string;
}

// Resolved configurations, keyed by `site` + user config (see createCacheKey).
const configurationCache = new Map<string, Required<LlmsConfig>>();

/**
 * Astro integration to automatically generate AI-friendly documentation files.
 * Generates /llms.txt, /llms-small.txt, and /llms-full.txt in the build directory only.
 *
 * @param userConfig Optional overrides; missing fields receive defaults.
 * @returns The integration object to list under `integrations` in the Astro config.
 */
export default function astroLLMsGenerator(userConfig: LlmsConfig = {}): AstroIntegration {
  let astroConfiguration: AstroConfig;

  return {
    name: "astro-llms-generate",
    hooks: {
      "astro:config:setup": ({ config }) => {
        astroConfiguration = config;
      },
      "astro:build:start": async ({ logger }) => {
        logger.info("Starting LLMs documentation generation...");
      },
      "astro:build:done": async ({ dir, pages, logger }) => {
        const distDirectory = fileURLToPath(dir);

        try {
          const config = await generateSmartDefaults(astroConfiguration, userConfig, distDirectory);
          const selectedPages = filterPagesByPatterns(
            pages,
            config.includePatterns,
            config.excludePatterns
          );
          const pageDataList = await discoverAndProcessPages(
            selectedPages,
            distDirectory,
            astroConfiguration
          );

          // Generate files only in build directory
          await Promise.all([
            generateLlmsIndexFile(pageDataList, config, distDirectory, astroConfiguration),
            generateLlmsSmallFile(pageDataList, config, distDirectory, astroConfiguration),
            generateLlmsFullFile(pageDataList, config, distDirectory)
          ]);

          logger.info("✅ Generated llms.txt, llms-small.txt, and llms-full.txt");
          logger.info("Available in build output dir");
        } catch (error) {
          // Best effort: a failure here is reported but does not fail the build.
          logger.error(`Failed to generate LLMs files: ${error}`);
        }
      },
    },
  };
}

/**
 * Fill in every missing option of the user configuration, with caching.
 *
 * @param astroConfig Resolved Astro configuration; only `site` is read.
 * @param userConfig Options passed to the integration.
 * @param distDirectory Build output directory (currently unused).
 * @returns The complete configuration.
 */
async function generateSmartDefaults(
  astroConfig: AstroConfig,
  userConfig: LlmsConfig,
  distDirectory: string
): Promise<Required<LlmsConfig>> {
  const cacheKey = createCacheKey(astroConfig, userConfig);
  const cachedConfig = configurationCache.get(cacheKey);
  if (cachedConfig) {
    return cachedConfig;
  }

  const packageDescription = await extractPackageDescription();
  const autoGeneratedTitle = generateTitleFromSite(astroConfig.site);

  const completeConfig: Required<LlmsConfig> = {
    title: userConfig.title || autoGeneratedTitle,
    description:
      userConfig.description ||
      packageDescription ||
      `AI-friendly documentation for ${autoGeneratedTitle}`,
    includePatterns: userConfig.includePatterns || ["**/*"],
    excludePatterns: userConfig.excludePatterns || ["**/404*", "**/500*", "**/api/**"],
    customSeparator: userConfig.customSeparator || "\n\n---\n\n"
  };

  configurationCache.set(cacheKey, completeConfig);
  return completeConfig;
}

/**
 * Keep the pages selected by the include/exclude globs.
 *
 * A page is kept when it matches at least one include pattern (or the include
 * list is empty) and matches no exclude pattern. Leading and trailing slashes
 * of pathnames and patterns are ignored while matching.
 */
function filterPagesByPatterns<T extends { pathname: string }>(
  pages: readonly T[],
  includePatterns: readonly string[],
  excludePatterns: readonly string[]
): T[] {
  // Compile once per build instead of once per page.
  const includeMatchers = includePatterns.map((pattern) => globToRegExp(pattern));
  const excludeMatchers = excludePatterns.map((pattern) => globToRegExp(pattern));

  return pages.filter((page) => {
    const candidate = stripSlashes(page.pathname);
    const isIncluded =
      includeMatchers.length === 0 || includeMatchers.some((matcher) => matcher.test(candidate));
    return isIncluded && !excludeMatchers.some((matcher) => matcher.test(candidate));
  });
}

/**
 * Translate a simple glob into an anchored regular expression.
 *
 * Supported syntax: a leading or inner `**` followed by `/` (zero or more
 * directories), a trailing `/**` (the directory itself and everything below
 * it), `**` (anything), `*` (anything except `/`) and `?` (one character
 * except `/`). Every other character is matched literally.
 */
function globToRegExp(pattern: string): RegExp {
  const glob = stripSlashes(pattern);
  let source = "";
  let index = 0;

  while (index < glob.length) {
    if (glob.startsWith("/**", index) && index + 3 === glob.length) {
      source += "(?:/.*)?";
      index += 3;
    } else if (glob.startsWith("**/", index)) {
      source += "(?:.*/)?";
      index += 3;
    } else if (glob.startsWith("**", index)) {
      source += ".*";
      index += 2;
    } else if (glob.startsWith("*", index)) {
      source += "[^/]*";
      index += 1;
    } else if (glob.startsWith("?", index)) {
      source += "[^/]";
      index += 1;
    } else {
      // Escape regex metacharacters so they match themselves.
      source += glob.charAt(index).replace(/[.+^${}()|[\]\\]/g, "\\$&");
      index += 1;
    }
  }

  return new RegExp(`^${source}$`);
}

/** Remove leading and trailing slashes. */
function stripSlashes(value: string): string {
  return value.replace(/^\/+|\/+$/g, "");
}

/**
 * Memory-efficient page discovery with small batch processing.
 *
 * @returns Data of every page that could be processed, sorted by pathname.
 */
async function discoverAndProcessPages(
  pages: { pathname: string }[],
  distDirectory: string,
  astroConfig: AstroConfig
): Promise<PageData[]> {
  const processedPages: PageData[] = [];
  const batchSize = 5; // Reduced batch size for memory efficiency

  for (let i = 0; i < pages.length; i += batchSize) {
    const currentBatch = pages.slice(i, i + batchSize);
    const batchResults = await processBatchOfPages(currentBatch, distDirectory, astroConfig);
    processedPages.push(...batchResults);

    // Clear memory between batches; `gc` exists only when Node runs with --expose-gc.
    if (global.gc) {
      global.gc();
    }
  }

  return sortPagesByPathname(processedPages);
}

/**
 * Process a batch of pages in parallel.
 * A page that cannot be located or parsed is reported with a warning and skipped.
 */
async function processBatchOfPages(
  pageBatch: { pathname: string }[],
  distDirectory: string,
  astroConfig: AstroConfig
): Promise<PageData[]> {
  const batchPromises = pageBatch.map(async (page) => {
    try {
      const htmlFilePath = await resolveHtmlFilePath(page.pathname, distDirectory);
      return await extractPageDataFromHtml(htmlFilePath, page.pathname, astroConfig);
    } catch {
      console.warn(`⚠️ Could not process page: ${page.pathname}`);
      return null;
    }
  });

  const batchResults = await Promise.all(batchPromises);
  return batchResults.filter((page): page is PageData => page !== null);
}

/**
 * Find the built HTML file of a page.
 *
 * @returns The first candidate path that is accessible.
 * @throws Error when none of the candidate paths is accessible.
 */
async function resolveHtmlFilePath(pathname: string, distDirectory: string): Promise<string> {
  for (const candidate of getHtmlFileCandidates(pathname, distDirectory)) {
    try {
      await fs.access(candidate);
      return candidate;
    } catch {
      // Not accessible; try the next layout.
    }
  }
  throw new Error(`No HTML file found for page "${pathname}"`);
}

/**
 * List the locations a page may have been written to, most likely first:
 * the path chosen by getHtmlFilePath, then `<pathname>/index.html` and
 * `<pathname>.html`.
 */
function getHtmlFileCandidates(pathname: string, distDirectory: string): string[] {
  const preferred = getHtmlFilePath(pathname, distDirectory);
  const withoutTrailingSlash = pathname.replace(/\/+$/, "");

  const alternatives = [path.join(distDirectory, pathname, "index.html")];
  // The root page has no `<name>.html` form.
  if (withoutTrailingSlash) {
    alternatives.push(path.join(distDirectory, `${withoutTrailingSlash}.html`));
  }

  return [preferred, ...alternatives.filter((candidate) => candidate !== preferred)];
}

/**
 * Extract title, description and Markdown content from a built HTML file.
 *
 * @throws Error wrapping any read or parse failure, naming the file.
 */
async function extractPageDataFromHtml(
  htmlFilePath: string,
  pathname: string,
  astroConfig: AstroConfig
): Promise<PageData> {
  try {
    const htmlContent = await fs.readFile(htmlFilePath, "utf-8");
    const documentModel = new JSDOM(htmlContent);

    try {
      const document = documentModel.window.document;

      // The title must be read first: content extraction removes the <h1>.
      const extractedTitle = extractTitleFromDocument(document, pathname);
      const metaDescription = extractMetaDescription(document);
      const mainContent = await extractMainContentAsMarkdown(document);

      return {
        pathname,
        title: extractedTitle,
        description: metaDescription,
        content: mainContent.trim(),
        slug: pathname
      };
    } finally {
      // Release the JSDOM instance even when extraction fails.
      documentModel.window.close();
    }
  } catch (error) {
    throw new Error(`Failed to extract page data from ${htmlFilePath}: ${error}`);
  }
}

/**
 * Generate llms.txt index file in build directory
 */
async function generateLlmsIndexFile(
  pages: PageData[],
  config: Required<LlmsConfig>,
  distDirectory: string,
  astroConfig: AstroConfig
): Promise<void> {
  const contentLines = createIndexFileContent(pages, config, getSiteBaseUrl(astroConfig));
  await fs.writeFile(path.join(distDirectory, "llms.txt"), contentLines, "utf-8");
}

/**
 * Generate llms-small.txt structure file in build directory
 */
async function generateLlmsSmallFile(
  pages: PageData[],
  config: Required<LlmsConfig>,
  distDirectory: string,
  astroConfig: AstroConfig
): Promise<void> {
  const contentLines = createSmallFileContent(pages, config, getSiteBaseUrl(astroConfig));
  await fs.writeFile(path.join(distDirectory, "llms-small.txt"), contentLines, "utf-8");
}

/**
 * Generate llms-full.txt content file in build directory
 */
async function generateLlmsFullFile(
  pages: PageData[],
  config: Required<LlmsConfig>,
  distDirectory: string
): Promise<void> {
  const contentLines = createFullFileContent(pages, config);
  await fs.writeFile(path.join(distDirectory, "llms-full.txt"), contentLines, "utf-8");
}

// ====== UTILITY FUNCTIONS ======

/**
 * Compute the URL that page pathnames are resolved against.
 *
 * @returns "" when `site` is not configured; `site` itself when `base` is the
 * root; otherwise `site` combined with `base`, ending in a slash.
 */
function getSiteBaseUrl(astroConfig: AstroConfig): string {
  const site = astroConfig.site;
  if (!site) return "";

  const base = astroConfig.base;
  if (!base || base === "/") return site;

  const siteWithBase = new URL(base, site);
  // Without a trailing slash the last segment of `base` would be replaced
  // when a relative pathname is resolved against this URL.
  if (!siteWithBase.pathname.endsWith("/")) {
    siteWithBase.pathname += "/";
  }
  return siteWithBase.toString();
}

/**
 * Build the link target of a page: absolute when a base URL is known,
 * otherwise the pathname itself ("./" for the root page, whose pathname is empty).
 */
function resolvePageUrl(pathname: string, baseUrl: string): string {
  if (baseUrl) {
    return new URL(pathname, baseUrl).toString();
  }
  return pathname === "" ? "./" : pathname;
}

/** Render llms.txt: heading, description and page links grouped by directory. */
function createIndexFileContent(
  pages: PageData[],
  config: Required<LlmsConfig>,
  baseUrl: string
): string {
  const lines: string[] = [
    `# ${config.title}`,
    `> ${config.description}`,
    "",
    "## Pages",
    ""
  ];

  const groupedPages = groupPagesByDirectory(pages);

  for (const [directoryName, directoryPages] of Object.entries(groupedPages)) {
    if (directoryName !== "/") {
      lines.push(`### ${directoryName}`);
      lines.push("");
    }

    for (const page of directoryPages) {
      const pageUrl = resolvePageUrl(page.pathname, baseUrl);
      const pageDescription = page.description ? ` - ${page.description}` : "";
      lines.push(`- [${page.title}](${pageUrl})${pageDescription}`);
    }
    lines.push("");
  }

  lines.push("", "*Auto-generated documentation index*");
  return lines.join("\n").trim();
}

/** Render llms-small.txt: heading plus a flat list of page links. */
function createSmallFileContent(
  pages: PageData[],
  config: Required<LlmsConfig>,
  baseUrl: string
): string {
  const lines: string[] = [
    `# ${config.title}`,
    "> Structure-only documentation",
    ""
  ];

  for (const page of pages) {
    lines.push(`- [${page.title}](${resolvePageUrl(page.pathname, baseUrl)})`);
  }

  return lines.join("\n").trim();
}

/**
 * Render llms-full.txt: heading, description and the Markdown of every page
 * that has content, joined with `config.customSeparator`.
 */
function createFullFileContent(pages: PageData[], config: Required<LlmsConfig>): string {
  const lines: string[] = [
    `# ${config.title}`,
    `> ${config.description}`,
    "",
    "*Complete documentation content below*",
    ""
  ];

  const pageContents = pages.flatMap((page) => {
    // Pages without content are left out entirely.
    if (!page.content) return [];

    const parts = [`# ${page.title}`];
    if (page.description) {
      parts.push(`> ${page.description}`);
    }
    parts.push("", page.content);
    return [parts.join("\n")];
  });

  lines.push(pageContents.join(config.customSeparator));
  return lines.join("\n").trim();
}

/**
 * Group pages by the name of their immediate parent directory.
 * Pages at the top level are grouped under "/".
 */
function groupPagesByDirectory(pages: PageData[]): Record<string, PageData[]> {
  // Null prototype: directory names such as "constructor" or "toString" must
  // not resolve to members inherited from Object.prototype.
  const groups: Record<string, PageData[]> = Object.create(null);

  for (const page of pages) {
    const directoryPath = path.dirname(page.pathname);
    const directoryName =
      directoryPath === "/" || directoryPath === "."
        ? "/"
        : directoryPath.split("/").filter(Boolean).pop() || "/";

    const existingGroup = groups[directoryName];
    if (existingGroup) {
      existingGroup.push(page);
    } else {
      groups[directoryName] = [page];
    }
  }

  return groups;
}

/** Pick a title: first `<h1>`, else `<title>`, else last pathname segment, else "Untitled". */
function extractTitleFromDocument(document: Document, pathname: string): string {
  const h1Element = document.querySelector("h1");
  const titleElement = document.querySelector("title");

  return (
    h1Element?.textContent?.trim() ||
    titleElement?.textContent?.trim() ||
    pathname.split("/").filter(Boolean).pop() ||
    "Untitled"
  );
}

/** Read the trimmed content of `<meta name="description">`, if any. */
function extractMetaDescription(document: Document): string | undefined {
  return document
    .querySelector('meta[name="description"]')
    ?.getAttribute("content")
    ?.trim();
}

/**
 * Convert the page's `<main>` (or, failing that, `<body>`) to Markdown.
 * Note: removes the first `<h1>` inside that element from the document.
 */
async function extractMainContentAsMarkdown(document: Document): Promise<string> {
  const mainElement = document.querySelector("main") || document.querySelector("body");
  if (!mainElement) return "";

  // Remove title to avoid duplication
  const h1Element = mainElement.querySelector("h1");
  if (h1Element) h1Element.remove();

  // NOTE(review): the selector list and the trailing flag are interpreted by
  // ./simple-markdown; presumably the selectors name elements to drop - confirm there.
  return await SimpleMarkdown(
    mainElement.innerHTML.trim(),
    ['header', 'footer', 'nav', '.no-llms', 'script', 'style'],
    false
  );
}

/**
 * Guess the built HTML file of a page from its pathname alone:
 * `<pathname>/index.html`, or `<pathname>.html` when the pathname contains a
 * dot and has no trailing slash.
 */
function getHtmlFilePath(pathname: string, distDirectory: string): string {
  if (pathname.endsWith("/")) {
    return path.join(distDirectory, pathname, "index.html");
  }

  const htmlFilePath = path.join(distDirectory, pathname + ".html");
  const indexFilePath = path.join(distDirectory, pathname, "index.html");

  return pathname.includes(".") ? htmlFilePath : indexFilePath;
}

/** Build the cache key from the configured `site` and the user options. */
function createCacheKey(astroConfig: AstroConfig, userConfig: LlmsConfig): string {
  return JSON.stringify({
    astroConfig: astroConfig.site,
    userConfig
  });
}

/** Return a new array sorted by pathname; the input is left untouched. */
function sortPagesByPathname(pages: PageData[]): PageData[] {
  return [...pages].sort((a, b) => a.pathname.localeCompare(b.pathname));
}

/**
 * Read `description` from the package.json in the current working directory.
 *
 * @returns The description, or "" when the file is missing, unreadable,
 * not valid JSON, or has no string `description`.
 */
async function extractPackageDescription(): Promise<string> {
  try {
    const packageFilePath = path.join(process.cwd(), "package.json");
    const packageContent = await fs.readFile(packageFilePath, "utf-8");
    const packageData = JSON.parse(packageContent) as { description?: unknown } | null;
    const description = packageData?.description;
    return typeof description === "string" ? description : "";
  } catch {
    return "";
  }
}

/**
 * Derive a title from the site URL: its hostname without a leading "www.".
 * Falls back to "Documentation" without a URL and to the raw value when it
 * cannot be parsed.
 */
function generateTitleFromSite(siteUrl?: string): string {
  if (!siteUrl) return "Documentation";

  try {
    const url = new URL(siteUrl);
    return url.hostname.replace(/^www\./, "");
  } catch {
    return siteUrl;
  }
}