astro-llms-generate
Version:
Astro integration to automatically generate AI-friendly documentation files: /llms.txt, /llms-small.txt, and /llms-full.txt
378 lines (318 loc) • 11.5 kB
text/typescript
import type { AstroConfig, AstroIntegration } from "astro";
import fs from "fs/promises";
import path from "path";
import { fileURLToPath } from "url";
import { JSDOM } from "jsdom";
import { SimpleMarkdown } from "./simple-markdown";
export interface PageData {
pathname: string;
title: string;
description?: string;
content?: string;
slug?: string;
order?: number;
}
export interface LlmsConfig {
title?: string;
description?: string;
includePatterns?: string[];
excludePatterns?: string[];
customSeparator?: string;
}
// Simple configuration cache for performance
const configurationCache = new Map<string, Required<LlmsConfig>>();
/**
* Astro integration to automatically generate AI-friendly documentation files
* Generates /llms.txt, /llms-small.txt, and /llms-full.txt in build directory only
*/
export default function astroLLMsGenerator(userConfig: LlmsConfig = {}): AstroIntegration {
let astroConfiguration: AstroConfig;
return {
name: "astro-llms-generate",
hooks: {
"astro:config:setup": ({ config }) => {
astroConfiguration = config;
},
"astro:build:start": async ({ logger }) => {
logger.info("Starting LLMs documentation generation...");
},
"astro:build:done": async ({ dir, pages, logger }) => {
const distDirectory = fileURLToPath(dir);
try {
const config = await generateSmartDefaults(astroConfiguration, userConfig, distDirectory);
const pageDataList = await discoverAndProcessPages(pages, distDirectory, astroConfiguration);
// Generate files only in build directory
await Promise.all([
generateLlmsIndexFile(pageDataList, config, distDirectory, astroConfiguration),
generateLlmsSmallFile(pageDataList, config, distDirectory, astroConfiguration),
generateLlmsFullFile(pageDataList, config, distDirectory)
]);
logger.info("✅ Generated llms.txt, llms-small.txt, and llms-full.txt");
logger.info("Available in build output dir");
} catch (error) {
logger.error(`Failed to generate LLMs files: ${error}`);
}
},
},
};
}
/**
* Generate smart defaults with caching
*/
async function generateSmartDefaults(
astroConfig: AstroConfig,
userConfig: LlmsConfig,
distDirectory: string
): Promise<Required<LlmsConfig>> {
const cacheKey = createCacheKey(astroConfig, userConfig);
if (configurationCache.has(cacheKey)) {
return configurationCache.get(cacheKey)!;
}
const packageDescription = await extractPackageDescription();
const autoGeneratedTitle = generateTitleFromSite(astroConfig.site);
const completeConfig: Required<LlmsConfig> = {
title: userConfig.title || autoGeneratedTitle,
description: userConfig.description || packageDescription || `AI-friendly documentation for ${autoGeneratedTitle}`,
includePatterns: userConfig.includePatterns || ["**/*"],
excludePatterns: userConfig.excludePatterns || ["**/404*", "**/500*", "**/api/**"],
customSeparator: userConfig.customSeparator || "\n\n---\n\n"
};
configurationCache.set(cacheKey, completeConfig);
return completeConfig;
}
/**
* Memory-efficient page discovery with smaller batch processing
*/
async function discoverAndProcessPages(
pages: { pathname: string }[],
distDirectory: string,
astroConfig: AstroConfig
): Promise<PageData[]> {
const processedPages: PageData[] = [];
const batchSize = 5; // Reduced batch size for memory efficiency
for (let i = 0; i < pages.length; i += batchSize) {
const currentBatch = pages.slice(i, i + batchSize);
const batchResults = await processBatchOfPages(currentBatch, distDirectory, astroConfig);
processedPages.push(...batchResults);
// Clear memory between batches
if (global.gc) {
global.gc();
}
}
return sortPagesByPathname(processedPages);
}
/**
* Process a batch of pages in parallel with memory cleanup
*/
async function processBatchOfPages(
pageBatch: { pathname: string }[],
distDirectory: string,
astroConfig: AstroConfig
): Promise<PageData[]> {
const batchPromises = pageBatch.map(async (page) => {
try {
const htmlFilePath = getHtmlFilePath(page.pathname, distDirectory);
await fs.access(htmlFilePath);
return await extractPageDataFromHtml(htmlFilePath, page.pathname, astroConfig);
} catch (error) {
console.warn(`⚠️ Could not process page: ${page.pathname}`);
return null;
}
});
const batchResults = await Promise.all(batchPromises);
return batchResults.filter((page): page is PageData => page !== null);
}
/**
* Extract page data from HTML file with memory-efficient processing
*/
async function extractPageDataFromHtml(
htmlFilePath: string,
pathname: string,
astroConfig: AstroConfig
): Promise<PageData> {
try {
const htmlContent = await fs.readFile(htmlFilePath, "utf-8");
const documentModel = new JSDOM(htmlContent);
const document = documentModel.window.document;
const extractedTitle = extractTitleFromDocument(document, pathname);
const metaDescription = extractMetaDescription(document);
const mainContent = await extractMainContentAsMarkdown(document);
// Clean up JSDOM instance
documentModel.window.close();
return {
pathname,
title: extractedTitle,
description: metaDescription,
content: mainContent.trim(),
slug: pathname
};
} catch (error) {
throw new Error(`Failed to extract page data from ${htmlFilePath}: ${error}`);
}
}
/**
* Generate llms.txt index file in build directory
*/
async function generateLlmsIndexFile(
pages: PageData[],
config: Required<LlmsConfig>,
distDirectory: string,
astroConfig: AstroConfig
): Promise<void> {
const contentLines = createIndexFileContent(pages, config, astroConfig.site || "");
await fs.writeFile(path.join(distDirectory, "llms.txt"), contentLines, "utf-8");
}
/**
* Generate llms-small.txt structure file in build directory
*/
async function generateLlmsSmallFile(
pages: PageData[],
config: Required<LlmsConfig>,
distDirectory: string,
astroConfig: AstroConfig
): Promise<void> {
const contentLines = createSmallFileContent(pages, config, astroConfig.site || "");
await fs.writeFile(path.join(distDirectory, "llms-small.txt"), contentLines, "utf-8");
}
/**
* Generate llms-full.txt content file in build directory
*/
async function generateLlmsFullFile(
pages: PageData[],
config: Required<LlmsConfig>,
distDirectory: string
): Promise<void> {
const contentLines = createFullFileContent(pages, config);
await fs.writeFile(path.join(distDirectory, "llms-full.txt"), contentLines, "utf-8");
}
// ====== UTILITY FUNCTIONS ======
function createIndexFileContent(pages: PageData[], config: Required<LlmsConfig>, baseUrl: string): string {
const lines: string[] = [
`# ${config.title}`,
`> ${config.description}`,
"",
"## Pages",
""
];
const groupedPages = groupPagesByDirectory(pages);
for (const [directoryName, directoryPages] of Object.entries(groupedPages)) {
if (directoryName !== "/") {
lines.push(`### ${directoryName}`);
lines.push("");
}
for (const page of directoryPages) {
const pageUrl = baseUrl ? new URL(page.pathname, baseUrl).toString() : page.pathname;
const pageDescription = page.description ? ` - ${page.description}` : "";
lines.push(`- [${page.title}](${pageUrl})${pageDescription}`);
}
lines.push("");
}
lines.push("", "*Auto-generated documentation index*");
return lines.join("\n").trim();
}
function createSmallFileContent(pages: PageData[], config: Required<LlmsConfig>, baseUrl: string): string {
const lines: string[] = [
`# ${config.title}`,
"> Structure-only documentation",
""
];
for (const page of pages) {
const pageUrl = baseUrl ? new URL(page.pathname, baseUrl).toString() : page.pathname;
lines.push(`- [${page.title}](${pageUrl})`);
}
return lines.join("\n").trim();
}
function createFullFileContent(pages: PageData[], config: Required<LlmsConfig>): string {
const lines: string[] = [
`# ${config.title}`,
`> ${config.description}`,
"",
"*Complete documentation content below*",
""
];
const pageContents = pages
.filter(page => page.content && page.content.length > 0)
.map(page => {
const parts = [`# ${page.title}`];
if (page.description) {
parts.push(`> ${page.description}`);
}
parts.push("", page.content!);
return parts.join("\n");
});
lines.push(pageContents.join(config.customSeparator));
return lines.join("\n").trim();
}
function groupPagesByDirectory(pages: PageData[]): Record<string, PageData[]> {
const groups: Record<string, PageData[]> = {};
for (const page of pages) {
const directoryPath = path.dirname(page.pathname);
const directoryName = directoryPath === "/" || directoryPath === "."
? "/"
: directoryPath.split("/").filter(Boolean).pop() || "/";
if (!groups[directoryName]) {
groups[directoryName] = [];
}
groups[directoryName].push(page);
}
return groups;
}
function extractTitleFromDocument(document: Document, pathname: string): string {
const h1Element = document.querySelector("h1");
const titleElement = document.querySelector("title");
return h1Element?.textContent?.trim() ||
titleElement?.textContent?.trim() ||
pathname.split("/").filter(Boolean).pop() ||
"Untitled";
}
function extractMetaDescription(document: Document): string | undefined {
return document
.querySelector('meta[name="description"]')
?.getAttribute("content")
?.trim();
}
async function extractMainContentAsMarkdown(document: Document): Promise<string> {
const mainElement = document.querySelector("main") || document.querySelector("body");
if (!mainElement) return "";
// Remove title to avoid duplication
const h1Element = mainElement.querySelector("h1");
if (h1Element) h1Element.remove();
return await SimpleMarkdown(
mainElement.innerHTML.trim(),
['header', 'footer', 'nav', '.no-llms', 'script', 'style'],
false
);
}
function getHtmlFilePath(pathname: string, distDirectory: string): string {
if (pathname.endsWith("/")) {
return path.join(distDirectory, pathname, "index.html");
}
const htmlFilePath = path.join(distDirectory, pathname + ".html");
const indexFilePath = path.join(distDirectory, pathname, "index.html");
return pathname.includes(".") ? htmlFilePath : indexFilePath;
}
function createCacheKey(astroConfig: AstroConfig, userConfig: LlmsConfig): string {
return JSON.stringify({ astroConfig: astroConfig.site, userConfig });
}
function sortPagesByPathname(pages: PageData[]): PageData[] {
return pages.sort((a, b) => a.pathname.localeCompare(b.pathname));
}
async function extractPackageDescription(): Promise<string> {
try {
const packageFilePath = path.join(process.cwd(), "package.json");
const packageContent = await fs.readFile(packageFilePath, "utf-8");
const packageData = JSON.parse(packageContent);
return packageData.description || "";
} catch {
return "";
}
}
function generateTitleFromSite(siteUrl?: string): string {
if (!siteUrl) return "Documentation";
try {
const url = new URL(siteUrl);
return url.hostname.replace(/^www\./, "");
} catch {
return siteUrl;
}
}