docusaurus-plugin-llms
Version:
Docusaurus plugin for generating LLM-friendly documentation following the llmstxt.org standard
368 lines (312 loc) • 12.3 kB
text/typescript
/**
* Utility functions for the docusaurus-plugin-llms plugin
*/
import * as fs from 'fs/promises';
import * as path from 'path';
import { minimatch } from 'minimatch';
import matter from 'gray-matter';
import * as YAML from 'yaml';
import { PluginOptions } from './types';
/**
* Write content to a file
* @param filePath - Path to write the file to
* @param data - Content to write
*/
export async function writeFile(filePath: string, data: string): Promise<void> {
return fs.writeFile(filePath, data, 'utf8');
}
/**
* Read content from a file
* @param filePath - Path of the file to read
* @returns Content of the file
*/
export async function readFile(filePath: string): Promise<string> {
return fs.readFile(filePath, 'utf8');
}
/**
* Check if a file should be ignored based on glob patterns
* @param filePath - Path to the file
* @param baseDir - Base directory for relative paths
* @param ignorePatterns - Glob patterns for files to ignore
* @returns Whether the file should be ignored
*/
export function shouldIgnoreFile(filePath: string, baseDir: string, ignorePatterns: string[]): boolean {
if (ignorePatterns.length === 0) {
return false;
}
const relativePath = path.relative(baseDir, filePath);
return ignorePatterns.some(pattern =>
minimatch(relativePath, pattern, { matchBase: true })
);
}
/**
* Recursively reads all Markdown files in a directory
* @param dir - Directory to scan
* @param baseDir - Base directory for relative paths
* @param ignorePatterns - Glob patterns for files to ignore
* @returns Array of file paths
*/
export async function readMarkdownFiles(dir: string, baseDir: string, ignorePatterns: string[] = []): Promise<string[]> {
const files: string[] = [];
const entries = await fs.readdir(dir, { withFileTypes: true });
for (const entry of entries) {
const fullPath = path.join(dir, entry.name);
if (shouldIgnoreFile(fullPath, baseDir, ignorePatterns)) {
continue;
}
if (entry.isDirectory()) {
const subDirFiles = await readMarkdownFiles(fullPath, baseDir, ignorePatterns);
files.push(...subDirFiles);
} else if (entry.name.endsWith('.md') || entry.name.endsWith('.mdx')) {
// Skip partial files (those starting with underscore)
if (!entry.name.startsWith('_')) {
files.push(fullPath);
}
}
}
return files;
}
/**
* Extract title from content or use the filename
* @param data - Frontmatter data
* @param content - Markdown content
* @param filePath - Path to the file
* @returns Extracted title
*/
export function extractTitle(data: any, content: string, filePath: string): string {
// First try frontmatter
if (data.title) {
return data.title;
}
// Then try first heading
const headingMatch = content.match(/^#\s+(.*)/m);
if (headingMatch) {
return headingMatch[1].trim();
}
// Finally use filename
return path.basename(filePath, path.extname(filePath))
.replace(/-/g, ' ')
.replace(/\b\w/g, (c: string) => c.toUpperCase());
}
/**
* Resolve and inline partial imports in markdown content
* @param content - The markdown content with import statements
* @param filePath - The path of the file containing the imports
* @returns Content with partials resolved
*/
export async function resolvePartialImports(content: string, filePath: string): Promise<string> {
let resolved = content;
// Match import statements for partials and JSX usage
// Pattern 1: import PartialName from './_partial.mdx'
// Pattern 2: import { PartialName } from './_partial.mdx'
const importRegex = /^\s*import\s+(?:(\w+)|{\s*(\w+)\s*})\s+from\s+['"]([^'"]+_[^'"]+\.mdx?)['"];?\s*$/gm;
const imports = new Map<string, string>();
// First pass: collect all imports
let match;
while ((match = importRegex.exec(content)) !== null) {
const componentName = match[1] || match[2];
const importPath = match[3];
// Only process imports for partial files (containing underscore)
if (importPath.includes('_')) {
imports.set(componentName, importPath);
}
}
// Resolve each partial import
for (const [componentName, importPath] of imports) {
try {
// Resolve the partial file path relative to the current file
const dir = path.dirname(filePath);
const partialPath = path.resolve(dir, importPath);
// Read the partial file
const partialContent = await readFile(partialPath);
const { content: partialMarkdown } = matter(partialContent);
// Remove the import statement
resolved = resolved.replace(
new RegExp(`^\\s*import\\s+(?:${componentName}|{\\s*${componentName}\\s*})\\s+from\\s+['"]${importPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&')}['"];?\\s*$`, 'gm'),
''
);
// Replace JSX usage with the partial content
// Handle both self-closing tags and tags with content
// <PartialName /> or <PartialName></PartialName> or <PartialName>...</PartialName>
const jsxRegex = new RegExp(`<${componentName}\\s*(?:[^>]*?)(?:/>|>[^<]*</${componentName}>)`, 'g');
resolved = resolved.replace(jsxRegex, partialMarkdown.trim());
} catch (error) {
console.warn(`Failed to resolve partial import "${importPath}" in ${filePath}: ${error}`);
// Leave the import and usage as-is if we can't resolve it
}
}
return resolved;
}
/**
* Clean markdown content for LLM consumption
* @param content - Raw markdown content
* @param excludeImports - Whether to exclude import statements
* @param removeDuplicateHeadings - Whether to remove redundant content that duplicates heading text
* @returns Cleaned content
*/
export function cleanMarkdownContent(content: string, excludeImports: boolean = false, removeDuplicateHeadings: boolean = false): string {
let cleaned = content;
// Remove import statements if requested
if (excludeImports) {
// Remove ES6/React import statements
// This regex matches:
// - import ... from "...";
// - import ... from '...';
// - import { ... } from "...";
// - import * as ... from "...";
// - import "..."; (side-effect imports)
cleaned = cleaned.replace(/^\s*import\s+.*?;?\s*$/gm, '');
}
// Remove HTML tags, but preserve XML content in code blocks
// We need to be selective to avoid removing XML content from code blocks
// This regex targets common HTML tags while being more conservative about XML
cleaned = cleaned.replace(/<\/?(?:div|span|p|br|hr|img|a|strong|em|b|i|u|h[1-6]|ul|ol|li|table|tr|td|th|thead|tbody)\b[^>]*>/gi, '');
// Remove redundant content that just repeats the heading (if requested)
if (removeDuplicateHeadings) {
// Split content into lines and process line by line
const lines = cleaned.split('\n');
const processedLines: string[] = [];
let i = 0;
while (i < lines.length) {
const currentLine = lines[i];
// Check if current line is a heading (accounting for leading whitespace)
const headingMatch = currentLine.match(/^\s*(#+)\s+(.+)$/);
if (headingMatch) {
const headingLevel = headingMatch[1];
const headingText = headingMatch[2].trim();
processedLines.push(currentLine);
i++;
// Look ahead for potential redundant content
// Skip empty lines
while (i < lines.length && lines[i].trim() === '') {
processedLines.push(lines[i]);
i++;
}
// Check if the next non-empty line just repeats the heading text
// but is NOT itself a heading (to avoid removing valid headings of different levels)
if (i < lines.length) {
const nextLine = lines[i].trim();
const nextLineIsHeading = /^\s*#+\s+/.test(nextLine);
// Only remove if it exactly matches the heading text AND is not a heading itself
if (nextLine === headingText && !nextLineIsHeading) {
// Skip this redundant line
i++;
}
}
} else {
processedLines.push(currentLine);
i++;
}
}
cleaned = processedLines.join('\n');
}
// Normalize whitespace
cleaned = cleaned.replace(/\r\n/g, '\n')
.replace(/\n{3,}/g, '\n\n')
.trim();
return cleaned;
}
/**
* Apply path transformations according to configuration
* @param urlPath - Original URL path
* @param pathTransformation - Path transformation configuration
* @returns Transformed URL path
*/
export function applyPathTransformations(
urlPath: string,
pathTransformation?: PluginOptions['pathTransformation']
): string {
if (!pathTransformation) {
return urlPath;
}
let transformedPath = urlPath;
// Remove ignored path segments
if (pathTransformation.ignorePaths?.length) {
for (const ignorePath of pathTransformation.ignorePaths) {
// Create a regex that matches the ignore path at the beginning, middle, or end of the path
// We use word boundaries to ensure we match complete path segments
const ignoreRegex = new RegExp(`(^|/)(${ignorePath})(/|$)`, 'g');
transformedPath = transformedPath.replace(ignoreRegex, '$1$3');
}
// Clean up any double slashes that might have been created
transformedPath = transformedPath.replace(/\/+/g, '/');
// Remove leading slash if present
transformedPath = transformedPath.replace(/^\//, '');
}
// Add path segments if they're not already present
if (pathTransformation.addPaths?.length) {
// Process in reverse order to maintain the specified order in the final path
// This is because each path is prepended to the front
const pathsToAdd = [...pathTransformation.addPaths].reverse();
for (const addPath of pathsToAdd) {
// Only add if not already present at the beginning
if (!transformedPath.startsWith(addPath + '/') && transformedPath !== addPath) {
transformedPath = `${addPath}/${transformedPath}`;
}
}
}
return transformedPath;
}
/**
* Sanitize a string to create a safe filename
* @param input - Input string (typically a title)
* @param fallback - Fallback string if input becomes empty after sanitization
* @returns Sanitized filename (without extension)
*/
export function sanitizeForFilename(input: string, fallback: string = 'untitled'): string {
if (!input) return fallback;
const sanitized = input
.toLowerCase()
.replace(/[^a-z0-9]+/g, '-')
.replace(/^-+|-+$/g, '');
return sanitized || fallback;
}
/**
* Ensure a unique identifier from a set of used identifiers
* @param baseIdentifier - Base identifier to make unique
* @param usedIdentifiers - Set of already used identifiers
* @param suffix - Suffix pattern (default: number in parentheses)
* @returns Unique identifier
*/
export function ensureUniqueIdentifier(
baseIdentifier: string,
usedIdentifiers: Set<string>,
suffix: (counter: number, base: string) => string = (counter) => `(${counter})`
): string {
let uniqueIdentifier = baseIdentifier;
let counter = 1;
while (usedIdentifiers.has(uniqueIdentifier.toLowerCase())) {
counter++;
uniqueIdentifier = `${baseIdentifier}${suffix(counter, baseIdentifier)}`;
}
usedIdentifiers.add(uniqueIdentifier.toLowerCase());
return uniqueIdentifier;
}
/**
* Create standardized markdown content template
* @param title - Document title
* @param description - Document description
* @param content - Document content
* @param includeMetadata - Whether to include description metadata
* @param frontMatter - Optional frontmatter to include at the top
* @returns Formatted markdown content
*/
export function createMarkdownContent(
title: string,
description: string = '',
content: string = '',
includeMetadata: boolean = true,
frontMatter?: Record<string, any>
): string {
let result = '';
// Add frontmatter if provided
if (frontMatter && Object.keys(frontMatter).length > 0) {
result += '---\n';
result += YAML.stringify(frontMatter);
result += '---\n\n';
}
const descriptionLine = includeMetadata && description ? `\n\n> ${description}\n` : '\n';
result += `# ${title}${descriptionLine}
${content}`.trim() + '\n';
return result;
}