@signalwire/docusaurus-plugin-llms-txt
Version:
Generate Markdown versions of Docusaurus HTML pages and an llms.txt index file
101 lines (100 loc) • 4.92 kB
JavaScript
/**
* HTML file processing
* Process individual HTML files to extract content and convert to markdown
*/
import path from 'path';
import fs from 'fs-extra';
import { getContentConfig } from '../config';
import { ERROR_MESSAGES } from '../constants';
import { getErrorMessage, getErrorCause, createProcessingError, } from '../errors';
import { PathManager, htmlPathToMdPath } from '../filesystem/paths';
import { saveMarkdownFile } from '../generation/markdown-writer';
import { extractHtmlMetadata, convertHtmlToMarkdown } from './html-parser';
/**
* Process a single HTML file → Markdown + metadata
* @internal
*/
export async function processHtmlFileWithContext(fullHtmlPath, routePath, config, mdOutDir, logger, siteUrl, outDir, routeLookup) {
// Use PathManager for all path operations
const pathManager = new PathManager(path.dirname(mdOutDir), config, outDir);
const relHtmlPath = pathManager.getRelativeHtmlPath(fullHtmlPath);
logger.debug(`Processing: ${routePath}`);
try {
const html = await fs.readFile(fullHtmlPath, 'utf8');
const contentConfig = getContentConfig(config);
const contentSelectors = contentConfig.contentSelectors;
let title;
let description;
let markdown = '';
// Process content if markdown files are enabled OR if llms-full.txt is enabled
if (contentConfig.enableMarkdownFiles || contentConfig.enableLlmsFullTxt) {
// Full processing for individual markdown files
const conversionOptions = {
remarkStringify: contentConfig.remarkStringify,
remarkGfm: contentConfig.remarkGfm,
rehypeProcessTables: contentConfig.rehypeProcessTables,
rehypeProcessLinks: true,
baseUrl: siteUrl,
relativePaths: contentConfig.relativePaths,
enableMarkdownFiles: contentConfig.enableMarkdownFiles,
excludeRoutes: contentConfig.excludeRoutes,
fullConfig: config,
logger: logger,
routeLookup: routeLookup,
// Pass simplified plugin arrays to the conversion pipeline
beforeDefaultRehypePlugins: contentConfig.beforeDefaultRehypePlugins,
rehypePlugins: contentConfig.rehypePlugins,
beforeDefaultRemarkPlugins: contentConfig.beforeDefaultRemarkPlugins,
remarkPlugins: contentConfig.remarkPlugins,
};
const result = convertHtmlToMarkdown(html, conversionOptions, contentSelectors);
title = result.title;
description = result.description;
markdown = result.content;
if (!markdown)
throw createProcessingError(`HTML to Markdown conversion resulted in empty content for "${relHtmlPath}". This usually means your contentSelectors didn't match any elements in the HTML. Try using different CSS selectors or check if the HTML file contains the expected content structure.`, {
filePath: relHtmlPath,
contentSelectors,
});
// Save markdown files if enableMarkdownFiles is true
if (contentConfig.enableMarkdownFiles) {
logger.debug(`Saving markdown: ${routePath}`);
const mdPath = htmlPathToMdPath(relHtmlPath, mdOutDir);
await saveMarkdownFile(mdPath, markdown);
// Calculate relative markdown file path using PathManager
const relativeMdPath = pathManager.getRelativeMarkdownPath(mdPath);
return {
routePath,
htmlPath: relHtmlPath,
title,
description,
markdownFile: relativeMdPath,
};
}
else {
// enableLlmsFullTxt is true but enableMarkdownFiles is false
// Return content in memory for llms-full.txt generation
return {
routePath,
htmlPath: relHtmlPath,
title,
description,
markdownContent: markdown,
};
}
}
else {
// Lightweight processing for llms.txt only - just extract metadata
const result = extractHtmlMetadata(html);
title = result.title;
description = result.description;
return { routePath, htmlPath: relHtmlPath, title, description };
}
}
catch (error) {
const errorMsg = getErrorMessage(error);
const errorCause = getErrorCause(error);
logger.debug(`Error processing ${fullHtmlPath}: ${errorMsg}`);
throw createProcessingError(ERROR_MESSAGES.HTML_PROCESSING_FAILED(errorMsg), { filePath: relHtmlPath, cause: errorCause });
}
}