UNPKG

@signalwire/docusaurus-plugin-llms-txt

Version:

Generate Markdown versions of Docusaurus HTML pages and an llms.txt index file

107 lines (106 loc) 4.7 kB
/** * HTML content extraction using selectors * Handles HTML parsing and metadata extraction */ import { select, selectAll } from 'hast-util-select'; import { DEFAULT_DOCUMENT_TITLE, HTML_SELECTORS } from '../constants'; import { getErrorMessage, createProcessingError } from '../errors'; import { selectMetaContent } from '../utils/html'; import { defaultPluginRegistry } from './plugins/plugin-registry'; import { extractTitle } from './title-extractor'; /** * Extract content, title, and description from HTML AST * @internal */ export function extractContent(tree, selectors, logger) { const title = extractTitle(tree) ?? 'Untitled'; const description = selectMetaContent(tree, HTML_SELECTORS.META_DESCRIPTION); let content = null; let selectedSelector = ''; for (const selector of selectors) { const elements = selectAll(selector, tree); if (elements.length > 0) { // Use the first matching element const element = elements[0]; selectedSelector = selector; // Create a new Root node containing the selected element content = { type: 'root', children: [element], }; break; } } // Fall back to body if no selectors matched, but try to filter out navigation if (!content) { const bodyElement = select(HTML_SELECTORS.BODY, tree); if (bodyElement) { // Try to find main content within body, excluding nav/header/footer const mainContent = select(`${HTML_SELECTORS.MAIN}, .main-wrapper, #__docusaurus`, bodyElement) ?? bodyElement; content = { type: 'root', children: [mainContent], }; selectedSelector = 'body (fallback)'; } } // Only log selector info in debug mode if logger is available if (content && logger && selectedSelector) { logger.debug(`Using selector: "${selectedSelector}"`); } return { content, title, description }; } /** * Extract only title and description from HTML * @internal */ export function extractHtmlMetadata(html) { try { const parser = defaultPluginRegistry.createMetadataProcessor(); const htmlAst = parser.parse(html); const description = selectMetaContent(htmlAst, HTML_SELECTORS.META_DESCRIPTION); const title = extractTitle(htmlAst); return { title: title ?? DEFAULT_DOCUMENT_TITLE, description: description ?? '', }; } catch (error) { const errorMessage = getErrorMessage(error); throw createProcessingError(`Failed to extract title and description from HTML: ${errorMessage}. This may indicate malformed HTML or an issue with the HTML parsing library.`); } } /** * Convert HTML to Markdown with full processing pipeline * @internal */ export function convertHtmlToMarkdown(html, options, contentSelectors = []) { try { // Parse HTML using plugin registry const parser = defaultPluginRegistry.createMetadataProcessor(); const htmlAst = parser.parse(html); // Extract content, title, and description const { content, title, description } = extractContent(htmlAst, contentSelectors, options.logger); if (!content) { throw createProcessingError(`No content could be extracted from HTML using the provided contentSelectors: [${contentSelectors.join(', ')}]. ` + `The HTML file may not contain elements matching these CSS selectors. ` + `Try using more general selectors like 'main', 'article', or inspect the HTML structure to find the right selectors.`); } // Create processor pipelines using plugin registry const { htmlProcessor, markdownProcessor } = defaultPluginRegistry.createHtmlToMarkdownProcessor(options); // Convert hast to mdast using HTML processor const mdastTree = htmlProcessor.runSync(content); // Process mdast tree through remark plugins and convert to markdown string const processedMdastTree = markdownProcessor.runSync(mdastTree); const markdownContent = String(markdownProcessor.stringify(processedMdastTree)); return { content: markdownContent, title, description, }; } catch (error) { const errorMessage = getErrorMessage(error); throw createProcessingError(`HTML to Markdown conversion failed: ${errorMessage}. This could be due to malformed HTML, unsupported HTML elements, or issues with the conversion pipeline. Check the HTML file structure and content.`); } }