UNPKG

@akira108sys/html-rewriter-readability

Version:

A library to extract readable content with the Mozilla/Readability algorithm using Cloudflare HTMLRewriter.

238 lines (237 loc) 12.2 kB
// markdown-converter.ts
import { VOID_ELEMENTS } from './constants';
import { escapeHtml, getChildrenIds, getInnerText } from './utils'; // Import from utils

/** Characters with Markdown meaning that are backslash-escaped in plain text. */
const MARKDOWN_SPECIAL_CHARS = /([\\`*_{}[\]()#+.!-])/g;

/** Indentation added per list level (listLevel 1 = none, 2 = 2 spaces, 3 = 4 spaces, ...). */
const LIST_INDENT_UNIT = '  ';

/** Hard line break: CommonMark requires at least two trailing spaces before the newline. */
const HARD_LINE_BREAK = '  \n';

/** Tags whose children are one list level deeper than the tag itself. */
const LIST_TAGS = new Set(['UL', 'OL', 'LI']);

/** href prefixes that are emitted as-is instead of being resolved against the base URI. */
const LINK_PASSTHROUGH_PREFIXES = ['http', '#', 'mailto:', 'tel:'];

/** src prefixes that are emitted as-is instead of being resolved against the base URI. */
const IMAGE_PASSTHROUGH_PREFIXES = ['http', 'data:'];

/** Blank (whitespace-only) lines at the very start of a code block. */
const LEADING_BLANK_LINES = /^(?:[ \t]*\r?\n)+/;

/** Backslash-escapes Markdown special characters in plain text. */
function escapeMarkdown(text) {
    return text.replace(MARKDOWN_SPECIAL_CHARS, '\\$1');
}

/** Returns the length of the longest run of consecutive backticks in `text` (0 if none). */
function longestBacktickRun(text) {
    let longest = 0;
    for (const run of text.match(/`+/g) ?? []) {
        longest = Math.max(longest, run.length);
    }
    return longest;
}

/**
 * Wraps `content` in an inline delimiter (`**`, `*`), keeping leading/trailing whitespace
 * outside the delimiters, because `** text **` is not parsed as emphasis.
 * Whitespace-only content is returned unchanged so no empty `****` is emitted.
 */
function wrapInline(delimiter, content) {
    const core = content.trim();
    if (!core) {
        return content;
    }
    const leadLength = content.length - content.trimStart().length;
    const trailStart = leadLength + core.length;
    return content.slice(0, leadLength) + delimiter + core + delimiter + content.slice(trailStart);
}

/**
 * Resolves `url` against `baseURI` unless it is empty or starts with one of
 * `passthroughPrefixes`. On failure the original value is returned and a warning is logged.
 */
function resolveUrl(url, baseURI, passthroughPrefixes, attributeName) {
    if (!url || passthroughPrefixes.some((prefix) => url.startsWith(prefix))) {
        return url;
    }
    try {
        return new URL(url, baseURI).href;
    }
    catch (e) {
        console.warn(`Markdown Conv: Failed to resolve ${attributeName}: ${url}`);
        return url;
    }
}

/**
 * Converts a subtree of stored elements into a Markdown string.
 *
 * Only elements whose ID is in `elementsToKeepIdsSet` are emitted; everything else
 * (including its descendants) is skipped.
 */
export class MarkdownConverter {
    /**
     * @param elementStore Lookup with a `get(id)` method returning element info
     *   (`tagName`, `attributes`, and optionally `isCodeBlock` / `id` are read here).
     * @param elementsToKeepIdsSet Collection with a `has(id)` method naming the elements to emit.
     * @param baseURI Base used to resolve relative `href` / `src` values.
     * @param options Optional settings; `options.debug` enables console logging.
     */
    constructor(elementStore, elementsToKeepIdsSet, baseURI, options = {}) {
        this.elementStore = elementStore;
        this.elementsToKeepIdsSet = elementsToKeepIdsSet;
        this.baseURI = baseURI;
        this.options = options;
    }

    /**
     * Converts the children of the given root element to Markdown.
     * @param rootElementId ID of the root element; `null`/`undefined` yields an empty string.
     * @returns The Markdown text, with runs of 3+ newlines collapsed and outer whitespace trimmed.
     */
    convert(rootElementId) {
        const debug = this.options.debug;
        if (debug)
            console.log("Converting extracted elements to Markdown... :)");
        if (rootElementId == null) {
            if (debug)
                console.error("Cannot generate Markdown: Root element ID is null.");
            return "";
        }
        const markdownOutput = getChildrenIds(rootElementId, this.elementStore)
            .map((childId) => this.convertNodeRecursive(childId))
            .join('')
            // Clean up unnecessary consecutive line breaks
            .replace(/\n{3,}/g, '\n\n')
            .trim();
        if (debug)
            console.log("Markdown conversion finished.");
        return markdownOutput;
    }

    /**
     * Recursively converts the element with the specified ID and its descendants to Markdown.
     * @param id ID of the element to convert
     * @param listLevel Current list nesting level (0 is outside a list, 1 is the top-level list)
     * @param isListOrdered Whether the direct parent is an OL (affects LI elements only)
     * @param listItemNumber Number of this item within an ordered list (affects LI elements only)
     * @returns Part of the generated Markdown string ('' for unknown or non-kept elements)
     */
    convertNodeRecursive(id, listLevel = 0, isListOrdered = false, listItemNumber = 1) {
        const info = this.elementStore.get(id);
        // If not kept or no info, return empty string
        if (!info || !this.elementsToKeepIdsSet.has(id)) {
            return '';
        }
        const tagName = info.tagName;

        // --- Generate Markdown for child elements first ---
        const children = getChildrenIds(id, this.elementStore);
        const isOrderedList = tagName === 'OL';
        // List-related elements push their children one level deeper; anything else resets to 0.
        const childLevel = LIST_TAGS.has(tagName) ? listLevel + 1 : 0;
        let orderedItemCount = 0;
        let childrenMarkdown = '';
        for (const childId of children) {
            let childItemNumber = 1;
            // Number only the LI children that will actually be emitted, so dropped or
            // non-LI siblings do not shift the numbering of the visible items.
            if (isOrderedList
                && this.elementsToKeepIdsSet.has(childId)
                && this.elementStore.get(childId)?.tagName === 'LI') {
                orderedItemCount += 1;
                childItemNumber = orderedItemCount;
            }
            childrenMarkdown += this.convertNodeRecursive(childId, childLevel, isOrderedList, childItemNumber);
        }

        // --- Get and escape the element's own direct text ---
        // NOTE(review): getInnerText(..., false) is expected to return only the element's
        // direct text (not descendants) - confirm against ./utils.
        const rawDirectText = getInnerText(id, this.elementStore, false);
        const elementText = rawDirectText ? escapeMarkdown(rawDirectText) : '';

        // --- List element indentation ---
        const listIndent = LIST_INDENT_UNIT.repeat(Math.max(listLevel - 1, 0));

        let markdown = '';
        // --- Markdown generation per tag ---
        switch (tagName) {
            case 'P': {
                // Trim because block-level children may have added surrounding newlines.
                const pContent = (elementText + childrenMarkdown).trim();
                markdown = pContent ? `${listIndent}${pContent}\n\n` : ''; // Don't output empty P
                break;
            }
            case 'H1':
            case 'H2':
            case 'H3':
            case 'H4':
            case 'H5':
            case 'H6': {
                // The digit in the tag name is the number of leading '#'.
                const hashes = '#'.repeat(Number(tagName[1]));
                markdown = `${listIndent}${hashes} ${elementText}${childrenMarkdown}\n\n`;
                break;
            }
            case 'UL':
            case 'OL':
                // The list container only guarantees a leading newline; indentation and
                // markers are produced by the LI children.
                markdown = childrenMarkdown.startsWith('\n') ? childrenMarkdown : `\n${childrenMarkdown}`;
                break;
            case 'LI': {
                // isListOrdered is determined by whether the *parent* is OL
                const marker = isListOrdered ? `${listItemNumber}.` : '*';
                // Fixed continuation indent (not matched to the marker width).
                const itemIndent = `${listIndent}${LIST_INDENT_UNIT}`;
                const liContent = (elementText + childrenMarkdown)
                    .trim()
                    .split('\n')
                    .map((line, index) => {
                        if (index === 0) {
                            return line.trim();
                        }
                        // Block children (nested lists, paragraphs, code fences) already carry
                        // their own level-based indentation. Keep it: re-trimming every line
                        // would flatten lists nested more than two levels deep.
                        return line.startsWith(itemIndent) ? line.trimEnd() : itemIndent + line.trim();
                    })
                    .join('\n');
                markdown = `${listIndent}${marker} ${liContent}\n`;
                break;
            }
            case 'A': {
                const href = resolveUrl(info.attributes.href ?? '', this.baseURI, LINK_PASSTHROUGH_PREFIXES, 'href');
                // Link text: prioritize children if they exist, otherwise use own direct text.
                // Both are already Markdown (direct text is escaped above), so escaping again
                // here would double the backslashes and break formatting such as [**bold**](...).
                const linkText = childrenMarkdown.trim() || elementText;
                markdown = `[${linkText}](${href})`;
                break;
            }
            case 'IMG': {
                const src = resolveUrl(info.attributes.src ?? '', this.baseURI, IMAGE_PASSTHROUGH_PREFIXES, 'src');
                const alt = info.attributes.alt ?? '';
                const title = info.attributes.title ? ` "${escapeHtml(info.attributes.title)}"` : '';
                // Treat images as block elements, so indent + 2 trailing newlines.
                // alt is a raw attribute value, so it is escaped exactly once here.
                markdown = `${listIndent}![${escapeMarkdown(alt)}](${src}${title})\n\n`;
                break;
            }
            case 'PRE': {
                // Prefer the text of a CODE child; fall back to the text directly under PRE.
                const codeChild = children
                    .map((cid) => this.elementStore.get(cid))
                    .find((cinfo) => cinfo?.tagName === 'CODE');
                let codeContent;
                let lang = '';
                if (codeChild) {
                    codeContent = getInnerText(codeChild.id, this.elementStore, false);
                    const langClass = codeChild.attributes.class?.match(/language-(\S+)/);
                    lang = langClass ? langClass[1] : '';
                }
                else {
                    codeContent = getInnerText(id, this.elementStore, false);
                }
                // Do not escape inside code blocks. Drop only leading blank lines and trailing
                // whitespace so the first code line keeps its indentation.
                const code = (codeContent ?? '').replace(LEADING_BLANK_LINES, '').trimEnd();
                // The fence must be longer than any backtick run in the content.
                const fence = '`'.repeat(Math.max(3, longestBacktickRun(code) + 1));
                // Inside a list, indent the code lines like the fence so they stay in the item.
                const body = listIndent
                    ? code.split('\n').map((line) => (line ? listIndent + line : line)).join('\n')
                    : code;
                markdown = `${listIndent}${fence}${lang}\n${body}\n${listIndent}${fence}\n\n`;
                break;
            }
            case 'CODE': {
                // Assume isCodeBlock flag is set in phase1
                if (!info.isCodeBlock) {
                    // Inline code: use the raw (unescaped) direct text, since backslashes are
                    // literal inside a code span. Text coming from child elements has already
                    // been converted by the recursive call and is used as-is.
                    const code = ((rawDirectText || '') + childrenMarkdown).trim();
                    // Delimiter must be longer than any backtick run inside; pad with a space
                    // when the content itself starts or ends with a backtick.
                    const ticks = '`'.repeat(longestBacktickRun(code) + 1);
                    const pad = code.startsWith('`') || code.endsWith('`') ? ' ' : '';
                    markdown = `${ticks}${pad}${code}${pad}${ticks}`;
                }
                else {
                    // PRE > CODE is rendered by the PRE case, so emit nothing here.
                    markdown = '';
                }
                break;
            }
            case 'STRONG':
            case 'B':
                markdown = wrapInline('**', `${elementText}${childrenMarkdown}`);
                break;
            case 'EM':
            case 'I':
                markdown = wrapInline('*', `${elementText}${childrenMarkdown}`);
                break;
            case 'BLOCKQUOTE': {
                const bqContent = (elementText + childrenMarkdown).trim();
                // Prefix every line with '>'; skip empty quotes instead of emitting a bare marker.
                markdown = bqContent
                    ? `${bqContent.split('\n').map((line) => `${listIndent}> ${line.trim()}`).join('\n')}\n\n`
                    : '';
                break;
            }
            case 'HR':
                markdown = `${listIndent}---\n\n`;
                break;
            case 'BR':
                // Always emitted as a hard break, regardless of surrounding text.
                markdown = HARD_LINE_BREAK;
                break;
            // Ignored structural tags (output content only)
            case 'DIV':
            case 'SPAN':
            case 'SECTION':
            case 'ARTICLE':
            case 'FIGURE':
            case 'FIGCAPTION':
            case 'HEADER':
            case 'FOOTER':
            case 'ASIDE':
            case 'NAV':
                markdown = `${elementText}${childrenMarkdown}`;
                break;
            // Other unhandled tags
            default:
                if (!VOID_ELEMENTS.has(tagName)) {
                    // Output content only for unhandled non-void elements
                    markdown = `${elementText}${childrenMarkdown}`;
                    if (this.options.debug)
                        console.log(`Unhandled tag: ${tagName} - outputting content only.`);
                }
                else {
                    // Completely ignore unhandled void elements
                    markdown = '';
                    if (this.options.debug)
                        console.log(`Ignoring void tag: ${tagName}`);
                }
        }
        return markdown;
    }
}