UNPKG

html-content-processor

Version:

A professional library for processing, cleaning, filtering, and converting HTML content to Markdown. Features advanced customization options, presets, plugin support, fluent API, and TypeScript integration for reliable content extraction.

215 lines (214 loc) 10.2 kB
"use strict";
// TypeScript-emitted helper: runs a generator function as a promise-returning
// routine, resuming the generator each time a yielded value settles.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.DefaultMarkdownGenerator = void 0;
const html2text_1 = require("./html2text");

/**
 * Matches inline Markdown links and images: an optional leading `!`, the
 * bracketed text (capture 1), the URL (capture 2) and an optional
 * double-quoted title (capture 3).
 *
 * The regex carries the `g` flag, so `exec` advances `lastIndex` statefully;
 * callers must reset `lastIndex` before starting a new scan.
 */
const LINK_PATTERN = /!?\[([^\]]+)\]\(([^)]+?)(?:\s+"([^"]*)")?\)/g;

/**
 * Quickly joins a base URL and a relative URL, handling common cases.
 *
 * - URLs starting with `http://`, `https://`, `mailto:`, `//` or `data:` are
 *   returned unchanged.
 * - Root-relative paths (`/x`) are attached to the protocol and host of
 *   `baseUrl`; if `baseUrl` cannot be parsed they are concatenated to it.
 * - Everything else is resolved with `new URL(url, baseUrl)`.
 * - Without a `baseUrl`, or when resolution throws, `url` is returned as is.
 *
 * @param baseUrl The base URL.
 * @param url The URL to join.
 * @returns The joined URL.
 */
function fastUrlJoin(baseUrl, url) {
    if (url.startsWith('http://') ||
        url.startsWith('https://') ||
        url.startsWith('mailto:') ||
        url.startsWith('//') ||
        url.startsWith('data:')) {
        return url;
    }
    // Handle absolute paths relative to the domain
    if (url.startsWith('/')) {
        if (baseUrl) {
            try {
                const base = new URL(baseUrl);
                return `${base.protocol}//${base.host}${url}`;
            }
            catch (e) {
                // If baseUrl is not a valid URL, it might be a path itself, or invalid.
                // Simple concatenation might be wrong, but it is the best-effort fallback.
                return baseUrl.endsWith('/') ? baseUrl.slice(0, -1) + url : baseUrl + url;
            }
        }
        else {
            // If no baseUrl, an absolute path cannot be resolved meaningfully
            return url;
        }
    }
    // Handle relative paths
    if (baseUrl) {
        try {
            return new URL(url, baseUrl).href;
        }
        catch (e) {
            // If URL construction fails (e.g. invalid base or relative URL), return original url
            return url;
        }
    }
    else {
        // If no baseUrl, a relative path cannot be resolved meaningfully
        return url;
    }
}

class DefaultMarkdownGenerator {
    /**
     * Creates a Markdown generator instance.
     * @param contentFilter Content filter instance (or null for none). When set,
     *   `generateMarkdown` calls its `filterContent(html)` method.
     * @param options Generation options; merged over `{ contentSource: 'cleaned_html' }`.
     */
    constructor(contentFilter = null, options = {}) {
        this.contentFilter = contentFilter;
        this.options = Object.assign({ contentSource: 'cleaned_html' }, options);
    }

    /**
     * Converts inline links and images to reference style.
     *
     * Each distinct (resolved) URL gets a number in order of first appearance.
     * A link `[text](url)` becomes `text⟨n⟩`, an image `![text](url)` becomes
     * `![text⟨n⟩]`, and the references section lists one line per URL in the
     * form `⟨n⟩ url: title - text`. The description after the URL is built from
     * the first occurrence of that URL only.
     *
     * @param markdown Markdown text.
     * @param baseUrl Base URL for resolving relative links.
     * @returns A tuple `[convertedMarkdown, referencesSection]`; the references
     *   section is an empty string when the text contains no links.
     */
    convertLinksToRefs(markdown, baseUrl = '') {
        const linkMap = new Map();
        const urlCache = new Map(); // Cache for resolved URLs
        const parts = [];
        let lastEnd = 0;
        let counter = 1;
        let match;
        // LINK_PATTERN is a shared global regex: always start scanning from the
        // beginning, even if a previous scan was abandoned part-way through.
        LINK_PATTERN.lastIndex = 0;
        while ((match = LINK_PATTERN.exec(markdown)) !== null) {
            parts.push(markdown.slice(lastEnd, match.index));
            const [fullMatch, text, url, title] = match;
            let resolvedUrl = url;
            const isAbsolute = url.startsWith('http://') ||
                url.startsWith('https://') ||
                url.startsWith('mailto:') ||
                url.startsWith('data:');
            if (baseUrl && !isAbsolute) {
                if (!urlCache.has(url)) {
                    urlCache.set(url, fastUrlJoin(baseUrl, url));
                }
                resolvedUrl = urlCache.get(url) || url;
            }
            if (!linkMap.has(resolvedUrl)) {
                const descriptions = [];
                if (title)
                    descriptions.push(title);
                // Add text as description only if it's different from title and not empty
                if (text && text.trim() && text !== title)
                    descriptions.push(text.trim());
                linkMap.set(resolvedUrl, {
                    refNum: counter,
                    description: descriptions.length ? ': ' + descriptions.join(' - ') : ''
                });
                counter++;
            }
            const linkInfo = linkMap.get(resolvedUrl);
            if (linkInfo) {
                // Use ⟨n⟩ format for references to avoid conflict with square brackets in code
                const refText = text.trim() || resolvedUrl; // Use URL as text if text is empty
                if (!fullMatch.startsWith('!')) {
                    // Regular link
                    parts.push(`${refText}⟨${linkInfo.refNum}⟩`);
                }
                else {
                    // Image
                    parts.push(`![${refText}⟨${linkInfo.refNum}⟩]`);
                }
            }
            lastEnd = match.index + fullMatch.length;
        }
        parts.push(markdown.slice(lastEnd));
        const convertedText = parts.join('');
        if (linkMap.size === 0) {
            return [convertedText, '']; // No links, no references section
        }
        const references = ['\n\n## References\n\n'];
        // Sort links by their reference number for consistent output
        const sortedLinks = Array.from(linkMap.entries()).sort((a, b) => a[1].refNum - b[1].refNum);
        for (const [url, linkInfo] of sortedLinks) {
            // The marker must match the inline ⟨n⟩ form so references can be looked up.
            references.push(`⟨${linkInfo.refNum}⟩ ${url}${linkInfo.description}\n`);
        }
        return [convertedText, references.join('')];
    }

    /**
     * Generates Markdown from HTML.
     *
     * Failures are reported inside the result rather than thrown: each stage
     * (conversion, citations, filtering) catches its own errors and stores an
     * error message in the affected field(s).
     *
     * @param inputHtml Input HTML string (a falsy value is treated as '').
     * @param baseUrl Base URL for resolving relative links.
     * @param html2textOptions Options for HTML to text conversion; these take
     *   precedence over the instance options and the built-in defaults.
     * @param contentFilterOverride Optional override for the content filter.
     * @param citations Whether to generate citations for links.
     * @returns A promise of the result object with the fields `rawMarkdown`,
     *   `markdownWithCitations`, `referencesMarkdown`, `fitMarkdown` (falls
     *   back to `rawMarkdown` when empty) and `fitHtml`.
     */
    generateMarkdown(inputHtml, baseUrl = '', html2textOptions = {}, contentFilterOverride = null, citations = true) {
        return __awaiter(this, void 0, void 0, function* () {
            try {
                const h = new html2text_1.CustomHtml2Text(baseUrl);
                const defaultOptions = {
                    bodyWidth: 0,
                    ignoreEmphasis: false,
                    ignoreLinks: false,
                    ignoreImages: false,
                    protectLinks: false,
                    singleLineBreak: true,
                    markCode: true,
                    escapeSnob: false,
                };
                // Precedence (lowest to highest): defaults, instance options, per-call options.
                const finalOptions = Object.assign(Object.assign(Object.assign({}, defaultOptions), this.options), html2textOptions);
                h.updateParams(finalOptions);
                const effectiveHtml = inputHtml || '';
                let rawMarkdown;
                try {
                    rawMarkdown = yield h.handle(effectiveHtml);
                }
                catch (e) {
                    rawMarkdown = `Error converting HTML to markdown: ${e instanceof Error ? e.message : String(e)}`;
                }
                // Basic cleanup for Markdown (e.g., extra spaces before code blocks)
                rawMarkdown = rawMarkdown.replace(/^\s*```/gm, '```');
                let markdownWithCitations = rawMarkdown;
                let referencesMarkdown = '';
                if (citations) {
                    try {
                        [markdownWithCitations, referencesMarkdown] = this.convertLinksToRefs(rawMarkdown, baseUrl);
                    }
                    catch (e) {
                        // Preserve raw markdown if citation generation fails
                        referencesMarkdown = `\n\nError generating citations: ${e instanceof Error ? e.message : String(e)}`;
                    }
                }
                let fitMarkdown = '';
                let fitHtml = '';
                const filterToUse = contentFilterOverride || this.contentFilter;
                if (filterToUse) {
                    try {
                        const filteredChunks = yield filterToUse.filterContent(effectiveHtml);
                        // Join the chunks directly; CustomHtml2Text is expected to handle block elements.
                        fitHtml = filteredChunks.join('\n');
                        fitMarkdown = yield h.handle(fitHtml);
                        fitMarkdown = fitMarkdown.replace(/^\s*```/gm, '```'); // Cleanup for fitMarkdown as well
                    }
                    catch (e) {
                        fitMarkdown = `Error generating fit markdown: ${e instanceof Error ? e.message : String(e)}`;
                        fitHtml = `Error during HTML filtering for fit content: ${e instanceof Error ? e.message : String(e)}`;
                    }
                }
                return {
                    rawMarkdown: rawMarkdown || '',
                    markdownWithCitations: markdownWithCitations || '',
                    referencesMarkdown: referencesMarkdown || '',
                    fitMarkdown: fitMarkdown || rawMarkdown,
                    fitHtml: fitHtml || ''
                };
            }
            catch (e) {
                const errorMsg = `Error in markdown generation: ${e instanceof Error ? e.message : String(e)}`;
                return {
                    rawMarkdown: errorMsg,
                    markdownWithCitations: errorMsg,
                    referencesMarkdown: '',
                    fitMarkdown: errorMsg,
                    fitHtml: errorMsg
                };
            }
        });
    }
}
exports.DefaultMarkdownGenerator = DefaultMarkdownGenerator;