UNPKG

doc-to-readable

Version:

Universal document-to-markdown and section splitter for HTML, URLs, and PDFs.

39 lines (31 loc) 1.44 kB
import { parseDomFromString } from './parse-dom.js';
import { convertToMarkdown } from './markdown-converter.js';
import { extractArticleFromDom } from './extract-article.js';

const DEFAULT_MAX_SIZE_BYTES = 2 * 1024 * 1024; // 2MB

/**
 * Report whether `text`, encoded as UTF-8, is larger than `maxBytes`.
 *
 * `String.prototype.length` counts UTF-16 code units, not bytes, so it
 * under-reports the size of any non-ASCII content. Two cheap bounds avoid
 * encoding the string in the common cases:
 *   - every code unit encodes to at least 1 byte, so `length > maxBytes`
 *     is always too large;
 *   - every code unit encodes to at most 3 bytes (astral characters use two
 *     code units for 4 bytes), so `length * 3 <= maxBytes` always fits.
 *
 * @param {string} text - The string to measure.
 * @param {number} maxBytes - The byte limit.
 * @returns {boolean} True when the UTF-8 size exceeds `maxBytes`.
 */
function exceedsByteLimit(text, maxBytes) {
  if (text.length > maxBytes) {
    return true;
  }
  if (text.length * 3 <= maxBytes) {
    return false;
  }
  // Ambiguous range: measure the real encoded size.
  return new TextEncoder().encode(text).length > maxBytes;
}

/**
 * Convert an HTML string to a markdown article, extracting the main content and title.
 *
 * The text of the first `<title>` element, when non-empty, is prepended to the
 * result as a level-1 markdown heading. The title is taken from the raw HTML
 * with a regular expression, so character references such as `&amp;` are
 * passed through as written.
 *
 * @param {string} htmlText - The HTML string to process.
 * @param {Object} [options] - { maxSizeBytes }
 * @param {number} [options.maxSizeBytes] - Maximum accepted input size in
 *   UTF-8 bytes. Falsy values (including 0) fall back to the 2MB default.
 * @returns {Promise<string>} Markdown
 * @throws {TypeError} If `htmlText` is not a string.
 * @throws {Error} If the input is larger than the size limit.
 */
export async function htmlToArticle(htmlText, options = {}) {
  if (typeof htmlText !== 'string') {
    const received = htmlText === null ? 'null' : typeof htmlText;
    throw new TypeError(`[htmlToArticle] Expected an HTML string but received ${received}.`);
  }

  // `||` (not `??`) is deliberate: a limit of 0 has always meant "use the default".
  // The `options &&` guard covers an explicit `null`, which the default
  // parameter value does not replace.
  const maxSize = (options && options.maxSizeBytes) || DEFAULT_MAX_SIZE_BYTES;

  if (exceedsByteLimit(htmlText, maxSize)) {
    console.debug(`[htmlToArticle] Input HTML exceeds ${maxSize} bytes. Use the bulk option for large files.`);
    throw new Error(`Input HTML is too large (max ${maxSize} bytes). Please use the bulk option for large files.`);
  }

  // 1. Extract title before sanitization.
  //    `[^<]*` also matches line breaks, so whitespace runs are collapsed to
  //    keep the title on the single line a markdown heading requires.
  const titleMatch = htmlText.match(/<title[^>]*>([^<]*)<\/title>/i);
  const title = titleMatch ? titleMatch[1].replace(/\s+/g, ' ').trim() : '';

  // 2. Parse HTML string to DOM
  const doc = await parseDomFromString(htmlText);

  // 3. Extract readable article
  const readableHtmlArticle = extractArticleFromDom(doc);

  // 4. Convert to Markdown
  const markdown = convertToMarkdown(readableHtmlArticle);

  // 5. Prepend title if it exists
  return title ? `# ${title}\n\n${markdown}` : markdown;
}