UNPKG

doc-to-readable

Version:

Universal document-to-markdown and section splitter for HTML, URLs, and PDFs.

38 lines (36 loc) 1.42 kB
import { processDom } from './process-dom.js'; import { splitMarkdownByHeaders } from './split-markdown.js'; /** * Convert a URL or HTML/text to markdown. * @param {string} input - The URL or HTML/text. * @param {{ type: 'url' | 'html' }} options - Specify input type. * @returns {Promise<string>} Markdown */ export async function docToMarkdown(input, options = { type: 'html' }) { if (options.type === 'url') { return await processDom(input, null, { inline_title: true }); } else if (options.type === 'html') { return await processDom('', input, { inline_title: true }); } else if (options.type === 'markdown') { return input; } else { throw new Error('Not implemented for type: ' + options.type); } } /** * Convert a markdown text or document to a list of split sections (markdown chunks). * @param {string} text - The markdown or document input. * @param {{ type?: 'markdown' | 'url' | 'html' }} [options] - Specify input type. Defaults to 'markdown'. * @returns {Promise<Array<{section: string, title: string|null}>>} */ export async function splitReadableDocs(text, options = { type: 'markdown' }) { let markdown; if (options.type === 'markdown') { markdown = text; } else { markdown = await docToMarkdown(text, { type: options.type || 'html' }); } const sections = splitMarkdownByHeaders(markdown); // Return as { title, content } return sections; }