UNPKG

doc-to-readable

Version:

Universal document-to-markdown and section splitter for HTML, URLs, and PDFs.

73 lines (61 loc) 2.36 kB
import { pdfToHtmlFromBuffer } from './pdf-to-html.js';

// Universal HTML fetcher for browser and Node.js.
// Exports visible in this module: universalFetch, fetchHtml.

/**
 * Fetch a URL in either a browser or a Node.js environment.
 *
 * In a browser (detected via `window.document`) the request is always sent
 * with `mode: 'cors'` and `credentials: 'omit'`; these two settings override
 * whatever the caller passed in `options`. Outside a browser the caller's
 * options are forwarded unchanged to `globalThis.fetch`.
 *
 * @param {string} url - The URL to request.
 * @param {object} [options={}] - Options forwarded to `fetch`.
 * @returns {Promise<Response>} The response; only resolved when `response.ok` is true.
 * @throws {Error} When no fetch implementation exists, when the request fails,
 *   or when the response status is not OK. The error is logged via
 *   `console.debug` and then re-thrown unchanged.
 */
export async function universalFetch(url, options = {}) {
  try {
    const isBrowser =
      typeof window !== 'undefined' && typeof window.document !== 'undefined';

    let response;
    if (isBrowser) {
      // The fixed CORS settings are spread last so they win over caller options.
      response = await fetch(url, {
        ...options,
        mode: 'cors',
        credentials: 'omit',
      });
    } else {
      const fetchFn = globalThis.fetch;
      // There is no bundled polyfill to fall back to, so fail with an explicit
      // message instead of a confusing ReferenceError on a bare `fetch`.
      if (typeof fetchFn !== 'function') {
        throw new Error(
          'No fetch implementation available: globalThis.fetch is undefined',
        );
      }
      response = await fetchFn(url, options);
    }

    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    return response;
  } catch (error) {
    // Log for debugging, then re-throw so the calling code can handle it.
    console.debug(`[universalFetch] Error fetching ${url}:`, error.message);
    throw error;
  }
}

/**
 * Fetch a URL and return its content as text.
 *
 * HTML, XHTML and plain-text responses are returned via `res.text()`.
 * Responses whose content type contains `pdf` are read as an ArrayBuffer and
 * passed to `pdfToHtmlFromBuffer`; its result is returned as-is
 * (presumably an HTML string -- confirm against `./pdf-to-html.js`).
 *
 * @param {string} url - The URL to request.
 * @param {object} [options={}] - Options forwarded to `universalFetch`.
 * @returns {Promise<string>} The response body, or the converted PDF.
 * @throws {Error} When the fetch fails or the content type is unsupported.
 */
export async function fetchHtml(url, options = {}) {
  console.debug('fetchHtml', url, options);

  // universalFetch already throws on non-OK responses, so no status check is
  // needed here.
  const res = await universalFetch(url, options);

  const contentType = res.headers.get('content-type') || '';
  const normalizedType = contentType.toLowerCase();

  const isHtml =
    normalizedType.includes('text/html') ||
    normalizedType.includes('application/xhtml+xml');
  const isText = normalizedType.includes('text/plain');
  const isPdf = normalizedType.includes('pdf');

  if (!isHtml && !isPdf && !isText) {
    throw new Error(
      `Unsupported content type: ${contentType}. Only HTML, plain text and PDF are supported.`,
    );
  }

  if (isPdf) {
    const arrayBuffer = await res.arrayBuffer();
    const html = await pdfToHtmlFromBuffer(arrayBuffer);
    console.debug('html', html);
    return html;
  }

  return await res.text();
}