UNPKG

pomljs

Version:

Prompt Orchestration Markup Language

128 lines (126 loc) 4.28 kB
import * as React from 'react';
import * as fs from 'fs';
import { component, useWithCatch, expandRelative } from '../base.js';
import { Text } from '../essentials.js';
import * as cheerio from 'cheerio';
import { htmlToPoml } from './document.js';

/**
 * Fetches a URL and returns the response body as text.
 *
 * @param {string} url - Address passed to the global `fetch`.
 * @returns {Promise<string>} The response body.
 * @throws {Error} When the request fails or the response status is not OK.
 *   The original failure is attached as `cause`.
 */
async function fetchWebpage(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    return await response.text();
  } catch (error) {
    // Keep the original error (and its stack) reachable through `cause`;
    // the message itself is unchanged.
    throw new Error(`Error fetching webpage from ${url}: ${error}`, { cause: error });
  }
}

/**
 * Extracts plain text from an HTML string after dropping `<script>` and
 * `<style>` elements.
 *
 * @param {string} html - HTML markup to parse.
 * @param {string} [selector] - Optional CSS selector. When given, the text of
 *   every matching element is joined with a blank line between matches.
 * @returns {Promise<string>} The extracted text. With a selector that matches
 *   nothing, a "No elements found matching selector" message is returned
 *   instead of throwing. Without a selector, the trimmed text of `<body>`.
 * @throws {Error} When evaluating the selector throws; the original failure is
 *   attached as `cause`.
 */
async function extractTextFromHtml(html, selector) {
  const $ = cheerio.load(html);

  // Remove scripts and styles so their source does not leak into the text.
  $('script').remove();
  $('style').remove();

  // If a selector is provided, extract content from the matching elements only.
  if (selector) {
    try {
      const elements = $(selector);
      if (elements.length === 0) {
        return `No elements found matching selector: ${selector}`;
      }
      return elements
        .map((_, el) => $(el).text())
        .get()
        .join('\n\n');
    } catch (error) {
      throw new Error(`Error with selector "${selector}": ${error}`, { cause: error });
    }
  }

  // No selector: fall back to the text of the whole body.
  return $('body').text().trim() || '';
}

/**
 * Loads HTML from one of `url`, `src` or `buffer` (checked in that order) and
 * turns it into renderable content.
 *
 * @param {object} props - Component props.
 * @param {string} [props.url] - URL to download the page from.
 * @param {string} [props.src] - Path to a local HTML file; passed through
 *   `expandRelative` before reading.
 * @param {string|Buffer} [props.buffer] - HTML content; non-string values are
 *   converted with `toString('utf-8')`.
 * @param {boolean} [props.extractText=false] - When true, return plain text in
 *   a `Text` element; otherwise delegate to `htmlToPoml`.
 * @param {string} [props.selector] - CSS selector restricting the content.
 * @returns {Promise<*>} A `Text` element (text mode, or when the selector
 *   matches nothing) or whatever `htmlToPoml` returns.
 * @throws {Error} When none of `url`, `src` or `buffer` is provided.
 */
async function processWebpage(props) {
  const { src, url, buffer, extractText = false, selector } = props;

  let html;
  if (url) {
    html = await fetchWebpage(url);
  } else if (src) {
    const filePath = expandRelative(src);
    html = fs.readFileSync(filePath, 'utf-8');
  } else if (buffer !== undefined && buffer !== null) {
    // Explicit null check: an empty string is still a provided (empty)
    // document and must not be reported as a missing source.
    html = typeof buffer === 'string' ? buffer : buffer.toString('utf-8');
  } else {
    throw new Error('Either url, src, or buffer must be provided');
  }

  if (extractText) {
    const text = await extractTextFromHtml(html, selector);
    return React.createElement(Text, { whiteSpace: "pre" }, text);
  }

  // Use the htmlToPoml function to convert HTML to POML components.
  const $ = cheerio.load(html);
  if (selector) {
    const selected = $(selector);
    if (selected.length === 0) {
      return React.createElement(Text, null, "No elements found matching selector: ", selector);
    }
    return htmlToPoml(selected, $, props);
  }
  return htmlToPoml($('body'), $, props);
}

/**
 * Displays content from a webpage.
 *
 * @param {string} url - The URL of the webpage to fetch and display.
 * @param {string} src - Local file path to an HTML file to display.
 * @param {string|Buffer} buffer - HTML content as string or buffer. Must not be combined with `base64`.
 * @param {string} base64 - Base64 encoded HTML content. Must not be combined with `buffer`.
 * @param {boolean} extractText - Whether to extract plain text content (true) or convert HTML to structured POML (false). Default is false.
 * @param {string} selector - CSS selector to extract specific content from the page (e.g., "article", ".content", "#main"). Default is "body".
 *
 * @see {@link Inline} for other props available.
 *
 * @example
 * Display content from a URL:
 * ```xml
 * <webpage url="https://example.com" />
 * ```
 *
 * Extract only specific content using a selector:
 * ```xml
 * <webpage url="https://example.com" selector="main article" />
 * ```
 *
 * Convert HTML to structured POML components:
 * ```xml
 * <webpage url="https://example.com" extractText="false" />
 * ```
 */
// NOTE(review): the option key is spelled `asynchorous`; presumably this is the
// spelling `component` in ../base.js expects — confirm before renaming.
component('Webpage', { asynchorous: true })((props) => {
  // src, url, extractText and selector are pulled out only so they are not
  // forwarded to the wrapping Text element through `others`.
  let { src, url, buffer, base64, extractText, selector, ...others } = props;
  if (base64) {
    if (buffer !== undefined) {
      throw new Error('Either buffer or base64 should be provided, not both.');
    }
    buffer = Buffer.from(base64, 'base64');
  }
  const content = useWithCatch(processWebpage({ ...props, buffer: buffer }), others);
  return React.createElement(Text, { ...others }, content ?? null);
});
//# sourceMappingURL=webpage.js.map