pomljs
Version:
Prompt Orchestration Markup Language
128 lines (126 loc) • 4.28 kB
JavaScript
import * as React from 'react';
import * as fs from 'fs';
import { component, useWithCatch, expandRelative } from '../base.js';
import { Text } from '../essentials.js';
import * as cheerio from 'cheerio';
import { htmlToPoml } from './document.js';
/**
 * Downloads a webpage and returns its body as text.
 *
 * @param {string} url - Address passed straight to the global `fetch`.
 * @returns {Promise<string>} The response body decoded by `response.text()`.
 * @throws {Error} When `fetch` rejects, when the response status is not OK,
 *   or when reading the body fails. The thrown error's message embeds the
 *   URL and the stringified underlying error, and the underlying error object
 *   itself is available as `error.cause`.
 */
async function fetchWebpage(url) {
  try {
    const response = await fetch(url);
    if (!response.ok) {
      // Deliberately thrown inside the try: it is wrapped by the catch below
      // so every failure mode surfaces with the same "Error fetching…" prefix.
      throw new Error(`HTTP error! Status: ${response.status}`);
    }
    return await response.text();
  } catch (error) {
    // Keep the original error attached so its stack and type are not lost;
    // the message text is unchanged for callers that match on it.
    throw new Error(`Error fetching webpage from ${url}: ${error}`, { cause: error });
  }
}
/**
 * Reduces an HTML document to plain text.
 *
 * `<script>` and `<style>` elements are removed before any text is read.
 *
 * @param {string} html - Markup handed to `cheerio.load`.
 * @param {string} [selector] - Optional CSS selector. When given, the text of
 *   each matching element is collected and joined with a blank line.
 * @returns {Promise<string>} Without a selector: the trimmed text of `<body>`.
 *   With a selector: the joined text of the matches, or a
 *   "No elements found…" message string when nothing matches.
 * @throws {Error} When evaluating the selector (or reading the matches) throws;
 *   the message names the selector and the stringified underlying error.
 */
async function extractTextFromHtml(html, selector) {
  const $ = cheerio.load(html);

  // Strip non-content elements first so their source never leaks into the text.
  for (const tag of ['script', 'style']) {
    $(tag).remove();
  }

  // No selector: fall back to the whole body.
  if (!selector) {
    return $('body').text().trim() || '';
  }

  try {
    const matched = $(selector);
    if (matched.length === 0) {
      return `No elements found matching selector: ${selector}`;
    }
    const pieces = matched.map((_index, node) => $(node).text()).get();
    return pieces.join('\n\n');
  } catch (error) {
    throw new Error(`Error with selector "${selector}": ${error}`);
  }
}
/**
 * Obtains HTML from one of the supported sources and converts it into
 * renderable content.
 *
 * Source precedence is `url` (fetched via `fetchWebpage`), then `src` (read
 * synchronously from disk after `expandRelative`), then `buffer` (a string is
 * used as-is; any other value is decoded with `toString('utf-8')`).
 *
 * @param {object} props - Component props. The entire object is forwarded to
 *   `htmlToPoml` in the structured-conversion path.
 * @param {string} [props.url] - Page address to download.
 * @param {string} [props.src] - Path to a local HTML file.
 * @param {string|Buffer} [props.buffer] - In-memory HTML.
 * @param {boolean} [props.extractText=false] - When truthy, return plain text
 *   instead of converting the markup.
 * @param {string} [props.selector] - CSS selector limiting the content used.
 * @returns {Promise<*>} A `Text` element holding the extracted text or the
 *   "No elements found…" notice; otherwise whatever `htmlToPoml` returns.
 * @throws {Error} When none of `url`, `src`, or `buffer` is truthy.
 */
async function processWebpage(props) {
  const { src, url, buffer, extractText = false, selector } = props;

  let html;
  if (url) {
    html = await fetchWebpage(url);
  } else if (src) {
    html = fs.readFileSync(expandRelative(src), 'utf-8');
  } else if (buffer) {
    html = typeof buffer === 'string' ? buffer : buffer.toString('utf-8');
  } else {
    throw new Error('Either url, src, or buffer must be provided');
  }

  // Plain-text mode: whitespace is preserved so the paragraph breaks survive.
  if (extractText) {
    const plainText = await extractTextFromHtml(html, selector);
    return React.createElement(Text, { whiteSpace: 'pre' }, plainText);
  }

  // Structured mode: hand the chosen subtree to htmlToPoml.
  const $ = cheerio.load(html);
  if (!selector) {
    return htmlToPoml($('body'), $, props);
  }

  const matches = $(selector);
  if (matches.length === 0) {
    return React.createElement(Text, null, 'No elements found matching selector: ', selector);
  }
  return htmlToPoml(matches, $, props);
}
/**
 * Displays content from a webpage.
 *
 * @param {string} url - The URL of the webpage to fetch and display.
 * @param {string} src - Local file path to an HTML file to display.
 * @param {string|Buffer} buffer - HTML content as string or buffer. Cannot be combined with `base64`.
 * @param {string} base64 - Base64 encoded HTML content. Cannot be combined with `buffer`.
 * @param {boolean} extractText - Whether to extract plain text content (true) or convert HTML to structured POML (false). Default is false.
 * @param {string} selector - CSS selector to extract specific content from the page (e.g., "article", ".content", "#main"). Default is "body".
 *
 * @see {@link Inline} for other props available.
 *
 * @example
 * Display content from a URL:
 * ```xml
 * <webpage url="https://example.com" />
 * ```
 *
 * Extract only specific content using a selector:
 * ```xml
 * <webpage url="https://example.com" selector="main article" />
 * ```
 *
 * Convert HTML to structured POML components:
 * ```xml
 * <webpage url="https://example.com" extractText="false" />
 * ```
 */
// NOTE(review): the option key is spelled `asynchorous` here; presumably that
// is the spelling `component` expects — confirm against base.js before changing.
component('Webpage', { asynchorous: true })((props) => {
  // The webpage-specific props are pulled out only so that `others` holds
  // nothing but the remaining props forwarded to useWithCatch and Text.
  const { src, url, buffer, base64, extractText, selector, ...others } = props;

  let htmlBuffer = buffer;
  if (base64) {
    if (htmlBuffer !== undefined) {
      throw new Error('Either buffer or base64 should be provided, not both.');
    }
    htmlBuffer = Buffer.from(base64, 'base64');
  }

  const pending = processWebpage({ ...props, buffer: htmlBuffer });
  const content = useWithCatch(pending, others);
  return React.createElement(Text, { ...others }, content ?? null);
});
//# sourceMappingURL=webpage.js.map