UNPKG

capsule-ai-cli

Version:

The AI Model Orchestrator - Intelligent multi-model workflows with device-locked licensing

299 lines • 12 kB
import { BaseTool } from '../base.js';
import { URL } from 'url';
import { convert } from 'html-to-text';
import { parse as parseHtml } from 'node-html-parser';

/** User-Agent sent when the caller does not supply one. */
const DEFAULT_USER_AGENT = 'Mozilla/5.0 (compatible; CapsuleCLI/1.0)';

/** Options passed to html-to-text: drop links' hrefs and skip page chrome / non-content tags. */
const HTML_TO_TEXT_OPTIONS = {
  wordwrap: false,
  selectors: [
    { selector: 'a', options: { ignoreHref: true } },
    { selector: 'img', format: 'skip' },
    { selector: 'script', format: 'skip' },
    { selector: 'style', format: 'skip' },
    { selector: 'nav', format: 'skip' },
    { selector: 'header', format: 'skip' },
    { selector: 'footer', format: 'skip' },
  ],
  limits: { maxChildNodes: 10000 },
};

/**
 * Read the `content` attribute of the first `<meta>` element matching a CSS query.
 *
 * @param {object} doc - Document returned by node-html-parser's `parse`.
 * @param {string} cssQuery - CSS selector for the meta element.
 * @returns {string} The attribute value, or '' when the element or attribute is missing.
 */
function readMetaContent(doc, cssQuery) {
  return doc.querySelector(cssQuery)?.getAttribute('content') || '';
}

/**
 * Collect de-duplicated absolute http(s) URLs from an attribute of all matching elements.
 *
 * @param {object} doc - Document returned by node-html-parser's `parse`.
 * @param {string} cssQuery - CSS selector choosing the elements (e.g. 'a[href]').
 * @param {string} attribute - Attribute holding the (possibly relative) URL.
 * @param {string} baseUrl - Base used to resolve relative URLs.
 * @param {number} limit - Maximum number of URLs returned.
 * @returns {string[]} Unique absolute URLs in document order, at most `limit` entries.
 */
function collectAbsoluteUrls(doc, cssQuery, attribute, baseUrl, limit) {
  const resolved = doc.querySelectorAll(cssQuery).map((element) => {
    const raw = element.getAttribute(attribute) || '';
    try {
      return new URL(raw, baseUrl).href;
    } catch {
      // Unresolvable values are kept as-is; the scheme filter below discards them.
      return raw;
    }
  });
  const httpOnly = resolved.filter(
    (candidate) => candidate && (candidate.startsWith('http://') || candidate.startsWith('https://')),
  );
  return [...new Set(httpOnly)].slice(0, limit);
}

/**
 * Tool that fetches a URL and returns its content as JSON, HTML-derived data, or plain text.
 *
 * For HTML responses it can additionally extract readable text, page metadata,
 * elements matching a CSS selector, links, and image URLs.
 */
export class WebFetchTool extends BaseTool {
  name = 'web_fetch';
  displayName = '🌐 Web Fetch';
  description = 'Fetch and extract content from web pages - HTML, JSON, text, with CSS selectors';
  category = 'web';
  icon = '🌐';
  parameters = [
    {
      name: 'url',
      type: 'string',
      description: 'URL to fetch',
      required: true,
    },
    {
      name: 'method',
      type: 'string',
      description: 'HTTP method',
      required: false,
      default: 'GET',
      enum: ['GET', 'POST', 'PUT', 'DELETE', 'HEAD'],
    },
    {
      name: 'headers',
      type: 'object',
      description: 'HTTP headers as key-value pairs',
      required: false,
    },
    {
      name: 'body',
      type: 'string',
      description: 'Request body (for POST/PUT)',
      required: false,
    },
    {
      name: 'extractText',
      type: 'boolean',
      description: 'Extract text content from HTML',
      required: false,
      default: true,
    },
    {
      name: 'selector',
      type: 'string',
      description: 'CSS selector to extract specific content',
      required: false,
    },
    {
      name: 'followRedirects',
      type: 'boolean',
      description: 'Follow HTTP redirects',
      required: false,
      default: true,
    },
    {
      name: 'timeout',
      type: 'number',
      description: 'Request timeout in milliseconds',
      required: false,
      default: 30000,
    },
    {
      name: 'extractMetadata',
      type: 'boolean',
      description: 'Extract page metadata (title, description, etc)',
      required: false,
      default: false,
    },
    {
      name: 'maxContentLength',
      type: 'number',
      description: 'Maximum content length to fetch (bytes)',
      required: false,
      default: 1000000,
    },
    {
      name: 'extractLinks',
      type: 'boolean',
      description: 'Extract all links from the page',
      required: false,
      default: false,
    },
    {
      name: 'extractImages',
      type: 'boolean',
      description: 'Extract all image URLs from the page',
      required: false,
      default: false,
    },
    {
      name: 'userAgent',
      type: 'string',
      description: 'Custom User-Agent header',
      required: false,
      default: 'Mozilla/5.0 (compatible; CapsuleCLI/1.0)',
    },
  ];
  permissions = {
    network: true,
  };
  ui = {
    showProgress: true,
    collapsible: true,
    dangerous: false,
  };

  /**
   * Fetch `params.url` and build a result object describing the response.
   *
   * @param {object} params - Tool arguments; see `parameters` for names and defaults.
   * @param {object} context - Passed through to `this.reportProgress` (inherited; not defined in this file).
   * @returns {Promise<object>} Result with `url`, `status`, `statusText`, `headers`,
   *   `contentType`, `contentLength`, `type` ('json' | 'html' | 'text'), the
   *   type-specific payload fields, plus `summary` and `display`.
   * @throws {Error} `Invalid URL: …` when the URL cannot be parsed,
   *   `Request timed out after …ms` when the deadline elapses, and
   *   `Failed to fetch …` for any other failure (including non-2xx statuses).
   */
  async run(params, context) {
    const {
      url,
      method = 'GET',
      headers = {},
      body,
      extractText = true,
      selector,
      followRedirects = true,
      timeout = 30000,
      extractMetadata = false,
      maxContentLength = 1000000,
      extractLinks = false,
      extractImages = false,
      userAgent = DEFAULT_USER_AGENT,
    } = params;

    try {
      new URL(url);
    } catch (error) {
      throw new Error(`Invalid URL: ${url}`, { cause: error });
    }

    this.reportProgress(context, `Fetching ${url}...`);

    try {
      const fetchOptions = {
        method,
        // Caller-supplied headers win over the default User-Agent.
        headers: { 'User-Agent': userAgent, ...headers },
        redirect: followRedirects ? 'follow' : 'manual',
        // NOTE(review): `timeout` is presumably for node-fetch-style implementations;
        // the abort signal below is what enforces the deadline otherwise.
        timeout,
        signal: AbortSignal.timeout(timeout),
      };
      if (body && (method === 'POST' || method === 'PUT')) {
        fetchOptions.body = body;
      }

      const response = await fetch(url, fetchOptions);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const contentType = response.headers.get('content-type') || '';
      const isHtml = contentType.includes('text/html');
      const isJson = contentType.includes('application/json');

      // Reject early when the server declares a body larger than allowed.
      const declaredLength = Number.parseInt(response.headers.get('content-length') || '0', 10);
      if (declaredLength > maxContentLength) {
        throw new Error(`Content too large: ${declaredLength} bytes (max: ${maxContentLength})`);
      }

      const text = await response.text();
      const truncatedText = text.substring(0, maxContentLength);
      // Relative links must resolve against the final URL after any redirects.
      const baseUrl = response.url || url;

      const result = {
        url: response.url,
        status: response.status,
        statusText: response.statusText,
        headers: Object.fromEntries(response.headers.entries()),
        contentType,
        // Length of the decoded text (string length), measured before truncation.
        contentLength: text.length,
      };

      if (isJson) {
        try {
          result.data = JSON.parse(truncatedText);
          result.type = 'json';
        } catch (e) {
          // Deliberate best-effort: malformed (or truncated) JSON is returned as text.
          result.content = truncatedText;
          result.type = 'text';
        }
      } else if (isHtml) {
        result.type = 'html';
        const doc = parseHtml(truncatedText);

        if (extractText) {
          result.text = convert(truncatedText, HTML_TO_TEXT_OPTIONS).substring(0, 50000);
        }

        if (extractMetadata) {
          const title = doc.querySelector('title')?.text || '';
          const description = readMetaContent(doc, 'meta[name="description"]');
          const ogTitle = readMetaContent(doc, 'meta[property="og:title"]');
          const ogDescription = readMetaContent(doc, 'meta[property="og:description"]');
          const ogImage = readMetaContent(doc, 'meta[property="og:image"]');
          const keywords = readMetaContent(doc, 'meta[name="keywords"]');
          result.metadata = {
            title: title || ogTitle || undefined,
            description: description || ogDescription || undefined,
            ogImage: ogImage || undefined,
            keywords: keywords ? keywords.split(',').map((k) => k.trim()) : undefined,
          };
        }

        if (selector) {
          // Recorded so createSummary can report which selector was used.
          result.selector = selector;
          const elements = doc.querySelectorAll(selector);
          if (elements.length > 0) {
            result.extracted = elements.map((el) => ({
              text: el.text,
              html: el.innerHTML,
              attributes: el.attributes,
            }));
          } else {
            result.extracted = [];
            result.warning = `No elements found matching selector: ${selector}`;
          }
        }

        if (extractLinks) {
          result.links = collectAbsoluteUrls(doc, 'a[href]', 'href', baseUrl, 100);
        }

        if (extractImages) {
          result.images = collectAbsoluteUrls(doc, 'img[src]', 'src', baseUrl, 50);
        }

        // With every extraction switched off, fall back to a raw HTML excerpt.
        if (!extractText && !extractMetadata && !selector && !extractLinks && !extractImages) {
          result.html = truncatedText.substring(0, 10000);
        }
      } else {
        result.type = 'text';
        result.content = truncatedText.substring(0, 50000);
      }

      const summary = this.createSummary(result);
      result.summary = summary;
      result.display = summary;
      return result;
    } catch (error) {
      // AbortSignal.timeout() aborts with a DOMException named 'TimeoutError'; the only
      // signal attached is the timeout one, so an 'AbortError' also means the deadline hit.
      // The `type` check is kept for fetch implementations that report timeouts that way.
      const timedOut =
        error?.name === 'TimeoutError' ||
        error?.name === 'AbortError' ||
        error?.type === 'request-timeout';
      if (timedOut) {
        throw new Error(`Request timed out after ${timeout}ms`, { cause: error });
      }
      throw new Error(`Failed to fetch ${url}: ${error.message}`, { cause: error });
    }
  }

  /**
   * Build the human-readable summary shown for a fetch result.
   *
   * @param {object} result - Result object as assembled by `run`.
   * @returns {string} Multi-line summary text.
   */
  createSummary(result) {
    let summary = `🌐 Fetched ${result.url}\n`;
    summary += `Status: ${result.status} ${result.statusText}\n`;
    summary += `Type: ${result.type} (${this.formatBytes(result.contentLength)})\n`;

    if (result.type === 'json') {
      const isContainer = result.data !== null && typeof result.data === 'object';
      const allKeys = isContainer ? Object.keys(result.data) : [];
      const shownKeys = allKeys.slice(0, 5);
      // Compare against the full key list so the ellipsis appears when keys were omitted.
      summary += `JSON Keys: ${shownKeys.join(', ')}${allKeys.length > 5 ? '...' : ''}\n`;
    } else if (result.type === 'html') {
      if (result.metadata?.title) {
        summary += `Title: ${result.metadata.title}\n`;
      }
      if (result.metadata?.description) {
        summary += `Description: ${result.metadata.description.substring(0, 100)}...\n`;
      }
      if (result.extracted) {
        summary += `Extracted: ${result.extracted.length} elements matching "${result.selector}"\n`;
      }
      if (result.links) {
        summary += `Links: ${result.links.length} found\n`;
      }
      if (result.images) {
        summary += `Images: ${result.images.length} found\n`;
      }
      if (result.text) {
        summary += `\nContent Preview:\n${result.text.substring(0, 300)}...\n`;
      }
    } else {
      summary += `Content: ${result.content?.substring(0, 200)}...\n`;
    }

    if (result.warning) {
      summary += `\n⚠️ ${result.warning}\n`;
    }
    return summary;
  }

  /**
   * Format a size as bytes, KB, or MB (1024-based, one decimal for KB/MB).
   *
   * @param {number} bytes - Size to format.
   * @returns {string} e.g. '512 bytes', '1.5 KB', '2.0 MB'.
   */
  formatBytes(bytes) {
    if (bytes < 1024) {
      return `${bytes} bytes`;
    }
    if (bytes < 1024 * 1024) {
      return `${(bytes / 1024).toFixed(1)} KB`;
    }
    return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
  }
}
//# sourceMappingURL=web-fetch.js.map