capsule-ai-cli
Version:
The AI Model Orchestrator - Intelligent multi-model workflows with device-locked licensing
299 lines • 12 kB
JavaScript
import { BaseTool } from '../base.js';
import { URL } from 'url';
import { convert } from 'html-to-text';
import { parse as parseHtml } from 'node-html-parser';
/**
 * Tool that issues a single HTTP request with `fetch` and returns a structured
 * description of the response.
 *
 * JSON responses are parsed into `data`, HTML responses can be reduced to
 * text, metadata, selector matches, links and images, and every other content
 * type is returned as (truncated) plain text.
 *
 * NOTE(review): the glyphs in `displayName`, `icon` and the summary prefixes
 * look like emoji that were decoded with the wrong charset. They are kept
 * byte-for-byte here; confirm the intended characters against the original
 * source before changing them.
 */
export class WebFetchTool extends BaseTool {
  name = 'web_fetch';
  displayName = 'š Web Fetch';
  description = 'Fetch and extract content from web pages - HTML, JSON, text, with CSS selectors';
  category = 'web';
  icon = 'š';

  // Parameter schema; the defaults listed here mirror the destructuring
  // defaults applied in run().
  parameters = [
    {
      name: 'url',
      type: 'string',
      description: 'URL to fetch',
      required: true
    },
    {
      name: 'method',
      type: 'string',
      description: 'HTTP method',
      required: false,
      default: 'GET',
      enum: ['GET', 'POST', 'PUT', 'DELETE', 'HEAD']
    },
    {
      name: 'headers',
      type: 'object',
      description: 'HTTP headers as key-value pairs',
      required: false
    },
    {
      name: 'body',
      type: 'string',
      description: 'Request body (for POST/PUT)',
      required: false
    },
    {
      name: 'extractText',
      type: 'boolean',
      description: 'Extract text content from HTML',
      required: false,
      default: true
    },
    {
      name: 'selector',
      type: 'string',
      description: 'CSS selector to extract specific content',
      required: false
    },
    {
      name: 'followRedirects',
      type: 'boolean',
      description: 'Follow HTTP redirects',
      required: false,
      default: true
    },
    {
      name: 'timeout',
      type: 'number',
      description: 'Request timeout in milliseconds',
      required: false,
      default: 30000
    },
    {
      name: 'extractMetadata',
      type: 'boolean',
      description: 'Extract page metadata (title, description, etc)',
      required: false,
      default: false
    },
    {
      name: 'maxContentLength',
      type: 'number',
      description: 'Maximum content length to fetch (bytes)',
      required: false,
      default: 1000000
    },
    {
      name: 'extractLinks',
      type: 'boolean',
      description: 'Extract all links from the page',
      required: false,
      default: false
    },
    {
      name: 'extractImages',
      type: 'boolean',
      description: 'Extract all image URLs from the page',
      required: false,
      default: false
    },
    {
      name: 'userAgent',
      type: 'string',
      description: 'Custom User-Agent header',
      required: false,
      default: 'Mozilla/5.0 (compatible; CapsuleCLI/1.0)'
    }
  ];

  permissions = {
    network: true
  };

  ui = {
    showProgress: true,
    collapsible: true,
    dangerous: false
  };

  /**
   * Fetch `params.url` and build a result object describing the response.
   *
   * @param {object} params - Tool arguments; see `parameters` for the schema.
   * @param {*} context - Passed through to `this.reportProgress`.
   * @returns {Promise<object>} Result with `url`, `status`, `statusText`,
   *   `headers`, `contentType`, `contentLength`, `type` and, depending on the
   *   content type and flags, `data`, `content`, `text`, `metadata`,
   *   `selector`, `extracted`, `links`, `images`, `html`, `warning`, plus
   *   `summary` / `display`.
   * @throws {Error} `Invalid URL: …` when the URL cannot be parsed,
   *   `Request timed out after …ms` on timeout, and `Failed to fetch …` for
   *   every other failure (including non-2xx statuses and oversized bodies).
   */
  async run(params, context) {
    const {
      url,
      method = 'GET',
      headers = {},
      body,
      extractText = true,
      selector,
      followRedirects = true,
      timeout = 30000,
      extractMetadata = false,
      maxContentLength = 1000000,
      extractLinks = false,
      extractImages = false,
      userAgent = 'Mozilla/5.0 (compatible; CapsuleCLI/1.0)'
    } = params;

    try {
      new URL(url);
    } catch {
      throw new Error(`Invalid URL: ${url}`);
    }

    this.reportProgress(context, `Fetching ${url}...`);

    try {
      // The abort signal is what enforces the timeout; `timeout` is not a
      // standard fetch option, so it is no longer passed along.
      const fetchOptions = {
        method,
        // Caller-supplied headers come last so they can override User-Agent.
        headers: {
          'User-Agent': userAgent,
          ...headers
        },
        redirect: followRedirects ? 'follow' : 'manual',
        signal: AbortSignal.timeout(timeout)
      };
      if (body && (method === 'POST' || method === 'PUT')) {
        fetchOptions.body = body;
      }

      const response = await fetch(url, fetchOptions);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }

      const contentType = response.headers.get('content-type') || '';
      const isHtml = contentType.includes('text/html');
      const isJson = contentType.includes('application/json');

      // Reject early when the server declares an oversized body. A missing or
      // unparsable header yields 0/NaN and therefore never trips this check.
      const declaredLength = Number.parseInt(response.headers.get('content-length') || '0', 10);
      if (declaredLength > maxContentLength) {
        throw new Error(`Content too large: ${declaredLength} bytes (max: ${maxContentLength})`);
      }

      const text = await response.text();
      const truncatedText = text.substring(0, maxContentLength);

      const result = {
        url: response.url,
        status: response.status,
        statusText: response.statusText,
        headers: Object.fromEntries(response.headers.entries()),
        contentType,
        // Length of the decoded string (UTF-16 code units), not raw bytes.
        contentLength: text.length
      };

      if (isJson) {
        try {
          result.data = JSON.parse(truncatedText);
          result.type = 'json';
        } catch {
          // Malformed (or truncated) JSON is handed back as plain text.
          result.content = truncatedText;
          result.type = 'text';
        }
      } else if (isHtml) {
        result.type = 'html';
        const doc = parseHtml(truncatedText);
        // Relative URLs must resolve against the final URL after redirects.
        const baseUrl = response.url || url;

        if (extractText) {
          result.text = convert(truncatedText, {
            wordwrap: false,
            selectors: [
              { selector: 'a', options: { ignoreHref: true } },
              { selector: 'img', format: 'skip' },
              { selector: 'script', format: 'skip' },
              { selector: 'style', format: 'skip' },
              { selector: 'nav', format: 'skip' },
              { selector: 'header', format: 'skip' },
              { selector: 'footer', format: 'skip' }
            ],
            limits: {
              maxChildNodes: 10000
            }
          }).substring(0, 50000);
        }

        if (extractMetadata) {
          const metaContent = (css) => doc.querySelector(css)?.getAttribute('content') || '';
          const title = doc.querySelector('title')?.text || '';
          const description = metaContent('meta[name="description"]');
          const ogTitle = metaContent('meta[property="og:title"]');
          const ogDescription = metaContent('meta[property="og:description"]');
          const ogImage = metaContent('meta[property="og:image"]');
          const keywords = metaContent('meta[name="keywords"]');
          result.metadata = {
            title: title || ogTitle || undefined,
            description: description || ogDescription || undefined,
            ogImage: ogImage || undefined,
            keywords: keywords ? keywords.split(',').map((keyword) => keyword.trim()) : undefined
          };
        }

        if (selector) {
          // Recorded so createSummary() can report which selector was used.
          result.selector = selector;
          result.extracted = doc.querySelectorAll(selector).map((element) => ({
            text: element.text,
            html: element.innerHTML,
            attributes: element.attributes
          }));
          if (result.extracted.length === 0) {
            result.warning = `No elements found matching selector: ${selector}`;
          }
        }

        if (extractLinks) {
          result.links = this.collectAbsoluteUrls(doc.querySelectorAll('a[href]'), 'href', baseUrl, 100);
        }
        if (extractImages) {
          result.images = this.collectAbsoluteUrls(doc.querySelectorAll('img[src]'), 'src', baseUrl, 50);
        }

        // With every extraction switched off, return a raw HTML excerpt.
        if (!extractText && !extractMetadata && !selector && !extractLinks && !extractImages) {
          result.html = truncatedText.substring(0, 10000);
        }
      } else {
        result.type = 'text';
        result.content = truncatedText.substring(0, 50000);
      }

      const summary = this.createSummary(result);
      result.summary = summary;
      result.display = summary;
      return result;
    } catch (error) {
      // AbortSignal.timeout() rejects with a DOMException named
      // "TimeoutError"; "AbortError" and the node-fetch style `type` are
      // accepted too so other fetch implementations are recognised.
      const timedOut = error?.name === 'TimeoutError' ||
        error?.name === 'AbortError' ||
        error?.type === 'request-timeout';
      if (timedOut) {
        throw new Error(`Request timed out after ${timeout}ms`, { cause: error });
      }
      throw new Error(`Failed to fetch ${url}: ${error.message}`, { cause: error });
    }
  }

  /**
   * Resolve one attribute of each element against `baseUrl`, keep only
   * http(s) results, de-duplicate them and cap the list.
   *
   * @param {Array} elements - Parsed elements exposing `getAttribute`.
   * @param {string} attribute - Attribute holding the URL (`href`, `src`).
   * @param {string} baseUrl - Base used to resolve relative URLs.
   * @param {number} limit - Maximum number of URLs to return.
   * @returns {string[]} Unique absolute URLs in document order.
   */
  collectAbsoluteUrls(elements, attribute, baseUrl, limit) {
    const resolved = elements
      .map((element) => {
        const raw = element.getAttribute(attribute) || '';
        try {
          return new URL(raw, baseUrl).href;
        } catch {
          // Unresolvable values fall through and are dropped by the filter
          // unless they already start with http(s).
          return raw;
        }
      })
      .filter((candidate) => candidate.startsWith('http://') || candidate.startsWith('https://'));
    return [...new Set(resolved)].slice(0, limit);
  }

  /**
   * Build the multi-line, human-readable summary for a result from run().
   *
   * @param {object} result - Result object as assembled by run().
   * @returns {string} Summary text; every line ends with a newline.
   */
  createSummary(result) {
    let summary = `š Fetched ${result.url}\n`;
    summary += `Status: ${result.status} ${result.statusText}\n`;
    summary += `Type: ${result.type} (${this.formatBytes(result.contentLength)})\n`;

    if (result.type === 'json') {
      // Count the keys before slicing so the ellipsis can actually appear.
      const hasKeys = result.data !== null && typeof result.data === 'object';
      const allKeys = hasKeys ? Object.keys(result.data) : [];
      const shownKeys = allKeys.slice(0, 5);
      summary += `JSON Keys: ${shownKeys.join(', ')}${allKeys.length > 5 ? '...' : ''}\n`;
    } else if (result.type === 'html') {
      if (result.metadata?.title) {
        summary += `Title: ${result.metadata.title}\n`;
      }
      if (result.metadata?.description) {
        summary += `Description: ${result.metadata.description.substring(0, 100)}...\n`;
      }
      if (result.extracted) {
        // Name the selector only when it is known, never as "undefined".
        const matching = result.selector ? ` matching "${result.selector}"` : '';
        summary += `Extracted: ${result.extracted.length} elements${matching}\n`;
      }
      if (result.links) {
        summary += `Links: ${result.links.length} found\n`;
      }
      if (result.images) {
        summary += `Images: ${result.images.length} found\n`;
      }
      if (result.text) {
        summary += `\nContent Preview:\n${result.text.substring(0, 300)}...\n`;
      }
    } else {
      summary += `Content: ${result.content?.substring(0, 200)}...\n`;
    }

    if (result.warning) {
      summary += `\nā ļø ${result.warning}\n`;
    }
    return summary;
  }

  /**
   * Format a size as "N bytes", "N.N KB" or "N.N MB" using 1024-based units.
   *
   * @param {number} bytes - Size to format.
   * @returns {string} Formatted size.
   */
  formatBytes(bytes) {
    if (bytes < 1024) {
      return `${bytes} bytes`;
    }
    if (bytes < 1024 * 1024) {
      return `${(bytes / 1024).toFixed(1)} KB`;
    }
    return `${(bytes / (1024 * 1024)).toFixed(1)} MB`;
  }
}
//# sourceMappingURL=web-fetch.js.map