UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

304 lines 12.5 kB
/** * GAIA Tool: visit_webpage — iter-57 (upgraded from iter-54 base) * * Fetches the FULL readable text content of a webpage URL — not just snippets. * This closes the HAL parity gap identified in iter-51 surrender analysis * (~25-35% of L1 questions need full page reading that snippet-only tools miss). * * Upgrade over iter-54 base (iter-57 changes): * - max_chars input parameter (default 50k, was hardcoded 8k) * - Test hooks interface (visitWebpageTestHooks) for unit tests without live HTTP * - URL validation with helpful error messages * - Link extraction (up to 30 links) — enables follow-up navigation * - Better HTML extraction: content-block heuristics (main/article selection) * - AIDefence PII gate (optional dep, graceful skip if not installed) * - Structured VisitWebpageResult type + diagnostic stderr logging * - Python bs4 primary path preserved (already in benchmark env) * * Extraction pipeline: * 1. fetch() + Python bs4.get_text() (primary — better quality, handles encoding) * 2. Regex-based fallback (no external dep) if bs4 unavailable or output too short * 3. Content passed through AIDefence PII gate before returning to agent * * Refs: ADR-138, ADR-133, iter-54, iter-57, #2156 */ import { execFileSync } from 'node:child_process'; // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- const DEFAULT_MAX_CHARS = 50_000; const MIN_CONTENT_CHARS = 50; const REQUEST_TIMEOUT_MS = 30_000; const MAX_LINKS = 30; const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'; export const visitWebpageTestHooks = {}; // --------------------------------------------------------------------------- // URL validation // --------------------------------------------------------------------------- function validateUrl(url) { if (!url || typeof url !== 'string') { throw new Error('visit_webpage: `url` must be a non-empty string.'); } if (!url.startsWith('http://') && !url.startsWith('https://')) { throw new Error(`visit_webpage: URL must start with http:// or https://. Got: "${url}"`); } try { new URL(url); } catch { throw new Error(`visit_webpage: invalid URL: "${url}"`); } } // --------------------------------------------------------------------------- // Page fetcher (uses Node built-in fetch — Node 18+) // --------------------------------------------------------------------------- async function fetchPage(url) { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS); let res; try { res = await fetch(url, { headers: { 'User-Agent': UA, Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.9', }, signal: controller.signal, redirect: 'follow', }); } finally { clearTimeout(timer); } if (!res.ok) { throw new Error(`visit_webpage: HTTP ${res.status} for ${url}`); } const ct = res.headers.get('content-type') ?? ''; if (ct.includes('application/pdf')) { throw new Error(`visit_webpage: URL returns a PDF. Use file_read with the downloaded path instead. URL: ${url}`); } const html = await res.text(); const finalUrl = res.url || url; return { html, finalUrl }; } // --------------------------------------------------------------------------- // HTML extraction helpers // --------------------------------------------------------------------------- /** * Extract title from raw HTML. */ function extractTitle(html) { const match = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html); if (!match) return ''; return match[1] .replace(/<[^>]+>/g, '') .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>') .replace(/&quot;/g, '"').replace(/&nbsp;/g, ' ') .trim(); } /** * Extract anchor links from HTML (up to MAX_LINKS). */ function extractLinks(html, baseUrl) { const links = []; const anchorRe = /<a[^>]+href="([^"#][^"]*)"[^>]*>([\s\S]*?)<\/a>/gi; let match; let base; try { base = new URL(baseUrl); } catch { return links; } while ((match = anchorRe.exec(html)) !== null && links.length < MAX_LINKS) { const rawHref = match[1].trim(); const rawText = match[2].replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim().slice(0, 120); if (!rawText || rawHref.startsWith('javascript:') || rawHref.startsWith('mailto:')) continue; let href; try { href = new URL(rawHref, base.origin + base.pathname).href; } catch { continue; } links.push({ href, text: rawText }); } return links; } /** * Primary extraction: Python bs4.get_text(). * bs4 is available in the benchmark env (already installed for gaia-codeagent). * HTML is passed via stdin to avoid shell-escaping issues. */ function extractTextViaPython(html) { const script = [ 'import sys, re', 'from bs4 import BeautifulSoup', 'html = sys.stdin.read()', "soup = BeautifulSoup(html, 'html.parser')", "for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):", ' tag.decompose()', "text = soup.get_text(separator='\\n', strip=True)", "text = re.sub(r'\\n{3,}', '\\n\\n', text)", 'print(text)', ].join('\n'); try { return execFileSync('python3', ['-'], { input: script + '\n' + html, encoding: 'utf-8', timeout: 15_000, env: { ...process.env, PYTHONIOENCODING: 'utf-8' }, maxBuffer: 10 * 1024 * 1024, }).trim(); } catch { return ''; } } /** * Fallback: regex-based HTML stripping (no external dep, less accurate). */ function extractTextViaRegex(html) { return html .replace(/<script[\s\S]*?<\/script>/gi, '') .replace(/<style[\s\S]*?<\/style>/gi, '') .replace(/<nav[\s\S]*?<\/nav>/gi, '') .replace(/<footer[\s\S]*?<\/footer>/gi, '') .replace(/<header[\s\S]*?<\/header>/gi, '') .replace(/<\/?(p|div|section|article|main|li|h[1-6]|br)[^>]*>/gi, '\n') .replace(/<[^>]+>/g, ' ') .replace(/&amp;/g, '&').replace(/&lt;/g, '<').replace(/&gt;/g, '>') .replace(/&quot;/g, '"').replace(/&nbsp;/g, ' ').replace(/&#x27;/g, "'") .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(Number(n))) .replace(/ +/g, ' ') .replace(/\n{4,}/g, '\n\n\n') .replace(/^\s+|\s+$/gm, '') .trim(); } // --------------------------------------------------------------------------- // AIDefence PII gate (optional — graceful skip if not installed) // --------------------------------------------------------------------------- async function applyPiiGate(content) { try { const aidefence = await import('@claude-flow/aidefence').catch(() => null); if (!aidefence) return content; const { hasPii } = aidefence; if (typeof hasPii !== 'function') return content; const detected = await hasPii(content); if (detected) { process.stderr.write('[visit_webpage] AIDefence: PII detected in page content (not blocked)\n'); } } catch { // AIDefence unavailable — continue without gate } return content; } // --------------------------------------------------------------------------- // Core visit logic (exported for unit testing via visitWebpageTestHooks) // --------------------------------------------------------------------------- export async function visitWebpage(url, maxChars = DEFAULT_MAX_CHARS) { const sourcesConsulted = [url]; const { html, finalUrl } = visitWebpageTestHooks.fetchHtml ? await visitWebpageTestHooks.fetchHtml(url) : await fetchPage(url); if (finalUrl !== url) sourcesConsulted.push(finalUrl); const title = extractTitle(html); const links = extractLinks(html, finalUrl); // Primary: Python bs4; fallback: regex let content = extractTextViaPython(html); if (!content || content.length < MIN_CONTENT_CHARS) { content = extractTextViaRegex(html); } content = await applyPiiGate(content); const truncated = content.length > maxChars; if (truncated) content = content.slice(0, maxChars); return { title, content, links: links.slice(0, MAX_LINKS), sources_consulted: sourcesConsulted, chars_returned: content.length, truncated, }; } // --------------------------------------------------------------------------- // Format output for Claude // --------------------------------------------------------------------------- function formatResult(result, url) { const lines = []; lines.push(`[visit_webpage: ${url}]`); if (result.title) lines.push(`Title: ${result.title}`); if (result.sources_consulted.length > 1) { lines.push(`Redirected to: ${result.sources_consulted[result.sources_consulted.length - 1]}`); } if (result.truncated) lines.push(`[Content truncated at ${result.chars_returned} chars]`); lines.push(''); lines.push(result.content || '[No readable content extracted]'); if (result.links.length > 0) { lines.push(''); lines.push(`Links (${result.links.length}):`); for (const link of result.links) lines.push(` ${link.text}${link.href}`); } return lines.join('\n'); } // --------------------------------------------------------------------------- // GaiaTool implementation // --------------------------------------------------------------------------- export class VisitWebpageTool { name = 'visit_webpage'; definition = { name: 'visit_webpage', description: 'Fetch the FULL text content of a webpage URL. Unlike web_search (snippets only) ' + 'or grounded_query (synthesised summary), visit_webpage returns the complete readable ' + "text — equivalent to HAL's visit_webpage tool. Use this when you need to read a " + 'specific Wikipedia article, documentation page, government site, or any web page ' + 'in full to extract detailed facts, tables, or text not available from snippets. ' + 'Returns: page title, full text content (up to max_chars), and up to 30 extracted links. ' + `Default max_chars: ${DEFAULT_MAX_CHARS}. Maximum: 100,000.`, input_schema: { type: 'object', properties: { url: { type: 'string', description: 'The full URL (http:// or https://) of the webpage to fetch.', }, max_chars: { type: 'number', description: `Maximum characters of content to return (default: ${DEFAULT_MAX_CHARS}, max: 100000).`, }, }, required: ['url'], }, }; async execute(input) { const url = String(input['url'] ?? '').trim(); validateUrl(url); const maxChars = Math.min(Math.max(500, Number(input['max_chars'] ?? DEFAULT_MAX_CHARS)), 100_000); process.stderr.write(`[visit_webpage] url=${url} max_chars=${maxChars}\n`); const result = await visitWebpage(url, maxChars); process.stderr.write(`[visit_webpage] title=${JSON.stringify(result.title)} ` + `chars=${result.chars_returned} truncated=${result.truncated} ` + `links=${result.links.length}\n`); if (!result.content || result.content.length < MIN_CONTENT_CHARS) { return `[visit_webpage: page at ${url} returned no readable text content]`; } return formatResult(result, url); } } // --------------------------------------------------------------------------- // Convenience factory // --------------------------------------------------------------------------- export function createVisitWebpageTool() { return new VisitWebpageTool(); } //# sourceMappingURL=visit_webpage.js.map