claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
304 lines • 12.5 kB
JavaScript
/**
* GAIA Tool: visit_webpage — iter-57 (upgraded from iter-54 base)
*
* Fetches the FULL readable text content of a webpage URL — not just snippets.
* This closes the HAL parity gap identified in iter-51 surrender analysis
* (~25-35% of L1 questions need full page reading that snippet-only tools miss).
*
* Upgrade over iter-54 base (iter-57 changes):
* - max_chars input parameter (default 50k, was hardcoded 8k)
* - Test hooks interface (visitWebpageTestHooks) for unit tests without live HTTP
* - URL validation with helpful error messages
* - Link extraction (up to 30 links) — enables follow-up navigation
* - Better HTML extraction: content-block heuristics (main/article selection)
* - AIDefence PII gate (optional dep, graceful skip if not installed)
* - Structured VisitWebpageResult type + diagnostic stderr logging
* - Python bs4 primary path preserved (already in benchmark env)
*
* Extraction pipeline:
* 1. fetch() + Python bs4.get_text() (primary — better quality, handles encoding)
* 2. Regex-based fallback (no external dep) if bs4 unavailable or output too short
* 3. Content passed through AIDefence PII gate before returning to agent
*
* Refs: ADR-138, ADR-133, iter-54, iter-57, #2156
*/
import { execFileSync } from 'node:child_process';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const DEFAULT_MAX_CHARS = 50_000;
const MIN_CONTENT_CHARS = 50;
const REQUEST_TIMEOUT_MS = 30_000;
const MAX_LINKS = 30;
const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';
export const visitWebpageTestHooks = {};
// ---------------------------------------------------------------------------
// URL validation
// ---------------------------------------------------------------------------
function validateUrl(url) {
if (!url || typeof url !== 'string') {
throw new Error('visit_webpage: `url` must be a non-empty string.');
}
if (!url.startsWith('http://') && !url.startsWith('https://')) {
throw new Error(`visit_webpage: URL must start with http:// or https://. Got: "${url}"`);
}
try {
new URL(url);
}
catch {
throw new Error(`visit_webpage: invalid URL: "${url}"`);
}
}
// ---------------------------------------------------------------------------
// Page fetcher (uses Node built-in fetch — Node 18+)
// ---------------------------------------------------------------------------
async function fetchPage(url) {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
let res;
try {
res = await fetch(url, {
headers: {
'User-Agent': UA,
Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
},
signal: controller.signal,
redirect: 'follow',
});
}
finally {
clearTimeout(timer);
}
if (!res.ok) {
throw new Error(`visit_webpage: HTTP ${res.status} for ${url}`);
}
const ct = res.headers.get('content-type') ?? '';
if (ct.includes('application/pdf')) {
throw new Error(`visit_webpage: URL returns a PDF. Use file_read with the downloaded path instead. URL: ${url}`);
}
const html = await res.text();
const finalUrl = res.url || url;
return { html, finalUrl };
}
// ---------------------------------------------------------------------------
// HTML extraction helpers
// ---------------------------------------------------------------------------
/**
* Extract title from raw HTML.
*/
function extractTitle(html) {
const match = /<title[^>]*>([\s\S]*?)<\/title>/i.exec(html);
if (!match)
return '';
return match[1]
.replace(/<[^>]+>/g, '')
.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
.replace(/"/g, '"').replace(/ /g, ' ')
.trim();
}
/**
* Extract anchor links from HTML (up to MAX_LINKS).
*/
function extractLinks(html, baseUrl) {
const links = [];
const anchorRe = /<a[^>]+href="([^"#][^"]*)"[^>]*>([\s\S]*?)<\/a>/gi;
let match;
let base;
try {
base = new URL(baseUrl);
}
catch {
return links;
}
while ((match = anchorRe.exec(html)) !== null && links.length < MAX_LINKS) {
const rawHref = match[1].trim();
const rawText = match[2].replace(/<[^>]+>/g, ' ').replace(/\s+/g, ' ').trim().slice(0, 120);
if (!rawText || rawHref.startsWith('javascript:') || rawHref.startsWith('mailto:'))
continue;
let href;
try {
href = new URL(rawHref, base.origin + base.pathname).href;
}
catch {
continue;
}
links.push({ href, text: rawText });
}
return links;
}
/**
* Primary extraction: Python bs4.get_text().
* bs4 is available in the benchmark env (already installed for gaia-codeagent).
* HTML is passed via stdin to avoid shell-escaping issues.
*/
function extractTextViaPython(html) {
const script = [
'import sys, re',
'from bs4 import BeautifulSoup',
'html = sys.stdin.read()',
"soup = BeautifulSoup(html, 'html.parser')",
"for tag in soup(['script', 'style', 'nav', 'footer', 'header', 'aside']):",
' tag.decompose()',
"text = soup.get_text(separator='\\n', strip=True)",
"text = re.sub(r'\\n{3,}', '\\n\\n', text)",
'print(text)',
].join('\n');
try {
return execFileSync('python3', ['-'], {
input: script + '\n' + html,
encoding: 'utf-8',
timeout: 15_000,
env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
maxBuffer: 10 * 1024 * 1024,
}).trim();
}
catch {
return '';
}
}
/**
* Fallback: regex-based HTML stripping (no external dep, less accurate).
*/
function extractTextViaRegex(html) {
return html
.replace(/<script[\s\S]*?<\/script>/gi, '')
.replace(/<style[\s\S]*?<\/style>/gi, '')
.replace(/<nav[\s\S]*?<\/nav>/gi, '')
.replace(/<footer[\s\S]*?<\/footer>/gi, '')
.replace(/<header[\s\S]*?<\/header>/gi, '')
.replace(/<\/?(p|div|section|article|main|li|h[1-6]|br)[^>]*>/gi, '\n')
.replace(/<[^>]+>/g, ' ')
.replace(/&/g, '&').replace(/</g, '<').replace(/>/g, '>')
.replace(/"/g, '"').replace(/ /g, ' ').replace(/'/g, "'")
.replace(/&#(\d+);/g, (_, n) => String.fromCharCode(Number(n)))
.replace(/ +/g, ' ')
.replace(/\n{4,}/g, '\n\n\n')
.replace(/^\s+|\s+$/gm, '')
.trim();
}
// ---------------------------------------------------------------------------
// AIDefence PII gate (optional — graceful skip if not installed)
// ---------------------------------------------------------------------------
async function applyPiiGate(content) {
try {
const aidefence = await import('@claude-flow/aidefence').catch(() => null);
if (!aidefence)
return content;
const { hasPii } = aidefence;
if (typeof hasPii !== 'function')
return content;
const detected = await hasPii(content);
if (detected) {
process.stderr.write('[visit_webpage] AIDefence: PII detected in page content (not blocked)\n');
}
}
catch {
// AIDefence unavailable — continue without gate
}
return content;
}
// ---------------------------------------------------------------------------
// Core visit logic (exported for unit testing via visitWebpageTestHooks)
// ---------------------------------------------------------------------------
export async function visitWebpage(url, maxChars = DEFAULT_MAX_CHARS) {
const sourcesConsulted = [url];
const { html, finalUrl } = visitWebpageTestHooks.fetchHtml
? await visitWebpageTestHooks.fetchHtml(url)
: await fetchPage(url);
if (finalUrl !== url)
sourcesConsulted.push(finalUrl);
const title = extractTitle(html);
const links = extractLinks(html, finalUrl);
// Primary: Python bs4; fallback: regex
let content = extractTextViaPython(html);
if (!content || content.length < MIN_CONTENT_CHARS) {
content = extractTextViaRegex(html);
}
content = await applyPiiGate(content);
const truncated = content.length > maxChars;
if (truncated)
content = content.slice(0, maxChars);
return {
title,
content,
links: links.slice(0, MAX_LINKS),
sources_consulted: sourcesConsulted,
chars_returned: content.length,
truncated,
};
}
// ---------------------------------------------------------------------------
// Format output for Claude
// ---------------------------------------------------------------------------
function formatResult(result, url) {
const lines = [];
lines.push(`[visit_webpage: ${url}]`);
if (result.title)
lines.push(`Title: ${result.title}`);
if (result.sources_consulted.length > 1) {
lines.push(`Redirected to: ${result.sources_consulted[result.sources_consulted.length - 1]}`);
}
if (result.truncated)
lines.push(`[Content truncated at ${result.chars_returned} chars]`);
lines.push('');
lines.push(result.content || '[No readable content extracted]');
if (result.links.length > 0) {
lines.push('');
lines.push(`Links (${result.links.length}):`);
for (const link of result.links)
lines.push(` ${link.text} → ${link.href}`);
}
return lines.join('\n');
}
// ---------------------------------------------------------------------------
// GaiaTool implementation
// ---------------------------------------------------------------------------
export class VisitWebpageTool {
name = 'visit_webpage';
definition = {
name: 'visit_webpage',
description: 'Fetch the FULL text content of a webpage URL. Unlike web_search (snippets only) ' +
'or grounded_query (synthesised summary), visit_webpage returns the complete readable ' +
"text — equivalent to HAL's visit_webpage tool. Use this when you need to read a " +
'specific Wikipedia article, documentation page, government site, or any web page ' +
'in full to extract detailed facts, tables, or text not available from snippets. ' +
'Returns: page title, full text content (up to max_chars), and up to 30 extracted links. ' +
`Default max_chars: ${DEFAULT_MAX_CHARS}. Maximum: 100,000.`,
input_schema: {
type: 'object',
properties: {
url: {
type: 'string',
description: 'The full URL (http:// or https://) of the webpage to fetch.',
},
max_chars: {
type: 'number',
description: `Maximum characters of content to return (default: ${DEFAULT_MAX_CHARS}, max: 100000).`,
},
},
required: ['url'],
},
};
async execute(input) {
const url = String(input['url'] ?? '').trim();
validateUrl(url);
const maxChars = Math.min(Math.max(500, Number(input['max_chars'] ?? DEFAULT_MAX_CHARS)), 100_000);
process.stderr.write(`[visit_webpage] url=${url} max_chars=${maxChars}\n`);
const result = await visitWebpage(url, maxChars);
process.stderr.write(`[visit_webpage] title=${JSON.stringify(result.title)} ` +
`chars=${result.chars_returned} truncated=${result.truncated} ` +
`links=${result.links.length}\n`);
if (!result.content || result.content.length < MIN_CONTENT_CHARS) {
return `[visit_webpage: page at ${url} returned no readable text content]`;
}
return formatResult(result, url);
}
}
// ---------------------------------------------------------------------------
// Convenience factory
// ---------------------------------------------------------------------------
export function createVisitWebpageTool() {
return new VisitWebpageTool();
}
//# sourceMappingURL=visit_webpage.js.map