claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
210 lines • 8.61 kB
JavaScript
/**
* GAIA Tool: web_search — ADR-133-PR2
*
* Scrapes DuckDuckGo HTML search results for a query string and returns
* the top-N snippet titles + URLs as a plain-text block. No API key
* required; uses DDG's HTML endpoint which is publicly accessible.
*
* Design notes:
* - Uses the native Node.js https module (no external fetch polyfill).
* - POSTs the query as a form submission to DDG's HTML endpoint: https://html.duckduckgo.com/html/
* - Parses result titles + URLs via a simple regex (no DOM parser dependency).
* - Rate-limit aware: 1-second back-off between calls is the caller's
* responsibility (the agent loop enforces this in PR-3).
* - PDF / binary detection is handled by file_read.ts, not here.
*
* Refs: ADR-133, #2156
*/
import * as https from 'node:https';
// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
const DDG_HTML_URL = 'https://html.duckduckgo.com/html/';
const DEFAULT_MAX_RESULTS = 5;
const REQUEST_TIMEOUT_MS = 20_000;
// User-Agent that DDG accepts (plain browser UA).
const UA = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36';
// ---------------------------------------------------------------------------
// HTML fetch helper
// ---------------------------------------------------------------------------
/**
 * Buffer an HTTP response stream and decode it as UTF-8.
 *
 * @param {import('node:http').IncomingMessage} res - Response to drain.
 * @returns {Promise<string>} The complete response body.
 */
function readResponseText(res) {
  return new Promise((resolve, reject) => {
    const chunks = [];
    res.on('data', (chunk) => chunks.push(chunk));
    res.on('end', () => resolve(Buffer.concat(chunks).toString('utf-8')));
    res.on('error', reject);
  });
}

/**
 * POST to DuckDuckGo's HTML search endpoint and return the raw HTML string.
 * DDG blocks GET for automated scrapers but accepts POST form submissions.
 *
 * At most one redirect is followed (with a GET). The redirect target may be
 * absolute or relative to the endpoint, but must resolve to an https URL.
 *
 * @param {string} query - Search query; URL-encoded into the form body.
 * @returns {Promise<string>} Raw HTML of the results page.
 * @throws {Error} On a non-200 final status, a non-https or unparsable
 *   redirect target, a network error, or a timeout.
 */
async function fetchDdgHtml(query) {
  const endpoint = new URL(DDG_HTML_URL);
  const body = `q=${encodeURIComponent(query)}&b=&kl=&df=`;
  const bodyBytes = Buffer.from(body, 'utf-8');

  return new Promise((resolve, reject) => {
    // Same timeout policy for the initial POST and the redirect follow-up.
    const armTimeout = (request) => {
      request.setTimeout(REQUEST_TIMEOUT_MS, () => {
        request.destroy(new Error(`web_search timeout after ${REQUEST_TIMEOUT_MS}ms`));
      });
    };

    // Terminal response: only a 200 carries a usable results page.
    const handleFinalResponse = (res) => {
      if (res.statusCode !== 200) {
        res.resume();
        reject(new Error(`DDG returned HTTP ${res.statusCode ?? 'unknown'}`));
        return;
      }
      readResponseText(res).then(resolve, reject);
    };

    const options = {
      hostname: endpoint.hostname,
      path: endpoint.pathname,
      method: 'POST',
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded',
        'Content-Length': bodyBytes.length,
        'User-Agent': UA,
        Accept: 'text/html,application/xhtml+xml',
        'Accept-Language': 'en-US,en;q=0.9',
      },
    };

    const req = https.request(options, (res) => {
      const { statusCode } = res;
      const location = res.headers.location;
      const isRedirect =
        statusCode !== undefined && statusCode >= 300 && statusCode < 400 && Boolean(location);
      if (!isRedirect) {
        handleFinalResponse(res);
        return;
      }

      // Follow a single redirect (DDG occasionally redirects to /html/).
      res.resume();
      let target = null;
      try {
        // Resolving against the endpoint lets relative targets such as
        // "/html/" work; absolute targets are left unchanged.
        target = new URL(location, endpoint);
      } catch {
        target = null;
      }
      if (target === null || target.protocol !== 'https:') {
        reject(new Error(`Unexpected redirect target: ${location}`));
        return;
      }

      const followUp = https.get(target, { headers: { 'User-Agent': UA } }, handleFinalResponse);
      followUp.on('error', reject);
      armTimeout(followUp);
    });

    req.on('error', reject);
    armTimeout(req);
    req.write(bodyBytes);
    req.end();
  });
}
// ---------------------------------------------------------------------------
// HTML parser (regex-based, no DOM)
// ---------------------------------------------------------------------------
/**
 * Extract up to `maxResults` search results from DDG HTML.
 *
 * Expected markup for each result:
 *   <a class="result__a" href="URL">TITLE</a>
 *   ...intervening markup...
 *   <a class="result__snippet">SNIPPET</a>
 *
 * Parsing is regex-based so that no HTML parser dependency is needed. Title
 * anchors are located first; each result's snippet is then looked up only in
 * the HTML between its title anchor and the next one, so a snippet can never
 * be attributed to the wrong result and intervening markup is tolerated.
 *
 * @param {string} html - Raw HTML of a DDG results page.
 * @param {number} maxResults - Upper bound on the number of results returned.
 * @returns {Array<{title: string, url: string, snippet: string}>} Results in
 *   page order; `snippet` is an empty string when none was found. Entries
 *   with an empty title or URL are skipped.
 */
function parseDdgHtml(html, maxResults) {
  const titleAnchorRe = /<a[^>]+class="result__a"[^>]*href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/g;
  const snippetAnchorRe = /<a[^>]+class="result__snippet"[^>]*>([\s\S]*?)<\/a>/;

  const anchors = [...html.matchAll(titleAnchorRe)];
  const results = [];

  for (let i = 0; i < anchors.length && results.length < maxResults; i += 1) {
    const anchor = anchors[i];
    const rawUrl = anchor[1] ?? '';
    const rawTitle = anchor[2] ?? '';

    // The snippet, if any, sits between this title anchor and the next one.
    const segmentStart = anchor.index + anchor[0].length;
    const segmentEnd = i + 1 < anchors.length ? anchors[i + 1].index : html.length;
    const snippetMatch = snippetAnchorRe.exec(html.slice(segmentStart, segmentEnd));
    const rawSnippet = snippetMatch?.[1] ?? '';

    // DDG wraps URLs in //duckduckgo.com/l/?uddg=ENCODED_URL
    const url = decodeRawUrl(rawUrl);
    const title = stripHtml(rawTitle).trim();
    const snippet = stripHtml(rawSnippet).trim();

    if (url && title) {
      results.push({ title, url, snippet });
    }
  }

  return results;
}
/**
 * Decode a DDG redirect link back to the real destination URL.
 *
 * Input example: //duckduckgo.com/l/?uddg=https%3A%2F%2Fexample.com%2F&rut=…
 *
 * The redirect link is recognised with or without an explicit http(s)
 * scheme. The `uddg` parameter is only matched at a parameter boundary
 * (`?`, `&`, or the `;` that ends an HTML-escaped ampersand), so a parameter
 * whose name merely ends in "uddg" is not mistaken for it.
 *
 * @param {string} raw - The href value taken from a result anchor.
 * @returns {string} The decoded destination URL. `raw` is returned unchanged
 *   when it is not a DDG redirect link, has no `uddg` parameter, or the
 *   parameter is not valid percent-encoding.
 */
function decodeRawUrl(raw) {
  const isDdgRedirect = /^(?:https?:)?\/\/duckduckgo\.com\/l\//.test(raw);
  if (!isDdgRedirect) {
    // Direct URL (some results skip the redirect).
    return raw;
  }

  const param = /[?&;]uddg=([^&]*)/.exec(raw);
  if (param === null) {
    return raw;
  }

  try {
    return decodeURIComponent(param[1]);
  } catch {
    // Malformed percent-encoding: fall back to the untouched href.
    return raw;
  }
}
/**
 * Strip HTML tags, decode common character references, and collapse
 * whitespace.
 *
 * Decoded references: the named entities amp, lt, gt, quot, apos and nbsp,
 * plus decimal and hexadecimal numeric references. Decoding happens in a
 * single pass, so text that is itself an escaped entity is decoded exactly
 * once. Unknown named entities and out-of-range numeric references are left
 * untouched.
 *
 * @param {string} html - HTML fragment to convert to plain text.
 * @returns {string} Plain text with whitespace runs collapsed to one space
 *   and leading/trailing whitespace removed.
 */
function stripHtml(html) {
  const namedEntities = new Map([
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['apos', "'"],
    ['nbsp', ' '],
  ]);

  const decodeEntity = (entity, body) => {
    if (body[0] !== '#') {
      return namedEntities.get(body) ?? entity;
    }
    const isHex = body[1] === 'x' || body[1] === 'X';
    const codePoint = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
    // String.fromCodePoint throws above U+10FFFF; keep such input verbatim.
    return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : entity;
  };

  return html
    .replace(/<[^>]+>/g, '')
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, decodeEntity)
    .replace(/\s+/g, ' ')
    .trim();
}
// ---------------------------------------------------------------------------
// Format output for Claude
// ---------------------------------------------------------------------------
/**
 * Render parsed search results as a numbered plain-text listing.
 *
 * Each entry has a "[n] title" line, a URL line and, when the snippet is
 * non-empty, a snippet line. Entries are separated by a blank line.
 *
 * @param {Array<{title: string, url: string, snippet: string}>} results
 * @returns {string} The listing, or a fixed message when `results` is empty.
 */
function formatResults(results) {
  if (results.length === 0) {
    return 'No results found.';
  }
  const entries = [];
  results.forEach((result, index) => {
    const lines = [`[${index + 1}] ${result.title}`, ` URL: ${result.url}`];
    if (result.snippet) {
      lines.push(` ${result.snippet}`);
    }
    entries.push(lines.join('\n'));
  });
  return entries.join('\n\n');
}
// ---------------------------------------------------------------------------
// GaiaTool implementation
// ---------------------------------------------------------------------------
/**
 * GAIA tool that searches the web through DuckDuckGo's HTML endpoint and
 * returns the top results as a plain-text block.
 */
export class WebSearchTool {
  /** Tool identifier; matches `definition.name`. */
  name = 'web_search';

  /** Tool definition: name, description, and JSON schema of the input. */
  definition = {
    name: 'web_search',
    description: 'Search the web using DuckDuckGo and return the top results (title, URL, snippet). ' +
      'Use this when you need current information, external facts, or to verify claims.',
    input_schema: {
      type: 'object',
      properties: {
        query: {
          type: 'string',
          description: 'The search query string.',
        },
        max_results: {
          type: 'number',
          description: `Maximum number of results to return (default: ${DEFAULT_MAX_RESULTS}, max: 10).`,
        },
      },
      required: ['query'],
    },
  };

  /**
   * Run a search and format the results.
   *
   * @param {{query?: unknown, max_results?: unknown}} input - Tool input.
   *   `query` is coerced to a string and trimmed. `max_results` is coerced
   *   to a number and clamped to the range 1..10; a missing or non-numeric
   *   value falls back to the default.
   * @returns {Promise<string>} Formatted results, or "No results found.".
   * @throws {Error} When `query` is missing or blank, or when the fetch fails.
   */
  async execute(input) {
    // Optional chaining turns a null/undefined input into the validation
    // error below rather than a raw TypeError.
    const query = String(input?.['query'] ?? '').trim();
    if (!query) {
      throw new Error('web_search: `query` input is required and must be non-empty.');
    }

    // A non-numeric value coerces to NaN, which would otherwise survive the
    // clamp and make the parser return zero results.
    const requested = Number(input['max_results'] ?? DEFAULT_MAX_RESULTS);
    const effective = Number.isNaN(requested) ? DEFAULT_MAX_RESULTS : requested;
    const maxResults = Math.min(Math.max(1, effective), 10);

    const html = await fetchDdgHtml(query);
    const results = parseDdgHtml(html, maxResults);
    return formatResults(results);
  }
}
// ---------------------------------------------------------------------------
// Convenience factory
// ---------------------------------------------------------------------------
/**
 * Convenience factory for the web_search tool.
 *
 * @returns {WebSearchTool} A newly constructed tool instance.
 */
export function createWebSearchTool() {
  const tool = new WebSearchTool();
  return tool;
}
//# sourceMappingURL=web_search.js.map