UNPKG

@oevortex/ddg_search

Version:

A Model Context Protocol server for web search using DuckDuckGo, IAsk AI, and Monica AI

323 lines (279 loc) 9.89 kB
import axios from 'axios'; import * as cheerio from 'cheerio'; import https from 'https'; import { getRandomUserAgent } from './user_agents.js'; // Constants const MAX_CACHE_PAGES = 5; const CACHE_DURATION = 5 * 60 * 1000; // 5 minutes const REQUEST_TIMEOUT = 10000; // 10 seconds // Cache results to avoid repeated requests const resultsCache = new Map(); // HTTPS agent configuration to handle certificate chain issues const httpsAgent = new https.Agent({ rejectUnauthorized: true, // Keep security enabled keepAlive: true, timeout: REQUEST_TIMEOUT, // Provide fallback for certificate issues while maintaining security secureProtocol: 'TLSv1_2_method' }); /** * Generate a cache key for a search query * @param {string} query - The search query * @returns {string} The cache key */ function getCacheKey(query) { return `${query}`; } /** * Clear old entries from the cache */ function clearOldCache() { const now = Date.now(); for (const [key, value] of resultsCache.entries()) { if (now - value.timestamp > CACHE_DURATION) { resultsCache.delete(key); } } } /** * Extract the direct URL from a DuckDuckGo redirect URL * @param {string} duckduckgoUrl - The DuckDuckGo URL to extract from * @returns {string} The direct URL */ function extractDirectUrl(duckduckgoUrl) { try { // Handle relative URLs from DuckDuckGo if (duckduckgoUrl.startsWith('//')) { duckduckgoUrl = 'https:' + duckduckgoUrl; } else if (duckduckgoUrl.startsWith('/')) { duckduckgoUrl = 'https://duckduckgo.com' + duckduckgoUrl; } const url = new URL(duckduckgoUrl); // Extract direct URL from DuckDuckGo redirect if (url.hostname === 'duckduckgo.com' && url.pathname === '/l/') { const uddg = url.searchParams.get('uddg'); if (uddg) { return decodeURIComponent(uddg); } } // Handle ad redirects if (url.hostname === 'duckduckgo.com' && url.pathname === '/y.js') { const u3 = url.searchParams.get('u3'); if (u3) { try { const decodedU3 = decodeURIComponent(u3); const u3Url = new URL(decodedU3); const clickUrl = u3Url.searchParams.get('ld'); if (clickUrl) { return decodeURIComponent(clickUrl); } return decodedU3; } catch { return duckduckgoUrl; } } } return duckduckgoUrl; } catch { // If URL parsing fails, try to extract URL from a basic string match const urlMatch = duckduckgoUrl.match(/https?:\/\/[^\s<>"]+/); if (urlMatch) { return urlMatch[0]; } return duckduckgoUrl; } } /** * Get a favicon URL for a given website URL * @param {string} url - The website URL * @returns {string} The favicon URL */ function getFaviconUrl(url) { try { const urlObj = new URL(url); return `https://www.google.com/s2/favicons?domain=${urlObj.hostname}&sz=32`; } catch { return ''; // Return empty string if URL is invalid } } /** * Generate a Jina AI URL for a given website URL * @param {string} url - The website URL * @returns {string} The Jina AI URL */ function getJinaAiUrl(url) { try { const urlObj = new URL(url); return `https://r.jina.ai/${urlObj.href}`; } catch { return ''; } } /** * Scrapes search results from DuckDuckGo HTML * @param {string} query - The search query * @param {number} numResults - Number of results to return (default: 10) * @param {string} mode - 'short' or 'detailed' mode (default: 'short') * @returns {Promise<Array>} - Array of search results */ async function searchDuckDuckGo(query, numResults = 10, mode = 'short') { try { // Input validation if (!query || typeof query !== 'string') { throw new Error('Invalid query: query must be a non-empty string'); } if (!Number.isInteger(numResults) || numResults < 1 || numResults > 20) { throw new Error('Invalid numResults: must be an integer between 1 and 20'); } if (!['short', 'detailed'].includes(mode)) { throw new Error('Invalid mode: must be "short" or "detailed"'); } // Clear old cache entries clearOldCache(); // Check cache first const cacheKey = getCacheKey(query); const cachedResults = resultsCache.get(cacheKey); if (cachedResults && Date.now() - cachedResults.timestamp < CACHE_DURATION) { console.log(`Cache hit for query: "${query}"`); return cachedResults.results.slice(0, numResults); } // Get a random user agent const userAgent = getRandomUserAgent(); console.log(`Searching DuckDuckGo for: "${query}" (${numResults} results, mode: ${mode})`); // Fetch results with timeout const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT); try { const response = await axios.get( `https://duckduckgo.com/html/?q=${encodeURIComponent(query)}`, { signal: controller.signal, headers: { 'User-Agent': userAgent }, httpsAgent: httpsAgent, timeout: REQUEST_TIMEOUT } ); clearTimeout(timeoutId); if (response.status !== 200) { throw new Error(`HTTP ${response.status}: Failed to fetch search results`); } const html = response.data; // Parse results using cheerio const $ = cheerio.load(html); const results = []; const jinaFetchPromises = []; $('.result').each((i, result) => { const $result = $(result); const titleEl = $result.find('.result__title a'); const linkEl = $result.find('.result__url'); const snippetEl = $result.find('.result__snippet'); const title = titleEl.text()?.trim(); const rawLink = titleEl.attr('href'); const description = snippetEl.text()?.trim(); const displayUrl = linkEl.text()?.trim(); const directLink = extractDirectUrl(rawLink || ''); const favicon = getFaviconUrl(directLink); const jinaUrl = getJinaAiUrl(directLink); if (title && directLink) { if (mode === 'detailed') { jinaFetchPromises.push( axios.get(jinaUrl, { headers: { 'User-Agent': getRandomUserAgent() }, httpsAgent: httpsAgent, timeout: 8000 }) .then(jinaRes => { let jinaContent = ''; if (jinaRes.status === 200 && typeof jinaRes.data === 'string') { const $jina = cheerio.load(jinaRes.data); jinaContent = $jina('body').text(); } return { title, url: directLink, snippet: description || '', favicon: favicon, displayUrl: displayUrl || '', description: jinaContent }; }) .catch(() => { // Return fallback without content return { title, url: directLink, snippet: description || '', favicon: favicon, displayUrl: displayUrl || '', description: '' }; }) ); } else { // short mode: omit description jinaFetchPromises.push( Promise.resolve({ title, url: directLink, snippet: description || '', favicon: favicon, displayUrl: displayUrl || '' }) ); } } }); // Wait for all Jina AI fetches to complete with timeout const jinaResults = await Promise.race([ Promise.all(jinaFetchPromises), new Promise((_, reject) => setTimeout(() => reject(new Error('Content fetch timeout')), 15000) ) ]); results.push(...jinaResults); // Get limited results const limitedResults = results.slice(0, numResults); // Cache the results resultsCache.set(cacheKey, { results: limitedResults, timestamp: Date.now() }); // If cache is too big, remove oldest entries if (resultsCache.size > MAX_CACHE_PAGES) { const oldestKey = Array.from(resultsCache.keys())[0]; resultsCache.delete(oldestKey); } console.log(`Found ${limitedResults.length} results for query: "${query}"`); return limitedResults; } catch (fetchError) { clearTimeout(timeoutId); if (fetchError.name === 'AbortError') { throw new Error('Search request timeout: took longer than 10 seconds'); } if (fetchError.code === 'ENOTFOUND') { throw new Error('Network error: unable to resolve host'); } if (fetchError.code === 'ECONNREFUSED') { throw new Error('Network error: connection refused'); } throw fetchError; } } catch (error) { console.error('Error searching DuckDuckGo:', error.message); // Enhanced error reporting if (error.message.includes('Invalid')) { throw error; // Re-throw validation errors as-is } throw new Error(`Search failed for "${query}": ${error.message}`); } } export { searchDuckDuckGo, extractDirectUrl, getFaviconUrl, getJinaAiUrl };