UNPKG

crawlforge-mcp-server

Version:

CrawlForge MCP Server - Professional Model Context Protocol server with 19 comprehensive web scraping, crawling, and content processing tools.

1,454 lines (1,342 loc) 69.4 kB
#!/usr/bin/env node // Secure Creator Mode Authentication - MUST run before any imports // Only the creator can enable unlimited access with their secret import crypto from 'crypto'; import dotenv from 'dotenv'; // Load .env file early to check for creator secret dotenv.config({ path: '.env', quiet: true }); const CREATOR_SECRET_HASH = 'cfef62e5068d48e7dd6a39c9e16f0be2615510c6b68274fc8abe3156feb5050b'; if (process.env.CRAWLFORGE_CREATOR_SECRET) { const providedHash = crypto .createHash('sha256') .update(process.env.CRAWLFORGE_CREATOR_SECRET) .digest('hex'); if (providedHash === CREATOR_SECRET_HASH) { process.env.CRAWLFORGE_CREATOR_MODE = 'true'; console.log('🔓 Creator Mode Enabled - Unlimited Access'); } else { console.warn('⚠️ Invalid creator secret provided'); } } // Now import everything else import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; import { z } from "zod"; import { load } from "cheerio"; import { SearchWebTool } from "./src/tools/search/searchWeb.js"; import { CrawlDeepTool } from "./src/tools/crawl/crawlDeep.js"; import { MapSiteTool } from "./src/tools/crawl/mapSite.js"; import { ExtractContentTool } from "./src/tools/extract/extractContent.js"; import { ProcessDocumentTool } from "./src/tools/extract/processDocument.js"; import { SummarizeContentTool } from "./src/tools/extract/summarizeContent.js"; import { AnalyzeContentTool } from "./src/tools/extract/analyzeContent.js"; // Wave 2 Advanced Tools import { BatchScrapeTool } from "./src/tools/advanced/BatchScrapeTool.js"; import { ScrapeWithActionsTool } from "./src/tools/advanced/ScrapeWithActionsTool.js"; // Deep Research Tool import { DeepResearchTool } from "./src/tools/research/deepResearch.js"; // Change Tracking Tool import { TrackChangesTool } from "./src/tools/tracking/trackChanges.js"; // LLMs.txt Generator Tool (Phase 2.5) import { GenerateLLMsTxtTool } from "./src/tools/llmstxt/generateLLMsTxt.js"; // Wave 3-4 Core Managers import { StealthBrowserManager } from "./src/core/StealthBrowserManager.js"; import { LocalizationManager } from "./src/core/LocalizationManager.js"; import { memoryMonitor } from "./src/utils/MemoryMonitor.js"; import { config, validateConfig, isSearchConfigured, getToolConfig, getActiveSearchProvider } from "./src/constants/config.js"; // Authentication Manager import AuthManager from "./src/core/AuthManager.js"; // Initialize Authentication Manager await AuthManager.initialize(); // Check if first time setup is needed (skip in creator mode) if (!AuthManager.isAuthenticated() && !AuthManager.isCreatorMode()) { const apiKey = process.env.CRAWLFORGE_API_KEY; if (apiKey) { // Auto-setup if API key is provided via environment console.log('🔧 Auto-configuring CrawlForge with provided API key...'); const success = await AuthManager.runSetup(apiKey); if (!success) { console.error('❌ Failed to authenticate with provided API key'); console.error('Please check your API key or run: npm run setup'); process.exit(1); } } else { console.log(''); console.log('╔═══════════════════════════════════════════════════════╗'); console.log('║ CrawlForge MCP Server - Setup Required ║'); console.log('╚═══════════════════════════════════════════════════════╝'); console.log(''); console.log('Welcome! This appears to be your first time using CrawlForge.'); console.log(''); console.log('To get started, please run:'); console.log(' npm run setup'); console.log(''); console.log('Or set your API key via environment variable:'); console.log(' export CRAWLFORGE_API_KEY="your_api_key_here"'); console.log(''); console.log('Get your free API key at: https://www.crawlforge.dev/signup'); console.log('(Includes 1,000 free credits!)'); console.log(''); process.exit(0); } } // Validate configuration const configErrors = validateConfig(); if (configErrors.length > 0 && config.server.nodeEnv === 'production') { console.error('Configuration errors:', configErrors); process.exit(1); } // Create the server const server = new McpServer({ name: "crawlforge", version: "3.0.1" }); // Helper function to wrap tool handlers with authentication and credit tracking function withAuth(toolName, handler) { return async (params) => { const startTime = Date.now(); try { // Skip credit checks in creator mode if (!AuthManager.isCreatorMode()) { // Check credits before executing const creditCost = AuthManager.getToolCost(toolName); const hasCredits = await AuthManager.checkCredits(creditCost); if (!hasCredits) { return { content: [{ type: "text", text: JSON.stringify({ error: "Insufficient credits", message: `This operation requires ${creditCost} credits. Please upgrade your plan at https://www.crawlforge.dev/pricing`, creditsRequired: creditCost }, null, 2) }] }; } } // Execute the tool const result = await handler(params); // Report usage for successful execution (skip in creator mode) const processingTime = Date.now() - startTime; if (!AuthManager.isCreatorMode()) { const creditCost = AuthManager.getToolCost(toolName); await AuthManager.reportUsage( toolName, creditCost, params, 200, processingTime ); } return result; } catch (error) { // Report usage even for errors (reduced credit cost) - skip in creator mode const processingTime = Date.now() - startTime; if (!AuthManager.isCreatorMode()) { await AuthManager.reportUsage( toolName, Math.max(1, Math.floor(AuthManager.getToolCost(toolName) * 0.5)), // Half credits for errors params, 500, processingTime ); } throw error; } }; } // Initialize tools let searchWebTool = null; if (isSearchConfigured()) { searchWebTool = new SearchWebTool(getToolConfig('search_web')); } const crawlDeepTool = new CrawlDeepTool(getToolConfig('crawl_deep')); const mapSiteTool = new MapSiteTool(getToolConfig('map_site')); // Initialize Phase 3 tools const extractContentTool = new ExtractContentTool(); const processDocumentTool = new ProcessDocumentTool(); const summarizeContentTool = new SummarizeContentTool(); const analyzeContentTool = new AnalyzeContentTool(); // Initialize Wave 2 Advanced Tools const batchScrapeTool = new BatchScrapeTool(); const scrapeWithActionsTool = new ScrapeWithActionsTool(); // Initialize Deep Research Tool const deepResearchTool = new DeepResearchTool(); // Initialize Change Tracking Tool const trackChangesTool = new TrackChangesTool(); // Initialize LLMs.txt Generator Tool (Phase 2.5) const generateLLMsTxtTool = new GenerateLLMsTxtTool(); // Initialize Wave 3-4 Core Managers const stealthBrowserManager = new StealthBrowserManager(); const localizationManager = new LocalizationManager(); // Zod schemas for tool parameters and responses const FetchUrlSchema = z.object({ url: z.string().url(), headers: z.record(z.string()).optional(), timeout: z.number().min(1000).max(30000).optional().default(10000) }); const ExtractTextSchema = z.object({ url: z.string().url(), remove_scripts: z.boolean().optional().default(true), remove_styles: z.boolean().optional().default(true) }); const ExtractLinksSchema = z.object({ url: z.string().url(), filter_external: z.boolean().optional().default(false), base_url: z.string().url().optional() }); const ExtractMetadataSchema = z.object({ url: z.string().url() }); const ScrapeStructuredSchema = z.object({ url: z.string().url(), selectors: z.record(z.string()) }); const SearchWebSchema = z.object({ query: z.string(), limit: z.number().min(1).max(100).optional(), offset: z.number().min(0).optional(), lang: z.string().optional(), safe_search: z.boolean().optional(), time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(), site: z.string().optional(), file_type: z.string().optional() }); const CrawlDeepSchema = z.object({ url: z.string().url(), max_depth: z.number().min(1).max(5).optional(), max_pages: z.number().min(1).max(1000).optional(), include_patterns: z.array(z.string()).optional(), exclude_patterns: z.array(z.string()).optional(), follow_external: z.boolean().optional(), respect_robots: z.boolean().optional(), extract_content: z.boolean().optional(), concurrency: z.number().min(1).max(20).optional() }); const MapSiteSchema = z.object({ url: z.string().url(), include_sitemap: z.boolean().optional(), max_urls: z.number().min(1).max(10000).optional(), group_by_path: z.boolean().optional(), include_metadata: z.boolean().optional() }); const ExtractContentSchema = z.object({ url: z.string().url(), options: z.object({}).optional() }); const ProcessDocumentSchema = z.object({ source: z.string(), sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(), options: z.object({}).optional() }); const SummarizeContentSchema = z.object({ text: z.string(), options: z.object({}).optional() }); const AnalyzeContentSchema = z.object({ text: z.string(), options: z.object({}).optional() }); // Wave 2 Advanced Tools Schemas const BatchScrapeSchema = z.object({ urls: z.array(z.union([ z.string().url(), z.object({ url: z.string().url(), selectors: z.record(z.string()).optional(), headers: z.record(z.string()).optional(), timeout: z.number().min(1000).max(30000).optional(), metadata: z.record(z.any()).optional() }) ])).min(1).max(50), formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']), mode: z.enum(['sync', 'async']).default('sync'), webhook: z.object({ url: z.string().url(), events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']), headers: z.record(z.string()).optional(), signingSecret: z.string().optional() }).optional(), extractionSchema: z.record(z.string()).optional(), maxConcurrency: z.number().min(1).max(20).default(10), delayBetweenRequests: z.number().min(0).max(10000).default(100), includeMetadata: z.boolean().default(true), includeFailed: z.boolean().default(true), pageSize: z.number().min(1).max(100).default(25), jobOptions: z.object({ priority: z.number().default(0), ttl: z.number().min(60000).default(24 * 60 * 60 * 1000), maxRetries: z.number().min(0).max(5).default(1), tags: z.array(z.string()).default([]) }).optional() }); const ScrapeWithActionsSchema = z.object({ url: z.string().url(), actions: z.array(z.object({ type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']), selector: z.string().optional(), text: z.string().optional(), key: z.string().optional(), script: z.string().optional(), timeout: z.number().optional(), description: z.string().optional(), continueOnError: z.boolean().default(false), retries: z.number().min(0).max(5).default(0) })).min(1).max(20), formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']), captureIntermediateStates: z.boolean().default(false), captureScreenshots: z.boolean().default(true), formAutoFill: z.object({ fields: z.array(z.object({ selector: z.string(), value: z.string(), type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'), waitAfter: z.number().min(0).max(5000).default(100) })), submitSelector: z.string().optional(), waitAfterSubmit: z.number().min(0).max(30000).default(2000) }).optional(), browserOptions: z.object({ headless: z.boolean().default(true), userAgent: z.string().optional(), viewportWidth: z.number().min(800).max(1920).default(1280), viewportHeight: z.number().min(600).max(1080).default(720), timeout: z.number().min(10000).max(120000).default(30000) }).optional(), extractionOptions: z.object({ selectors: z.record(z.string()).optional(), includeMetadata: z.boolean().default(true), includeLinks: z.boolean().default(true), includeImages: z.boolean().default(true) }).optional(), continueOnActionError: z.boolean().default(false), maxRetries: z.number().min(0).max(3).default(1), screenshotOnError: z.boolean().default(true) }); // Deep Research Tool Schema const DeepResearchSchema = z.object({ topic: z.string().min(3).max(500), maxDepth: z.number().min(1).max(10).optional().default(5), maxUrls: z.number().min(1).max(1000).optional().default(50), timeLimit: z.number().min(30000).max(300000).optional().default(120000), researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'), sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']), credibilityThreshold: z.number().min(0).max(1).optional().default(0.3), includeRecentOnly: z.boolean().optional().default(false), enableConflictDetection: z.boolean().optional().default(true), enableSourceVerification: z.boolean().optional().default(true), enableSynthesis: z.boolean().optional().default(true), outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'), includeRawData: z.boolean().optional().default(false), includeActivityLog: z.boolean().optional().default(false), queryExpansion: z.object({ enableSynonyms: z.boolean().optional().default(true), enableSpellCheck: z.boolean().optional().default(true), enableContextual: z.boolean().optional().default(true), maxVariations: z.number().min(1).max(20).optional().default(8) }).optional(), llmConfig: z.object({ provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'), openai: z.object({ apiKey: z.string().optional(), model: z.string().optional().default('gpt-3.5-turbo'), embeddingModel: z.string().optional().default('text-embedding-ada-002') }).optional(), anthropic: z.object({ apiKey: z.string().optional(), model: z.string().optional().default('claude-3-haiku-20240307') }).optional(), enableSemanticAnalysis: z.boolean().optional().default(true), enableIntelligentSynthesis: z.boolean().optional().default(true) }).optional(), concurrency: z.number().min(1).max(20).optional().default(5), cacheResults: z.boolean().optional().default(true), webhook: z.object({ url: z.string().url(), events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']), headers: z.record(z.string()).optional() }).optional() }); // Change Tracking Tool Schema const TrackChangesSchema = z.object({ url: z.string().url(), operation: z.enum(['create_baseline', 'compare', 'monitor', 'get_history', 'get_stats']).default('compare'), content: z.string().optional(), html: z.string().optional(), trackingOptions: z.object({ granularity: z.enum(['page', 'section', 'element', 'text']).default('section'), trackText: z.boolean().default(true), trackStructure: z.boolean().default(true), trackAttributes: z.boolean().default(false), trackImages: z.boolean().default(false), trackLinks: z.boolean().default(true), ignoreWhitespace: z.boolean().default(true), ignoreCase: z.boolean().default(false), customSelectors: z.array(z.string()).optional(), excludeSelectors: z.array(z.string()).optional(), significanceThresholds: z.object({ minor: z.number().min(0).max(1).default(0.1), moderate: z.number().min(0).max(1).default(0.3), major: z.number().min(0).max(1).default(0.7) }).optional() }).optional(), monitoringOptions: z.object({ enabled: z.boolean().default(false), interval: z.number().min(60000).max(24 * 60 * 60 * 1000).default(300000), maxRetries: z.number().min(0).max(5).default(3), retryDelay: z.number().min(1000).max(60000).default(5000), notificationThreshold: z.enum(['minor', 'moderate', 'major', 'critical']).default('moderate'), enableWebhook: z.boolean().default(false), webhookUrl: z.string().url().optional(), webhookSecret: z.string().optional() }).optional(), storageOptions: z.object({ enableSnapshots: z.boolean().default(true), retainHistory: z.boolean().default(true), maxHistoryEntries: z.number().min(1).max(1000).default(100), compressionEnabled: z.boolean().default(true), deltaStorageEnabled: z.boolean().default(true) }).optional(), queryOptions: z.object({ limit: z.number().min(1).max(500).default(50), offset: z.number().min(0).default(0), startTime: z.number().optional(), endTime: z.number().optional(), includeContent: z.boolean().default(false), significanceFilter: z.enum(['all', 'minor', 'moderate', 'major', 'critical']).optional() }).optional(), notificationOptions: z.object({ webhook: z.object({ enabled: z.boolean().default(false), url: z.string().url().optional(), method: z.enum(['POST', 'PUT']).default('POST'), headers: z.record(z.string()).optional(), signingSecret: z.string().optional(), includeContent: z.boolean().default(false) }).optional(), slack: z.object({ enabled: z.boolean().default(false), webhookUrl: z.string().url().optional(), channel: z.string().optional(), username: z.string().optional() }).optional() }).optional() }); // LLMs.txt Generator Tool Schema (Phase 2.5) const GenerateLLMsTxtSchema = z.object({ url: z.string().url(), analysisOptions: z.object({ maxDepth: z.number().min(1).max(5).optional().default(3), maxPages: z.number().min(10).max(500).optional().default(100), detectAPIs: z.boolean().optional().default(true), analyzeContent: z.boolean().optional().default(true), checkSecurity: z.boolean().optional().default(true), respectRobots: z.boolean().optional().default(true) }).optional(), outputOptions: z.object({ includeDetailed: z.boolean().optional().default(true), includeAnalysis: z.boolean().optional().default(false), contactEmail: z.string().email().optional(), organizationName: z.string().optional(), customGuidelines: z.array(z.string()).optional(), customRestrictions: z.array(z.string()).optional() }).optional(), complianceLevel: z.enum(['basic', 'standard', 'strict']).optional().default('standard'), format: z.enum(['both', 'llms-txt', 'llms-full-txt']).optional().default('both') }); // Stealth Mode Tool Schema (Wave 3) const StealthModeSchema = z.object({ operation: z.enum(['configure', 'enable', 'disable', 'create_context', 'create_page', 'get_stats', 'cleanup']).default('configure'), stealthConfig: z.object({ level: z.enum(['basic', 'medium', 'advanced']).default('medium'), randomizeFingerprint: z.boolean().default(true), hideWebDriver: z.boolean().default(true), blockWebRTC: z.boolean().default(true), spoofTimezone: z.boolean().default(true), randomizeHeaders: z.boolean().default(true), useRandomUserAgent: z.boolean().default(true), simulateHumanBehavior: z.boolean().default(true), customUserAgent: z.string().optional(), customViewport: z.object({ width: z.number().min(800).max(1920), height: z.number().min(600).max(1080) }).optional(), locale: z.string().default('en-US'), timezone: z.string().optional(), webRTCPublicIP: z.string().optional(), webRTCLocalIPs: z.array(z.string()).optional(), proxyRotation: z.object({ enabled: z.boolean().default(false), proxies: z.array(z.string()).optional(), rotationInterval: z.number().default(300000) }).optional(), antiDetection: z.object({ cloudflareBypass: z.boolean().default(true), recaptchaHandling: z.boolean().default(true), hideAutomation: z.boolean().default(true), spoofMediaDevices: z.boolean().default(true), spoofBatteryAPI: z.boolean().default(true) }).optional(), fingerprinting: z.object({ canvasNoise: z.boolean().default(true), webglSpoofing: z.boolean().default(true), audioContextSpoofing: z.boolean().default(true), fontSpoofing: z.boolean().default(true), hardwareSpoofing: z.boolean().default(true) }).optional() }).optional(), contextId: z.string().optional(), urlToTest: z.string().url().optional() }); // Localization Tool Schema (Wave 3) const LocalizationSchema = z.object({ operation: z.enum(['configure_country', 'localize_search', 'localize_browser', 'generate_timezone_spoof', 'handle_geo_blocking', 'auto_detect', 'get_stats', 'get_supported_countries']).default('configure_country'), countryCode: z.string().length(2).optional(), language: z.string().optional(), timezone: z.string().optional(), currency: z.string().length(3).optional(), customHeaders: z.record(z.string()).optional(), userAgent: z.string().optional(), acceptLanguage: z.string().optional(), geoLocation: z.object({ latitude: z.number().min(-90).max(90), longitude: z.number().min(-180).max(180), accuracy: z.number().min(1).max(100).optional() }).optional(), proxySettings: z.object({ enabled: z.boolean().default(false), region: z.string().optional(), type: z.enum(['http', 'https', 'socks4', 'socks5']).default('https'), server: z.string().optional(), port: z.number().optional(), username: z.string().optional(), password: z.string().optional(), rotation: z.object({ enabled: z.boolean().default(false), interval: z.number().default(300000), strategy: z.enum(['round-robin', 'random', 'failover']).default('round-robin') }).optional(), fallback: z.object({ enabled: z.boolean().default(true), maxRetries: z.number().default(3), timeout: z.number().default(10000) }).optional() }).optional(), searchParams: z.object({ query: z.string().optional(), limit: z.number().optional(), offset: z.number().optional(), headers: z.record(z.string()).optional() }).optional(), browserOptions: z.object({ locale: z.string().optional(), timezoneId: z.string().optional(), extraHTTPHeaders: z.record(z.string()).optional(), userAgent: z.string().optional() }).optional(), content: z.string().optional(), url: z.string().url().optional(), response: z.object({ status: z.number(), body: z.string().optional(), statusText: z.string().optional() }).optional() }); // Utility function to fetch URL with error handling async function fetchWithTimeout(url, options = {}) { const { timeout = 10000, headers = {} } = options; const controller = new AbortController(); const timeoutId = setTimeout(() => controller.abort(), timeout); try { const response = await fetch(url, { signal: controller.signal, headers: { 'User-Agent': 'CrawlForge/1.0.0', ...headers } }); clearTimeout(timeoutId); return response; } catch (error) { clearTimeout(timeoutId); if (error.name === 'AbortError') { throw new Error(`Request timeout after ${timeout}ms`); } throw error; } } // Tool: fetch_url - Basic URL fetching with headers and response handling server.registerTool("fetch_url", { description: "Fetch content from a URL with optional headers and timeout", inputSchema: { url: z.string().url(), headers: z.record(z.string()).optional(), timeout: z.number().min(1000).max(30000).optional().default(10000) } }, withAuth("fetch_url", async ({ url, headers, timeout }) => { try { const response = await fetchWithTimeout(url, { timeout: timeout || 10000, headers: headers || {} }); const body = await response.text(); const responseHeaders = {}; response.headers.forEach((value, key) => { responseHeaders[key] = value; }); return { content: [{ type: "text", text: JSON.stringify({ status: response.status, statusText: response.statusText, headers: responseHeaders, body: body, contentType: response.headers.get('content-type') || 'unknown', size: body.length, url: response.url }, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Failed to fetch URL: ${error.message}` }], isError: true }; } })); // Tool: extract_text - Extract clean text content from HTML server.registerTool("extract_text", { description: "Extract clean text content from a webpage", inputSchema: { url: z.string().url(), remove_scripts: z.boolean().optional().default(true), remove_styles: z.boolean().optional().default(true) } }, withAuth("extract_text", async ({ url, remove_scripts, remove_styles }) => { try { const response = await fetchWithTimeout(url); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } const html = await response.text(); const $ = load(html); // Remove unwanted elements if (remove_scripts !== false) { $('script').remove(); } if (remove_styles !== false) { $('style').remove(); } // Remove common non-content elements $('nav, header, footer, aside, .advertisement, .ad, .sidebar').remove(); // Extract text content const text = $('body').text().replace(/\s+/g, ' ').trim(); return { content: [{ type: "text", text: JSON.stringify({ text: text, word_count: text.split(/\s+/).filter(word => word.length > 0).length, char_count: text.length, url: response.url }, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Failed to extract text: ${error.message}` }], isError: true }; } })); // Tool: extract_links - Extract all links from a webpage with optional filtering server.registerTool("extract_links", { description: "Extract all links from a webpage with optional filtering", inputSchema: { url: z.string().url(), filter_external: z.boolean().optional().default(false), base_url: z.string().url().optional() } }, withAuth("extract_links", async ({ url, filter_external, base_url }) => { try { const response = await fetchWithTimeout(url); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } const html = await response.text(); const $ = load(html); const baseUrl = base_url || new URL(url).origin; const pageUrl = new URL(url); const links = []; $('a[href]').each((_, element) => { const href = $(element).attr('href'); const text = $(element).text().trim(); if (!href) return; let absoluteUrl; let isExternal = false; try { if (href.startsWith('http://') || href.startsWith('https://')) { absoluteUrl = href; isExternal = new URL(href).origin !== pageUrl.origin; } else { absoluteUrl = new URL(href, baseUrl).toString(); isExternal = false; } // Apply filtering if (filter_external && isExternal) { return; } links.push({ href: absoluteUrl, text: text, is_external: isExternal, original_href: href }); } catch (urlError) { // Skip invalid URLs } }); // Remove duplicates const uniqueLinks = links.filter((link, index, arr) => arr.findIndex(l => l.href === link.href) === index ); return { content: [{ type: "text", text: JSON.stringify({ links: uniqueLinks, total_count: uniqueLinks.length, internal_count: uniqueLinks.filter(l => !l.is_external).length, external_count: uniqueLinks.filter(l => l.is_external).length, base_url: baseUrl }, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Failed to extract links: ${error.message}` }], isError: true }; } })); // Tool: extract_metadata - Extract page metadata server.registerTool("extract_metadata", { description: "Extract metadata from a webpage (title, description, keywords, etc.)", inputSchema: { url: z.string().url() } }, withAuth("extract_metadata", async ({ url }) => { try { const response = await fetchWithTimeout(url); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } const html = await response.text(); const $ = load(html); // Extract basic metadata const title = $('title').text().trim() || $('h1').first().text().trim(); const description = $('meta[name="description"]').attr('content') || $('meta[property="og:description"]').attr('content') || ''; const keywords = $('meta[name="keywords"]').attr('content') || ''; const canonical = $('link[rel="canonical"]').attr('href') || ''; // Extract Open Graph tags const ogTags = {}; $('meta[property^="og:"]').each((_, element) => { const property = $(element).attr('property'); const content = $(element).attr('content'); if (property && content) { ogTags[property.replace('og:', '')] = content; } }); // Extract Twitter Card tags const twitterTags = {}; $('meta[name^="twitter:"]').each((_, element) => { const name = $(element).attr('name'); const content = $(element).attr('content'); if (name && content) { twitterTags[name.replace('twitter:', '')] = content; } }); // Extract additional metadata const author = $('meta[name="author"]').attr('content') || ''; const robots = $('meta[name="robots"]').attr('content') || ''; const viewport = $('meta[name="viewport"]').attr('content') || ''; const charset = $('meta[charset]').attr('charset') || $('meta[http-equiv="Content-Type"]').attr('content') || ''; return { content: [{ type: "text", text: JSON.stringify({ title: title, description: description, keywords: keywords.split(',').map(k => k.trim()).filter(k => k), canonical_url: canonical, author: author, robots: robots, viewport: viewport, charset: charset, og_tags: ogTags, twitter_tags: twitterTags, url: response.url }, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Failed to extract metadata: ${error.message}` }], isError: true }; } })); // Tool: scrape_structured - Extract structured data using CSS selectors server.registerTool("scrape_structured", { description: "Extract structured data from a webpage using CSS selectors", inputSchema: { url: z.string().url(), selectors: z.record(z.string()) } }, withAuth("scrape_structured", async ({ url, selectors }) => { try { const response = await fetchWithTimeout(url); if (!response.ok) { throw new Error(`HTTP ${response.status}: ${response.statusText}`); } const html = await response.text(); const $ = load(html); const results = {}; for (const [fieldName, selector] of Object.entries(selectors)) { try { const elements = $(selector); if (elements.length === 0) { results[fieldName] = null; } else if (elements.length === 1) { // Single element - return text content results[fieldName] = elements.text().trim(); } else { // Multiple elements - return array of text content results[fieldName] = elements.map((_, el) => $(el).text().trim()).get(); } } catch (selectorError) { results[fieldName] = { error: `Invalid selector: ${selector}`, message: selectorError.message }; } } return { content: [{ type: "text", text: JSON.stringify({ data: results, selectors_used: selectors, elements_found: Object.keys(results).length, url: response.url }, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Failed to scrape structured data: ${error.message}` }], isError: true }; } })); // Tool: search_web - Web search with configurable providers if (searchWebTool) { const activeProvider = getActiveSearchProvider(); const providerName = activeProvider === 'google' ? 'Google Custom Search API' : activeProvider === 'duckduckgo' ? 'DuckDuckGo' : 'Auto-selected provider'; server.registerTool("search_web", { description: `Search the web using ${providerName}`, inputSchema: { query: z.string(), limit: z.number().min(1).max(100).optional(), offset: z.number().min(0).optional(), lang: z.string().optional(), safe_search: z.boolean().optional(), time_range: z.enum(['day', 'week', 'month', 'year', 'all']).optional(), site: z.string().optional(), file_type: z.string().optional() } }, withAuth("search_web", async ({ query, limit, offset, lang, safe_search, time_range, site, file_type }) => { try { if (!query) { return { content: [{ type: "text", text: "Query parameter is required" }], isError: true }; } const result = await searchWebTool.execute({ query, limit, offset, lang, safe_search, time_range, site, file_type }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Search failed: ${error.message}` }], isError: true }; } })); } else { const activeProvider = getActiveSearchProvider(); if (activeProvider === 'google') { console.error("Warning: search_web tool not configured. Set GOOGLE_API_KEY and GOOGLE_SEARCH_ENGINE_ID to enable Google search."); } else { console.error("Warning: search_web tool initialization failed. Check your SEARCH_PROVIDER configuration."); } } // Tool: crawl_deep - Deep crawl websites with BFS algorithm server.registerTool("crawl_deep", { description: "Crawl websites deeply using breadth-first search", inputSchema: { url: z.string().url(), max_depth: z.number().min(1).max(5).optional(), max_pages: z.number().min(1).max(1000).optional(), include_patterns: z.array(z.string()).optional(), exclude_patterns: z.array(z.string()).optional(), follow_external: z.boolean().optional(), respect_robots: z.boolean().optional(), extract_content: z.boolean().optional(), concurrency: z.number().min(1).max(20).optional() } }, withAuth("crawl_deep", async ({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }) => { try { if (!url) { return { content: [{ type: "text", text: "URL parameter is required" }], isError: true }; } const result = await crawlDeepTool.execute({ url, max_depth, max_pages, include_patterns, exclude_patterns, follow_external, respect_robots, extract_content, concurrency }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Crawl failed: ${error.message}` }], isError: true }; } })); // Tool: map_site - Discover and map website structure server.registerTool("map_site", { description: "Discover and map website structure", inputSchema: { url: z.string().url(), include_sitemap: z.boolean().optional(), max_urls: z.number().min(1).max(10000).optional(), group_by_path: z.boolean().optional(), include_metadata: z.boolean().optional() } }, withAuth("map_site", async ({ url, include_sitemap, max_urls, group_by_path, include_metadata }) => { try { if (!url) { return { content: [{ type: "text", text: "URL parameter is required" }], isError: true }; } const result = await mapSiteTool.execute({ url, include_sitemap, max_urls, group_by_path, include_metadata }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Site mapping failed: ${error.message}` }], isError: true }; } })); // Phase 3 Tools: Enhanced Content Processing // Tool: extract_content - Enhanced content extraction with readability detection server.registerTool("extract_content", { description: "Extract and analyze main content from web pages with enhanced readability detection", inputSchema: { url: z.string().url(), options: z.object({}).optional() } }, withAuth("extract_content", async ({ url, options }) => { try { if (!url) { return { content: [{ type: "text", text: "URL parameter is required" }], isError: true }; } const result = await extractContentTool.execute({ url, options }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Content extraction failed: ${error.message}` }], isError: true }; } })); // Tool: process_document - Multi-format document processing server.registerTool("process_document", { description: "Process documents from multiple sources and formats including PDFs and web pages", inputSchema: { source: z.string(), sourceType: z.enum(['url', 'pdf_url', 'file', 'pdf_file']).optional(), options: z.object({}).optional() } }, withAuth("process_document", async ({ source, sourceType, options }) => { try { if (!source) { return { content: [{ type: "text", text: "Source parameter is required" }], isError: true }; } const result = await processDocumentTool.execute({ source, sourceType, options }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Document processing failed: ${error.message}` }], isError: true }; } })); // Tool: summarize_content - Intelligent content summarization server.registerTool("summarize_content", { description: "Generate intelligent summaries of text content with configurable options", inputSchema: { text: z.string(), options: z.object({}).optional() } }, withAuth("summarize_content", async ({ text, options }) => { try { if (!text) { return { content: [{ type: "text", text: "Text parameter is required" }], isError: true }; } const result = await summarizeContentTool.execute({ text, options }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Content summarization failed: ${error.message}` }], isError: true }; } })); // Tool: analyze_content - Comprehensive content analysis server.registerTool("analyze_content", { description: "Perform comprehensive content analysis including language detection and topic extraction", inputSchema: { text: z.string(), options: z.object({}).optional() } }, withAuth("analyze_content", async ({ text, options }) => { try { if (!text) { return { content: [{ type: "text", text: "Text parameter is required" }], isError: true }; } const result = await analyzeContentTool.execute({ text, options }); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Content analysis failed: ${error.message}` }], isError: true }; } })); // Wave 2 Advanced Tools // Tool: batch_scrape - Process multiple URLs simultaneously with job management server.registerTool("batch_scrape", { description: "Process multiple URLs simultaneously with support for async job management and webhook notifications", inputSchema: { urls: z.array(z.union([ z.string().url(), z.object({ url: z.string().url(), selectors: z.record(z.string()).optional(), headers: z.record(z.string()).optional(), timeout: z.number().min(1000).max(30000).optional(), metadata: z.record(z.any()).optional() }) ])).min(1).max(50), formats: z.array(z.enum(['markdown', 'html', 'json', 'text'])).default(['json']), mode: z.enum(['sync', 'async']).default('sync'), webhook: z.object({ url: z.string().url(), events: z.array(z.string()).optional().default(['batch_completed', 'batch_failed']), headers: z.record(z.string()).optional(), signingSecret: z.string().optional() }).optional(), extractionSchema: z.record(z.string()).optional(), maxConcurrency: z.number().min(1).max(20).default(10), delayBetweenRequests: z.number().min(0).max(10000).default(100), includeMetadata: z.boolean().default(true), includeFailed: z.boolean().default(true), pageSize: z.number().min(1).max(100).default(25), jobOptions: z.object({ priority: z.number().default(0), ttl: z.number().min(60000).default(24 * 60 * 60 * 1000), maxRetries: z.number().min(0).max(5).default(1), tags: z.array(z.string()).default([]) }).optional() } }, withAuth("batch_scrape", async (params) => { try { const result = await batchScrapeTool.execute(params); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Batch scrape failed: ${error.message}` }], isError: true }; } })); // Tool: scrape_with_actions - Execute action chains before scraping server.registerTool("scrape_with_actions", { description: "Execute browser action chains before scraping content, with form auto-fill and intermediate state capture", inputSchema: { url: z.string().url(), actions: z.array(z.object({ type: z.enum(['wait', 'click', 'type', 'press', 'scroll', 'screenshot', 'executeJavaScript']), selector: z.string().optional(), text: z.string().optional(), key: z.string().optional(), script: z.string().optional(), timeout: z.number().optional(), description: z.string().optional(), continueOnError: z.boolean().default(false), retries: z.number().min(0).max(5).default(0) })).min(1).max(20), formats: z.array(z.enum(['markdown', 'html', 'json', 'text', 'screenshots'])).default(['json']), captureIntermediateStates: z.boolean().default(false), captureScreenshots: z.boolean().default(true), formAutoFill: z.object({ fields: z.array(z.object({ selector: z.string(), value: z.string(), type: z.enum(['text', 'select', 'checkbox', 'radio', 'file']).default('text'), waitAfter: z.number().min(0).max(5000).default(100) })), submitSelector: z.string().optional(), waitAfterSubmit: z.number().min(0).max(30000).default(2000) }).optional(), browserOptions: z.object({ headless: z.boolean().default(true), userAgent: z.string().optional(), viewportWidth: z.number().min(800).max(1920).default(1280), viewportHeight: z.number().min(600).max(1080).default(720), timeout: z.number().min(10000).max(120000).default(30000) }).optional(), extractionOptions: z.object({ selectors: z.record(z.string()).optional(), includeMetadata: z.boolean().default(true), includeLinks: z.boolean().default(true), includeImages: z.boolean().default(true) }).optional(), continueOnActionError: z.boolean().default(false), maxRetries: z.number().min(0).max(3).default(1), screenshotOnError: z.boolean().default(true) } }, withAuth("scrape_with_actions", async (params) => { try { const result = await scrapeWithActionsTool.execute(params); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Scrape with actions failed: ${error.message}` }], isError: true }; } })); // Tool: deep_research - Comprehensive multi-stage research with source verification server.registerTool("deep_research", { description: "Conduct comprehensive multi-stage research with intelligent query expansion, source verification, and conflict detection", inputSchema: { topic: z.string().min(3).max(500), maxDepth: z.number().min(1).max(10).optional().default(5), maxUrls: z.number().min(1).max(1000).optional().default(50), timeLimit: z.number().min(30000).max(300000).optional().default(120000), researchApproach: z.enum(['broad', 'focused', 'academic', 'current_events', 'comparative']).optional().default('broad'), sourceTypes: z.array(z.enum(['academic', 'news', 'government', 'commercial', 'blog', 'wiki', 'any'])).optional().default(['any']), credibilityThreshold: z.number().min(0).max(1).optional().default(0.3), includeRecentOnly: z.boolean().optional().default(false), enableConflictDetection: z.boolean().optional().default(true), enableSourceVerification: z.boolean().optional().default(true), enableSynthesis: z.boolean().optional().default(true), outputFormat: z.enum(['comprehensive', 'summary', 'citations_only', 'conflicts_focus']).optional().default('comprehensive'), includeRawData: z.boolean().optional().default(false), includeActivityLog: z.boolean().optional().default(false), queryExpansion: z.object({ enableSynonyms: z.boolean().optional().default(true), enableSpellCheck: z.boolean().optional().default(true), enableContextual: z.boolean().optional().default(true), maxVariations: z.number().min(1).max(20).optional().default(8) }).optional(), llmConfig: z.object({ provider: z.enum(['auto', 'openai', 'anthropic']).optional().default('auto'), openai: z.object({ apiKey: z.string().optional(), model: z.string().optional().default('gpt-3.5-turbo'), embeddingModel: z.string().optional().default('text-embedding-ada-002') }).optional(), anthropic: z.object({ apiKey: z.string().optional(), model: z.string().optional().default('claude-3-haiku-20240307') }).optional(), enableSemanticAnalysis: z.boolean().optional().default(true), enableIntelligentSynthesis: z.boolean().optional().default(true) }).optional(), concurrency: z.number().min(1).max(20).optional().default(5), cacheResults: z.boolean().optional().default(true), webhook: z.object({ url: z.string().url(), events: z.array(z.enum(['started', 'progress', 'completed', 'failed'])).optional().default(['completed']), headers: z.record(z.string()).optional() }).optional() } }, withAuth("deep_research", async (params) => { try { const result = await deepResearchTool.execute(params); return { content: [{ type: "text", text: JSON.stringify(result, null, 2) }] }; } catch (error) { return { content: [{ type: "text", text: `Deep research failed: ${error.message}` }], isError: true }; } })); // Tool: track_changes - Enhanced Content change tracking with baseline capture and monitoring (Phase 2.4) server.registerTool("track_changes", { description: "Enhanced content change tracking with baseline capture, comparison, scheduled monitoring, advanced comparison engine, alert system, and historical analysis", inputSchema: { url: z.string().url(), operation: z.enum([ 'create_baseline', 'compare', 'monitor', 'get_history', 'get_stats', 'create_scheduled_monitor', 'stop_scheduled_monitor', 'get_dashboard', 'export_history', 'create_alert_rule', 'generate_trend_report', 'get_monitoring_templates' ]).default('compare'), content: z.string().optional(), html: z.string().optional(), trackingOptions: z.object({ granularity: z.enum(['page', 'section', 'element', 'text']).default('section'), trackText: z.boolean().default(true), trackStructure: z.boolean().default(true), trackAttributes: z.boolean().default(fa