UNPKG

mcp-omnisearch

Version:

MCP server for integrating Omnisearch with LLMs

144 lines (143 loc) 8.21 kB
import { ErrorType, ProviderError, } from '../../../common/types.js';
import { is_valid_url, retry_with_backoff, validate_api_key, } from '../../../common/utils.js';
import { config } from '../../../config/env.js';

/** Pause before each crawl status poll, in milliseconds. */
const POLL_INTERVAL_MS = 5000;
/** Number of status polls attempted before the crawl is treated as timed out. */
const MAX_POLL_ATTEMPTS = 20;
/** Abort timeout for a single status poll request, in milliseconds. */
const STATUS_TIMEOUT_MS = 30000;
/**
 * Job states that end polling with an error instead of waiting for the
 * attempt budget to run out. 'error' is the state this provider has always
 * checked; 'failed' is the failure state listed in Firecrawl's crawl status
 * reference.
 * NOTE(review): 'cancelled' is assumed to be terminal as well - confirm
 * against the Firecrawl API reference.
 */
const TERMINAL_FAILURE_STATUSES = new Set(['error', 'failed', 'cancelled']);

/** Resolves after `ms` milliseconds. */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/** Builds the ProviderError matching a non-OK response to the crawl start request. */
function start_failure(response, provider_name) {
    switch (response.status) {
        case 400:
            return new ProviderError(ErrorType.INVALID_INPUT, 'Invalid request parameters', provider_name);
        case 401:
            return new ProviderError(ErrorType.API_ERROR, 'Invalid API key', provider_name);
        case 403:
            return new ProviderError(ErrorType.API_ERROR, 'API key does not have access to this endpoint', provider_name);
        case 429:
            return new ProviderError(ErrorType.RATE_LIMIT, 'Rate limit exceeded', provider_name);
        case 500:
            return new ProviderError(ErrorType.PROVIDER_ERROR, 'Firecrawl API internal error', provider_name);
        default:
            return new ProviderError(ErrorType.API_ERROR, `Unexpected error: ${response.statusText}`, provider_name);
    }
}

/**
 * Returns the address of a crawled page: `page.url` when present, otherwise
 * `page.metadata.sourceURL`.
 * NOTE(review): the fallback assumes Firecrawl reports the page address
 * under `metadata.sourceURL` - confirm against the API reference.
 */
function page_url(page) {
    return page.url ?? page.metadata?.sourceURL;
}

/**
 * Polls the crawl status endpoint until the job completes, reaches a
 * terminal failure state, or the attempt budget is exhausted.
 *
 * @param {string} crawl_id - Job id returned by the crawl start request.
 * @param {string} api_key - Bearer token sent with every status request.
 * @param {string} provider_name - Provider name attached to raised errors.
 * @returns {Promise<object[]>} The crawled pages; an empty array when the
 *   job completed without data or never completed within the budget.
 * @throws {ProviderError} PROVIDER_ERROR when the API reports
 *   `success: false` or a terminal failure status.
 */
async function poll_crawl(crawl_id, api_key, provider_name) {
    for (let attempt = 0; attempt < MAX_POLL_ATTEMPTS; attempt++) {
        // Wait first: the job has only just been created (or just polled).
        await sleep(POLL_INTERVAL_MS);
        const status_response = await fetch(`${config.processing.firecrawl_crawl.base_url}/${crawl_id}`, {
            method: 'GET',
            headers: {
                'Authorization': `Bearer ${api_key}`,
            },
            signal: AbortSignal.timeout(STATUS_TIMEOUT_MS),
        });
        if (!status_response.ok) {
            // Deliberate best-effort: a non-OK status poll only costs one
            // attempt; the next iteration tries again.
            continue;
        }
        const status_result = await status_response.json();
        if (!status_result.success) {
            throw new ProviderError(ErrorType.PROVIDER_ERROR, `Error checking crawl status: ${status_result.error || 'Unknown error'}`, provider_name);
        }
        if (status_result.status === 'completed') {
            // A finished job cannot gain pages later, so stop polling even
            // when it returned nothing; the caller reports the empty result.
            return Array.isArray(status_result.data) ? status_result.data : [];
        }
        if (TERMINAL_FAILURE_STATUSES.has(status_result.status)) {
            throw new ProviderError(ErrorType.PROVIDER_ERROR, `Error crawling website: ${status_result.error || 'Unknown error'}`, provider_name);
        }
        // Any other status: still processing, keep polling.
    }
    return [];
}

/**
 * Converts the crawled pages into the provider result object.
 *
 * @param {object[]} pages - Non-empty list of page objects from the crawl.
 * @param {string} extract_depth - Echoed back in the result metadata.
 * @param {string} provider_name - Used for `source_provider` and errors.
 * @returns {object} `{ content, raw_contents, metadata, source_provider }`.
 * @throws {ProviderError} PROVIDER_ERROR when no page has usable content.
 */
function build_result(pages, extract_depth, provider_name) {
    // A page is usable when it has no error and at least one content format.
    const successful_pages = pages.filter((page) => !page.error && (page.markdown || page.html || page.rawHtml));
    if (successful_pages.length === 0) {
        throw new ProviderError(ErrorType.PROVIDER_ERROR, 'All crawled pages failed to extract content', provider_name);
    }
    // Markdown is requested from the API; HTML variants are only fallbacks.
    const raw_contents = successful_pages.map((page) => ({
        url: page_url(page),
        content: page.markdown || page.html || page.rawHtml || '',
    }));
    // One document: each page under a heading with its URL, separated by rules.
    const combined_content = raw_contents
        .map((result) => `# ${result.url}\n\n${result.content}\n\n---\n\n`)
        .join('\n\n');
    // Counts whitespace-separated tokens of the combined document
    // (headings and separators included).
    const word_count = combined_content
        .split(/\s+/)
        .filter(Boolean).length;
    const title = successful_pages[0]?.metadata?.title;
    const failed_urls = pages
        .filter((page) => page.error)
        .map((page) => page_url(page));
    return {
        content: combined_content,
        raw_contents,
        metadata: {
            title,
            word_count,
            failed_urls: failed_urls.length > 0 ? failed_urls : undefined,
            urls_processed: pages.length,
            successful_extractions: successful_pages.length,
            extract_depth,
        },
        source_provider: provider_name,
    };
}

/**
 * Content provider that crawls a website through the Firecrawl crawl API
 * and merges the extracted pages into a single markdown document.
 */
export class FirecrawlCrawlProvider {
    constructor() {
        this.name = 'firecrawl_crawl';
        this.description = 'Deep crawling of all accessible subpages on a website with configurable depth limits using Firecrawl. Efficiently discovers and extracts content from multiple pages within a domain. Best for comprehensive site analysis, content indexing, and data collection from entire websites.';
    }

    /**
     * Starts a crawl at `url`, polls until it finishes and returns the
     * combined content of all successfully extracted pages.
     *
     * @param {string|string[]} url - Start URL. When an array is passed only
     *   its first element is used, because a crawl has one starting point.
     * @param {string} [extract_depth='basic'] - 'advanced' requests
     *   maxDepth 3 / limit 50; any other value requests maxDepth 1 / limit 20.
     * @returns {Promise<object>} Whatever `retry_with_backoff` resolves with -
     *   presumably the `{ content, raw_contents, metadata, source_provider }`
     *   object produced by the crawl request; verify against that helper.
     * @throws {ProviderError} INVALID_INPUT when the URL fails `is_valid_url`.
     *   Errors from the crawl request itself are passed to
     *   `retry_with_backoff`; other failures (for example network errors) are
     *   wrapped as API_ERROR before that.
     */
    async process_content(url, extract_depth = 'basic') {
        // Crawl only works with a single URL (the starting point).
        const crawl_url = Array.isArray(url) ? url[0] : url;
        if (!is_valid_url(crawl_url)) {
            throw new ProviderError(ErrorType.INVALID_INPUT, `Invalid URL provided: ${crawl_url}`, this.name);
        }
        const is_advanced = extract_depth === 'advanced';
        const crawl_request = async () => {
            const api_key = validate_api_key(config.processing.firecrawl_crawl.api_key, this.name);
            try {
                // Start the crawl job.
                const crawl_response = await fetch(config.processing.firecrawl_crawl.base_url, {
                    method: 'POST',
                    headers: {
                        'Authorization': `Bearer ${api_key}`,
                        'Content-Type': 'application/json',
                    },
                    body: JSON.stringify({
                        url: crawl_url,
                        scrapeOptions: {
                            formats: ['markdown'], // Prefer markdown for LLM consumption
                            onlyMainContent: true,
                        },
                        // Wider and deeper crawl only when explicitly requested.
                        maxDepth: is_advanced ? 3 : 1,
                        limit: is_advanced ? 50 : 20,
                    }),
                    signal: AbortSignal.timeout(config.processing.firecrawl_crawl.timeout),
                });
                if (!crawl_response.ok) {
                    throw start_failure(crawl_response, this.name);
                }
                const crawl_data = await crawl_response.json();
                if (!crawl_data.success || crawl_data.error) {
                    throw new ProviderError(ErrorType.PROVIDER_ERROR, `Error starting crawl: ${crawl_data.error || 'Unknown error'}`, this.name);
                }
                // Without a job id the status URL would end in "/undefined"
                // and every poll would be wasted.
                if (!crawl_data.id) {
                    throw new ProviderError(ErrorType.PROVIDER_ERROR, 'Error starting crawl: no crawl id returned', this.name);
                }
                // Crawls are asynchronous: results always have to be polled.
                const pages = await poll_crawl(crawl_data.id, api_key, this.name);
                if (pages.length === 0) {
                    throw new ProviderError(ErrorType.PROVIDER_ERROR, 'Crawl timed out or returned no data - try again later or with a smaller scope', this.name);
                }
                return build_result(pages, extract_depth, this.name);
            }
            catch (error) {
                if (error instanceof ProviderError) {
                    throw error;
                }
                throw new ProviderError(ErrorType.API_ERROR, `Failed to crawl website: ${error instanceof Error ? error.message : 'Unknown error'}`, this.name);
            }
        };
        return retry_with_backoff(crawl_request);
    }
}