UNPKG

mcp-omnisearch

Version:

MCP server for integrating Omnisearch with LLMs

175 lines (174 loc) 10.2 kB
import { ErrorType, ProviderError, } from '../../../common/types.js'; import { is_valid_url, retry_with_backoff, validate_api_key, } from '../../../common/utils.js'; import { config } from '../../../config/env.js'; export class FirecrawlActionsProvider { constructor() { this.name = 'firecrawl_actions'; this.description = 'Support for page interactions (clicking, scrolling, etc.) before extraction for dynamic content using Firecrawl. Enables extraction from JavaScript-heavy sites, single-page applications, and content behind user interactions. Best for accessing content that requires navigation, form filling, or other interactions.'; } async process_content(url, extract_depth = 'basic') { // Actions works with a single URL const actions_url = Array.isArray(url) ? url[0] : url; // Validate URL if (!is_valid_url(actions_url)) { throw new ProviderError(ErrorType.INVALID_INPUT, `Invalid URL provided: ${actions_url}`, this.name); } const actions_request = async () => { const api_key = validate_api_key(config.processing.firecrawl_actions.api_key, this.name); try { // Define actions based on extract_depth // For basic, we'll just scroll down once to load more content // For advanced, we'll perform more complex interactions const actions = extract_depth === 'advanced' ? [ { type: 'wait', duration: 2000 }, // Wait for initial page load { type: 'scroll', duration: 1000 }, // Scroll down { type: 'wait', duration: 1000 }, // Wait for content to load { type: 'scroll', duration: 1000 }, // Scroll down more { type: 'wait', duration: 1000 }, // Wait for content to load // Click on "Read more" or "Show more" buttons if they exist { type: 'click', selector: 'button:contains("Read more"), button:contains("Show more"), a:contains("Read more"), a:contains("Show more")' }, { type: 'wait', duration: 2000 }, // Wait for content to expand ] : [ { type: 'wait', duration: 2000 }, // Wait for initial page load { type: 'scroll', duration: 1000 }, // Scroll down once { type: 'wait', duration: 1000 }, // Wait for content to load ]; // Start the actions const actions_response = await fetch(config.processing.firecrawl_actions.base_url, { method: 'POST', headers: { 'Authorization': `Bearer ${api_key}`, 'Content-Type': 'application/json', }, body: JSON.stringify({ url: actions_url, formats: ['markdown', 'screenshot'], // Prefer markdown for LLM consumption and include screenshot actions: actions.map(action => { // Convert our action format to Firecrawl's action format switch (action.type) { case 'wait': return { type: 'wait', milliseconds: action.duration || 1000, selector: action.selector, }; case 'scroll': return { type: 'scroll', // Firecrawl might use different parameters for scroll // Adjust as needed based on their documentation }; case 'click': return { type: 'click', selector: action.selector, x: action.x, y: action.y, }; case 'type': return { type: 'type', selector: action.selector, text: action.text || '', }; case 'select': return { type: 'select', selector: action.selector, value: action.value || '', }; default: return action; } }), }), signal: AbortSignal.timeout(config.processing.firecrawl_actions.timeout), }); if (!actions_response.ok) { // Handle error responses based on status codes switch (actions_response.status) { case 400: throw new ProviderError(ErrorType.INVALID_INPUT, 'Invalid request parameters', this.name); case 401: throw new ProviderError(ErrorType.API_ERROR, 'Invalid API key', this.name); case 403: throw new ProviderError(ErrorType.API_ERROR, 'API key does not have access to this endpoint', this.name); case 429: throw new ProviderError(ErrorType.RATE_LIMIT, 'Rate limit exceeded', this.name); case 500: throw new ProviderError(ErrorType.PROVIDER_ERROR, 'Firecrawl API internal error', this.name); default: throw new ProviderError(ErrorType.API_ERROR, `Unexpected error: ${actions_response.statusText}`, this.name); } } const actions_data = (await actions_response.json()); // Check if there was an error in the response if (!actions_data.success || actions_data.error) { throw new ProviderError(ErrorType.PROVIDER_ERROR, `Error performing actions: ${actions_data.error || 'Unknown error'}`, this.name); } // Check if we have data if (!actions_data.data) { throw new ProviderError(ErrorType.PROVIDER_ERROR, 'No data returned from API', this.name); } // Check if we have content if (!actions_data.data.markdown && !actions_data.data.html && !actions_data.data.rawHtml) { throw new ProviderError(ErrorType.PROVIDER_ERROR, 'No content extracted after performing actions', this.name); } // Prefer markdown, fallback to HTML, then rawHtml const content = actions_data.data.markdown || actions_data.data.html || actions_data.data.rawHtml || ''; // Add information about the actions performed const actions_description = `# Content from ${actions_url} after interactions\n\n` + `The following actions were performed before extraction:\n\n` + actions.map((action, index) => { switch (action.type) { case 'click': return `${index + 1}. Click on ${action.selector || `coordinates (${action.x}, ${action.y})`}`; case 'type': return `${index + 1}. Type "${action.text}" ${action.selector ? `into ${action.selector}` : ''}`; case 'scroll': return `${index + 1}. Scroll ${action.duration ? `for ${action.duration}ms` : ''}`; case 'wait': return `${index + 1}. Wait ${action.duration ? `for ${action.duration}ms` : ''}`; case 'select': return `${index + 1}. Select "${action.value}" from ${action.selector}`; default: return `${index + 1}. Perform ${action.type} action`; } }).join('\n') + '\n\n---\n\n' + content; // Create a single raw_content entry const raw_contents = [{ url: actions_url, content: actions_description, }]; // Calculate word count const word_count = actions_description .split(/\s+/) .filter(Boolean).length; return { content: actions_description, raw_contents, metadata: { title: `Content from ${actions_url} after interactions`, word_count, urls_processed: 1, successful_extractions: 1, extract_depth, screenshot: actions_data.data.screenshot, }, source_provider: this.name, }; } catch (error) { if (error instanceof ProviderError) { throw error; } throw new ProviderError(ErrorType.API_ERROR, `Failed to perform actions: ${error instanceof Error ? error.message : 'Unknown error'}`, this.name); } }; return retry_with_backoff(actions_request); } }