UNPKG

mcp-omnisearch

Version:

MCP server for integrating Omnisearch with LLMs

1,309 lines • 98.8 kB

JavaScript

#!/usr/bin/env node import { ValibotJsonSchemaAdapter } from "@tmcp/adapter-valibot"; import { StdioTransport } from "@tmcp/transport-stdio"; import { McpServer } from "tmcp"; import * as v from "valibot"; import { randomUUID } from "node:crypto"; import { readFileSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { dirname, join } from "node:path"; import { Octokit } from "octokit"; import { fileURLToPath } from "node:url"; //#region src/config/env.ts const TAVILY_API_KEY = process.env.TAVILY_API_KEY; const BRAVE_API_KEY = process.env.BRAVE_API_KEY; const KAGI_API_KEY = process.env.KAGI_API_KEY; const GITHUB_API_KEY = process.env.GITHUB_API_KEY; const EXA_API_KEY = process.env.EXA_API_KEY; const LINKUP_API_KEY = process.env.LINKUP_API_KEY; const FIRECRAWL_API_KEY = process.env.FIRECRAWL_API_KEY; const FIRECRAWL_BASE_URL = process.env.FIRECRAWL_BASE_URL; const config = { search: { tavily: { api_key: TAVILY_API_KEY, base_url: "https://api.tavily.com", timeout: 3e4 }, brave: { api_key: BRAVE_API_KEY, base_url: "https://api.search.brave.com/res/v1", timeout: 1e4 }, kagi: { api_key: KAGI_API_KEY, base_url: "https://kagi.com/api/v0", timeout: 2e4 }, github: { api_key: GITHUB_API_KEY, base_url: "https://api.github.com", timeout: 2e4 }, exa: { api_key: EXA_API_KEY, base_url: "https://api.exa.ai", timeout: 3e4 } }, ai_response: { kagi_fastgpt: { api_key: KAGI_API_KEY, base_url: "https://kagi.com/api/v0/fastgpt", timeout: 3e4 }, exa_answer: { api_key: EXA_API_KEY, base_url: "https://api.exa.ai", timeout: 3e4 }, linkup: { api_key: LINKUP_API_KEY, base_url: "https://api.linkup.so/v1", timeout: 3e4 } }, processing: { kagi_summarizer: { api_key: KAGI_API_KEY, base_url: "https://kagi.com/api/v0/summarize", timeout: 3e4 }, tavily_extract: { api_key: TAVILY_API_KEY, base_url: "https://api.tavily.com", timeout: 3e4 }, firecrawl_scrape: { api_key: FIRECRAWL_API_KEY, base_url: FIRECRAWL_BASE_URL ? `${FIRECRAWL_BASE_URL}/v2/scrape` : "https://api.firecrawl.dev/v2/scrape", timeout: 6e4 }, firecrawl_crawl: { api_key: FIRECRAWL_API_KEY, base_url: FIRECRAWL_BASE_URL ? `${FIRECRAWL_BASE_URL}/v2/crawl` : "https://api.firecrawl.dev/v2/crawl", timeout: 12e4 }, firecrawl_map: { api_key: FIRECRAWL_API_KEY, base_url: FIRECRAWL_BASE_URL ? `${FIRECRAWL_BASE_URL}/v2/map` : "https://api.firecrawl.dev/v2/map", timeout: 6e4 }, firecrawl_extract: { api_key: FIRECRAWL_API_KEY, base_url: FIRECRAWL_BASE_URL ? `${FIRECRAWL_BASE_URL}/v2/extract` : "https://api.firecrawl.dev/v2/extract", timeout: 6e4 }, firecrawl_actions: { api_key: FIRECRAWL_API_KEY, base_url: FIRECRAWL_BASE_URL ? `${FIRECRAWL_BASE_URL}/v2/scrape` : "https://api.firecrawl.dev/v2/scrape", timeout: 9e4 }, exa_contents: { api_key: EXA_API_KEY, base_url: "https://api.exa.ai", timeout: 3e4 }, exa_similar: { api_key: EXA_API_KEY, base_url: "https://api.exa.ai", timeout: 3e4 } }, enhancement: { kagi_enrichment: { api_key: KAGI_API_KEY, base_url: "https://kagi.com/api/v0/enrich", timeout: 2e4 } } }; const remote_deployment_markers = [ "AWS_LAMBDA_FUNCTION_NAME", "CONTAINER", "DOCKER_CONTAINER", "FLY_APP_NAME", "K_SERVICE", "RENDER", "VERCEL" ]; const should_warn_for_local_file_offload = (env = process.env) => env.OMNISEARCH_LARGE_RESULT_MODE === "file" && remote_deployment_markers.some((marker) => Boolean(env[marker])); const warn_for_local_file_offload = (env = process.env, warn = console.warn) => { if (!should_warn_for_local_file_offload(env)) return; warn("Warning: OMNISEARCH_LARGE_RESULT_MODE=file returns server-side temp-file paths and is only useful for local shared-filesystem stdio clients. Use OMNISEARCH_LARGE_RESULT_MODE=inline for remote, hosted, or containerized MCP deployments."); }; const validate_config = () => { const missing_keys = []; const available_keys = []; if (!TAVILY_API_KEY) missing_keys.push("TAVILY_API_KEY"); else available_keys.push("TAVILY_API_KEY"); if (!BRAVE_API_KEY) missing_keys.push("BRAVE_API_KEY"); else available_keys.push("BRAVE_API_KEY"); if (!KAGI_API_KEY) missing_keys.push("KAGI_API_KEY"); else available_keys.push("KAGI_API_KEY"); if (!GITHUB_API_KEY) missing_keys.push("GITHUB_API_KEY"); else available_keys.push("GITHUB_API_KEY"); if (!FIRECRAWL_API_KEY) missing_keys.push("FIRECRAWL_API_KEY"); else available_keys.push("FIRECRAWL_API_KEY"); if (!EXA_API_KEY) missing_keys.push("EXA_API_KEY"); else available_keys.push("EXA_API_KEY"); if (!LINKUP_API_KEY) missing_keys.push("LINKUP_API_KEY"); else available_keys.push("LINKUP_API_KEY"); if (available_keys.length > 0) console.error(`Found API keys for: ${available_keys.join(", ")}`); else console.error("Warning: No API keys found. No providers will be available."); if (missing_keys.length > 0) console.warn(`Missing API keys for: ${missing_keys.join(", ")}. Some providers will not be available.`); warn_for_local_file_offload(); }; //#endregion //#region src/common/types.ts var ProviderError = class extends Error { constructor(type, message, provider, details) { super(message); this.type = type; this.provider = provider; this.details = details; this.name = "ProviderError"; } }; //#endregion //#region src/common/errors.ts const provider_error = (type, message, provider, details = {}) => new ProviderError(type, message, provider, details); const normalize_provider_http_error = (provider, status, message) => { switch (status) { case 400: case 422: return provider_error("INVALID_INPUT", `Invalid request: ${message}`, provider, { status, retryable: false }); case 401: case 403: return provider_error("AUTH_ERROR", status === 401 ? "Invalid API key" : "API key does not have access to this endpoint", provider, { status, retryable: false }); case 408: return provider_error("TIMEOUT", `${provider} API request timed out`, provider, { status, retryable: true }); case 429: return provider_error("RATE_LIMIT", `Rate limit exceeded for ${provider}`, provider, { status, retryable: true }); default: if (status >= 500) return provider_error("TRANSIENT_PROVIDER_ERROR", `${provider} API internal error`, provider, { status, retryable: true }); return provider_error("API_ERROR", `Unexpected error: ${message}`, provider, { status, retryable: false }); } }; function handle_provider_error(error, provider_name, operation = "operation") { if (error instanceof ProviderError) throw error; throw new ProviderError("API_ERROR", `Failed to ${operation}: ${error instanceof Error ? error.message : "Unknown error"}`, provider_name); } const sanitize_query = (query) => { return query.trim().replace(/[\n\r]+/g, " "); }; const create_error_response = (error) => { if (error instanceof ProviderError) return { error: error.message, type: error.type, provider: error.provider, retryable: error.details?.retryable ?? false }; return { error: `Unexpected error: ${error.message}`, type: "API_ERROR", retryable: false }; }; //#endregion //#region src/common/http.ts const tryParseJson = (text) => { if (!text) return void 0; try { return JSON.parse(text); } catch { return; } }; const get_error_message = (body) => { if (typeof body !== "object" || body === null) return void 0; for (const key of [ "message", "error", "detail" ]) if (key in body) { const value = body[key]; if (typeof value === "string") return value; } }; const http_json = async (provider, url, options = {}) => { const res = await fetch(url, options); const raw = await res.text(); const body = tryParseJson(raw); if (!(res.ok || options.expectedStatuses && options.expectedStatuses.includes(res.status))) { const message = get_error_message(body) || raw || res.statusText; throw normalize_provider_http_error(provider, res.status, message); } return body ?? raw; }; //#endregion //#region src/common/provider-response.ts const parse_provider_response = (provider, schema, data) => { const result = v.safeParse(schema, data); if (result.success) return result.output; throw new ProviderError("PROVIDER_ERROR", `Malformed ${provider} response: ${v.summarize(result.issues)}`, provider, { issue_count: result.issues.length }); }; //#endregion //#region src/common/retry.ts const delay = (ms) => { return new Promise((resolve) => setTimeout(resolve, ms)); }; const is_object_with_name = (error) => typeof error === "object" && error !== null && "name" in error && typeof error.name === "string"; const is_retryable_error = (error) => { if (error instanceof ProviderError) { if (typeof error.details?.retryable === "boolean") return error.details.retryable; if (error.type === "RATE_LIMIT" || error.type === "TIMEOUT" || error.type === "TRANSIENT_PROVIDER_ERROR") return true; const status = error.details?.status; return status !== void 0 && (status === 408 || status === 429 || status >= 500); } if (is_object_with_name(error)) return error.name === "AbortError" || error.name === "TimeoutError" || error.name === "TypeError"; return false; }; const normalize_retry_options = (max_retries_or_options = {}, initial_delay) => { const options = typeof max_retries_or_options === "number" ? { max_retries: max_retries_or_options, initial_delay: initial_delay ?? 1e3 } : max_retries_or_options; return { max_retries: options.max_retries ?? 3, initial_delay: options.initial_delay ?? 1e3, jitter_ratio: options.jitter_ratio ?? .2, random: options.random ?? Math.random, should_retry: options.should_retry ?? is_retryable_error }; }; const apply_jitter = (delay_time, jitter_ratio, random) => { if (jitter_ratio <= 0) return delay_time; const jitter = 1 + (random() * 2 - 1) * jitter_ratio; return Math.max(0, Math.round(delay_time * jitter)); }; const retry_with_backoff = async (fn, max_retries_or_options, initial_delay) => { const options = normalize_retry_options(max_retries_or_options, initial_delay); let retries = 0; while (true) try { return await fn(); } catch (error) { if (retries >= options.max_retries || !options.should_retry(error)) throw error; await delay(apply_jitter(options.initial_delay * Math.pow(2, retries), options.jitter_ratio, options.random)); retries++; } }; //#endregion //#region src/common/validation.ts const normalize_api_key = (raw) => { return raw.trim().replace(/^(['"])(.*)\1$/, "$2"); }; const validate_api_key = (key, provider) => { if (!key) throw new ProviderError("INVALID_INPUT", `API key not found for ${provider}`, provider); return normalize_api_key(key); }; const is_api_key_valid = (key, provider) => { if (!key || key.trim() === "") { console.warn(`API key not found or empty for ${provider}`); return false; } return true; }; const is_valid_url = (url) => { try { new URL(url); return true; } catch { return false; } }; const validate_processing_urls = (url, provider_name) => { const urls = Array.isArray(url) ? url : [url]; for (const u of urls) if (!is_valid_url(u)) throw new ProviderError("INVALID_INPUT", `Invalid URL provided: ${u}`, provider_name); return urls; }; //#endregion //#region src/providers/ai-response/exa-answer/index.ts const exa_answer_response_schema = v.object({ answer: v.string(), citations: v.optional(v.array(v.object({ id: v.string(), title: v.string(), url: v.string(), publishedDate: v.optional(v.string()), text: v.optional(v.string()), image: v.optional(v.string()), favicon: v.optional(v.string()) }))), requestId: v.string() }); var ExaAnswerProvider = class { name = "exa_answer"; description = "Get direct AI-generated answers to questions using Exa Answer API"; async search(params) { const api_key = validate_api_key(config.ai_response.exa_answer.api_key, this.name); const search_request = async () => { try { const raw_data = await http_json(this.name, `${config.ai_response.exa_answer.base_url}/answer`, { method: "POST", headers: { "x-api-key": api_key, "Content-Type": "application/json" }, body: JSON.stringify({ query: sanitize_query(params.query) }), signal: AbortSignal.timeout(config.ai_response.exa_answer.timeout) }); const data = parse_provider_response(this.name, exa_answer_response_schema, raw_data); const results = [{ title: "AI Answer", url: "", snippet: data.answer, score: 1, source_provider: this.name, metadata: { requestId: data.requestId, type: "ai_answer", citations_count: data.citations?.length || 0 } }]; if (data.citations && data.citations.length > 0) { const limit = params.limit ?? data.citations.length; const citation_results = data.citations.slice(0, limit).map((citation, index) => ({ title: citation.title, url: citation.url, snippet: citation.text || "Source reference", score: .9 - index * .01, source_provider: this.name, metadata: { id: citation.id, publishedDate: citation.publishedDate, type: "citation" } })); results.push(...citation_results); } return results; } catch (error) { handle_provider_error(error, this.name, "fetch AI response"); } }; return retry_with_backoff(search_request); } }; //#endregion //#region src/providers/ai-response/kagi-fastgpt/index.ts const kagi_fastgpt_response_schema = v.object({ meta: v.object({ id: v.string(), node: v.string(), ms: v.number() }), data: v.object({ output: v.string(), tokens: v.number(), references: v.array(v.object({ title: v.string(), snippet: v.string(), url: v.string() })) }) }); var KagiFastGPTProvider = class { name = "kagi_fastgpt"; description = "Quick AI-generated answers with citations, optimized for rapid response (900ms typical start time). Runs full search underneath for enriched answers."; async search(params) { const response = await this.get_answer(params.query); const results = []; results.push({ title: "Kagi FastGPT Response", url: "https://kagi.com/fastgpt", snippet: response.data.output, source_provider: this.name }); if (response.data.references && response.data.references.length > 0) results.push(...response.data.references.map((ref) => ({ title: ref.title, url: ref.url, snippet: ref.snippet, source_provider: this.name }))); const filtered_results = results.filter((result) => result.title && result.url && result.snippet); if (params.limit && params.limit > 0) return filtered_results.slice(0, params.limit); return filtered_results; } async get_answer(query, options = {}) { const api_key = validate_api_key(config.ai_response.kagi_fastgpt.api_key, this.name); const final_options = { cache: true, web_search: true, ...options }; try { const raw_data = await http_json(this.name, "https://kagi.com/api/v0/fastgpt", { method: "POST", headers: { "Content-Type": "application/json", Authorization: `Bot ${api_key}` }, body: JSON.stringify({ query, cache: final_options.cache, web_search: final_options.web_search }), signal: AbortSignal.timeout(config.ai_response.kagi_fastgpt.timeout) }); return parse_provider_response(this.name, kagi_fastgpt_response_schema, raw_data); } catch (error) { const error_message = error instanceof Error ? error.message : String(error); throw new Error(`Failed to get Kagi FastGPT answer: ${error_message}`); } } }; //#endregion //#region src/providers/ai-response/linkup/index.ts const linkup_sourced_answer_response_schema = v.object({ answer: v.string(), sources: v.array(v.object({ favicon: v.string(), name: v.string(), snippet: v.string(), url: v.string() })) }); var LinkupProvider = class { name = "linkup"; description = "AI-powered deep search with sourced answers via Linkup. Uses agentic search with standard depth for balanced speed and accuracy."; async search(params) { const api_key = validate_api_key(config.ai_response.linkup.api_key, this.name); const search_request = async () => { try { const request_body = { q: sanitize_query(params.query), depth: "standard", outputType: "sourcedAnswer" }; if (params.include_domains && params.include_domains.length > 0) request_body.includeDomains = params.include_domains; if (params.exclude_domains && params.exclude_domains.length > 0) request_body.excludeDomains = params.exclude_domains; if (params.limit) request_body.maxResults = params.limit; const raw_data = await http_json(this.name, `${config.ai_response.linkup.base_url}/search`, { method: "POST", headers: { Authorization: `Bearer ${api_key}`, "Content-Type": "application/json" }, body: JSON.stringify(request_body), signal: AbortSignal.timeout(config.ai_response.linkup.timeout) }); const data = parse_provider_response(this.name, linkup_sourced_answer_response_schema, raw_data); const results = [{ title: "Linkup AI Answer", url: "", snippet: data.answer, score: 1, source_provider: this.name, metadata: { type: "ai_answer", depth: "standard", sources_count: data.sources?.length || 0 } }]; if (data.sources && data.sources.length > 0) { const source_results = data.sources.map((source, index) => ({ title: source.name, url: source.url, snippet: source.snippet || "Source reference", score: .9 - index * .01, source_provider: this.name, metadata: { type: "source", favicon: source.favicon } })); results.push(...source_results); } return results; } catch (error) { handle_provider_error(error, this.name, "fetch AI response"); } }; return retry_with_backoff(search_request); } }; //#endregion //#region src/providers/enhancement/kagi-enrichment/index.ts var KagiEnrichmentSearchProvider = class { name = "kagi_enrichment"; description = "Search specialized indexes (Teclis for web, TinyGem for news). Ideal for discovering non-mainstream results and supplementary knowledge."; async search(params) { const api_key = validate_api_key(config.enhancement.kagi_enrichment.api_key, this.name); const query = sanitize_query(params.query); const limit = params.limit ?? 5; const enrich_request = async () => { try { const [webData, newsData] = await Promise.all([http_json(this.name, `https://kagi.com/api/v0/enrich/web?${new URLSearchParams({ q: query, limit: String(limit) })}`, { method: "GET", headers: { Authorization: `Bot ${api_key}`, Accept: "application/json" }, signal: AbortSignal.timeout(config.enhancement.kagi_enrichment.timeout) }), http_json(this.name, `https://kagi.com/api/v0/enrich/news?${new URLSearchParams({ q: query, limit: String(limit) })}`, { method: "GET", headers: { Authorization: `Bot ${api_key}`, Accept: "application/json" }, signal: AbortSignal.timeout(config.enhancement.kagi_enrichment.timeout) })]); if (!webData?.data || !newsData?.data) throw new ProviderError("API_ERROR", "Unexpected response: missing data from enrichment endpoints", this.name); return [...webData.data, ...newsData.data].flatMap((result) => { if (!result.title || !result.url) return []; return [{ title: result.title, url: result.url, snippet: (result.snippet ?? "").replace(/'/g, "'").replace(/"/g, "\"").replace(/&/g, "&").replace(/</g, "<").replace(/>/g, ">"), score: result.rank ? 1 / result.rank : void 0, source_provider: this.name }]; }); } catch (error) { handle_provider_error(error, this.name, "enrich content"); } }; return retry_with_backoff(enrich_request); } }; //#endregion //#region src/providers/processing/exa-contents/index.ts const exa_contents_response_schema = v.object({ results: v.array(v.object({ id: v.string(), title: v.string(), url: v.string(), text: v.optional(v.string()), highlights: v.optional(v.array(v.string())), summary: v.optional(v.string()), publishedDate: v.optional(v.string()), author: v.optional(v.string()) })), requestId: v.string() }); var ExaContentsProvider = class { name = "exa_contents"; description = "Extract full content from Exa search result IDs"; async process_content(idsOrUrls, extract_depth = "basic") { const api_key = validate_api_key(config.processing.exa_contents.api_key, this.name); const items = Array.isArray(idsOrUrls) ? idsOrUrls : [idsOrUrls]; if (items.length === 0) throw new ProviderError("INVALID_INPUT", "At least one ID must be provided", this.name); const process_request = async () => { try { const looksLikeUrl = (value) => { try { new URL(value); return true; } catch { return false; } }; const request_body = { ...items.every(looksLikeUrl) ? { urls: items } : { ids: items }, text: true, highlights: extract_depth === "advanced", summary: extract_depth === "advanced" }; const raw_data = await http_json(this.name, `${config.processing.exa_contents.base_url}/contents`, { method: "POST", headers: { "x-api-key": api_key, Authorization: `Bearer ${api_key}`, "Content-Type": "application/json" }, body: JSON.stringify(request_body) }); const data = parse_provider_response(this.name, exa_contents_response_schema, raw_data); let combined_content = ""; const raw_contents = []; let total_word_count = 0; for (const result of data.results) { const content = result.text || result.summary || "No content available"; const word_count = content.split(/\s+/).length; total_word_count += word_count; combined_content += `## ${result.title}\n\n`; if (result.author) combined_content += `**Author:** ${result.author}\n`; if (result.publishedDate) combined_content += `**Published:** ${result.publishedDate}\n`; combined_content += `**URL:** ${result.url}\n\n`; if (result.highlights && result.highlights.length > 0) { combined_content += `**Key Highlights:**\n`; for (const highlight of result.highlights) combined_content += `- ${highlight}\n`; combined_content += "\n"; } if (result.summary && result.text) { combined_content += `**Summary:** ${result.summary}\n\n`; combined_content += `**Full Content:**\n${result.text}\n\n`; } else combined_content += `${content}\n\n`; combined_content += "---\n\n"; raw_contents.push({ url: result.url, content }); } return { content: combined_content, raw_contents, metadata: { title: `Content from ${data.results.length} Exa results`, word_count: total_word_count, urls_processed: data.results.length, successful_extractions: data.results.length, extract_depth, requestId: data.requestId }, source_provider: this.name }; } catch (error) { handle_provider_error(error, this.name, "extract contents"); } }; return retry_with_backoff(process_request); } }; //#endregion //#region src/providers/processing/exa-similar/index.ts const exa_similar_response_schema = v.object({ results: v.array(v.object({ id: v.string(), title: v.string(), url: v.string(), text: v.optional(v.string()), highlights: v.optional(v.array(v.string())), summary: v.optional(v.string()), publishedDate: v.optional(v.string()), author: v.optional(v.string()), score: v.optional(v.number()) })), requestId: v.string() }); var ExaSimilarProvider = class { name = "exa_similar"; description = "Find web pages semantically similar to a given URL using Exa"; async process_content(url, extract_depth = "basic") { const api_key = validate_api_key(config.processing.exa_similar.api_key, this.name); const target_url = Array.isArray(url) ? url[0] : url; if (!target_url) throw new ProviderError("INVALID_INPUT", "A URL must be provided", this.name); validate_processing_urls(target_url, this.name); const process_request = async () => { try { const request_body = { url: target_url, numResults: extract_depth === "advanced" ? 15 : 10, contents: { text: { maxCharacters: extract_depth === "advanced" ? 3e3 : 1500 }, highlights: extract_depth === "advanced", summary: extract_depth === "advanced" } }; const raw_data = await http_json(this.name, `${config.processing.exa_similar.base_url}/findSimilar`, { method: "POST", headers: { "x-api-key": api_key, Authorization: `Bearer ${api_key}`, "Content-Type": "application/json" }, body: JSON.stringify(request_body) }); const data = parse_provider_response(this.name, exa_similar_response_schema, raw_data); let combined_content = `# Similar Pages to ${target_url}\n\n`; combined_content += `Found ${data.results.length} similar pages:\n\n`; const raw_contents = []; let total_word_count = 0; for (const result of data.results) { const content = result.text || result.summary || "No content available"; const word_count = content.split(/\s+/).length; total_word_count += word_count; combined_content += `## ${result.title}\n\n`; if (result.author) combined_content += `**Author:** ${result.author}\n`; if (result.publishedDate) combined_content += `**Published:** ${result.publishedDate}\n`; if (result.score) combined_content += `**Similarity Score:** ${result.score.toFixed(3)}\n`; combined_content += `**URL:** ${result.url}\n\n`; if (result.highlights && result.highlights.length > 0) { combined_content += `**Key Highlights:**\n`; for (const highlight of result.highlights) combined_content += `- ${highlight}\n`; combined_content += "\n"; } if (result.summary && result.text) { combined_content += `**Summary:** ${result.summary}\n\n`; combined_content += `**Content Preview:**\n${result.text.substring(0, 500)}${result.text.length > 500 ? "..." : ""}\n\n`; } else combined_content += `${content.substring(0, 500)}${content.length > 500 ? "..." : ""}\n\n`; combined_content += "---\n\n"; raw_contents.push({ url: result.url, content }); } return { content: combined_content, raw_contents, metadata: { title: `Similar pages to ${target_url}`, word_count: total_word_count, urls_processed: data.results.length, successful_extractions: data.results.length, extract_depth, original_url: target_url, requestId: data.requestId }, source_provider: this.name }; } catch (error) { handle_provider_error(error, this.name, "find similar pages"); } }; return retry_with_backoff(process_request); } }; //#endregion //#region src/common/firecrawl-utils.ts const make_firecrawl_request = async (provider_name, base_url, api_key, body, timeout, schema) => { return parse_provider_response(provider_name, schema, await http_json(provider_name, base_url, { method: "POST", headers: { Authorization: `Bearer ${api_key}`, "Content-Type": "application/json" }, body: JSON.stringify(body), signal: AbortSignal.timeout(timeout) })); }; const validate_firecrawl_response = (data, provider_name, error_message) => { if (!data.success || data.error) throw new ProviderError("PROVIDER_ERROR", `${error_message}: ${data.error || "Unknown error"}`, provider_name); }; const poll_firecrawl_job = async (config, schema) => { let attempts = 0; while (attempts < config.max_attempts) { attempts++; await new Promise((resolve) => setTimeout(resolve, config.poll_interval)); let status_result; try { const raw_status_result = await http_json(config.provider_name, config.status_url, { method: "GET", headers: { Authorization: `Bearer ${config.api_key}` }, signal: AbortSignal.timeout(config.timeout) }); status_result = parse_provider_response(config.provider_name, schema, raw_status_result); } catch (error) { if (error instanceof ProviderError && error.details?.retryable === false) throw error; continue; } if (status_result.success === false) throw new ProviderError("PROVIDER_ERROR", `Error checking job status: ${status_result.error || "Unknown error"}`, config.provider_name); if (status_result.status === "completed" && status_result.data) return status_result; if (status_result.status === "error" || status_result.status === "failed" || status_result.status === "cancelled") throw new ProviderError("PROVIDER_ERROR", `Job failed: ${status_result.error || "Unknown error"}`, config.provider_name); } throw new ProviderError("TIMEOUT", "Job timed out - try again later or with a smaller scope", config.provider_name, { retryable: true }); }; //#endregion //#region src/providers/processing/firecrawl-actions/index.ts const firecrawl_metadata_schema$2 = v.record(v.string(), v.unknown()); const firecrawl_actions_response_schema = v.object({ success: v.boolean(), data: v.optional(v.object({ markdown: v.optional(v.string()), html: v.optional(v.nullable(v.string())), rawHtml: v.optional(v.nullable(v.string())), screenshot: v.optional(v.nullable(v.string())), actions: v.optional(v.object({ screenshots: v.optional(v.array(v.string())) })), metadata: v.optional(firecrawl_metadata_schema$2) })), error: v.optional(v.string()) }); var FirecrawlActionsProvider = class { name = "firecrawl_actions"; description = "Support for page interactions (clicking, scrolling, etc.) before extraction for dynamic content using Firecrawl. Enables extraction from JavaScript-heavy sites, single-page applications, and content behind user interactions. Best for accessing content that requires navigation, form filling, or other interactions."; async process_content(url, extract_depth = "basic") { const actions_url = validate_processing_urls(url, this.name)[0]; const actions_request = async () => { const api_key = validate_api_key(config.processing.firecrawl_actions.api_key, this.name); try { const actions = extract_depth === "advanced" ? [ { type: "wait", milliseconds: 2e3 }, { type: "scroll", direction: "down" }, { type: "wait", milliseconds: 1e3 }, { type: "scroll", direction: "down" }, { type: "wait", milliseconds: 1e3 }, { type: "click", selector: "button:contains(\"Read more\"), button:contains(\"Show more\"), a:contains(\"Read more\"), a:contains(\"Show more\")" }, { type: "wait", milliseconds: 2e3 } ] : [ { type: "wait", milliseconds: 2e3 }, { type: "scroll", direction: "down" }, { type: "wait", milliseconds: 1e3 } ]; const actions_data = await make_firecrawl_request(this.name, config.processing.firecrawl_actions.base_url, api_key, { url: actions_url, formats: ["markdown", "screenshot"], actions }, config.processing.firecrawl_actions.timeout, firecrawl_actions_response_schema); validate_firecrawl_response(actions_data, this.name, "Error performing actions"); if (!actions_data.data) throw new ProviderError("PROVIDER_ERROR", "No data returned from API", this.name); if (!actions_data.data.markdown && !actions_data.data.html && !actions_data.data.rawHtml) throw new ProviderError("PROVIDER_ERROR", "No content extracted after performing actions", this.name); const content = actions_data.data.markdown || actions_data.data.html || actions_data.data.rawHtml || ""; const actions_description = `# Content from ${actions_url} after interactions\n\nThe following actions were performed before extraction:\n\n` + actions.map((action, index) => { switch (action.type) { case "click": return `${index + 1}. Click on ${action.selector || `coordinates (${action.x}, ${action.y})`}`; case "write": return `${index + 1}. Write "${action.text}" ${action.selector ? `into ${action.selector}` : ""}`; case "scroll": return `${index + 1}. Scroll ${action.direction || "down"}`; case "wait": return `${index + 1}. Wait ${action.milliseconds ? `for ${action.milliseconds}ms` : ""}`; case "executeJavascript": return `${index + 1}. Execute JavaScript`; case "screenshot": return `${index + 1}. Take screenshot`; default: return `${index + 1}. Perform ${String(action.type)} action`; } }).join("\n") + "\n\n---\n\n" + content; const raw_contents = [{ url: actions_url, content: actions_description }]; const word_count = actions_description.split(/\s+/).filter(Boolean).length; return { content: actions_description, raw_contents, metadata: { title: `Content from ${actions_url} after interactions`, word_count, urls_processed: 1, successful_extractions: 1, extract_depth, screenshot: actions_data.data.screenshot }, source_provider: this.name }; } catch (error) { handle_provider_error(error, this.name, "perform actions"); } }; return retry_with_backoff(actions_request); } }; //#endregion //#region src/providers/processing/firecrawl-crawl/index.ts const firecrawl_metadata_schema$1 = v.record(v.string(), v.unknown()); const firecrawl_crawl_response_schema = v.object({ success: v.boolean(), id: v.string(), url: v.string(), error: v.optional(v.string()) }); const firecrawl_crawl_status_response_schema = v.object({ status: v.string(), total: v.optional(v.number()), completed: v.optional(v.number()), data: v.optional(v.array(v.object({ url: v.optional(v.string()), markdown: v.optional(v.string()), html: v.optional(v.nullable(v.string())), rawHtml: v.optional(v.nullable(v.string())), metadata: v.optional(firecrawl_metadata_schema$1), error: v.optional(v.nullable(v.string())) }))), error: v.optional(v.string()) }); const get_firecrawl_page_url = (page, fallback_url) => { const metadata_url = page.metadata?.sourceURL ?? page.metadata?.url; return page.url ?? (typeof metadata_url === "string" ? metadata_url : fallback_url); }; const get_firecrawl_page_error = (page) => { const metadata_error = page.metadata?.error; return page.error ?? (typeof metadata_error === "string" ? metadata_error : void 0); }; var FirecrawlCrawlProvider = class { name = "firecrawl_crawl"; description = "Deep crawling of all accessible subpages on a website with configurable depth limits using Firecrawl. Efficiently discovers and extracts content from multiple pages within a domain. Best for comprehensive site analysis, content indexing, and data collection from entire websites."; async process_content(url, extract_depth = "basic") { const crawl_url = validate_processing_urls(url, this.name)[0]; const crawl_request = async () => { const api_key = validate_api_key(config.processing.firecrawl_crawl.api_key, this.name); try { const crawl_data = await make_firecrawl_request(this.name, config.processing.firecrawl_crawl.base_url, api_key, { url: crawl_url, scrapeOptions: { formats: ["markdown"], onlyMainContent: true }, maxDiscoveryDepth: extract_depth === "advanced" ? 3 : 1, limit: extract_depth === "advanced" ? 50 : 20 }, config.processing.firecrawl_crawl.timeout, firecrawl_crawl_response_schema); validate_firecrawl_response(crawl_data, this.name, "Error starting crawl"); const status_data = await poll_firecrawl_job({ provider_name: this.name, status_url: `${config.processing.firecrawl_crawl.base_url}/${crawl_data.id}`, api_key, max_attempts: 20, poll_interval: 5e3, timeout: 3e4 }, firecrawl_crawl_status_response_schema); if (!status_data.data || status_data.data.length === 0) throw new ProviderError("PROVIDER_ERROR", "Crawl returned no data", this.name); const successful_pages = status_data.data.filter((page) => !get_firecrawl_page_error(page) && (page.markdown || page.html || page.rawHtml)); if (successful_pages.length === 0) throw new ProviderError("PROVIDER_ERROR", "All crawled pages failed to extract content", this.name); const raw_contents = successful_pages.map((page) => ({ url: get_firecrawl_page_url(page, crawl_url), content: page.markdown || page.html || page.rawHtml || "" })); const combined_content = raw_contents.map((result) => `# ${result.url}\n\n${result.content}\n\n---\n\n`).join("\n\n"); const word_count = combined_content.split(/\s+/).filter(Boolean).length; const title_value = successful_pages[0]?.metadata?.title; const title = typeof title_value === "string" ? title_value : void 0; const failed_urls = status_data.data.filter((page) => get_firecrawl_page_error(page)).map((page) => get_firecrawl_page_url(page, crawl_url)); return { content: combined_content, raw_contents, metadata: { title, word_count, failed_urls: failed_urls.length > 0 ? failed_urls : void 0, urls_processed: status_data.data.length, successful_extractions: successful_pages.length, extract_depth }, source_provider: this.name }; } catch (error) { handle_provider_error(error, this.name, "crawl website"); } }; return retry_with_backoff(crawl_request); } }; //#endregion //#region src/providers/processing/firecrawl-extract/index.ts const firecrawl_extract_response_schema = v.object({ success: v.boolean(), id: v.string(), error: v.optional(v.string()) }); const firecrawl_extract_status_response_schema = v.object({ success: v.boolean(), id: v.string(), status: v.string(), data: v.optional(v.record(v.string(), v.unknown())), error: v.optional(v.string()) }); var FirecrawlExtractProvider = class { name = "firecrawl_extract"; description = "Structured data extraction with AI using natural language prompts via Firecrawl. Extracts specific information from web pages based on custom extraction instructions. Best for targeted data collection, information extraction, and converting unstructured web content into structured data."; async process_content(url, extract_depth = "basic") { const extract_url = validate_processing_urls(url, this.name)[0]; const extract_request = async () => { const api_key = validate_api_key(config.processing.firecrawl_extract.api_key, this.name); try { const extraction_prompt = extract_depth === "advanced" ? "Extract all relevant information from this page including: title, author, date published, main content, categories or tags, related links, and any structured data like product information, pricing, or specifications. Format the data in a well-structured way." : "Extract the main content, title, and author from this page. Summarize the key information."; const extract_data = await make_firecrawl_request(this.name, config.processing.firecrawl_extract.base_url, api_key, { urls: [extract_url], prompt: extraction_prompt, showSources: true, scrapeOptions: { formats: ["markdown"], onlyMainContent: true, waitFor: extract_depth === "advanced" ? 5e3 : 2e3 } }, config.processing.firecrawl_extract.timeout, firecrawl_extract_response_schema); validate_firecrawl_response(extract_data, this.name, "Error starting extraction"); const status_data = await poll_firecrawl_job({ provider_name: this.name, status_url: `${config.processing.firecrawl_extract.base_url}/${extract_data.id}`, api_key, max_attempts: 15, poll_interval: 3e3, timeout: 3e4 }, firecrawl_extract_status_response_schema); if (!status_data.data) throw new ProviderError("PROVIDER_ERROR", "No data extracted from URL", this.name); let formatted_content = `# Extracted Data from ${extract_url}\n\n`; for (const [key, value] of Object.entries(status_data.data)) if (typeof value === "string") formatted_content += `## ${key.charAt(0).toUpperCase() + key.slice(1)}\n\n${value}\n\n`; else if (Array.isArray(value)) { formatted_content += `## ${key.charAt(0).toUpperCase() + key.slice(1)}\n\n`; value.forEach((item, index) => { if (typeof item === "object") { formatted_content += `### Item ${index + 1}\n\n`; for (const [itemKey, itemValue] of Object.entries(item)) formatted_content += `- **${itemKey}**: ${String(itemValue)}\n`; formatted_content += "\n"; } else formatted_content += `- ${item}\n`; }); formatted_content += "\n"; } else if (typeof value === "object" && value !== null) { formatted_content += `## ${key.charAt(0).toUpperCase() + key.slice(1)}\n\n`; for (const [subKey, subValue] of Object.entries(value)) formatted_content += `- **${subKey}**: ${subValue}\n`; formatted_content += "\n"; } const raw_contents = [{ url: extract_url, content: formatted_content }]; const title = typeof status_data.data.title === "string" ? status_data.data.title : `Extracted Data from ${extract_url}`; const word_count = formatted_content.split(/\s+/).filter(Boolean).length; return { content: formatted_content, raw_contents, metadata: { title, word_count, urls_processed: 1, successful_extractions: 1, extract_depth }, source_provider: this.name }; } catch (error) { handle_provider_error(error, this.name, "extract data"); } }; return retry_with_backoff(extract_request); } }; //#endregion //#region src/providers/processing/firecrawl-map/index.ts const firecrawl_map_response_schema = v.object({ success: v.boolean(), links: v.optional(v.array(v.object({ url: v.string(), title: v.optional(v.string()) }))), error: v.optional(v.string()) }); var FirecrawlMapProvider = class { name = "firecrawl_map"; description = "Fast URL collection from websites for comprehensive site mapping using Firecrawl. Efficiently discovers all accessible URLs within a domain without extracting content. Best for site auditing, URL discovery, and preparing for targeted content extraction."; async process_content(url, extract_depth = "basic") { const map_url = validate_processing_urls(url, this.name)[0]; const map_request = async () => { const api_key = validate_api_key(config.processing.firecrawl_map.api_key, this.name); try { const map_data = await make_firecrawl_request(this.name, config.processing.firecrawl_map.base_url, api_key, { url: map_url, limit: extract_depth === "advanced" ? 200 : 50, includeSubdomains: false }, config.processing.firecrawl_map.timeout, firecrawl_map_response_schema); validate_firecrawl_response(map_data, this.name, "Error mapping website"); if (!map_data.links || map_data.links.length === 0) throw new ProviderError("PROVIDER_ERROR", "No URLs discovered during mapping", this.name); const formatted_content = `# Site Map for ${map_url}\n\nFound ${map_data.links.length} URLs:\n\n` + map_data.links.map((link) => link.title ? `- ${link.url} — ${link.title}` : `- ${link.url}`).join("\n"); return { content: formatted_content, raw_contents: [{ url: map_url, content: formatted_content }], metadata: { title: `Site Map for ${map_url}`, word_count: map_data.links.length, urls_processed: 1, successful_extractions: 1, extract_depth }, source_provider: this.name }; } catch (error) { handle_provider_error(error, this.name, "map website"); } }; return retry_with_backoff(map_request); } }; //#endregion //#region src/common/results.ts const CHARS_PER_TOKEN = 4; const MAX_SAFE_CHARS = 2e4 * CHARS_PER_TOKEN; const get_large_result_mode = (mode_override) => { if (mode_override) return mode_override; const configured_mode = process.env.OMNISEARCH_LARGE_RESULT_MODE; if (configured_mode === "inline" || configured_mode === "file") return configured_mode; return "file"; }; const format_as_text = (result) => { const lines = []; const sections = []; let current_line = 1; const add_line = (line) => { if (line.startsWith("URL: ")) sections.push({ title: line, line: current_line }); else if (line.startsWith("# ")) sections.push({ title: line.slice(2), line: current_line }); else if (line.startsWith("## ")) sections.push({ title: line.slice(3), line: current_line }); else if (line.startsWith("### ")) sections.push({ title: line.slice(4), line: current_line }); lines.push(line); current_line++; }; const add_content = (content) => { const content_lines = content.split("\n"); for (const line of content_lines) add_line(line); }; const raw_contents = result.raw_contents; if (raw_contents?.length) for (const item of raw_contents) { add_line("=".repeat(80)); add_line(`URL: ${item.url}`); add_line("=".repeat(80)); if (item.content) add_content(item.content); add_line(""); } else if (result.content) add_content(JSON.stringify(result.content)); else add_content(JSON.stringify(result, null, 2)); if (result.metadata) { add_line(""); add_line("=".repeat(80)); sections.push({ title: "METADATA", line: current_line }); add_line("METADATA"); add_line("=".repeat(80)); add_content(JSON.stringify(result.metadata, null, 2)); } return { text: lines.join("\n"), sections, total_lines: current_line - 1 }; }; const handle_large_result = (result, provider_name, options = {}) => { const char_count = JSON.stringify(result, null, 2).length; if (char_count <= MAX_SAFE_CHARS) return result; if (get_large_result_mode(options.mode) === "inline") return result; const file_id = randomUUID(); const file_path = join(tmpdir(), `mcp-${provider_name}-${file_id}.txt`); const { text, sections, total_lines } = format_as_text(result); writeFileSync(file_path, text, "utf-8"); const result_obj = result; const metadata = result_obj.metadata; const word_count = metadata?.word_count ?? "unknown"; const urls_processed = metadata?.urls_processed ?? "unknown"; return { file_path, total_lines, estimated_tokens: Math.round(char_count / CHARS_PER_TOKEN), sections, read_hint: `Use Read tool with file_path="${file_path}" and offset=LINE_NUMBER limit=50 to read a section`, metadata: { word_count, urls_processed, source_provider: result_obj.source_provider } }; }; const omit_raw_contents = (result) => { const { raw_contents: _raw_contents, ...compact_result } = result; return compact_result; }; const aggregate_url_results = (results, provider_name, urls, extract_depth) => { const successful_results = results.filter((r) => r.success); const failed_urls = results.filter((r) => !r.success).map((r) => r.url); if (successful_results.length === 0) throw new ProviderError("PROVIDER_ERROR", "Failed to extract content from all URLs", provider_name); const raw_contents = successful_results.map((result) => ({ url: result.url, content: result.content })); const combined_content = raw_contents.map((result) => result.content).join("\n\n"); const word_count = combined_content.split(/\s+/).filter(Boolean).length; return { content: combined_content, raw_contents, metadata: { title: successful_results[0]?.metadata?.title, word_count, failed_urls: failed_urls.length > 0 ? failed_urls : void 0, urls_processed: urls.length, successful_extractions: successful_results.length, extract_depth }, source_provider: provider_name }; }; //#endregion //#region src/providers/processing/firecrawl-scrape/index.ts const firecrawl_metadata_schema = v.record(v.string(), v.unknown()); const firecrawl_scrape_response_schema = v.object({ success: v.boolean(), data: v.optional(v.object({ markdown: v.optional(v.string()), html: v.optional(v.nullable(v.string())), rawHtml: v.optional(v.nullable(v.string())), screenshot: v.optional(v.nullable(v.string())), links: v.optional(v.array(v.string())), metadata: v.optional(firecrawl_metadata_schema), llm_extraction: v.optional(v.unknown()), warning: v.optional(v.string()) })), error: v.optional(v.string()) }); var FirecrawlScrapeProvider = class { name = "firecrawl_scrape"; description = "Extract clean, LLM-ready data from single URLs with enhanced formatting options using Firecrawl. Efficiently converts web content into markdown, plain text, or structured data with configurable extraction options. Best for content analysis, data collection, and AI training data preparation."; async process_content(url, extract_depth = "basic") { const urls = validate_processing_urls(url, this.name); const scrape_request = async () => { const api_key = validate_api_key(config.processing.firecrawl_scrape.api_key, this.name); try { return aggregate_url_results(await Promise.all(urls.map(async (single_url) => { try { const data = await make_firecrawl_request(this.name, config.processing.firecrawl_scrape.base_url, api_key, { url: single_url, formats: ["markdown"], onlyMainContent: true, waitFor: extract_depth === "advanced" ? 5e3 : 2e3 }, config.processing.firecrawl_scrape.timeout, firecrawl_scrape_response_schema); validate_firecrawl_response(data, this.name, "Error scraping URL"); if (!data.data) throw new ProviderError("PROVIDER_ERROR", "No data returned from API", this.name); if (!data.data.markdown && !data.data.html && !data.data.rawHtml) throw new ProviderError("PROVIDER_ERROR", "No content extracted from URL", this.name); return { url: single_url, content: data.data.markdown || data.data.html || data.data.rawHtml || "", metadata: data.data.metadata, success: true }; } catch (error) { console.error