UNPKG

jina-mcp-tools

Version:

Jina AI Model Context Protocol (MCP) server that integrates with Jina AI APIs for web reading and search

360 lines (309 loc) 11.1 kB
#!/usr/bin/env node
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { z } from "zod";
import fetch from "node-fetch";

/**
 * Reads the Jina API key from the environment.
 * @returns {string|null} The value of JINA_API_KEY, or null when unset/empty.
 */
const getJinaApiKey = () => {
  return process.env.JINA_API_KEY || null;
};

/**
 * Copies the given headers and adds a Bearer Authorization header when an
 * API key is configured.
 * @param {object} [baseHeaders={}] - Headers to start from (not mutated).
 * @returns {object} A new headers object.
 */
const createHeaders = (baseHeaders = {}) => {
  const headers = { ...baseHeaders };
  const apiKey = getJinaApiKey();
  if (apiKey) {
    headers["Authorization"] = `Bearer ${apiKey}`;
  }
  return headers;
};

// Create MCP server for Jina AI tools
const server = new McpServer({
  name: "jina-mcp-tools",
  version: "1.1.3",
  description: "Jina AI tools for web reading and search"
});

/**
 * Extraction modes define HOW content is processed from websites.
 * These are independent of output format and control engine, selectors,
 * and metadata collection.
 */
const ExtractionMode = {
  /** Balanced speed and quality - uses direct engine with links summary (DEFAULT) */
  STANDARD: "standard",
  /** Maximum data extraction - uses browser engine with links + images summary */
  COMPREHENSIVE: "comprehensive",
  /** Clean content focus - removes ads, navigation, noise using CSS selectors */
  CLEAN_CONTENT: "clean_content"
};

/**
 * Output formats define HOW content is returned to the user.
 * These work with any extraction mode and control the structure and includes.
 */
const OutputFormat = {
  /** Jina API's native format - no X-Return-Format header (DEFAULT) */
  DEFAULT: "default",
  /** Structured markdown with headers and links - uses X-Return-Format: markdown */
  MARKDOWN: "markdown",
  /** Plain text only - uses X-Return-Format: text */
  TEXT: "text",
  /** Rich metadata with links and images - uses markdown + summaries */
  STRUCTURED: "structured"
};

// Hosts whose /blob/ URLs are rewritten to raw.githubusercontent.com.
const GITHUB_HOSTS = new Set(["github.com", "www.github.com"]);

// Matches the path of a blob URL: /owner/repo/blob/ref/optional/file/path
const GITHUB_BLOB_PATH = /^\/([^/]+)\/([^/]+)\/blob\/([^/]+)\/?(.*)$/;

// A full 40-character hexadecimal commit SHA.
const COMMIT_HASH = /^[a-f0-9]{40}$/i;

/**
 * Detects a GitHub file ("blob") URL and converts it to its
 * raw.githubusercontent.com equivalent so it can be fetched directly,
 * without going through the Jina reader.
 *
 * Detection is based on the parsed hostname, so URLs that merely contain
 * the text "github.com" (other hosts, paths, or query strings) are not
 * treated as GitHub files. Query strings and fragments of the blob URL are
 * not carried over to the converted URL.
 *
 * @param {string} url - The URL to check and potentially convert
 * @returns {{isGitHub: boolean, convertedUrl: string, originalUrl: string, shouldBypassJina: boolean}}
 */
const handleGitHubUrl = (url) => {
  const notGitHub = {
    isGitHub: false,
    convertedUrl: url,
    originalUrl: url,
    shouldBypassJina: false
  };

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    // Not an absolute URL - nothing to convert.
    return notGitHub;
  }

  const isHttp = parsed.protocol === "https:" || parsed.protocol === "http:";
  const isBlobUrl =
    isHttp &&
    GITHUB_HOSTS.has(parsed.hostname) &&
    parsed.pathname.includes("/blob/");
  if (!isBlobUrl) {
    return notGitHub;
  }

  // Pattern: https://github.com/owner/repo/blob/branch/path
  //            -> https://raw.githubusercontent.com/owner/repo/refs/heads/branch/path
  // Or:      https://github.com/owner/repo/blob/commit-hash/path
  //            -> https://raw.githubusercontent.com/owner/repo/commit-hash/path
  const match = parsed.pathname.match(GITHUB_BLOB_PATH);
  let convertedUrl;
  if (match) {
    const [, owner, repo, ref, path] = match;
    convertedUrl = COMMIT_HASH.test(ref)
      ? `https://raw.githubusercontent.com/${owner}/${repo}/${ref}/${path}`
      : `https://raw.githubusercontent.com/${owner}/${repo}/refs/heads/${ref}/${path}`;
  } else {
    // Fallback to simple replacement if the path has an unexpected shape
    convertedUrl = `https://raw.githubusercontent.com${parsed.pathname.replace("/blob/", "/")}`;
  }

  return {
    isGitHub: true,
    convertedUrl,
    originalUrl: url,
    shouldBypassJina: true
  };
};

/**
 * Maps extraction mode and output format combinations to Jina API headers.
 * Unknown mode/format values add no headers beyond the JSON content headers.
 *
 * @param {string} mode - ExtractionMode value
 * @param {string} format - OutputFormat value
 * @param {boolean} isGitHub - Whether this is a GitHub URL (overrides other settings)
 * @returns {object} Jina API headers object
 */
const buildJinaHeaders = (mode, format, isGitHub) => {
  const baseHeaders = {
    "Content-Type": "application/json",
    "Accept": "application/json"
  };

  // GitHub URLs get fixed settings regardless of user options
  if (isGitHub) {
    return {
      ...baseHeaders,
      "X-Engine": "direct",
      "X-Return-Format": "text",
      "X-Timeout": "10"
    };
  }

  // Apply extraction mode settings
  switch (mode) {
    case ExtractionMode.STANDARD:
      baseHeaders["X-Engine"] = "direct";
      baseHeaders["X-With-Links-Summary"] = "true";
      baseHeaders["X-Timeout"] = "10";
      break;
    case ExtractionMode.COMPREHENSIVE:
      baseHeaders["X-Engine"] = "browser";
      baseHeaders["X-With-Links-Summary"] = "true";
      baseHeaders["X-With-Images-Summary"] = "true";
      baseHeaders["X-Timeout"] = "15";
      break;
    case ExtractionMode.CLEAN_CONTENT:
      baseHeaders["X-Engine"] = "browser";
      baseHeaders["X-Target-Selector"] = "main,article,.content";
      baseHeaders["X-Remove-Selector"] = "nav,header,footer,.sidebar,.ads";
      baseHeaders["X-Timeout"] = "15";
      break;
  }

  // Apply output format settings (applied after the mode, so STRUCTURED
  // turns the summaries on even for modes that did not request them)
  switch (format) {
    case OutputFormat.DEFAULT:
      // No X-Return-Format header - uses Jina's native format
      break;
    case OutputFormat.MARKDOWN:
      baseHeaders["X-Return-Format"] = "markdown";
      break;
    case OutputFormat.TEXT:
      baseHeaders["X-Return-Format"] = "text";
      break;
    case OutputFormat.STRUCTURED:
      baseHeaders["X-Return-Format"] = "markdown";
      baseHeaders["X-With-Links-Summary"] = "true";
      baseHeaders["X-With-Images-Summary"] = "true";
      break;
  }

  return baseHeaders;
};

// Tool parameter descriptions shown to MCP clients.
const MODE_DESCRIPTION = [
  "Extraction mode - how content is processed:",
  '• "standard" - Best for technical document pages (direct engine, links)',
  '• "comprehensive" - Maximum data extraction for media-rich pages (browser engine, links + images)',
  '• "clean_content" - Remove ads, navigation, noise (CSS selectors)'
].join(" ");

const FORMAT_DESCRIPTION = [
  "Output format - how content \nis returned:",
  '• "default" - Markdown focusing on main content -- best for technical documents',
  '• "markdown" - Markdown with headers',
  '• "text" - Plain text only',
  '• "structured" - Rich metadata (links + images)'
].join(" ");

const SEARCH_DESCRIPTION =
  "Search the web. The response includes only partial contents of each web page. \n" +
  "Use jina reader for full content.";

// READER TOOL
// Returns the extracted page text. GitHub blob URLs are fetched directly from
// raw.githubusercontent.com; everything else goes through https://r.jina.ai/.
// Failures are reported as an MCP error result instead of being thrown.
server.registerTool(
  "jina_reader",
  {
    title: "Jina Web Reader",
    description: `Read and extract content from web page.`,
    inputSchema: {
      url: z.string().url().describe("URL of the webpage to read and extract content from"),
      mode: z.enum(["standard", "comprehensive", "clean_content"])
        .optional()
        .default("standard")
        .describe(MODE_DESCRIPTION),
      format: z.enum(["default", "markdown", "text", "structured"])
        .optional()
        .default("default")
        .describe(FORMAT_DESCRIPTION),
      customTimeout: z.number().optional().describe("Override timeout in seconds for slow sites")
    }
  },
  async ({ url, mode = "standard", format = "default", customTimeout }) => {
    try {
      // Handle GitHub URL detection and conversion
      const { isGitHub, convertedUrl, shouldBypassJina } = handleGitHubUrl(url);
      const actualUrl = convertedUrl;

      // For GitHub repo files, bypass Jina and fetch the raw file directly
      if (shouldBypassJina) {
        const directResponse = await fetch(actualUrl);
        if (!directResponse.ok) {
          throw new Error(`GitHub API error (${directResponse.status}): ${directResponse.statusText}`);
        }
        const content = await directResponse.text();
        return {
          content: [{ type: "text", text: content }]
        };
      }

      // Regular Jina processing for non-GitHub URLs
      const jinaHeaders = buildJinaHeaders(mode, format, isGitHub);
      if (customTimeout) {
        jinaHeaders["X-Timeout"] = customTimeout.toString();
      }
      const headers = createHeaders(jinaHeaders);

      const response = await fetch("https://r.jina.ai/", {
        method: "POST",
        headers,
        body: JSON.stringify({ url: actualUrl })
      });
      if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`Jina Reader API error (${response.status}): ${errorText}`);
      }

      const data = await response.json();
      const responseData = data.data || {};
      const resultText = responseData.content || "No content extracted";
      return {
        content: [{ type: "text", text: resultText }]
      };
    } catch (error) {
      return {
        content: [{ type: "text", text: error.message }],
        isError: true
      };
    }
  }
);

// SEARCH TOOL
// Queries https://s.jina.ai/ and returns the response body as text.
// Failures are reported as an MCP error result instead of being thrown.
server.registerTool(
  "jina_search",
  {
    title: "Jina Web Search",
    description: SEARCH_DESCRIPTION,
    inputSchema: {
      query: z.string().min(1).describe("Search query to find information on the web"),
      count: z.number().optional().default(5).describe("Number of search results to return"),
      siteFilter: z.string().optional().describe("Limit search to specific domain (e.g., 'github.com')")
    }
  },
  async ({ query, count, siteFilter }) => {
    try {
      let searchUrl = `https://s.jina.ai/?q=${encodeURIComponent(query)}`;
      // Forward the requested result count; it was previously accepted by the
      // schema but never sent to the API. Non-positive or fractional values
      // are ignored so the API default applies.
      if (Number.isInteger(count) && count > 0) {
        searchUrl += `&num=${count}`;
      }

      const baseHeaders = {
        "X-Respond-With": "no-content",
      };
      if (siteFilter) {
        // Accept both "github.com" and "https://github.com" without
        // producing a doubled scheme.
        const site = siteFilter.trim().replace(/^https?:\/\//i, "");
        if (site) {
          baseHeaders["X-Site"] = `https://${site}`;
        }
      }
      const headers = createHeaders(baseHeaders);

      const response = await fetch(searchUrl, {
        method: "GET",
        headers
      });
      if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`Jina Search API error (${response.status}): ${errorText}`);
      }

      const text = await response.text();
      return {
        content: [{ type: "text", text: text }]
      };
    } catch (error) {
      return {
        content: [{ type: "text", text: error.message }],
        isError: true
      };
    }
  }
);

/**
 * Starts the server: reports API key status on stderr (stdout is reserved
 * for the stdio transport) and connects the MCP server to stdio.
 * Exits the process with code 1 if startup fails.
 */
async function main() {
  try {
    // Check for API key (optional)
    const apiKey = getJinaApiKey();
    if (apiKey) {
      console.error(`Jina AI API key found with length ${apiKey.length}`);
      if (apiKey.length < 10) {
        console.warn("Warning: JINA_API_KEY seems too short. Please verify your API key.");
      }
    } else {
      console.error("No Jina AI API key found. Some features may be limited.");
    }

    // Connect the server to stdio transport
    const transport = new StdioServerTransport();
    await server.connect(transport);
  } catch (error) {
    console.error("Server error:", error);
    process.exit(1);
  }
}

// Execute the main function
main().catch((error) => {
  console.error("Fatal error in main():", error);
  process.exit(1);
});