UNPKG

@mcpcn/mcp-pdf-reader

Version:

MCP PDF阅读服务器 - 提供PDF文本提取、元数据读取和页数统计功能

248 lines (247 loc) 9.28 kB
#!/usr/bin/env node
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import {
  CallToolRequestSchema,
  ListToolsRequestSchema,
} from "@modelcontextprotocol/sdk/types.js";
import fsExtra from "fs-extra";
import { existsSync } from "node:fs";
import * as path from "node:path";

// pdfjs-dist is imported lazily (on the first tool call) so that a dependency
// resolution failure cannot prevent the server from starting and listing tools.
let pdfjsLib = null;

// Definition of the single tool exposed by this server.
const READ_PDF_TOOL = {
  name: "read_pdf",
  description: "读取PDF文件内容,提取文本、元数据和页数信息",
  inputSchema: {
    type: "object",
    properties: {
      sources: {
        type: "array",
        items: {
          type: "object",
          properties: {
            path: {
              type: "string",
              description: "PDF文件路径(本地路径或URL)",
            },
            pages: {
              type: "array",
              items: { type: "number" },
              description: "要提取的特定页面(可选),如[1,2,3]",
            },
          },
          required: ["path"],
        },
        description: "要处理的PDF源文件列表",
      },
      include_metadata: {
        type: "boolean",
        description: "是否包含PDF元数据,默认true",
      },
      include_page_count: {
        type: "boolean",
        description: "是否包含页数统计,默认true",
      },
      include_full_text: {
        type: "boolean",
        description: "是否包含完整文本内容,默认true",
      },
    },
    required: ["sources"],
  },
};

const PDF_TOOLS = [READ_PDF_TOOL];

/**
 * Download a PDF over HTTP(S) and return its bytes.
 *
 * NOTE(review): the URL comes straight from the tool arguments and is fetched
 * without any host allow-list or timeout; confirm that is acceptable for the
 * environments this server runs in.
 *
 * @param {string} url - Absolute http:// or https:// URL.
 * @returns {Promise<Uint8Array>} The response body.
 * @throws {Error} When the response status is not 2xx.
 */
async function fetchPdfFromUrl(url) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`HTTP error! status: ${response.status}`);
  }
  const buffer = await response.arrayBuffer();
  return new Uint8Array(buffer);
}

/**
 * Check that a local path resolves to the current working directory or to
 * something beneath it.
 *
 * The comparison is done on path segments via `path.relative` rather than on a
 * raw string prefix, so a sibling directory whose name merely starts with the
 * root's name (e.g. `/work/app-secrets` when the root is `/work/app`) is
 * rejected. Symbolic links are not resolved by this check.
 *
 * @param {string} filePath - Path supplied by the caller (relative or absolute).
 * @returns {boolean} True when the resolved path is inside `process.cwd()`.
 */
function isPathSafe(filePath) {
  const workspaceRoot = process.cwd();
  const relative = path.relative(workspaceRoot, path.resolve(filePath));
  if (relative === "") {
    // The path is the workspace root itself.
    return true;
  }
  const escapesRoot = relative === ".." || relative.startsWith(`..${path.sep}`);
  // On Windows a path on another drive yields an absolute "relative" path.
  return !escapesRoot && !path.isAbsolute(relative);
}

/**
 * Import pdfjs-dist on first use and cache the module in `pdfjsLib`.
 * The legacy build is tried first, then the package's default entry point.
 *
 * @returns {Promise<object>} The loaded pdfjs-dist module namespace.
 * @throws {Error} When neither build can be imported.
 */
async function loadPdfjs() {
  if (pdfjsLib) {
    return pdfjsLib;
  }
  try {
    pdfjsLib = await import("pdfjs-dist/legacy/build/pdf.mjs");
  } catch {
    try {
      pdfjsLib = await import("pdfjs-dist");
    } catch (standardError) {
      throw new Error("无法加载 pdfjs-dist,请执行 npm install 并确保 Node 版本>=18", {
        cause: standardError,
      });
    }
  }
  return pdfjsLib;
}

/**
 * Read the raw bytes of a PDF from a URL or from a local file.
 *
 * @param {string} sourcePath - http(s) URL or local file path.
 * @returns {Promise<Uint8Array>} The PDF bytes.
 * @throws {Error} When a local path is outside the working directory, the file
 *   does not exist, or the download fails.
 */
async function readPdfBytes(sourcePath) {
  if (sourcePath.startsWith("http://") || sourcePath.startsWith("https://")) {
    return fetchPdfFromUrl(sourcePath);
  }
  if (!isPathSafe(sourcePath)) {
    throw new Error("文件路径不安全,只能访问项目根目录下的文件");
  }
  if (!existsSync(sourcePath)) {
    throw new Error(`文件不存在: ${sourcePath}`);
  }
  const buffer = await fsExtra.readFile(sourcePath);
  return new Uint8Array(buffer);
}

/**
 * Extract the text of the given pages from a loaded PDF document.
 *
 * Page numbers outside 1..numPages are skipped. A page whose extraction throws
 * is still reported, with empty text plus an `error` message, so one bad page
 * does not fail the whole document.
 *
 * @param {object} pdf - Document obtained from `getDocument(...).promise`.
 * @param {Iterable<number>} pageNumbers - 1-based page numbers to extract.
 * @returns {Promise<Array<{page: number, text: string, error?: string}>>}
 */
async function extractPageTexts(pdf, pageNumbers) {
  const pageTexts = [];
  for (const pageNum of pageNumbers) {
    if (pageNum < 1 || pageNum > pdf.numPages) {
      continue;
    }
    try {
      const page = await pdf.getPage(pageNum);
      const textContent = await page.getTextContent();
      const text = textContent.items
        .map((item) => item.str || "")
        .join(" ")
        .trim();
      pageTexts.push({ page: pageNum, text });
    } catch (error) {
      pageTexts.push({
        page: pageNum,
        text: "",
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }
  return pageTexts;
}

/**
 * Process one source entry and build its successful result object.
 *
 * @param {object} pdfjs - The loaded pdfjs-dist module.
 * @param {{path: string, pages?: number[]}} source - Source entry from the request.
 * @param {{include_metadata: boolean, include_page_count: boolean, include_full_text: boolean}} flags
 * @returns {Promise<{source: string, success: true, data: object}>}
 * @throws {Error} When the source is malformed or the PDF cannot be read/parsed.
 */
async function processSource(pdfjs, source, flags) {
  if (typeof source?.path !== "string") {
    throw new Error("每个 source 必须包含字符串类型的 path");
  }
  const pdfData = await readPdfBytes(source.path);
  const loadingTask = pdfjs.getDocument({ data: pdfData });
  try {
    const pdf = await loadingTask.promise;
    const numPages = pdf.numPages;
    const result = { source: source.path, success: true, data: {} };

    if (flags.include_page_count) {
      result.data.num_pages = numPages;
    }

    if (flags.include_metadata) {
      try {
        const metadata = await pdf.getMetadata();
        result.data.info = metadata.info;
        // Round-trip through JSON to reduce the metadata object to plain data.
        result.data.metadata = metadata.metadata
          ? JSON.parse(JSON.stringify(metadata.metadata))
          : null;
      } catch (error) {
        // Metadata is best-effort: report the failure but keep extracting text.
        result.data.metadata_error =
          error instanceof Error ? error.message : String(error);
      }
    }

    // An empty (or non-array) `pages` means "no specific pages requested";
    // otherwise `pages: []` would silently produce an empty full_text.
    const requestedPages =
      Array.isArray(source.pages) && source.pages.length > 0 ? source.pages : null;
    const pagesToProcess =
      requestedPages ?? Array.from({ length: numPages }, (_, i) => i + 1);
    const pageTexts = await extractPageTexts(pdf, pagesToProcess);

    if (requestedPages) {
      // Specific pages were requested: return them individually.
      result.data.page_texts = pageTexts;
    } else if (flags.include_full_text) {
      result.data.full_text = pageTexts
        .map((p) => p.text)
        .join("\n\n")
        .trim();
    }
    return result;
  } finally {
    // Release the document's resources. Cleanup is best-effort: a failure here
    // must not replace the result or the original error.
    await loadingTask.destroy().catch(() => {});
  }
}

/**
 * Implementation of the `read_pdf` tool.
 *
 * Each source is processed independently; a failure is recorded in that
 * source's entry (`success: false`, `error`) and does not abort the others.
 *
 * @param {Array<{path: string, pages?: number[]}>} sources - PDFs to read.
 * @param {object} [options]
 * @param {boolean} [options.include_metadata=true] - Include `info`/`metadata`.
 * @param {boolean} [options.include_page_count=true] - Include `num_pages`.
 * @param {boolean} [options.include_full_text=true] - Include `full_text` when
 *   no specific pages were requested.
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError: boolean}>}
 *   MCP tool result whose text is the JSON-encoded `{ results }`.
 * @throws {Error} When `sources` is not an array or pdfjs-dist cannot be loaded.
 */
async function handleReadPDF(sources, options = {}) {
  if (!Array.isArray(sources)) {
    throw new Error("sources 必须是数组");
  }
  const pdfjs = await loadPdfjs();
  const {
    include_metadata = true,
    include_page_count = true,
    include_full_text = true,
  } = options;
  const flags = { include_metadata, include_page_count, include_full_text };

  const results = [];
  for (const source of sources) {
    try {
      results.push(await processSource(pdfjs, source, flags));
    } catch (error) {
      results.push({
        source: source?.path,
        success: false,
        error: error instanceof Error ? error.message : String(error),
      });
    }
  }

  return {
    content: [{ type: "text", text: JSON.stringify({ results }, null, 2) }],
    isError: false,
  };
}

// Server setup.
const server = new Server(
  {
    name: "mcp-pdf-reader",
    version: "1.0.0",
  },
  {
    capabilities: {
      tools: {},
    },
  },
);

// Request handlers.
server.setRequestHandler(ListToolsRequestSchema, async () => ({
  tools: PDF_TOOLS,
}));

server.setRequestHandler(CallToolRequestSchema, async (request) => {
  try {
    switch (request.params.name) {
      case "read_pdf": {
        // `arguments` is optional in a tool call; fall back to an empty object
        // so the missing `sources` is reported by handleReadPDF's validation.
        const {
          sources,
          include_metadata,
          include_page_count,
          include_full_text,
        } = request.params.arguments ?? {};
        return await handleReadPDF(sources, {
          include_metadata,
          include_page_count,
          include_full_text,
        });
      }
      default:
        return {
          content: [{ type: "text", text: `Unknown tool: ${request.params.name}` }],
          isError: true,
        };
    }
  } catch (error) {
    return {
      content: [
        {
          type: "text",
          text: `Error: ${error instanceof Error ? error.message : String(error)}`,
        },
      ],
      isError: true,
    };
  }
});

/**
 * Connect the server to a stdio transport. stdout carries the protocol, so
 * diagnostics must only ever be written to stderr.
 *
 * @returns {Promise<void>}
 */
async function runServer() {
  const transport = new StdioServerTransport();
  await server.connect(transport);
}

runServer().catch((error) => {
  console.error("Fatal error running server:", error);
  process.exit(1);
});