UNPKG

@mcpcn/mcp-pdf-reader

Version:

一个基于MCP协议的PDF阅读服务器,为AI代理提供安全的PDF文件读取和信息提取功能

425 lines (424 loc) 17.1 kB
#!/usr/bin/env node
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
import fsExtra from "fs-extra";
import { existsSync } from "node:fs";
import * as path from "node:path";
import { z } from "zod";

// pdfjs-dist is loaded lazily so that a dependency-resolution failure at
// startup does not prevent the server from answering the tool-list request.
let pdfjsLib = null;

// Zod schemas used to validate the arguments of the `read_pdf` tool.
const PdfSourceSchema = z.object({
    path: z.string().min(1, "PDF路径不能为空"),
    pages: z.array(z.number().int().positive("页码必须是正整数")).optional()
});
// The boolean flags use a plain `.default(true)`: wrapping a default in
// `.optional()` lets `undefined` pass straight through, so the default would
// never be applied.
const ReadPdfRequestSchema = z.object({
    sources: z.array(PdfSourceSchema).min(1, "至少需要指定一个PDF源"),
    include_metadata: z.boolean().default(true),
    include_page_count: z.boolean().default(true),
    include_full_text: z.boolean().default(true)
});

// Definition of the `read_pdf` tool as advertised to MCP clients.
const READ_PDF_TOOL = {
    name: "read_pdf",
    description: "安全地读取PDF文件(本地或URL),提取文本、元数据和页数信息。支持批量处理和特定页面提取。",
    inputSchema: {
        type: "object",
        properties: {
            sources: {
                type: "array",
                items: {
                    type: "object",
                    properties: {
                        path: {
                            type: "string",
                            description: "PDF文件路径(项目根目录下的相对路径)或URL(http/https)"
                        },
                        pages: {
                            type: "array",
                            items: { type: "number" },
                            description: "要提取的特定页面数组(可选),如[1,2,3]。如果未指定则处理全部页面"
                        }
                    },
                    required: ["path"],
                    additionalProperties: false
                },
                minItems: 1,
                description: "要处理的PDF源文件列表"
            },
            include_metadata: {
                type: "boolean",
                default: true,
                description: "是否包含PDF元数据(作者、标题、创建日期等),默认true"
            },
            include_page_count: {
                type: "boolean",
                default: true,
                description: "是否包含总页数统计,默认true"
            },
            include_full_text: {
                type: "boolean",
                default: true,
                description: "是否包含完整文本内容(当未指定pages时),默认true"
            }
        },
        required: ["sources"],
        additionalProperties: false
    }
};
const PDF_TOOLS = [
    READ_PDF_TOOL
];

// File descriptor of the parent's stderr; child-process output is routed here
// because stdout/stdin carry the MCP protocol stream.
const STDERR_FD = 2;

/**
 * Download a PDF over HTTP(S).
 *
 * @param {string} url - Absolute http/https URL of the document.
 * @returns {Promise<Uint8Array>} The raw response body.
 * @throws {Error} When the response status is not in the 2xx range.
 */
async function fetchPdfFromUrl(url) {
    const response = await fetch(url);
    if (!response.ok) {
        throw new Error(`HTTP error! status: ${response.status}`);
    }
    const buffer = await response.arrayBuffer();
    return new Uint8Array(buffer);
}

/**
 * Decide whether a local file path may be read.
 *
 * Absolute paths must end in `.pdf` (case-insensitive) and must not contain
 * any of the listed shell/traversal character sequences. Relative paths are
 * resolved against the current working directory and must stay inside it.
 *
 * @param {string} filePath - Path supplied by the caller.
 * @returns {boolean} `true` when the path is acceptable.
 */
function isPathSafe(filePath) {
    if (path.isAbsolute(filePath)) {
        if (path.extname(filePath).toLowerCase() !== '.pdf') {
            return false;
        }
        const dangerousPatterns = ['..', '~/', '$', '`', ';', '|', '&'];
        return !dangerousPatterns.some((pattern) => filePath.includes(pattern));
    }
    // A plain string-prefix test would accept sibling directories such as
    // "<cwd>-other/file.pdf"; compare via the relative path instead.
    const workspaceRoot = process.cwd();
    const relativeToRoot = path.relative(workspaceRoot, path.resolve(filePath));
    const escapesRoot = relativeToRoot === '..'
        || relativeToRoot.startsWith(`..${path.sep}`)
        // On Windows a path on another drive yields an absolute "relative" path.
        || path.isAbsolute(relativeToRoot);
    return !escapesRoot;
}

/**
 * Run `npm install` in the current working directory when
 * `<cwd>/node_modules/pdfjs-dist` does not exist.
 *
 * NOTE(review): this looks in process.cwd(), not in the directory this package
 * is installed in — confirm that is the intended location.
 *
 * @returns {Promise<void>}
 * @throws {Error} When `npm install` cannot be started or exits non-zero.
 */
async function checkAndInstallDependencies() {
    const { spawn } = await import('node:child_process');
    const pdfJsPath = path.join(process.cwd(), 'node_modules', 'pdfjs-dist');
    if (existsSync(pdfJsPath)) {
        return;
    }
    console.error('🔧 检测到 pdfjs-dist 未安装,正在自动安装依赖...');
    try {
        await new Promise((resolve, reject) => {
            // stdin is ignored and stdout is redirected to stderr: both are
            // owned by the MCP stdio transport and must not be touched by npm.
            const installProcess = spawn('npm', ['install'], {
                stdio: ['ignore', STDERR_FD, STDERR_FD],
                shell: true
            });
            installProcess.on('close', (code) => {
                if (code === 0) {
                    console.error('✅ 依赖安装完成!');
                    resolve(code);
                }
                else {
                    reject(new Error(`npm install 失败,退出代码: ${code}`));
                }
            });
            installProcess.on('error', reject);
        });
    }
    catch (error) {
        throw new Error(`自动安装依赖失败: ${error instanceof Error ? error.message : String(error)}\n\n请手动执行以下命令:\ncd ${process.cwd()}\nnpm install`);
    }
}

/**
 * Load pdfjs-dist once and cache the module in `pdfjsLib`.
 *
 * Several entry points are tried in order. If all of them fail, a dependency
 * install is attempted and every entry point is tried once more.
 *
 * @returns {Promise<object>} The loaded pdfjs-dist module namespace.
 * @throws {Error} When Node.js is older than 18, when the install step fails,
 *   or when no entry point can be imported.
 */
async function initializePdfLib() {
    if (pdfjsLib) {
        return pdfjsLib;
    }
    // process.version looks like "v18.17.0"; strip the leading "v".
    const nodeVersion = process.version;
    const majorVersion = Number.parseInt(nodeVersion.split('.')[0].substring(1), 10);
    if (majorVersion < 18) {
        throw new Error(`不支持的 Node.js 版本 ${nodeVersion},需要版本 >= 18.0.0`);
    }
    const loadingStrategies = [
        {
            name: 'legacy/build/pdf.mjs',
            fn: () => import('pdfjs-dist/legacy/build/pdf.mjs')
        },
        {
            name: 'build/pdf.mjs',
            fn: () => import('pdfjs-dist/build/pdf.mjs')
        },
        {
            name: 'pdfjs-dist',
            fn: () => import('pdfjs-dist')
        }
    ];
    const errors = [];
    for (const strategy of loadingStrategies) {
        try {
            console.error(`尝试加载 pdfjs-dist (${strategy.name})...`);
            pdfjsLib = await strategy.fn();
            console.error(`成功加载 pdfjs-dist (${strategy.name})`);
            return pdfjsLib;
        }
        catch (error) {
            const errorMsg = `${strategy.name}: ${error instanceof Error ? error.message : String(error)}`;
            errors.push(errorMsg);
            console.error(`加载 ${strategy.name} 失败: ${errorMsg}`);
        }
    }
    // Every entry point failed: try installing dependencies, then retry.
    console.error('所有加载策略都失败,检查依赖安装...');
    try {
        await checkAndInstallDependencies();
        for (const strategy of loadingStrategies) {
            try {
                console.error(`重新尝试加载 pdfjs-dist (${strategy.name})...`);
                pdfjsLib = await strategy.fn();
                console.error(`成功加载 pdfjs-dist (${strategy.name})`);
                return pdfjsLib;
            }
            catch {
                console.error(`重试 ${strategy.name} 仍然失败`);
            }
        }
    }
    catch (installError) {
        throw new Error(`PDF阅读服务初始化失败:\n` +
            `Node.js版本: ${nodeVersion}\n` +
            `工作目录: ${process.cwd()}\n` +
            `加载失败详情: ${errors.join(', ')}\n` +
            `安装错误: ${installError instanceof Error ? installError.message : String(installError)}\n\n` +
            `💡 解决方案:\n` +
            `1. 确保 Node.js 版本 >= 18 (当前: ${nodeVersion})\n` +
            `2. 手动执行: cd "${process.cwd()}" && npm install\n` +
            `3. 检查是否有权限问题\n` +
            `4. 重新启动服务`);
    }
    throw new Error(`无法加载 pdfjs-dist 库\n` +
        `Node.js版本: ${nodeVersion}\n` +
        `尝试的加载策略: ${loadingStrategies.map(s => s.name).join(', ')}\n` +
        `错误详情: ${errors.join(', ')}\n\n` +
        `请执行: cd "${process.cwd()}" && npm install`);
}

/**
 * Read the bytes of one PDF source, either from an http/https URL or from a
 * local file that passes `isPathSafe`.
 *
 * @param {{ path: string }} source - The PDF source descriptor.
 * @returns {Promise<Uint8Array>} The document bytes.
 * @throws {Error} When the path is rejected, missing, or unreadable.
 */
async function loadPdfData(source) {
    if (source.path.startsWith('http://') || source.path.startsWith('https://')) {
        return fetchPdfFromUrl(source.path);
    }
    if (!isPathSafe(source.path)) {
        if (path.isAbsolute(source.path)) {
            throw new Error(`绝对路径不安全或不是PDF文件: ${source.path}\n请确保路径是PDF文件且不包含危险字符`);
        }
        throw new Error(`相对路径不安全,只能访问项目根目录下的文件: ${source.path}`);
    }
    if (!existsSync(source.path)) {
        throw new Error(`文件不存在: ${source.path}\n请检查路径是否正确!`);
    }
    try {
        const buffer = await fsExtra.readFile(source.path);
        return new Uint8Array(buffer);
    }
    catch (readError) {
        throw new Error(`无法读取文件: ${source.path}\n错误: ${readError instanceof Error ? readError.message : String(readError)}`);
    }
}

/**
 * Convert the `metadata` member returned by `pdf.getMetadata()` into a plain
 * object.
 *
 * `getAll()` is used when the object provides it; otherwise the value is
 * round-tripped through JSON as before.
 * NOTE(review): on pdf.js builds that keep metadata entries in private fields
 * the JSON round-trip presumably yields `{}` — confirm against the installed
 * pdfjs-dist version.
 *
 * @param {object|null|undefined} metadata - Metadata object from pdf.js.
 * @returns {object} A JSON-serializable object (empty when absent).
 */
function serializeXmpMetadata(metadata) {
    if (!metadata) {
        return {};
    }
    if (typeof metadata.getAll === 'function') {
        return metadata.getAll() ?? {};
    }
    return JSON.parse(JSON.stringify(metadata));
}

/**
 * Extract the text of one page as a single whitespace-normalized string.
 *
 * @param {object} pdf - Loaded pdf.js document.
 * @param {number} pageNum - 1-based page number.
 * @returns {Promise<string>} The page text.
 */
async function extractPageText(pdf, pageNum) {
    const page = await pdf.getPage(pageNum);
    const textContent = await page.getTextContent();
    return textContent.items
        // Items without a `str` property contribute no text.
        .map((item) => 'str' in item ? item.str : '')
        .join(' ')
        .replace(/\s+/g, ' ') // collapse runs of whitespace
        .trim();
}

/**
 * Process one PDF source and build its result entry.
 *
 * Never rejects: failures are logged to stderr and reported as
 * `{ source, success: false, error }`.
 *
 * @param {{ path: string, pages?: number[] }} source - What to read.
 * @param {{ include_metadata: boolean, include_page_count: boolean, include_full_text: boolean }} options
 * @param {object} pdfLib - The loaded pdfjs-dist module.
 * @returns {Promise<object>} `{ source, success: true, data }` on success.
 */
async function processSinglePdf(source, options, pdfLib) {
    let loadingTask = null;
    try {
        const pdfData = await loadPdfData(source);
        loadingTask = pdfLib.getDocument({
            data: pdfData,
            standardFontDataUrl: null // standard fonts are not needed for text extraction
        });
        const pdf = await loadingTask.promise;
        const result = {
            source: source.path,
            success: true,
            data: {}
        };
        const { include_metadata, include_page_count, include_full_text } = options;
        const numPages = pdf.numPages;
        if (include_page_count) {
            result.data.num_pages = numPages;
        }
        if (include_metadata) {
            try {
                const metadata = await pdf.getMetadata();
                result.data.info = metadata.info || {};
                result.data.metadata = serializeXmpMetadata(metadata.metadata);
            }
            catch {
                // A metadata failure must not fail the whole document.
                result.data.info = {};
                result.data.metadata = {};
            }
        }
        // Explicit page list wins; otherwise all pages when full text is
        // requested; otherwise nothing is extracted.
        const shouldExtractSpecificPages = Boolean(source.pages?.length);
        let pagesToProcess = [];
        if (shouldExtractSpecificPages) {
            pagesToProcess = source.pages;
        }
        else if (include_full_text) {
            pagesToProcess = Array.from({ length: numPages }, (_, i) => i + 1);
        }
        const pageTexts = [];
        for (const pageNum of pagesToProcess) {
            if (pageNum < 1 || pageNum > numPages) {
                console.error(`跳过无效页面: ${pageNum} (总页数: ${numPages})`);
                continue;
            }
            try {
                pageTexts.push({ page: pageNum, text: await extractPageText(pdf, pageNum) });
            }
            catch (error) {
                console.error(`提取第${pageNum}页文本失败:`, error);
                pageTexts.push({ page: pageNum, text: '' });
            }
        }
        // Per-page output for an explicit page list, joined text otherwise.
        if (shouldExtractSpecificPages) {
            result.data.page_texts = pageTexts;
        }
        else if (include_full_text && pageTexts.length > 0) {
            result.data.full_text = pageTexts.map(p => p.text).join('\n\n').trim();
        }
        return result;
    }
    catch (error) {
        console.error(`处理PDF失败 (${source.path}):`, error);
        return {
            source: source.path,
            success: false,
            error: error instanceof Error ? error.message : String(error)
        };
    }
    finally {
        // Release the document and its worker resources; the server is
        // long-lived, so undisposed documents would accumulate.
        if (loadingTask) {
            try {
                await loadingTask.destroy();
            }
            catch (destroyError) {
                console.error(`释放PDF资源失败 (${source.path}):`, destroyError);
            }
        }
    }
}

/**
 * Handle a validated `read_pdf` request.
 *
 * @param {{ sources: Array<{ path: string, pages?: number[] }>, include_metadata?: boolean, include_page_count?: boolean, include_full_text?: boolean }} request
 * @returns {Promise<{ content: Array<{ type: string, text: string }>, isError: boolean }>}
 *   A tool result whose text is the JSON-encoded `{ results }`; `isError` is
 *   true when any source failed or the PDF library could not be initialized.
 */
async function handleReadPDF(request) {
    try {
        console.error('开始初始化PDF库...');
        const pdfLib = await initializePdfLib();
        console.error('PDF库初始化成功');
        const { sources, include_metadata = true, include_page_count = true, include_full_text = true } = request;
        console.error(`开始处理 ${sources.length} 个PDF文件...`);
        // Sources are independent, so process them concurrently.
        const results = await Promise.all(sources.map((source, index) => {
            console.error(`处理PDF文件 ${index + 1}/${sources.length}: ${source.path}`);
            return processSinglePdf(source, {
                include_metadata,
                include_page_count,
                include_full_text
            }, pdfLib);
        }));
        const successCount = results.filter(r => r.success).length;
        const failCount = results.length - successCount;
        console.error(`PDF处理完成: 成功 ${successCount}, 失败 ${failCount}`);
        return {
            content: [{
                    type: "text",
                    text: JSON.stringify({ results }, null, 2)
                }],
            isError: results.some(r => !r.success)
        };
    }
    catch (error) {
        console.error('PDF读取服务发生错误:', error);
        // Tailor the message to the most likely causes of an init failure.
        let errorMessage = `PDF读取服务错误: ${error instanceof Error ? error.message : String(error)}`;
        if (error instanceof Error) {
            if (error.message.includes('pdfjs-dist')) {
                errorMessage = `无法加载PDF解析库: ${error.message}\n\n这通常是依赖安装问题,请尝试:\n1. cd "${process.cwd()}"\n2. npm install\n3. 重新启动服务`;
            }
            else if (error.message.includes('Node.js')) {
                errorMessage = `Node.js版本问题: ${error.message}`;
            }
        }
        return {
            content: [{
                    type: "text",
                    text: errorMessage
                }],
            isError: true
        };
    }
}

// Server setup.
const server = new Server({
    name: "@mcpcn/pdf-reader-mcp",
    version: "1.1.1",
}, {
    capabilities: {
        tools: {},
    },
});

// Request handlers.
server.setRequestHandler(ListToolsRequestSchema, async () => ({
    tools: PDF_TOOLS,
}));
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    try {
        switch (request.params.name) {
            case "read_pdf": {
                // Validate the tool arguments before doing any work.
                const validationResult = ReadPdfRequestSchema.safeParse(request.params.arguments);
                if (!validationResult.success) {
                    return {
                        content: [{
                                type: "text",
                                text: `输入参数验证失败: ${validationResult.error.issues.map(e => `${e.path.join('.')}: ${e.message}`).join(', ')}`
                            }],
                        isError: true
                    };
                }
                return await handleReadPDF(validationResult.data);
            }
            default:
                return {
                    content: [{
                            type: "text",
                            text: `未知工具: ${request.params.name}`
                        }],
                    isError: true
                };
        }
    }
    catch (error) {
        return {
            content: [{
                    type: "text",
                    text: `服务器错误: ${error instanceof Error ? error.message : String(error)}`
                }],
            isError: true
        };
    }
});

/**
 * Connect the server to a stdio transport.
 *
 * @returns {Promise<void>}
 */
async function runServer() {
    const transport = new StdioServerTransport();
    await server.connect(transport);
}
runServer().catch((error) => {
    console.error("Fatal error running server:", error);
    process.exit(1);
});