@mcpcn/mcp-pdf-reader
Version:
一个基于MCP协议的PDF阅读服务器,为AI代理提供安全的PDF文件读取和信息提取功能
425 lines (424 loc) • 17.1 kB
JavaScript
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
import fsExtra from "fs-extra";
import { existsSync } from "node:fs";
import * as path from "node:path";
import { z } from "zod";
// Lazily loaded pdfjs-dist module. It starts as null so that a dependency
// resolution failure at server startup does not prevent the tool list from
// being served; it is populated on first use.
let pdfjsLib = null;
// Zod schemas used to validate the arguments of the `read_pdf` tool at runtime.

// A single PDF source: a non-empty path/URL plus an optional list of
// positive-integer page numbers.
const PdfSourceSchema = z.object({
  path: z.string().min(1, "PDF路径不能为空"),
  pages: z.array(z.number().int().positive("页码必须是正整数")).optional(),
});

// The full request. The boolean flags use `.default(true)` on its own: a field
// with a default already accepts a missing value, whereas wrapping it in an
// outer `.optional()` lets `undefined` pass through untouched in Zod 3 (the
// default is never applied), which makes the parsed shape depend on the Zod
// major version in use.
const ReadPdfRequestSchema = z.object({
  sources: z.array(PdfSourceSchema).min(1, "至少需要指定一个PDF源"),
  include_metadata: z.boolean().default(true),
  include_page_count: z.boolean().default(true),
  include_full_text: z.boolean().default(true),
});
// Definition of the `read_pdf` tool as advertised to MCP clients.
// The JSON Schema below is what clients see; it is kept in agreement with the
// Zod schema that validates the same arguments at call time.
const READ_PDF_TOOL = {
  name: "read_pdf",
  description: "安全地读取PDF文件(本地或URL),提取文本、元数据和页数信息。支持批量处理和特定页面提取。",
  inputSchema: {
    type: "object",
    properties: {
      sources: {
        type: "array",
        items: {
          type: "object",
          properties: {
            path: {
              type: "string",
              description: "PDF文件路径(项目根目录下的相对路径)或URL(http/https)",
            },
            pages: {
              type: "array",
              items: {
                // The runtime validator only accepts positive integers, so
                // advertise exactly that instead of a bare "number".
                type: "integer",
                minimum: 1,
              },
              description: "要提取的特定页面数组(可选),如[1,2,3]。如果未指定则处理全部页面",
            },
          },
          required: ["path"],
          additionalProperties: false,
        },
        minItems: 1,
        description: "要处理的PDF源文件列表",
      },
      include_metadata: {
        type: "boolean",
        default: true,
        description: "是否包含PDF元数据(作者、标题、创建日期等),默认true",
      },
      include_page_count: {
        type: "boolean",
        default: true,
        description: "是否包含总页数统计,默认true",
      },
      include_full_text: {
        type: "boolean",
        default: true,
        description: "是否包含完整文本内容(当未指定pages时),默认true",
      },
    },
    required: ["sources"],
    additionalProperties: false,
  },
};

// Every tool exposed by this server (returned by the ListTools handler).
const PDF_TOOLS = [
  READ_PDF_TOOL,
];
/**
 * Downloads a PDF over HTTP(S) and returns its raw bytes.
 *
 * @param {string} url - Address passed straight to the global `fetch`.
 * @returns {Promise<Uint8Array>} The response body as a byte array.
 * @throws {Error} When the response status is not OK (message carries the status code).
 */
async function fetchPdfFromUrl(url) {
  const res = await fetch(url);
  if (res.ok) {
    const body = await res.arrayBuffer();
    return new Uint8Array(body);
  }
  throw new Error(`HTTP error! status: ${res.status}`);
}
/**
 * Decides whether a local file path may be read by the server.
 *
 * - Absolute paths are accepted only when they end in `.pdf` (case-insensitive)
 *   and contain none of a small list of suspicious substrings. Existence is
 *   not checked here.
 * - Relative paths are accepted only when they resolve to the current working
 *   directory or something inside it.
 *
 * @param {string} filePath - Path supplied by the caller.
 * @returns {boolean} `true` when the path is considered safe to read.
 */
function isPathSafe(filePath) {
  if (path.isAbsolute(filePath)) {
    if (path.extname(filePath).toLowerCase() !== '.pdf') {
      return false;
    }
    // Reject traversal segments and shell metacharacters anywhere in the path.
    const dangerousPatterns = ['..', '~/', '$', '`', ';', '|', '&'];
    return !dangerousPatterns.some((pattern) => filePath.includes(pattern));
  }

  // Relative path: it must stay inside the workspace root. A plain string
  // prefix test is not enough here, because "/work/app-evil" starts with
  // "/work/app"; compare via the relative path from the root instead.
  const workspaceRoot = process.cwd();
  const relativeToRoot = path.relative(workspaceRoot, path.resolve(workspaceRoot, filePath));
  if (relativeToRoot === '') {
    return true; // the root itself, same answer as before
  }
  const escapesRoot = relativeToRoot === '..'
    || relativeToRoot.startsWith(`..${path.sep}`)
    || path.isAbsolute(relativeToRoot); // e.g. a different drive on Windows
  return !escapesRoot;
}
/**
 * Ensures `node_modules/pdfjs-dist` exists under the current working
 * directory, running `npm install` there when it does not.
 *
 * @returns {Promise<void>} Resolves when the dependency is present or the
 *   install finished with exit code 0.
 * @throws {Error} When `npm install` cannot be started or exits non-zero; the
 *   message includes manual recovery instructions.
 */
async function checkAndInstallDependencies() {
  const pdfJsPath = path.join(process.cwd(), 'node_modules', 'pdfjs-dist');
  if (existsSync(pdfJsPath)) {
    return;
  }

  console.error('🔧 检测到 pdfjs-dist 未安装,正在自动安装依赖...');
  try {
    const { spawn } = await import('node:child_process');
    await new Promise((resolve, reject) => {
      const installProcess = spawn('npm', ['install'], {
        // This server talks to its client over stdin/stdout. Letting npm
        // inherit them would inject npm's output into that stream and let it
        // read from the client's input, so: no stdin, and both of npm's
        // output streams go to this process's stderr (fd 2).
        stdio: ['ignore', 2, 2],
        // Run through a shell, as before (needed to resolve `npm` on Windows).
        shell: true,
      });
      installProcess.on('error', reject);
      installProcess.on('close', (code) => {
        if (code === 0) {
          console.error('✅ 依赖安装完成!');
          resolve(code);
        } else {
          reject(new Error(`npm install 失败,退出代码: ${code}`));
        }
      });
    });
  } catch (error) {
    throw new Error(`自动安装依赖失败: ${error instanceof Error ? error.message : String(error)}\n\n请手动执行以下命令:\ncd ${process.cwd()}\nnpm install`);
  }
}
/**
 * Tries each pdfjs-dist loading strategy in order and returns the first module
 * that imports successfully.
 *
 * @param {{name: string, fn: () => Promise<object>}[]} strategies - Candidates, in priority order.
 * @param {boolean} isRetry - Selects the log wording; on the first pass failures
 *   are also appended to `errors`.
 * @param {string[]} errors - Collector for first-pass failure descriptions.
 * @returns {Promise<object|null>} The loaded module, or `null` if every strategy failed.
 */
async function loadPdfJsWithStrategies(strategies, isRetry, errors) {
  for (const { name, fn } of strategies) {
    try {
      console.error(`${isRetry ? '重新尝试加载' : '尝试加载'} pdfjs-dist (${name})...`);
      const lib = await fn();
      console.error(`成功加载 pdfjs-dist (${name})`);
      return lib;
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      if (isRetry) {
        // Include the reason so a failed retry can actually be diagnosed.
        console.error(`重试 ${name} 仍然失败: ${reason}`);
      } else {
        const errorMsg = `${name}: ${reason}`;
        errors.push(errorMsg);
        console.error(`加载 ${name} 失败: ${errorMsg}`);
      }
    }
  }
  return null;
}

/**
 * Loads pdfjs-dist on first use and caches it in the module-level `pdfjsLib`.
 *
 * Order of operations: return the cached module if present; verify the Node.js
 * major version is at least 18; try several import specifiers; if all fail,
 * run the dependency check/auto-install once and try the specifiers again.
 *
 * @returns {Promise<object>} The loaded pdfjs-dist module.
 * @throws {Error} On an unsupported Node.js version, when the auto-install
 *   fails, or when the library still cannot be imported afterwards.
 */
async function initializePdfLib() {
  if (pdfjsLib) {
    return pdfjsLib;
  }

  // `process.versions.node` has no leading "v", so it parses directly.
  const nodeVersion = process.version;
  const majorVersion = Number.parseInt(process.versions.node, 10);
  if (majorVersion < 18) {
    throw new Error(`不支持的 Node.js 版本 ${nodeVersion},需要版本 >= 18.0.0`);
  }

  const loadingStrategies = [
    {
      name: 'legacy/build/pdf.mjs',
      fn: () => import('pdfjs-dist/legacy/build/pdf.mjs'),
    },
    {
      name: 'build/pdf.mjs',
      fn: () => import('pdfjs-dist/build/pdf.mjs'),
    },
    {
      name: 'pdfjs-dist',
      fn: () => import('pdfjs-dist'),
    },
  ];
  const errors = [];

  const firstAttempt = await loadPdfJsWithStrategies(loadingStrategies, false, errors);
  if (firstAttempt) {
    pdfjsLib = firstAttempt;
    return pdfjsLib;
  }

  // Every strategy failed: make sure the dependency is installed, then retry.
  console.error('所有加载策略都失败,检查依赖安装...');
  try {
    await checkAndInstallDependencies();
  } catch (installError) {
    throw new Error(`PDF阅读服务初始化失败:\n` +
      `Node.js版本: ${nodeVersion}\n` +
      `工作目录: ${process.cwd()}\n` +
      `加载失败详情: ${errors.join(', ')}\n` +
      `安装错误: ${installError instanceof Error ? installError.message : String(installError)}\n\n` +
      `💡 解决方案:\n` +
      `1. 确保 Node.js 版本 >= 18 (当前: ${nodeVersion})\n` +
      `2. 手动执行: cd "${process.cwd()}" && npm install\n` +
      `3. 检查是否有权限问题\n` +
      `4. 重新启动服务`, { cause: installError });
  }

  const secondAttempt = await loadPdfJsWithStrategies(loadingStrategies, true, errors);
  if (secondAttempt) {
    pdfjsLib = secondAttempt;
    return pdfjsLib;
  }

  throw new Error(`无法加载 pdfjs-dist 库\n` +
    `Node.js版本: ${nodeVersion}\n` +
    `尝试的加载策略: ${loadingStrategies.map((s) => s.name).join(', ')}\n` +
    `错误详情: ${errors.join(', ')}\n\n` +
    `请执行: cd "${process.cwd()}" && npm install`);
}
/**
 * Obtains the raw bytes of one PDF source.
 *
 * @param {{path: string}} source - `path` is an http(s) URL or a local file path.
 * @returns {Promise<Uint8Array>} The PDF bytes.
 * @throws {Error} When a local path is rejected by `isPathSafe`, does not
 *   exist, or cannot be read, or when the download fails.
 */
async function loadPdfBytes(source) {
  if (source.path.startsWith('http://') || source.path.startsWith('https://')) {
    return fetchPdfFromUrl(source.path);
  }
  if (!isPathSafe(source.path)) {
    if (path.isAbsolute(source.path)) {
      throw new Error(`绝对路径不安全或不是PDF文件: ${source.path}\n请确保路径是PDF文件且不包含危险字符`);
    }
    throw new Error(`相对路径不安全,只能访问项目根目录下的文件: ${source.path}`);
  }
  if (!existsSync(source.path)) {
    throw new Error(`文件不存在: ${source.path}\n请检查路径是否正确!`);
  }
  try {
    const buffer = await fsExtra.readFile(source.path);
    return new Uint8Array(buffer);
  } catch (readError) {
    throw new Error(`无法读取文件: ${source.path}\n错误: ${readError instanceof Error ? readError.message : String(readError)}`);
  }
}

/**
 * Extracts the text of one page as a single whitespace-normalised string.
 *
 * @param {object} pdf - Loaded document exposing `getPage(pageNum)`.
 * @param {number} pageNum - Page number to extract.
 * @returns {Promise<string>} Text items joined by spaces, runs of whitespace collapsed, trimmed.
 */
async function extractPageText(pdf, pageNum) {
  const page = await pdf.getPage(pageNum);
  const textContent = await page.getTextContent();
  return textContent.items
    .map((item) => ('str' in item ? item.str : ''))
    .join(' ')
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Processes a single PDF source and never rejects: failures are reported in
 * the returned object.
 *
 * @param {{path: string, pages?: number[]}} source - What to read, and
 *   optionally which pages to extract.
 * @param {{include_metadata: boolean, include_page_count: boolean, include_full_text: boolean}} options
 *   - Which parts to include in the result.
 * @param {object} pdfLib - Loaded pdfjs-dist module (only `getDocument` is used).
 * @returns {Promise<object>} On success `{ source, success: true, data }`, where
 *   `data` may hold `num_pages`, `info`, `metadata`, and either `page_texts`
 *   (when pages were requested) or `full_text`. On failure
 *   `{ source, success: false, error }`.
 */
async function processSinglePdf(source, options, pdfLib) {
  try {
    const pdfData = await loadPdfBytes(source);
    const loadingTask = pdfLib.getDocument({
      data: pdfData,
      standardFontDataUrl: null, // do not load standard font data
    });
    try {
      const pdf = await loadingTask.promise;
      const result = {
        source: source.path,
        success: true,
        data: {},
      };
      const { include_metadata, include_page_count, include_full_text } = options;
      const numPages = pdf.numPages;

      if (include_page_count) {
        result.data.num_pages = numPages;
      }

      if (include_metadata) {
        try {
          const metadata = await pdf.getMetadata();
          result.data.info = metadata.info || {};
          // Round-trip through JSON to obtain a plain, serialisable object.
          result.data.metadata = metadata.metadata
            ? JSON.parse(JSON.stringify(metadata.metadata))
            : {};
        } catch (error) {
          // A metadata failure must not fail the whole document.
          result.data.info = {};
          result.data.metadata = {};
        }
      }

      // Explicitly requested pages win; otherwise every page is read when the
      // full text was asked for, and none when it was not.
      const shouldExtractSpecificPages = source.pages?.length > 0;
      let pagesToProcess = [];
      if (shouldExtractSpecificPages) {
        pagesToProcess = source.pages;
      } else if (include_full_text) {
        pagesToProcess = Array.from({ length: numPages }, (_, i) => i + 1);
      }

      const pageTexts = [];
      for (const pageNum of pagesToProcess) {
        if (pageNum < 1 || pageNum > numPages) {
          console.error(`跳过无效页面: ${pageNum} (总页数: ${numPages})`);
          continue;
        }
        try {
          pageTexts.push({ page: pageNum, text: await extractPageText(pdf, pageNum) });
        } catch (error) {
          // Keep the page in the output with empty text so callers can see
          // that it was attempted.
          console.error(`提取第${pageNum}页文本失败:`, error);
          pageTexts.push({ page: pageNum, text: '' });
        }
      }

      if (shouldExtractSpecificPages) {
        result.data.page_texts = pageTexts;
      } else if (include_full_text && pageTexts.length > 0) {
        result.data.full_text = pageTexts.map((p) => p.text).join('\n\n').trim();
      }
      return result;
    } finally {
      // Always release the document once this request is done with it;
      // otherwise every processed PDF stays alive for the life of the server.
      // A cleanup failure is logged and must not mask the outcome.
      if (typeof loadingTask.destroy === 'function') {
        try {
          await loadingTask.destroy();
        } catch (cleanupError) {
          console.error(`释放PDF资源失败 (${source.path}):`, cleanupError);
        }
      }
    }
  } catch (error) {
    console.error(`处理PDF失败 (${source.path}):`, error);
    return {
      source: source.path,
      success: false,
      error: error instanceof Error ? error.message : String(error),
    };
  }
}
/**
 * Implements the `read_pdf` tool: loads the PDF library, processes every
 * requested source concurrently and wraps the outcome in an MCP tool result.
 *
 * @param {object} request - Validated arguments: `sources` plus the optional
 *   `include_metadata`, `include_page_count` and `include_full_text` flags
 *   (each treated as `true` when undefined).
 * @returns {Promise<{content: {type: string, text: string}[], isError: boolean}>}
 *   A JSON dump of `{ results }`, flagged as an error when any source failed;
 *   or a plain-text error message when the library could not be initialised or
 *   something else threw.
 */
async function handleReadPDF(request) {
  try {
    console.error('开始初始化PDF库...');
    const pdfLib = await initializePdfLib();
    console.error('PDF库初始化成功');

    const {
      sources,
      include_metadata = true,
      include_page_count = true,
      include_full_text = true,
    } = request;
    console.error(`开始处理 ${sources.length} 个PDF文件...`);

    // Start every source before awaiting any of them so they run in parallel.
    const pending = [];
    for (const [position, source] of sources.entries()) {
      console.error(`处理PDF文件 ${position + 1}/${sources.length}: ${source.path}`);
      pending.push(processSinglePdf(source, { include_metadata, include_page_count, include_full_text }, pdfLib));
    }
    const results = await Promise.all(pending);

    const successCount = results.filter((entry) => entry.success).length;
    const failCount = results.length - successCount;
    console.error(`PDF处理完成: 成功 ${successCount}, 失败 ${failCount}`);

    const payload = JSON.stringify({ results }, null, 2);
    return {
      content: [{ type: "text", text: payload }],
      isError: failCount > 0,
    };
  } catch (error) {
    console.error('PDF读取服务发生错误:', error);

    // Pick the most specific message available for the kind of failure.
    const isErrorObject = error instanceof Error;
    let errorMessage;
    if (isErrorObject && error.message.includes('pdfjs-dist')) {
      errorMessage = `无法加载PDF解析库: ${error.message}\n\n这通常是依赖安装问题,请尝试:\n1. cd "${process.cwd()}"\n2. npm install\n3. 重新启动服务`;
    } else if (isErrorObject && error.message.includes('Node.js')) {
      errorMessage = `Node.js版本问题: ${error.message}`;
    } else {
      errorMessage = `PDF读取服务错误: ${isErrorObject ? error.message : String(error)}`;
    }

    return {
      content: [{ type: "text", text: errorMessage }],
      isError: true,
    };
  }
}
// Server setup: identity reported to clients, and the capabilities it declares
// (tools only).
const serverInfo = {
  name: "@mcpcn/pdf-reader-mcp",
  version: "1.1.1",
};
const serverOptions = {
  capabilities: {
    tools: {},
  },
};
const server = new Server(serverInfo, serverOptions);

// Request handlers.
// ListTools: reply with the static list of tools this server exposes.
async function listTools() {
  return { tools: PDF_TOOLS };
}
server.setRequestHandler(ListToolsRequestSchema, listTools);
// CallTool: dispatch on the tool name. Every outcome, including validation
// failures and unknown tools, is returned as a tool result with `isError: true`
// and a text explanation rather than thrown.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
  try {
    switch (request.params.name) {
      case "read_pdf": {
        // Validate the arguments with Zod before doing any work.
        const validationResult = ReadPdfRequestSchema.safeParse(request.params.arguments);
        if (!validationResult.success) {
          // `issues` is the canonical list on ZodError; `errors` is only an
          // alias that is not available in every Zod major version, and
          // reading it there would replace this message with a TypeError.
          const details = validationResult.error.issues
            .map((issue) => `${issue.path.join('.')}: ${issue.message}`)
            .join(', ');
          return {
            content: [{
              type: "text",
              text: `输入参数验证失败: ${details}`,
            }],
            isError: true,
          };
        }
        return await handleReadPDF(validationResult.data);
      }
      default:
        return {
          content: [{
            type: "text",
            text: `未知工具: ${request.params.name}`,
          }],
          isError: true,
        };
    }
  } catch (error) {
    // Last-resort guard so an unexpected exception still yields a tool result.
    return {
      content: [{
        type: "text",
        text: `服务器错误: ${error instanceof Error ? error.message : String(error)}`,
      }],
      isError: true,
    };
  }
});
/**
 * Connects the server to a stdio transport.
 *
 * @returns {Promise<void>} Settles once `server.connect` has completed.
 */
async function runServer() {
  await server.connect(new StdioServerTransport());
}

// Start the server; a startup failure is logged and ends the process with exit code 1.
function exitOnFatalError(error) {
  console.error("Fatal error running server:", error);
  process.exit(1);
}
runServer().catch(exitOnFatalError);