@mcpcn/mcp-pdf-reader
Version:
MCP PDF阅读服务器 - 提供PDF文本提取、元数据读取和页数统计功能
248 lines (247 loc) • 9.28 kB
JavaScript
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
import { CallToolRequestSchema, ListToolsRequestSchema, } from "@modelcontextprotocol/sdk/types.js";
import fsExtra from "fs-extra";
import { existsSync } from "node:fs";
import * as path from "node:path";
// Lazily loaded pdfjs-dist module. It stays null until the first PDF read so that a
// dependency resolution failure cannot prevent the server from starting and listing its tools.
let pdfjsLib = null;
// Schema fragment describing one PDF source: a required path plus an optional page selection.
const PDF_SOURCE_SCHEMA = {
    type: "object",
    properties: {
        path: {
            type: "string",
            description: "PDF文件路径(本地路径或URL)"
        },
        pages: {
            type: "array",
            items: { type: "number" },
            description: "要提取的特定页面(可选),如[1,2,3]"
        }
    },
    required: ["path"]
};
// Builds the schema entry for one optional boolean flag of the tool.
const booleanFlag = (description) => ({ type: "boolean", description });
// Definition of the read_pdf tool (name, description and input schema).
const READ_PDF_TOOL = {
    name: "read_pdf",
    description: "读取PDF文件内容,提取文本、元数据和页数信息",
    inputSchema: {
        type: "object",
        properties: {
            sources: {
                type: "array",
                items: PDF_SOURCE_SCHEMA,
                description: "要处理的PDF源文件列表"
            },
            include_metadata: booleanFlag("是否包含PDF元数据,默认true"),
            include_page_count: booleanFlag("是否包含页数统计,默认true"),
            include_full_text: booleanFlag("是否包含完整文本内容,默认true")
        },
        required: ["sources"]
    }
};
// Every tool this server exposes.
const PDF_TOOLS = [READ_PDF_TOOL];
/**
 * Downloads a PDF over HTTP(S) and returns its raw bytes.
 *
 * @param {string} url - Address passed straight to `fetch`.
 * @returns {Promise<Uint8Array>} The response body as a byte array.
 * @throws {Error} When the response status is not OK.
 */
async function fetchPdfFromUrl(url) {
    const res = await fetch(url);
    if (res.ok) {
        const body = await res.arrayBuffer();
        return new Uint8Array(body);
    }
    throw new Error(`HTTP error! status: ${res.status}`);
}
/**
 * Checks that a file path stays inside the workspace root (the current working directory).
 *
 * Containment is decided on path segments via `path.relative`, not on a raw string
 * prefix, so a sibling directory that merely shares a prefix with the root
 * (e.g. `/srv/app-secret` when the root is `/srv/app`) is rejected.
 * Symbolic links are not resolved; the check is purely lexical.
 *
 * @param {string} filePath - Absolute path, or path relative to the working directory.
 * @returns {boolean} True when the resolved path is the root itself or lies beneath it.
 */
function isPathSafe(filePath) {
    const workspaceRoot = process.cwd();
    const relative = path.relative(workspaceRoot, path.resolve(filePath));
    if (relative === "") {
        // The root directory itself was always accepted; keep that behavior.
        return true;
    }
    // An absolute result means the target is on a different root/drive (Windows).
    if (path.isAbsolute(relative)) {
        return false;
    }
    // Escapes the root only if the first segment is exactly "..";
    // a file merely named "..foo" inside the root is still fine.
    return relative !== ".." && !relative.startsWith(`..${path.sep}`);
}
/**
 * Reads one or more PDF documents and reports their text, metadata and page count.
 *
 * Sources are processed one after another and independently: a failure on one
 * source is recorded as `{ source, success: false, error }` and does not abort
 * the remaining sources.
 *
 * @param {Array<{path: string, pages?: number[]}>} sources - PDFs to read. `path` is an
 *   http(s) URL or a local path accepted by `isPathSafe`. `pages` is an optional list of
 *   1-based page numbers; a missing or empty list means "the whole document".
 * @param {object} [options]
 * @param {boolean} [options.include_metadata=true] - Attach `info` / `metadata`.
 * @param {boolean} [options.include_page_count=true] - Attach `num_pages`.
 * @param {boolean} [options.include_full_text=true] - Attach `full_text` when no pages are requested.
 * @returns {Promise<{content: Array<{type: string, text: string}>, isError: boolean}>}
 *   Tool result whose single text item is the JSON-encoded `{ results }` object.
 * @throws {TypeError} If `sources` is not an array.
 * @throws {Error} If pdfjs-dist cannot be loaded (the import failure is attached as `cause`).
 */
async function handleReadPDF(sources, options = {}) {
    // Fail fast on a malformed call before paying for the pdfjs-dist import.
    if (!Array.isArray(sources)) {
        throw new TypeError('sources 必须是数组');
    }
    // Import pdfjs-dist on first use so the server can list its tools even when
    // the dependency is broken. The legacy build is tried first for Node.js.
    if (!pdfjsLib) {
        try {
            pdfjsLib = await import('pdfjs-dist/legacy/build/pdf.mjs');
        }
        catch {
            try {
                // Fall back to the standard build when the legacy one is unavailable.
                pdfjsLib = await import('pdfjs-dist');
            }
            catch (standardBuildError) {
                throw new Error('无法加载 pdfjs-dist,请执行 npm install 并确保 Node 版本>=18', { cause: standardBuildError });
            }
        }
    }
    const { include_metadata = true, include_page_count = true, include_full_text = true } = options ?? {};
    const describeError = (error) => (error instanceof Error ? error.message : String(error));
    const results = [];
    for (const source of sources) {
        // Read the path defensively: a null/garbage entry must fail on its own
        // instead of throwing again inside the catch block and aborting the batch.
        const sourcePath = source?.path;
        let loadingTask;
        try {
            if (typeof sourcePath !== 'string' || sourcePath === '') {
                throw new Error('无效的PDF源: 缺少 path 字符串');
            }
            let pdfData;
            if (sourcePath.startsWith('http://') || sourcePath.startsWith('https://')) {
                // Remote document.
                pdfData = await fetchPdfFromUrl(sourcePath);
            }
            else {
                // Local document: only files under the workspace root may be read.
                if (!isPathSafe(sourcePath)) {
                    throw new Error('文件路径不安全,只能访问项目根目录下的文件');
                }
                if (!existsSync(sourcePath)) {
                    throw new Error(`文件不存在: ${sourcePath}`);
                }
                const buffer = await fsExtra.readFile(sourcePath);
                pdfData = new Uint8Array(buffer);
            }
            loadingTask = pdfjsLib.getDocument({ data: pdfData });
            const pdf = await loadingTask.promise;
            const numPages = pdf.numPages;
            const result = {
                source: sourcePath,
                success: true,
                data: {}
            };
            if (include_page_count) {
                result.data.num_pages = numPages;
            }
            if (include_metadata) {
                try {
                    const metadata = await pdf.getMetadata();
                    result.data.info = metadata.info;
                    // Round-trip through JSON to keep only plain, serializable data.
                    result.data.metadata = metadata.metadata ? JSON.parse(JSON.stringify(metadata.metadata)) : null;
                }
                catch (error) {
                    // Metadata is best-effort: report the problem without failing the source.
                    result.data.metadata_error = describeError(error);
                }
            }
            // An empty `pages` array is treated like an absent one (whole document).
            const requestedPages = Array.isArray(source.pages) && source.pages.length > 0 ? source.pages : null;
            // Skip text extraction entirely when nothing would be returned.
            if (requestedPages || include_full_text) {
                const pagesToProcess = requestedPages ?? Array.from({ length: numPages }, (_, i) => i + 1);
                const pageTexts = [];
                for (const pageNum of pagesToProcess) {
                    // Ignore page numbers that cannot exist in this document.
                    if (!Number.isInteger(pageNum) || pageNum < 1 || pageNum > numPages) {
                        continue;
                    }
                    try {
                        const page = await pdf.getPage(pageNum);
                        const textContent = await page.getTextContent();
                        const pageText = textContent.items
                            .map((item) => item.str || '')
                            .join(' ')
                            .trim();
                        pageTexts.push({ page: pageNum, text: pageText });
                    }
                    catch (error) {
                        // Keep the page slot so callers can see which page failed and why.
                        pageTexts.push({ page: pageNum, text: '', error: describeError(error) });
                    }
                }
                if (requestedPages) {
                    // Explicit page selection: return per-page text.
                    result.data.page_texts = pageTexts;
                }
                else {
                    result.data.full_text = pageTexts.map((p) => p.text).join('\n\n').trim();
                }
            }
            results.push(result);
        }
        catch (error) {
            results.push({
                source: sourcePath,
                success: false,
                error: describeError(error)
            });
        }
        finally {
            // Release the document and its worker resources; a cleanup failure
            // must not change the outcome already recorded for this source.
            if (loadingTask) {
                try {
                    await loadingTask.destroy();
                }
                catch {
                    // Intentionally ignored: nothing useful can be done at this point.
                }
            }
        }
    }
    return {
        content: [{
                type: "text",
                text: JSON.stringify({
                    results: results
                }, null, 2)
            }],
        isError: false
    };
}
// Server setup: the identity reported to clients and the capabilities advertised (tools only).
const serverInfo = { name: "mcp-pdf-reader", version: "1.0.0" };
const serverOptions = { capabilities: { tools: {} } };
const server = new Server(serverInfo, serverOptions);
// Request handler: answer tool-listing requests with the tools this server provides.
server.setRequestHandler(ListToolsRequestSchema, async function listTools() {
    return { tools: PDF_TOOLS };
});
// Request handler: dispatch a tool call by name. Every failure is converted into an
// `isError: true` text result instead of being thrown back at the transport.
server.setRequestHandler(CallToolRequestSchema, async (request) => {
    // Shapes a plain-text error result.
    const errorResult = (text) => ({
        content: [{
                type: "text",
                text
            }],
        isError: true
    });
    try {
        switch (request.params.name) {
            case "read_pdf": {
                // `arguments` is optional on a tool call; default to {} so a missing
                // payload produces a clear validation message rather than a
                // destructuring TypeError.
                const { sources, include_metadata, include_page_count, include_full_text } = request.params.arguments ?? {};
                if (!Array.isArray(sources)) {
                    return errorResult('Error: "sources" must be an array of PDF sources');
                }
                return await handleReadPDF(sources, {
                    include_metadata,
                    include_page_count,
                    include_full_text
                });
            }
            default:
                return errorResult(`Unknown tool: ${request.params.name}`);
        }
    }
    catch (error) {
        return errorResult(`Error: ${error instanceof Error ? error.message : String(error)}`);
    }
});
/**
 * Connects the server to a stdio transport.
 *
 * @returns {Promise<void>} Settles once `server.connect` has completed.
 */
async function runServer() {
    await server.connect(new StdioServerTransport());
}
// Entry point: start the server; a startup failure is logged to stderr and ends the process.
const exitOnFatalError = (failure) => {
    console.error("Fatal error running server:", failure);
    process.exit(1);
};
runServer().catch(exitOnFatalError);