UNPKG

@aml2lab/mcp-pdf2md

Version:

MCP server: convert PDF to Markdown

298 lines (297 loc) 11.2 kB
#!/usr/bin/env node
/**
 * MCP server that converts PDF documents to Markdown.
 *
 * Registers two tools on a stdio transport:
 *  - `pdf_to_markdown`: convert one PDF given as a path, file:// URL,
 *    http(s):// URL or base64 data: URL.
 *  - `pdf_folder_to_markdown`: recursively convert every `*.pdf` in a folder.
 *
 * Default input/output folders can be supplied with `--default-input-dir` /
 * `--default-output-dir` or the `PDF2MD_DEFAULT_INPUT_DIR` /
 * `PDF2MD_DEFAULT_OUTPUT_DIR` environment variables.
 */
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
// Optional HTTP transport:
// import express from "express";
// import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
import { z } from "zod";
import pdf2md from "@aml2lab/pdf2md";
import { readFile, writeFile, mkdir, readdir, stat } from "node:fs/promises";
import {
  basename,
  join,
  relative,
  resolve,
  dirname,
  isAbsolute,
  sep,
} from "node:path";
import { Buffer } from "node:buffer";
import { request } from "undici";

// ----------------------
// Version / server
// ----------------------
const server = new McpServer(
  { name: "mcp-pdf2md", version: "0.3.0" },
  { debouncedNotificationMethods: ["notifications/tools/list_changed"] },
);

// ----------------------
// CLI defaults & env fallbacks
// ----------------------

/** Maps each recognised CLI flag to the key it sets on the defaults object. */
const CLI_FLAG_TARGETS = new Map([
  ["--default-input-dir", "inputDir"],
  ["--default-output-dir", "outputDir"],
]);

/**
 * Reads the default input/output directories from the environment and the
 * command line. CLI flags override environment variables.
 *
 * Both `--flag value` and `--flag=value` spellings are accepted; unknown
 * arguments and flags without a value are ignored.
 *
 * @param {string[]} argv - Full argument vector (`process.argv` layout: the
 *   first two entries are skipped).
 * @returns {{ inputDir: string | undefined, outputDir: string | undefined }}
 */
function parseCliDefaults(argv) {
  const defaults = {
    inputDir: process.env.PDF2MD_DEFAULT_INPUT_DIR,
    outputDir: process.env.PDF2MD_DEFAULT_OUTPUT_DIR,
  };
  for (let i = 2; i < argv.length; i++) {
    const arg = argv[i];
    const eqAt = arg.indexOf("=");
    if (eqAt !== -1) {
      // `--flag=value` form: the value lives in the same argument.
      const key = CLI_FLAG_TARGETS.get(arg.slice(0, eqAt));
      const value = arg.slice(eqAt + 1);
      if (key && value) {
        defaults[key] = value;
      }
      continue;
    }
    const key = CLI_FLAG_TARGETS.get(arg);
    if (key && argv[i + 1]) {
      // `--flag value` form: consume the following argument as the value.
      defaults[key] = argv[++i];
    }
  }
  return defaults;
}

/** Defaults resolved once at startup from `process.argv` and the environment. */
const CLI_DEFAULTS = parseCliDefaults(process.argv);

// ----------------------
// Utilities
// ----------------------

/**
 * Converts a `file://` URL to a filesystem path; any other input is returned
 * unchanged.
 *
 * @param {string} input - A `file://` URL or a plain path.
 * @returns {string} The percent-decoded path. On Windows the leading slashes
 *   of the URL pathname are stripped.
 */
function pathFromMaybeFileUrl(input) {
  if (!input.startsWith("file://")) {
    return input;
  }
  const { pathname } = new URL(input);
  return process.platform === "win32"
    ? decodeURIComponent(pathname.replace(/^\/+/, ""))
    : decodeURIComponent(pathname);
}

/**
 * Loads the raw bytes of a PDF from a data: URL, an http(s):// URL, a file://
 * URL or a local path.
 *
 * @param {string} source - Where to read the PDF from.
 * @returns {Promise<Uint8Array>} The PDF bytes.
 * @throws {Error} If a data: URL is not base64-encoded, if the HTTP response
 *   status is outside 200-299, or if the local file cannot be read.
 */
async function loadPdfBytes(source) {
  if (source.startsWith("data:")) {
    const m = source.match(/^data:(.*?);base64,(.+)$/);
    if (!m) throw new Error("Invalid data: URL (expected base64)");
    return Uint8Array.from(Buffer.from(m[2], "base64"));
  }
  if (/^https?:\/\//i.test(source)) {
    const res = await request(source, { method: "GET" });
    if (res.statusCode < 200 || res.statusCode >= 300) {
      throw new Error(`Failed to download PDF: HTTP ${res.statusCode}`);
    }
    const downloaded = Buffer.from(await res.body.arrayBuffer());
    return new Uint8Array(downloaded);
  }
  // Local path (pathFromMaybeFileUrl is a no-op for non-file:// input).
  const fileBytes = await readFile(pathFromMaybeFileUrl(source));
  return new Uint8Array(fileBytes);
}

/**
 * Creates `dir` and any missing parents; succeeds if it already exists.
 *
 * @param {string} dir - Directory to create.
 * @returns {Promise<void>}
 */
async function ensureDir(dir) {
  await mkdir(dir, { recursive: true });
}

/**
 * Replaces the extension of `name` with `ext`, or appends `ext` when the final
 * path segment has no extension.
 *
 * Only the last path segment is inspected, so a dot inside a directory name
 * (e.g. `dir.v2/file`) is never mistaken for an extension.
 *
 * @param {string} name - File name or relative path.
 * @param {string} ext - New extension including the leading dot.
 * @returns {string}
 */
function changeExt(name, ext) {
  return name.replace(/\.[^./\\]+$/, "") + ext;
}

/**
 * Recursively lists every non-directory entry below `dir`.
 *
 * @param {string} dir - Directory to walk.
 * @returns {Promise<string[]>} Paths built by joining `dir` with entry names.
 */
async function walkDir(dir) {
  const out = [];
  const entries = await readdir(dir, { withFileTypes: true });
  for (const entry of entries) {
    const full = join(dir, entry.name);
    if (entry.isDirectory()) {
      out.push(...(await walkDir(full)));
    } else {
      out.push(full);
    }
  }
  return out;
}

/**
 * Reports whether `p` can be stat'ed.
 *
 * @param {string} p - Path to probe.
 * @returns {Promise<boolean>} `false` on any `stat` failure.
 */
async function fileExists(p) {
  try {
    await stat(p);
    return true;
  } catch {
    return false;
  }
}

/**
 * Reports whether `p` is a relative filesystem path (i.e. neither a URL of a
 * supported scheme nor an absolute path).
 *
 * @param {string} p - Source string supplied by the caller.
 * @returns {boolean}
 */
function looksRelativePath(p) {
  if (/^https?:\/\//i.test(p) || p.startsWith("file://") || p.startsWith("data:")) {
    return false;
  }
  return !isAbsolute(p);
}

/**
 * Derives a human-readable PDF file name from a source string.
 *
 * @param {string} source - data: URL, http(s):// URL, file:// URL or path.
 * @returns {string} The last path segment (percent-decoded for URLs), or
 *   `"document.pdf"` when no name can be derived.
 */
function displayNameFromSource(source) {
  const fallback = "document.pdf";
  if (source.startsWith("data:")) {
    return fallback;
  }
  // Case-insensitive scheme test, matching loadPdfBytes/looksRelativePath.
  if (/^https?:\/\//i.test(source) || source.startsWith("file://")) {
    try {
      const rawName = basename(new URL(source).pathname);
      let decoded = rawName;
      try {
        decoded = decodeURIComponent(rawName);
      } catch {
        // Malformed percent-escape: keep the raw segment.
      }
      // basename again: a decoded %2F must not introduce directory parts.
      return basename(decoded) || fallback;
    } catch {
      return fallback;
    }
  }
  return basename(source) || fallback;
}

/**
 * Builds the path of an output file inside `outDirAbs` and refuses names that
 * would land outside that directory (e.g. via `..` segments).
 *
 * @param {string} outDirAbs - Absolute output directory.
 * @param {string} fileName - Requested file name, relative to `outDirAbs`.
 * @returns {string} The joined output path.
 * @throws {Error} If the name is empty/`.` or escapes `outDirAbs`.
 */
function resolveOutputPath(outDirAbs, fileName) {
  const outPath = join(outDirAbs, fileName);
  const rel = relative(outDirAbs, outPath);
  const escapes = rel === ".." || rel.startsWith(`..${sep}`) || isAbsolute(rel);
  if (rel === "" || escapes) {
    throw new Error(`outputName must resolve to a file inside the output directory: ${fileName}`);
  }
  return outPath;
}

/**
 * Converts one PDF from the input tree into a Markdown file at the mirrored
 * location in the output tree. Never throws: failures are reported in the
 * returned manifest entry.
 *
 * @param {string} pdfPath - Path of the PDF, located under `inDir`.
 * @param {string} inDir - Root of the input tree.
 * @param {string} outDir - Root of the output tree.
 * @param {boolean} overwrite - Replace an existing target when `true`.
 * @returns {Promise<object>} Manifest entry with `input`, `status`
 *   (`"converted"`, `"skipped"` or `"failed"`) and, depending on status,
 *   `output`, `bytes` or `error`.
 */
async function convertPdfFile(pdfPath, inDir, outDir, overwrite) {
  try {
    const rel = relative(inDir, pdfPath);
    const targetPath = join(outDir, changeExt(rel, ".md"));
    await ensureDir(dirname(targetPath));
    if (!overwrite && (await fileExists(targetPath))) {
      return { input: pdfPath, output: targetPath, status: "skipped" };
    }
    const pdfBytes = await readFile(pdfPath);
    const md = await pdf2md(new Uint8Array(pdfBytes));
    await writeFile(targetPath, md, "utf8");
    return {
      input: pdfPath,
      output: targetPath,
      status: "converted",
      bytes: Buffer.byteLength(md, "utf8"),
    };
  } catch (err) {
    return { input: pdfPath, status: "failed", error: String(err?.message ?? err) };
  }
}

// ----------------------
// Zod shapes for tool inputs (MCP SDK expects shapes)
// ----------------------
const SingleInputShape = {
  /**
   * PDF location:
   * - absolute local path (Windows or POSIX)
   * - file:// URL
   * - https:// URL
   * - data:application/pdf;base64,<...>
   * - OR a relative path; if so and a default input dir is set, it resolves under that folder
   */
  source: z.string().describe("PDF location"),
  /**
   * Optional: write the generated Markdown to this directory.
   * If omitted, uses --default-output-dir when provided.
   */
  outputDir: z.string().optional().describe("Directory to write the .md output"),
  /**
   * Optional: override output file name (e.g. 'myfile.md').
   * If omitted, uses the source base name with .md.
   */
  outputName: z.string().optional().describe("Target markdown filename, defaults from source"),
};

const FolderInputShape = {
  /**
   * Folder containing PDFs. If omitted, uses --default-input-dir.
   */
  inputDir: z.string().optional().describe("Folder containing PDF files"),
  /**
   * Folder to write Markdown. If omitted, uses --default-output-dir.
   */
  outputDir: z.string().optional().describe("Folder to write generated Markdown files"),
  /**
   * Overwrite existing .md files if present.
   */
  overwrite: z.boolean().optional().default(false),
  /**
   * Concurrency for parallel conversions (1-16).
   */
  concurrency: z.number().int().positive().max(16).optional().default(4),
};

// ----------------------
// Tool 1: Single file (enhanced with defaults)
// ----------------------
server.registerTool(
  "pdf_to_markdown",
  {
    title: "Convert one PDF to Markdown",
    description:
      "Convert a single PDF (path/URL/data URI) to Markdown. If outputDir is omitted, uses --default-output-dir when set. If source is relative and --default-input-dir is set, resolves under it.",
    inputSchema: SingleInputShape,
  },
  async (args) => {
    const { source: rawSource, outputDir, outputName } = z.object(SingleInputShape).parse(args);

    // Resolve relative source via default input dir if provided
    let source = rawSource;
    if (looksRelativePath(source) && CLI_DEFAULTS.inputDir) {
      source = join(pathFromMaybeFileUrl(CLI_DEFAULTS.inputDir), source);
    }

    const pdfBytes = await loadPdfBytes(source);
    const markdown = await pdf2md(pdfBytes);

    // Determine output name
    const mdName = outputName ?? changeExt(displayNameFromSource(source), ".md");

    // Output dir: explicit > default > none
    const chosenOutDir = outputDir ?? CLI_DEFAULTS.outputDir;
    let savedPath;
    if (chosenOutDir) {
      const outDirAbs = resolve(pathFromMaybeFileUrl(chosenOutDir));
      // Reject names such as "../x.md" that would write outside the folder.
      const outPath = resolveOutputPath(outDirAbs, mdName);
      // Creates outDirAbs as well, since outPath is guaranteed to be inside it.
      await ensureDir(dirname(outPath));
      await writeFile(outPath, markdown, "utf8");
      savedPath = outPath;
    }

    const summaryLines = [
      `Source: ${source}`,
      chosenOutDir ? `Saved: ${savedPath}` : "Saved: <not saved to disk>",
      `Name: ${mdName}`,
    ].join("\n");

    return {
      content: [
        { type: "text", text: summaryLines, mimeType: "text/plain", name: "summary.txt" },
        { type: "text", text: markdown, mimeType: "text/markdown", name: mdName },
      ],
    };
  },
);

// ----------------------
// Tool 2: Folder (recursive batch conversion)
// ----------------------
server.registerTool(
  "pdf_folder_to_markdown",
  {
    title: "Convert a folder of PDFs to Markdown",
    description:
      "Recursively convert PDFs in inputDir into Markdown in outputDir. If inputDir/outputDir omitted, uses CLI defaults.",
    inputSchema: FolderInputShape,
  },
  async (args) => {
    const { inputDir, outputDir, overwrite, concurrency } = z.object(FolderInputShape).parse(args);

    // Apply defaults if missing
    const resolvedInputDir = inputDir ?? CLI_DEFAULTS.inputDir;
    if (resolvedInputDir == null) {
      throw new Error("inputDir not provided and --default-input-dir not set");
    }
    const resolvedOutputDir = outputDir ?? CLI_DEFAULTS.outputDir;
    if (resolvedOutputDir == null) {
      throw new Error("outputDir not provided and --default-output-dir not set");
    }

    const inDir = resolve(pathFromMaybeFileUrl(resolvedInputDir));
    const outDir = resolve(pathFromMaybeFileUrl(resolvedOutputDir));
    await ensureDir(outDir);

    // Gather PDFs; sorted so the manifest order does not depend on readdir order.
    const allFiles = await walkDir(inDir);
    const pdfs = allFiles.filter((f) => /\.pdf$/i.test(f)).sort();

    // Each slot is filled by whichever worker claims that index, so the
    // manifest follows input order regardless of completion order.
    const manifest = new Array(pdfs.length);
    const workers = Math.max(1, Math.min(concurrency ?? 4, 16));
    let nextIndex = 0;
    const worker = async () => {
      for (;;) {
        // Claiming an index is synchronous, so no two workers share a file.
        const myIdx = nextIndex++;
        if (myIdx >= pdfs.length) return;
        manifest[myIdx] = await convertPdfFile(pdfs[myIdx], inDir, outDir, overwrite);
      }
    };
    await Promise.all(Array.from({ length: workers }, () => worker()));

    const countWithStatus = (status) => manifest.filter((m) => m.status === status).length;
    const summary = [
      `Input dir: ${inDir}`,
      `Output dir: ${outDir}`,
      `Found PDFs: ${pdfs.length}`,
      `Converted: ${countWithStatus("converted")}`,
      `Skipped: ${countWithStatus("skipped")}`,
      `Failed: ${countWithStatus("failed")}`,
    ].join("\n");

    return {
      content: [
        { type: "text", text: summary, mimeType: "text/plain", name: "summary.txt" },
        {
          type: "text",
          text: JSON.stringify({ inputDir: inDir, outputDir: outDir, items: manifest }, null, 2),
          mimeType: "application/json",
          name: "pdf2md_manifest.json",
        },
      ],
    };
  },
);

// ----------------------
// Transport (stdio default)
// ----------------------
const useStdio = true;

/**
 * Connects the server to its transport. Only stdio is wired up; the HTTP
 * variant below is left as a commented-out sketch.
 *
 * @returns {Promise<void>}
 */
async function main() {
  if (useStdio) {
    const transport = new StdioServerTransport();
    await server.connect(transport);
    return;
  }
  // Optional HTTP transport
  // const app = express();
  // const transport = new StreamableHTTPServerTransport({ app, endpoint: "/mcp", allowedOrigins: ["http://localhost"] });
  // await server.connect(transport);
  // app.listen(3000, () => console.error("[mcp-pdf2md] HTTP: http://127.0.0.1:3000/mcp"));
}

main().catch((err) => {
  console.error(err);
  process.exit(1);
});