@aml2lab/mcp-pdf2md
Version:
MCP server: convert PDF to Markdown
298 lines (297 loc) • 11.2 kB
JavaScript
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js";
// Optional HTTP transport:
// import express from "express";
// import { StreamableHTTPServerTransport } from "@modelcontextprotocol/sdk/server/streamableHttp.js";
import { z } from "zod";
import pdf2md from "@aml2lab/pdf2md";
import { readFile, writeFile, mkdir, readdir, stat } from "node:fs/promises";
import { basename, join, relative, resolve, dirname, isAbsolute, } from "node:path";
import { Buffer } from "node:buffer";
import { request } from "undici";
// ----------------------
// Version / server
// ----------------------
// The single MCP server instance that the tools below are registered on.
const server = new McpServer(
    // Identity passed to the SDK for this server.
    { name: "mcp-pdf2md", version: "0.3.0" },
    // Server options: the tool-list-changed notification is listed as a debounced method.
    { debouncedNotificationMethods: ["notifications/tools/list_changed"] },
);
// ----------------------
// CLI defaults & env fallbacks
// ----------------------
/**
 * Work out the default input/output directories for this process.
 *
 * Values start from the PDF2MD_DEFAULT_INPUT_DIR / PDF2MD_DEFAULT_OUTPUT_DIR
 * environment variables and are overridden by `--default-input-dir <dir>` and
 * `--default-output-dir <dir>` found in `argv` (scanning starts at index 2).
 * A flag with no following non-empty token is ignored.
 *
 * @param {string[]} argv - Argument vector in `process.argv` layout.
 * @returns {{ inputDir: (string|undefined), outputDir: (string|undefined) }}
 */
function parseCliDefaults(argv) {
    // Flag name -> key of the result object it fills.
    const flagTargets = new Map([
        ["--default-input-dir", "inputDir"],
        ["--default-output-dir", "outputDir"],
    ]);
    const defaults = {
        inputDir: process.env.PDF2MD_DEFAULT_INPUT_DIR,
        outputDir: process.env.PDF2MD_DEFAULT_OUTPUT_DIR,
    };
    let pos = 2;
    while (pos < argv.length) {
        const target = flagTargets.get(argv[pos]);
        const value = argv[pos + 1];
        if (target !== undefined && value) {
            defaults[target] = value;
            // Step over the flag and the value it consumed.
            pos += 2;
        }
        else {
            pos += 1;
        }
    }
    return defaults;
}
// Defaults resolved once at startup from this process's argv and environment.
const CLI_DEFAULTS = parseCliDefaults(process.argv);
// ----------------------
// Utilities
// ----------------------
/**
 * Turn a `file://` URL into a filesystem path; any other string is returned as-is.
 *
 * The URL's pathname is percent-decoded. On win32 the leading slashes are
 * stripped first, so `/C:/dir/file.pdf` becomes `C:/dir/file.pdf`.
 *
 * @param {string} input - A `file://` URL or an arbitrary string.
 * @returns {string} The decoded path, or `input` unchanged.
 */
function pathFromMaybeFileUrl(input) {
    if (!input.startsWith("file://")) {
        return input;
    }
    const { pathname } = new URL(input);
    const encodedPath = process.platform === "win32"
        ? pathname.replace(/^\/+/, "")
        : pathname;
    return decodeURIComponent(encodedPath);
}
/**
 * Load the raw bytes of a PDF from one of several kinds of source.
 *
 * Supported forms of `source`:
 *  - `data:<mime>;base64,<payload>` - decoded in memory;
 *  - `http://` / `https://` URL (scheme matched case-insensitively) - fetched with a GET;
 *  - `file://` URL - converted to a local path and read from disk;
 *  - anything else - treated as a local filesystem path.
 *
 * @param {string} source - Location of the PDF.
 * @returns {Promise<Uint8Array>} The PDF content.
 * @throws {Error} If a `data:` URL is not base64-encoded, if the HTTP status is
 *   outside 200-299, or if the download / file read itself fails.
 */
async function loadPdfBytes(source) {
    if (source.startsWith("data:")) {
        const m = source.match(/^data:(.*?);base64,(.+)$/);
        if (!m) {
            throw new Error("Invalid data: URL (expected base64)");
        }
        return Uint8Array.from(Buffer.from(m[2], "base64"));
    }
    if (/^https?:\/\//i.test(source)) {
        const res = await request(source, { method: "GET" });
        if (res.statusCode < 200 || res.statusCode >= 300) {
            // The error body is never read. Per the undici documentation an unread
            // body should be consumed or dumped so the connection can be reused, so
            // drain it here. Draining is best-effort: a failure while draining must
            // not hide the HTTP status error reported below.
            try {
                await res.body.dump();
            }
            catch {
                /* ignore: the status error below is the one the caller needs */
            }
            throw new Error(`Failed to download PDF: HTTP ${res.statusCode}`);
        }
        const buf = Buffer.from(await res.body.arrayBuffer());
        return new Uint8Array(buf);
    }
    // local path
    const local = source.startsWith("file://") ? pathFromMaybeFileUrl(source) : source;
    const buf = await readFile(local);
    return new Uint8Array(buf);
}
/**
 * Create `dir` and any missing parent directories.
 *
 * @param {string} dir - Directory path to create.
 * @returns {Promise<void>}
 */
async function ensureDir(dir) {
    const mkdirOptions = { recursive: true };
    await mkdir(dir, mkdirOptions);
}
/**
 * Replace the trailing extension of a file name or path with `ext`.
 *
 * The extension is the final `.` plus the characters after it, provided those
 * characters contain no further `.` and no path separator (`/` or `\`). Excluding
 * separators keeps a dotted directory name from being mistaken for the
 * extension: `dir.v1/file` becomes `dir.v1/file.md`, not `dir.md`.
 * If there is no such extension, `ext` is simply appended.
 *
 * @param {string} name - File name or path.
 * @param {string} ext - Text appended after the extension is removed (e.g. ".md").
 * @returns {string} `name` with its extension swapped for `ext`.
 */
function changeExt(name, ext) {
    return name.replace(/\.[^./\\]+$/, "") + ext;
}
/**
 * Recursively list every non-directory entry beneath `dir`.
 *
 * Entries for which `Dirent.isDirectory()` is true are descended into; every
 * other entry is reported by its joined path. Directories are visited one at
 * a time, in the order `readdir` returns them.
 *
 * @param {string} dir - Directory to scan.
 * @returns {Promise<string[]>} Paths of all entries found, each prefixed with `dir`.
 * @throws Whatever `readdir` rejects with (e.g. when `dir` does not exist).
 */
async function walkDir(dir) {
    const files = [];
    const entries = await readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
        const fullPath = join(dir, entry.name);
        if (!entry.isDirectory()) {
            files.push(fullPath);
            continue;
        }
        // Append element by element: `push(...subtree)` passes every element as a
        // call argument and can throw a RangeError on very large subtrees.
        for (const nested of await walkDir(fullPath)) {
            files.push(nested);
        }
    }
    return files;
}
/**
 * Report whether `stat` succeeds for the given path.
 *
 * Any `stat` failure (not only "not found") is reported as `false`.
 *
 * @param {string} p - Path to probe.
 * @returns {Promise<boolean>} `true` when `stat(p)` resolves, otherwise `false`.
 */
async function fileExists(p) {
    let found = true;
    try {
        await stat(p);
    }
    catch {
        found = false;
    }
    return found;
}
/**
 * Decide whether `p` is a relative filesystem path.
 *
 * http(s) URLs (any letter case), `file://` URLs and `data:` URLs are never
 * considered relative; everything else is relative unless `isAbsolute` says
 * otherwise.
 *
 * @param {string} p - Source string supplied by the caller.
 * @returns {boolean} `true` only for a non-URL, non-absolute path.
 */
function looksRelativePath(p) {
    const isHttpUrl = /^https?:\/\//i.test(p);
    const isFileUrl = p.startsWith("file://");
    const isDataUrl = p.startsWith("data:");
    const isUrlLike = isHttpUrl || isFileUrl || isDataUrl;
    return !isUrlLike && !isAbsolute(p);
}
// ----------------------
// Zod shapes for tool inputs (MCP SDK expects shapes)
// ----------------------
// Input shape of the single-file tool ("pdf_to_markdown").
const SingleInputShape = {
    // Where the PDF comes from. Accepted forms:
    //   - an absolute local path (Windows or POSIX)
    //   - a file:// URL
    //   - an https:// URL
    //   - data:application/pdf;base64,<...>
    //   - a relative path, which is resolved under the default input dir when one is set
    source: z
        .string()
        .describe("PDF location"),
    // Optional directory to write the generated Markdown into.
    // When omitted, --default-output-dir is used if it was provided.
    outputDir: z
        .string()
        .optional()
        .describe("Directory to write the .md output"),
    // Optional output file name override (e.g. 'myfile.md').
    // When omitted, the name is the source base name with a .md extension.
    outputName: z
        .string()
        .optional()
        .describe("Target markdown filename, defaults from source"),
};
// Input shape of the folder tool ("pdf_folder_to_markdown").
// Every field carries a describe() text; `overwrite` and `concurrency`
// previously had none, unlike the other fields of both shapes.
const FolderInputShape = {
    /**
     * Folder containing PDFs. If omitted, uses --default-input-dir.
     */
    inputDir: z.string().optional().describe("Folder containing PDF files"),
    /**
     * Folder to write Markdown. If omitted, uses --default-output-dir.
     */
    outputDir: z.string().optional().describe("Folder to write generated Markdown files"),
    /**
     * Overwrite existing .md files if present (default: false).
     */
    overwrite: z.boolean().optional().default(false)
        .describe("Overwrite existing .md files instead of skipping them (default: false)"),
    /**
     * Concurrency for parallel conversions (1-16, default: 4).
     */
    concurrency: z.number().int().positive().max(16).optional().default(4)
        .describe("Number of PDFs converted in parallel, 1-16 (default: 4)"),
};
// ----------------------
// Tool 1: Single file (enhanced with defaults)
// ----------------------
// Tool "pdf_to_markdown": loads one PDF (local path, file:// URL, http(s) URL or
// base64 data: URL), converts it with pdf2md, optionally writes the Markdown to
// disk, and returns a plain-text summary followed by the Markdown itself.
server.registerTool("pdf_to_markdown", {
    title: "Convert one PDF to Markdown",
    description: "Convert a single PDF (path/URL/data URI) to Markdown. If outputDir is omitted, uses --default-output-dir when set. If source is relative and --default-input-dir is set, resolves under it.",
    inputSchema: SingleInputShape,
}, async (args) => {
    const { source: rawSource, outputDir, outputName } = z.object(SingleInputShape).parse(args);
    // Resolve relative source via default input dir if provided
    let source = rawSource;
    if (looksRelativePath(source) && CLI_DEFAULTS.inputDir) {
        source = join(pathFromMaybeFileUrl(CLI_DEFAULTS.inputDir), source);
    }
    const pdfBytes = await loadPdfBytes(source);
    const markdown = await pdf2md(pdfBytes);
    // Derive a display name from the source; data: URLs have no name and keep the fallback.
    const fallbackName = "document.pdf";
    let displayName = fallbackName;
    try {
        // Scheme test is case-insensitive, matching loadPdfBytes and looksRelativePath.
        if (/^https?:\/\//i.test(source) || source.startsWith("file://")) {
            // Decode first so "my%20file.pdf" becomes "my file.pdf"; taking the
            // basename after decoding also discards anything before an encoded "/".
            displayName = basename(decodeURIComponent(new URL(source).pathname));
        }
        else if (!source.startsWith("data:")) {
            displayName = basename(source);
        }
    }
    catch {
        /* unparsable URL or bad percent-encoding: keep the fallback name */
    }
    if (!displayName) {
        // e.g. "https://host/" has an empty basename; avoid producing a bare ".md".
        displayName = fallbackName;
    }
    const mdName = outputName ?? changeExt(displayName, ".md");
    // Output dir: explicit > default > none
    const chosenOutDir = outputDir ?? CLI_DEFAULTS.outputDir;
    let savedPath;
    if (chosenOutDir) {
        const outDirAbs = resolve(pathFromMaybeFileUrl(chosenOutDir));
        const outPath = join(outDirAbs, mdName);
        // outputName is caller-supplied: refuse names that resolve to the output
        // directory itself or to anything outside it (e.g. "../x.md").
        const relToOutDir = relative(outDirAbs, outPath);
        const escapesOutDir = relToOutDir === ""
            || isAbsolute(relToOutDir)
            || /^\.\.(?:[\\/]|$)/.test(relToOutDir);
        if (escapesOutDir) {
            throw new Error(`outputName must resolve to a file inside the output directory: ${mdName}`);
        }
        await ensureDir(outDirAbs);
        await writeFile(outPath, markdown, "utf8");
        savedPath = outPath;
    }
    const summaryLines = [
        `Source: ${source}`,
        chosenOutDir ? `Saved: ${savedPath}` : "Saved: <not saved to disk>",
        `Name: ${mdName}`,
    ].join("\n");
    return {
        content: [
            { type: "text", text: summaryLines, mimeType: "text/plain", name: "summary.txt" },
            { type: "text", text: markdown, mimeType: "text/markdown", name: mdName }
        ]
    };
});
// Tool "pdf_folder_to_markdown": walks an input folder, converts every *.pdf it
// finds into a .md file at the same relative location under the output folder,
// and returns a text summary plus a JSON manifest with one entry per PDF.
server.registerTool("pdf_folder_to_markdown", {
    title: "Convert a folder of PDFs to Markdown",
    description: "Recursively convert PDFs in inputDir into Markdown in outputDir. If inputDir/outputDir omitted, uses CLI defaults.",
    inputSchema: FolderInputShape,
}, async (args) => {
    const parsed = z.object(FolderInputShape).parse(args);
    const { overwrite, concurrency } = parsed;
    // An explicit argument wins over the CLI/env default; having neither is an error.
    const resolvedInputDir = parsed.inputDir ?? CLI_DEFAULTS.inputDir;
    if (resolvedInputDir == null) {
        throw new Error("inputDir not provided and --default-input-dir not set");
    }
    const resolvedOutputDir = parsed.outputDir ?? CLI_DEFAULTS.outputDir;
    if (resolvedOutputDir == null) {
        throw new Error("outputDir not provided and --default-output-dir not set");
    }
    const inDir = resolve(pathFromMaybeFileUrl(resolvedInputDir));
    const outDir = resolve(pathFromMaybeFileUrl(resolvedOutputDir));
    await ensureDir(outDir);
    // Collect every file under inDir whose name ends in .pdf (any letter case).
    const pdfs = (await walkDir(inDir)).filter((file) => /\.pdf$/i.test(file));
    const manifest = [];
    const workerCount = Math.max(1, Math.min(concurrency ?? 4, 16));
    // Shared cursor into `pdfs`. The bounds check and the increment run without an
    // `await` in between, so each index is handed to exactly one worker.
    let cursor = 0;
    const runWorker = async () => {
        while (cursor < pdfs.length) {
            const pdfPath = pdfs[cursor++];
            try {
                // Mirror the PDF's location relative to inDir under outDir.
                const relPath = relative(inDir, pdfPath);
                const targetPath = join(outDir, changeExt(relPath, ".md"));
                await ensureDir(dirname(targetPath));
                if (!overwrite && await fileExists(targetPath)) {
                    manifest.push({ input: pdfPath, output: targetPath, status: "skipped" });
                }
                else {
                    const raw = await readFile(pdfPath);
                    const md = await pdf2md(new Uint8Array(raw));
                    await writeFile(targetPath, md, "utf8");
                    manifest.push({
                        input: pdfPath,
                        output: targetPath,
                        status: "converted",
                        bytes: Buffer.byteLength(md, "utf8")
                    });
                }
            }
            catch (err) {
                // A failure on one file is recorded and the worker moves on to the next.
                manifest.push({
                    input: pdfPath,
                    status: "failed",
                    error: String(err?.message ?? err)
                });
            }
        }
    };
    const running = [];
    for (let w = 0; w < workerCount; w++) {
        running.push(runWorker());
    }
    await Promise.all(running);
    const countWithStatus = (status) => manifest.filter((item) => item.status === status).length;
    const summary = [
        `Input dir: ${inDir}`,
        `Output dir: ${outDir}`,
        `Found PDFs: ${pdfs.length}`,
        `Converted: ${countWithStatus("converted")}`,
        `Skipped: ${countWithStatus("skipped")}`,
        `Failed: ${countWithStatus("failed")}`
    ].join("\n");
    const manifestJson = JSON.stringify({ inputDir: inDir, outputDir: outDir, items: manifest }, null, 2);
    return {
        content: [
            { type: "text", text: summary, mimeType: "text/plain", name: "summary.txt" },
            { type: "text", text: manifestJson, mimeType: "application/json", name: "pdf2md_manifest.json" }
        ]
    };
});
// ----------------------
// Transport (stdio default)
// ----------------------
// Transport selection: stdio is the only transport currently enabled.
const useStdio = true;
/**
 * Connect the MCP server to its transport.
 *
 * With `useStdio` true the server is connected over a StdioServerTransport.
 * Otherwise nothing is connected: the HTTP variant below is kept as a
 * commented-out sketch only.
 *
 * @returns {Promise<void>}
 */
async function main() {
    if (!useStdio) {
        // Optional HTTP transport
        // const app = express();
        // const transport = new StreamableHTTPServerTransport({ app, endpoint: "/mcp", allowedOrigins: ["http://localhost"] });
        // await server.connect(transport);
        // app.listen(3000, () => console.error("[mcp-pdf2md] HTTP: http://127.0.0.1:3000/mcp"));
        return;
    }
    const stdioTransport = new StdioServerTransport();
    await server.connect(stdioTransport);
}
// Startup failures are logged to stderr and end the process with exit code 1.
function exitOnFatal(err) {
    console.error(err);
    process.exit(1);
}
main().catch(exitOnFatal);