UNPKG

brain-mcp

Version:

Brain MCP Server - Semantic knowledge base access for Claude Code via Model Context Protocol. Provides intelligent search and navigation of files from multiple locations through native MCP tools.

211 lines 8.46 kB
"use strict";
/**
 * PDF parser for extracting text and structure from PDF files
 */
// ---------------------------------------------------------------------------
// CommonJS interop helpers emitted by the TypeScript compiler. They are kept
// exactly as generated; only the layout has been restored.
// ---------------------------------------------------------------------------
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    var desc = Object.getOwnPropertyDescriptor(m, k);
    if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
        desc = { enumerable: true, get: function() { return m[k]; } };
    }
    Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
    if (k2 === undefined) k2 = k;
    o[k2] = m[k];
}));
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
    Object.defineProperty(o, "default", { enumerable: true, value: v });
}) : function(o, v) {
    o["default"] = v;
});
var __importStar = (this && this.__importStar) || (function () {
    var ownKeys = function(o) {
        ownKeys = Object.getOwnPropertyNames || function (o) {
            var ar = [];
            for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
            return ar;
        };
        return ownKeys(o);
    };
    return function (mod) {
        if (mod && mod.__esModule) return mod;
        var result = {};
        if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
        __setModuleDefault(result, mod);
        return result;
    };
})();
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.PDFParser = void 0;
const path = __importStar(require("path"));
const fs = __importStar(require("fs"));
const child_process_1 = require("child_process");
const pdf_parse_1 = __importDefault(require("pdf-parse"));
const types_1 = require("../models/types");
/**
 * Parser that turns a PDF file into a record of title, headings, links, tags,
 * metadata and word count.
 *
 * Text is taken from the external `pdftotext` program when it can be run, and
 * from the `pdf-parse` package otherwise. Headings, links and tags are found
 * with text heuristics, since the extracted text carries no structural markup.
 */
class PDFParser {
    /**
     * Parse the PDF at `filePath`.
     *
     * @param {string} filePath - Path of the PDF file to read.
     * @param {*} content - Accepted for signature compatibility; this method
     *   does not read it and always loads the file from disk.
     * @param {string} notesRoot - Directory that `relativePath` is computed against.
     * @returns {Promise<object>} Object with `path`, `relativePath`, `title`,
     *   `headings`, `outgoingLinks`, `tags`, `frontmatter`, `lastModified`
     *   and `wordCount`.
     * @throws Rejects when the `pdf-parse` fallback or the `fs.promises.stat`
     *   call fails.
     */
    async parse(filePath, content, notesRoot) {
        let text;
        let pdfInfo = {};
        try {
            // Try using pdftotext for cleaner text extraction.
            // execFileSync spawns the program directly (no shell), so filePath
            // is handed over as one argv entry and characters such as quotes,
            // backticks or `$(...)` in a file name are never interpreted.
            text = (0, child_process_1.execFileSync)('pdftotext', ['-layout', filePath, '-'], {
                encoding: 'utf-8',
                maxBuffer: 50 * 1024 * 1024 // 50MB buffer
            });
            // Get basic info using pdf-parse for metadata
            const dataBuffer = await fs.promises.readFile(filePath);
            const pdfData = await (0, pdf_parse_1.default)(dataBuffer, { max: 1 }); // Only parse first page for metadata
            pdfInfo = pdfData.info || {};
        }
        catch (error) {
            // Fallback to pdf-parse. Note that any failure above ends up here
            // (missing binary, non-zero exit, output over maxBuffer, or the
            // metadata read), not only a missing pdftotext.
            console.log('pdftotext not available, falling back to pdf-parse');
            const dataBuffer = await fs.promises.readFile(filePath);
            const pdfData = await (0, pdf_parse_1.default)(dataBuffer);
            text = pdfData.text;
            pdfInfo = pdfData.info || {};
        }
        // Extract title from metadata or filename
        const title = pdfInfo?.Title || path.basename(filePath, path.extname(filePath));
        // Extract sections based on text patterns
        const headings = this.extractHeadings(text);
        // Extract links (PDFs might contain URLs)
        const outgoingLinks = this.extractLinks(text, filePath);
        // Extract tags (if any hashtags are in the text)
        const tags = this.extractTags(text);
        // Calculate word count
        const wordCount = text.split(/\s+/).filter(word => word.length > 0).length;
        // Get file modification time
        const stats = await fs.promises.stat(filePath);
        const lastModified = stats.mtime;
        // Calculate relative path
        const relativePath = path.relative(notesRoot, filePath);
        // Build metadata from PDF info; only fields that are present are copied
        const frontmatter = {};
        if (pdfInfo) {
            if (pdfInfo.Title)
                frontmatter.title = pdfInfo.Title;
            if (pdfInfo.Author)
                frontmatter.author = pdfInfo.Author;
            if (pdfInfo.Subject)
                frontmatter.subject = pdfInfo.Subject;
            if (pdfInfo.Keywords)
                frontmatter.keywords = pdfInfo.Keywords;
            if (pdfInfo.CreationDate)
                frontmatter.creationDate = pdfInfo.CreationDate;
            if (pdfInfo.ModDate)
                frontmatter.modificationDate = pdfInfo.ModDate;
        }
        return {
            path: filePath,
            relativePath,
            title,
            headings,
            outgoingLinks,
            tags,
            frontmatter,
            lastModified,
            wordCount
        };
    }
    /**
     * Guess headings from plain text, one line at a time.
     *
     * @param {string} text - Extracted document text.
     * @returns {Array<{level: number, text: string, lineNumber: number, slug: string}>}
     *   Detected headings in document order; `lineNumber` is 1-based.
     */
    extractHeadings(text) {
        const headings = [];
        const lines = text.split('\n');
        let lineNumber = 0;
        for (const line of lines) {
            lineNumber++;
            const trimmedLine = line.trim();
            // Detect potential headings based on patterns
            // Pattern 1: All caps lines with at least 3 words.
            // The line must contain a letter: a letter-free row such as
            // "12.5  13.7  19.2" also equals its own upper-casing and would
            // otherwise be reported as a heading.
            if (trimmedLine.length > 10 &&
                trimmedLine === trimmedLine.toUpperCase() &&
                /\p{L}/u.test(trimmedLine) &&
                !/^\d+$/.test(trimmedLine) &&
                trimmedLine.split(/\s+/).length > 2) {
                headings.push({
                    level: 1,
                    text: trimmedLine,
                    lineNumber,
                    slug: this.createSlug(trimmedLine)
                });
            }
            // Pattern 2: Lines that look like numbered sections
            // (e.g., "1. Introduction", "2.1. Background"); every number,
            // including the last, must be followed by a dot.
            else if (/^(?:\d+\.)+\s+[A-Z]/.test(trimmedLine)) {
                const match = trimmedLine.match(/^((?:\d+\.)+)\s+(.+)$/);
                if (match) {
                    // Level is the numbering depth. The prefix ends with a
                    // dot, so splitting yields a trailing empty string that
                    // must not be counted ("1." is depth 1, "2.1." depth 2).
                    const level = match[1].split('.').filter(part => part.length > 0).length;
                    headings.push({
                        level: Math.min(level, 6),
                        text: match[2],
                        lineNumber,
                        slug: this.createSlug(match[2])
                    });
                }
            }
            // Pattern 3: Lines that start with "Chapter", "Section", etc.
            else if (/^(Chapter|Section|Part)\s+\d+[:\s]/i.test(trimmedLine)) {
                headings.push({
                    level: 1,
                    text: trimmedLine,
                    lineNumber,
                    slug: this.createSlug(trimmedLine)
                });
            }
        }
        return headings;
    }
    /**
     * Collect http/https URLs that appear in the text.
     *
     * @param {string} text - Extracted document text.
     * @param {string} sourcePath - Stored as `sourcePath` on every link.
     * @returns {Array<object>} One entry per URL occurrence with `sourcePath`,
     *   `targetPath` (always null), `linkType`, `linkText`, `context`,
     *   `lineNumber` (1-based) and `isBroken` (always false).
     */
    extractLinks(text, sourcePath) {
        const links = [];
        const lines = text.split('\n');
        // URL pattern
        const urlPattern = /https?:\/\/[^\s<>"\{\}\|\^\[\]`]+/g;
        lines.forEach((line, lineIndex) => {
            for (const match of line.matchAll(urlPattern)) {
                // Remove trailing punctuation
                const url = match[0].replace(/[.,;!?]+$/, '');
                // Extract context around the link: up to 50 characters on
                // either side of the (punctuation-stripped) URL
                const start = Math.max(0, match.index - 50);
                const end = Math.min(line.length, match.index + url.length + 50);
                let context = line.slice(start, end).trim();
                if (start > 0)
                    context = '...' + context;
                if (end < line.length)
                    context = context + '...';
                links.push({
                    sourcePath,
                    targetPath: null,
                    linkType: types_1.LinkType.MARKDOWN, // Using markdown type for URLs
                    linkText: url,
                    context,
                    lineNumber: lineIndex + 1,
                    isBroken: false
                });
            }
        });
        return links;
    }
    /**
     * Collect hashtags (`#name`) that start the text or follow whitespace.
     *
     * @param {string} text - Extracted document text.
     * @returns {Set<string>} Tag names without the leading `#`.
     */
    extractTags(text) {
        const tags = new Set();
        const tagPattern = /(?:^|(?<=\s))#([a-zA-Z0-9_-]+)/g;
        for (const match of text.matchAll(tagPattern)) {
            tags.add(match[1]);
        }
        return tags;
    }
    /**
     * Build a URL-style slug: lower-case, drop characters other than word
     * characters, whitespace and hyphens, collapse runs of whitespace/hyphens
     * into one hyphen, and strip hyphens from both ends.
     *
     * @param {string} text - Heading text.
     * @returns {string} The slug (may be empty).
     */
    createSlug(text) {
        let slug = text.toLowerCase().replace(/[^\w\s-]/g, '');
        slug = slug.replace(/[-\s]+/g, '-');
        return slug.replace(/^-+|-+$/g, '');
    }
    /**
     * Report whether this parser handles the given file extension.
     *
     * @param {string} extension - Extension including the dot, any letter case.
     * @returns {boolean} True when the lower-cased extension is supported.
     */
    supports(extension) {
        return this.getSupportedExtensions().includes(extension.toLowerCase());
    }
    /**
     * List the file extensions this parser handles.
     *
     * @returns {string[]} Always `['.pdf']`.
     */
    getSupportedExtensions() {
        return ['.pdf'];
    }
}
exports.PDFParser = PDFParser;
//# sourceMappingURL=PDFParser.js.map