UNPKG

ai-knowledge-hub

Version:

MCP server that provides unified access to organizational knowledge across multiple platforms (local docs, Guru, Notion)

407 lines 14 kB
/**
 * Markdown parser wrapper using remark ecosystem
 */
import { remark } from 'remark';
import remarkParse from 'remark-parse';
import remarkGfm from 'remark-gfm';
import remarkFrontmatter from 'remark-frontmatter';
import matter from 'gray-matter';
import { DEFAULT_PARSER_OPTIONS, } from '../types/markdown.js';

/**
 * Markdown parser class using remark.
 *
 * Wraps a remark processor (with GFM and, optionally, frontmatter syntax)
 * and converts the resulting MDAST into the project's flat "MarkdownNode"
 * object shape (`{ type, content?, children?, ... }`). Also extracts
 * document metadata and runs lightweight structural/content validation.
 */
export class MarkdownParser {
    /** The configured remark processor used by {@link MarkdownParser#parseToAST}. */
    processor;

    /**
     * @param {object} [options] - Overrides merged on top of DEFAULT_PARSER_OPTIONS.
     *   Only `extractMetadata` is read here: when truthy, the frontmatter
     *   syntax plugin (yaml + toml) is registered on the processor.
     */
    constructor(options = {}) {
        const config = { ...DEFAULT_PARSER_OPTIONS, ...options };
        this.processor = remark()
            .use(remarkParse, {
                // Enable parsing of hard line breaks
                // NOTE(review): confirm the installed remark-parse version still honors a
                // `breaks` option; recent versions may ignore it (remark-breaks is the
                // usual replacement).
                breaks: true,
            })
            .use(remarkGfm);
        if (config.extractMetadata) {
            this.processor.use(remarkFrontmatter, ['yaml', 'toml']);
        }
    }

    /**
     * Parse markdown content into our custom AST format.
     *
     * @param {string} content - Markdown source text.
     * @returns {object[]} Converted top-level nodes (unsupported node types are dropped).
     * @throws {Error} "Failed to parse markdown: ..." wrapping any parser failure;
     *   the original error is attached as `cause`.
     */
    parseToAST(content) {
        try {
            const tree = this.processor.parse(content);
            return this.convertMdastToMarkdownNodes(tree.children);
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : String(error);
            throw new Error(`Failed to parse markdown: ${reason}`, { cause: error });
        }
    }

    /**
     * Parse markdown file content into a MarkdownDocument-shaped object.
     *
     * @param {string} content - Full file text, including any frontmatter.
     * @param {string} filePath - Path used to derive `name`, `category` and `path`.
     * @returns {object} `{ name, category, path, content, metadata, lastModified, size }`
     *   where `content` is the body with frontmatter stripped.
     * @throws {Error} "Failed to parse document: <filePath>"; the underlying
     *   failure is preserved as `cause` instead of being discarded.
     */
    parseDocument(content, filePath) {
        try {
            // Extract frontmatter with gray-matter
            const { data: frontMatter, content: bodyContent } = matter(content);
            // Parse the markdown content
            const ast = this.parseToAST(bodyContent);
            // Extract metadata
            const metadata = this.extractMetadata(frontMatter, bodyContent, ast);
            // Create document info
            const documentInfo = {
                name: this.getFileNameFromPath(filePath),
                category: this.getCategoryFromPath(filePath),
                path: filePath,
            };
            return {
                ...documentInfo,
                content: bodyContent,
                metadata,
                // NOTE(review): this is the parse time, not the file's mtime.
                lastModified: new Date(),
                // NOTE(review): string length counts UTF-16 code units, not bytes.
                size: content.length,
            };
        }
        catch (error) {
            throw new Error(`Failed to parse document: ${filePath}`, { cause: error });
        }
    }

    /**
     * Validate markdown content.
     *
     * Never throws: a parse failure is reported as a single `syntax` error entry.
     *
     * @param {string} content - Full file text, including any frontmatter.
     * @returns {{isValid: boolean, errors: object[], warnings: object[], metadata: (object|undefined)}}
     *   `isValid` is true only when `errors` is empty; `metadata` is undefined
     *   when parsing failed before it could be extracted.
     */
    validate(content) {
        const errors = [];
        const warnings = [];
        let metadata;
        try {
            // Parse frontmatter
            const { data: frontMatter, content: bodyContent } = matter(content);
            // Try to parse the markdown
            const ast = this.parseToAST(bodyContent);
            metadata = this.extractMetadata(frontMatter, bodyContent, ast);
            // Validation checks
            this.validateStructure(ast, errors, warnings);
            this.validateContent(content, errors, warnings);
        }
        catch (error) {
            errors.push({
                type: 'syntax',
                message: `Parse error: ${error instanceof Error ? error.message : String(error)}`,
                severity: 'error',
            });
        }
        return {
            isValid: errors.length === 0,
            errors,
            warnings,
            metadata,
        };
    }

    /**
     * Convert remark MDAST nodes to our MarkdownNode format.
     *
     * @param {object[]} nodes - MDAST nodes.
     * @returns {object[]} Converted nodes; entries for unsupported types are removed.
     */
    convertMdastToMarkdownNodes(nodes) {
        return nodes.map(node => this.convertMdastNode(node)).filter(Boolean);
    }

    /**
     * Convert a single MDAST node to MarkdownNode.
     *
     * @param {object} node - MDAST node.
     * @returns {object|null} The converted node, or null for unsupported types.
     */
    convertMdastNode(node) {
        switch (node.type) {
            case 'heading': {
                const heading = node;
                return {
                    type: 'heading',
                    level: heading.depth,
                    content: this.extractTextContent(heading.children),
                    children: this.convertInlineNodes(heading.children),
                };
            }
            case 'paragraph': {
                const paragraph = node;
                return {
                    type: 'paragraph',
                    content: this.extractTextContent(paragraph.children),
                    children: this.convertInlineNodes(paragraph.children),
                };
            }
            case 'list': {
                const list = node;
                return {
                    type: 'list',
                    ordered: list.ordered ?? false,
                    children: this.convertMdastToMarkdownNodes(list.children),
                };
            }
            case 'listItem': {
                const listItem = node;
                return {
                    type: 'list_item',
                    // null (not a task item) is normalized to undefined
                    checked: listItem.checked ?? undefined,
                    children: this.convertMdastToMarkdownNodes(listItem.children),
                };
            }
            case 'code': {
                const code = node;
                return {
                    type: 'code',
                    content: code.value,
                    language: code.lang ?? undefined,
                };
            }
            case 'blockquote': {
                const blockquote = node;
                return {
                    type: 'quote',
                    children: this.convertMdastToMarkdownNodes(blockquote.children),
                };
            }
            case 'table': {
                const table = node;
                return {
                    type: 'table',
                    children: this.convertMdastToMarkdownNodes(table.children),
                };
            }
            case 'tableRow': {
                const tableRow = node;
                return {
                    type: 'table_row',
                    children: this.convertMdastToMarkdownNodes(tableRow.children),
                };
            }
            case 'tableCell': {
                const tableCell = node;
                return {
                    type: 'table_cell',
                    content: this.extractTextContent(tableCell.children),
                    children: this.convertInlineNodes(tableCell.children),
                };
            }
            case 'image': {
                const image = node;
                return {
                    type: 'image',
                    url: image.url,
                    alt: image.alt ?? undefined,
                    title: image.title ?? undefined,
                };
            }
            case 'thematicBreak': {
                return {
                    type: 'divider',
                };
            }
            case 'text': {
                const text = node;
                return {
                    type: 'text',
                    content: text.value,
                };
            }
            case 'break': {
                // Handle hard line breaks
                return {
                    type: 'text',
                    content: '\n',
                };
            }
            default:
                // Silently ignore unsupported node types
                return null;
        }
    }

    /**
     * Convert inline nodes (for rich text).
     *
     * Every inline node becomes a flat `{ type: 'text', content, ...flags }`
     * entry; nested formatting is flattened to the outermost mark.
     *
     * @param {object[]} nodes - Inline MDAST nodes.
     * @returns {object[]} One text entry per input node.
     */
    convertInlineNodes(nodes) {
        return nodes.map(node => {
            switch (node.type) {
                case 'text':
                    return { type: 'text', content: node.value };
                case 'break':
                    // Preserve hard line breaks
                    return { type: 'text', content: '\n' };
                case 'strong':
                    return {
                        type: 'text',
                        content: this.extractTextContent(node.children),
                        bold: true,
                    };
                case 'emphasis':
                    return {
                        type: 'text',
                        content: this.extractTextContent(node.children),
                        italic: true,
                    };
                case 'delete':
                    return {
                        type: 'text',
                        content: this.extractTextContent(node.children),
                        strikethrough: true,
                    };
                case 'inlineCode':
                    return {
                        type: 'text',
                        content: node.value,
                        code: true,
                    };
                case 'link':
                    return {
                        type: 'text',
                        content: this.extractTextContent(node.children),
                        link: {
                            url: node.url,
                            title: node.title ?? undefined,
                        },
                    };
                default: {
                    // Unknown inline node: prefer its literal value; otherwise fall back
                    // to the text of its children so parent nodes (e.g. linkReference)
                    // do not silently lose their visible text.
                    const nestedText = Array.isArray(node.children)
                        ? this.extractTextContent(node.children)
                        : '';
                    return { type: 'text', content: node.value ?? nestedText };
                }
            }
        });
    }

    /**
     * Extract plain text content from node children.
     *
     * Text and inline-code values are concatenated, hard breaks become "\n",
     * and any node with `children` is searched recursively. Other leaf
     * nodes contribute nothing.
     *
     * @param {object[]} children - MDAST child nodes.
     * @returns {string} Concatenated plain text.
     */
    extractTextContent(children) {
        return children
            .map(child => {
                // inlineCode is a literal node without children; without this branch its
                // text would vanish from heading/paragraph content, titles and anchors.
                if (child.type === 'text' || child.type === 'inlineCode') {
                    return child.value ?? '';
                }
                if (child.type === 'break') {
                    return '\n'; // Preserve hard line breaks
                }
                if (child.children) {
                    return this.extractTextContent(child.children);
                }
                return '';
            })
            .join('');
    }

    /**
     * Extract metadata from frontmatter and content.
     *
     * @param {object} frontMatter - Parsed frontmatter data.
     * @param {string} content - Markdown body (frontmatter removed).
     * @param {object[]} ast - Converted top-level nodes for `content`.
     * @returns {object} `{ title, description, tags, categories, author, date,
     *   lastModified, frontMatter, wordCount, headings }`. `title` falls back to
     *   the first top-level heading's text when the frontmatter has none.
     */
    extractMetadata(frontMatter, content, ast) {
        // Extract headings from AST (top-level nodes only)
        const headings = ast
            .filter(node => node.type === 'heading')
            .map(node => ({
                level: node.level ?? 1,
                text: node.content ?? '',
                anchor: (node.content ?? '').toLowerCase().replace(/[^a-z0-9]/g, '-'),
            }));
        // Calculate word count after stripping common markdown punctuation
        const wordCount = content
            .replace(/[#*`_[\]()]/g, '')
            .split(/\s+/)
            .filter(word => word.length > 0).length;

        const stringOrUndefined = (value) => (typeof value === 'string' ? value : undefined);
        const title = stringOrUndefined(frontMatter.title);
        const description = stringOrUndefined(frontMatter.description);
        const author = stringOrUndefined(frontMatter.author);

        // Unquoted YAML dates (e.g. `date: 2024-01-31`) reach us as Date objects
        // rather than strings; normalize those to ISO strings instead of dropping them.
        let date = stringOrUndefined(frontMatter.date);
        if (frontMatter.date instanceof Date && !Number.isNaN(frontMatter.date.getTime())) {
            date = frontMatter.date.toISOString();
        }

        let tags = [];
        if (Array.isArray(frontMatter.tags)) {
            tags = frontMatter.tags.filter((tag) => typeof tag === 'string');
        }
        else if (typeof frontMatter.tags === 'string') {
            tags = [frontMatter.tags];
        }

        let categories = [];
        if (Array.isArray(frontMatter.categories)) {
            categories = frontMatter.categories.filter((cat) => typeof cat === 'string');
        }
        else if (typeof frontMatter.category === 'string') {
            categories = [frontMatter.category];
        }
        else if (typeof frontMatter.categories === 'string') {
            // Accept a single string for `categories`, mirroring how `tags` is handled.
            categories = [frontMatter.categories];
        }

        return {
            title: title ?? headings[0]?.text,
            description,
            tags,
            categories,
            author,
            date,
            lastModified: new Date().toISOString(),
            frontMatter,
            wordCount,
            headings,
        };
    }

    /**
     * Validate document structure.
     *
     * Inspects top-level heading nodes only and appends to `warnings` for:
     * multiple H1s, skipped heading levels, and a missing H1.
     *
     * @param {object[]} ast - Converted top-level nodes.
     * @param {object[]} errors - Error sink (not written to by this check).
     * @param {object[]} warnings - Warning sink, mutated in place.
     */
    validateStructure(ast, errors, warnings) {
        let hasH1 = false;
        let lastHeadingLevel = 0;
        for (const node of ast) {
            if (node.type !== 'heading') {
                continue;
            }
            if (node.level === 1) {
                if (hasH1) {
                    warnings.push({
                        type: 'structure',
                        message: 'Multiple H1 headings found. Consider using only one H1 per document.',
                        suggestion: 'Use H2-H6 for subsequent sections',
                    });
                }
                hasH1 = true;
            }
            // Check heading hierarchy
            if (lastHeadingLevel > 0 &&
                node.level !== null &&
                node.level !== undefined &&
                node.level > lastHeadingLevel + 1) {
                warnings.push({
                    type: 'structure',
                    message: `Heading level skipped: H${lastHeadingLevel} followed by H${node.level}`,
                    suggestion: 'Use sequential heading levels for better document structure',
                });
            }
            lastHeadingLevel = node.level ?? 1;
        }
        if (!hasH1) {
            warnings.push({
                type: 'structure',
                message: 'No H1 heading found',
                suggestion: 'Consider adding a main title with # at the beginning',
            });
        }
    }

    /**
     * Validate content quality.
     *
     * @param {string} content - Full file text.
     * @param {object[]} errors - Error sink, mutated in place.
     * @param {object[]} warnings - Warning sink, mutated in place.
     */
    validateContent(content, errors, warnings) {
        // Check for very short content
        if (content.trim().length < 50) {
            warnings.push({
                type: 'content',
                message: 'Document appears to be very short',
                suggestion: 'Consider adding more detailed content',
            });
        }
        // Check for broken links (basic patterns): "[text]()" with an empty target.
        // No `g` flag: a single `.test` needs no stateful `lastIndex`.
        const brokenLinkPattern = /\[([^\]]*)\]\(\s*\)/;
        if (brokenLinkPattern.test(content)) {
            // NOTE(review): recorded in `errors` (so it makes the document invalid)
            // even though its severity is 'warning' — confirm this is intended.
            errors.push({
                type: 'content',
                message: 'Empty link URLs detected',
                severity: 'warning',
            });
        }
    }

    /**
     * Get filename from path.
     *
     * @param {string} filePath - Path using "/" or "\" separators.
     * @returns {string} Last path segment without a trailing ".md", or
     *   'untitled' when that segment is empty (empty path, trailing separator,
     *   or a bare ".md").
     */
    getFileNameFromPath(filePath) {
        const lastSegment = filePath.split(/[\\/]/).pop() ?? '';
        // `||` on purpose: split/pop yields '' (never undefined) for empty segments.
        return lastSegment.replace(/\.md$/, '') || 'untitled';
    }

    /**
     * Get category from path.
     *
     * @param {string} filePath - Path using "/" or "\" separators.
     * @returns {string} Name of the immediate parent directory, or 'general'
     *   when there is none (bare filename, or a file directly under the root).
     */
    getCategoryFromPath(filePath) {
        const parts = filePath.split(/[\\/]/);
        const parent = parts.length > 1 ? parts[parts.length - 2] : '';
        return parent || 'general';
    }
}
//# sourceMappingURL=markdown-parser.js.map