UNPKG

apple-dev-mcp

Version:

Complete Apple development guidance: Human Interface Guidelines (design) + Technical Documentation for iOS, macOS, watchOS, tvOS, and visionOS

476 lines 19.8 kB
/**
 * Enhanced Content Processor Service
 *
 * Handles enhanced content extraction and processing using Turndown
 * for clean HTML-to-Markdown conversion and structured content organization.
 */
import TurndownService from 'turndown';
import MarkdownIt from 'markdown-it';
export class ContentProcessorService {
    /** Turndown instance used for the HTML-to-Markdown conversion. */
    turndown;
    /** MarkdownIt instance created in the constructor. */
    markdown;
    /** Effective configuration: defaults overlaid with the constructor argument. */
    config;
    /** Stop words excluded from keyword extraction. */
    commonWords = new Set([
        'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by',
        'a', 'an', 'is', 'are', 'was', 'were', 'be', 'been', 'have', 'has', 'had',
        'do', 'does', 'did', 'will', 'would', 'should', 'could', 'can', 'may', 'might',
        'must', 'shall'
    ]);
    /**
     * @param {object} [config] - Optional overrides for `removeImages`,
     *   `preserveCodeBlocks`, `cleanNavigationElements` and `extractStructuredData`
     *   (all default to `true`).
     */
    constructor(config = {}) {
        this.config = {
            removeImages: true,
            preserveCodeBlocks: true,
            cleanNavigationElements: true,
            extractStructuredData: true,
            ...config
        };
        // Configure Turndown for high-quality conversion
        this.turndown = new TurndownService({
            headingStyle: 'atx', // Use # headers
            bulletListMarker: '-', // Use - for lists
            codeBlockStyle: 'fenced', // Use ``` code blocks
            fence: '```', // Code fence marker
            emDelimiter: '*', // Use * for emphasis
            strongDelimiter: '**', // Use ** for strong
            linkStyle: 'inlined', // Inline links
            linkReferenceStyle: 'full' // Full reference links
        });
        // Configure Turndown rules
        this.configureTurndownRules();
        // Initialize markdown parser for processing
        this.markdown = new MarkdownIt({
            html: false, // Don't allow HTML tags
            xhtmlOut: false, // Don't use XHTML output
            breaks: false, // Don't convert line breaks to <br>
            linkify: true, // Auto-convert URL-like text to links
            typographer: true // Enable smart quotes and other typography
        });
    }
    /**
     * Legacy interface method - kept for compatibility.
     *
     * @param {{content?: string, title?: string, url?: string}} section
     * @returns {Promise<string>} The cleaned markdown for the section.
     * @throws {Error} If the section has no content, or if processing fails.
     */
    async process(section) {
        if (!section.content) {
            throw new Error(`No content available for section: ${section.title}`);
        }
        const result = await this.processContent(section.content, section.url);
        return result.cleanedMarkdown;
    }
    /**
     * Enhanced content processing method that returns full structured result.
     *
     * @param {string} html - Raw HTML to convert.
     * @param {string} [url] - Passed through to structured-content extraction.
     * @returns {Promise<object>} `{ cleanedMarkdown, structuredContent, quality, processingMetrics }`.
     * @throws {Error} "Content processing failed: ..." wrapping any failure; the
     *   original error is attached as `cause`.
     */
    async processContent(html, url) {
        const startTime = Date.now();
        try {
            // Step 0: Check for JavaScript error pages
            if (this.isJavaScriptErrorPage(html)) {
                throw new Error('JavaScript error page detected - content requires browser JavaScript execution');
            }
            // Step 1: Clean the HTML
            const cleanedHtml = this.cleanHtml(html);
            // Step 2: Convert to markdown
            const rawMarkdown = this.turndown.turndown(cleanedHtml);
            // Step 3: Clean and normalize markdown
            const cleanedMarkdown = this.cleanMarkdown(rawMarkdown);
            // Step 4: Extract structured content
            const structuredContent = this.extractStructuredContent(cleanedMarkdown, url);
            // Step 5: Calculate quality metrics
            const quality = this.calculateQualityMetrics(cleanedMarkdown, structuredContent);
            const processingTime = Date.now() - startTime;
            return {
                cleanedMarkdown,
                structuredContent,
                quality,
                processingMetrics: {
                    extractionTime: processingTime,
                    contentLength: cleanedMarkdown.length,
                    structureScore: this.calculateStructureScore(structuredContent),
                    cleaningScore: this.calculateCleaningScore(html, cleanedMarkdown)
                }
            };
        }
        catch (error) {
            const reason = error instanceof Error ? error.message : 'Unknown error';
            // Keep the original error (and its stack) reachable through `cause`.
            throw new Error(`Content processing failed: ${reason}`, { cause: error });
        }
    }
    /**
     * Build a single-line plain-text snippet from markdown content.
     *
     * @param {string} content - Markdown text.
     * @param {number} [maxLength=200] - Maximum snippet length before truncation.
     * @returns {string} The snippet, suffixed with '...' when truncated.
     */
    extractSnippet(content, maxLength = 200) {
        const cleaned = content
            // Strip heading markers at line starts only, so text such as "C#" survives.
            .replace(/^[ \t]{0,3}#+\s*/gm, '')
            // Replace newlines with spaces
            .replace(/\n+/g, ' ')
            .trim();
        return cleaned.length > maxLength
            ? cleaned.substring(0, maxLength) + '...'
            : cleaned;
    }
    /**
     * Collect keywords from content plus the section's platform, category and title.
     *
     * @param {string} content - Text to scan.
     * @param {{platform?: string, category?: string, title?: string}} section
     * @returns {string[]} Unique keywords: the first 20 distinct non-stop-words of
     *   three or more characters (in order of appearance), followed by the section
     *   fields that are present.
     */
    extractKeywords(content, section) {
        const words = content.toLowerCase().match(/\b\w{3,}\b/g) || [];
        const keywords = [...new Set(words)]
            .filter(word => !this.commonWords.has(word))
            .slice(0, 20);
        // Missing section fields are skipped instead of throwing or adding `undefined`.
        const sectionKeywords = [
            section?.platform?.toLowerCase(),
            section?.category,
            section?.title?.toLowerCase()
        ].filter(Boolean);
        return [...new Set([...keywords, ...sectionKeywords])];
    }
    /**
     * Configure Turndown rules for Apple HIG content
     */
    configureTurndownRules() {
        // Remove images if configured
        if (this.config.removeImages) {
            this.turndown.addRule('removeImages', {
                filter: 'img',
                replacement: () => ''
            });
        }
        // Remove navigation elements
        if (this.config.cleanNavigationElements) {
            // Common navigation patterns; built once rather than per visited node.
            const navSelectors = [
                'nav', '.nav', '#nav', '.navigation', '.breadcrumb',
                '.skip-navigation', '.skip-link', '.page-navigation',
                '.toc', '.sidebar', '.menu'
            ];
            this.turndown.addRule('removeNavigation', {
                filter: (node) => {
                    if (node.nodeType !== 1) {
                        return false; // Only element nodes
                    }
                    return navSelectors.some(selector => {
                        try {
                            return node.matches?.(selector) ||
                                node.classList?.contains(selector.replace('.', '')) ||
                                node.id === selector.replace('#', '');
                        }
                        catch {
                            // A node that cannot be tested is treated as non-navigation.
                            return false;
                        }
                    });
                },
                replacement: () => ''
            });
        }
        // Clean up code blocks
        if (this.config.preserveCodeBlocks) {
            this.turndown.addRule('preserveCode', {
                filter: ['pre', 'code'],
                replacement: (content, node) => {
                    if (node.nodeName === 'PRE') {
                        // Use the raw text so a nested <code> is not emitted as
                        // inline code inside the fence.
                        const code = (node.textContent ?? content).replace(/\n$/, '');
                        return `\n\`\`\`\n${code}\n\`\`\`\n`;
                    }
                    if (node.parentNode?.nodeName === 'PRE') {
                        // The enclosing <pre> produces the fence; add no backticks here.
                        return content;
                    }
                    return `\`${content}\``;
                }
            });
        }
        // Remove footer and header content
        this.turndown.addRule('removeHeaderFooter', {
            filter: (node) => {
                if (node.nodeType !== 1) {
                    return false;
                }
                const tagName = node.tagName?.toLowerCase();
                return tagName === 'header' || tagName === 'footer';
            },
            replacement: () => ''
        });
    }
    /**
     * Clean HTML before conversion by removing script/style blocks, buttons,
     * inputs, forms, comments, and meta/link tags.
     *
     * @param {string} html
     * @returns {string} The HTML with the unwanted elements removed.
     */
    cleanHtml(html) {
        const unwantedPatterns = [
            /<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi,
            /<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi,
            // [\s\S] (not .) so elements spanning several lines are removed too;
            // the lookahead stops e.g. <button-group> being treated as <button>.
            /<button(?=[\s/>])[^>]*>[\s\S]*?<\/button>/gi,
            /<input(?=[\s/>])[^>]*>/gi,
            /<form(?=[\s/>])[^>]*>[\s\S]*?<\/form>/gi,
            /<!--[\s\S]*?-->/g, // Comments
            /<meta(?=[\s/>])[^>]*>/gi,
            /<link(?=[\s/>])[^>]*>/gi
        ];
        return unwantedPatterns.reduce((cleaned, pattern) => cleaned.replace(pattern, ''), html);
    }
    /**
     * Check if HTML content is a JavaScript error page
     *
     * @param {string} html
     * @returns {boolean} True when an explicit "enable JavaScript" message is
     *   present, or when the page has almost no readable text next to a
     *   noscript block or a style block.
     */
    isJavaScriptErrorPage(html) {
        const lowerHtml = html.toLowerCase();
        // Readable text only: drop <style> blocks before stripping tags, otherwise
        // the CSS source itself counts as "text" and hides near-empty pages.
        const plainText = html
            .replace(/<style\b[^>]*>[\s\S]*?<\/style\s*>/gi, '')
            .replace(/<[^>]*>/g, '')
            .trim();
        // Primary indicators: explicit JavaScript error messages
        const criticalErrorIndicators = [
            'please turn on javascript in your browser and refresh the page',
            'this page requires javascript',
            'javascript is required to view this content',
            'enable javascript and refresh'
        ];
        const hasCriticalError = criticalErrorIndicators.some(indicator => lowerHtml.includes(indicator));
        // Secondary check: Very minimal content that's mostly CSS/style with noscript
        const hasNoscriptOnly = lowerHtml.includes('<noscript>') &&
            lowerHtml.includes('class="noscript"') &&
            plainText.length < 150;
        // Tertiary check: large HTML containing a style block but tiny text content
        const isMostlyCSS = /<style[\s>]/i.test(html) &&
            plainText.length < 100 &&
            html.length > 1000;
        return hasCriticalError || hasNoscriptOnly || isMostlyCSS;
    }
    /**
     * Clean and normalize markdown content
     *
     * Artifacts are removed first and whitespace is normalized afterwards, so
     * that removing an artifact cannot leave blank lines or edge whitespace.
     *
     * @param {string} markdown
     * @returns {string} Markdown without blank lines, "Skip Navigation" text,
     *   empty links or empty list items, and with runs of spaces collapsed.
     */
    cleanMarkdown(markdown) {
        return markdown
            // Remove "Skip Navigation" artifacts
            .replace(/Skip Navigation\s*/gi, '')
            // Remove empty links
            .replace(/\[(\s*)\]\(\s*\)/g, '')
            // Remove empty list items
            .replace(/^\s*-\s*$/gm, '')
            // Remove excessive whitespace
            .replace(/\n{3,}/g, '\n\n')
            // Drop blank / whitespace-only lines
            .replace(/^[\s]*\n/gm, '')
            // Fix multiple spaces
            .replace(/ +/g, ' ')
            // Remove leading/trailing whitespace
            .trim();
    }
    /**
     * Extract structured content from cleaned markdown
     *
     * Lines are attributed to the most recent recognised section header
     * (overview / guidelines / examples / related); other headers are skipped
     * and do not change the current section. Text before the first recognised
     * header counts as overview. Lines under a "related" header are not collected.
     *
     * @param {string} markdown
     * @param {string} [_url] - Unused.
     * @returns {{overview: string, guidelines: string[], examples: string[],
     *   relatedConcepts: string[], specifications: (object|undefined)}}
     */
    extractStructuredContent(markdown, _url) {
        const lines = markdown.split('\n');
        const overviewLines = [];
        const guidelines = [];
        const examples = [];
        const relatedConcepts = [];
        // \b keeps a keyword from matching as a mere prefix ("Documentation" is not "do").
        const sectionHeaders = [
            ['overview', /^#+\s*(?:overview|summary|what|description)\b/i],
            ['guidelines', /^#+\s*(?:guidelines?|best practices?|do|don't|recommendations?)\b/i],
            ['examples', /^#+\s*(?:examples?|usage|use cases?|for example)\b/i],
            ['related', /^#+\s*(?:related|see also|links)\b/i]
        ];
        // A list marker must be followed by whitespace, so "**bold**" and "2.5 pt"
        // are not mistaken for list items.
        const listMarker = /^(?:[-*+]|\d+\.)\s+/;
        const horizontalRule = /^[-*_]{3,}$/;
        const itemsBySection = { guidelines, examples };
        let currentSection = 'overview';
        for (const line of lines) {
            const trimmed = line.trim();
            if (!trimmed) {
                continue;
            }
            const header = sectionHeaders.find(([, pattern]) => pattern.test(trimmed));
            if (header) {
                currentSection = header[0];
                continue;
            }
            if (trimmed.startsWith('#')) {
                continue; // Unrecognised header: not content.
            }
            if (currentSection === 'overview') {
                // Collected separately so a later section header cannot discard it.
                overviewLines.push(line);
                continue;
            }
            const target = itemsBySection[currentSection];
            if (!target || horizontalRule.test(trimmed)) {
                continue;
            }
            const item = trimmed.replace(listMarker, '');
            if (item) {
                target.push(item);
            }
        }
        let overview = overviewLines.join('\n').trim();
        // If no structured overview found, use first paragraph
        if (!overview) {
            const firstParagraph = lines
                .map(line => line.trim())
                .find(text => text.length > 50 && !/^[#\-*]/.test(text));
            overview = firstParagraph || 'No overview available';
        }
        // Extract related concepts from content
        this.extractRelatedConcepts(markdown, relatedConcepts);
        return {
            overview,
            guidelines: guidelines.length ? guidelines : ['No specific guidelines identified'],
            examples: examples.length ? examples : ['No examples provided'],
            relatedConcepts,
            specifications: this.extractSpecifications(markdown)
        };
    }
    /**
     * Extract related concepts and cross-references
     *
     * @param {string} markdown - Text to scan for component/topic mentions.
     * @param {string[]} relatedConcepts - Output array; lower-cased matches not
     *   already present are appended in place.
     */
    extractRelatedConcepts(markdown, relatedConcepts) {
        // Look for Apple HIG component mentions
        const componentPatterns = [
            /\b(buttons?|navigation bars?|tab bars?|toolbars?|alerts?|action sheets?)\b/gi,
            /\b(pickers?|text fields?|switches?|sliders?|steppers?)\b/gi,
            /\b(color|typography|layout|spacing|accessibility)\b/gi
        ];
        const seen = new Set(relatedConcepts);
        for (const pattern of componentPatterns) {
            for (const match of markdown.match(pattern) ?? []) {
                const normalized = match.toLowerCase().trim();
                if (!seen.has(normalized)) {
                    seen.add(normalized);
                    relatedConcepts.push(normalized);
                }
            }
        }
    }
    /**
     * Extract technical specifications from content
     *
     * Looks for "key: value" pairs where the value runs to the next '.' or line
     * end. When a key occurs more than once, the last occurrence wins.
     *
     * @param {string} markdown
     * @returns {{dimensions?: {width?: string, height?: string},
     *   spacing?: {padding?: string, margin?: string}}|undefined}
     *   `undefined` when nothing matched.
     */
    extractSpecifications(markdown) {
        const specs = {};
        const collect = (pattern, group, recordedKeys) => {
            const matches = [...markdown.matchAll(pattern)];
            if (matches.length === 0) {
                return;
            }
            specs[group] = {};
            for (const [, key, value] of matches) {
                // Attribute the value to the key that was actually matched, so
                // "height: same as width" does not also set `width`.
                const name = key.toLowerCase();
                if (recordedKeys.includes(name)) {
                    specs[group][name] = value.trim();
                }
            }
        };
        // Look for dimension specifications
        collect(/(width|height|size):\s*([^.\n]+)/gi, 'dimensions', ['width', 'height']);
        // Look for spacing specifications
        collect(/(padding|margin|spacing):\s*([^.\n]+)/gi, 'spacing', ['padding', 'margin']);
        return Object.keys(specs).length > 0 ? specs : undefined;
    }
    /**
     * Calculate content quality metrics
     *
     * @param {string} markdown - Cleaned markdown.
     * @param {object} structured - Result of `extractStructuredContent`.
     * @returns {object} Metrics object; `score` is the mean of the structure
     *   score and the word-count richness (both in 0..1).
     */
    calculateQualityMetrics(markdown, structured) {
        const length = markdown.length;
        const hasOverview = structured.overview.length > 50;
        const hasGuidelines = structured.guidelines.length > 0 &&
            structured.guidelines[0] !== 'No specific guidelines identified';
        const hasExamples = structured.examples.length > 0 &&
            structured.examples[0] !== 'No examples provided';
        const hasRelated = structured.relatedConcepts.length > 0;
        // Calculate structure score
        let structureScore = 0.2; // Base score
        if (hasOverview) {
            structureScore += 0.3;
        }
        if (hasGuidelines) {
            structureScore += 0.2;
        }
        if (hasExamples) {
            structureScore += 0.2;
        }
        if (hasRelated) {
            structureScore += 0.1;
        }
        // Calculate content richness; filter(Boolean) so empty text counts as 0 words.
        const wordCount = markdown.split(/\s+/).filter(Boolean).length;
        const contentRichness = Math.min(wordCount / 500, 1); // Normalize to 500 words max
        const score = (structureScore + contentRichness) / 2;
        return {
            score,
            length,
            structureScore,
            appleTermsScore: this.calculateAppleTermsScore(markdown),
            // Fences come in pairs; floor so an unclosed fence does not yield x.5.
            codeExamplesCount: Math.floor((markdown.match(/```/g) || []).length / 2),
            imageReferencesCount: 0, // We remove images
            headingCount: (markdown.match(/^#+/gm) || []).length,
            isFallbackContent: false,
            extractionMethod: 'enhanced-turndown',
            confidence: Math.min(score + 0.2, 1) // Boost confidence for structured content
        };
    }
    /**
     * Calculate Apple-specific terms score
     *
     * @param {string} content
     * @returns {number} Number of whole-word, case-insensitive term occurrences
     *   divided by 10, capped at 1.
     */
    calculateAppleTermsScore(content) {
        const appleTerms = [
            'iOS', 'macOS', 'watchOS', 'tvOS', 'visionOS',
            'SwiftUI', 'UIKit', 'AppKit',
            'Human Interface Guidelines', 'HIG',
            'accessibility', 'VoiceOver',
            'design system', 'interface'
        ];
        let termCount = 0;
        for (const term of appleTerms) {
            const escaped = term.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
            // Whole-word match (optional plural "s") so "high" is not counted as "HIG".
            const pattern = new RegExp(`\\b${escaped}s?\\b`, 'gi');
            termCount += (content.match(pattern) || []).length;
        }
        return Math.min(termCount / 10, 1); // Normalize to max 10 terms
    }
    /**
     * Calculate structure score for processed content
     *
     * @param {object} structured - Result of `extractStructuredContent`.
     * @returns {number} Sum of 0.4 (overview longer than 50 chars), 0.3 (real
     *   guidelines), 0.2 (real examples) and 0.1 (any related concepts).
     */
    calculateStructureScore(structured) {
        let score = 0;
        if (structured.overview.length > 50) {
            score += 0.4;
        }
        if (structured.guidelines.length > 0 &&
            structured.guidelines[0] !== 'No specific guidelines identified') {
            score += 0.3;
        }
        if (structured.examples.length > 0 &&
            structured.examples[0] !== 'No examples provided') {
            score += 0.2;
        }
        if (structured.relatedConcepts.length > 0) {
            score += 0.1;
        }
        return score;
    }
    /**
     * Calculate cleaning score (how much improvement was made)
     *
     * @param {string} originalHtml
     * @param {string} cleanedMarkdown
     * @returns {number} A value in 0..1 combining the relative size reduction
     *   (weight 0.7) with 0.1 per artifact occurrence removed.
     */
    calculateCleaningScore(originalHtml, cleanedMarkdown) {
        const originalLength = originalHtml.length;
        const cleanedLength = cleanedMarkdown.length;
        // Calculate reduction in content (higher = more cleaning).
        // Empty input has nothing to reduce; this also avoids 0/0 = NaN.
        const reduction = originalLength > 0
            ? (originalLength - cleanedLength) / originalLength
            : 0;
        // Look for artifacts that were removed
        const artifactPatterns = [
            /Skip Navigation/gi,
            /<script/gi,
            /<style/gi,
            /<nav/gi,
            /\[(\s*)\]/g
        ];
        let artifactsRemoved = 0;
        for (const pattern of artifactPatterns) {
            const originalCount = (originalHtml.match(pattern) || []).length;
            const cleanedCount = (cleanedMarkdown.match(pattern) || []).length;
            artifactsRemoved += Math.max(0, originalCount - cleanedCount);
        }
        const rawScore = (reduction * 0.7) + (artifactsRemoved * 0.1);
        // Clamp: markdown longer than its source would otherwise score below 0.
        return Math.min(Math.max(rawScore, 0), 1);
    }
}
//# sourceMappingURL=content-processor.service.js.map