// apple-dev-mcp
// Version:
// Complete Apple development guidance: Human Interface Guidelines (design) + Technical Documentation for iOS, macOS, watchOS, tvOS, and visionOS
// 409 lines • 20.4 kB
// JavaScript
/**
* Content Processor Service
*
* Handles content extraction and processing using Turndown
* for clean HTML-to-Markdown conversion and structured content organization.
*/
import TurndownService from 'turndown';
/**
 * Converts scraped Apple documentation HTML into cleaned Markdown and derives
 * metadata from it: keywords, related sections, quality metrics and a YAML
 * front-matter header.
 *
 * The pipeline is: `cleanHtml` -> Turndown -> `cleanMarkdown` -> metadata.
 */
export class ContentProcessorService {
    /** Turndown converter; created and configured in the constructor. */
    turndown;
    /** Lower-case design vocabulary used for keyword extraction and scoring. */
    appleDesignTerms = [
        'accessibility', 'animation', 'branding', 'buttons', 'color', 'controls',
        'design', 'feedback', 'gestures', 'haptics', 'icons', 'images', 'input',
        'interface', 'layout', 'materials', 'motion', 'navigation', 'presentation',
        'selection', 'status', 'system', 'typography', 'visual', 'widgets',
        'human interface guidelines', 'user experience', 'user interface',
        'touch target', 'dynamic type', 'voiceover', 'dark mode', 'light mode'
    ];
    /** Lower-case phrases that mark a page as placeholder ("fallback") content. */
    fallbackIndicators = [
        'this page requires javascript',
        'please turn on javascript',
        'javascript is required',
        'single page application',
        'content not available',
        'loading...',
        'page not found',
        'skip navigation',
        'refresh the page to view'
    ];
    /** Lower-case phrases treated as leftovers of Apple's single-page-app chrome. */
    appleSPAIndicators = [
        'skip navigation',
        'current page is',
        'supported platforms',
        'change log',
        'platform considerations',
        'additional considerations for',
        'no additional considerations for'
    ];
    /**
     * Names whose internal lower-case -> upper-case transition is part of the
     * spelling and therefore must not be split by `fixWordConcatenation`.
     */
    protectedCamelCaseTerms = [
        'iOS', 'iPadOS', 'macOS', 'watchOS', 'tvOS', 'visionOS',
        'iPhone', 'iPad', 'iCloud', 'VoiceOver', 'SwiftUI', 'AppKit',
        'CarPlay', 'AirPlay', 'AirDrop', 'AirPods', 'FaceTime', 'HomeKit',
        'MacBook', 'TestFlight'
    ];
    /** Creates the Turndown converter and registers the custom rules. */
    constructor() {
        this.turndown = new TurndownService({
            headingStyle: 'atx', // Use # headers
            bulletListMarker: '-', // Use - for lists
            codeBlockStyle: 'fenced', // Use ``` code blocks
            fence: '```', // Code fence marker
            emDelimiter: '*', // Use * for emphasis
            strongDelimiter: '**', // Use ** for strong
            linkStyle: 'inlined', // Inline links
            linkReferenceStyle: 'full' // Full reference links
        });
        this.configureTurndownRules();
    }
    /**
     * Registers the custom Turndown rules. Registration order is kept stable
     * because Turndown resolves overlapping filters by rule order.
     *
     * NOTE(review): Turndown appears to give precedence to rules added later,
     * which would let `blockElementSpacing` win over `removeByClass` for
     * div/section/article/aside/main elements - confirm against the Turndown
     * rule-precedence documentation before relying on `removeByClass` there.
     */
    configureTurndownRules() {
        // Images are dropped entirely to keep the output compact.
        this.turndown.addRule('removeImages', {
            filter: 'img',
            replacement: () => ''
        });
        // Page chrome is replaced by a blank line so neighbouring words do not fuse.
        this.turndown.addRule('removeNavigation', {
            filter: ['nav', 'footer', 'header'],
            replacement: () => '\n\n'
        });
        // Elements whose class mentions navigation/breadcrumb become a line break.
        this.turndown.addRule('removeByClass', {
            filter: (node) => {
                if (node.nodeType === 1) { // Element node
                    const className = node.className;
                    return typeof className === 'string' &&
                        (className.includes('navigation') || className.includes('breadcrumb'));
                }
                return false;
            },
            replacement: () => '\n'
        });
        // Generic containers: empty ones vanish, content-like ones get blank lines around them.
        this.turndown.addRule('blockElementSpacing', {
            filter: ['div', 'section', 'article', 'aside', 'main'],
            replacement: (content, node) => {
                const trimmedContent = content.trim();
                if (!trimmedContent) {
                    return '';
                }
                const element = node;
                const hasContentClass = element.className && (element.className.includes('content') ||
                    element.className.includes('section') ||
                    element.className.includes('main'));
                return hasContentClass ? `\n\n${trimmedContent}\n\n` : trimmedContent;
            }
        });
        // <code> elements are wrapped in single backticks.
        this.turndown.addRule('preserveCode', {
            filter: 'code',
            replacement: (content) => `\`${content}\``
        });
        // NOTE(review): this filter targets text nodes (nodeType 3); Turndown
        // seems to run rules on element nodes only, so the rule may never
        // fire - confirm before depending on it.
        this.turndown.addRule('smartWhitespace', {
            filter: (node) => node.nodeType === 3,
            replacement: (content) => {
                return content
                    .replace(/[ \t]+/g, ' ') // Collapse horizontal whitespace
                    .replace(/\n[ \t]+/g, '\n') // Clean up indented lines
                    .replace(/[ \t]+\n/g, '\n'); // Clean up trailing spaces
            }
        });
        // Headings are emitted as ATX headers surrounded by blank lines.
        this.turndown.addRule('headerSpacing', {
            filter: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'],
            replacement: (content, node) => {
                // nodeName is "H1".."H6"; the digit is the heading level.
                const level = Number.parseInt(node.nodeName.charAt(1), 10);
                const trimmedContent = content.trim();
                if (!trimmedContent) {
                    return '';
                }
                return `\n\n${'#'.repeat(level)} ${trimmedContent}\n\n`;
            }
        });
    }
    /**
     * Process HTML content into clean markdown with metadata.
     *
     * @param {string} html - Raw HTML of one documentation page.
     * @param {object} section - Section descriptor; `title`, `platform`,
     *   `category` and `url` are read from it.
     * @returns {Promise<object>} Resolves to `{ cleanedMarkdown, frontMatter,
     *   quality, keywords, relatedSections }`.
     */
    async processContent(html, section) {
        const cleanedHtml = this.cleanHtml(html);
        const rawMarkdown = this.turndown.turndown(cleanedHtml);
        const cleanedMarkdown = this.cleanMarkdown(rawMarkdown);
        const keywords = this.extractKeywords(cleanedMarkdown, section);
        const relatedSections = this.extractRelatedSections(cleanedMarkdown, section);
        // The raw markdown is passed too so fallback phrases that the cleanup
        // removed can still be detected.
        const quality = this.calculateQualityMetrics(cleanedMarkdown, rawMarkdown);
        const frontMatter = this.generateFrontMatter(section, quality, keywords);
        return {
            cleanedMarkdown,
            frontMatter,
            quality,
            keywords,
            relatedSections
        };
    }
    /**
     * Strips scripts, styles and page chrome from raw HTML and collapses all
     * whitespace runs to single spaces.
     *
     * @param {string} html - Raw HTML.
     * @returns {string} Pre-cleaned HTML on a single line.
     */
    cleanHtml(html) {
        let cleaned = html;
        cleaned = cleaned.replace(/<script\b[^<]*(?:(?!<\/script>)<[^<]*)*<\/script>/gi, ' ');
        cleaned = cleaned.replace(/<style\b[^<]*(?:(?!<\/style>)<[^<]*)*<\/style>/gi, ' ');
        // [\s\S] instead of "." so elements spanning several lines are removed
        // as well; a space is left behind to prevent word concatenation.
        cleaned = cleaned.replace(/<nav\b[^>]*>[\s\S]*?<\/nav>/gi, ' ');
        cleaned = cleaned.replace(/<footer\b[^>]*>[\s\S]*?<\/footer>/gi, ' ');
        cleaned = cleaned.replace(/<header\b[^>]*>[\s\S]*?<\/header>/gi, ' ');
        // NOTE(review): these two replacements drop only the class attribute;
        // the element and its text stay, and the `removeByClass` Turndown rule
        // can no longer see the class - confirm whether the whole element was
        // meant to be removed.
        cleaned = cleaned.replace(/class="[^"]*breadcrumb[^"]*"/gi, '');
        cleaned = cleaned.replace(/class="[^"]*navigation[^"]*"/gi, '');
        // Collapses every whitespace run, including line breaks inside <pre>.
        cleaned = cleaned.replace(/\s+/g, ' ');
        return cleaned;
    }
    /**
     * Normalises Markdown produced by Turndown: removes JavaScript-fallback
     * boilerplate and SPA chrome, de-duplicates titles, normalises spacing,
     * headers and list markers, and repairs fused words.
     *
     * @param {string} markdown - Raw Markdown.
     * @returns {string} Cleaned, trimmed Markdown.
     */
    cleanMarkdown(markdown) {
        let cleaned = markdown;
        // JavaScript fallback boilerplate, from the most to the least specific form.
        cleaned = cleaned.replace(/^.*?this page requires javascript.*?refresh the page to view its content\.?\s*/is, '');
        cleaned = cleaned.replace(/^#\s*this page requires javascript\.?\s*please turn on javascript.*?\s*/is, '');
        cleaned = cleaned.replace(/^#\s*this page requires javascript\.?\s*/gim, '');
        cleaned = cleaned.replace(/please turn on javascript.*?content\.\s*/gim, '');
        cleaned = cleaned.replace(/skip navigation\s*/gi, '');
        cleaned = this.removeAppleSPAMetadata(cleaned);
        cleaned = this.removeRepeatedTitles(cleaned);
        // Spacing clean-up while preserving word boundaries.
        cleaned = cleaned.replace(/\n{3,}/g, '\n\n'); // Limit to double line breaks
        cleaned = cleaned.replace(/[ \t]+$/gm, ''); // Remove trailing spaces
        cleaned = cleaned.replace(/[ \t]+/g, ' '); // Collapse multiple spaces to single space
        // Blank line before and after each header line. Both patterns work on
        // whole lines: matching "#" anywhere would split the "#" run of deeper
        // headers and cut the last character off the header text.
        cleaned = cleaned.replace(/([^\n])\n(#{1,6}[ \t])/g, '$1\n\n$2');
        cleaned = cleaned.replace(/^(#{1,6}[ \t][^\n]*)\n(?!\n)/gm, '$1\n\n');
        // Links without a destination keep only their text.
        cleaned = cleaned.replace(/\[([^\]]*)\]\(\)/g, '$1');
        // Remove empty headers
        cleaned = cleaned.replace(/^#+\s*$/gm, '');
        // Normalise "*" / "+" bullets to "-". Only horizontal whitespace is
        // consumed so the blank line in front of a list survives.
        cleaned = cleaned.replace(/^[ \t]*[*+][ \t]/gm, '- ');
        cleaned = this.fixWordConcatenation(cleaned);
        cleaned = this.removeTrailingMetadata(cleaned);
        return cleaned.trim();
    }
    /**
     * Removes metadata fragments that Apple's single-page app leaves in the text.
     *
     * @param {string} content - Markdown text.
     * @returns {string} Text without those fragments.
     */
    removeAppleSPAMetadata(content) {
        let cleaned = content;
        // "Platform considerations" followed directly by "No additional considerations for ...".
        cleaned = cleaned.replace(/platform considerations\s*no additional considerations for.*?\./gi, '');
        // "Current page is X" indicators.
        cleaned = cleaned.replace(/current page is \w+\s*/gi, '');
        // NOTE(review): without the m/s flags this only matches when the phrase
        // sits on the final line of the text - confirm that is intended.
        cleaned = cleaned.replace(/supported platforms.*$/gi, '');
        return cleaned;
    }
    /**
     * Drops repeated header lines (compared case-insensitively by title text).
     * The first occurrence is always kept; a repeat is kept only when the
     * whole line is longer than 20 characters.
     *
     * @param {string} content - Markdown text.
     * @returns {string} Text without short duplicate headers.
     */
    removeRepeatedTitles(content) {
        const seenCounts = new Map();
        const keptLines = content.split('\n').filter((line) => {
            const headerMatch = line.match(/^#+\s*(.+)$/);
            if (!headerMatch) {
                return true;
            }
            const title = headerMatch[1].toLowerCase().trim();
            const timesSeen = seenCounts.get(title) || 0;
            seenCounts.set(title, timesSeen + 1);
            return timesSeen === 0 || line.length > 20;
        });
        return keptLines.join('\n');
    }
    /**
     * Fix common word concatenation issues from HTML to Markdown conversion.
     *
     * Fenced code blocks, inline code spans, link destinations and bare URLs
     * are passed through untouched, because inserting spaces there would
     * corrupt identifiers and break links. Everything else goes through
     * `fixProseConcatenation`.
     *
     * @param {string} content - Markdown text.
     * @returns {string} Text with fused words separated.
     */
    fixWordConcatenation(content) {
        const protectedSpans = /```[\s\S]*?```|`[^`\n]+`|\]\([^)\n]*\)|https?:\/\/[^\s<>()\[\]]+/g;
        let fixed = '';
        let cursor = 0;
        for (const span of content.matchAll(protectedSpans)) {
            fixed += this.fixProseConcatenation(content.slice(cursor, span.index)) + span[0];
            cursor = span.index + span[0].length;
        }
        return fixed + this.fixProseConcatenation(content.slice(cursor));
    }
    /**
     * Internal helper of `fixWordConcatenation`: separates fused words in a
     * stretch of plain prose.
     *
     * @param {string} text - Prose without code spans or URLs.
     * @returns {string} Prose with spaces inserted at detected word joins.
     */
    fixProseConcatenation(text) {
        const protectedNames = this.protectedCamelCaseTerms;
        // A lower-case letter directly followed by an upper-case one is taken
        // as a sentence/word join, unless the pair lies inside a protected
        // name ("iOS" must not become "i OS").
        let fixed = text.replace(/([a-z])([A-Z])/g, (pair, lower, upper, offset, whole) => {
            const insideProtectedName = protectedNames.some((name) => {
                for (let shift = 0; shift < name.length - 1; shift += 1) {
                    const start = offset - shift;
                    // startsWith clamps negative positions to 0, so guard explicitly.
                    if (start >= 0 && whole.startsWith(name, start)) {
                        return true;
                    }
                }
                return false;
            });
            return insideProtectedName ? pair : `${lower} ${upper}`;
        });
        const higTerms = [
            'Best practices', 'Guidelines', 'When to use', 'How to use',
            'iOS', 'macOS', 'watchOS', 'tvOS', 'visionOS',
            'Tab bar', 'Navigation bar', 'Button', 'Picker', 'Slider',
            'Action sheet', 'Alert', 'Popover', 'Sheet',
            'Accessibility', 'VoiceOver', 'Dynamic Type',
            'SF Symbols', 'App Store'
        ];
        for (const term of higTerms) {
            const termNoSpaces = term.replace(/\s+/g, '');
            // Matching is case-sensitive on purpose: ignoring case would
            // rewrite ordinary words such as "scenarios" into "scenar iOS".
            // A lower-case word of at least three letters fused in front of the term.
            const fusedBefore = new RegExp(`([a-z]{3,})${termNoSpaces}`, 'g');
            fixed = fixed.replace(fusedBefore, (_match, prefix) => `${prefix} ${term}`);
            // A capitalised word fused right behind the term.
            const fusedAfter = new RegExp(`${termNoSpaces}([A-Z][a-z]{2,})`, 'g');
            fixed = fixed.replace(fusedAfter, (_match, suffix) => `${term} ${suffix}`);
        }
        // Sentence punctuation immediately followed by a capital letter.
        fixed = fixed.replace(/([a-z])\.([A-Z])/g, '$1. $2');
        fixed = fixed.replace(/([a-z])!([A-Z])/g, '$1! $2');
        fixed = fixed.replace(/([a-z])\?([A-Z])/g, '$1? $2');
        // Digits fused with letters. NOTE(review): this also splits tokens
        // such as "3D" or "2nd" - confirm that is acceptable.
        fixed = fixed.replace(/([0-9])([A-Za-z])/g, '$1 $2');
        fixed = fixed.replace(/([A-Za-z])([0-9])/g, '$1 $2');
        return fixed;
    }
    /**
     * Cuts trailing metadata sections (resources, change log, videos, ...)
     * off the text. Each pattern removes everything from its first match to
     * the end of the text.
     *
     * @param {string} content - Markdown text.
     * @returns {string} Trimmed text without the trailing sections.
     */
    removeTrailingMetadata(content) {
        const trailingSections = [
            'resources?\\s*related.*?change log.*$',
            'change log\\s*date\\s*changes.*$',
            'videos\\s*discoverable design.*$',
            'platform considerations.*?resources.*$'
        ];
        return trailingSections
            .reduce((text, pattern) => text.replace(new RegExp(pattern, 'gis'), ''), content)
            .trim();
    }
    /**
     * Builds the keyword list: section title, platform and category first,
     * then single-word design terms in order of appearance, then multi-word
     * design terms ("dark mode", "touch target", ...) found in the text.
     *
     * @param {string} content - Cleaned Markdown.
     * @param {object} section - Provides `title`, `platform` and `category`.
     * @returns {string[]} At most 20 unique lower-case keywords.
     */
    extractKeywords(content, section) {
        const keywords = new Set([
            section.title.toLowerCase(),
            section.platform.toLowerCase(),
            section.category.toLowerCase()
        ]);
        const lowered = content.toLowerCase();
        const phraseTerms = this.appleDesignTerms.filter((term) => term.includes(' '));
        const singleWordTerms = new Set(this.appleDesignTerms.filter((term) => !term.includes(' ')));
        for (const word of lowered.match(/\b[a-z]{3,}\b/g) ?? []) {
            if (singleWordTerms.has(word)) {
                keywords.add(word);
            }
        }
        // The word tokens above can never equal a phrase, so phrases are
        // looked up in the text directly.
        for (const phrase of phraseTerms) {
            if (lowered.includes(phrase)) {
                keywords.add(phrase);
            }
        }
        return Array.from(keywords).slice(0, 20);
    }
    /**
     * Collects link texts from the first "See also" line of the content.
     *
     * @param {string} content - Cleaned Markdown.
     * @param {object} _section - Unused.
     * @returns {string[]} At most five unique link titles.
     */
    extractRelatedSections(content, _section) {
        const related = new Set();
        const seeAlsoMatch = content.match(/see also[:\s]+(.*?)(?:\n|$)/i);
        if (seeAlsoMatch) {
            for (const bracketed of seeAlsoMatch[1].match(/\[([^\]]+)\]/g) || []) {
                related.add(bracketed.slice(1, -1)); // strip "[" and "]"
            }
        }
        return Array.from(related).slice(0, 5);
    }
    /**
     * Computes quality metrics for cleaned content.
     *
     * @param {string} content - Cleaned Markdown.
     * @param {string} [originalContent] - Markdown before cleaning; used only
     *   for fallback detection.
     * @returns {object} `{ score, length, structureScore, appleTermsScore,
     *   codeExamplesCount, imageReferencesCount, headingCount,
     *   isFallbackContent, extractionMethod, confidence }`.
     */
    calculateQualityMetrics(content, originalContent) {
        const lowered = content.toLowerCase();
        const length = content.length;
        const headingCount = (content.match(/^#+/gm) || []).length;
        // Two fence markers make one code example.
        const codeExamplesCount = (content.match(/```/g) || []).length / 2;
        const imageReferencesCount = (content.match(/!\[.*?\]/g) || []).length;
        const isFallbackContent = this.detectFallbackContent(content, originalContent);
        const hasAppleSPAIssues = this.appleSPAIndicators.some((indicator) => lowered.includes(indicator));
        const structureScore = Math.min(1.0, (headingCount * 0.1) + (codeExamplesCount * 0.2));
        const appleTermsFound = this.appleDesignTerms.filter((term) => lowered.includes(term)).length;
        const appleTermsScore = Math.min(1.0, appleTermsFound / 10);
        const hasSubstantialContent = length > 400;
        const guidelineCues = ['best practices', 'guideline', 'consider', 'avoid', 'should', 'when'];
        const hasGuidelines = guidelineCues.some((cue) => lowered.includes(cue));
        const guidelineScore = hasGuidelines ? 0.4 : 0;
        const structureBonus = headingCount >= 2 ? 0.2 : 0;
        let score;
        if (isFallbackContent) {
            score = 0.1; // Clear fallback content
        }
        else if (hasAppleSPAIssues && !hasSubstantialContent) {
            score = 0.3; // SPA artifacts and at most 400 characters of content
        }
        else {
            // 800 characters already earn the full length score.
            const lengthScore = Math.min(1.0, length / 800);
            // NOTE(review): guidelineScore (max 0.4) and structureBonus (max
            // 0.2) are weighted again here, so this sum cannot exceed 0.67 -
            // confirm the intended scale.
            score = lengthScore * 0.2 + structureScore * 0.15 + appleTermsScore * 0.15 + guidelineScore * 0.35 + structureBonus * 0.15;
        }
        return {
            score: Math.min(1.0, score),
            length,
            structureScore,
            appleTermsScore,
            codeExamplesCount,
            imageReferencesCount,
            headingCount,
            isFallbackContent,
            extractionMethod: 'turndown-enhanced',
            confidence: isFallbackContent ? 0.1 : Math.min(1.0, score + 0.1)
        };
    }
    /**
     * Decides whether the content is placeholder ("fallback") output rather
     * than real documentation.
     *
     * @param {string} content - Cleaned Markdown.
     * @param {string} [originalContent] - Markdown before cleaning; when
     *   missing or empty the cleaned content is used in its place.
     * @returns {boolean} `true` for fallback content.
     */
    detectFallbackContent(content, originalContent) {
        const contentLower = content.toLowerCase();
        const originalLower = originalContent?.toLowerCase() || contentLower;
        // More than 500 characters that mention guidance vocabulary count as
        // real content even when JavaScript warnings are present.
        const realContentCues = ['best practices', 'guideline', 'accessibility', 'consider', 'ensure', 'avoid'];
        if (content.length > 500 && realContentCues.some((cue) => contentLower.includes(cue))) {
            return false;
        }
        const mentionsAny = (phrases) => phrases.some((phrase) => contentLower.includes(phrase) || originalLower.includes(phrase));
        const hasFallbackIndicators = mentionsAny(this.fallbackIndicators);
        const hasJavaScriptIssues = mentionsAny(['javascript']) && mentionsAny(['required', 'turn on']);
        const isTooShort = content.length < 200;
        const hasOnlySPAIndicators = mentionsAny(this.appleSPAIndicators);
        return hasFallbackIndicators || hasJavaScriptIssues || (isTooShort && hasOnlySPAIndicators);
    }
    /**
     * Renders the YAML front-matter header for a processed section.
     *
     * @param {object} section - Provides `title`, `platform`, `category`, `url`.
     * @param {object} quality - Result of `calculateQualityMetrics`.
     * @param {string[]} keywords - Keyword list.
     * @returns {string} Front matter delimited by `---` lines, followed by a blank line.
     */
    generateFrontMatter(section, quality, keywords) {
        const frontMatter = {
            title: section.title,
            platform: section.platform,
            category: section.category,
            url: section.url,
            quality_score: Math.round(quality.score * 100) / 100,
            content_length: quality.length,
            last_updated: new Date().toISOString(),
            keywords: keywords,
            has_code_examples: quality.codeExamplesCount > 0,
            has_images: quality.imageReferencesCount > 0,
            is_fallback: quality.isFallbackContent
        };
        const lines = Object.entries(frontMatter)
            .map(([key, value]) => `${key}: ${this.formatFrontMatterValue(value)}`);
        return `---\n${lines.join('\n')}\n---\n\n`;
    }
    /**
     * Internal helper of `generateFrontMatter`: formats one value as a YAML
     * scalar. Arrays are written as JSON flow sequences. Strings are written
     * verbatim unless they would break a plain YAML scalar (empty, leading
     * indicator character, edge whitespace, ": ", " #", trailing ":" or a
     * line break); those are written as JSON double-quoted strings.
     *
     * @param {*} value - Value to format.
     * @returns {string} Text to place after `key: `.
     */
    formatFrontMatterValue(value) {
        if (Array.isArray(value)) {
            return JSON.stringify(value);
        }
        if (typeof value !== 'string') {
            return String(value);
        }
        const breaksPlainScalar = value === '' ||
            /^[\s\-?:,\[\]{}#&*!|>'"%@`]|\s$|:$|:\s|\s#|[\r\n]/.test(value);
        return breaksPlainScalar ? JSON.stringify(value) : value;
    }
}
//# sourceMappingURL=content-processor.service.js.map