cleanweb-mcp
Version:
A lightweight MCP server for extracting clean web content with intelligent content filtering and Markdown conversion
202 lines (201 loc) • 7.29 kB
JavaScript
import axios from 'axios';
import TurndownService from 'turndown';
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';
// Markdown conventions used by the shared Turndown (HTML -> Markdown) converter.
const turndownOptions = {
  headingStyle: 'atx',
  hr: '---',
  bulletListMarker: '*',
  codeBlockStyle: 'fenced',
  fence: '```',
  emDelimiter: '_',
  strongDelimiter: '**',
  linkStyle: 'inlined',
};
const turndownService = new TurndownService(turndownOptions);

// Elements that are removed altogether during conversion.
const strippedTags = ['script', 'style', 'nav', 'header', 'footer', 'aside', 'iframe'];
turndownService.remove(strippedTags);
// Custom image rule: images whose `src` is very long (e.g. inline data URIs)
// are collapsed to a short text placeholder; every other image is emitted as
// a regular Markdown image.
turndownService.addRule('images', {
  filter: 'img',
  /**
   * @param {string} content - Converted inner content (unused; <img> is void).
   * @param {Element} node - The <img> element being converted.
   * @returns {string} Markdown for the image, or '' when it has no `src`.
   */
  replacement(content, node) {
    const alt = node.getAttribute('alt') || '';
    const src = node.getAttribute('src') || '';
    // Links longer than 100 characters would bloat the output, so only the
    // alt text (or a bare placeholder) is kept.
    if (src.length > 100) {
      return alt ? `[图片: ${alt}]` : '[图片]';
    }
    // Short links keep the normal Markdown image form. An image without any
    // `src` has nothing to point at and is dropped.
    return src ? `![${alt}](${src})` : '';
  },
});
/**
 * Parse raw HTML and extract its main content.
 *
 * Readability is tried first; when it yields no content the selector-based
 * fallbackExtraction() is used instead.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} url - Page URL, handed to JSDOM as the document URL.
 * @returns {{title: string, content: string, textContent: string, excerpt: string}}
 *   Extracted title, content HTML, plain text and a short excerpt.
 */
function cleanHtml(html, url) {
  const dom = new JSDOM(html, { url });
  const { document } = dom.window;
  // Readability.parse() modifies the DOM it is given. Run it on a clone so
  // the fallback below still sees the untouched document if parsing fails.
  const reader = new Readability(document.cloneNode(true));
  const article = reader.parse();
  if (article && article.content) {
    return {
      title: article.title || '',
      content: article.content,
      textContent: article.textContent || '',
      excerpt: article.excerpt || '',
    };
  }
  // Readability found nothing usable: fall back to selector heuristics.
  return fallbackExtraction(document);
}
/**
 * Fallback content extraction based on CSS selectors.
 *
 * Removes common boilerplate elements from `document` (mutating it), then
 * picks the first matching "main content" container, or `document.body`
 * when none matches.
 *
 * @param {Document} document - DOM document to extract from; it is modified.
 * @returns {{title: string, content: string, textContent: string, excerpt: string}}
 *   `excerpt` is the first 200 characters of the text, followed by '...'
 *   only when the text was actually truncated.
 */
function fallbackExtraction(document) {
  // Strip elements that are not part of the main content.
  const unwantedSelectors = [
    'script', 'style', 'nav', 'header', 'footer', 'aside',
    '.advertisement', '.ads', '.sidebar', '.menu', '.navigation',
    '[class*="ad-"]', '[id*="ad-"]', '.social-share', '.comments'
  ];
  for (const selector of unwantedSelectors) {
    document.querySelectorAll(selector).forEach((el) => el.remove());
  }
  // Candidate containers, in priority order; the first match wins.
  const contentSelectors = [
    'main', 'article', '.content', '.post', '.entry',
    '[role="main"]', '.main-content', '#content', '#main'
  ];
  let mainContent = null;
  for (const selector of contentSelectors) {
    mainContent = document.querySelector(selector);
    if (mainContent) {
      break;
    }
  }
  if (!mainContent) {
    mainContent = document.body;
  }
  // `mainContent` may still be missing (no body) and `textContent` may be
  // null, so normalise both to empty strings instead of leaking
  // "undefined..." into the excerpt.
  const text = mainContent?.textContent ?? '';
  const excerptLength = 200;
  const excerpt = text.length > excerptLength
    ? `${text.substring(0, excerptLength)}...`
    : text;
  return {
    title: document.title || '',
    content: mainContent?.innerHTML ?? '',
    textContent: text,
    excerpt,
  };
}
/**
 * Download a page with axios and return its body as text.
 *
 * @param {string} url - URL to request.
 * @param {{timeout?: number}} [options] - `timeout` is in milliseconds;
 *   a falsy value falls back to 30000.
 * @returns {Promise<string>} The response body.
 * @throws {Error} Wraps any request failure; the original error is kept as
 *   `cause`.
 */
async function fetchWebContent(url, options = {}) {
  try {
    const response = await axios.get(url, {
      timeout: options.timeout || 30000,
      // Keep the body as a string. By default axios parses JSON responses
      // into objects, which must not reach the HTML parser downstream.
      responseType: 'text',
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
      },
      maxRedirects: 5,
      // Treat every status below 400 as success.
      validateStatus: (status) => status < 400
    });
    return response.data;
  }
  catch (error) {
    // Preserve the underlying error so callers can inspect it.
    throw new Error(`获取网页失败: ${error.message}`, { cause: error });
  }
}
/**
 * Fetch a URL, extract its main content and convert it to Markdown.
 *
 * Progress messages are written with console.error.
 *
 * @param {string} url - URL to process.
 * @param {object} [options] - Passed through to fetchWebContent().
 * @returns {Promise<{url: string, title: string, content: string,
 *   textContent: string, excerpt: string, timestamp: string}>}
 *   `content` is Markdown; `timestamp` is an ISO-8601 string.
 * @throws {Error} Wraps any failure; the original error is kept as `cause`.
 */
async function processUrl(url, options = {}) {
  try {
    console.error(`开始提取: ${url}`);
    // Download the raw HTML.
    const html = await fetchWebContent(url, options);
    // Reduce it to the main content.
    const cleanedContent = cleanHtml(html, url);
    // Convert the extracted HTML to Markdown.
    const markdown = turndownService.turndown(cleanedContent.content);
    console.error(`提取完成: ${cleanedContent.title}`);
    return {
      url,
      title: cleanedContent.title,
      content: markdown,
      textContent: cleanedContent.textContent,
      excerpt: cleanedContent.excerpt,
      timestamp: new Date().toISOString()
    };
  }
  catch (error) {
    // Keep the underlying error (and its stack) reachable via `cause`.
    throw new Error(`处理URL失败: ${error.message}`, { cause: error });
  }
}
/**
 * Tool definition for "extract_web_content".
 *
 * `run(args)` validates the arguments, extracts the page via processUrl()
 * and returns a `{ content: [{ type: "text", text }] }` result. Failures are
 * never thrown: they are returned with `isError: true` and the message in
 * the text.
 */
export const webContentExtractor = {
  name: "extract_web_content",
  description: "Extract and clean content from web URLs, converting to Markdown format. Supports removing ads, navigation and irrelevant content, focusing on main article content.",
  parameters: {
    type: "object",
    properties: {
      url: {
        type: "string",
        description: "The web URL to extract content from"
      },
      format: {
        type: "string",
        enum: ["markdown", "json"],
        description: "Return format: markdown (content only) or json (with complete information)",
        default: "markdown"
      },
      timeout: {
        type: "number",
        description: "Page loading timeout in milliseconds, default 30000",
        default: 30000
      }
    },
    required: ["url"]
  },
  /**
   * @param {{url: string, format?: string, timeout?: number}} args
   * @returns {Promise<{content: Array<{type: string, text: string}>, isError?: boolean}>}
   */
  async run(args) {
    try {
      // Tolerate a missing/null argument object so the caller gets the
      // intended validation message rather than a TypeError.
      const { url, format, timeout } = args ?? {};
      if (!url) {
        throw new Error("URL parameter cannot be empty");
      }
      // Validate URL format.
      let parsedUrl;
      try {
        parsedUrl = new URL(url);
      }
      catch {
        throw new Error("Invalid URL format");
      }
      // The URL comes from untrusted tool input: only fetch real web pages,
      // not file:, data:, ftp: or other schemes that also parse as URLs.
      if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
        throw new Error("Only http and https URLs are supported");
      }
      // Processing options.
      const options = {
        timeout: timeout || 30000,
        fastMode: true // fast mode enabled by default
      };
      // Extract the content.
      const result = await processUrl(url, options);
      // Shape the response according to the requested format.
      if (format === 'json') {
        return {
          content: [{
            type: "text",
            text: `# Web Content Extraction Result\n\n**URL:** ${result.url}\n**Title:** ${result.title}\n**Extraction Time:** ${result.timestamp}\n\n## Extracted Content\n\n\`\`\`json\n${JSON.stringify(result, null, 2)}\n\`\`\``
          }]
        };
      }
      return {
        content: [{
          type: "text",
          text: `# ${result.title}\n\n**Source:** ${result.url}\n**Extraction Time:** ${result.timestamp}\n\n---\n\n${result.content}`
        }]
      };
    }
    catch (error) {
      return {
        content: [{
          type: "text",
          text: `❌ Web content extraction failed: ${error.message}`
        }],
        isError: true
      };
    }
  }
};