article-writer-cn
Version:
AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作
348 lines • 11.9 kB
JavaScript
/**
* 文档爬虫 - 静态页面爬取
*/
import axios from 'axios';
import * as cheerio from 'cheerio';
import TurndownService from 'turndown';
import { URL } from 'url';
import pLimit from 'p-limit';
import fs from 'fs-extra';
import path from 'path';
/**
 * Breadth-first crawler for statically rendered documentation sites.
 *
 * Pages are fetched with axios, parsed with cheerio, and their main content
 * is converted to Markdown with Turndown. Only links on the same hostname as
 * `config.baseUrl` are followed.
 *
 * Recognised `config` keys (all optional except `baseUrl`):
 * - `baseUrl`      {string}  first URL to crawl; also defines the allowed hostname
 * - `name`         {string}  written into `summary.json` by `saveResults`
 * - `maxPages`     {number}  maximum number of pages to collect (default 100)
 * - `concurrency`  {number}  pages fetched in parallel per batch (default 5)
 * - `rateLimit`    {number}  pause in ms between batches; falsy disables it (default 500)
 * - `timeout`      {number}  per-request timeout in ms (default 10000)
 * - `selectors`    {object}  `mainContent`, `title`, `codeBlocks` CSS selectors;
 *                            any key left out keeps its default
 * - `urlPatterns`  {object}  `{ include?: string[], exclude?: string[] }` substring filters
 */
export class DocumentationCrawler {
  config;
  visitedUrls = new Set();
  urlQueue = [];
  crawledPages = [];
  turndownService;
  limit;
  startTime = 0;

  /**
   * @param {object} config - crawler options, see the class description.
   */
  constructor(config) {
    const defaultSelectors = {
      mainContent: 'article, main, .content, .markdown-body, .docs-content, .documentation',
      title: 'h1, title',
      codeBlocks: 'pre code, .highlight code, pre',
    };
    this.config = {
      maxPages: 100,
      concurrency: 5,
      rateLimit: 500,
      timeout: 10000,
      ...config,
      // Merge selectors key by key so that overriding a single selector does
      // not wipe out the defaults for the others.
      selectors: { ...defaultSelectors, ...config?.selectors },
    };
    this.turndownService = new TurndownService({
      codeBlockStyle: 'fenced',
      headingStyle: 'atx',
    });
    this.limit = pLimit(this.config.concurrency);
  }

  /**
   * Crawl starting from `config.baseUrl` until the queue is empty or
   * `config.maxPages` pages have been collected.
   *
   * @param {(progress: {current: number, total: number, queue: number,
   *   elapsed: number, speed: number}) => void} [onProgress] - invoked after
   *   every batch; `elapsed` is in ms and `speed` in pages per second.
   * @returns {Promise<{pages: object[], totalPages: number, duration: number,
   *   categories: Object<string, object[]>}>}
   */
  async crawl(onProgress) {
    const { maxPages, concurrency, rateLimit } = this.config;
    console.log(`🚀 开始爬取: ${this.config.baseUrl}`);
    console.log(`📊 配置: 最大页数=${maxPages}, 并发=${concurrency}`);
    this.urlQueue.push(this.config.baseUrl);
    this.startTime = Date.now();

    while (this.urlQueue.length > 0 && this.crawledPages.length < maxPages) {
      // Never start more requests than the remaining page budget, otherwise
      // a full batch could push the result past maxPages.
      const remaining = maxPages - this.crawledPages.length;
      const batch = this.urlQueue.splice(0, Math.min(concurrency, remaining));
      const promises = batch.map((url) => this.limit(() => this.crawlPage(url)));
      await Promise.allSettled(promises);

      // Progress callback
      if (onProgress) {
        const elapsed = Date.now() - this.startTime;
        // A batch can finish within the same millisecond; avoid dividing by zero.
        const speed = elapsed > 0 ? this.crawledPages.length / (elapsed / 1000) : 0;
        onProgress({
          current: this.crawledPages.length,
          total: maxPages,
          queue: this.urlQueue.length,
          elapsed,
          speed,
        });
      }

      // Rate limiting: only pause when another batch will actually follow.
      const hasMoreWork = this.urlQueue.length > 0 && this.crawledPages.length < maxPages;
      if (rateLimit && hasMoreWork) {
        await this.sleep(rateLimit);
      }
    }

    const result = {
      pages: this.crawledPages,
      totalPages: this.crawledPages.length,
      duration: Date.now() - this.startTime,
      categories: this.categorizePages(),
    };
    console.log(`✅ 爬取完成: ${result.totalPages} 页, 用时 ${this.formatDuration(result.duration)}`);
    return result;
  }

  /**
   * Fetch and parse a single page, append it to `crawledPages` and enqueue
   * the links found on it.
   *
   * The URL is marked as visited before the request is sent, so a URL that
   * fails is not retried. Failures are logged and swallowed.
   *
   * @param {string} url - absolute URL of the page.
   * @returns {Promise<void>}
   */
  async crawlPage(url) {
    if (this.visitedUrls.has(url)) {
      return;
    }
    this.visitedUrls.add(url);
    try {
      console.log(`📄 爬取: ${url}`);
      const response = await axios.get(url, {
        timeout: this.config.timeout,
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; ArticleWriter-Crawler/1.0)',
        },
      });
      const $ = cheerio.load(response.data);

      // Extract the individual parts of the page
      const title = this.extractTitle($);
      const content = this.extractMainContent($);
      const codeExamples = this.extractCodeExamples($);
      const links = this.extractLinks($, url);

      const page = {
        url,
        title,
        content,
        htmlContent: response.data,
        codeExamples,
        category: this.inferCategory(url, title),
        scrapedAt: new Date().toISOString(),
        links,
      };
      this.crawledPages.push(page);

      // Queue newly discovered links
      this.addLinksToQueue(links);
    } catch (error) {
      console.error(`❌ 爬取失败 ${url}: ${error.message}`);
    }
  }

  /**
   * Return the trimmed text of the first element matching any of the
   * comma-separated title selectors, tried in the order they are listed.
   *
   * @param {Function} $ - cheerio root function for the page.
   * @returns {string} the title, or `'Untitled'` when nothing matched.
   */
  extractTitle($) {
    const selectors = this.config.selectors.title.split(',').map((s) => s.trim());
    for (const sel of selectors) {
      const title = $(sel).first().text().trim();
      if (title) {
        return title;
      }
    }
    return 'Untitled';
  }

  /**
   * Convert the page's main content to Markdown.
   *
   * The comma-separated `mainContent` selectors are tried in order; the first
   * one whose Markdown is longer than 100 characters (after trimming) wins.
   *
   * @param {Function} $ - cheerio root function for the page.
   * @returns {string} Markdown, or `''` when no candidate was long enough.
   */
  extractMainContent($) {
    const selectors = this.config.selectors.mainContent.split(',').map((s) => s.trim());
    for (const sel of selectors) {
      const original = $(sel).first();
      if (original.length === 0) {
        continue;
      }
      // Work on a copy: stripping navigation from the shared document would
      // hide those links from extractLinks, which runs afterwards.
      const element = original.clone();
      element.find('nav, .sidebar, .toc, script, style, .navigation, .breadcrumb').remove();

      // Convert to Markdown
      const html = element.html() || '';
      const markdown = this.turndownService.turndown(html);
      if (markdown.trim().length > 100) {
        return markdown;
      }
    }
    return '';
  }

  /**
   * Collect code snippets matched by the `codeBlocks` selector.
   *
   * Empty snippets and snippets longer than 5000 characters are skipped.
   * Snippets with identical text are reported once: the default selector
   * matches both a `<pre>` and the `<code>` inside it, which would otherwise
   * produce every example twice. When duplicates disagree, a language taken
   * from a class name beats one guessed from the content.
   *
   * @param {Function} $ - cheerio root function for the page.
   * @returns {{language: string, content: string, lineCount: number}[]}
   *   at most 20 examples, in document order.
   */
  extractCodeExamples($) {
    const selector = this.config.selectors.codeBlocks;
    const examplesByContent = new Map();
    const classDerived = new Set();

    $(selector).each((_, element) => {
      const $code = $(element);
      const content = $code.text().trim();
      if (content.length === 0 || content.length > 5000) {
        return;
      }
      const classLanguage = this.detectLanguageFromClass($code);
      const existing = examplesByContent.get(content);
      if (existing) {
        // Same snippet seen through an overlapping selector: keep one entry
        // but upgrade a guessed language to an explicitly declared one.
        if (classLanguage && !classDerived.has(content)) {
          existing.language = classLanguage;
          classDerived.add(content);
        }
        return;
      }
      if (classLanguage) {
        classDerived.add(content);
      }
      examplesByContent.set(content, {
        language: classLanguage ?? this.detectLanguage($code),
        content,
        lineCount: content.split('\n').length,
      });
    });

    // Keep at most 20 code examples per page
    return [...examplesByContent.values()].slice(0, 20);
  }

  /**
   * Look for a language hint in the class attribute of the element or of
   * its parent (`language-x`, `lang-x`, `highlight-x`, `x-highlight`).
   *
   * @param {object} $code - cheerio wrapper around the code element.
   * @returns {string|null} the language token, or `null` when no class matched.
   */
  detectLanguageFromClass($code) {
    const langPatterns = [
      /language-(\w+)/,
      /lang-(\w+)/,
      /highlight-(\w+)/,
      /(\w+)-highlight/,
    ];
    const findIn = (className) => {
      for (const pattern of langPatterns) {
        const match = className.match(pattern);
        if (match) {
          return match[1];
        }
      }
      return null;
    };
    // The element's own class wins; the parent is only consulted as a fallback.
    return findIn($code.attr('class') || '') ?? findIn($code.parent().attr('class') || '');
  }

  /**
   * Determine the language of a code element: class names first, then
   * simple content heuristics.
   *
   * @param {object} $code - cheerio wrapper around the code element.
   * @returns {string} language name, `'text'` when nothing matched.
   */
  detectLanguage($code) {
    const fromClass = this.detectLanguageFromClass($code);
    if (fromClass) {
      return fromClass;
    }

    // Content heuristics
    const content = $code.text();
    // Python's `from x import y` contains both "import " and "from ", so it
    // must be recognised before the JavaScript import check below.
    if (/^\s*from\s+[\w.]+\s+import\s/m.test(content)) {
      return 'python';
    }
    if (content.includes('import ') && content.includes('from ')) {
      return 'javascript';
    }
    if (content.includes('export ') && content.includes('const ')) {
      return 'javascript';
    }
    if (content.includes('def ') && content.includes(':')) {
      return 'python';
    }
    if (content.includes('<?php')) {
      return 'php';
    }
    if (content.includes('public class')) {
      return 'java';
    }
    if (content.includes('func ') && content.includes('return')) {
      return 'go';
    }
    return 'text';
  }

  /**
   * Collect the unique http(s) links on the page that point to the same
   * hostname as `config.baseUrl`.
   *
   * Fragments are stripped; query strings are kept, so URLs that differ
   * only in their query are treated as different pages.
   *
   * @param {Function} $ - cheerio root function for the page.
   * @param {string} currentUrl - URL of the page, used to resolve relative hrefs.
   * @returns {string[]} absolute URLs in document order, without duplicates.
   */
  extractLinks($, currentUrl) {
    const baseHostname = new URL(this.config.baseUrl).hostname;
    const links = new Set();

    $('a[href]').each((_, element) => {
      const href = $(element).attr('href');
      if (!href) {
        return;
      }
      let parsedUrl;
      try {
        parsedUrl = new URL(href, currentUrl);
      } catch {
        // Ignore hrefs that cannot be parsed as a URL
        return;
      }
      // Only http(s) pages can be fetched by the crawler
      if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
        return;
      }
      // Stay on the same hostname
      if (parsedUrl.hostname !== baseHostname) {
        return;
      }
      parsedUrl.hash = '';
      links.add(parsedUrl.href);
    });

    return [...links];
  }

  /**
   * Append every link that passes `shouldCrawl` to the queue, as long as
   * the page limit has not been reached.
   *
   * @param {string[]} links - absolute URLs.
   */
  addLinksToQueue(links) {
    for (const link of links) {
      if (this.shouldCrawl(link) && this.crawledPages.length < this.config.maxPages) {
        this.urlQueue.push(link);
      }
    }
  }

  /**
   * Decide whether a URL should be queued.
   *
   * A URL is rejected when it was already visited, is already queued, or
   * contains any `urlPatterns.exclude` substring. When `urlPatterns.include`
   * is non-empty the URL must contain at least one of its substrings.
   *
   * @param {string} url - absolute URL.
   * @returns {boolean}
   */
  shouldCrawl(url) {
    // Already visited
    if (this.visitedUrls.has(url)) {
      return false;
    }
    // Already queued
    if (this.urlQueue.includes(url)) {
      return false;
    }
    // URL pattern matching
    const { include, exclude } = this.config.urlPatterns || {};
    if (exclude && exclude.some((pattern) => url.includes(pattern))) {
      return false;
    }
    if (include && include.length > 0) {
      return include.some((pattern) => url.includes(pattern));
    }
    return true;
  }

  /**
   * Guess a page category from keywords in its URL and title
   * (case-insensitive substring match; the first matching category wins).
   *
   * @param {string} url
   * @param {string} title
   * @returns {string} one of the known categories, or `'other'`.
   */
  inferCategory(url, title) {
    const categoryPatterns = {
      'getting-started': ['intro', 'getting-started', 'quickstart', 'installation', 'setup'],
      'guide': ['guide', 'tutorial', 'how-to', 'learn'],
      'api': ['api', 'reference', 'docs/api', '/api/'],
      'examples': ['example', 'demo', 'sample', 'playground'],
      'advanced': ['advanced', 'deep-dive', 'internals', 'optimization'],
      'migration': ['migration', 'upgrade', 'changelog', 'breaking'],
    };
    const combined = `${url.toLowerCase()} ${title.toLowerCase()}`;
    for (const [category, patterns] of Object.entries(categoryPatterns)) {
      if (patterns.some((p) => combined.includes(p))) {
        return category;
      }
    }
    return 'other';
  }

  /**
   * Group the crawled pages by their `category` field.
   *
   * @returns {Object<string, object[]>} category name mapped to its pages.
   */
  categorizePages() {
    const categories = {};
    for (const page of this.crawledPages) {
      if (!categories[page.category]) {
        categories[page.category] = [];
      }
      categories[page.category].push(page);
    }
    return categories;
  }

  /**
   * Write every crawled page to `<outputDir>/pages/NNN-<slug>.json` and a
   * `summary.json` with per-category page counts to `outputDir`.
   *
   * @param {string} outputDir - target directory, created if missing.
   * @returns {Promise<void>}
   */
  async saveResults(outputDir) {
    await fs.ensureDir(outputDir);

    // Raw page data
    const rawDir = path.join(outputDir, 'pages');
    await fs.ensureDir(rawDir);
    for (const [index, page] of this.crawledPages.entries()) {
      const filename = `${String(index + 1).padStart(3, '0')}-${this.slugify(page.title)}.json`;
      await fs.writeJSON(path.join(rawDir, filename), page, { spaces: 2 });
    }

    // Summary
    const summary = {
      name: this.config.name,
      baseUrl: this.config.baseUrl,
      totalPages: this.crawledPages.length,
      categories: Object.fromEntries(
        Object.entries(this.categorizePages()).map(([cat, pages]) => [cat, pages.length]),
      ),
      crawledAt: new Date().toISOString(),
    };
    await fs.writeJSON(path.join(outputDir, 'summary.json'), summary, { spaces: 2 });
    console.log(`💾 保存完成: ${outputDir}`);
  }

  // Utility methods

  /**
   * @param {number} ms - delay in milliseconds.
   * @returns {Promise<void>} resolves after the delay.
   */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Turn text into a file-name friendly slug: lower-cased, every run of
   * characters other than a-z, 0-9 and CJK ideographs (U+4E00-U+9FA5)
   * replaced by `-`, leading/trailing dashes removed, cut to 50 characters.
   *
   * @param {string} text
   * @returns {string}
   */
  slugify(text) {
    return text
      .toLowerCase()
      .replace(/[^a-z0-9\u4e00-\u9fa5]+/g, '-')
      .replace(/^-+|-+$/g, '')
      .substring(0, 50);
  }

  /**
   * Format a duration as `"<m>m <s>s"`, or `"<s>s"` when under a minute.
   *
   * @param {number} ms - duration in milliseconds.
   * @returns {string}
   */
  formatDuration(ms) {
    const seconds = Math.floor(ms / 1000);
    const minutes = Math.floor(seconds / 60);
    const remainingSeconds = seconds % 60;
    if (minutes > 0) {
      return `${minutes}m ${remainingSeconds}s`;
    }
    return `${seconds}s`;
  }
}
//# sourceMappingURL=doc-crawler.js.map