// article-writer-cn
// Version:
// AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作
// 129 lines • 4.23 kB
// JavaScript
/**
* 动态页面爬虫 - 使用 Puppeteer 处理 JavaScript 渲染的页面
*/
import * as cheerio from 'cheerio';
import { DocumentationCrawler } from './doc-crawler.js';
/**
 * Lazily load Puppeteer so it is only required when dynamic crawling is
 * actually requested.
 *
 * @returns {Promise<object>} The Puppeteer API object (the module's default
 *   export when present, otherwise the module namespace itself).
 * @throws {Error} When the module cannot be imported. The message tells the
 *   user how to install Puppeteer; the underlying import failure is attached
 *   as `cause` so a broken install can be told apart from a missing one.
 */
async function loadPuppeteer() {
    try {
        const pptr = await import('puppeteer');
        // Depending on module interop the API lives on `default` or on the namespace.
        return pptr.default || pptr;
    }
    catch (err) {
        throw new Error('❌ Puppeteer 未安装。请运行: npm install puppeteer\n' +
            ' 或使用静态爬虫(移除 --dynamic 参数)', { cause: err });
    }
}
/**
 * Crawler for pages whose content is produced by client-side JavaScript.
 *
 * Each page is rendered in a headless Puppeteer browser, and the resulting
 * HTML is then run through the extraction helpers this class inherits
 * (`extractTitle`, `extractMainContent`, `extractCodeExamples`,
 * `extractLinks`, `inferCategory`, `addLinksToQueue`, `sleep`). If anything in
 * the dynamic path fails for a page, that page is retried with the parent's
 * `crawlPage`.
 */
export class DynamicCrawler extends DocumentationCrawler {
    /** Puppeteer browser handle; null until `initBrowser()` has fully succeeded. */
    browser = null;
    /** The single page (tab) reused for every navigation; null until initialised. */
    page = null;

    /**
     * Launch the headless browser and prepare the shared page.
     *
     * Calling it again once both handles exist is a no-op. `this.browser` and
     * `this.page` are only assigned after the whole setup succeeded, so a
     * failure never leaves the crawler half-initialised and a later call can
     * retry from scratch.
     *
     * @returns {Promise<void>}
     * @throws {Error} If Puppeteer cannot be loaded, the browser cannot be
     *   launched, or the page cannot be configured.
     */
    async initBrowser() {
        if (this.browser && this.page)
            return;
        if (this.browser) {
            // A browser without a usable page cannot serve crawlPage();
            // discard it instead of launching a second one next to it.
            await this.close();
        }
        const puppeteer = await loadPuppeteer();
        console.log('🌐 启动 Puppeteer 浏览器...');
        const browser = await puppeteer.launch({
            headless: true,
            // Chromium switches commonly needed in containers / CI environments.
            args: [
                '--no-sandbox',
                '--disable-setuid-sandbox',
                '--disable-dev-shm-usage',
                '--disable-accelerated-2d-canvas',
                '--disable-gpu'
            ]
        });
        try {
            const page = await browser.newPage();
            // Desktop-sized viewport.
            await page.setViewport({
                width: 1920,
                height: 1080
            });
            // Identify the crawler to the sites it visits.
            await page.setUserAgent('Mozilla/5.0 (compatible; ArticleWriter-Crawler/1.0)');
            this.browser = browser;
            this.page = page;
        }
        catch (error) {
            // Do not leak the freshly launched browser process. A failure while
            // closing is ignored on purpose: the setup error is the one to report.
            await browser.close().catch(() => { });
            throw error;
        }
        console.log('✅ 浏览器已启动');
    }

    /**
     * Crawl a single page through the headless browser (overrides the parent
     * implementation).
     *
     * On success the page record is appended to `this.crawledPages`, the URL
     * is added to `this.visitedUrls`, and the extracted links are handed to
     * `addLinksToQueue`. Any error inside the dynamic path is logged and the
     * URL is retried with the parent's `crawlPage`.
     *
     * @param {string} url - Address of the page to crawl.
     * @returns {Promise<void>}
     * @throws {Error} If the browser cannot be initialised, or if the
     *   fallback `super.crawlPage` itself throws.
     */
    async crawlPage(url) {
        if (!this.page || !this.browser) {
            // Initialisation errors propagate on purpose: without a browser
            // the caller asked for something this crawler cannot provide.
            await this.initBrowser();
        }
        try {
            console.log(`📄 爬取(动态): ${url}`);
            // 'networkidle2' resolves once network activity has mostly settled.
            await this.page.goto(url, {
                waitUntil: 'networkidle2',
                timeout: 30000
            });
            // Best effort: give the main content container a chance to appear.
            try {
                const contentSelector = this.config.selectors?.mainContent || 'article, main';
                await this.page.waitForSelector(contentSelector, { timeout: 5000 });
            }
            catch {
                // A missing selector is not fatal; extract from whatever was rendered.
                console.log(`⚠️ 未找到主内容选择器,尝试提取整个页面`);
            }
            // Extra grace period so late-running scripts can finish.
            await this.sleep(1000);
            // Snapshot of the rendered DOM, parsed for the inherited extractors.
            const htmlContent = await this.page.content();
            const $ = cheerio.load(htmlContent);
            const title = this.extractTitle($);
            const content = this.extractMainContent($);
            const codeExamples = this.extractCodeExamples($);
            const links = this.extractLinks($, url);
            const page = {
                url,
                title,
                content,
                htmlContent,
                codeExamples,
                category: this.inferCategory(url, title),
                scrapedAt: new Date().toISOString(),
                links
            };
            // Record the result directly and mark the URL as visited.
            this.crawledPages.push(page);
            this.visitedUrls.add(url);
            // Queue the newly discovered links.
            this.addLinksToQueue(links);
        }
        catch (error) {
            console.error(`❌ 动态爬取失败 ${url}: ${error.message}`);
            // Fall back to the parent's (static) crawl for this URL.
            console.log(`🔄 尝试静态爬取...`);
            await super.crawlPage(url);
        }
    }

    /**
     * Close the browser if one is open.
     *
     * The handles are cleared before the browser is asked to shut down, so the
     * crawler is back in its "not initialised" state even if the shutdown
     * itself rejects.
     *
     * @returns {Promise<void>}
     * @throws {Error} Whatever `browser.close()` rejects with.
     */
    async close() {
        const browser = this.browser;
        if (!browser)
            return;
        this.browser = null;
        this.page = null;
        await browser.close();
        console.log('🔚 浏览器已关闭');
    }

    /**
     * Run the parent's crawl and always release the browser afterwards.
     *
     * A failure while closing the browser is logged but not rethrown, so it
     * can neither discard a finished crawl result nor mask the error that
     * actually aborted the crawl.
     *
     * @param {Function} [onProgress] - Passed through unchanged to `super.crawl`.
     * @returns {Promise<*>} Whatever `super.crawl` resolves to.
     */
    async crawl(onProgress) {
        try {
            return await super.crawl(onProgress);
        }
        finally {
            try {
                await this.close();
            }
            catch (error) {
                console.error(`⚠️ 关闭浏览器失败: ${error?.message ?? error}`);
            }
        }
    }
}
//# sourceMappingURL=dynamic-crawler.js.map