UNPKG

article-writer-cn

Version:

AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作

169 lines 6.09 kB
/** * 爬虫管理器 - 统一入口,自动选择合适的爬虫 */ import fs from 'fs-extra'; import path from 'path'; import { DocumentationCrawler } from './doc-crawler.js'; import { DynamicCrawler } from './dynamic-crawler.js'; import { PDFExtractor } from './pdf-extractor.js'; import { KnowledgeConverter } from './knowledge-converter.js'; import { ProgressBar } from './progress-bar.js'; export class CrawlerManager { /** * 执行爬取 */ async executeCrawl(options) { console.log(`\n🚀 开始文档爬取任务`); console.log(`📝 项目: ${options.name}`); // PDF 模式 if (options.pdfPath) { return await this.crawlPDF(options); } // 网页爬取模式 if (options.url) { return await this.crawlWebsite(options); } console.error('❌ 错误: 必须提供 url 或 pdfPath'); return null; } /** * 爬取网站 */ async crawlWebsite(options) { const config = { name: options.name, baseUrl: options.url, maxPages: options.maxPages || 200, concurrency: options.concurrency || 5 }; // 选择爬虫类型 const crawler = options.useDynamic ? new DynamicCrawler(config) : new DocumentationCrawler(config); const crawlerType = options.useDynamic ? '动态爬虫 (Puppeteer)' : '静态爬虫'; console.log(`🔧 使用: ${crawlerType}`); // 进度条 const progressBar = new ProgressBar(); // 开始爬取 const result = await crawler.crawl((progress) => { progressBar.update(progress); }); progressBar.complete(result.duration, result.totalPages); // 保存原始数据 const rawDir = path.join(options.outputDir, 'raw', options.name); await crawler.saveResults(rawDir); // 转换为知识库 await this.convertAndIndex(options.name, rawDir, options.outputDir); // 显示报告 this.showReport(result, options); return result; } /** * 爬取 PDF */ async crawlPDF(options) { console.log(`📄 PDF 模式`); console.log(`📂 文件: ${options.pdfPath}`); const extractor = new PDFExtractor(); // 提取 PDF const extractionResult = await extractor.extractFromPDF(options.pdfPath); // 转换为 CrawledPage 格式 const page = extractor.convertToCrawledPage(options.pdfPath, extractionResult, options.name); // 保存 const rawDir = path.join(options.outputDir, 'raw', options.name); await extractor.saveExtraction(rawDir, options.name, page); // 转换为知识库 await this.convertAndIndex(options.name, rawDir, options.outputDir); // 构造结果 const result = { pages: [page], totalPages: 1, duration: 0, categories: { 'pdf-document': [page] } }; this.showReport(result, options); return result; } /** * 转换为知识库 */ async convertAndIndex(name, rawDir, outputDir) { console.log(`\n🔄 转换知识库...`); // 转换为 Markdown const indexedDir = path.join(outputDir, 'indexed'); await fs.ensureDir(indexedDir); const converter = new KnowledgeConverter(); await converter.convertToKnowledgeBase(rawDir, indexedDir); console.log(`✅ 知识库转换完成`); } /** * 显示报告 */ showReport(result, options) { console.log(`\n📊 爬取报告`); console.log(`${'='.repeat(50)}`); console.log(`📝 项目名称: ${options.name}`); console.log(`📄 总页数: ${result.totalPages}`); console.log(`⏱️ 用时: ${this.formatDuration(result.duration)}`); console.log(`📂 分类数: ${Object.keys(result.categories).length}`); console.log(`\n📚 分类统计:`); for (const [category, pages] of Object.entries(result.categories)) { console.log(` - ${this.formatCategoryName(category)}: ${pages.length} 页`); } console.log(`\n💾 存储位置:`); console.log(` - 原始数据: ${path.join(options.outputDir, 'raw', options.name)}`); console.log(` - 知识库: ${path.join(options.outputDir, 'indexed')}`); console.log(`\n💡 使用方式:`); console.log(` 1. 查看索引: cat ${path.join(options.outputDir, 'indexed', options.name)}-index.md`); console.log(` 2. AI 写作时会自动读取知识库 Markdown 文件`); console.log(`\n✅ 任务完成!\n`); } /** * 格式化分类名称 */ formatCategoryName(category) { const names = { 'getting-started': '入门指南', 'guide': '使用指南', 'api': 'API 参考', 'examples': '示例代码', 'advanced': '进阶内容', 'migration': '迁移指南', 'pdf-document': 'PDF 文档', 'other': '其他' }; return names[category] || category; } /** * 格式化持续时间 */ formatDuration(ms) { const seconds = Math.floor(ms / 1000); const minutes = Math.floor(seconds / 60); const hours = Math.floor(minutes / 60); if (hours > 0) { return `${hours}小时${minutes % 60}分`; } if (minutes > 0) { return `${minutes}${seconds % 60}秒`; } return `${seconds}秒`; } /** * 检测是否为动态页面 */ static isDynamicSite(url) { const dynamicPatterns = [ 'react', 'vue', 'angular', 'svelte', 'next.js', 'nuxt', 'spa' ]; const lowerUrl = url.toLowerCase(); return dynamicPatterns.some(pattern => lowerUrl.includes(pattern)); } } //# sourceMappingURL=crawler-manager.js.map