UNPKG

article-writer-cn

Version:

AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作

68 lines 1.94 kB
/**
 * Documentation crawler - static page crawling.
 */
import TurndownService from 'turndown';
import pLimit from 'p-limit';
import type { CrawlerConfig, CrawledPage, CodeExample, CrawlProgress, CrawlResult } from './types.js';
/**
 * Type declaration for the documentation crawler class.
 *
 * This is an ambient declaration only: it describes the shape of the class
 * (state fields, the public `crawl` / `saveResults` entry points and the
 * protected helper hooks) but contains no implementation. Behavioural notes
 * below that go beyond the signatures are inferred from names and are marked
 * as assumptions to confirm against the implementation.
 */
export declare class DocumentationCrawler {
    /** Crawler configuration. Presumably the value passed to the constructor - TODO confirm. */
    protected config: CrawlerConfig;
    /** Set of URL strings; by its name, the URLs already visited - TODO confirm usage. */
    protected visitedUrls: Set<string>;
    /** List of URL strings; by its name, the URLs still waiting to be crawled - TODO confirm usage. */
    protected urlQueue: string[];
    /** Pages collected so far, as `CrawledPage` records. */
    protected crawledPages: CrawledPage[];
    /** `TurndownService` instance; presumably used for HTML-to-Markdown conversion - TODO confirm. */
    protected turndownService: TurndownService;
    /** Value of the type returned by `pLimit`; presumably bounds concurrent page requests - TODO confirm. */
    protected limit: ReturnType<typeof pLimit>;
    /** Numeric start marker; presumably a millisecond timestamp taken when crawling begins - TODO confirm. */
    protected startTime: number;
    /**
     * Create a crawler.
     *
     * @param config Crawler configuration (shape defined by `CrawlerConfig` in `./types.js`).
     */
    constructor(config: CrawlerConfig);
    /**
     * Start crawling.
     *
     * @param onProgress Optional callback that receives a `CrawlProgress` value.
     *   When and how often it is invoked is not visible in this declaration.
     * @returns A promise resolving to the `CrawlResult`.
     */
    crawl(onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult>;
    /**
     * Crawl a single page.
     *
     * @param url URL of the page to crawl.
     * @returns A promise that resolves with no value once the page has been handled.
     */
    protected crawlPage(url: string): Promise<void>;
    /**
     * Extract the title.
     *
     * @param $ Untyped document handle (declared as `any`); presumably a
     *   cheerio-style query function for the loaded page - TODO confirm.
     * @returns The title as a string.
     */
    protected extractTitle($: any): string;
    /**
     * Extract the main content.
     *
     * @param $ Untyped document handle (declared as `any`); presumably a
     *   cheerio-style query function for the loaded page - TODO confirm.
     * @returns The main content as a string (output format not visible here).
     */
    protected extractMainContent($: any): string;
    /**
     * Extract code examples.
     *
     * @param $ Untyped document handle (declared as `any`); presumably a
     *   cheerio-style query function for the loaded page - TODO confirm.
     * @returns The code examples found, as `CodeExample` records.
     */
    protected extractCodeExamples($: any): CodeExample[];
    /**
     * Detect the language of a code snippet.
     *
     * @param $code Untyped handle (declared as `any`); presumably the wrapped
     *   code element being inspected - TODO confirm.
     * @returns The detected language as a string.
     */
    protected detectLanguage($code: any): string;
    /**
     * Extract links.
     *
     * @param $ Untyped document handle (declared as `any`); presumably a
     *   cheerio-style query function for the loaded page - TODO confirm.
     * @param currentUrl URL of the page being processed; presumably the base
     *   for resolving relative links - TODO confirm.
     * @returns The extracted links as strings.
     */
    protected extractLinks($: any, currentUrl: string): string[];
    /**
     * Add links to the queue.
     *
     * @param links Link URLs to enqueue.
     */
    protected addLinksToQueue(links: string[]): void;
    /**
     * Decide whether the given link should be crawled.
     *
     * @param url Candidate URL.
     * @returns `true` if the URL should be crawled, otherwise `false`.
     */
    protected shouldCrawl(url: string): boolean;
    /**
     * Infer the category.
     *
     * @param url Page URL.
     * @param title Page title.
     * @returns The inferred category as a string.
     */
    protected inferCategory(url: string, title: string): string;
    /**
     * Category statistics.
     *
     * @returns A record mapping a string key (presumably the category name -
     *   TODO confirm) to the `CrawledPage` entries filed under it.
     */
    protected categorizePages(): Record<string, CrawledPage[]>;
    /**
     * Save the results.
     *
     * @param outputDir Output directory path (as named by the parameter).
     * @returns A promise that resolves with no value when saving has finished.
     */
    saveResults(outputDir: string): Promise<void>;
    /**
     * Presumably resolves after `ms` milliseconds - implementation not visible here.
     *
     * @param ms Numeric duration (milliseconds, judging by the parameter name).
     */
    protected sleep(ms: number): Promise<void>;
    /**
     * Presumably converts text into a URL/file-name friendly slug - TODO confirm.
     *
     * @param text Input text.
     * @returns The derived string.
     */
    protected slugify(text: string): string;
    /**
     * Presumably renders a duration as human-readable text - TODO confirm.
     *
     * @param ms Numeric duration (milliseconds, judging by the parameter name).
     * @returns The formatted string.
     */
    protected formatDuration(ms: number): string;
}
//# sourceMappingURL=doc-crawler.d.ts.map