article-writer-cn
Version:
AI 驱动的智能写作系统 - 专注公众号/自媒体文章创作
68 lines • 1.94 kB
TypeScript
/**
* Documentation crawler - static page crawling
*/
import TurndownService from 'turndown';
import pLimit from 'p-limit';
import type { CrawlerConfig, CrawledPage, CodeExample, CrawlProgress, CrawlResult } from './types.js';
/**
* Documentation crawler for static pages.
*
* This is a type declaration only; no implementation is visible here.
* NOTE(review): apart from the translated one-line summaries, the behavior
* notes on individual members are inferred from names and signatures and
* should be confirmed against the implementation.
*/
export declare class DocumentationCrawler {
/** Crawler configuration (presumably the value given to the constructor - confirm). */
protected config: CrawlerConfig;
/** Set of URL strings; presumably the URLs already visited, used to avoid re-crawling - confirm. */
protected visitedUrls: Set<string>;
/** List of URL strings; presumably the URLs still waiting to be crawled - confirm. */
protected urlQueue: string[];
/** Pages collected so far (presumably appended to as each page is crawled - confirm). */
protected crawledPages: CrawledPage[];
/** Turndown instance; presumably used to convert page HTML to Markdown - confirm. */
protected turndownService: TurndownService;
/** Limiter created by `pLimit`; presumably caps the number of concurrent page fetches - confirm. */
protected limit: ReturnType<typeof pLimit>;
/** Numeric timestamp; presumably the crawl start time in epoch milliseconds - TODO confirm unit. */
protected startTime: number;
/**
* Create a crawler.
*
* @param config - Crawler configuration.
*/
constructor(config: CrawlerConfig);
/**
* Start crawling.
*
* @param onProgress - Optional callback that receives a `CrawlProgress`
*   value (when and how often it is invoked is not visible here).
* @returns A promise for the `CrawlResult`.
*/
crawl(onProgress?: (progress: CrawlProgress) => void): Promise<CrawlResult>;
/**
* Crawl a single page.
*
* @param url - URL of the page to crawl.
*/
protected crawlPage(url: string): Promise<void>;
/**
* Extract the title.
*
* @param $ - Typed `any`; presumably a Cheerio-style document root for the
*   loaded page - verify against the implementation.
* @returns The title as a string.
*/
protected extractTitle($: any): string;
/**
* Extract the main content.
*
* @param $ - Typed `any`; presumably a Cheerio-style document root - verify.
* @returns The main content as a string (presumably Markdown produced via
*   `turndownService` - confirm).
*/
protected extractMainContent($: any): string;
/**
* Extract code examples.
*
* @param $ - Typed `any`; presumably a Cheerio-style document root - verify.
* @returns The code examples found, as `CodeExample` values.
*/
protected extractCodeExamples($: any): CodeExample[];
/**
* Detect the code language.
*
* @param $code - Typed `any`; presumably a wrapped code element - verify.
* @returns The language as a string (the value used when no language can be
*   determined is not visible here).
*/
protected detectLanguage($code: any): string;
/**
* Extract links.
*
* @param $ - Typed `any`; presumably a Cheerio-style document root - verify.
* @param currentUrl - URL of the current page (presumably the base for
*   resolving relative links - confirm).
* @returns The extracted links as strings.
*/
protected extractLinks($: any, currentUrl: string): string[];
/**
* Add links to the queue.
*
* @param links - Links to enqueue (presumably filtered through
*   `shouldCrawl` before being added to `urlQueue` - confirm).
*/
protected addLinksToQueue(links: string[]): void;
/**
* Decide whether the given link should be crawled.
*
* @param url - Candidate URL.
* @returns `true` if the link should be crawled, otherwise `false`.
*/
protected shouldCrawl(url: string): boolean;
/**
* Infer a category.
*
* @param url - Page URL.
* @param title - Page title.
* @returns The inferred category as a string.
*/
protected inferCategory(url: string, title: string): string;
/**
* Group pages by category.
*
* @returns A record of `CrawledPage` lists keyed by string (presumably the
*   category name, grouping `crawledPages` - confirm).
*/
protected categorizePages(): Record<string, CrawledPage[]>;
/**
* Save the results.
*
* @param outputDir - Output directory (file layout and formats are not
*   visible here).
*/
saveResults(outputDir: string): Promise<void>;
/** Presumably resolves after `ms` milliseconds - confirm against the implementation. */
protected sleep(ms: number): Promise<void>;
/** Presumably converts `text` into a URL/file-name-safe slug - exact rules not visible here. */
protected slugify(text: string): string;
/** Presumably renders a duration given in milliseconds as human-readable text - confirm format. */
protected formatDuration(ms: number): string;
}
//# sourceMappingURL=doc-crawler.d.ts.map