chen-crawler
Version:
Web Crawler Provider for Chen Framework
69 lines (68 loc) • 1.71 kB
TypeScript
import { Crawler } from './base';
import { HttpClientOptions } from 'chen/web';
import { Storage } from '../storage';
import * as cheerio from 'cheerio';
/**
* Selector type used for XML content loaded through cheerio.
*
* Extends `cheerio.Static` and declares no members of its own, so it is
* structurally the same as `cheerio.Static`. The doc comment on
* `SitemapCrawler.loadXml` names this interface as its return type.
*/
export interface XmlSelector extends cheerio.Static {
}
/**
* SitemapCrawler class.
*
* A `Crawler` subclass whose private helpers deal with sitemap content
* (plain and gzip-compressed, per their doc comments). This is a declaration
* only: method bodies are not present here, and private members are listed
* without parameter or return types. The types given in the doc comments of
* private members are therefore documented, not declared.
*/
export declare class SitemapCrawler extends Crawler {
/**
* Whether to follow anchor tag links.
* @type {boolean}
*/
protected followHtmlLinks: boolean;
/**
* Worker process ID.
* @type {string}
*/
protected worker: string;
/**
* SitemapCrawler constructor.
* @param {Storage} storage Storage instance (imported from '../storage').
* @param {string} name Crawler name.
* @param {string} startingUrl Starting URL (presumably where crawling begins; confirm against the implementation).
* @param {HttpClientOptions} config HTTP client options (from 'chen/web').
*/
constructor(storage: Storage, name: string, startingUrl: string, config: HttpClientOptions);
/**
* Load XML into cheerio.
* Parameter and return types below are as documented; the declaration itself is untyped.
* @param {string} content
* @return {XmlSelector}
*/
private loadXml(content);
/**
* Extract URLs from sitemap content.
* Parameter and return types below are as documented; the declaration itself is untyped.
* @param {string} content
* @return {string[]}
*/
private extractUrlsFromSitemap(content);
/**
* Extract URLs from gzip content.
* Parameter and return types below are as documented; the declaration itself is untyped.
* @param {Buffer} body
* @return {Promise<string[]>}
*/
private extractUrlsFromGzip(body);
/**
* Crawl a sitemap URL.
* The return type is not visible in this declaration.
* @param {string} sitemapUrl
*/
private crawlSitemapUrl(sitemapUrl);
/**
* Extract URLs from a sitemap, then add them to the queue.
* Parameter and return types below are as documented; the declaration itself is untyped.
* @param {string} sitemapContent
* @return {Promise<void>}
*/
private extractUrlsFromSitemapAndAddToQueue(sitemapContent);
/**
* Crawl URL data.
* Takes no parameters; the return type is not visible in this declaration.
*/
private crawlUrlData();
/**
* Start crawling.
* @return {Promise<void>} Promise that carries no value.
*/
protected crawl(): Promise<void>;
}