chen-crawler
Web Crawler Provider for Chen Framework
TypeScript
import { Model, StorageService } from 'chen/core';
import { HttpClient, HttpClientOptions, HttpClientResponse } from 'chen/web';
import { Queue, ProcessingList, QueueFilter } from '../queue';
import { Storage } from '../storage';
import * as cheerio from 'cheerio';
import * as events from 'events';
import * as urllib from 'url';
/**
 * Cheerio selector handle used to query fetched html
 */
export interface HtmlSelector extends cheerio.Static {
}
/**
 * Shape of a crawled page as persisted to storage
 */
export interface CrawledContent {
url: string;
title: string;
content: string;
}
/**
 * Predicate that decides whether a url should be rendered via the headless browser
 */
export interface HeadlessBrowserEnabler extends Function {
(url: urllib.Url): boolean;
}
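/*
 * Example (illustrative, not part of this declaration): an enabler that
 * turns on the headless browser only for an assumed /app path:
 *
 *   const enabler: HeadlessBrowserEnabler = (url) =>
 *     (url.pathname || '').startsWith('/app');
 */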
/**
 * Predicate that decides whether crawled content should be saved to storage
 */
export interface CrawledContentFilter extends Function {
(url: urllib.Url, data: CrawledContent): boolean;
}
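/*
 * Example (illustrative, not part of this declaration): keep only pages
 * with a non-trivial amount of extracted text:
 *
 *   const contentFilter: CrawledContentFilter = (url, data) =>
 *     data.content.trim().length > 200;
 */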
/**
 * Abstract Crawler base class; concrete crawlers implement crawl()
 */
export declare abstract class Crawler extends events.EventEmitter {
private name;
private startingUrl;
protected config: HttpClientOptions;
/**
* Filter applied to urls before they are queued
* @type {QueueFilter}
*/
private queueFilter;
/**
* Flag to determine if crawler is currently running
* @type {boolean}
*/
protected running: boolean;
/**
* Crawled content filter
* @type {CrawledContentFilter}
*/
private crawledContentFilter;
/**
* url queue
* @type {Queue}
*/
protected queue: Queue;
/**
* Flag or predicate that determines whether a page is rendered via the headless browser
* @type {boolean | HeadlessBrowserEnabler}
*/
private headlessBrowserEnabled;
/**
* Crawler http client
* @type {HttpClient}
*/
protected httpClient: HttpClient;
/**
* Flag that determines whether anchor tag links are followed
* @type {boolean}
*/
protected followHtmlLinks: boolean;
/**
* Storage for page html and text
* @type {StorageService<Model>}
*/
private storage;
/**
* Flag to determine if the headless browser is busy
* @type {boolean}
*/
private browserBusy;
/**
* ProcessingList instance
* @type {ProcessingList}
*/
protected inProcessList: ProcessingList;
/**
* Abstract crawler constructor
* @param {Storage} storage
* @param {string} name
* @param {string} startingUrl
* @param {HttpClientOptions} config
*/
constructor(storage: Storage, name: string, startingUrl: string, config: HttpClientOptions);
/**
* Get name
* @return {string}
*/
getName(): string;
/**
* Get starting url
* @return {string}
*/
getStartingUrl(): string;
/**
* Get http client configuration
* @return {HttpClientOptions}
*/
getConfig(): HttpClientOptions;
/**
* Set URL Queue Filter
* @param {QueueFilter} filter
* @return {this}
*/
setQueueFilter(filter: QueueFilter): this;
/**
* Set the filter deciding which crawled content is saved to storage
* @param {CrawledContentFilter} filter
* @return {this}
*/
setContentFilter(filter: CrawledContentFilter): this;
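/*
 * Example (illustrative sketch): wiring both filters. QueueFilter comes
 * from '../queue' and is assumed here to be a (url: urllib.Url) => boolean
 * predicate, by analogy with CrawledContentFilter:
 *
 *   crawler
 *     .setQueueFilter((url) => url.hostname === 'example.com')
 *     .setContentFilter((url, data) => data.title.length > 0);
 */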
/**
* Filter url
* @param {urllib.Url} url
* @return {boolean}
*/
protected filterQueue(url: urllib.Url): boolean;
/**
* Filter content
* @param {urllib.Url} url
* @param {CrawledContent} data
* @return {boolean}
*/
protected filterContent(url: urllib.Url, data: CrawledContent): boolean;
/**
* Check if the url matches the headless browser (ajax) filter
* @param {urllib.Url | string} url
* @return {boolean}
*/
protected isHeadlessBrowserEnabled(url: urllib.Url | string): boolean;
/**
* Enable crawling of ajax-rendered content via the headless browser
* @param {boolean | HeadlessBrowserEnabler = true} enable
* @return {this}
*/
useHeadlessBrowser(enable?: boolean | HeadlessBrowserEnabler): this;
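/*
 * Example (illustrative): enable the headless browser for every url, or
 * only for urls matched by a HeadlessBrowserEnabler predicate:
 *
 *   crawler.useHeadlessBrowser(); // all urls
 *   crawler.useHeadlessBrowser((url) => (url.pathname || '').startsWith('/spa'));
 */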
/**
* Get cheerio instance
* @param {string} body
* @return {HtmlSelector}
*/
protected loadHtml(body: string): HtmlSelector;
/**
* Load url
* @param {string} url
* @return {Promise<HttpClientResponse>}
*/
protected loadUrl(url: string): Promise<HttpClientResponse>;
/**
* Listen on fetch start event
* @param {(url: urllib.Url, worker: string) => void} fn
* @return {this}
*/
onFetchStart(fn: (url: urllib.Url, worker: string) => void): this;
/**
* Listen on fetch complete event
* @param {(url: urllib.Url, html: HtmlSelector, model: U, worker: string) => void} fn
* @return {this}
*/
onFetchComplete<U extends Model>(fn: (url: urllib.Url, html: HtmlSelector, model: U, worker: string) => void): this;
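/*
 * Example (illustrative): inspect the cheerio selector once a page has
 * been fetched and its model saved:
 *
 *   crawler.onFetchComplete((url, $, model, worker) => {
 *     console.log(worker, url.href, $('title').text());
 *   });
 */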
/**
* On fetch error
* @param {(url: urllib.Url, response: HttpClientResponse, worker: string) => void} fn
* @return {this}
*/
onFetchError(fn: (url: urllib.Url, response: HttpClientResponse, worker: string) => void): this;
/**
* Listen on error event
* @param {(err: Error) => void} fn
* @return {this}
*/
onError(fn: (err: Error) => void): this;
/**
* Listen on start event
* @param {() => void} fn
* @return {this}
*/
onStart(fn: () => void): this;
/**
* Listen on stop event
* @param {() => void} fn
* @return {this}
*/
onStop(fn: () => void): this;
/**
* Format url with additional filtering
* @param {urllib.Url} urlInfo
* @return {string}
*/
protected formatFromParsedUrl(urlInfo: urllib.Url): string;
/**
* Remove unnecessary url segments, such as the hash fragment
* @param {string} url
* @return {string}
*/
protected cleanUrl(url: string): string;
/**
* Set storage service
* @param {StorageService<Model>} service
* @return {this}
*/
setStorageService(service: StorageService<Model>): this;
/**
* Save crawled data
* @param {urllib.Url | string} url
* @param {HtmlSelector} select
* @return {Promise<Model>}
*/
protected saveContent(url: urllib.Url | string, select: HtmlSelector): Promise<Model>;
/**
* Save crawled data to storage
* @param {CrawledContent} data
* @return {Promise<Model>}
*/
protected insertData(data: CrawledContent): Promise<Model>;
/**
* Check if the url has already been crawled and saved to storage
* @param {string} url
* @return {Promise<Model>}
*/
protected getProcessed(url: string): Promise<Model>;
/**
* Extract urls from the html, then add them to the queue
* @param {string} url
* @param {HtmlSelector} select
* @return {Promise<void>}
*/
protected extractUrlsFromHtmlAndAddToQueue(url: string, select: HtmlSelector): Promise<void>;
/**
* Add to queue
* @param {string[]} urls
* @return {Promise<void>}
*/
protected addToQueue(urls: string[]): Promise<void>;
/**
* Extract urls from the given cheerio instance
* @param {string} baseUrl
* @param {HtmlSelector} htmlSelector
* @return {string[]}
*/
protected extractUrlsFromHtml(baseUrl: string, htmlSelector: HtmlSelector): string[];
/**
* Filter extracted urls
* @param {string[]} extractedUrls
* @return {Promise<string[]>}
*/
protected filterExtractedUrls(extractedUrls: string[]): Promise<string[]>;
/**
* Load Url via browser
* @param {string} url
* @return {Promise<string>}
*/
protected loadUrlFromBrowser(url: string): Promise<string>;
/**
* Crawl given url
* @param {string} url
* @param {string} worker
* @return {Promise<void>}
*/
protected crawlUrlViaHttpClient(url: string, worker: string): Promise<void>;
/**
* Crawl the given url via the headless browser, managing the queue for browser crawling
* @param {string} url
* @param {string} worker
* @return {Promise<void>}
*/
protected crawlUrlViaHeadlessBrowser(url: string, worker: string): Promise<void>;
/**
* Start crawler
* @return {Promise<void>}
*/
start(): Promise<void>;
/**
* Check if the url is already being processed
* @param {string} url
* @return {Promise<boolean>}
*/
protected inProcess(url: string): Promise<boolean>;
/**
* Determine whether to use the headless browser, then crawl the url
* @param {string} url
* @param {string} worker
* @return {Promise<void>}
*/
protected crawlUrl(url: string, worker: string): Promise<void>;
/**
* Run the crawl; implemented by concrete crawlers
* @return {Promise<void>}
*/
protected abstract crawl(): Promise<void>;
}
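/*
 * Usage sketch: hypothetical consumer code (e.g. a separate my-crawler.ts),
 * not part of this declaration file. It assumes the package is importable
 * as 'chen-crawler' and re-exports Storage, and that a Storage instance and
 * HttpClientOptions are obtainable from the framework; only the abstract
 * crawl() method must be implemented, here as a minimal single-url pass
 * built from the protected crawlUrl() declared above.
 */
import { Crawler, Storage } from 'chen-crawler';
import { HttpClientOptions } from 'chen/web';

declare const storage: Storage;           // assumed: provided by the framework
declare const config: HttpClientOptions;  // assumed: provided by the framework

class DocsCrawler extends Crawler {
  constructor() {
    super(storage, 'docs', 'https://example.com/', config);
  }

  // Minimal crawl pass: process the starting url with a single worker.
  protected async crawl(): Promise<void> {
    await this.crawlUrl(this.getStartingUrl(), 'worker-1');
  }
}

const crawler = new DocsCrawler();
crawler
  .setContentFilter((url, data) => data.content.trim().length > 0)
  .onStart(() => console.log('crawl started'))
  .onStop(() => console.log('crawl stopped'));
crawler.start().catch((err) => console.error(err));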