chen-crawler

Web Crawler Provider for Chen Framework

import { Model, StorageService } from 'chen/core';
import { HttpClient, HttpClientOptions, HttpClientResponse } from 'chen/web';
import { Queue, ProcessingList, QueueFilter } from '../queue';
import { Storage } from '../storage';
import * as cheerio from 'cheerio';
import * as events from 'events';
import * as urllib from 'url';

/**
 * HtmlSelector interface
 */
export interface HtmlSelector extends cheerio.Static {
}

/**
 * CrawledContent interface
 */
export interface CrawledContent {
    url: string;
    title: string;
    content: string;
}

/**
 * HeadlessBrowserEnabler interface
 */
export interface HeadlessBrowserEnabler extends Function {
    (url: urllib.Url): boolean;
}

/**
 * CrawledContentFilter interface
 */
export interface CrawledContentFilter extends Function {
    (url: urllib.Url, data: CrawledContent): boolean;
}

/**
 * Abstract Crawler class
 */
export declare abstract class Crawler extends events.EventEmitter {
    private name;
    private startingUrl;
    protected config: HttpClientOptions;
    /**
     * URL filter
     * @type {QueueFilter}
     */
    private queueFilter;
    /**
     * Flag to determine if crawler is currently running
     * @type {boolean}
     */
    protected running: boolean;
    /**
     * Crawled content filter
     * @type {CrawledContentFilter}
     */
    private crawledContentFilter;
    /**
     * url queue
     * @type {Queue}
     */
    protected queue: Queue;
    /**
     * Flag to determine if page is rendered via ajax
     * @type {boolean | HeadlessBrowserEnabler}
     */
    private headlessBrowserEnabled;
    /**
     * Crawler http client
     * @type {HttpClient}
     */
    protected httpClient: HttpClient;
    /**
     * Flag whether to crawl anchor tag links
     * @type {boolean}
     */
    protected followHtmlLinks: boolean;
    /**
     * Storage for page html and text
     * @type {StorageService<Model>}
     */
    private storage;
    /**
     * Browser is processing
     * @type {boolean}
     */
    private browserBusy;
    /**
     * ProcessingList instance
     * @type {ProcessingList}
     */
    protected inProcessList: ProcessingList;
    /**
     * Abstract crawler constructor
     * @param {Storage} storage
     * @param {string} private name
     * @param {string} private startingUrl
     * @param {HttpClientOptions} protected config
     */
    constructor(storage: Storage, name: string, startingUrl: string, config: HttpClientOptions);
    /**
     * Get name
     * @return {string}
     */
    getName(): string;
    /**
     * Get starting url
     * @return {string}
     */
    getStartingUrl(): string;
    /**
     * Get http client configuration
     * @return {HttpClientOptions}
     */
    getConfig(): HttpClientOptions;
    /**
     * Set URL Queue Filter
     * @param {QueueFilter} filter
     * @return {this}
     */
    setQueueFilter(filter: QueueFilter): this;
    /**
     * Set content filter to be saved in the database
     * @param {CrawledContentFilter} filter
     * @return {this}
     */
    setContentFilter(filter: CrawledContentFilter): this;
    /**
     * Filter url
     * @param {urllib.Url} url
     * @return {boolean}
     */
    protected filterQueue(url: urllib.Url): boolean;
    /**
     * Filter content
     * @param {urllib.Url} url
     * @param {CrawledContent} data
     * @return {boolean}
     */
    protected filterContent(url: urllib.Url, data: CrawledContent): boolean;
    /**
     * Check if url matches the URL ajax filter
     * @param {urllib.Url | string} url
     * @return {boolean}
     */
    protected isHeadlessBrowserEnabled(url: urllib.Url | string): boolean;
    /**
     * Flag for enabling crawler for ajax rendered content
     * @param {boolean | HeadlessBrowserEnabler = true} enable
     * @return {this}
     */
    useHeadlessBrowser(enable?: boolean | HeadlessBrowserEnabler): this;
    /**
     * Get cheerio instance
     * @param {string} body
     * @return {HtmlSelector}
     */
    protected loadHtml(body: string): HtmlSelector;
    /**
     * Load url
     * @param {string} url
     * @return {Promise<HttpClientResponse>}
     */
    protected loadUrl(url: string): Promise<HttpClientResponse>;
    /**
     * Listen on fetch start event
     * @param {(urllib.Url, worker) => void} fn
     * @return {this}
     */
    onFetchStart(fn: (url: urllib.Url, worker: string) => void): this;
    /**
     * Listen on fetch complete event
     * @param {(HtmlSelector) => void} fn
     * @return {this}
     */
    onFetchComplete<U extends Model>(fn: (url: urllib.Url, html: HtmlSelector, model: U, worker: string) => void): this;
    /**
     * On fetch error
     * @param {(urllib.Url, HttpClientResponse, worker) => void} fn
     * @return {this}
     */
    onFetchError(fn: (url: urllib.Url, response: HttpClientResponse, worker: string) => void): this;
    /**
     * Listen on error event
     * @param {(err) => void} fn
     * @return {this}
     */
    onError(fn: (err) => void): this;
    /**
     * Listen on start event
     * @param {() => void} fn
     * @return {this}
     */
    onStart(fn: () => void): this;
    /**
     * Listen on stop event
     * @param {() => void} fn
     * @return {this}
     */
    onStop(fn: () => void): this;
    /**
     * Format url with additional filtering
     * @param {urllib.Url} urlInfo
     * @return {string}
     */
    protected formatFromParsedUrl(urlInfo: urllib.Url): string;
    /**
     * Remove unnecessary segments in url like hash
     * @param {string} url
     * @return {string}
     */
    protected cleanUrl(url: string): string;
    /**
     * Set storage service
     * @param {StorageService<Model>} service
     * @return {this}
     */
    setStorageService(service: StorageService<Model>): this;
    /**
     * Save crawled data
     * @param {urllib.Url | string} url
     * @param {HtmlSelector} select
     * @return {Promise<Model>}
     */
    protected saveContent(url: urllib.Url | string, select: HtmlSelector): Promise<Model>;
    /**
     * Save crawled data to storage
     * @param {CrawledContent} data
     * @return {Promise<Model>}
     */
    protected insertData(data: CrawledContent): Promise<Model>;
    /**
     * Check if url is already crawled and saved to storage
     * @param {string} url
     * @return {Promise<Model>}
     */
    protected getProcessed(url: string): Promise<Model>;
    /**
     * Extract urls then add to queue
     * @param {string} url
     * @param {HtmlSelector} select
     * @return {Promise<void>}
     */
    protected extractUrlsFromHtmlAndAddToQueue(url: string, select: HtmlSelector): Promise<void>;
    /**
     * Add to queue
     * @param {string[]} urls
     * @return {Promise<void>}
     */
    protected addToQueue(urls: string[]): Promise<void>;
    /**
     * Extract urls from given cheerio instance
     * @param {string} baseUrl
     * @param {HtmlSelector} htmlSelector
     * @return {string[]}
     */
    protected extractUrlsFromHtml(baseUrl: string, htmlSelector: HtmlSelector): string[];
    /**
     * Filter extracted urls
     * @param {string[]} extractedUrls
     * @return {Promise<string[]>}
     */
    protected filterExtractedUrls(extractedUrls: string[]): Promise<string[]>;
    /**
     * Load url via browser
     * @param {string} url
     * @return {Promise<string>}
     */
    protected loadUrlFromBrowser(url: string): Promise<string>;
    /**
     * Crawl given url
     * @param {string} url
     * @param {string} worker
     * @return {Promise<void>}
     */
    protected crawlUrlViaHttpClient(url: string, worker: string): Promise<void>;
    /**
     * Crawl via browser and control the queue for browser crawling
     * @param {string} url
     * @param {string} worker
     * @return {Promise<void>}
     */
    protected crawlUrlViaHeadlessBrowser(url: string, worker: string): Promise<void>;
    /**
     * Start crawler
     * @return {Promise<void>}
     */
    start(): Promise<void>;
    /**
     * Check if url is already in process
     * @param {string} url
     * @return {Promise<boolean>}
     */
    protected inProcess(url: string): Promise<boolean>;
    /**
     * Check whether to use headless browser or not then crawl the url
     * @param {string} url
     * @param {string} worker
     * @return {Promise<void>}
     */
    protected crawlUrl(url: string, worker: string): Promise<void>;
    /**
     * Crawl
     * @return {Promise<void>}
     */
    protected abstract crawl(): Promise<void>;
}
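
/*
 * Usage sketch (not part of this declaration file): a minimal concrete crawler built
 * against the API declared above. It assumes an existing chen-crawler Storage instance
 * and HttpClientOptions supplied by the host application; the class name, urls, import
 * paths, and worker label are illustrative assumptions. QueueFilter's exact shape lives
 * in '../queue' and is assumed here to be `(url: urllib.Url) => boolean`.
 */
import { Crawler } from 'chen-crawler';          // import path assumed
import { Storage } from 'chen-crawler/storage';  // import path assumed
import { HttpClientOptions } from 'chen/web';

declare const storage: Storage;               // provided by the application (assumed)
declare const httpOptions: HttpClientOptions; // http client configuration (assumed)

class DocsCrawler extends Crawler {
    constructor(storage: Storage, config: HttpClientOptions) {
        super(storage, 'docs-crawler', 'https://example.com/docs/', config);
    }

    /**
     * Minimal crawl strategy: fetch the starting url; `crawlUrl` chooses between the
     * http client and the headless browser and queues any links it extracts.
     */
    protected async crawl(): Promise<void> {
        await this.crawlUrl(this.getStartingUrl(), 'worker-1');
    }
}

const crawler = new DocsCrawler(storage, httpOptions);

crawler
    .setQueueFilter(url => (url.hostname || '') === 'example.com')            // QueueFilter shape assumed
    .setContentFilter((url, data) => data.content.trim().length > 0)          // skip empty pages
    .useHeadlessBrowser(url => (url.pathname || '').startsWith('/docs/app'))  // ajax-rendered section (assumed)
    .onFetchComplete((url, $, model, worker) => {
        console.log(`[${worker}] ${url.href} -> ${$('title').text()}`);
    })
    .onError(err => console.error(err));

crawler.start();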