chen-crawler

Version:

Web Crawler Provider for Chen Framework

69 lines (68 loc) • 1.71 kB

TypeScript

import { Crawler } from './base';
import { HttpClientOptions } from 'chen/web';
import { Storage } from '../storage';
import * as cheerio from 'cheerio';

/**
 * XmlSelector interface.
 *
 * Adds no members of its own: it is structurally identical to
 * `cheerio.Static` and only gives that type an XML-specific name.
 */
export interface XmlSelector extends cheerio.Static { }

/**
 * Sitemap crawler class.
 *
 * Type declaration only; the method bodies are not part of this file.
 *
 * NOTE: the private members below are declared without parameter or
 * return types, so the JSDoc tags on them are the only record of the
 * intended types in this file and are not checked by the compiler.
 */
export declare class SitemapCrawler extends Crawler {
    /**
     * Whether to follow anchor tag links
     * @type {boolean}
     */
    protected followHtmlLinks: boolean;

    /**
     * Worker Process ID
     * @type {string}
     */
    protected worker: string;

    /**
     * SitemapCrawler constructor
     * @param {Storage} storage Storage instance (type imported from '../storage')
     * @param {string} name Name of this crawler
     * @param {string} startingUrl URL to start from (presumably a sitemap URL; confirm against the implementation)
     * @param {HttpClientOptions} config HTTP client options (type imported from 'chen/web')
     */
    constructor(storage: Storage, name: string, startingUrl: string, config: HttpClientOptions);

    /**
     * Load xml to cheerio
     * @param {string} content
     * @return {XmlSelector}
     */
    private loadXml(content);

    /**
     * Extract urls from sitemap
     * @param {string} content
     * @return {string[]}
     */
    private extractUrlsFromSitemap(content);

    /**
     * Extract urls from gzip content
     * @param {Buffer} body
     * @return {Promise<string[]>}
     */
    private extractUrlsFromGzip(body);

    /**
     * Crawl sitemap url
     * @param {string} sitemapUrl
     */
    private crawlSitemapUrl(sitemapUrl);

    /**
     * Extract urls from sitemap then add to queue
     * @param {string} sitemapContent
     * @return {Promise<void>}
     */
    private extractUrlsFromSitemapAndAddToQueue(sitemapContent);

    /**
     * Crawl url data
     */
    private crawlUrlData();

    /**
     * Start crawling
     * @return {Promise<void>}
     */
    protected crawl(): Promise<void>;
}