/**
 * crawlerzzz
 * Version: (unspecified)
 * 84 lines (70 loc) • 1.68 kB — text/typescript
 */
import axios from 'axios';
import { BehaviorSubject, merge, of } from 'rxjs';
import {
  bufferTime,
  concatMap,
  delay,
  distinct,
  filter,
  flatMap,
  share,
  takeUntil
} from 'rxjs/operators';
import * as R from 'ramda';
/** Configuration accepted by {@link staticCrawler}. */
type StaticCrawlerOptions = {
// Base URL prepended to relative links, e.g. "https://example.com".
domain: string;
// First URL to crawl; defaults to '' (i.e. the bare domain).
startFrom?: string;
// When true (the default), each distinct URL is crawled at most once.
noDuplicatedLink?: boolean;
// Minimum delay between requests, in milliseconds (default 0).
rateLimit?: number;
// Exit the process after this many seconds without activity (default 60).
exitOnIdleSeconds?: number;
};
const staticCrawler = (opts: StaticCrawlerOptions) => {
const {
domain,
startFrom,
rateLimit,
noDulicatedLink,
exitOnIdleSeconds
} = {
startFrom: '',
rateLimit: 0,
noDulicatedLink: true,
exitOnIdleSeconds: 60,
...opts
};
type SubjectDataType = {
url: string;
data?: any;
};
let link$ = new BehaviorSubject<SubjectDataType>({
url: startFrom
}).pipe(noDulicatedLink ? distinct(R.prop('url')) : R.identity);
const concatUrl = (url: string) =>
/^https?/.test(url) ? url : domain + url;
const requestPromise = ({ url = '', data = null }) =>
axios.get(concatUrl(url)).then(res => ({
src: res.data,
url,
data
}));
const pageSource$ = link$.pipe(
//@ts-ignore
concatMap(link => of(link).pipe(delay(rateLimit))),
flatMap(requestPromise)
);
const idleTimeout$ = merge(pageSource$, link$).pipe(
bufferTime(exitOnIdleSeconds * 1000),
filter(buffered => buffered.length == 0)
);
idleTimeout$.subscribe(() => {
console.log(
`Crawler has been idle for ${exitOnIdleSeconds}ms. Exiting...`
);
//@ts-ignore
process.exit();
});
link$ = link$.pipe(takeUntil(idleTimeout$));
// @ts-ignore
const queueLink = (url: string, data = null) => link$.next({ url, data });
return { pageSource$, link$, queueLink };
};
// Sole export: the crawler factory defined above.
export default staticCrawler;