crawlerzzz
import { BehaviorSubject, of } from 'rxjs';
import {
  takeUntil,
  distinct,
  concatMap,
  delay,
  mergeMap,
  filter,
  bufferTime
} from 'rxjs/operators';
import * as R from 'ramda';
import * as puppeteer from 'puppeteer';

export type LiveCrawlerOptions = {
  domain?: string; // base URL prepended to relative links
  processPage: (page: puppeteer.Page, link: LinkType) => Promise<void> | void;
  startFrom?: string; // first path (or absolute URL) to visit
  noDuplicatedLink?: boolean; // skip URLs that have already been queued
  manualLogin?: boolean; // open a visible browser and wait for a manual login
  secondsWaitForLogin?: number;
  rateLimit?: number; // delay in ms before each page visit
  exitOnIdleSeconds?: number; // exit when no new link arrives for this long
  launchConfig?: object;
  pageConfig?: { viewport: { width: number; height: number } };
};

export type LinkType = {
  url: string;
  data?: any;
};

const liveCrawler = (opts: LiveCrawlerOptions) => {
  // Merge the caller's options over the built-in defaults.
  let {
    domain,
    startFrom,
    noDuplicatedLink,
    manualLogin,
    processPage,
    rateLimit,
    exitOnIdleSeconds,
    secondsWaitForLogin,
    launchConfig,
    pageConfig
  } = {
    startFrom: '',
    secondsWaitForLogin: 45,
    noDuplicatedLink: true,
    exitOnIdleSeconds: 60,
    rateLimit: 0,
    pageConfig: {
      viewport: {
        width: 1920,
        height: 1080
      }
    },
    ...opts
  };
  // Keep the writable subject separate from the stream the pipeline consumes,
  // so queueLink() can keep pushing links into it after the pipes are set up.
  const linkSubject$ = new BehaviorSubject<LinkType>({ url: startFrom });
  let link$ = noDuplicatedLink
    ? linkSubject$.pipe(distinct(R.prop('url')))
    : linkSubject$.asObservable();
  if (manualLogin) {
    // A visible browser with a persistent profile keeps the session between runs.
    launchConfig = {
      headless: false,
      userDataDir: '~/.browsercache',
      ...launchConfig
    };
  }
  const crawl = async () => {
    const browser = await puppeteer.launch(launchConfig);
    let page = await browser.newPage();
    await page.setViewport(pageConfig.viewport);
    if (manualLogin) {
      // Open the start page, give the user time to log in by hand,
      // then continue crawling in a fresh tab.
      await page.goto((domain || '') + startFrom);
      await new Promise(resolve => setTimeout(resolve, secondsWaitForLogin * 1000));
      page = await browser.newPage();
      await page.setViewport(pageConfig.viewport);
    }
    // Navigate to a link, hand the page to the caller's processPage, and
    // return the resulting HTML along with the link's metadata.
    const workOnPage = async (link: LinkType) => {
      const { url = '', data } = link;
      await page.goto(url.startsWith('http') ? url : (domain || '') + url);
      await processPage(page, link);
      const src = await page.content();
      return {
        src,
        url,
        data
      };
    };
    // Visit queued links one at a time, waiting rateLimit ms before each visit.
    const pageSource$ = link$.pipe(
      concatMap(link =>
        of(link).pipe(
          delay(rateLimit || 0),
          mergeMap(workOnPage)
        )
      )
    );
    // Exit once no link has been emitted for exitOnIdleSeconds.
    const idleTimeout$ = link$.pipe(
      bufferTime(exitOnIdleSeconds * 1000),
      filter(buffered => buffered.length === 0)
    );
    idleTimeout$.subscribe(() => {
      console.log(
        `Crawler has been idle for ${exitOnIdleSeconds} seconds. Exiting...`
      );
      process.exit();
    });
    link$ = link$.pipe(takeUntil(idleTimeout$));
    const queueLink = (url: string, data?: any) =>
      linkSubject$.next({ url, data });
    return {
      pageSource$,
      link$,
      queueLink
    };
  };
  return crawl();
};

export default liveCrawler;
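
/*
 * Usage sketch (illustrative only): the site, the anchor-extraction logic,
 * and the relative import path below are assumptions, not part of this
 * module. It shows the intended loop: processPage inspects each visited
 * page, and the pageSource$ subscriber (or processPage itself, via a
 * captured queueLink) feeds newly discovered links back into the crawler.
 *
 *   import liveCrawler from './crawlerzzz';
 *
 *   // Placeholder until the real queueLink is available.
 *   let queue: (url: string, data?: any) => void = () => {};
 *
 *   liveCrawler({
 *     domain: 'https://example.com',
 *     startFrom: '/',
 *     rateLimit: 1000, // wait 1s before each navigation
 *     processPage: async page => {
 *       // Collect same-site relative links and queue them for crawling.
 *       const hrefs = await page.$$eval('a[href]', anchors =>
 *         anchors.map(a => a.getAttribute('href') || '')
 *       );
 *       hrefs.filter(href => href.startsWith('/')).forEach(href => queue(href));
 *     }
 *   }).then(({ pageSource$, queueLink }) => {
 *     queue = queueLink;
 *     // Subscribing starts the crawl from startFrom.
 *     pageSource$.subscribe(({ url, src }) => {
 *       console.log(`crawled ${url}: ${src.length} bytes`);
 *     });
 *   });
 */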