/**
 * crawlerzzz
 * Version: (unspecified)
 * 84 lines (70 loc) • 1.68 kB — text/typescript
 */
import axios from 'axios';
import { BehaviorSubject, merge, of } from 'rxjs';
import {
  bufferTime,
  concatMap,
  delay,
  distinct,
  filter,
  flatMap,
  share,
  takeUntil
} from 'rxjs/operators';
import * as R from 'ramda';
/** Configuration accepted by {@link staticCrawler}. */
type StaticCrawlerOptions = {
// Base URL prepended to relative links, e.g. "https://example.com".
domain: string;
// First URL to crawl; defaults to '' (i.e. the bare domain).
startFrom?: string;
// When true (the default), each distinct URL is crawled at most once.
noDuplicatedLink?: boolean;
// Minimum delay between requests, in milliseconds (default 0).
rateLimit?: number;
// Exit the process after this many seconds without activity (default 60).
exitOnIdleSeconds?: number;
};
const staticCrawler = (opts: StaticCrawlerOptions) => {
const {
domain,
startFrom,
rateLimit,
noDulicatedLink,
exitOnIdleSeconds
} = {
startFrom: '',
rateLimit: 0,
noDulicatedLink: true,
exitOnIdleSeconds: 60,
...opts
};
type SubjectDataType = {
url: string;
data?: any;
};
let link$ = new BehaviorSubject<SubjectDataType>({
url: startFrom
}).pipe(noDulicatedLink ? distinct(R.prop('url')) : R.identity);
const concatUrl = (url: string) =>
/^https?/.test(url) ? url : domain + url;
const requestPromise = ({ url = '', data = null }) =>
axios.get(concatUrl(url)).then(res => ({
src: res.data,
url,
data
}));
const pageSource$ = link$.pipe(
//@ts-ignore
concatMap(link => of(link).pipe(delay(rateLimit))),
flatMap(requestPromise)
);
const idleTimeout$ = merge(pageSource$, link$).pipe(
bufferTime(exitOnIdleSeconds * 1000),
filter(buffered => buffered.length == 0)
);
idleTimeout$.subscribe(() => {
console.log(
`Crawler has been idle for ${exitOnIdleSeconds}ms. Exiting...`
);
//@ts-ignore
process.exit();
});
link$ = link$.pipe(takeUntil(idleTimeout$));
// @ts-ignore
const queueLink = (url: string, data = null) => link$.next({ url, data });
return { pageSource$, link$, queueLink };
};
// Sole export: the crawler factory defined above.
export default staticCrawler;