UNPKG

crawlerzzz

Version:

52 lines 2.41 kB
"use strict";
// TypeScript-emitted Object.assign fallback: copies own enumerable properties
// of every argument after the first onto the first argument.
var __assign = (this && this.__assign) || function () {
    __assign = Object.assign || function (t) {
        for (var s, i = 1, n = arguments.length; i < n; i++) {
            s = arguments[i];
            for (var p in s) {
                if (Object.prototype.hasOwnProperty.call(s, p)) {
                    t[p] = s[p];
                }
            }
        }
        return t;
    };
    return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", { value: true });
var axios_1 = require("axios");
var rxjs_1 = require("rxjs");
var operators_1 = require("rxjs/operators");
var R = require("ramda");
// A link counts as absolute only when it carries an explicit http(s) scheme.
// Requiring "://" keeps relative paths such as "https-guide.html" from being
// mistaken for absolute URLs.
var ABSOLUTE_HTTP_URL = /^https?:\/\//i;
/**
 * Creates a crawler that fetches queued links with axios and exposes the
 * fetched page sources as an RxJS stream.
 *
 * @param {Object} opts
 * @param {string} opts.domain - Prefix prepended to every queued url that is
 *   not an absolute http(s) URL.
 * @param {string} [opts.startFrom=''] - Url used as the initial value of the
 *   link stream.
 * @param {number} [opts.rateLimit=0] - Value handed to the RxJS `delay`
 *   operator for each link; links are delayed one after another (concatMap).
 * @param {boolean} [opts.noDulicatedLink=true] - When truthy, links whose
 *   `url` has already been seen are dropped (`distinct` keyed on `url`).
 * @param {number} [opts.exitOnIdleSeconds=60] - Length, in seconds, of the
 *   window with no link and no page emission after which the process exits.
 * @returns {{pageSource$: Object, link$: Object, queueLink: Function}}
 *   `pageSource$` emits `{ src, url, data }` per fetched link, `link$` is the
 *   link stream, and `queueLink(url, data)` pushes a new link into it.
 */
var staticCrawler = function (opts) {
    var settings = __assign({
        startFrom: '',
        rateLimit: 0,
        noDulicatedLink: true,
        exitOnIdleSeconds: 60
    }, opts);
    var domain = settings.domain;
    var startFrom = settings.startFrom;
    var rateLimit = settings.rateLimit;
    var noDulicatedLink = settings.noDulicatedLink;
    var exitOnIdleSeconds = settings.exitOnIdleSeconds;

    // The subject is seeded with the start url; optional de-duplication is
    // keyed on the `url` property of each link.
    var dedupe = noDulicatedLink ? operators_1.distinct(R.prop('url')) : R.identity;
    var link$ = new rxjs_1.BehaviorSubject({ url: startFrom }).pipe(dedupe);

    // Resolve a queued url against the configured domain unless it is
    // already an absolute http(s) URL.
    var concatUrl = function (url) {
        return ABSOLUTE_HTTP_URL.test(url) ? url : domain + url;
    };

    // Fetch one link and pair the response body with the link's url and data.
    var requestPromise = function (link) {
        var url = link.url === void 0 ? '' : link.url;
        var data = link.data === void 0 ? null : link.data;
        return axios_1.default.get(concatUrl(url)).then(function (res) {
            return { src: res.data, url: url, data: data };
        });
    };

    var pageSource$ = link$.pipe(
        // Delay each link in sequence so requests are spaced by `rateLimit`.
        operators_1.concatMap(function (link) {
            return rxjs_1.of(link).pipe(operators_1.delay(rateLimit));
        }),
        operators_1.flatMap(requestPromise)
    );

    // Emits whenever a whole window of `exitOnIdleSeconds` passes without any
    // link or page activity.
    // NOTE(review): pageSource$ is not shared, so this internal subscription
    // appears to issue its own request per link in addition to the consumer's
    // subscription - confirm whether a share() is wanted here.
    var idleTimeout$ = rxjs_1.merge(pageSource$, link$).pipe(
        operators_1.bufferTime(exitOnIdleSeconds * 1000),
        operators_1.filter(function (buffered) { return buffered.length === 0; })
    );

    idleTimeout$.subscribe(function () {
        // exitOnIdleSeconds is expressed in seconds, so report it as such.
        console.log("Crawler has been idle for " + exitOnIdleSeconds + " seconds.\nExiting...");
        process.exit();
    });

    link$ = link$.pipe(operators_1.takeUntil(idleTimeout$));

    // Relies on the piped subject still exposing next() (the TypeScript
    // source suppresses the type error with @ts-ignore) - presumably via
    // Subject#lift; verify when upgrading RxJS.
    var queueLink = function (url, data) {
        if (data === void 0) {
            data = null;
        }
        return link$.next({ url: url, data: data });
    };

    return { pageSource$: pageSource$, link$: link$, queueLink: queueLink };
};
exports.default = staticCrawler;
//# sourceMappingURL=staticCrawler.js.map