crawlerzzz
Version:
52 lines • 2.41 kB
JavaScript
;
/**
 * Shallow-merge helper emitted by the TypeScript compiler for object spread.
 *
 * Reuses an `__assign` already present on the surrounding `this`, if any.
 * Otherwise, on first call it replaces itself with `Object.assign` (or a
 * hand-written equivalent when `Object.assign` is unavailable) and forwards
 * the call.
 *
 * The first argument is the target, mutated in place and returned. Own
 * enumerable properties of each later argument are copied onto it in order,
 * so later sources win.
 */
var __assign = (this && this.__assign) || function () {
    var copyOwnProperties = function (target) {
        var sourceCount = arguments.length;
        for (var index = 1; index < sourceCount; index += 1) {
            var source = arguments[index];
            for (var key in source) {
                if (Object.prototype.hasOwnProperty.call(source, key)) {
                    target[key] = source[key];
                }
            }
        }
        return target;
    };
    // Swap in the real implementation so later calls skip this wrapper.
    __assign = Object.assign || copyOwnProperties;
    return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", { value: true });
var axios_1 = require("axios");
var rxjs_1 = require("rxjs");
var operators_1 = require("rxjs/operators");
var R = require("ramda");
/**
 * Creates a crawler for static pages, built from RxJS streams.
 *
 * Queued links are optionally de-duplicated by `url`, each one is delayed by
 * `rateLimit` milliseconds before being requested with `axios.get`, and the
 * response body is emitted on `pageSource$` as `{ src, url, data }`.
 *
 * Side effect: an idle watcher is subscribed immediately, so the request for
 * `startFrom` is scheduled as soon as this function is called. When neither a
 * link nor a page is seen for a whole `exitOnIdleSeconds` window, a message is
 * logged and `process.exit()` is called.
 *
 * @param {Object} opts Crawler options.
 * @param {string} [opts.domain=''] Prefix prepended to every URL that does not
 *     start with `http://` or `https://`.
 * @param {string} [opts.startFrom=''] URL of the first link that is queued.
 * @param {number} [opts.rateLimit=0] Delay in milliseconds applied to each
 *     link before it is requested; links are processed one after another.
 * @param {boolean} [opts.noDulicatedLink=true] When truthy, a link whose
 *     `url` was already seen is dropped. (The option name keeps its original
 *     spelling for backward compatibility.)
 * @param {number} [opts.exitOnIdleSeconds=60] Length of the idle window, in
 *     seconds.
 * @returns {{pageSource$: Object, link$: Object, queueLink: Function}}
 *     `pageSource$` emits fetched pages, `link$` emits queued links until the
 *     idle signal fires, and `queueLink(url, data = null)` queues a link.
 */
var staticCrawler = function (opts) {
    var settings = __assign({ domain: '', startFrom: '', rateLimit: 0, noDulicatedLink: true, exitOnIdleSeconds: 60 }, opts);
    var domain = settings.domain;
    var startFrom = settings.startFrom;
    var rateLimit = settings.rateLimit;
    var noDulicatedLink = settings.noDulicatedLink;
    var exitOnIdleSeconds = settings.exitOnIdleSeconds;
    // Require the "://" separator so relative paths such as "http-status/404"
    // are not mistaken for absolute URLs.
    var absoluteHttpUrl = /^https?:\/\//i;
    // Keep the real subject: `.pipe()` is only documented to return an
    // Observable, so `.next()` must be called on the subject itself.
    var linkSubject = new rxjs_1.BehaviorSubject({
        url: startFrom
    });
    var link$ = linkSubject.pipe(noDulicatedLink ? operators_1.distinct(R.prop('url')) : R.identity);
    var concatUrl = function (url) {
        return absoluteHttpUrl.test(url) ? url : domain + url;
    };
    var requestPromise = function (link) {
        var url = link.url === void 0 ? '' : link.url;
        var data = link.data === void 0 ? null : link.data;
        return axios_1.default.get(concatUrl(url)).then(function (res) {
            return {
                src: res.data,
                url: url,
                data: data
            };
        });
    };
    // NOTE(review): this stream is cold, so every subscriber (including the
    // idle watcher below) runs its own de-duplication and its own requests.
    // Sharing it would change what late subscribers receive - confirm before
    // changing.
    var pageSource$ = link$.pipe(
        // concatMap + delay spaces the links out one at a time.
        operators_1.concatMap(function (link) { return rxjs_1.of(link).pipe(operators_1.delay(rateLimit)); }),
        operators_1.flatMap(requestPromise));
    // Emits an empty buffer whenever a full window passes with no activity.
    // Shared so the `takeUntil` below reuses the watcher's pipeline instead of
    // starting another round of requests per `link$` subscriber.
    var idleTimeout$ = rxjs_1.merge(pageSource$, link$).pipe(
        operators_1.bufferTime(exitOnIdleSeconds * 1000),
        operators_1.filter(function (buffered) { return buffered.length === 0; }),
        operators_1.share());
    idleTimeout$.subscribe(function () {
        console.log("Crawler has been idle for " + exitOnIdleSeconds + " seconds. Exiting...");
        process.exit();
    });
    link$ = link$.pipe(operators_1.takeUntil(idleTimeout$));
    var queueLink = function (url, data) {
        if (data === void 0) { data = null; }
        return linkSubject.next({ url: url, data: data });
    };
    return { pageSource$: pageSource$, link$: link$, queueLink: queueLink };
};
// Expose the crawler factory as this module's default export.
exports.default = staticCrawler;
//# sourceMappingURL=staticCrawler.js.map