crawler-ts
Version:
Lightweight crawler written in TypeScript using ES6 generators.
161 lines (160 loc) • 9.88 kB
JavaScript
"use strict";
/**
 * Downlevel generator runtime (appears to be the standard TypeScript emit helper).
 *
 * Drives `body` as a state machine: `body` is called repeatedly with the state
 * object `_` and returns an instruction tuple `[opcode, operand]`. Opcodes as
 * handled below: 0 = next, 1 = throw, 2 = return, 3 = break (jump to label),
 * 4 = yield, 5 = yield* (delegate), 6 = exception caught by the driver,
 * 7 = endfinally.
 *
 * @param {*} thisArg - value used as `this` when invoking `body`.
 * @param {function(Object): Array} body - state-machine step function.
 * @returns {Object} object exposing `next`, `throw`, `return` and, when
 *   `Symbol` exists, `Symbol.iterator` returning itself.
 */
var __generator = (this && this.__generator) || function (thisArg, body) {
// `_` is the state handed to `body`: `label` is the next case to run, `sent()` returns the
// value passed in by the caller (or throws it when the incoming op was a throw),
// `trys` is a stack of active try regions, `ops` holds ops deferred while a finally runs.
// `f` is the re-entrancy flag, `y` the iterator currently delegated to, `t` a scratch slot.
var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g;
return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
// Builds the public method for opcode `n` (0 next, 1 throw, 2 return).
function verb(n) { return function (v) { return step([n, v]); }; }
function step(op) {
// Guard against calling the generator while it is already running.
if (f) throw new TypeError("Generator is already executing.");
// `_` is set to 0 once the generator has finished, which ends this loop.
while (_) try {
// While delegating (`y` set), forward the op to the inner iterator; if the inner
// iterator is not done, hand its result straight back to the caller.
if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
// Delegation finished: clear `y` and continue with the inner iterator's final value.
if (y = 0, t) op = [op[0] & 2, t.value];
switch (op[0]) {
// next / throw: stash the op so `_.sent()` can return or throw its operand.
case 0: case 1: t = op; break;
// yield: advance the label and surface the value to the caller.
case 4: _.label++; return { value: op[1], done: false };
// yield*: start delegating to the iterator in op[1].
case 5: _.label++; y = op[1]; op = [0]; continue;
// endfinally: resume the op that was deferred while the finally block ran.
case 7: op = _.ops.pop(); _.trys.pop(); continue;
default:
// Remaining opcodes (2, 3, 6) are resolved against the innermost try region `t`,
// whose slots are used below as [start, catch, finally, end] labels.
// No enclosing try and the op is an exception or return: the generator is finished.
if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
// break: jump directly when there is no try region or the target stays inside it.
if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
// exception raised before the catch label: enter the catch block.
if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
// otherwise run the finally block first and defer the current op until endfinally.
if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
if (t[2]) _.ops.pop();
// Leave this try region and re-evaluate the op against the next outer one.
_.trys.pop(); continue;
}
op = body.call(thisArg, _);
// Anything thrown by `body` (or by the delegate) becomes opcode 6 and is routed through the try stack.
} catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
// Finished: rethrow an uncaught exception / incoming throw, otherwise report completion
// (carrying the operand for non-zero opcodes such as return).
if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
}
};
/**
 * Marker wrapper used by the async-generator runtime to tell an awaited value
 * apart from a yielded one. Callable with or without `new`; the wrapped value
 * is stored on the instance as `v`.
 *
 * @param {*} v - the value being awaited.
 * @returns {__await} an instance whose `v` property holds the value.
 */
var __await = (this && this.__await) || function (v) {
    // Called as a plain function: re-enter as a constructor.
    if (!(this instanceof __await)) {
        return new __await(v);
    }
    this.v = v;
    return this;
};
/**
 * Obtains an async iterator for `o`.
 *
 * If `o` has a `Symbol.asyncIterator` method, its result is returned directly.
 * Otherwise the sync iterator of `o` is wrapped so that each available method
 * (`next`, `throw`, `return`) returns a Promise of `{ value, done }`, with
 * `value` resolved first.
 *
 * @param {Object} o - an async iterable or sync iterable.
 * @returns {Object} an async iterator.
 * @throws {TypeError} when `Symbol.asyncIterator` is unavailable.
 */
var __asyncValues = (this && this.__asyncValues) || function (o) {
    if (!Symbol.asyncIterator) {
        throw new TypeError("Symbol.asyncIterator is not defined.");
    }
    var asyncFactory = o[Symbol.asyncIterator];
    if (asyncFactory) {
        return asyncFactory.call(o);
    }
    // From here on `o` is the underlying *sync* iterator.
    o = typeof __values === "function" ? __values(o) : o[Symbol.iterator]();
    var wrapper = {};
    adapt("next");
    adapt("throw");
    adapt("return");
    wrapper[Symbol.asyncIterator] = function () { return this; };
    return wrapper;

    // Mirrors one sync-iterator method as a Promise-returning method (or leaves it falsy if absent).
    function adapt(method) {
        wrapper[method] = o[method] && function (arg) {
            return new Promise(function (resolve, reject) {
                var outcome = o[method](arg);
                settle(resolve, reject, outcome.done, outcome.value);
            });
        };
    }

    // Resolves the outer promise once the (possibly thenable) value has settled.
    function settle(resolve, reject, done, value) {
        Promise.resolve(value).then(function (settled) {
            resolve({ value: settled, done: done });
        }, reject);
    }
};
/**
 * Adapts an iterator `o` for use with `yield*` inside a downlevel async
 * generator. Each method of the returned sync iterator alternates between two
 * phases, tracked by a single shared flag: first it calls through to `o` and
 * wraps the result in `__await`; on the following call it passes the received
 * value back (via the fallback, if one was given).
 *
 * @param {Object} o - iterator to delegate to.
 * @returns {Object} a sync iterator exposing `next`, `throw`, `return` and
 *   `Symbol.iterator` (returning itself).
 */
var __asyncDelegator = (this && this.__asyncDelegator) || function (o) {
    var facade = {};
    var awaiting;
    install("next");
    install("throw", function (e) { throw e; });
    install("return");
    facade[Symbol.iterator] = function () { return this; };
    return facade;

    // Installs `method` on the facade; when `o` lacks it, the fallback (possibly undefined) is used as-is.
    function install(method, fallback) {
        if (!o[method]) {
            facade[method] = fallback;
            return;
        }
        facade[method] = function (arg) {
            awaiting = !awaiting;
            if (awaiting) {
                return { value: __await(o[method](arg)), done: method === "return" };
            }
            if (fallback) {
                return fallback(arg);
            }
            return arg;
        };
    }
};
/**
 * Wraps a sync generator function so it behaves as an async generator.
 *
 * Calls to `next` / `throw` / `return` on the returned object are queued and
 * processed one at a time. When the inner generator yields an `__await`
 * instance, the wrapped value is awaited and fed back into the generator;
 * any other result settles the promise of the request at the head of the queue.
 *
 * @param {*} thisArg - `this` for the generator function.
 * @param {ArrayLike} _arguments - arguments for the generator function (may be falsy).
 * @param {Function} generator - function returning the inner iterator.
 * @returns {Object} async iterator exposing the methods the inner iterator has.
 * @throws {TypeError} when `Symbol.asyncIterator` is unavailable.
 */
var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
    if (!Symbol.asyncIterator) {
        throw new TypeError("Symbol.asyncIterator is not defined.");
    }
    var inner = generator.apply(thisArg, _arguments || []);
    // Each entry: [method name, argument, resolve, reject].
    var pending = [];
    var facade = {};
    expose("next");
    expose("throw");
    expose("return");
    facade[Symbol.asyncIterator] = function () { return this; };
    return facade;

    // Publishes `method` on the facade only if the inner iterator implements it.
    function expose(method) {
        if (!inner[method]) {
            return;
        }
        facade[method] = function (arg) {
            return new Promise(function (resolveRequest, rejectRequest) {
                var queued = pending.push([method, arg, resolveRequest, rejectRequest]);
                // Only kick the machine when this request is the sole one in the queue.
                if (!(queued > 1)) {
                    resume(method, arg);
                }
            });
        };
    }

    // Runs one step of the inner generator; a synchronous throw rejects the head request.
    function resume(method, arg) {
        try {
            step(inner[method](arg));
        } catch (err) {
            settle(pending[0][3], err);
        }
    }

    // Awaited values are resolved and fed back in; everything else answers the head request.
    function step(result) {
        if (result.value instanceof __await) {
            Promise.resolve(result.value.v).then(onFulfilled, onRejected);
        } else {
            settle(pending[0][2], result);
        }
    }

    function onFulfilled(value) {
        resume("next", value);
    }

    function onRejected(reason) {
        resume("throw", reason);
    }

    // Completes the head request, drops it, and starts the next queued one if any.
    function settle(callback, payload) {
        callback(payload);
        pending.shift();
        if (pending.length) {
            resume(pending[0][0], pending[0][1]);
        }
    }
};
/**
 * Returns a sync iterator for `o`.
 *
 * Uses the object's own `Symbol.iterator` method when present; otherwise, for
 * anything with a numeric `length`, walks indices 0..length-1.
 *
 * @param {*} o - an iterable or array-like value.
 * @returns {Object} an iterator with a `next` method.
 * @throws {TypeError} when `o` is neither iterable nor array-like.
 */
var __values = (this && this.__values) || function (o) {
    var iteratorKey = typeof Symbol === "function" && Symbol.iterator;
    var nativeFactory = iteratorKey && o[iteratorKey];
    var cursor = 0;

    if (nativeFactory) {
        return nativeFactory.call(o);
    }

    if (o && typeof o.length === "number") {
        return {
            next: function () {
                // Drop the reference once exhausted so later calls keep reporting done.
                if (o && cursor >= o.length) {
                    o = void 0;
                }
                return { value: o && o[cursor++], done: !o };
            }
        };
    }

    throw new TypeError(iteratorKey ? "Object is not iterable." : "Symbol.iterator is not defined.");
};
// CommonJS export preamble: flag the exports object with `__esModule`, then
// pre-declare `createCrawler` as undefined; the real function is assigned at
// the bottom of the file.
Object.defineProperty(exports, "__esModule", { value: true });
exports.createCrawler = void 0;
/**
 * Builds a recursive crawler as an async generator function.
 *
 * The returned `gen(location)` requests a location, optionally parses it,
 * optionally yields `{ location, response, parsed }`, then iterates the
 * locations produced by `follower` and delegates to itself for each one that
 * `shouldQueue` accepts. The body below is compiled state-machine output: each
 * `case N` is a resume point and `_e.trys.push([start, catch, finally, end])`
 * registers a try region by label.
 *
 * Errors while visiting a location or queueing a followed location are caught
 * and reported through `logger.error`; when no logger is supplied they are
 * dropped silently.
 *
 * @param {Object} options - callbacks read from this object:
 *   `requester(location)`; `shouldParse({ location, response })`;
 *   `parser({ location, response })`; `shouldYield({ location, response, parsed })`;
 *   `follower({ location, response, parsed })` (consumed as a sync or async iterable);
 *   `shouldQueue({ location, origin, response, parsed })`;
 *   `logger` (optional; `info` and `error` are called when it is not null/undefined).
 *   Every callback result except `follower`'s is awaited.
 * @returns {Function} `gen(location)`, returning an async iterator of
 *   `{ location, response, parsed }` records.
 */
function createCrawler(options) {
var requester = options.requester, shouldParse = options.shouldParse, parser = options.parser, shouldYield = options.shouldYield, follower = options.follower, shouldQueue = options.shouldQueue, logger = options.logger;
return function gen(location) {
return __asyncGenerator(this, arguments, function gen_1() {
var response, _a, parsed, _b, _c, next, e_1, e_2_1, e_3;
var e_2, _d;
return __generator(this, function (_e) {
switch (_e.label) {
case 0:
// Outer try: catch handler at label 28, region ends at label 29.
_e.trys.push([0, 28, , 29]);
logger === null || logger === void 0 ? void 0 : logger.info("Requesting " + location);
return [4 /*yield*/, __await(requester(location))];
case 1:
response = _e.sent();
// Short-circuit: shouldParse is only consulted when the response is truthy.
_a = response;
if (!_a) return [3 /*break*/, 3];
return [4 /*yield*/, __await(shouldParse({ location: location, response: response }))];
case 2:
_a = (_e.sent());
_e.label = 3;
case 3:
// Nothing to parse: jump to the end of the outer try.
if (!_a) return [3 /*break*/, 27];
logger === null || logger === void 0 ? void 0 : logger.info("Parsing " + location);
return [4 /*yield*/, __await(parser({ location: location, response: response }))];
case 4:
parsed = _e.sent();
// A falsy parse result ends this visit: nothing is yielded or followed.
if (!!parsed) return [3 /*break*/, 6];
return [4 /*yield*/, __await(void 0)];
case 5: return [2 /*return*/, _e.sent()];
case 6: return [4 /*yield*/, __await(shouldYield({ location: location, response: response, parsed: parsed }))];
case 7:
if (!_e.sent()) return [3 /*break*/, 10];
logger === null || logger === void 0 ? void 0 : logger.info("Yielding " + location);
// Two steps: first await the record, then (case 8) hand it to the consumer.
return [4 /*yield*/, __await({ location: location, response: response, parsed: parsed })];
case 8: return [4 /*yield*/, _e.sent()];
case 9:
// Value sent back by the consumer is discarded.
_e.sent();
_e.label = 10;
case 10:
// Follower loop try: catch at 21, finally at 22, region ends at 27.
_e.trys.push([10, 21, 22, 27]);
_b = __asyncValues(follower({ location: location, response: response, parsed: parsed }));
_e.label = 11;
case 11: return [4 /*yield*/, __await(_b.next())];
case 12:
// Loop exit once the follower iterator reports done.
if (!(_c = _e.sent(), !_c.done)) return [3 /*break*/, 20];
next = _c.value;
_e.label = 13;
case 13:
// Per-location try: catch at 18, region ends at 19.
_e.trys.push([13, 18, , 19]);
return [4 /*yield*/, __await(shouldQueue({ location: next, origin: location, response: response, parsed: parsed }))];
case 14:
if (!_e.sent()) return [3 /*break*/, 17];
logger === null || logger === void 0 ? void 0 : logger.info("Queueing " + next);
// Recurse: delegate to a fresh crawl of `next`, forwarding everything it yields.
return [5 /*yield**/, __values(__asyncDelegator(__asyncValues(gen(next))))];
case 15: return [4 /*yield*/, __await.apply(void 0, [_e.sent()])];
case 16:
_e.sent();
_e.label = 17;
case 17: return [3 /*break*/, 19];
case 18:
// Failure while queueing one location is logged; the loop then continues.
e_1 = _e.sent();
logger === null || logger === void 0 ? void 0 : logger.error("Cannot queue " + next);
logger === null || logger === void 0 ? void 0 : logger.error(e_1);
return [3 /*break*/, 19];
case 19: return [3 /*break*/, 11];
case 20: return [3 /*break*/, 27];
case 21:
// Error from iterating the follower: remember it so the finally block can rethrow it.
e_2_1 = _e.sent();
e_2 = { error: e_2_1 };
return [3 /*break*/, 27];
case 22:
// Finally: if iteration stopped before completion, close the follower iterator via `return`.
_e.trys.push([22, , 25, 26]);
if (!(_c && !_c.done && (_d = _b.return))) return [3 /*break*/, 24];
return [4 /*yield*/, __await(_d.call(_b))];
case 23:
_e.sent();
_e.label = 24;
case 24: return [3 /*break*/, 26];
case 25:
// Rethrown here, the follower error lands in the outer catch at label 28.
if (e_2) throw e_2.error;
return [7 /*endfinally*/];
case 26: return [7 /*endfinally*/];
case 27: return [3 /*break*/, 29];
case 28:
// Outer catch: any failure during the visit is logged and the generator finishes normally.
e_3 = _e.sent();
logger === null || logger === void 0 ? void 0 : logger.error("Cannot visit " + location);
logger === null || logger === void 0 ? void 0 : logger.error(e_3);
return [3 /*break*/, 29];
case 29: return [2 /*return*/];
}
});
});
};
}
exports.createCrawler = createCrawler;