UNPKG

crawler-ts

Version:

Lightweight crawler written in TypeScript using ES6 generators.

161 lines (160 loc) 9.88 kB
// Compiled (down-levelled) CommonJS module exporting `createCrawler`.
// The async-generator source has been lowered to a numbered state machine driven
// by the helper functions defined first. Everything below is generated code.
//
// NOTE(review): line breaks in this text fall mid-expression, and the `__await`
// assignment ends with `}` followed directly by `var __asyncValues` on the same
// line, with no semicolon. That only parses if a line break separates them, so
// this looks like a copy with collapsed newlines - compare with the published file.
//
// Helpers, in order of appearance:
//
//   __generator(thisArg, body)
//     Returns an iterator object (next / throw / return, plus Symbol.iterator when
//     Symbol exists). Each step calls `body` with the state object `_`
//     ({ label, sent, trys, ops }) and interprets the returned [opcode, value] pair.
//     Opcodes as labelled by the inline comments in this file: 2 = return,
//     3 = break (jump to label), 4 = yield, 5 = yield* (delegate), 7 = endfinally.
//     Opcode 6 is produced internally when `body` throws (`op = [6, e]`).
//     Throws TypeError("Generator is already executing.") on re-entrant calls.
//
//   __await(v)
//     Marker wrapper storing `v` in `.v`; callable with or without `new`.
//
//   __asyncValues(o)
//     Returns `o[Symbol.asyncIterator]()` when present. Otherwise wraps the sync
//     iterator of `o` so that each existing method returns a Promise of
//     { value, done }, with `value` resolved first. Throws TypeError when
//     Symbol.asyncIterator is undefined.
//
//   __asyncDelegator(o)
//     Builds a sync iterator over `o` for use with yield*. Each method alternates
//     (via the toggle `p`) between emitting `{ value: __await(o[n](v)), done }` and
//     passing the resumed value straight through (or through `f`, when given).
//
//   __asyncGenerator(thisArg, _arguments, generator)
//     Runs `generator` and exposes next / throw / return methods that return
//     Promises. Calls are queued in `q`; only the first queued call starts work.
//     A step whose value is an `__await` instance is resolved and fed back in via
//     "next" / "throw"; any other step result settles the Promise at the head of
//     the queue. Throws TypeError when Symbol.asyncIterator is undefined.
//
//   __values(o)
//     Returns `o[Symbol.iterator]()` when present, otherwise an index-based
//     iterator for objects with a numeric `length`; throws TypeError otherwise.
//
// createCrawler(options)
//   Reads `requester`, `shouldParse`, `parser`, `shouldYield`, `follower`,
//   `shouldQueue` and `logger` from `options` and returns `gen(location)`, an
//   async generator. Every logger call is guarded, so `logger` may be null or
//   undefined. For one `location`, `gen` does the following:
//     1. Logs "Requesting ..." and awaits `requester(location)`.
//     2. Continues only if the response is truthy and the awaited
//        `shouldParse({ location, response })` is truthy; then logs "Parsing ..."
//        and awaits `parser({ location, response })`.
//     3. Returns immediately if the parsed value is falsy.
//     4. If the awaited `shouldYield({ location, response, parsed })` is truthy,
//        logs "Yielding ..." and yields `{ location, response, parsed }`.
//     5. Iterates `follower({ location, response, parsed })` through
//        `__asyncValues`. For each `next`, if the awaited
//        `shouldQueue({ location: next, origin: location, response, parsed })` is
//        truthy, logs "Queueing ..." and delegates (yield*) to `gen(next)`.
//        An error inside this per-item step is logged as "Cannot queue ..."
//        followed by the error, and the loop goes on to the next item.
//     6. Any other error is logged as "Cannot visit ..." followed by the error,
//        and the generator finishes without rethrowing.
//   No set of visited locations is kept in this function.
//   NOTE(review): presumably `shouldQueue` is expected to prevent revisits - confirm.
"use strict"; var __generator = (this && this.__generator) || function (thisArg, body) { var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g; return g = { next: verb(0), "throw": verb(1), "return": verb(2) }, typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g; function verb(n) { return function (v) { return step([n, v]); }; } function step(op) { if (f) throw new TypeError("Generator is already executing."); while (_) try { if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t; if (y = 0, t) op = [op[0] & 2, t.value]; switch (op[0]) { case 0: case 1: t = op; break; case 4: _.label++; return { value: op[1], done: false }; case 5: _.label++; y = op[1]; op = [0]; continue; case 7: op = _.ops.pop(); _.trys.pop(); continue; default: if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; } if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; } if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; } if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; } if (t[2]) _.ops.pop(); _.trys.pop(); continue; } op = body.call(thisArg, _); } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; } if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true }; } }; var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); } var __asyncValues = (this && this.__asyncValues) || function (o) { if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); var m = o[Symbol.asyncIterator], i; return m ? m.call(o) : (o = typeof __values === "function" ? 
// (continuation of __asyncValues: sync-iterator fallback; then __asyncDelegator and the start of __asyncGenerator)
__values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i); function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; } function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); } }; var __asyncDelegator = (this && this.__asyncDelegator) || function (o) { var i, p; return i = {}, verb("next"), verb("throw", function (e) { throw e; }), verb("return"), i[Symbol.iterator] = function () { return this; }, i; function verb(n, f) { i[n] = o[n] ? function (v) { return (p = !p) ? { value: __await(o[n](v)), done: n === "return" } : f ? f(v) : v; } : f; } }; var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) { if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined."); var g = generator.apply(thisArg, _arguments || []), i, q = []; return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i; function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; } function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } } function step(r) { r.value instanceof __await ? 
// (continuation of __asyncGenerator's step; then __values, the export setup, and createCrawler state cases 0-3: request and shouldParse)
Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); } function fulfill(value) { resume("next", value); } function reject(value) { resume("throw", value); } function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); } }; var __values = (this && this.__values) || function(o) { var s = typeof Symbol === "function" && Symbol.iterator, m = s && o[s], i = 0; if (m) return m.call(o); if (o && typeof o.length === "number") return { next: function () { if (o && i >= o.length) o = void 0; return { value: o && o[i++], done: !o }; } }; throw new TypeError(s ? "Object is not iterable." : "Symbol.iterator is not defined."); }; Object.defineProperty(exports, "__esModule", { value: true }); exports.createCrawler = void 0; function createCrawler(options) { var requester = options.requester, shouldParse = options.shouldParse, parser = options.parser, shouldYield = options.shouldYield, follower = options.follower, shouldQueue = options.shouldQueue, logger = options.logger; return function gen(location) { return __asyncGenerator(this, arguments, function gen_1() { var response, _a, parsed, _b, _c, next, e_1, e_2_1, e_3; var e_2, _d; return __generator(this, function (_e) { switch (_e.label) { case 0: _e.trys.push([0, 28, , 29]); logger === null || logger === void 0 ? void 0 : logger.info("Requesting " + location); return [4 /*yield*/, __await(requester(location))]; case 1: response = _e.sent(); _a = response; if (!_a) return [3 /*break*/, 3]; return [4 /*yield*/, __await(shouldParse({ location: location, response: response }))]; case 2: _a = (_e.sent()); _e.label = 3; case 3: if (!_a) return [3 /*break*/, 27]; logger === null || logger === void 0 ? 
// (createCrawler state cases 3-18: parse, early return on falsy parse, optional yield, then the follower loop with per-item shouldQueue / yield* gen(next))
void 0 : logger.info("Parsing " + location); return [4 /*yield*/, __await(parser({ location: location, response: response }))]; case 4: parsed = _e.sent(); if (!!parsed) return [3 /*break*/, 6]; return [4 /*yield*/, __await(void 0)]; case 5: return [2 /*return*/, _e.sent()]; case 6: return [4 /*yield*/, __await(shouldYield({ location: location, response: response, parsed: parsed }))]; case 7: if (!_e.sent()) return [3 /*break*/, 10]; logger === null || logger === void 0 ? void 0 : logger.info("Yielding " + location); return [4 /*yield*/, __await({ location: location, response: response, parsed: parsed })]; case 8: return [4 /*yield*/, _e.sent()]; case 9: _e.sent(); _e.label = 10; case 10: _e.trys.push([10, 21, 22, 27]); _b = __asyncValues(follower({ location: location, response: response, parsed: parsed })); _e.label = 11; case 11: return [4 /*yield*/, __await(_b.next())]; case 12: if (!(_c = _e.sent(), !_c.done)) return [3 /*break*/, 20]; next = _c.value; _e.label = 13; case 13: _e.trys.push([13, 18, , 19]); return [4 /*yield*/, __await(shouldQueue({ location: next, origin: location, response: response, parsed: parsed }))]; case 14: if (!_e.sent()) return [3 /*break*/, 17]; logger === null || logger === void 0 ? void 0 : logger.info("Queueing " + next); return [5 /*yield**/, __values(__asyncDelegator(__asyncValues(gen(next))))]; case 15: return [4 /*yield*/, __await.apply(void 0, [_e.sent()])]; case 16: _e.sent(); _e.label = 17; case 17: return [3 /*break*/, 19]; case 18: e_1 = _e.sent(); logger === null || logger === void 0 ? void 0 : logger.error("Cannot queue " + next); logger === null || logger === void 0 ? 
// (cases 18-29: finish the per-item catch, loop back to case 11, follower-iterator cleanup that calls its `return` if not done and rethrows a stored error, outer catch that logs "Cannot visit"; then the export assignment)
void 0 : logger.error(e_1); return [3 /*break*/, 19]; case 19: return [3 /*break*/, 11]; case 20: return [3 /*break*/, 27]; case 21: e_2_1 = _e.sent(); e_2 = { error: e_2_1 }; return [3 /*break*/, 27]; case 22: _e.trys.push([22, , 25, 26]); if (!(_c && !_c.done && (_d = _b.return))) return [3 /*break*/, 24]; return [4 /*yield*/, __await(_d.call(_b))]; case 23: _e.sent(); _e.label = 24; case 24: return [3 /*break*/, 26]; case 25: if (e_2) throw e_2.error; return [7 /*endfinally*/]; case 26: return [7 /*endfinally*/]; case 27: return [3 /*break*/, 29]; case 28: e_3 = _e.sent(); logger === null || logger === void 0 ? void 0 : logger.error("Cannot visit " + location); logger === null || logger === void 0 ? void 0 : logger.error(e_3); return [3 /*break*/, 29]; case 29: return [2 /*return*/]; } }); }); }; } exports.createCrawler = createCrawler;