UNPKG

crawler-ts-htmlparser2

Version:

Lightweight crawler written in TypeScript using ES6 generators.

62 lines (61 loc) 3.26 kB
// NOTE: the TypeScript down-level helpers (__awaiter/__await/__asyncGenerator)
// that used to open this file have been removed. This is an ES module, and
// every runtime with native `import` support also supports native async
// functions and async generators, so the helpers were dead weight.
import { selectAll } from 'css-select';
import { parseDocument } from 'htmlparser2';
import { createCrawler as createCrawlerBase } from 'crawler-ts';
import { createRequester as createFetchRequester } from 'crawler-ts-fetch';

/**
 * Create a requester that fetches a URL and normalizes the response into a
 * plain `{ status, headers, body }` record.
 *
 * @param {number} [delayMilliseconds] - Optional delay between requests,
 *   forwarded to the underlying `crawler-ts-fetch` requester.
 * @returns {(url: URL) => Promise<{status: number, headers: Object, body: string}>}
 */
export function createRequester(delayMilliseconds) {
  const requester = createFetchRequester(delayMilliseconds);
  return async (url) => {
    const response = await requester(url.href);
    const body = await response.text();
    return {
      status: response.status,
      headers: Object.fromEntries(response.headers.entries()),
      body,
    };
  };
}

/**
 * Parse an HTML response body into an htmlparser2 DOM document.
 *
 * @param {{response: {body: string}}} arg - The crawler response wrapper.
 * @returns {Promise<import('domhandler').Document>} the parsed document
 */
export async function parser({ response }) {
  return parseDocument(response.body);
}

/**
 * Yield the absolute URL of every `<a href>` link found in a parsed document.
 * Credentials and fragments are stripped so that equivalent pages normalize
 * to the same URL.
 *
 * @param {{location: URL, parsed: import('domhandler').Document}} arg
 * @yields {URL}
 */
export async function* follower({ location, parsed }) {
  // The original transpiled code had an identity `.map((node) => node)` here —
  // a leftover TypeScript cast with no runtime effect, dropped in this rewrite.
  const links = selectAll('a', parsed)
    .map((node) => node.attribs['href'])
    .filter((link) => !!link);
  for (const link of links) {
    // Resolve relative links against the page that contained them.
    const url = new URL(link, location.href);
    url.username = '';
    url.password = '';
    url.hash = '';
    yield url;
  }
}

/**
 * Create a crawler wired up with this module's htmlparser2 parser, anchor
 * follower, and fetch-based requester.
 *
 * @param {object} options - Remaining crawler-ts options (filters, logger, …);
 *   `parser`, `follower`, and `requester` are supplied here and take precedence.
 */
export function createCrawler(options) {
  return createCrawlerBase({
    ...options,
    parser,
    follower,
    requester: createRequester(),
  });
}