crawler-ts-htmlparser2
Lightweight crawler written in TypeScript using ES6 generators.
Compiled JavaScript module:
// Helpers emitted by the TypeScript compiler to down-level async/await and async generators.
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
return new (P || (P = Promise))(function (resolve, reject) {
function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
step((generator = generator.apply(thisArg, _arguments || [])).next());
});
};
var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
var g = generator.apply(thisArg, _arguments || []), i, q = [];
return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i;
function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }
function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
function fulfill(value) { resume("next", value); }
function reject(value) { resume("throw", value); }
function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
};
import { selectAll } from 'css-select';
import { parseDocument } from 'htmlparser2';
import { createCrawler as createCrawlerBase } from 'crawler-ts';
import { createRequester as createFetchRequester } from 'crawler-ts-fetch';
// Creates a requester that fetches a URL via crawler-ts-fetch (throttled by delayMilliseconds)
// and returns the response status, headers, and body text.
export function createRequester(delayMilliseconds) {
const requester = createFetchRequester(delayMilliseconds);
return (url) => __awaiter(this, void 0, void 0, function* () {
const response = yield requester(url.href);
const body = yield response.text();
return {
status: response.status,
headers: Object.fromEntries(response.headers.entries()),
body,
};
});
}
// Parses the HTML response body into an htmlparser2 document.
export function parser({ response }) {
return __awaiter(this, void 0, void 0, function* () {
return parseDocument(response.body);
});
}
// Finds all anchor hrefs in the parsed document and yields them as absolute URLs,
// resolved against the current location with credentials and hash fragment removed.
export function follower({ location, parsed }) {
return __asyncGenerator(this, arguments, function* follower_1() {
const links = selectAll('a', parsed)
.map((node) => node.attribs['href'])
.filter((link) => !!link);
for (const link of links) {
const url = new URL(link, location.href);
url.username = '';
url.password = '';
url.hash = '';
yield yield __await(url);
}
});
}
// Builds a crawler from the pieces above: the fetch-based requester, the htmlparser2 parser,
// and the anchor-link follower, merged with any caller-supplied options.
export function createCrawler(options) {
return createCrawlerBase(Object.assign(Object.assign({}, options), { parser, follower, requester: createRequester() }));
}
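
A minimal usage sketch of the exported pieces, based only on the signatures above and assuming this module is the package entry point; the start URL and the 1000 ms delay are placeholder values, not part of the package.

import { createRequester, parser, follower } from 'crawler-ts-htmlparser2';

async function main() {
    // Placeholder start page; the 1000 ms argument is assumed to throttle successive requests.
    const requester = createRequester(1000);
    const location = new URL('https://example.com/');
    // Fetch the page, then parse its HTML body into an htmlparser2 document.
    const response = await requester(location);
    const parsed = await parser({ response });
    // Iterate the normalized anchor URLs found on the page.
    for await (const url of follower({ location, parsed })) {
        console.log(url.href);
    }
}

main().catch(console.error);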