UNPKG

crawler-ts-htmlparser2

Version:

Lightweight crawler written in TypeScript using ES6 generators.

46 lines (45 loc) 1.82 kB
import { allowExtensions, allowRegex, ignoreDoubles, ignoreRegex } from 'crawler-ts';

/**
 * Converts a URL-like object to the string used by the URL-based filters below.
 *
 * @param {{ href: * }} url - Object exposing an `href` property (e.g. a WHATWG `URL`).
 * @returns {*} The value of `url.href`, unchanged.
 */
export const urlToString = (url) => url.href;

// The four bindings below are the generic crawler-ts filter factories with
// `urlToString` supplied as their first argument. Their exact semantics are
// defined by crawler-ts, not by this file.

/** crawler-ts `allowRegex`, pre-configured with `urlToString`. */
export const allowUrlRegex = allowRegex(urlToString);

/** crawler-ts `allowExtensions`, pre-configured with `urlToString`. */
export const allowUrlExtensions = allowExtensions(urlToString);

/** crawler-ts `ignoreRegex`, pre-configured with `urlToString`. */
export const ignoreUrlRegex = ignoreRegex(urlToString);

/** crawler-ts `ignoreDoubles`, pre-configured with `urlToString`. */
export const ignoreUrlDoubles = ignoreDoubles(urlToString);

/**
 * Creates a predicate that accepts only responses whose status is exactly 200.
 *
 * @param {{ info: Function }} [logger] - Optional logger; when it is not
 *   null/undefined, `logger.info` is called for each rejected response.
 * @returns {function({ response: { status: * } }): boolean} Predicate returning
 *   `true` when `response.status === 200`, otherwise `false`.
 */
export function allowHttpOk(logger) {
  return function ({ response }) {
    if (response.status === 200) {
      return true;
    }
    if (logger != null) {
      logger.info(`Not allowing ${response.status}`);
    }
    return false;
  };
}

// Matches a Content-Type value whose media type is exactly `text/html`,
// optionally followed by parameters (`; charset=...`). Media types are
// case-insensitive, hence the `i` flag; the anchors prevent values that merely
// contain "text/html" somewhere (e.g. `text/htmlx`) from being accepted.
const htmlContentType = /^\s*text\/html\s*(?:;|$)/i;

/**
 * Creates a predicate that accepts only responses declaring an HTML body.
 *
 * The decision is based solely on `response.headers['content-type']`; a missing
 * or empty header is rejected.
 *
 * @param {{ info: Function }} [logger] - Optional logger; when it is not
 *   null/undefined, `logger.info` is called for each rejected response.
 * @returns {function({ response: { headers: Object } }): boolean} Predicate
 *   returning `true` when the content type is `text/html`, otherwise `false`.
 */
export function allowHtml(logger) {
  return function ({ response }) {
    const contentType = response.headers['content-type'];
    if (contentType && htmlContentType.test(contentType)) {
      return true;
    }
    if (logger != null) {
      logger.info(`Not allowing ${contentType}`);
    }
    return false;
  };
}

/**
 * Creates a predicate that accepts only locations whose `host` is listed.
 *
 * @param {string[]} allowedHosts - Hosts compared against `location.host`
 *   by strict equality.
 * @param {{ info: Function }} [logger] - Optional logger; when it is not
 *   null/undefined, `logger.info` is called for each rejected location.
 * @returns {function({ location: { host: string } }): boolean} Predicate
 *   returning `true` when `location.host` is in `allowedHosts`.
 */
export function allowHosts(allowedHosts, logger) {
  return ({ location }) => {
    if (allowedHosts.includes(location.host)) {
      return true;
    }
    if (logger != null) {
      logger.info(`Host not allowed ${location.host}`);
    }
    return false;
  };
}

/**
 * Creates a predicate that accepts only locations whose `protocol` is listed.
 *
 * @param {string[]} allowedProtocols - Protocol names WITHOUT the trailing
 *   colon (e.g. `'https'`); a `:` is appended to each before comparison with
 *   `location.protocol`.
 * @param {{ info: Function }} [logger] - Optional logger; when it is not
 *   null/undefined, `logger.info` is called for each rejected location.
 * @returns {function({ location: { protocol: string } }): boolean} Predicate
 *   returning `true` when `location.protocol` matches one of the protocols.
 */
export function allowProtocols(allowedProtocols, logger) {
  // Computed once per factory call so the predicate does no per-call mapping.
  const transformedProtocols = allowedProtocols.map((protocol) => `${protocol}:`);
  return ({ location }) => {
    if (transformedProtocols.includes(location.protocol)) {
      return true;
    }
    if (logger != null) {
      logger.info(`Protocol not allowed ${location.protocol}`);
    }
    return false;
  };
}