crawler-ts-htmlparser2
Version:
Lightweight crawler written in TypeScript using ES6 generators.
46 lines (45 loc) • 1.82 kB
JavaScript
import { allowExtensions, allowRegex, ignoreDoubles, ignoreRegex } from 'crawler-ts';
/**
 * Returns the `href` property of the given URL-like object.
 *
 * @param {{ href: string }} url - Object exposing an `href` property (e.g. a `URL`).
 * @returns {string} The value of `url.href`.
 */
export const urlToString = (url) => {
  const { href } = url;
  return href;
};
// Each binding below is the result of calling the matching crawler-ts factory
// with `urlToString`, so the factory receives the function that turns a URL
// object into its `href` string.
// NOTE(review): presumably the factories use that function to obtain the string
// they match or de-duplicate on; confirm against the crawler-ts documentation.
const allowUrlRegex = allowRegex(urlToString);
const allowUrlExtensions = allowExtensions(urlToString);
const ignoreUrlRegex = ignoreRegex(urlToString);
const ignoreUrlDoubles = ignoreDoubles(urlToString);

export { allowUrlRegex, allowUrlExtensions, ignoreUrlRegex, ignoreUrlDoubles };
/**
 * Builds a predicate that accepts only responses whose `status` is strictly
 * equal to the number 200.
 *
 * @param {{ info: Function }} [logger] - Optional logger. When it is neither
 *   `null` nor `undefined`, `logger.info` is called with a message for every
 *   rejected response.
 * @returns {Function} Predicate taking `{ response }` and returning `true` when
 *   `response.status === 200`, `false` otherwise.
 */
export function allowHttpOk(logger) {
  return ({ response }) => {
    const { status } = response;
    if (status === 200) {
      return true;
    }
    // Only touch the logger when one was actually supplied.
    if (logger != null) {
      logger.info(`Not allowing ${status}`);
    }
    return false;
  };
}
// Matches a Content-Type value whose media type is exactly `text/html`,
// optionally followed by parameters (e.g. "; charset=utf-8"). Media type names
// are case-insensitive, hence the `i` flag; the anchors keep look-alikes such
// as "text/htmlx" or "application/vnd.text/html" from being accepted.
const htmlContentType = /^\s*text\/html\s*(?:;.*)?$/i;

/**
 * Builds a predicate that accepts only responses declaring an HTML content type.
 *
 * The predicate reads `response.headers['content-type']` and accepts it when
 * the media type is `text/html` (any letter case, with or without parameters).
 * A missing or empty header is rejected.
 *
 * @param {{ info: Function }} [logger] - Optional logger. When it is neither
 *   `null` nor `undefined`, `logger.info` is called with a message for every
 *   rejected response.
 * @returns {Function} Predicate taking `{ response }` and returning `true` for
 *   HTML responses, `false` otherwise.
 */
export function allowHtml(logger) {
  return function ({ response }) {
    const contentType = response.headers['content-type'];
    if (contentType && htmlContentType.test(contentType)) {
      return true;
    }
    // Only touch the logger when one was actually supplied.
    if (logger != null) {
      logger.info(`Not allowing ${contentType}`);
    }
    return false;
  };
}
/**
 * Builds a predicate that accepts only locations whose `host` appears in
 * `allowedHosts`.
 *
 * Membership is decided with `allowedHosts.indexOf(location.host)`, i.e. an
 * exact, case-sensitive match against the entries.
 *
 * @param {string[]} allowedHosts - Hosts that are accepted.
 * @param {{ info: Function }} [logger] - Optional logger. When it is neither
 *   `null` nor `undefined`, `logger.info` is called with a message for every
 *   rejected location.
 * @returns {Function} Predicate taking `{ location }` and returning `true` when
 *   `location.host` is listed, `false` otherwise.
 */
export function allowHosts(allowedHosts, logger) {
  return ({ location }) => {
    const { host } = location;
    const isListed = allowedHosts.indexOf(host) >= 0;
    if (isListed) {
      return true;
    }
    // Only touch the logger when one was actually supplied.
    if (logger != null) {
      logger.info(`Host not allowed ${host}`);
    }
    return false;
  };
}
/**
 * Builds a predicate that accepts only locations whose `protocol` corresponds
 * to one of `allowedProtocols`.
 *
 * Each allowed protocol has a trailing ":" appended once, when the predicate is
 * created, so callers pass names such as "https" and they are compared against
 * `location.protocol` values such as "https:".
 *
 * @param {string[]} allowedProtocols - Protocol names without the trailing colon.
 * @param {{ info: Function }} [logger] - Optional logger. When it is neither
 *   `null` nor `undefined`, `logger.info` is called with a message for every
 *   rejected location.
 * @returns {Function} Predicate taking `{ location }` and returning `true` when
 *   `location.protocol` is allowed, `false` otherwise.
 */
export function allowProtocols(allowedProtocols, logger) {
  // Computed eagerly so the per-location check is a plain lookup.
  const protocolsWithColon = allowedProtocols.map((name) => `${name}:`);
  return ({ location }) => {
    const { protocol } = location;
    if (protocolsWithColon.indexOf(protocol) >= 0) {
      return true;
    }
    // Only touch the logger when one was actually supplied.
    if (logger != null) {
      logger.info(`Protocol not allowed ${protocol}`);
    }
    return false;
  };
}