/*
 * crawler-ts
 * Version:
 * Lightweight crawler written in TypeScript using ES6 generators.
 * 128 lines (127 loc) • 4.51 kB
 * JavaScript
 */
// Flag the exports object with an `__esModule` property set to true.
Object.defineProperty(exports, "__esModule", { value: true });
// Initialise every named export to undefined up front; the real values are
// assigned individually after each definition.
exports.cache = exports.ignoreDoubles = exports.ignoreRegex = exports.allowRegex = exports.allowExtensions = exports.chain = exports.toString = void 0;
/**
 * Default location-to-string converter.
 *
 * Coerces the value by concatenating it onto an empty string.
 *
 * @param {*} value - Value to convert.
 * @returns {string} The string form of `value`.
 */
const toString = (value) => '' + value;
exports.toString = toString;
/**
 * Combine several filter functions into a single filter.
 *
 * The combined filter calls each function in order with the same `this` and
 * arguments it received itself, and stops at the first falsy result.
 *
 * @param {...Function} fns - Filters to combine.
 * @returns {Function} A filter that returns `false` as soon as one of `fns`
 *   returns a falsy value, and `true` otherwise (also when `fns` is empty).
 */
function chain(...fns) {
    return function (...args) {
        // `every` short-circuits on the first falsy result, like the filters it wraps.
        return fns.every((fn) => fn.apply(this, args));
    };
}
exports.chain = chain;
/**
 * Build a filter factory that only lets through locations with an allowed
 * extension.
 *
 * The extension is the text after the last `.` within the part of the
 * converted location that starts at its last `/`. Locations with no `.` in
 * that part are always allowed. Nothing is stripped beforehand, so anything
 * following the dot (for example a query string or fragment) is compared as
 * part of the extension.
 *
 * @param {Function} [strFn] - Converts a location to a string; defaults to
 *   the exported `toString`.
 * @returns {Function} `(allowedExtensions, logger) => filter`, where
 *   `filter({ location })` returns `true` when the location is allowed. A
 *   rejected location is reported through `logger.info` when a logger is given.
 */
const allowExtensions = (strFn = exports.toString) => (allowedExtensions, logger) => ({ location }) => {
    const text = strFn(location);
    // Without any '/', the whole string is treated as the last segment.
    const lastSegment = text.slice(Math.max(0, text.lastIndexOf('/')));
    const dotAt = lastSegment.lastIndexOf('.');
    if (dotAt === -1) {
        return true;
    }
    const extension = lastSegment.slice(dotAt + 1);
    if (allowedExtensions.includes(extension)) {
        return true;
    }
    if (logger != null) {
        logger.info("Extension not allowed " + text);
    }
    return false;
};
exports.allowExtensions = allowExtensions;
/**
 * Build a filter factory that only lets through locations matching at least
 * one of the given regexes.
 *
 * @param {Function} [strFn] - Converts a location to a string; defaults to
 *   the exported `toString`. It is called once per filter invocation.
 * @returns {Function} `(allowUrls, logger) => filter`, where
 *   `filter({ location })` returns `true` for the first regex in `allowUrls`
 *   that matches (reported through `logger.info` when a logger is given) and
 *   `false` when none match.
 */
const allowRegex = (strFn = exports.toString) => (allowUrls, logger) => ({ location }) => {
    const converted = strFn(location);
    for (const allowUrl of allowUrls) {
        // `test` on a global or sticky regex resumes from `lastIndex`, so a
        // reused filter would otherwise give alternating answers for the
        // same location. Always start matching from the beginning.
        if (allowUrl.global || allowUrl.sticky) {
            allowUrl.lastIndex = 0;
        }
        if (allowUrl.test(converted)) {
            if (logger != null) {
                logger.info("Allowing " + converted);
            }
            return true;
        }
    }
    return false;
};
exports.allowRegex = allowRegex;
/**
 * Build a filter factory that rejects locations matching any of the given
 * regexes.
 *
 * @param {Function} [strFn] - Converts a location to a string; defaults to
 *   the exported `toString`. It is called once per filter invocation.
 * @returns {Function} `(ignoredUrls, logger) => filter`, where
 *   `filter({ location })` returns `false` for the first regex in
 *   `ignoredUrls` that matches (reported through `logger.info` when a logger
 *   is given) and `true` when none match.
 */
const ignoreRegex = (strFn = exports.toString) => (ignoredUrls, logger) => ({ location }) => {
    const converted = strFn(location);
    for (const ignoredUrl of ignoredUrls) {
        // `test` on a global or sticky regex resumes from `lastIndex`; without
        // this reset an ignored location slips through on every second call.
        if (ignoredUrl.global || ignoredUrl.sticky) {
            ignoredUrl.lastIndex = 0;
        }
        if (ignoredUrl.test(converted)) {
            if (logger != null) {
                logger.info("Ignoring " + converted);
            }
            return false;
        }
    }
    return true;
};
exports.ignoreRegex = ignoreRegex;
/**
 * Build a filter factory that lets each location through only once.
 *
 * Each filter created by the returned factory keeps its own record of the
 * keys it has already accepted.
 *
 * @param {Function} [strFn] - Converts a location to the key used for
 *   duplicate detection; defaults to the exported `toString`.
 * @returns {Function} `(logger) => filter`, where `filter({ location })`
 *   returns `true` the first time a key is seen and `false` for repeats and
 *   for falsy keys (both reported through `logger.info` when a logger is given).
 */
const ignoreDoubles = (strFn = exports.toString) => (logger) => {
    // A Set keeps the membership check constant-time however many locations
    // have been visited.
    const seen = new Set();
    return ({ location }) => {
        const key = strFn(location);
        if (!key || seen.has(key)) {
            if (logger != null) {
                logger.info("Skipping visited \"" + location + "\"");
            }
            return false;
        }
        seen.add(key);
        return true;
    };
};
exports.ignoreDoubles = ignoreDoubles;
/**
 * Build a wrapper factory that memoises a filter's result per location.
 *
 * @param {Function} [strFn] - Converts a location to the cache key; defaults
 *   to the exported `toString`.
 * @returns {Function} `(fn) => cachedShouldFollow`. On the first call for a
 *   key, `fn` is invoked with `{ location }` only and its result is stored;
 *   later calls with the same key return the stored result without calling
 *   `fn` again.
 */
const cache = (strFn = exports.toString) => (fn) => {
    // A Map is safe for arbitrary keys. With a plain object, a key such as
    // "hasOwnProperty" or "__proto__" collides with inherited members.
    const shouldFollowCache = new Map();
    return function cachedShouldFollow({ location }) {
        const key = strFn(location);
        if (shouldFollowCache.has(key)) {
            return shouldFollowCache.get(key);
        }
        const shouldFollow = fn({ location });
        shouldFollowCache.set(key, shouldFollow);
        return shouldFollow;
    };
};
exports.cache = cache;
;