bs-broken-links-checker
Version:
Broken links checker tool for web sites
454 lines (376 loc) • 15.7 kB
JavaScript
'use strict';
var _get = require('babel-runtime/helpers/get')['default'];
var _inherits = require('babel-runtime/helpers/inherits')['default'];
var _createClass = require('babel-runtime/helpers/create-class')['default'];
var _classCallCheck = require('babel-runtime/helpers/class-call-check')['default'];
var _Promise = require('babel-runtime/core-js/promise')['default'];
var _Array$from = require('babel-runtime/core-js/array/from')['default'];
var _interopRequireDefault = require('babel-runtime/helpers/interop-require-default')['default'];
Object.defineProperty(exports, '__esModule', {
value: true
});
var _lodash = require('lodash');
var _lodash2 = _interopRequireDefault(_lodash);
var _got = require('got');
var _got2 = _interopRequireDefault(_got);
var _base = require('./base');
var _base2 = _interopRequireDefault(_base);
var _basedOption = require('./based-option');
var _basedOption2 = _interopRequireDefault(_basedOption);
var _modelModel = require('./model/model');
var _modelModel2 = _interopRequireDefault(_modelModel);
var _modelDocument = require('./model/document');
var _modelDocument2 = _interopRequireDefault(_modelDocument);
var _modelStatistic = require('./model/statistic');
var _modelStatistic2 = _interopRequireDefault(_modelStatistic);
var _linkAnalyzer = require('./link-analyzer');
var _linkAnalyzer2 = _interopRequireDefault(_linkAnalyzer);
// Lift the socket limit on the shared (global) HTTP and HTTPS agents.
// Presumably this is so that the checker's own queueing is the only throttle
// on parallel requests — TODO confirm.
require('http').globalAgent.maxSockets = Infinity;
require('https').globalAgent.maxSockets = Infinity;
// NOTE(review): per the Node.js documentation, NODE_TLS_REJECT_UNAUTHORIZED="0"
// disables TLS certificate validation. It is written to process.env when this
// module is loaded, so it applies to the whole process and not only to requests
// made by this checker — confirm this is intended.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";
var Checker = (function (_Base) {
_inherits(Checker, _Base);
/**
 * Creates a checker and fills its option storage from the caller's
 * configuration, falling back to `DEFAULT` for anything not supplied.
 * @param {Object} [options] - configuration object
 * @param {String} [options.mode] - mode of checking ("website", "section" or "page")
 * @param {Number} [options.concurrent] - number of concurrent requests
 * @param {Object} [options.requestHeaders] - custom request headers for crawler requests
 * @param {Number} [options.requestRetriesAmount] - number of attempts for a request if it fails at first
 * @param {Number} [options.requestTimeout] - request timeout (in milliseconds)
 * @param {Function} [options.onDone] - custom done handler; defaults to this instance's `onDone`
 * @param {String[]} [options.acceptedSchemes] - array of accepted url schemes
 * @param {Boolean} [options.checkExternalUrls] - set `true` to check outer links
 * @param {RegExp[]} [options.excludeLinkPatterns] - array of regular expressions. Urls that match
 * any of these regular expressions are excluded from verification
 * @constructor
 */
function Checker() {
    var options = arguments[0] === undefined ? {} : arguments[0];

    _classCallCheck(this, Checker);
    // Call the parent constructor with the options and this module object.
    _get(Object.getPrototypeOf(Checker.prototype), 'constructor', this).call(this, options, module);
    this.logger.info('Initialize crawler instance');

    /**
     * Checker options
     * @type {BasedOptions}
     */
    this._options = new _basedOption2['default']();

    var defaults = this.constructor.DEFAULT;
    // Registration order is kept as-is; every name has its fallback in DEFAULT.
    var defaultedNames = [
        'mode',
        'concurrent',
        'requestHeaders',
        'requestTimeout',
        'acceptedSchemes',
        'checkExternalUrls',
        'excludeLinkPatterns',
        'requestRetriesAmount'
    ];

    // Each setOption call is made on the value returned by the previous one,
    // exactly like a fluent call chain.
    var configured = defaultedNames.reduce(function (target, name) {
        return target.setOption(options, name, defaults[name]);
    }, this.options);

    // onDone has no entry in DEFAULT: its fallback is the instance method.
    configured.setOption(options, 'onDone', this.onDone.bind(this));
}
/**
* Getter function for options
* @returns {BasedOptions}
*/
_createClass(Checker, [{
key: 'initModel',
/**
* Sets model instance
* @param {Model} model instance
* @return {Checker}
*/
value: function initModel(model) {
this._model = model;
return this;
}
/**
* Sets linkAnalyzer instance
* @param {LinkAnalyzer} linkAnalyzer
* @return {Checker}
*/
}, {
key: 'initLinkAnalyzer',
value: function initLinkAnalyzer(linkAnalyzer) {
this._linkAnalyzer = linkAnalyzer;
return this;
}
/**
* Sets Statistic instance
* @param {Statistic} statistic
* @return {Checker}
*/
}, {
key: 'initStatistic',
value: function initStatistic(statistic) {
this._statistic = statistic;
return this;
}
/**
* Returns application default options
* @returns {Object}
* @static
*/
}, {
key: 'processLoadedDocument',
/**
* Processes loaded document
* @param {Document} document - document model
* @param {String} document.url - request url
* @param {HttpResponse|HttpsResponse} document.res - response object
* @protected
*/
value: function processLoadedDocument(document) {
var _this = this,
documentUrl = document.url,
$ = document.$;
$('a').each(function () {
var href = $(this).attr('href');
if (href) {
var url = document.resolve(href.split('#')[0]);
if (_this.linkAnalyzer.isNeedToSkipUrl(url, documentUrl)) {
return;
}
if (_this.linkAnalyzer.isExternal(url)) {
_this.model.addToExternal(url, documentUrl, href);
} else {
_this._addToQueue(url, { page: documentUrl, href: href });
}
}
});
this._onFinishLoad(documentUrl);
}
/**
* Start to crawl pages for given url
* @param {String} url - initial site url for start
* @throws Error
* @public
*/
}, {
key: 'start',
value: function start(url) {
if (!url) {
throw new Error('Url was not set');
}
if (!url.match(this.constructor.CONSTANTS.URL_REGEXP)) {
throw new Error('Urls is not valid');
}
this.initStatistic(new _modelStatistic2['default']()).initModel(new _modelModel2['default']()).initLinkAnalyzer(new _linkAnalyzer2['default'](url, this.options)).logger.info('Start to analyze pages for: => %s', url).info('It can be take a long time. Please wait ...');
this._addToQueue(url, { page: url });
}
/**
* onDone callback function
* @param {Statistic} statistic model instance
* @protected
*/
}, {
key: 'onDone',
value: function onDone(statistic) {
return statistic;
}
/**
* Makes request to given internal url
* @param {String} url - internal url (url that should be requested)
* @param {Object} advanced - object with advanced data
* @param {Number} attempt - number of request attempt
* @private
*/
}, {
key: '_checkInternalLink',
value: function _checkInternalLink(url, advanced) {
var _this2 = this;
var attempt = arguments.length <= 2 || arguments[2] === undefined ? 0 : arguments[2];
if (attempt === 0) {
this.model.addToActive(url);
}
_got2['default'].get(url, this._getRequestOptions(), function (error, data, res) {
if (error) {
if (!error.statusCode && attempt < _this2.options.getOption('requestRetriesAmount')) {
return _this2._checkInternalLink(url, advanced, ++attempt);
} else {
_this2.statistic.increaseInternalCount();
_this2.statistic.getBroken().add(url, advanced, error.statusCode);
_this2.logger.warn('Broken [%s] link: => %s on page: => %s', error.statusCode, advanced.href, advanced.page);
}
return _this2._onFinishLoad(url);
}
_this2.logger.debug('[%s] [%s] Receive [%s] for url: => %s', _this2.model.getPendingLength(), _this2.model.getActiveLength(), res ? res.statusCode : -1, url);
_this2.statistic.increaseInternalCount();
_this2.processLoadedDocument(new _modelDocument2['default'](url, data));
});
}
/**
* Checks given external link item
* @param {Object} item - external link item object
* @param {String} item.url - external link url
* @param {Object} item.advanced - external link advanced meta data object
* @param {Number} attempt - number of request attempt
* @returns {Promise}
* @private
*/
}, {
key: '_checkExternalLink',
value: function _checkExternalLink(item) {
var _this4 = this;
var attempt = arguments.length <= 1 || arguments[1] === undefined ? 0 : arguments[1];
var url = item.url,
advanced = item.advanced;
function ping() {
var _this3 = this;
return new _Promise(function (resolve) {
_got2['default'].head(url, _this3._getRequestOptions(), function (error, data, res) {
if (error) {
if (!error.statusCode && attempt < _this3.options.getOption('requestRetriesAmount') - 1) {
return resolve(false);
} else if (error.statusCode) {
_this3.statistic.getBroken().add(url, advanced, error.statusCode);
_this3.logger.warn('Broken [%s] link: => %s on page: => %s', error.statusCode, advanced.href, advanced.page);
}
}
_this3.logger.debug('[%s] [%s] Receive [%s] for url: => %s', _this3.model.getPendingLength(), _this3.model.getActiveLength(), res ? res.statusCode : -1, url);
_this3.statistic.increaseExternalCount();
resolve(true);
});
});
}
return ping.apply(this).then(function (result) {
return result || _this4._checkExternalLink(item, ++attempt);
});
}
/**
* Check all collected external links
* @returns {Promise}
* @private
*/
}, {
key: '_checkExternalLinks',
value: function _checkExternalLinks() {
var _this5 = this;
if (!this.model.areExternal()) {
return _Promise.resolve();
}
this.logger.info('Start to verify external links ...');
return (0, _lodash2['default'])(_Array$from(this.model.external)).map(function (item) {
return { url: item[0], advanced: item[1] };
}).chunk(100).value().reduce(function (prev, portion) {
return prev.then(function () {
return _Promise.all(portion.map(_this5._checkExternalLink.bind(_this5)));
});
}, _Promise.resolve());
}
/**
* Adds item to check queue
* @param {String} url - link url
* @param {Object} advanced - object with advanced data
* @private
*/
}, {
key: '_addToQueue',
value: function _addToQueue(url, advanced) {
url = url.replace(/\/$/, '');
if (this.model.addToProcessed(url)) {
this.model.isQueueFull(this.options.getOption('concurrent')) ? this.model.addToPending(url, advanced) : this._checkInternalLink(url, advanced);
}
}
/**
* Function which called after request to given url will be finished
* @param {String} url which was requested
* @return {*}
* @private
*/
}, {
key: '_onFinishLoad',
value: function _onFinishLoad(url) {
var _this6 = this;
this.model.removeFromActive(url);
if (!this.model.isQueueFull(this.options.getOption('concurrent'))) {
var next = this.model.removeFromPending();
if (next) {
this._checkInternalLink(next.url, next.advanced);
} else if (!this.model.areActive()) {
return this._checkExternalLinks().then(function () {
_this6.options.getOption('onDone')(_this6.statistic);
});
}
}
}
/**
* Returns request options hash
* @returns {{encoding: string, headers: *, timeout: *}}
* @private
*/
}, {
key: '_getRequestOptions',
value: function _getRequestOptions() {
return {
encoding: 'utf-8',
headers: this.options.getOption('requestHeaders'),
timeout: this.options.getOption('requestTimeout')
};
}
}, {
key: 'options',
get: function get() {
return this._options;
}
/**
* Returns logger instance
* @return {Logger} logger
*/
}, {
key: 'logger',
get: function get() {
return this._logger;
}
/**
* Returns model instance
* @return {Model} model
*/
}, {
key: 'model',
get: function get() {
return this._model;
}
/**
* Returns instance of LinkAnalyzer class
* @return {LinkAnalyzer} linkAnalyzer
*/
}, {
key: 'linkAnalyzer',
get: function get() {
return this._linkAnalyzer;
}
/**
* Returns instance of Statistic class
* @return {Statistic} statistic
*/
}, {
key: 'statistic',
get: function get() {
return this._statistic;
}
}], [{
key: 'DEFAULT',
get: function get() {
return {
mode: 'website',
concurrent: 100,
requestHeaders: { 'user-agent': 'node-spider' },
requestRetriesAmount: 5,
requestTimeout: 5000,
acceptedSchemes: ['http:', 'https:'],
checkExternalUrls: false,
excludeLinkPatterns: []
};
}
/**
* Returns application constants model
* @returns {Object}
* @static
*/
}, {
key: 'CONSTANTS',
get: function get() {
return {
URL_REGEXP: /https?\:\/\/\w+((\:\d+)?\/\S*)?/,
MODE: {
WEBSITE: 'website',
SECTION: 'section',
PAGE: 'page'
}
};
}
}]);
return Checker;
})(_base2['default']);
exports['default'] = Checker;
module.exports = exports['default'];