UNPKG

bs-broken-links-checker

Version:
454 lines (376 loc) 15.7 kB
'use strict';

var _get = require('babel-runtime/helpers/get')['default'];
var _inherits = require('babel-runtime/helpers/inherits')['default'];
var _createClass = require('babel-runtime/helpers/create-class')['default'];
var _classCallCheck = require('babel-runtime/helpers/class-call-check')['default'];
var _Promise = require('babel-runtime/core-js/promise')['default'];
var _Array$from = require('babel-runtime/core-js/array/from')['default'];
var _interopRequireDefault = require('babel-runtime/helpers/interop-require-default')['default'];

Object.defineProperty(exports, '__esModule', { value: true });

var _lodash = require('lodash');
var _lodash2 = _interopRequireDefault(_lodash);
var _got = require('got');
var _got2 = _interopRequireDefault(_got);
var _base = require('./base');
var _base2 = _interopRequireDefault(_base);
var _basedOption = require('./based-option');
var _basedOption2 = _interopRequireDefault(_basedOption);
var _modelModel = require('./model/model');
var _modelModel2 = _interopRequireDefault(_modelModel);
var _modelDocument = require('./model/document');
var _modelDocument2 = _interopRequireDefault(_modelDocument);
var _modelStatistic = require('./model/statistic');
var _modelStatistic2 = _interopRequireDefault(_modelStatistic);
var _linkAnalyzer = require('./link-analyzer');
var _linkAnalyzer2 = _interopRequireDefault(_linkAnalyzer);

// Remove the socket cap on the shared global HTTP/HTTPS agents; the number of
// simultaneous requests is limited by the `concurrent` option instead.
require('http').globalAgent.maxSockets = Infinity;
require('https').globalAgent.maxSockets = Infinity;

// SECURITY NOTE(review): this turns off TLS certificate verification for the
// WHOLE process, not only for requests made by this module. It is kept as-is
// because hosts of this module may rely on sites with invalid certificates
// still being crawled; confirm whether it can be scoped to the crawler requests.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = "0";

var Checker = (function (_Base) {
    _inherits(Checker, _Base);

    /**
     * Broken links checker. Crawls internal pages starting from a given url,
     * collects external links on the way and verifies them at the end.
     *
     * @param {Object} [options] - configuration object
     * @param {String} [options.mode] - mode of checking ("website", "section" or "page")
     * @param {Number} [options.concurrent] - number of concurrent requests
     * @param {Object} [options.requestHeaders] - custom request headers for crawler requests
     * @param {Number} [options.requestRetriesAmount] - number of attempts for a request if it fails at first
     * @param {Number} [options.requestTimeout] - request timeout (in milliseconds)
     * @param {Function} [options.onDone] - custom done handler; defaults to this.onDone bound to the instance
     * @param {String[]} [options.acceptedSchemes] - array of accepted url schemes
     * @param {Boolean} [options.checkExternalUrls] - set `true` to check outer links
     * @param {RegExp[]} [options.excludeLinkPatterns] - array of regular expressions. Urls that match
     * these regular expressions are excluded from verification
     * @constructor
     */
    function Checker() {
        var options = arguments.length <= 0 || arguments[0] === undefined ? {} : arguments[0];

        _classCallCheck(this, Checker);

        // Call the base class constructor with the options and this module object.
        _get(Object.getPrototypeOf(Checker.prototype), 'constructor', this).call(this, options, module);
        this.logger.info('Initialize crawler instance');

        /**
         * Checker options
         * @type {BasedOptions}
         */
        this._options = new _basedOption2['default']();

        // Each setOption call receives the user options, the option name and the
        // value from DEFAULT (presumably used as a fallback - see ./based-option).
        var def = this.constructor.DEFAULT;
        this.options
            .setOption(options, 'mode', def.mode)
            .setOption(options, 'concurrent', def.concurrent)
            .setOption(options, 'requestHeaders', def.requestHeaders)
            .setOption(options, 'requestTimeout', def.requestTimeout)
            .setOption(options, 'acceptedSchemes', def.acceptedSchemes)
            .setOption(options, 'checkExternalUrls', def.checkExternalUrls)
            .setOption(options, 'excludeLinkPatterns', def.excludeLinkPatterns)
            .setOption(options, 'requestRetriesAmount', def.requestRetriesAmount)
            .setOption(options, 'onDone', this.onDone.bind(this));
    }

    _createClass(Checker, [
        /**
         * Sets model instance
         * @param {Model} model instance
         * @return {Checker}
         */
        {
            key: 'initModel',
            value: function initModel(model) {
                this._model = model;
                return this;
            }
        },

        /**
         * Sets linkAnalyzer instance
         * @param {LinkAnalyzer} linkAnalyzer
         * @return {Checker}
         */
        {
            key: 'initLinkAnalyzer',
            value: function initLinkAnalyzer(linkAnalyzer) {
                this._linkAnalyzer = linkAnalyzer;
                return this;
            }
        },

        /**
         * Sets Statistic instance
         * @param {Statistic} statistic
         * @return {Checker}
         */
        {
            key: 'initStatistic',
            value: function initStatistic(statistic) {
                this._statistic = statistic;
                return this;
            }
        },

        /**
         * Processes a loaded document: walks over every <a> element that has an
         * href, resolves the href (without its #fragment) against the document
         * and either stores the link as external or queues it for crawling.
         * Finally marks the document url as finished.
         * @param {Document} document - document model
         * @param {String} document.url - url of the loaded document
         * @param {Function} document.$ - selector function used to query the document
         * @param {Function} document.resolve - resolves an href to a url
         * @protected
         */
        {
            key: 'processLoadedDocument',
            value: function processLoadedDocument(document) {
                var self = this;
                var documentUrl = document.url;
                var $ = document.$;

                $('a').each(function () {
                    var href = $(this).attr('href');
                    if (!href) {
                        return;
                    }

                    var url = document.resolve(href.split('#')[0]);
                    if (self.linkAnalyzer.isNeedToSkipUrl(url, documentUrl)) {
                        return;
                    }

                    if (self.linkAnalyzer.isExternal(url)) {
                        self.model.addToExternal(url, documentUrl, href);
                    } else {
                        self._addToQueue(url, { page: documentUrl, href: href });
                    }
                });

                this._onFinishLoad(documentUrl);
            }
        },

        /**
         * Starts to crawl pages for the given url. Creates fresh Statistic, Model
         * and LinkAnalyzer instances and puts the url into the check queue.
         * @param {String} url - initial site url for start
         * @throws Error if the url is empty or does not match CONSTANTS.URL_REGEXP
         * @public
         */
        {
            key: 'start',
            value: function start(url) {
                if (!url) {
                    throw new Error('Url was not set');
                }

                // The typeof guard makes non-string values (which have no `match`
                // method) fail with the validation error instead of a TypeError.
                if (typeof url !== 'string' || !url.match(this.constructor.CONSTANTS.URL_REGEXP)) {
                    throw new Error('Urls is not valid');
                }

                this
                    .initStatistic(new _modelStatistic2['default']())
                    .initModel(new _modelModel2['default']())
                    .initLinkAnalyzer(new _linkAnalyzer2['default'](url, this.options))
                    .logger
                    .info('Start to analyze pages for: => %s', url)
                    .info('It can be take a long time. Please wait ...');

                this._addToQueue(url, { page: url });
            }
        },

        /**
         * Default onDone callback function; simply returns the statistic it receives.
         * @param {Statistic} statistic model instance
         * @returns {Statistic}
         * @protected
         */
        {
            key: 'onDone',
            value: function onDone(statistic) {
                return statistic;
            }
        },

        /**
         * Makes a GET request to the given internal url. On success the response
         * body is wrapped into a Document and processed. On failure without a
         * status code the request is repeated while `attempt` is lower than the
         * "requestRetriesAmount" option; otherwise the link is recorded as broken.
         * @param {String} url - internal url that should be requested
         * @param {Object} advanced - object with advanced data
         * @param {String} advanced.page - url of the page where the link was found
         * @param {String} [advanced.href] - original href value of the link
         * @param {Number} [attempt] - number of request attempt (0 by default)
         * @private
         */
        {
            key: '_checkInternalLink',
            value: function _checkInternalLink(url, advanced) {
                var self = this;
                var attempt = arguments.length <= 2 || arguments[2] === undefined ? 0 : arguments[2];

                // The url is registered as active only once, not on every retry.
                if (attempt === 0) {
                    this.model.addToActive(url);
                }

                _got2['default'].get(url, this._getRequestOptions(), function (error, data, res) {
                    if (error) {
                        if (!error.statusCode && attempt < self.options.getOption('requestRetriesAmount')) {
                            return self._checkInternalLink(url, advanced, ++attempt);
                        }

                        self.statistic.increaseInternalCount();
                        self.statistic.getBroken().add(url, advanced, error.statusCode);
                        self.logger.warn('Broken [%s] link: => %s on page: => %s',
                            error.statusCode, advanced.href, advanced.page);

                        return self._onFinishLoad(url);
                    }

                    self.logger.debug('[%s] [%s] Receive [%s] for url: => %s',
                        self.model.getPendingLength(), self.model.getActiveLength(),
                        res ? res.statusCode : -1, url);
                    self.statistic.increaseInternalCount();
                    self.processLoadedDocument(new _modelDocument2['default'](url, data));
                });
            }
        },

        /**
         * Checks the given external link item with a HEAD request. A failure
         * without a status code is retried while `attempt` is lower than
         * "requestRetriesAmount" - 1; a failure with a status code is recorded
         * as broken.
         * @param {Object} item - external link item object
         * @param {String} item.url - external link url
         * @param {Object} item.advanced - external link advanced meta data object
         * @param {Number} [attempt] - number of request attempt (0 by default)
         * @returns {Promise} resolved with `true` once the link check is finished
         * @private
         */
        {
            key: '_checkExternalLink',
            value: function _checkExternalLink(item) {
                var self = this;
                var attempt = arguments.length <= 1 || arguments[1] === undefined ? 0 : arguments[1];
                var url = item.url;
                var advanced = item.advanced;

                // Resolves with `false` when the request has to be repeated and
                // with `true` when the check of this link is finished.
                function ping() {
                    return new _Promise(function (resolve) {
                        _got2['default'].head(url, self._getRequestOptions(), function (error, data, res) {
                            if (error) {
                                if (!error.statusCode &&
                                    attempt < self.options.getOption('requestRetriesAmount') - 1) {
                                    return resolve(false);
                                } else if (error.statusCode) {
                                    self.statistic.getBroken().add(url, advanced, error.statusCode);
                                    self.logger.warn('Broken [%s] link: => %s on page: => %s',
                                        error.statusCode, advanced.href, advanced.page);
                                }
                                // NOTE(review): an error without a status code that survives all
                                // retries is NOT added to the broken list here, unlike in
                                // _checkInternalLink - confirm that this is intended.
                            }

                            self.logger.debug('[%s] [%s] Receive [%s] for url: => %s',
                                self.model.getPendingLength(), self.model.getActiveLength(),
                                res ? res.statusCode : -1, url);
                            self.statistic.increaseExternalCount();
                            resolve(true);
                        });
                    });
                }

                return ping().then(function (result) {
                    return result || self._checkExternalLink(item, ++attempt);
                });
            }
        },

        /**
         * Checks all collected external links. Links are processed in portions
         * of 100 items: portions run one after another, links inside a portion
         * are requested in parallel.
         * @returns {Promise}
         * @private
         */
        {
            key: '_checkExternalLinks',
            value: function _checkExternalLinks() {
                var self = this;

                if (!this.model.areExternal()) {
                    return _Promise.resolve();
                }

                this.logger.info('Start to verify external links ...');

                return (0, _lodash2['default'])(_Array$from(this.model.external))
                    .map(function (item) {
                        // Every entry of model.external is read as a [url, advanced] pair.
                        return { url: item[0], advanced: item[1] };
                    })
                    .chunk(100)
                    .value()
                    .reduce(function (prev, portion) {
                        return prev.then(function () {
                            // Pass ONLY the item. Array#map also supplies the element index,
                            // which would otherwise be taken as the `attempt` argument and
                            // shorten or disable the retries of that link.
                            return _Promise.all(portion.map(function (item) {
                                return self._checkExternalLink(item);
                            }));
                        });
                    }, _Promise.resolve());
            }
        },

        /**
         * Adds an item to the check queue. A single trailing slash is stripped
         * from the url first. When model.addToProcessed returns a falsy value
         * (presumably: the url was seen before) nothing else happens; otherwise
         * the url is requested right away or, if the queue is full, put on hold.
         * @param {String} url - link url
         * @param {Object} advanced - object with advanced data
         * @private
         */
        {
            key: '_addToQueue',
            value: function _addToQueue(url, advanced) {
                var normalizedUrl = url.replace(/\/$/, '');

                if (!this.model.addToProcessed(normalizedUrl)) {
                    return;
                }

                if (this.model.isQueueFull(this.options.getOption('concurrent'))) {
                    this.model.addToPending(normalizedUrl, advanced);
                } else {
                    this._checkInternalLink(normalizedUrl, advanced);
                }
            }
        },

        /**
         * Function which is called after the request to the given url is finished.
         * Removes the url from the active list and, if the queue has room, starts
         * the next pending item. When nothing is pending and nothing is active,
         * external links are checked and the "onDone" handler receives the statistic.
         * @param {String} url which was requested
         * @return {Promise|undefined} promise only when the final stage was started
         * @private
         */
        {
            key: '_onFinishLoad',
            value: function _onFinishLoad(url) {
                var self = this;

                this.model.removeFromActive(url);

                if (this.model.isQueueFull(this.options.getOption('concurrent'))) {
                    return;
                }

                var next = this.model.removeFromPending();
                if (next) {
                    this._checkInternalLink(next.url, next.advanced);
                } else if (!this.model.areActive()) {
                    return this._checkExternalLinks().then(function () {
                        self.options.getOption('onDone')(self.statistic);
                    });
                }
            }
        },

        /**
         * Returns request options hash
         * @returns {{encoding: string, headers: *, timeout: *}}
         * @private
         */
        {
            key: '_getRequestOptions',
            value: function _getRequestOptions() {
                return {
                    encoding: 'utf-8',
                    headers: this.options.getOption('requestHeaders'),
                    timeout: this.options.getOption('requestTimeout')
                };
            }
        },

        /**
         * Getter function for options
         * @returns {BasedOptions}
         */
        {
            key: 'options',
            get: function get() {
                return this._options;
            }
        },

        /**
         * Returns logger instance (reads this._logger, which is not assigned in this file)
         * @return {Logger} logger
         */
        {
            key: 'logger',
            get: function get() {
                return this._logger;
            }
        },

        /**
         * Returns model instance
         * @return {Model} model
         */
        {
            key: 'model',
            get: function get() {
                return this._model;
            }
        },

        /**
         * Returns instance of LinkAnalyzer class
         * @return {LinkAnalyzer} linkAnalyzer
         */
        {
            key: 'linkAnalyzer',
            get: function get() {
                return this._linkAnalyzer;
            }
        },

        /**
         * Returns instance of Statistic class
         * @return {Statistic} statistic
         */
        {
            key: 'statistic',
            get: function get() {
                return this._statistic;
            }
        }
    ], [
        /**
         * Returns application default options (a new object on every access)
         * @returns {Object}
         * @static
         */
        {
            key: 'DEFAULT',
            get: function get() {
                return {
                    mode: 'website',
                    concurrent: 100,
                    requestHeaders: { 'user-agent': 'node-spider' },
                    requestRetriesAmount: 5,
                    requestTimeout: 5000,
                    acceptedSchemes: ['http:', 'https:'],
                    checkExternalUrls: false,
                    excludeLinkPatterns: []
                };
            }
        },

        /**
         * Returns application constants model. URL_REGEXP is not anchored, so it
         * accepts any string that contains an http(s) url.
         * @returns {Object}
         * @static
         */
        {
            key: 'CONSTANTS',
            get: function get() {
                return {
                    URL_REGEXP: /https?\:\/\/\w+((\:\d+)?\/\S*)?/,
                    MODE: {
                        WEBSITE: 'website',
                        SECTION: 'section',
                        PAGE: 'page'
                    }
                };
            }
        }
    ]);

    return Checker;
})(_base2['default']);

exports['default'] = Checker;
module.exports = exports['default'];