links-extractor
Version:
A link extractor library
195 lines (175 loc) • 5.74 kB
JavaScript
var phantom = require('phantom');
var jsdom = require("jsdom");
var _ = require("lodash");
var Q = require("q");
/**
* Sanitize html
*/
var sanitizeHtml = function(html){
html = html.replace(/\\n|\\t/g,""); //remove weird pseudo new lines and tabs
html = html.replace(/\\"/g,'"'); //replace werid escaped quotes with real quotes
html = html.substr(1); //remove first quote character
html = html.substr(0, html.length - 1); //remove last quote character
return html;
};
/**
* Export module as a node module
* @example
* var linkExtractor = require('PATH/TO/linkextractor');
* var myLinkExtractor = linkExtractor.create({
* siteRoot: 'http://my-domain.com',
* debug: true
* }).getLinks()...
*/
exports.create = function (options) {
return new LinkExtractor(options);
};
/**
* Initialize LinkExtractor
* @constructor
*/
var LinkExtractor = function (options) {
if(_.isBoolean(options.debug)) {
this.debug_level = options.debug;
}
if (_.isFunction(options.filter)) {
this.filter = options.filter;
}
this.siteRoot = options.siteRoot || false;
this.level = _.isNumber(options.level) ? options.level : 3;
this.avoidUrl = _.isArray(options.avoidUrl) ? options.avoidUrl : [];
return this;
};
/** @type {String} The site root url */
LinkExtractor.prototype.siteRoot = "";
/** @type {boolean} define current state of debug displayed */
LinkExtractor.prototype.debug_level = false;
/** @type {Array} Links array */
LinkExtractor.prototype.links = [];
/** @type {Array} Links array */
LinkExtractor.prototype.filter = false;
/** @type {Array} url to avoid */
LinkExtractor.prototype.avoidUrl = [];
/** @type {Promise} links promise to know when links array was builded */
LinkExtractor.prototype.linksPromise = Q.defer();
/** @type {Phantom} phantom instance */
LinkExtractor.prototype._ph = null;
/**
* Wrapper to enable/disable debuf
*/
LinkExtractor.prototype.debug = function () {
if (this.debug_level) {
console.log.apply(console, arguments);
}
};
/**
* Wrapper to manage phantom instance
*/
LinkExtractor.prototype.createPhantomInstance = function () {
if (!_.isNull(this._ph)) {
return Q.when(this._ph);
}
var deferred = Q.defer();
var self = this;
phantom.create(function(ph) {
self._ph = ph;
deferred.resolve(self);
});
return deferred.promise;
};
/**
* Start point to extract urls
*/
LinkExtractor.prototype.getLinks = function () {
this.linksPromise = Q.defer();
var self = this;
this
.createPhantomInstance()
.then(function (ph) {
self.parseHtmlPage(self.siteRoot);
});
return this.linksPromise.promise;
};
/**
* Open the url and retrieve HTML through phantomjs
*/
LinkExtractor.prototype.parseHtmlPage = function (url) {
var deferred = Q.defer();
var self = this;
this
.createPhantomInstance()
.then(function (ph) {
ph.createPage(function(page) { //does things in parallel?
page.open(url, function(status) {
page.evaluate(function () {
return JSON.stringify(document.all[0].outerHTML);
}, function (html) {
self.buildLinksFromHTML(sanitizeHtml(html), url);
page.close();
});
});
});
});
return deferred.promise;
};
/**
* Parse all internal links from given HTML and check if links are new to push them into links array
*/
LinkExtractor.prototype.buildLinksFromHTML = function (html, url) {
var self = this;
var new_links = false;
jsdom.env(
html,
["http://code.jquery.com/jquery.js"],
function (errors, window) {
var links = window.$("a[href^='"+self.siteRoot+"'], a[href^='/'], a[href^='./'], a[href^='../'], a[href^='#']");
_.forEach(links, function (v) {
var stringLink = window.$(v).attr('href');
if (_.isString(stringLink) && stringLink !== "" && // Valid URL
_.indexOf(self.avoidUrl, stringLink) === -1 && // Check if not in avoid url
( (_.isFunction(self.filter) && self.filter(stringLink)) || self.filter === false) && // Check if filter exist and pass!
_.isUndefined(_.find(self.links, {url: stringLink})) // Check if already exist in links array
) {
self.links.push({
url: stringLink,
visited: false
});
new_links = true;
}
});
self.debug('VISIT link: ' + url + ' - ' + new_links);
self.parseArray();
}
);
};
/**
* Parse current links array and stop parse when discover a unvisited link.
*/
LinkExtractor.prototype.parseArray = function () {
this.debug('external_links size (' + this.links.length + ')');
var self = this;
var parsedLink = 0;
_.forEach(this.links, function (link) {
parsedLink++;
if (!link.visited) {
link.visited = true;
var curUrl = self.siteRoot + link.url;
self.debug('Start open ' + curUrl);
self.parseHtmlPage(curUrl);
return false;
}
});
// If we reach this part, we discover all links
if (parsedLink === this.links.length) {
this.debug('Finish!');
this.linksPromise.resolve(_.map(this.links, 'url'));
this.destroy();
}
};
/**
* Destroy LinkExtractor instances
*/
LinkExtractor.prototype.destroy = function () {
this._ph.exit();
this._ph = null;
};