// elsewhere
// A node project that aims to replicate the functionality of the Google Social Graph API
var urlParser = require('url'),
_ = require('underscore')._,
wf = require('webfinger'),
Page = require('./page.js').Page,
fn = require('./functions.js'),
internalCache = require('./cache.js'),
internalLogger = require('./logger.js'),
internalOptions = require('./options.js');
_.mixin(require('underscore.Deferred'));
// The Grapher object uses Page objects to scrape URLs for
// rel=me links.
// Each child page contains a reference to the grapher that
// created it.
function Grapher (url, options) {
this.rootUrl = url;
this.pages = {};
this.crawlCount = -1;
this.domainCounts = [];
this.options = options || {};
mergeOptions(this.options);
}
Grapher.prototype = {
constructor: Grapher,
// Primary method of the grapher. Fetches the page at the
// root URL and all subsequent pages. Calls the callback
// parameter when complete.
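// A minimal usage sketch (URL hypothetical; an empty options
// object means the defaults from options.js are merged in):
//
//   var grapher = new Grapher('http://example.com', {});
//   grapher.build(function (err, graph) {
//     if (!err) console.log(graph.toJSON());
//   });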
build: function (callback) {
var logger = this.options.logger,
startedParse = new Date();
// log the start of the graph build
logger.info('elsewhere started with url: ' + this.rootUrl);
var rootPage = new Page(this.rootUrl, this, this.options, undefined, 0);
rootPage.verified = true;
this.pages[this.rootUrl] = rootPage;
this.fetchPages(function (err, graph) {
// log the end of the graph build
var requestTimes = _.pluck(graph.pages, 'requestTime'),
total = 0;
_.each(requestTimes, function (x) {
total += x;
});
logger.info('total html request time: ' + total + 'ms');
var ms = new Date().getTime() - startedParse.getTime();
logger.info('total time taken: ' + ms + 'ms');
callback(err, graph);
});
},
// Fetches each unfetched page in the `this.pages` array.
// When every page has been fetched, executes callback().
fetchPages: function (callback) {
var self = this,
whenPageIsFetched,
findUnfetchedPages;
whenPageIsFetched = function (page) {
// if the root url errors, stop the whole parse by passing the error to the callback
if (page && page.status === "errored" && page.url === self.rootUrl) {
callback(page.errorMsg, self);
} else {
if (self.allFetched()) {
// finished fetching all pages, execute callback.
self.verifyPages();
self.logFetched();
callback(null, self);
} else {
// some pages haven't been fetched yet, execute self again.
findUnfetchedPages();
}
}
};
findUnfetchedPages = function(){
_.each(self.pages, function (page) {
if (page.status === "unfetched") {
if (page.level <= self.options.crawlLimit) {
page.fetch(whenPageIsFetched);
} else {
self.options.logger.log('over the crawl limit: ' + page.url + ' - page level: ' + page.level);
self.pages[page.url].status = "dontfetch";
whenPageIsFetched();
}
}
});
};
findUnfetchedPages();
},
replaceAliases: function () {
var didSomeReplacing = false;
_.each(this.pages, function (page, key, pages) {
page.aliases.forEach(function (alias) {
_.each(pages, function (pg, k, pgs) {
pg.links.forEach(function (link, i) {
if (link === alias) {
pg.links[i] = page.url;
didSomeReplacing = true;
}
});
});
});
});
if (didSomeReplacing) {
var newPages = {};
_.each(this.pages, function (page) {
newPages[page.url] = page;
});
this.pages = newPages;
}
},
// A recursive function that checks whether all pages are
// verified and attempts to verify those that aren't by
// checking them for links to pages that are. Returns true
// if at least one page was verified across its passes,
// otherwise false.
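// e.g. with the root page A verified: a page B that links to A is
// verified directly, and longer chains such as D -> C -> B -> A are
// picked up as the recursive passes verify each intermediate page.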
verifyPages: function (verified) {
var self = this,
verifiedStuff = false;
_.each(this.pages, function (page) {
if (!page.verified) {
page.verified = page.links.some(function (link) {
var linkedPage = self.getPage(link);
if (linkedPage !== undefined) {
if (linkedPage.verified) {
return true;
} else {
return linkedPage.links.some(function (sublink) {
var subLinkedPage = self.pages[sublink];
return subLinkedPage !== undefined && subLinkedPage.verified;
});
}
}
});
if (page.verified) {
verifiedStuff = true;
page.level = 0;
}
}
});
if (verifiedStuff) {
return self.verifyPages(true);
} else {
return verified === undefined ? false : verified;
}
},
// gets a page from the pages object, ignoring protocol, www prefix and trailing slashes
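// e.g. getPage('https://www.example.com/') will match an entry
// stored under 'http://example.com' (URL hypothetical).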
getPage: function (url) {
if (url.indexOf('http') === 0) {
var stripped = url.substring(url.match(/^https?:\/\//)[0].length),
trimmed = fn.url.removeTrailingSlash(stripped),
nowww = fn.url.removeWWW(trimmed),
http = "http://" + trimmed,
https = "https://" + trimmed,
httpwww = "http://www." + nowww,
httpnowww = "http://" + nowww,
httpswww = "https://www." + nowww,
httpsnowww = "https://" + nowww;
return this.pages[http] || this.pages[http + "/"] ||
this.pages[https] || this.pages[https + "/"] ||
this.pages[httpwww] || this.pages[httpwww + "/"] ||
this.pages[httpswww] || this.pages[httpswww + "/"] ||
this.pages[httpnowww] || this.pages[httpnowww + "/"] ||
this.pages[httpsnowww] || this.pages[httpsnowww + "/"];
} else {
return undefined;
}
},
// returns an object representing the Graph, its pages and stats
// pertaining to its construction.
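// Sketch of the returned shape (field values illustrative):
//
//   { results: [ /* page literals */ ],
//     query: 'http://example.com',
//     created: '2013-01-01T00:00:00.000Z',
//     crawled: 12,
//     verified: 5 }
//
// plus a `warnings` array when any page raised one.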
toLiteral: function (err) {
var self = this,
results = [],
warnings = [],
rtnObj = {},
verifiedCount = 0,
doneTime = new Date(),
truthTest, propList, pages;
if (self.options.strict) {
propList = ['url', 'title', 'favicon', 'links', 'inboundCount', 'aliases'];
truthTest = function (page) {
return page.verified && page.status === "fetched";
}
} else {
propList = ['url', 'title', 'favicon', 'links', 'verified', 'inboundCount', 'aliases'];
truthTest = function (page) {
return page.status === "fetched";
}
}
pages = _.filter(self.pages, truthTest);
if (self.options.stripDeeperLinks) {
pages = self.stripDuplicates(pages);
}
results = _.map(pages, function (page) {
return page.toLiteral(propList);
});
for (var key in self.pages) {
var page = self.pages[key];
if(page.getWarning() !== null){
warnings.push(page.getWarning());
}
}
verifiedCount = _.reduce(_.pluck(self.pages, 'verified'),
function (m, v) { return m + (v ? 1 : 0); }, 0);
if (err) {
rtnObj = {
query : self.rootUrl,
created : doneTime.toJSON(),
crawled : _.size(self.pages),
verified : 0
};
} else {
rtnObj = {
results : results,
query : self.rootUrl,
created : doneTime.toJSON(),
crawled : _.size(self.pages),
verified : verifiedCount
};
}
if (!err && warnings.length > 0) {
rtnObj.warnings = warnings;
}
return rtnObj;
},
toJSON: function () {
return JSON.stringify(this.toLiteral());
},
// used to remove duplicate pages on the same domain,
// keeping only the pages with the shallowest paths
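// e.g. given pages for http://example.com/ and
// http://example.com/about (URLs hypothetical), only the
// shallower http://example.com/ is kept.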
stripDuplicates: function (pages) {
var rtnObj = {};
_.each(pages, function (currPage) {
var currPageDomain = urlParser.parse(currPage.url).hostname,
sameDomainPages = [],
hasShorterPaths = false,
currPagePath, currPageDepth;
sameDomainPages = _.filter(pages, function (page) {
return currPageDomain === urlParser.parse(page.url).hostname && page.url !== currPage.url;
});
if (_.isEmpty(sameDomainPages)) {
rtnObj[currPage.url] = currPage;
} else {
currPagePath = fn.url.removeTrailingSlash(urlParser.parse(currPage.url).path);
currPageDepth = currPagePath.split('/').length;
hasShorterPaths = _.any(sameDomainPages, function (page) {
var pagePath = fn.url.removeTrailingSlash(urlParser.parse(page.url).path),
pageDepth = pagePath.split('/').length;
return pageDepth < currPageDepth;
});
if (!hasShorterPaths) {
rtnObj[currPage.url] = currPage;
}
}
});
return rtnObj;
},
// returns the grapher as JSON within a JSONP callback
// if one is provided, otherwise unencapsulated JSON is
// returned.
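// e.g. toJSONP('handleGraph') returns:
//
//   handleGraph({"results":[...],"query":"..."});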
toJSONP: function (callback) {
if (callback) {
return callback + '(' + this.toJSON() + ');';
} else {
return this.toJSON();
}
},
// Used by `Page.addPages` to work out if a page or related
// page has already been fetched.
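// e.g. alreadyUsed('http://example.com') is true if an equivalent
// URL such as 'http://example.com/' is already in `this.pages`
// (equivalence as decided by fn.sameUrl).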
alreadyUsed: function (url) {
if (this.pages[url]) {
return true;
} else {
var oldUrls = _.pluck(this.pages, 'url');
return _.any(oldUrls, function (oldUrl) {
return fn.sameUrl(url, oldUrl);
});
}
},
// Used by `Page.addPages` to work out if a page request
// is from a domain which has already hit its limit
aboveDomainLimit: function (url) {
var hostname = urlParser.parse(url).hostname;
hostname = fn.url.removeWWW(hostname);
var i = this.domainCounts.length;
while (i--) {
if (this.domainCounts[i].hostname === hostname) {
return this.domainCounts[i].count > this.options.domainLimit;
}
}
return false;
},
// Increments the count for a domain in domainCounts, adding
// an entry if the domain has not been seen before
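// Each entry in domainCounts is shaped like
// { hostname: 'example.com', count: 3 }; the hostname is
// stored without its www prefix (values illustrative).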
appendDomainCount: function (url) {
var i = this.domainCounts.length,
hostname = urlParser.parse(url).hostname,
found = false;
if(hostname){
hostname = fn.url.removeWWW(hostname);
while (i--) {
if(this.domainCounts[i].hostname === hostname){
this.domainCounts[i].count ++;
found = true;
}
}
if(!found){
this.domainCounts.push({
'hostname': hostname,
'count': 1
})
}
}
},
verifiedLink: function (url) {
return this.pages[url]
? this.pages[url].verified
: _.any(this.pages, function (page) {
return fn.sameUrl(page.url, url) && page.verified;
});
},
// Checks if the grapher has exceeded its
// crawl limit. Returns true if it has.
atCrawlLimit: function () {
var deepestLevel = 0;
_.each(this.pages, function (page) {
if (page.level > deepestLevel) deepestLevel = page.level;
});
return deepestLevel > this.options.crawlLimit;
},
// Checks the status of every Page in `this.pages`.
// Returns true once every page is in a terminal state:
// "fetched", "error", "errored" or "dontfetch".
allFetched: function () {
var statuses = _.pluck(this.pages, 'status');
return _.all(statuses, function (status) {
return status === "fetched" || status === "error" || status === "dontfetch" || status === "errored";
});
},
logFetched: function () {
var statuses = _.pluck(this.pages, 'status'),
fetchedCount = 0,
errorCount = 0,
dontFetchedCount = 0,
logger = this.options.logger;
_.each(statuses, function (s) {
fetchedCount += (s === "fetched" ? 1 : 0);
errorCount += (s === "error" ? 1 : 0);
dontFetchedCount += (s === "dontfetch" ? 1 : 0);
});
logger.info('total pages ' + statuses.length);
logger.info('total fetched ' + fetchedCount);
logger.info('total errors ' + errorCount);
logger.info('total pages outside limits ' + dontFetchedCount);
}
};
// Handles creating the Grapher and running build.
// Executes the callback parameter when the graph has
// been built, and also returns a promise that resolves
// at the same moment.
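// A usage sketch (URL hypothetical; both the callback and the
// promise styles are shown):
//
//   graph('http://example.com', function (err, result) {
//     console.log(result);
//   });
//
//   graph('http://example.com').then(function (err, result) {
//     console.log(result);
//   });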
function graph (url, options, callback) {
var deferred = _.Deferred(),
promise = deferred.promise(),
grapher;
if (arguments.length === 2 && _.isFunction(options)) {
callback = options;
options = {};
}
mergeOptions(options);
// test for email address or url
if (fn.trim(url) !== '') {
if (url.indexOf('@') > -1) {
options.logger.info('parsing webfinger address: ' + url);
getWebfinger (url, function(err, wfUrl) {
if (err === null) {
graphIt(wfUrl);
} else {
returnError('Could not find any pages with webfinger address: ' + url);
}
});
} else {
try {
graphIt(url);
}
catch (err) {
returnError(err);
}
}
} else {
returnError('Sorry no url given');
}
function graphIt (url) {
grapher = new Grapher(url, options);
grapher.build(function (err, graph) {
// if we have an error reformat object
if (callback) {
callback( err, graph.toLiteral(err) );
}
deferred.resolve(err, graph.toLiteral(err) );
});
}
function returnError (msg) {
var responseObj = {
"query": url,
"created": new Date().toJSON(),
"crawled": 0,
"verified": 0
};
options.logger.error(msg);
if (callback) { callback(msg, responseObj); }
deferred.resolve(msg, responseObj);
}
return promise;
}
// use webfinger to find a URL carrying XFN (rel=me) data
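// e.g. getWebfinger('user@example.com', function (err, url) { ... })
// passes along the href of the first link whose rel is
// 'http://gmpg.org/xfn/11' (address hypothetical).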
function getWebfinger (address, callback) {
try {
wf.webfinger(address, function (msg, data) {
if (data && data.links) {
var objs = data.links,
i = objs.length,
found = false,
x = 0;
while (x < i) {
// if we find a page that contains XFN, i.e. a possible rel=me
if (objs[x].rel === 'http://gmpg.org/xfn/11') {
callback(null, objs[x].href);
found = true;
break;
}
x++;
}
if (!found) callback('No XFN data in webfinger request', null);
} else {
callback('No data from webfinger request', null);
}
});
}
catch (err) {
callback(err, null);
}
}
// merges passed and default options
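// e.g. mergeOptions({ crawlLimit: 2 }) keeps crawlLimit at 2 but
// fills in cache, logger and every other default from options.js
// (the crawlLimit value is illustrative).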
function mergeOptions (options) {
// add interface for cache and logger
options.cache = options.cache || internalCache;
options.logger = options.logger || internalLogger;
// single level clone of missing properties
for (var key in internalOptions) {
if (internalOptions.hasOwnProperty(key)) {
if (!options.hasOwnProperty(key)) {
options[key] = internalOptions[key];
}
}
}
// set options within cache and logger objects
if(options.logger.setLogLevel){
options.logger.setLogLevel( options.logLevel );
}
if(options.cache.setCacheLimits){
options.cache.setCacheLimits( options.cacheTimeLimit, options.cacheItemLimit, options.logger );
}
}
exports.Grapher = Grapher;
exports.graph = graph;
exports.cache = internalCache;
exports.options = internalOptions;
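// Example of using the module from another script (a sketch; assumes
// the package is installed under the name 'elsewhere'):
//
//   var elsewhere = require('elsewhere');
//
//   elsewhere.graph('http://example.com', { strict: true },
//     function (err, result) {
//       if (err) { console.error(err); return; }
//       console.log(JSON.stringify(result));
//     });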