ninjs-html
Version:
web crawling engine using cheerio for html/dom manipulation and templating
350 lines (271 loc) • 8.08 kB
JavaScript
/**
* NINJS-CORE Dom - Document Object Model
* cheerio wrapper / html dom _
*/
var _ = require("lodash"),
async = require("async"),
request = require("request"),
cheerio = require("cheerio");
// Public surface of the module: a fresh object holding every cheerio export,
// overlaid with the helpers declared further down in this file. The helpers
// are function declarations, so they are hoisted and may be referenced here
// ahead of their definitions. `url` is deliberately absent from the table and
// therefore stays module-private.
var api = {
  // type check
  isElem: isElem,
  // loaders
  xmlFile: xmlFile,
  file: file,
  get: get,
  // page-level extractors
  page: page,
  metas: metas,
  meta: meta,
  links: links,
  jslinks: jslinks,
  images: images,
  frames: frames,
  // list clean-up helpers
  cleanLinks: cleanLinks,
  cleanJsLinks: cleanJsLinks,
  cleanImages: cleanImages,
  cleanFrames: cleanFrames,
  // selector helpers
  elem: elem,
  elemAttr: elemAttr,
  elemHtml: elemHtml,
  mapTexts: mapTexts
};
exports = module.exports = _.assign({}, cheerio, api);
// Tells whether `val` is a cheerio object.
// Falsy input short-circuits to `false`; anything else is tested with
// `instanceof` against the cheerio export, so the result is always a boolean.
function isElem(val) {
  if (!val) return false;
  return val instanceof cheerio;
}
// Reads the file at `src` and parses its content with cheerio in XML mode.
//   src      - location handed to `_.readFile`
//   callback - receives the outcome: read and parse errors are forwarded
//              through `_.fail`, the parsed document through `_.done`
function xmlFile(src, callback) {
  _.readFile(src, function (err, result) {
    if (err) return _.fail(err, callback);
    var $;
    try {
      // no bare `load` exists in this module; the parser is the one exposed
      // on the cheerio export
      $ = cheerio.load(result, { xmlMode: true });
    } catch (e) {
      return _.fail(e, callback);
    }
    // delivered outside the try so that an exception thrown by the callback
    // is not caught here and reported back to that same callback
    _.done($, callback);
  });
}
// Reads the file at `src` and parses its content with cheerio using the
// parser's default options.
//   src      - location handed to `_.readFile`
//   callback - receives the outcome: read and parse errors are forwarded
//              through `_.fail`, the parsed document through `_.done`
function file(src, callback) {
  _.readFile(src, function (err, result) {
    if (err) return _.fail(err, callback);
    var $;
    try {
      // no bare `load` exists in this module; the parser is the one exposed
      // on the cheerio export
      $ = cheerio.load(result);
    } catch (e) {
      return _.fail(e, callback);
    }
    // delivered outside the try so that an exception thrown by the callback
    // is not caught here and reported back to that same callback
    _.done($, callback);
  });
}
// Fetches `ourl` with `request.get` and parses the response body with cheerio.
// Module-private: this function is not part of the exported API table.
//   ourl     - address handed to `request.get`
//   callback - receives the outcome: request and parse errors are forwarded
//              through `_.fail`, the parsed document through `_.done`
// The response object itself (status code, headers) is not inspected.
function url(ourl, callback) {
  request.get(ourl, function (err, res, body) {
    if (err) return _.fail(err, callback);
    var $;
    try {
      // no bare `load` exists in this module; the parser is the one exposed
      // on the cheerio export
      $ = cheerio.load(body);
    } catch (e) {
      // same reporting path as the request error above
      return _.fail(e, callback);
    }
    // delivered outside the try so that an exception thrown by the callback
    // is not caught here and reported back to that same callback
    _.done($, callback);
  });
}
// Loads a document from whichever location `options` names.
// `options.src` is considered first, then `options.url`; the first one that is
// a non-empty string wins. A winner starting with "http" is fetched with
// `url`, anything else is read with `file`. When neither qualifies the
// failure is reported through `_.fail`.
// Returns whatever the chosen loader (or `_.fail`) returns.
function get(options, callback) {
  var candidates = [_.get(options, "src"), _.get(options, "url")];
  for (var i = 0; i < candidates.length; i++) {
    var target = candidates[i];
    if (!target || !_.isString(target)) continue;
    var loader = target.indexOf("http") === 0 ? url : file;
    return loader(target, callback);
  }
  return _.fail("No src or url provided", callback);
}
// Builds a summary of one page: title, url, meta map, links, javascript
// links, images and frames.
//   options  - merged over the defaults { src, url, dest } via `_.mcopy`
//              (`_.cb` / `_.notcb` presumably sort out a callback-only call;
//              they are defined outside this file)
//   callback - called with (err) or (null, summary)
// The document is loaded once here for the title; each collector
// (meta, links, jslinks, images, frames) is then run through `async.parallel`
// with the same options.
function page(options, callback) {
  var done = _.cb(arguments);
  var settings = _.mcopy({ src: "", url: "", dest: "" }, _.notcb(options, {}));
  var ourl = _.get(settings, "url");

  get(settings, function (err, $) {
    if (err) return done(err);

    var title = $("head title").first().text();
    var collectors = {
      meta: async.apply(meta, settings),
      links: async.apply(links, settings),
      jslinks: async.apply(jslinks, settings),
      images: async.apply(images, settings),
      frames: async.apply(frames, settings)
    };

    async.parallel(collectors, function (err, collected) {
      if (err) return done(err);
      // start from the empty shape, lay the collected parts over it, then
      // pin the title and url
      var summary = _.extend(
        { title: "", url: "", meta: {}, links: [], jslinks: [], images: [], frames: [] },
        collected,
        { title: title, url: ourl }
      );
      done(null, summary);
    });
  });
}
// Collects the attributes of the elements matched by the "meta" selector
// (the selector is only a default; a `sel` supplied in `options` is merged
// over it by `_.mcopy`). The work is delegated to `elemAttr`, whose return
// value is passed straight back.
//   options  - merged over the defaults { src, url, sel: "meta" }
//   callback - forwarded to `elemAttr`
function metas(options, callback) {
  var done = _.cb(arguments);
  var settings = _.mcopy({ src: "", url: "", sel: "meta" }, _.notcb(options, {}));
  return elemAttr(settings, done);
}
// Builds a { key: content } map from the page's meta tags.
// The key of each tag is its `name` attribute, or its `property` attribute
// when `name` is missing; tags with neither are skipped. A tag without a
// `content` attribute maps to "".
//   options  - merged over the defaults { src, url }
//   callback - called with (err) or (null, map)
function meta(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "" }, _.notcb(options, {}));
  metas(options, function (err, results) {
    if (err) return callback(err);
    // elemAttr reports a lone match as a bare attribute object instead of a
    // one-item list; iterating that object would walk its string values and
    // lose the tag, so wrap it first
    var list = results && !Array.isArray(results) ? [results] : results;
    var ret = {};
    _.each(list, function (item) {
      var k = _.get(item, "name") || _.get(item, "property");
      if (!k) return;
      ret[k] = _.get(item, "content", "");
    });
    callback(null, ret);
  });
}
// Collects the attributes of the page's anchors and passes them through
// `cleanLinks`, using `_.url.home(options.url)` as the base for resolution.
//   options  - merged over the defaults { src, url, sel: "a" }
//   callback - called with (err) or (null, cleaned list)
function links(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "a" }, _.notcb(options, {}));
  var ourl = _.get(options, "url");
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    // elemAttr reports a lone match as a bare value instead of a one-item
    // list; wrap it so a page with exactly one anchor still yields that link
    var list = results && !Array.isArray(results) ? [results] : results;
    callback(null, cleanLinks(_.url.home(ourl), list));
  });
}
// Collects the attributes of the page's anchors and keeps, via
// `cleanJsLinks`, only those whose href carries a "javascript:" marker.
//   options  - merged over the defaults { src, url, sel: "a" }
//   callback - called with (err) or (null, filtered list)
function jslinks(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "a" }, _.notcb(options, {}));
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    // elemAttr reports a lone match as a bare value instead of a one-item
    // list; wrap it so a page with exactly one anchor is still examined
    var list = results && !Array.isArray(results) ? [results] : results;
    callback(null, cleanJsLinks(list));
  });
}
// Collects the attributes of the page's <img> elements and passes them
// through `cleanImages`, using `_.url.home(options.url)` as the base for
// resolution.
//   options  - merged over the defaults { src, url, sel: "img" }
//   callback - called with (err) or (null, cleaned list)
function images(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "img" }, _.notcb(options, {}));
  var ourl = _.get(options, "url");
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    // elemAttr reports a lone match as a bare value instead of a one-item
    // list; wrap it so a page with exactly one image still yields that image
    var list = results && !Array.isArray(results) ? [results] : results;
    callback(null, cleanImages(_.url.home(ourl), list));
  });
}
// Collects the attributes of the page's <frame> elements and passes them
// through `cleanFrames`, using `_.url.home(options.url)` as the base.
//   options  - merged over the defaults { src, url, sel: "frame" }
//   callback - called with (err) or (null, frame list); when nothing matched
//              the value is whatever `cleanFrames` makes of `null`
function frames(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "frame" }, _.notcb(options, {}));
  var ourl = _.get(options, "url");
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    // elemAttr reports a lone match as a bare value instead of a one-item
    // list; wrap it so one frame and several frames share the same shape
    var list = results && !Array.isArray(results) ? [results] : results;
    callback(null, cleanFrames(_.url.home(ourl), list));
  });
}
// helpers
// Filters and normalises a list of anchor attribute objects.
//   base - base location used to resolve hrefs that do not start with "http"
//   arr  - collection of attribute objects, each expected to carry `href`
// Returns a new array holding the kept items. Dropped: items with an empty
// or non-string href, hrefs containing "javascript:", and repeats of an href
// already seen (compared on the raw value, before resolution).
// Kept items are updated in place: `href` is resolved with `_.url.resolve`
// when needed and then passed through `_.url.hostpath` (both helpers are
// defined outside this file).
function cleanLinks(base, arr) {
  var ret = [],
    // prototype-less dictionary, so any href text is a safe key
    seen = Object.create(null);
  _.each(arr, function (item) {
    var href = _.get(item, "href", "");
    // type check comes first: calling indexOf on a non-string would throw
    if (!href || !_.isString(href)) return;
    if (href.indexOf("javascript:") > -1 || seen[href]) return;
    seen[href] = true;
    // only a leading "http" marks an absolute link (the same test `get`
    // applies); an href that merely contains "http" further in, such as
    // "/go?to=http://...", still has to be resolved against the base
    item.href = href.indexOf("http") === 0 ? href : _.url.resolve(base, href);
    item.href = _.url.hostpath(item.href);
    ret.push(item);
  });
  return ret;
}
// Keeps only the items whose `href` contains "javascript:".
//   arr - collection of attribute objects
// Returns a new array with the matching items, untouched and in their
// original order; items without an href are left out.
function cleanJsLinks(arr) {
  return _.filter(arr, function (item) {
    var href = _.get(item, "href", "");
    return Boolean(href) && href.indexOf("javascript:") > -1;
  });
}
// Filters and normalises a list of image attribute objects.
//   base - base location used to resolve sources that do not start with "http"
//   arr  - collection of attribute objects, each expected to carry `src`
// Returns a new array holding the kept items. Dropped: items with an empty
// or non-string src, and repeats of a src already seen (compared on the raw
// value, before resolution).
// Kept items are updated in place: `src` is resolved with `_.url.resolve`
// when needed and then passed through `_.url.hostpath` (both helpers are
// defined outside this file).
function cleanImages(base, arr) {
  var ret = [],
    // prototype-less dictionary, so any src text is a safe key
    seen = Object.create(null);
  _.each(arr, function (item) {
    var src = _.get(item, "src", "");
    // a non-string src would throw on indexOf below, so it is skipped
    if (!src || !_.isString(src) || seen[src]) return;
    seen[src] = true;
    // only a leading "http" marks an absolute source (the same test `get`
    // applies); a path that merely contains "http", such as
    // "/img/http-logo.png", still has to be resolved against the base
    item.src = src.indexOf("http") === 0 ? src : _.url.resolve(base, src);
    item.src = _.url.hostpath(item.src);
    ret.push(item);
  });
  return ret;
}
// Pass-through counterpart of cleanLinks / cleanImages for frames.
//   base - accepted for signature parity with the other cleaners; unused
//   arr  - frame attribute collection
// Returns `arr` itself: no filtering, de-duplication or source resolution is
// applied to frames.
function cleanFrames(base, arr) {
  var untouched = arr;
  return untouched;
}
// Loads a document and hands back either a selection or the document itself.
//   options  - merged over the defaults { src, url, sel }
//   callback - called with (err), or with (null, $(sel)) when `sel` is set,
//              or with (null, $) when it is empty
function elem(options, callback) {
  var done = _.cb(arguments);
  var settings = _.mcopy({ src: "", url: "", sel: "" }, _.notcb(options, {}));
  get(settings, function (err, $) {
    if (err) return done(err);
    var sel = _.get(settings, "sel");
    if (sel) {
      done(null, $(sel));
    } else {
      done(null, $);
    }
  });
}
// Loads a document and reads attributes from the elements matching `sel`.
//   options  - merged over the defaults { src, url, sel, attr }
//   callback - called with (err) or (null, result)
// Shape of `result`:
//   no match, or no selector -> null
//   exactly one match        -> that element's value (not wrapped in a list)
//   several matches          -> array with one value per element
// A "value" is the named attribute when `attr` is set, otherwise whatever
// `.attr()` called without arguments returns for the element.
function elemAttr(options, callback) {
  var done = _.cb(arguments);
  var settings = _.mcopy({ src: "", url: "", sel: "", attr: "" }, _.notcb(options, {}));
  get(settings, function (err, $) {
    if (err) return done(err);
    var sel = _.get(settings, "sel");
    var attr = _.get(settings, "attr");
    var $matches = sel ? $(sel) : [];
    // single place deciding between one named attribute and all attributes
    var read = function ($node) {
      return attr ? $node.attr(attr) : $node.attr();
    };
    var result = null;
    if ($matches.length === 1) {
      result = read($matches);
    } else if ($matches.length > 1) {
      result = $matches
        .map(function (index, node) {
          return read($(node));
        })
        .get();
    }
    done(null, result);
  });
}
// Loads a document and renders the markup for `sel` via `$.html(sel)`.
//   options  - merged over the defaults { src, url, sel }
//   callback - called with (err) or (null, markup); the markup is "" when no
//              selector was given
function elemHtml(options, callback) {
  var done = _.cb(arguments);
  var settings = _.mcopy({ src: "", url: "", sel: "" }, _.notcb(options, {}));
  get(settings, function (err, $) {
    if (err) return done(err);
    var sel = _.get(settings, "sel");
    var markup = "";
    if (sel) {
      markup = $.html(sel);
    }
    done(null, markup);
  });
}
// Returns an array with one text value per selected node.
//   $      - cheerio document function used to wrap nodes and run selectors
//   sel    - a selector, or an existing cheerio selection (detected with
//            `isElem`), naming the nodes to walk
//   filter - optional selector; when given, each node contributes the text of
//            its `find(filter)` result instead of its own text
function mapTexts($, sel, filter) {
  var $selection = isElem(sel) ? sel : $(sel);
  return $selection
    .map(function (index, node) {
      var $node = $(node);
      if (filter) {
        $node = $node.find(filter);
      }
      return $node.text();
    })
    .get();
}