UNPKG

ninjs-html

Version:

web crawling engine using cheerio for html/dom manipulation and templating

350 lines (271 loc) 8.08 kB
/**
 * NINJS-CORE Dom - Document Object Model
 * cheerio wrapper / html dom helpers
 *
 * NOTE(review): `_.readFile`, `_.fail`, `_.done`, `_.cb`, `_.notcb`, `_.mcopy`
 * and `_.url.*` are not part of stock lodash. This module presumably relies on
 * them having been mixed into lodash elsewhere -- confirm against package setup.
 */
var _ = require("lodash"),
  async = require("async"),
  request = require("request"),
  cheerio = require("cheerio");

// Matches values that already are absolute http(s) urls.
var ABSOLUTE_HTTP_RE = /^https?:\/\//i,
  // Matches `javascript:` pseudo-urls (leading whitespace and any letter case).
  JS_HREF_RE = /^\s*javascript:/i;

exports = module.exports = _.assign({}, cheerio, {
  isElem: isElem,

  xmlFile: xmlFile,
  file: file,
  get: get,

  page: page,
  metas: metas,
  meta: meta,
  links: links,
  jslinks: jslinks,
  images: images,
  frames: frames,

  cleanLinks: cleanLinks,
  cleanJsLinks: cleanJsLinks,
  cleanImages: cleanImages,
  cleanFrames: cleanFrames,

  elem: elem,
  elemAttr: elemAttr,
  elemHtml: elemHtml,
  mapTexts: mapTexts
});

// ---------------------------------------------------------------------------
// private helpers
// ---------------------------------------------------------------------------

// True when `href` is a string holding a `javascript:` pseudo-url.
function isJsHref(href) {
  return _.isString(href) && JS_HREF_RE.test(href);
}

// True when `value` is already an absolute http(s) url.
function isAbsoluteHttp(value) {
  return ABSOLUTE_HTTP_RE.test(value);
}

// Normalises an elemAttr() result to an array. elemAttr() yields null for no
// match, a bare value for exactly one match and an array for several matches;
// consumers that iterate need a uniform list.
function attrList(results) {
  if (results === null || results === undefined) return [];
  return _.isArray(results) ? results : [results];
}

// Parses `text` with cheerio and reports the outcome through `callback`.
// Only the parse is guarded by try/catch: the success callback runs outside
// of it so an exception thrown by the callback cannot trigger a second,
// failure-path invocation of the same callback.
function parse(text, loadOptions, callback) {
  var $;
  try {
    $ = cheerio.load(text, loadOptions);
  } catch (e) {
    return _.fail(e, callback);
  }
  _.done($, callback);
}

// ---------------------------------------------------------------------------
// loading
// ---------------------------------------------------------------------------

/**
 * Is `val` a cheerio object?
 * @param {*} val - value to test
 * @returns {boolean} true when `val` is truthy and an instance of cheerio
 */
function isElem(val) {
  return val ? val instanceof cheerio : false;
}

/**
 * Loads a cheerio dom from the xml file at `src` (parsed with xmlMode).
 * @param {string} src - file path handed to `_.readFile`
 * @param {Function} callback - receives the outcome via `_.done` / `_.fail`
 */
function xmlFile(src, callback) {
  _.readFile(src, function (err, result) {
    if (err) return _.fail(err, callback);
    parse(result, { xmlMode: true }, callback);
  });
}

/**
 * Loads a cheerio dom from the html file at `src`.
 * @param {string} src - file path handed to `_.readFile`
 * @param {Function} callback - receives the outcome via `_.done` / `_.fail`
 */
function file(src, callback) {
  _.readFile(src, function (err, result) {
    if (err) return _.fail(err, callback);
    parse(result, undefined, callback);
  });
}

/**
 * Loads a cheerio dom from the body returned by a GET request to `ourl`.
 * Not exported; reached through get().
 * @param {string} ourl - url to request
 * @param {Function} callback - receives the outcome via `_.done` / `_.fail`
 */
function url(ourl, callback) {
  request.get(ourl, function (err, res, body) {
    if (err) return _.fail(err, callback);
    parse(body, undefined, callback);
  });
}

/**
 * Loads a cheerio dom from `options.src` or, failing that, `options.url`.
 * Whichever value is used is requested over the network when it starts with
 * "http" and read as a local file otherwise.
 * @param {Object} options - may carry string `src` and/or `url`
 * @param {Function} callback - receives the outcome via `_.done` / `_.fail`
 */
function get(options, callback) {
  var src = _.get(options, "src"),
    ourl = _.get(options, "url");

  if (src && _.isString(src)) {
    return src.indexOf("http") === 0 ? url(src, callback) : file(src, callback);
  }

  if (ourl && _.isString(ourl)) {
    return ourl.indexOf("http") === 0 ? url(ourl, callback) : file(ourl, callback);
  }

  return _.fail("No src or url provided", callback);
}

// ---------------------------------------------------------------------------
// page level extraction
// ---------------------------------------------------------------------------

/**
 * Builds a summary of a page: title, url, meta, links, jslinks, images, frames.
 * NOTE(review): the document is loaded once here for the title and then again
 * by each of the five extractors run in parallel; sharing a single load would
 * avoid the repeated reads/requests.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, summary)`
 */
function page(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", dest: "" }, _.notcb(options, {}));

  var ourl = _.get(options, "url");

  get(options, function (err, $) {
    if (err) return callback(err);

    var ret = {
        title: "",
        url: "",
        meta: {},
        links: [],
        jslinks: [],
        images: [],
        frames: []
      },
      title = $("head title").first().text();

    async.parallel({
      meta: async.apply(meta, options),
      links: async.apply(links, options),
      jslinks: async.apply(jslinks, options),
      images: async.apply(images, options),
      frames: async.apply(frames, options)
    }, function (err, result) {
      if (err) return callback(err);
      ret = _.extend(ret, result, { title: title, url: ourl });
      callback(null, ret);
    });
  });
}

/**
 * Attributes of the document's `meta` elements, in elemAttr()'s result shape.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, result)`
 */
function metas(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "meta" }, _.notcb(options, {}));
  return elemAttr(options, callback);
}

/**
 * Meta tags as a map keyed by each tag's `name` (or `property` when there is
 * no name), valued by its `content` ("" when absent). Tags with neither key
 * are skipped.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, map)`
 */
function meta(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "" }, _.notcb(options, {}));
  metas(options, function (err, results) {
    if (err) return callback(err);
    var ret = {};
    // attrList(): a document with exactly one meta tag yields a bare object
    _.each(attrList(results), function (item) {
      var k = _.get(item, "name") || _.get(item, "property");
      if (!k) return;
      ret[k] = _.get(item, "content", "");
    });
    callback(null, ret);
  });
}

/**
 * Attribute maps of the document's `a` elements, passed through cleanLinks()
 * with `_.url.home(options.url)` as the base.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, links)`
 */
function links(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "a" }, _.notcb(options, {}));
  var ourl = _.get(options, "url");
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    callback(null, cleanLinks(_.url.home(ourl), attrList(results)));
  });
}

/**
 * Attribute maps of the document's `a` elements whose href is a
 * `javascript:` pseudo-url.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, links)`
 */
function jslinks(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "a" }, _.notcb(options, {}));
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    callback(null, cleanJsLinks(attrList(results)));
  });
}

/**
 * Attribute maps of the document's `img` elements, passed through
 * cleanImages() with `_.url.home(options.url)` as the base.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, images)`
 */
function images(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "img" }, _.notcb(options, {}));
  var ourl = _.get(options, "url");
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    callback(null, cleanImages(_.url.home(ourl), attrList(results)));
  });
}

/**
 * Attribute maps of the document's `frame` elements, as an array.
 * @param {Object} options - `src` / `url` of the document
 * @param {Function} callback - node style `(err, frames)`
 */
function frames(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "frame" }, _.notcb(options, {}));
  var ourl = _.get(options, "url");
  elemAttr(options, function (err, results) {
    if (err) return callback(err);
    callback(null, cleanFrames(_.url.home(ourl), attrList(results)));
  });
}

// ---------------------------------------------------------------------------
// helpers
// ---------------------------------------------------------------------------

/**
 * Filters and normalises link attribute maps. Drops entries whose href is
 * empty, not a string, a `javascript:` pseudo-url, or a repeat of an earlier
 * raw href. Remaining hrefs are resolved against `base` unless already
 * absolute http(s), then passed through `_.url.hostpath`.
 * Kept items are mutated in place (their `href` is overwritten).
 * @param {string} base - base url for relative hrefs
 * @param {Array<Object>} arr - attribute maps
 * @returns {Array<Object>} the kept items, in input order
 */
function cleanLinks(base, arr) {
  var ret = [],
    hrefs = [];
  _.each(arr, function (item) {
    var href = _.get(item, "href", "");
    // type check first: a non-string href has no string methods to call
    if (!href || !_.isString(href) || isJsHref(href) || hrefs.indexOf(href) > -1) return;
    hrefs.push(href);
    item.href = _.url.hostpath(isAbsoluteHttp(href) ? href : _.url.resolve(base, href));
    ret.push(item);
  });
  return ret;
}

/**
 * Keeps only the attribute maps whose href is a `javascript:` pseudo-url.
 * @param {Array<Object>} arr - attribute maps
 * @returns {Array<Object>} the matching items, in input order
 */
function cleanJsLinks(arr) {
  var ret = [];
  _.each(arr, function (item) {
    if (isJsHref(_.get(item, "href", ""))) ret.push(item);
  });
  return ret;
}

/**
 * Filters and normalises image attribute maps. Drops entries whose src is
 * empty, not a string, or a repeat of an earlier raw src. Remaining srcs are
 * resolved against `base` unless already absolute http(s), then passed
 * through `_.url.hostpath`. Kept items are mutated in place.
 * @param {string} base - base url for relative srcs
 * @param {Array<Object>} arr - attribute maps
 * @returns {Array<Object>} the kept items, in input order
 */
function cleanImages(base, arr) {
  var ret = [],
    srcs = [];
  _.each(arr, function (item) {
    var src = _.get(item, "src", "");
    if (!src || !_.isString(src) || srcs.indexOf(src) > -1) return;
    srcs.push(src);
    item.src = _.url.hostpath(isAbsoluteHttp(src) ? src : _.url.resolve(base, src));
    ret.push(item);
  });
  return ret;
}

/**
 * Returns `arr` untouched; frame src normalisation is currently disabled.
 * NOTE(review): the previously commented-out implementation mirrored
 * cleanImages() but resolved against `url` (the loader function) instead of
 * `base`; use `base` if it is ever reinstated.
 * @param {string} base - unused
 * @param {*} arr - value to hand back
 * @returns {*} `arr`
 */
function cleanFrames(base, arr) {
  return arr;
}

/**
 * Loads the document and yields `$(options.sel)`, or `$` itself when no
 * selector is given.
 * @param {Object} options - `src` / `url` of the document and optional `sel`
 * @param {Function} callback - node style `(err, result)`
 */
function elem(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "" }, _.notcb(options, {}));
  get(options, function (err, $) {
    if (err) return callback(err);
    var sel = _.get(options, "sel"),
      result = sel ? $(sel) : $;
    callback(null, result);
  });
}

/**
 * Loads the document and yields attributes of the elements matching
 * `options.sel`. With `options.attr` set each element contributes that one
 * attribute's value, otherwise the result of `.attr()` with no arguments.
 * Result shape: null when nothing matched (or no selector was given), a
 * single value for exactly one match, an array for several matches.
 * @param {Object} options - `src` / `url`, `sel` and optional `attr`
 * @param {Function} callback - node style `(err, result)`
 */
function elemAttr(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "", attr: "" }, _.notcb(options, {}));
  get(options, function (err, $) {
    if (err) return callback(err);
    var sel = _.get(options, "sel"),
      attr = _.get(options, "attr"),
      $elems = sel ? $(sel) : [],
      result = null;

    if ($elems.length === 1) {
      result = attr ? $elems.attr(attr) : $elems.attr();
    } else if ($elems.length > 1) {
      result = $elems.map(function () {
        var $elem = $(this);
        return attr ? $elem.attr(attr) : $elem.attr();
      }).get();
    }

    callback(null, result);
  });
}

/**
 * Loads the document and yields `$.html(options.sel)`, or "" when no
 * selector is given.
 * @param {Object} options - `src` / `url` of the document and `sel`
 * @param {Function} callback - node style `(err, html)`
 */
function elemHtml(options, callback) {
  callback = _.cb(arguments);
  options = _.mcopy({ src: "", url: "", sel: "" }, _.notcb(options, {}));
  get(options, function (err, $) {
    if (err) return callback(err);
    var sel = _.get(options, "sel"),
      result = sel ? $.html(sel) : "";
    callback(null, result);
  });
}

/**
 * Collects text values of the selected nodes.
 * @param {Function} $ - loaded cheerio dom
 * @param {string|Object} sel - selector, or an existing cheerio selection
 * @param {string} [filter] - when given, text is taken from the descendants
 *   of each node that match this selector instead of the node itself
 * @returns {Array<string>} one text value per selected node
 */
function mapTexts($, sel, filter) {
  var $elems = isElem(sel) ? sel : $(sel),
    ware = function () {
      return filter ? $(this).find(filter).text() : $(this).text();
    };
  return $elems.map(ware).get();
}