UNPKG

ninjs-html

Version:

Web crawling engine using cheerio for HTML/DOM manipulation and templating.

281 lines (231 loc) 6.89 kB
/**
 * Ninjs Html (cheerio wrapper)
 *
 * Re-exports cheerio and adds callback-style helpers for loading documents
 * (from a URL, a file or a string) and for extracting page metadata, links,
 * images and frames from them.
 *
 * Every asynchronous helper reports through the `_.fail(err, callback)` /
 * `_.done(result, callback)` helpers of ninjs-lodash; the consumers in this
 * file receive the outcome as `(err, result)`.
 */
'use strict'

const _ = require('ninjs-lodash')
const request = require('ninjs-request')
const cheerio = require('cheerio')

// Keep a handle on cheerio's own `load`: the export list below replaces it
// with this module's callback-style `load`.
const _load = cheerio.load

exports = module.exports = cheerio

_.assign(exports, {
  isElem,
  load,
  loadFile,
  loadUrl,
  loadString,
  page,
  metas,
  meta,
  links,
  jslinks,
  images,
  frames,
  cleanLinks,
  cleanJsLinks,
  cleanImages,
  cleanFrames,
  elem,
  elemAttr,
  elemHtml,
  mapTexts
})

// ---------------------------------------------------------------------------
// private helpers
// ---------------------------------------------------------------------------

/**
 * True when `value` is a string that starts with an http(s) scheme.
 * Shared by `load` and the `clean*` helpers so they agree on what counts as
 * an absolute URL (a prefix check, not "contains http somewhere").
 *
 * @param {*} value
 * @returns {boolean}
 */
function isHttpUrl(value) {
  return _.isString(value) && /^https?:\/\//i.test(value)
}

/**
 * Normalizes an `elemAttr` result to an array.
 * `elemAttr` yields `null` for no match, a bare value for exactly one match
 * and an array for several; consumers want to iterate all three uniformly.
 *
 * @param {*} value
 * @returns {Array}
 */
function toList(value) {
  if (value == null) return []
  return Array.isArray(value) ? value : [value]
}

/**
 * Resolves the cheerio root for an options object by loading `options.src`,
 * falling back to `options.url`.
 *
 * NOTE(review): the original file called `get(...)` without defining it
 * anywhere, so these semantics are this revision's best reading of how
 * `src` / `url` are used elsewhere in the module -- confirm with the author.
 *
 * @param {{src?: string, url?: string}} options
 * @param {Function} callback receives `(err, $)`
 */
function get(options, callback) {
  return load(_.get(options, 'src') || _.get(options, 'url'), callback)
}

/**
 * Synchronous core of `elemAttr`: reads attributes of the nodes matching
 * `sel` in an already loaded document.
 *
 * @param {Function} $ cheerio root
 * @param {string} sel selector; a falsy selector matches nothing
 * @param {string} [attr] attribute name; when omitted the whole attribute
 *   object of each node is read
 * @returns {*} `null` for no match, a single value for exactly one match,
 *   an array of values for several matches
 */
function selectAttrs($, sel, attr) {
  if (!sel) return null
  const $elems = $(sel)
  const read = ($el) => (attr ? $el.attr(attr) : $el.attr())
  if ($elems.length === 1) return read($elems)
  if ($elems.length > 1) return $elems.map((i, el) => read($(el))).get()
  return null
}

/**
 * Builds a `{ key: content }` map from meta-tag attribute objects, keyed by
 * each tag's `name` or, failing that, its `property`. Tags with neither are
 * skipped.
 *
 * @param {*} results an `elemAttr` result (null, one object or an array)
 * @returns {Object}
 */
function toMetaMap(results) {
  const ret = {}
  for (const item of toList(results)) {
    const key = _.get(item, 'name') || _.get(item, 'property')
    if (!key) continue
    ret[key] = _.get(item, 'content', '')
  }
  return ret
}

// ---------------------------------------------------------------------------
// loading
// ---------------------------------------------------------------------------

/**
 * Is `val` a cheerio element (an instance of cheerio)?
 *
 * @param {*} val
 * @returns {boolean}
 */
function isElem(val) {
  return Boolean(val) && val instanceof cheerio
}

/**
 * Loads a cheerio document from a URL or from a source file.
 * Sources starting with `http://` or `https://` go through `loadUrl`,
 * anything else is handed to `loadFile`.
 *
 * @param {string} src URL or file path
 * @param {Function} callback receives `(err, $)`
 */
function load(src, callback) {
  if (!src || !_.isString(src)) return _.fail('Invalid cheerio src', callback)
  return isHttpUrl(src) ? loadUrl(src, callback) : loadFile(src, callback)
}

/**
 * Loads a cheerio document from a file: reads it with `_.readFile` and
 * parses the content with `loadString`.
 *
 * @param {string} src file path
 * @param {Function} callback receives `(err, $)`
 */
function loadFile(src, callback) {
  _.async.waterfall([
    _.async.constant(src),
    _.readFile,
    loadString
  ], callback)
}

/**
 * Loads a cheerio document from a requested URL.
 *
 * @param {string} src URL to request
 * @param {Function} callback receives `(err, $)`
 */
function loadUrl(src, callback) {
  _.async.waterfall([
    _.async.constant(src),
    request.get,
    // Keep only the response body for the parser. Assumes `request.get`
    // calls back with (err, res, body), as the original step did.
    // The original referenced a bare `async`, which is not in scope here.
    _.async.asyncify((res, body) => body),
    loadString
  ], callback)
}

/**
 * Parses an html string and yields the `$`-wrapped root of the document.
 *
 * @param {string} str html source
 * @param {Function} callback receives `(err, $)`
 */
function loadString(str, callback) {
  // `_.attempt` returns the thrown error instead of throwing.
  const $ = _.attempt(_load, str)
  if (!$) return _.fail(`\nError: Html string could not be parsed\n`, callback)
  if (_.isError($)) return _.fail($, callback)
  return _.done($, callback)
}

// ---------------------------------------------------------------------------
// page summary
// ---------------------------------------------------------------------------

/**
 * Loads a page once and summarizes it.
 *
 * The original implementation passed an object to `load` (which only accepts
 * strings) and referenced several undefined names, so it could never
 * succeed. This version loads `src` a single time and derives every section
 * from that one document.
 *
 * @param {Object} options
 * @param {string} [options.src=''] URL or file path to load
 * @param {string} [options.url=''] page URL used to resolve relative links;
 *   falls back to `src` when omitted
 * @param {string} [options.dest] accepted for compatibility with existing
 *   callers; nothing in this module reads it
 * @param {Function} callback receives `(err, summary)` where summary is
 *   `{ title, url, meta, links, jslinks, images, frames }`
 */
function page({ src = '', url = '' } = {}, callback) {
  load(src, function (err, $) {
    if (err) return _.fail(err, callback)

    const pageUrl = url || src
    const base = _.url.home(pageUrl)

    _.done({
      title: $('head title').first().text(),
      url: pageUrl,
      meta: toMetaMap(selectAttrs($, 'meta')),
      links: cleanLinks(base, selectAttrs($, 'a')),
      jslinks: cleanJsLinks(selectAttrs($, 'a')),
      images: cleanImages(base, selectAttrs($, 'img')),
      frames: cleanFrames(base, selectAttrs($, 'frame, iframe'))
    }, callback)
  })
}

// ---------------------------------------------------------------------------
// extractors
//
// Each extractor normalizes its arguments the way `images` and `elemAttr`
// always did: `_.cb(arguments)` picks the callback, `_.notcb(options, {})`
// the options, and `_.mcopy` lays the options over the defaults
// (presumably later sources win -- helper semantics live in ninjs-lodash).
// ---------------------------------------------------------------------------

/**
 * Reads the attributes of the document's meta tags.
 *
 * @param {Object} [options] `{ src, url, sel = 'meta', attr }`
 * @param {Function} callback receives `(err, result)` in `elemAttr`'s shape
 */
function metas(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'meta' }, _.notcb(options, {}))
  return elemAttr(options, callback)
}

/**
 * Reads the document's meta tags as a `{ name|property: content }` map.
 *
 * @param {Object} [options] `{ src, url }`
 * @param {Function} callback receives `(err, map)`
 */
function meta(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '' }, _.notcb(options, {}))
  metas(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(toMetaMap(results), callback)
  })
}

/**
 * Collects the document's links, cleaned by `cleanLinks` against the home
 * of `options.url`.
 *
 * @param {Object} [options] `{ src, url, sel = 'a' }`
 * @param {Function} callback receives `(err, links)`
 */
function links(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'a' }, _.notcb(options, {}))
  const pageUrl = _.get(options, 'url')
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanLinks(_.url.home(pageUrl), results), callback)
  })
}

/**
 * Collects the document's `javascript:` links.
 *
 * @param {Object} [options] `{ src, url, sel = 'a' }`
 * @param {Function} callback receives `(err, links)`
 */
function jslinks(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'a' }, _.notcb(options, {}))
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanJsLinks(results), callback)
  })
}

/**
 * Collects the document's images, cleaned by `cleanImages` against the home
 * of `options.url`.
 *
 * @param {Object} [options] `{ src, url, sel = 'img' }`
 * @param {Function} callback receives `(err, images)`
 */
function images(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'img' }, _.notcb(options, {}))
  const pageUrl = _.get(options, 'url')
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanImages(_.url.home(pageUrl), results), callback)
  })
}

/**
 * Collects the document's frames.
 *
 * @param {Object} [options] `{ src, url, sel = 'frame, iframe' }`
 * @param {Function} callback receives `(err, frames)`
 */
function frames(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'frame, iframe' }, _.notcb(options, {}))
  const pageUrl = _.get(options, 'url')
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanFrames(_.url.home(pageUrl), results), callback)
  })
}

// ---------------------------------------------------------------------------
// cleaners
// ---------------------------------------------------------------------------

/**
 * Filters and normalizes link attribute objects.
 * Drops entries without a string `href`, `javascript:` links and repeated
 * hrefs; resolves the remaining hrefs against `base` unless they already
 * start with http(s), then passes them through `_.url.hostpath`.
 * Returns copies, the input objects are left untouched.
 *
 * @param {string} base URL used to resolve relative hrefs
 * @param {*} arr an `elemAttr` result (null, one object or an array)
 * @returns {Object[]}
 */
function cleanLinks(base, arr) {
  const seen = new Set()
  const ret = []
  for (const item of toList(arr)) {
    const href = _.get(item, 'href', '')
    if (!href || !_.isString(href)) continue
    if (href.includes('javascript:') || seen.has(href)) continue
    seen.add(href)
    const absolute = isHttpUrl(href) ? href : _.url.resolve(base, href)
    ret.push(_.assign({}, item, { href: _.url.hostpath(absolute) }))
  }
  return ret
}

/**
 * Keeps only the entries whose `href` contains `javascript:`.
 *
 * @param {*} arr an `elemAttr` result (null, one object or an array)
 * @returns {Object[]}
 */
function cleanJsLinks(arr) {
  return toList(arr).filter(function (item) {
    const href = _.get(item, 'href', '')
    return _.isString(href) && href.includes('javascript:')
  })
}

/**
 * Filters and normalizes image attribute objects.
 * Drops entries without a string `src` and repeated sources; resolves the
 * remaining sources against `base` unless they already start with http(s),
 * then passes them through `_.url.hostpath`.
 * Returns copies, the input objects are left untouched.
 *
 * @param {string} base URL used to resolve relative sources
 * @param {*} arr an `elemAttr` result (null, one object or an array)
 * @returns {Object[]}
 */
function cleanImages(base, arr) {
  const seen = new Set()
  const ret = []
  for (const item of toList(arr)) {
    const src = _.get(item, 'src', '')
    if (!src || !_.isString(src) || seen.has(src)) continue
    seen.add(src)
    const absolute = isHttpUrl(src) ? src : _.url.resolve(base, src)
    ret.push(_.assign({}, item, { src: _.url.hostpath(absolute) }))
  }
  return ret
}

/**
 * Returns the frame attribute objects as an array, otherwise unchanged.
 * `base` is accepted for symmetry with the other cleaners but is not used:
 * frame sources are not resolved or de-duplicated.
 *
 * @param {string} base unused
 * @param {*} arr an `elemAttr` result (null, one object or an array)
 * @returns {Array}
 */
function cleanFrames(base, arr) {
  return toList(arr)
}

// ---------------------------------------------------------------------------
// element access
// ---------------------------------------------------------------------------

/**
 * Loads the document described by `options` and yields the selection for
 * `options.sel`, or the document root `$` when no selector is given.
 *
 * @param {Object} options `{ src, url, sel }`
 * @param {Function} callback receives `(err, selectionOrRoot)`
 */
function elem(options, callback) {
  get(options, function (err, $) {
    if (err) return _.fail(err, callback)
    const sel = _.get(options, 'sel')
    _.done(sel ? $(sel) : $, callback)
  })
}

/**
 * Loads the document described by `options` and reads attributes of the
 * nodes matching `options.sel`.
 *
 * @param {Object} [options] `{ src, url, sel, attr }`; without `attr` the
 *   whole attribute object of each node is read
 * @param {Function} callback receives `(err, result)`: `null` for no match,
 *   a single value for exactly one match, an array for several matches
 */
function elemAttr(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: '', attr: '' }, _.notcb(options, {}))
  get(options, function (err, $) {
    if (err) return _.fail(err, callback)
    _.done(selectAttrs($, _.get(options, 'sel'), _.get(options, 'attr')), callback)
  })
}

/**
 * Loads the document described by `options` and renders the nodes matching
 * `options.sel` with `$.html`; yields `''` when no selector is given.
 *
 * @param {Object} options `{ src, url, sel }`
 * @param {Function} callback receives `(err, html)`
 */
function elemHtml(options, callback) {
  get(options, function (err, $) {
    if (err) return _.fail(err, callback)
    const sel = _.get(options, 'sel')
    _.done(sel ? $.html(sel) : '', callback)
  })
}

/**
 * Maps the selected nodes to their text values.
 *
 * @param {Function} $ cheerio root
 * @param {*} sel selector, or an existing cheerio selection
 * @param {string} [filter] when given, the text of the descendants matching
 *   this selector is used instead of the node's own text
 * @returns {string[]}
 */
function mapTexts($, sel, filter) {
  const $elems = isElem(sel) ? sel : $(sel)
  return $elems
    .map((i, el) => (filter ? $(el).find(filter).text() : $(el).text()))
    .get()
}