ninjs-html
Version:
Web crawling engine that uses cheerio for HTML/DOM manipulation and templating.
281 lines (231 loc) • 6.89 kB
JavaScript
/**
* Ninjs Html (cheerio wrapper)
*/
const _ = require('ninjs-lodash')
const request = require('ninjs-request')
const cheerio = require('cheerio')
const _load = cheerio.load
// The module's public surface is the cheerio export itself with this file's
// helpers copied onto it via `_.assign`. Because a `load` key is among them,
// the exported object's `load` becomes the callback-style `load` defined in
// this file; the parser captured earlier as `_load` is what loadString uses.
exports = module.exports = cheerio
_.assign(exports, {
  isElem,
  load,
  loadFile,
  loadUrl,
  loadString,
  page,
  metas,
  meta,
  links,
  jslinks,
  images,
  frames,
  cleanLinks,
  cleanJsLinks,
  cleanImages,
  cleanFrames,
  elem,
  elemAttr,
  elemHtml,
  mapTexts
})
// Tells whether `val` is an instance of cheerio.
// Falsy values (null, undefined, 0, '') are rejected before the instanceof test.
function isElem(val) {
  if (!val) return false
  return val instanceof cheerio
}
// Load a cheerio document from either a remote url or a local file.
//   src      - non-empty string; anything else fails with 'Invalid cheerio src'
//   callback - forwarded to loadUrl / loadFile (or handed to _.fail on bad input)
// A source counts as remote only when it starts with an explicit http:// or
// https:// scheme (case-insensitive). A bare "starts with http" check would
// also send local paths such as "httpdocs/index.html" to the network loader.
function load(src, callback) {
  if (!src || !_.isString(src)) return _.fail('Invalid cheerio src', callback)
  const isRemote = /^https?:\/\//i.test(src)
  return isRemote ? loadUrl(src, callback) : loadFile(src, callback)
}
// Load a cheerio document from a file.
// Runs three waterfall steps: seed the chain with `src`, pass it to
// `_.readFile` (presumably yields the file contents - confirm against
// ninjs-lodash), then parse whatever that step produced with loadString.
// `callback` receives the waterfall's final (err, $).
function loadFile(src, callback) {
  const steps = [_.async.constant(src), _.readFile, loadString]
  _.async.waterfall(steps, callback)
}
// Load a cheerio document from a requested url.
//   src      - url handed to request.get
//   callback - receives the waterfall's final (err, $)
// Steps: seed with `src`, fetch it, keep only the body, parse it with loadString.
// The body-picking step goes through `_.async.asyncify`, the same `_.async`
// namespace the other steps use; there is no standalone `async` binding in
// this module, so a bare `async.asyncify` throws a ReferenceError.
// Assumes request.get calls back with (err, response, body) - TODO confirm
// against ninjs-request.
function loadUrl(src, callback) {
  _.async.waterfall([
    _.async.constant(src),
    request.get,
    _.async.asyncify(function (res, body) { return body }),
    loadString
  ], callback)
}
// Parse an html string with the parser captured as `_load` and report the
// resulting `$` through `callback`.
// `_.attempt` is used so a throwing parser surfaces as a returned Error, which
// is forwarded to `_.fail`; a falsy parse result fails with a fixed message.
// Returns whatever `_.fail` / `_.done` return.
function loadString(str, callback) {
  const parsed = _.attempt(_load, str)
  if (_.isError(parsed)) return _.fail(parsed, callback)
  if (!parsed) return _.fail(`\nError: Html string could not be parsed\n`, callback)
  return _.done(parsed, callback)
}
// Summarise one page: its <title>, its url, and the meta / links / jslinks /
// images / frames collections, gathered in parallel.
//   { src, dest } - `src` is the file path or url to load; `dest` is only
//                   carried along in the options given to the collectors
//   callback      - receives (err, { meta, links, jslinks, images, frames, title, url })
// load() only accepts a string source, so `src` itself is passed rather than
// an options object. Parallel execution goes through `_.async`, the same
// namespace loadFile/loadUrl use.
// NOTE(review): `url` is set to `src` on the assumption that the source
// location doubles as the page url used to resolve relative links - confirm
// for file sources.
function page({ src='', dest='' }, callback) {
  load(src, function (err, $) {
    if (err) return _.fail(err, callback)
    const title = $('head title').first().text()
    // one options object shared by every collector below
    const options = { src: src, dest: dest, url: src }
    return _.async.parallel({
      meta: _.async.apply(meta, options),
      links: _.async.apply(links, options),
      jslinks: _.async.apply(jslinks, options),
      images: _.async.apply(images, options),
      frames: _.async.apply(frames, options)
    },
    function (err, result) {
      if (err) return _.fail(err, callback)
      _.done(_.mcopy(result, { title: title, url: src }), callback)
    })
  })
}
// Collect the attributes of the page's <meta> elements through elemAttr.
//   options  - { src, url, sel, attr }; `sel` defaults to 'meta'
//   callback - receives elemAttr's (err, result)
// Arguments are normalised the same way images() does (`_.cb` / `_.notcb`),
// so the default selector applies whenever the caller does not supply one.
// Without it elemAttr has nothing to select and reports null.
function metas(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'meta' }, _.notcb(options, {}))
  return elemAttr(options, callback)
}
// Build a { key: content } map from the page's <meta> elements.
//   options  - { src, url, sel }; `sel` defaults to 'meta'
//   callback - receives (err, map)
// Each entry is keyed by the element's `name`, falling back to `property`;
// elements with neither are skipped and a missing `content` becomes ''.
function meta(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'meta' }, _.notcb(options, {}))
  metas(options, function (err, results) {
    if (err) return _.fail(err, callback)
    // elemAttr reports one bare attribute object when exactly one element
    // matches and null when none do; normalise both to a list so a lone
    // <meta> is not iterated property-by-property.
    const items = Array.isArray(results) ? results : (results ? [results] : [])
    const ret = {}
    _.each(items, function (item) {
      const key = _.get(item, 'name') || _.get(item, 'property')
      if (!key) return
      ret[key] = _.get(item, 'content', '')
    })
    _.done(ret, callback)
  })
}
// Collect the page's anchors and clean them with cleanLinks.
//   options  - { src, url, sel }; `sel` defaults to 'a', and `url` is the page
//              url whose `_.url.home(...)` value is the base for relative hrefs
//   callback - receives (err, cleanedLinks)
// Arguments are normalised the same way images() does. The base is derived
// from the `url` option read below.
function links(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'a' }, _.notcb(options, {}))
  const url = _.get(options, 'url')
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanLinks(_.url.home(url), results), callback)
  })
}
// Collect the page's anchors and keep only those cleanJsLinks accepts
// (javascript: hrefs).
//   options  - { src, url, sel }; `sel` defaults to 'a'
//   callback - receives (err, jsLinks)
// Arguments are normalised the same way images() does, so a default selector
// is in place when the caller supplies none.
function jslinks(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'a' }, _.notcb(options, {}))
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanJsLinks(results), callback)
  })
}
// Collect the page's <img> attributes and clean them with cleanImages.
//   options  - { src, url, sel }; missing keys default to '', '' and 'img'
//   callback - receives (err, cleanedImages)
// `_.cb` / `_.notcb` sort out which argument is the callback and which is the
// options object before the defaults are merged in.
function images(options, callback) {
  const finish = _.cb(arguments)
  const settings = _.mcopy({ src: '', url: '', sel: 'img' }, _.notcb(options, {}))
  const pageUrl = _.get(settings, 'url')
  elemAttr(settings, (err, found) => {
    if (err) return _.fail(err, finish)
    _.done(cleanImages(_.url.home(pageUrl), found), finish)
  })
}
// Collect the page's frame attributes and pass them through cleanFrames.
//   options  - { src, url, sel }; `url` is the page url whose
//              `_.url.home(...)` value is handed to cleanFrames as the base
//   callback - receives (err, frames)
// Arguments are normalised the same way images() does.
// NOTE(review): the default selector 'frame, iframe' is an assumption about
// which elements count as frames - confirm.
function frames(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: 'frame, iframe' }, _.notcb(options, {}))
  const url = _.get(options, 'url')
  elemAttr(options, function (err, results) {
    if (err) return _.fail(err, callback)
    _.done(cleanFrames(_.url.home(url), results), callback)
  })
}
// helpers
// Normalise a list of anchor attribute objects.
//   base - url that relative hrefs are resolved against
//   arr  - array of attribute objects (each may carry `href`), or a single
//          such object, which is what elemAttr yields for exactly one match
// Returns the kept items in order. Each kept item's `href` is rewritten IN
// PLACE to `_.url.hostpath(absoluteHref)`.
// Dropped: entries whose href is empty or not a string, javascript: hrefs, and
// repeats of a raw href already seen.
function cleanLinks(base, arr) {
  const items = _.isString(_.get(arr, 'href')) ? [arr] : arr
  const seen = new Set()
  const ret = []
  _.each(items, function (item) {
    const href = _.get(item, 'href', '')
    // type check first: non-string hrefs have no string methods to call
    if (!href || !_.isString(href)) return
    // scheme tests are anchored so a url merely containing "javascript:" or
    // "http" (e.g. "/go?to=http://x") is not misclassified
    if (/^\s*javascript:/i.test(href) || seen.has(href)) return
    seen.add(href)
    item.href = /^https?:\/\//i.test(href) ? href : _.url.resolve(base, href)
    item.href = _.url.hostpath(item.href)
    ret.push(item)
  })
  return ret
}
// Keep only the anchor attribute objects whose `href` is a javascript: url.
//   arr - array of attribute objects, or a single such object, which is what
//         elemAttr yields for exactly one match
// Returns the matching items, untouched and in order.
// The scheme test is anchored and case-insensitive, so an ordinary url that
// merely contains "javascript:" is not reported, and non-string hrefs are
// skipped instead of throwing.
function cleanJsLinks(arr) {
  const items = _.isString(_.get(arr, 'href')) ? [arr] : arr
  const ret = []
  _.each(items, function (item) {
    const href = _.get(item, 'href', '')
    if (_.isString(href) && /^\s*javascript:/i.test(href)) ret.push(item)
  })
  return ret
}
// Normalise a list of image attribute objects.
//   base - url that relative `src` values are resolved against
//   arr  - array of attribute objects (each may carry `src`), or a single such
//          object, which is what elemAttr yields for exactly one match
// Returns the kept items in order. Each kept item's `src` is rewritten IN
// PLACE to `_.url.hostpath(absoluteSrc)`.
// Dropped: entries whose src is empty or not a string, and repeats of a raw
// src already seen.
function cleanImages(base, arr) {
  const items = _.isString(_.get(arr, 'src')) ? [arr] : arr
  const seen = new Set()
  const ret = []
  _.each(items, function (item) {
    const src = _.get(item, 'src', '')
    if (!src || !_.isString(src) || seen.has(src)) return
    seen.add(src)
    // anchored scheme test: a relative path that merely contains "http"
    // (e.g. "/img/http-logo.png") still has to be resolved against `base`
    item.src = /^https?:\/\//i.test(src) ? src : _.url.resolve(base, src)
    item.src = _.url.hostpath(item.src)
    ret.push(item)
  })
  return ret
}
// Frame entries are currently returned exactly as received; `base` is accepted
// but not used.
// TODO: resolve each frame `src` against `base` and reduce it with
// `_.url.hostpath`, mirroring what cleanImages does for images.
function cleanFrames(base, arr) {
  const untouched = arr
  return untouched
}
// Load the document named by `options.src` and hand back a selection from it.
//   options  - { src, sel }; `src` is a file path or url for load()
//   callback - receives (err, result) where result is `$(sel)` when a selector
//              is given and the document root `$` otherwise
// The document is obtained through this module's load(); no `get` loader is
// defined in this file.
// NOTE(review): assumes `options.src` is the source to load - confirm.
function elem(options, callback) {
  load(_.get(options, 'src'), function (err, $) {
    if (err) return _.fail(err, callback)
    const sel = _.get(options, 'sel')
    _.done(sel ? $(sel) : $, callback)
  })
}
// Load the document named by `options.src` and read attributes off the
// elements matching `options.sel`.
//   options  - { src, url, sel, attr }, each defaulting to ''
//   callback - receives (err, result)
// Result shape depends on how many elements match:
//   0 matches (or no selector) -> null
//   1 match                    -> that element's `attr` value, or its whole
//                                 attribute object when `attr` is empty
//   2+ matches                 -> array with one such value per element
// The document is obtained through this module's load(); no `get` loader is
// defined in this file.
// NOTE(review): assumes `options.src` is the source to load - confirm.
function elemAttr(options, callback) {
  callback = _.cb(arguments)
  options = _.mcopy({ src: '', url: '', sel: '', attr: '' }, _.notcb(options, {}))
  load(_.get(options, 'src'), function (err, $) {
    if (err) return _.fail(err, callback)
    const sel = _.get(options, 'sel')
    const attr = _.get(options, 'attr')
    // one attribute when `attr` is set, otherwise the full attribute object
    const readAttr = function ($el) { return attr ? $el.attr(attr) : $el.attr() }
    const $elems = sel ? $(sel) : []
    let result = null
    if ($elems.length === 1) {
      result = readAttr($elems)
    } else if ($elems.length > 1) {
      result = $elems.map(function (i, node) { return readAttr($(node)) }).get()
    }
    _.done(result, callback)
  })
}
// Load the document named by `options.src` and render the html of the
// elements matching `options.sel`.
//   options  - { src, sel }
//   callback - receives (err, html); html is '' when no selector is given
// The document is obtained through this module's load(); no `get` loader is
// defined in this file.
// NOTE(review): assumes `options.src` is the source to load - confirm.
function elemHtml(options, callback) {
  load(_.get(options, 'src'), function (err, $) {
    if (err) return _.fail(err, callback)
    const sel = _.get(options, 'sel')
    _.done(sel ? $.html(sel) : '', callback)
  })
}
// Gather the text of every selected node into a plain array.
//   $      - document function used to wrap nodes (and to select when `sel`
//            is not already a cheerio selection)
//   sel    - selector, or an existing cheerio selection (see isElem)
//   filter - optional selector; when given, each node contributes the text of
//            its `filter` descendants instead of its own text
function mapTexts($, sel, filter) {
  const $targets = isElem(sel) ? sel : $(sel)
  const texts = $targets.map((index, node) => {
    const $node = $(node)
    return filter ? $node.find(filter).text() : $node.text()
  })
  return texts.get()
}