UNPKG

bquery

Version:

bquery is a Node.js module that fetches web pages and uses CSS selectors to extract and structure their HTML content.

257 lines (227 loc) 8.02 kB
var _ = require("underscore"), q = require('q'), util = require('util'), cheerio = require('cheerio'), bquery, RETRY_COUNT = 3, retryCounter = {}, htmlToText = require('html-to-text'); var URI = require("urijs"); exports._init = function(n) { bquery = n; }; exports.fetch = fetch; exports.select = select; function fetch(url, query) { var deferred = q.defer(); return bquery.fetch(url, query).then(function(page, meta) { return select(page, query); }); } isObject = function(a) { return (!!a) && (a.constructor === Object); }; function select(body, query) { var page = cheerio.load(body); if (!query.holdScripts) { page('script').remove(); page('style').remove(); } if (query.hasOwnProperty('preSelect') && typeof query.preSelect == "function") { query.preSelect(page); } var deferred = q.defer(), extract = query.extract || 'text', selector = query.selector, selected = page(selector), results = []; if (!selector) { deferred.resolve(bquery._wrapResults(body.trim(), query)) return deferred.promise; } results = extractData(selected, extract, page, query.url) // Pass back the extracted results from the DOM if (results.length === 0) { deferred.reject(new Error('Could not match with that selector or extract value')); } else { deferred.resolve(bquery._wrapResults(results, query)); } var result = deferred.promise, value = result.valueOf(); if (query.setCookie && value.exception && query.meta.responseHeaders && query.meta.responseHeaders['set-cookie']) { query.cookies = query.meta.responseHeaders['set-cookie']; delete query.meta; retryCounter[query.url] = !!retryCounter[query.url] ? 
(retryCounter[query.url] + 1) : 1; if (retryCounter[query.url] > RETRY_COUNT) { retryCounter[query.url] = 1; return result; } return fetch(query.url, query); } else { retryCounter[query.url] = 1; return result; } } function extractData(selected, extract, page, url) { var results = []; extract = extract || 'text'; if (isObject(extract)) { selected.each(function(i, achor) { var item = {}, notEmpty; for (prop in extract) { var attr, elem = achor, mapping = extract[prop]; mapping = _.extend({killBreaks: true, stripSpaces: true, prop: prop}, mapping) if (isObject(mapping)) { attr = mapping.extract; if (mapping.value) { item[prop] = mapping.value; continue; } if (!mapping.selector || mapping.selector.length === 0 || mapping.selector === '.') { set = new cheerio(elem); } else { set = page(mapping.selector, achor); } } else { attr = mapping; } if (mapping.hasOwnProperty('callback') && typeof mapping.callback == "function") { item[prop] = trimBlank(mapping.callback.call(this, extractData(set, attr, page, url)), mapping); } else { item[prop] = trimBlank(extractData(set, attr, page, url), mapping) } notEmpty = notEmpty || item[prop]; } if (notEmpty) { results.push(item); } }); } else if (util.isArray(extract)) { selected.each(function(i, elem) { var item = {}, notEmpty; extract.forEach(function(property) { var name, prop; if (isObject(property)) { name = property.name prop = property.prop property = _.extend({stripSpaces: true, killBreaks: true, prop: name}, property) } else { name = prop = property } item[prop] = trimBlank(extractProperty(page, elem, prop, url), property) notEmpty = notEmpty || item[name] }); if (notEmpty) { results.push(item); } }); } else { selected.each(function(i, elem) { results.push(extractProperty(page, elem, extract, url)); }); } if (results.length === 1) return results[0]; return results; } function absoluteUrl(surl, base){ var uri = URI(surl); if (uri.is("relative")){ var params = URI.parseQuery(URI(base).hash("").query()); if(params.url && 
base.indexOf("192.168") != -1){ return uri.absoluteTo(params.url).href(); } else{ return uri.absoluteTo(base).href(); } } return surl; } function trimBlank (source, opts) { if(typeof source !== "string"){ return source } if(opts && opts.killBreaks){ if(opts.prop !== "body"){ source = source.replace(/[\r\n]/g, "") } else{ source = source.replace(/(\r\n|\r|\n)+/g, "\n") } } if(opts && !opts.stripSpaces){ return source } return source .replace(/(^\s+)|(\s+$)/g, "") .replace(/ ?/g, ' ') .replace(/(\u00A0| )+/g, ' ') .replace(/^(\u00A0| )+/, '') .replace(/(\u00A0| )+$/, '') } function extractProperty(page, elem, property, url) { if (property === 'selftext') { return page(elem).contents().filter(function() { return this[0].type === "text"; }).text(); } else if (property === 'text') { return page(elem).text(); } else if (property === 'html' || property === 'innerHTML') { return page(elem).html(); } else if (property === "json") { var props = [], text = htmlToText.fromString(page(elem).html(), {ignoreHref: true, wordwrap: 100}); text.split(/\n/).forEach(function(str){ var str = str.replace(/\[.*?gif.*?\]/ig, ""), mt; if(mt = str.match(/^\[(.*?[jpg|png|jpeg].*?)\]$/i)){ try{ mt[1].replace(/\]\[/g, " ").split(" ").forEach(function(t){ if(t){ props.push({type: "img", value: absoluteUrl(t, url)}); } }) } catch(err){ } } else if(mt = str.match(/\[(.*?[jpg|png|jpeg].*?)\]/gi)){ mt.forEach(function(t, i){ var idx = str.indexOf(t), l = t.length; props.push({type: "text", value: str.substr(0, idx)}); try{ t.substr(1, l - 2 ).replace(/\]\[/g, " ").split(" ").forEach(function(tobj){ if(tobj){ props.push({type: "img", value: absoluteUrl(tobj, url)}); } }); } catch(err){} str = str.substr(idx + l); if(i == mt.length-1 && str.length > 0){ props.push({type: "text", value: str}); } }) } else{ str.trim() && props.push({type: "text", value: str}); } }) return props; } else { var attr = page(elem).attr(property); if ((property === "href" || property === "src") && typeof attr === "string" && 
attr) { attr = absoluteUrl(attr, url); } return attr; } }