bquery
Version:
bquery is a Node.js module for fetching web pages; it uses CSS selectors to extract and structure the content of the fetched HTML page.
257 lines (227 loc) • 8.02 kB
JavaScript
// Library dependencies, loaded in the same order as before.
var _ = require("underscore");
var q = require('q');
var util = require('util');
var cheerio = require('cheerio');
var htmlToText = require('html-to-text');
var URI = require("urijs");

// Host object whose fetch() and _wrapResults() this module calls; assigned by exports._init.
var bquery;
// Limit that select() compares the per-URL retry counter against.
var RETRY_COUNT = 3;
// Retry counters maintained by select(), keyed by query.url.
var retryCounter = {};
exports._init = function(n) {
bquery = n;
};
exports.fetch = fetch;
exports.select = select;
/**
 * Fetches `url` through the injected bquery host and runs `select` on the
 * fetched page.
 *
 * @param {string} url    address handed to bquery.fetch
 * @param {Object} query  query description, passed to both bquery.fetch and select
 * @returns {Promise} the chained result of bquery.fetch(...).then(...), i.e.
 *                    whatever select(page, query) settles to
 */
function fetch(url, query) {
    // A promise fulfilment handler receives exactly one value, so only the
    // page body is available here.
    return bquery.fetch(url, query).then(function(page) {
        return select(page, query);
    });
}
/**
 * Tells whether `a` is a plain object, i.e. a truthy value whose
 * `constructor` is the built-in `Object`.
 *
 * Declared as a function so that it no longer leaks an implicit global
 * (which also throws a ReferenceError in strict-mode / ES-module code).
 *
 * @param {*} a  value to test
 * @returns {boolean} true for plain objects; false for arrays, strings,
 *                    null/undefined and instances of other constructors
 */
function isObject(a) {
    return (!!a) && (a.constructor === Object);
}
/**
 * Parses `body` with cheerio, applies the query's selector and extracts the
 * requested data.
 *
 * Steps:
 *  - unless `query.holdScripts` is truthy, <script> and <style> elements are
 *    removed from the parsed page;
 *  - an own `query.preSelect` function, if present, is called with the page;
 *  - without `query.selector` the promise resolves with
 *    bquery._wrapResults(body.trim(), query);
 *  - otherwise the extracted data is wrapped the same way, or the promise is
 *    rejected when nothing was extracted;
 *  - when `query.setCookie` is set, the extraction failed and the response
 *    headers in `query.meta` carry `set-cookie`, the page is fetched again
 *    with those cookies; RETRY_COUNT bounds the per-URL retry counter.
 *
 * @param {string} body   HTML source of the page
 * @param {Object} query  query description (selector, extract, url, ...)
 * @returns {Promise} promise for the wrapped results, or the promise returned
 *                    by a retried fetch()
 */
function select(body, query) {
    var page = cheerio.load(body);
    if (!query.holdScripts) {
        page('script').remove();
        page('style').remove();
    }
    if (query.hasOwnProperty('preSelect') && typeof query.preSelect == "function") {
        query.preSelect(page);
    }

    var deferred = q.defer(),
        extract = query.extract || 'text',
        selector = query.selector,
        results;

    if (!selector) {
        deferred.resolve(bquery._wrapResults(body.trim(), query));
        return deferred.promise;
    }

    results = extractData(page(selector), extract, page, query.url);

    // Pass back the extracted results from the DOM.
    // extractData unwraps a single match, so `results` can be undefined or
    // null (e.g. a missing attribute); treat that as "nothing extracted"
    // instead of throwing on `.length`.
    if (results === undefined || results === null || results.length === 0) {
        deferred.reject(new Error('Could not match with that selector or extract value'));
    } else {
        deferred.resolve(bquery._wrapResults(results, query));
    }

    // NOTE(review): this relies on q's synchronous promise.valueOf()
    // introspection, where a rejected promise presumably exposes `.exception`
    // - confirm against the installed q version.
    var result = deferred.promise,
        value = result.valueOf(),
        // query.meta may be absent; guard instead of throwing.
        headers = query.meta && query.meta.responseHeaders;

    if (query.setCookie && value && value.exception && headers && headers['set-cookie']) {
        query.cookies = headers['set-cookie'];
        delete query.meta;
        retryCounter[query.url] = retryCounter[query.url] ? (retryCounter[query.url] + 1) : 1;
        if (retryCounter[query.url] > RETRY_COUNT) {
            // Retry budget exhausted: reset the counter and hand back the failure.
            retryCounter[query.url] = 1;
            return result;
        }
        return fetch(query.url, query);
    }

    retryCounter[query.url] = 1;
    return result;
}
/**
 * Extracts data from every element of a cheerio selection.
 *
 * `extract` selects the mode:
 *  - plain object: one result object per element; each key maps to either a
 *    string (shorthand for `{extract: <string>}`) or a mapping object with
 *    optional `selector`, `extract`, `value` and `callback` members. A
 *    mapping's `extract` may itself be an object or array (handled
 *    recursively);
 *  - array: one result object per element with one key per listed property
 *    (a string, or an object with `name` / `prop`);
 *  - anything else: the value is passed to extractProperty for each element.
 *
 * Result objects whose extracted values are all falsy are dropped.
 *
 * @param {Object} selected  cheerio selection to iterate over
 * @param {Object|Array|string} [extract='text']  what to extract
 * @param {Function} page    the loaded cheerio document function
 * @param {string} url       page URL, forwarded to extractProperty
 * @returns {*} the single result when exactly one was produced, otherwise the
 *              array of results (possibly empty)
 */
function extractData(selected, extract, page, url) {
    var results = [];
    extract = extract || 'text';

    if (isObject(extract)) {
        selected.each(function(i, anchor) {
            // All loop state is local: this function recurses, and shared
            // (implicitly global) `prop` / `set` variables would be
            // overwritten by the nested call.
            var item = {},
                notEmpty,
                prop,
                mapping,
                attr,
                set,
                extracted;

            for (prop in extract) {
                mapping = extract[prop];
                // A bare string is shorthand for extracting that property
                // from the current element.
                if (typeof mapping === 'string') {
                    mapping = {extract: mapping};
                }
                mapping = _.extend({killBreaks: true, stripSpaces: true, prop: prop}, mapping);

                // Constant value: copied as-is; it does not count towards notEmpty.
                if (mapping.value) {
                    item[prop] = mapping.value;
                    continue;
                }

                attr = mapping.extract;
                if (!mapping.selector || mapping.selector.length === 0 || mapping.selector === '.') {
                    // No sub-selector: work on the current element itself.
                    // page(anchor) wraps it via the loaded document, so the
                    // cheerio module export need not be a constructor.
                    set = page(anchor);
                } else {
                    set = page(mapping.selector, anchor);
                }

                extracted = extractData(set, attr, page, url);
                if (mapping.hasOwnProperty('callback') && typeof mapping.callback == "function") {
                    extracted = mapping.callback.call(this, extracted);
                }
                item[prop] = trimBlank(extracted, mapping);
                notEmpty = notEmpty || item[prop];
            }

            if (notEmpty) {
                results.push(item);
            }
        });
    } else if (Array.isArray(extract)) {
        selected.each(function(i, elem) {
            var item = {},
                notEmpty;

            extract.forEach(function(property) {
                var name,
                    prop,
                    opts;

                if (isObject(property)) {
                    name = property.name;
                    prop = property.prop;
                    opts = _.extend({stripSpaces: true, killBreaks: true, prop: name}, property);
                } else {
                    name = prop = property;
                    // Plain strings are handed to trimBlank unchanged, as before.
                    opts = property;
                }

                item[prop] = trimBlank(extractProperty(page, elem, prop, url), opts);
                // Test the key that was actually written, so an item is not
                // dropped just because `name` differs from `prop`.
                // NOTE(review): `name` may have been meant as the output key;
                // keys stay `prop` to preserve the existing result shape - confirm.
                notEmpty = notEmpty || item[prop];
            });

            if (notEmpty) {
                results.push(item);
            }
        });
    } else {
        selected.each(function(i, elem) {
            results.push(extractProperty(page, elem, extract, url));
        });
    }

    // A single result is returned unwrapped.
    if (results.length === 1) return results[0];
    return results;
}
/**
 * Resolves a possibly relative URL against the URL of the page it came from.
 *
 * When `base` contains "192.168" and carries a `url` query parameter, the
 * relative URL is resolved against that parameter instead of `base`
 * (presumably the page was fetched through a LAN proxy that receives the real
 * address in `?url=` - confirm against callers).
 *
 * @param {string} surl  URL to resolve
 * @param {string} base  URL of the containing page
 * @returns {string} absolute URL, or `surl` unchanged when it is already
 *                   absolute or no base is available
 */
function absoluteUrl(surl, base) {
    // Nothing to resolve against: return the input instead of letting the URI
    // library throw on a missing base.
    if (!base) {
        return surl;
    }

    var uri = URI(surl);
    if (!uri.is("relative")) {
        return surl;
    }

    var params = URI.parseQuery(URI(base).hash("").query());
    if (params.url && base.indexOf("192.168") !== -1) {
        return uri.absoluteTo(params.url).href();
    }
    return uri.absoluteTo(base).href();
}
/**
 * Normalises whitespace in an extracted string. Non-string values are
 * returned untouched.
 *
 * @param {*} source  extracted value
 * @param {Object} [opts]
 * @param {boolean} [opts.killBreaks]   remove line breaks; when `opts.prop` is
 *                                      "body", runs of breaks collapse to one "\n" instead
 * @param {boolean} [opts.stripSpaces]  when `opts` is given and this is falsy,
 *                                      space normalisation is skipped
 * @param {string} [opts.prop]          name of the property being cleaned
 * @returns {*} cleaned string, or `source` itself when it is not a string
 */
function trimBlank(source, opts) {
    if (typeof source !== "string") {
        return source;
    }

    if (opts && opts.killBreaks) {
        if (opts.prop !== "body") {
            source = source.replace(/[\r\n]/g, "");
        } else {
            source = source.replace(/(\r\n|\r|\n)+/g, "\n");
        }
    }

    if (opts && !opts.stripSpaces) {
        return source;
    }

    return source
        .replace(/(^\s+)|(\s+$)/g, "")
        // Turn literal "&nbsp;" entities (semicolon optional) into spaces. The
        // pattern must name the entity: an optional-space pattern matches the
        // empty string everywhere and inserts a space between all characters.
        .replace(/&nbsp;?/g, ' ')
        // Collapse runs of spaces / no-break spaces, then trim both ends again
        // (the entity replacement may have produced new edge spaces).
        .replace(/(\u00A0| )+/g, ' ')
        .replace(/^(\u00A0| )+/, '')
        .replace(/(\u00A0| )+$/, '');
}
// Appends one {type: "img"} entry to `props` for every non-empty,
// space-separated token of `group` ("][" between adjacent tokens counts as a
// separator). Each token is resolved against `url`.
function pushImageTokens(props, group, url) {
    try {
        group.replace(/\]\[/g, " ").split(" ").forEach(function(token) {
            if (token) {
                props.push({type: "img", value: absoluteUrl(token, url)});
            }
        });
    } catch (err) {
        // Best effort: if a token cannot be resolved, the remaining tokens of
        // this group are skipped rather than failing the whole extraction.
    }
}

/**
 * Extracts one property from a single element.
 *
 * Supported `property` values:
 *  - "selftext": text of the element's direct text nodes only;
 *  - "text": text content of the element;
 *  - "html" / "innerHTML": inner HTML of the element;
 *  - "json": the element's HTML converted to text and split per line into a
 *    list of {type: "text"|"img", value} parts;
 *  - anything else: the attribute of that name; non-empty "href" / "src"
 *    values are made absolute relative to `url`.
 *
 * @param {Function} page     the loaded cheerio document function
 * @param {Object} elem       element to read from
 * @param {string} property   what to extract (see above)
 * @param {string} url        URL of the page, used to resolve relative links
 * @returns {*} string, list of parts (for "json"), or whatever attr() yields
 */
function extractProperty(page, elem, property, url) {
    if (property === 'selftext') {
        // Use the element passed as the second callback argument rather than
        // `this`, so the filter does not depend on how `this` is bound.
        return page(elem).contents().filter(function(i, node) {
            return node.type === "text";
        }).text();
    }
    if (property === 'text') {
        return page(elem).text();
    }
    if (property === 'html' || property === 'innerHTML') {
        return page(elem).html();
    }
    if (property === "json") {
        var props = [],
            text = htmlToText.fromString(page(elem).html(), {ignoreHref: true, wordwrap: 100});

        text.split(/\n/).forEach(function(line) {
            // Bracketed tokens mentioning "gif" are dropped entirely.
            var str = line.replace(/\[.*?gif.*?\]/ig, ""),
                mt;

            // NOTE(review): `[jpg|png|jpeg]` in the two patterns below is a
            // character class (any one of those letters or "|"), not an
            // alternation of extensions. Left unchanged because tightening it
            // would change which bracketed tokens count as images - confirm intent.
            if ((mt = str.match(/^\[(.*?[jpg|png|jpeg].*?)\]$/i))) {
                // The whole line is one bracketed group: images only.
                pushImageTokens(props, mt[1], url);
            } else if ((mt = str.match(/\[(.*?[jpg|png|jpeg].*?)\]/gi))) {
                // Bracketed groups embedded in text: emit the text before each
                // group, then its images, and finally any trailing text.
                mt.forEach(function(group, i) {
                    var idx = str.indexOf(group),
                        len = group.length;

                    props.push({type: "text", value: str.substr(0, idx)});
                    pushImageTokens(props, group.substr(1, len - 2), url);
                    str = str.substr(idx + len);
                    if (i === mt.length - 1 && str.length > 0) {
                        props.push({type: "text", value: str});
                    }
                });
            } else if (str.trim()) {
                props.push({type: "text", value: str});
            }
        });
        return props;
    }

    var attr = page(elem).attr(property);
    if ((property === "href" || property === "src") && typeof attr === "string" && attr) {
        attr = absoluteUrl(attr, url);
    }
    return attr;
}