bquery
Version:
bquery is a Node.js module for fetching web pages; it uses CSS selectors to extract and structure the content of the fetched HTML page.
139 lines (128 loc) • 3.74 kB
JavaScript
var _ = require("underscore"),
q = require('q'),
jsonSelect = require('JSONSelect'),
bquery;
exports._init = function (n) {
bquery = n;
};
exports.fetch = fetch;
exports.select = select;
/**
 * Retrieves a document through the injected bquery instance, parses it as
 * JSON and applies `query` to the parsed value.
 *
 * @param {*} url locator handed unchanged to `bquery.fetch`
 * @param {Object} query query descriptor, forwarded to both `bquery.fetch`
 *                       and `select`
 * @return {Promise} settles with the outcome of `select(parsed, query)`;
 *                   rejects with Error('Could not parse JSON document')
 *                   when the fetched data is not valid JSON
 */
function fetch (url, query) {
    return bquery.fetch(url, query).then(function (data) {
        var parsed;
        // Guard only the parse step, so that a failure raised by select()
        // is never misreported as a JSON parsing problem.
        try {
            parsed = JSON.parse(data);
        }
        catch (e) {
            throw new Error('Could not parse JSON document');
        }
        return select(parsed, query);
    });
}
/**
 * Applies `query` to an already parsed document and reports the outcome
 * through a promise.
 *
 * When `query.selector` is falsy the whole document is wrapped as the single
 * result. Otherwise `pick_value` performs the lookup and its return value is
 * wrapped, unless that value has a `length` of exactly 0.
 *
 * @param {*} parsed the parsed document
 * @param {Object} query query descriptor (reads `query.selector`)
 * @return {Promise} resolves with `bquery._wrapResults(results, query)`;
 *                   rejects with Error('Could not match with that selector')
 *                   when the lookup yields a zero-length result or when
 *                   anything inside the lookup/wrapping throws
 */
function select (parsed, query) {
    var pending = q.defer();

    // Single place that produces the "no match" rejection.
    function failNoMatch() {
        pending.reject(new Error('Could not match with that selector'));
    }

    try {
        if (query.selector) {
            var matches = pick_value(parsed, query);
            if (matches.length !== 0) {
                pending.resolve(bquery._wrapResults(matches, query));
            }
            else {
                failNoMatch();
            }
        }
        else {
            pending.resolve(bquery._wrapResults([parsed], query));
        }
    } catch (e) {
        // Any exception (including reading `.length` of a missing result)
        // is reported as a selector mismatch.
        failNoMatch();
    }

    return pending.promise;
}
/**
 * Builds a value from `parsed`, shaped according to `query`.
 *
 * `query` is either a JSONSelect selector string, or a query object such as:
 *   {
 *     selector: ".media_types>.buckets",
 *     extract: {
 *       media_type: ":root>.key"
 *     }
 *   }
 *
 * - String query: the selector is matched against `parsed`. A single match
 *   is returned unwrapped; otherwise the array of matches is returned.
 * - Object query: the FIRST match of `query.selector` becomes the root
 *   (`null` when there is no selector, `undefined` when nothing matches).
 *   Without `extract` the root is returned as is. With `extract`, every key
 *   is resolved recursively against the root; when the root is an array it
 *   is flattened and one object is produced per element.
 *
 * @param {*} parsed data source
 * @param {(string|Object)} query selector string or query object
 * @return {*} the value constructed according to `query`
 */
function pick_value(parsed, query){
    var root = null,
        matches;

    // A plain string is a leaf selector. Return right away: falling through
    // to the object handling used to overwrite the match with `null`.
    if (typeof query === "string") {
        matches = jsonSelect.match(query, parsed);
        return (_.isArray(matches) && matches.length === 1) ? matches[0] : matches;
    }

    if (query.selector) {
        // Only the first match is used as the root.
        root = jsonSelect.match(query.selector, [], parsed)[0];
    }

    if (!query.extract) {
        return root;
    }

    // Resolves every `extract` entry against one source value.
    function extractFrom(source) {
        var picked = {};
        for (var key in query.extract) {
            picked[key] = pick_value(source, query.extract[key]);
        }
        return picked;
    }

    if (_.isArray(root)) {
        return _.flatten(root).map(function (item) {
            return extractFrom(item);
        });
    }
    return extractFrom(root);
}
/**
 * Expands a nested object into an array of flat "row" objects.
 *
 * Walks the keys of `source` in enumeration order:
 * - a non-array value is copied onto the current row (`res`);
 * - a non-object array element is appended to an array stored on the row
 *   under the same key;
 * - an object array element is flattened recursively into the current row,
 *   after which the walk continues on a copy of the row taken before that
 *   recursion.
 * The row this call started with is pushed to `arrs` only if it was never
 * replaced by such a copy.
 *
 * @param {Object} source object to expand
 * @param {Object} res    recursion parameter; initial value is an empty object {}
 * @param {Array}  arrs   receives the resulting rows; initial value is an empty array []
 */
function flatten(source, res, arrs){
    var startingRow = res;

    // Handles one element of an array-valued property called `name`.
    function absorb(name, element) {
        if (_.isObject(element)) {
            var snapshot = _.extend({}, res);
            flatten(element, res, arrs);
            // Continue on the pre-recursion copy so siblings start clean.
            res = snapshot;
            return;
        }
        if (!res[name]) {
            res[name] = [];
        }
        res[name].push(element);
    }

    for (var key in source) {
        var value = source[key];
        if (!_.isArray(value)) {
            res[key] = value;
            continue;
        }
        value.forEach(function (element) {
            absorb(key, element);
        });
    }

    // Identity check: true only when `res` was never swapped for a copy.
    if (startingRow == res) {
        arrs.push(res);
    }
}