bquery
Version:
bquery is a Node module that fetches web pages and uses CSS selectors to extract and structure the content of each HTML page.
346 lines (288 loc) • 9.39 kB
JavaScript
var q = require('q'),
fs = require('fs'),
events = require('events'),
req = require('req-fast'),
_ = require('underscore'),
Cache = require('./cache'),
pageCache,
resultsCache;
// ------------------------------------------------------------
// Main noodle entry point for usage.
//
// Accepts one or an array of noodle queries. Based on the
// query type it will make use of the appropriate type module
// to do the processing.
//
// See docs/ for information on which noodle queries can
// be written.
// ------------------------------------------------------------
exports.query = function (queries) {
var deferred = q.defer(),
promises = [];
// Normalise one query to an array
queries = _.isArray(queries) ? queries : [queries],
// For each query route resolve it as either a normal query
// or a map query
queries.forEach(function (query, i) {
var deferred = q.defer();
query.type = query.type || exports.config.defaultDocumentType;
query.cache = (query.cache === false) ? false : true;
exports.events.emit('noodle/query', query);
if (exports[query.type]) {
if (query.map) {
handleQueryMap(query, deferred, i);
} else {
handleQuery(query, deferred, i);
}
} else {
deferred.resolve({results: [], error: 'Document type not supported'});
}
promises.push(deferred.promise);
});
// Return master promise when all queries have resolved
// and ensure that the order they were evaluated is
// maintained
q.all(promises)
.then(function (results) {
results = results.sort(function (a, b) {
return a.orderNo - b.orderNo;
});
results.forEach(function (result) {
delete result.orderNo;
});
deferred.resolve({results: results});
});
return deferred.promise;
};
// Runs a plain (non-map) query through the type module registered under
// exports[query.type] and settles `deferred` with the outcome.
//
// `deferred` is always resolved, never rejected: a failure becomes an
// empty result set carrying the error message. `i` is stored as orderNo
// on whatever gets resolved.
function handleQuery (query, deferred, i) {
  var typeModule = exports[query.type];

  function onResult (result) {
    result.orderNo = i;
    deferred.resolve(result);
  }

  function onFailure (error) {
    deferred.resolve({results: [], error: error.message, orderNo: i});
  }

  typeModule.fetch(query.url, query).then(onResult).fail(onFailure);
}
// Runs a map-notation query through map() and settles `deferred` with
// the outcome.
//
// `deferred` is always resolved, never rejected: an error reported by
// map() becomes an empty result set carrying the error message. `i` is
// stored as orderNo on whatever gets resolved.
function handleQueryMap (query, deferred, i) {
  map(query, function (error, result) {
    var outcome;

    if (error) {
      outcome = {results: [], error: error.message, orderNo: i};
    } else {
      result.orderNo = i;
      outcome = result;
    }

    deferred.resolve(outcome);
  });
}
// ------------------------------------------------------------
// Fetch a web document (possibly from cache) with a url.
//
// The query should also be passed in as it contains
// details if it should bypass the cache or if it is a POST
// request.
//
// This fetch method is used by the different type modules to
// get the document before they interpret the query and
// process the document.
// ------------------------------------------------------------
// Builds a dotted address string from a pattern by replacing every `*`
// with a random integer between 0 and 255 inclusive.
//
// ip_pattern - optional pattern; a falsy value selects "223.*.*.*".
//
// Returns the pattern with each `*` substituted independently.
function random_ip(ip_pattern){
  ip_pattern = ip_pattern || "223.*.*.*";
  return ip_pattern.replace(/\*/g, function () {
    // floor over 256 buckets gives each value 0..255 the same
    // probability; rounding over 255 would make 0 and 255 half as likely.
    return Math.floor(Math.random() * 256);
  });
}
exports.fetch = function (url, query) {
var opts = _.extend({ timeout: 10000, url: url, trackCookie: true }, _.pick(query, "charset", "cookies", "proxy", "timeout")),
deferred = q.defer();
if(query.post) {
opts.method = 'POST';
opts.body = serialize(query.post);
}
query.cache = false;
var r_ip = random_ip();
req(opts, function(err, resp){
if(err){
deferred.reject(err || new Error('Document not found'));
}
else if(resp.statusCode == 302){
req({url: url, cookies: resp.cookies}, function(err, resp){
if(err || resp.statusCode != 200){
return deferred.reject(err || new Error('Document not found'));
}
deferred.resolve(resp.body);
});
}
else{
deferred.resolve(resp.body);
}
});
return deferred.promise;
};
// Requests `url` with req-fast and hands the outcome to `next`.
//
// url  - address to request.
// next - optional callback invoked as next(err, resp) with exactly what
//        req-fast reported.
//
// NOTE(review): nothing in this file calls jump(). The previous body read
// an `opts` variable that is not defined in this scope and ignored both
// parameters; the behaviour below is the minimal reading of the signature
// and should be confirmed against the intended use.
function jump(url, next){
  req({url: url}, function(err, resp){
    if (typeof next === 'function') {
      next(err, resp);
    }
  });
}
// ------------------------------------------------------------
// Returns an object representing a result set, which comprises
// an array of 1 or more results and the associated page
// header information.
//
// (!!) This is where a result set is cached in resultsCache.
//
// Exposed as it is also called from some type modules.
// ------------------------------------------------------------
exports._wrapResults = function (results, query) {
var resultSet = {};
if (results.length || Object.keys(results).length) {
resultSet.result = results;
if (query.headers) {
resultSet.headers = getHeadersForResultSet(query);
}
if (query.linkHeader) {
resultSet.headers = resultSet.headers || {};
resultSet.headers.link = getLinkHeaders(query) || null;
}
return resultSet;
}
return [];
};
// ------------------------------------------------------------
// The namespace for noodles events.
//
// Events are emitted from both this file and cache.js.
//
// One can subscribe to the following events:
// - cache/page
// - cache/result
// - cache/purge
// - cache/expire
//
// ------------------------------------------------------------
// Shared emitter instance. Within this file the only event emitted on it
// is 'noodle/query', fired once per query from exports.query().
exports.events = new events.EventEmitter();
// ------------------------------------------------------------
// An exposed noodle config initialized by an editable
// json representation at lib/config.json
// ------------------------------------------------------------
exports.config = JSON.parse(fs.readFileSync(__dirname +'/config.json'));
// ------------------------------------------------------------
// Accepts a full or partial config object and extends it over
// the existing noodle config.
//
// This is a way to programmatically configure the config
// without touching lib/config.json
// ------------------------------------------------------------
exports.configure = function (obj) {
exports.config = _.extend(exports.config, obj);
};
// ------------------------------------------------------------
// Stops the cache intervals from running in the event loop.
// Allows for the node process to exit.
// ------------------------------------------------------------
exports.stopCache = function () {
resultsCache.stop();
pageCache.stop();
};
// Function called from exports.query()
//
// Takes in a query in the map notation
//
// For each map property, a call to the appropriate type module
// is done and the result is grabbed for that map property's
// value.
//
// When all properties are mapped with values this function calls
// back to exports.query().
// Resolves a query written in map notation.
//
// query    - query whose `map` object holds one sub-query per property.
//            Each sub-query is given the parent's `url` and `cache`
//            (written onto the sub-query object) and fetched through the
//            type module registered under exports[query.type].
// callback - invoked exactly once, as callback(null, resultSet) or as
//            callback(error).
//
// A sub-query that fails does not fail the whole map: its property is set
// to {results: [], error: message} instead of the fetched results.
function map (query, callback) {
  var mappedContainer = {},
      typeModule = exports[query.type],
      promises;

  // Object.keys limits the walk to the map's own properties, so nothing
  // inherited through the prototype chain is fetched.
  promises = Object.keys(query.map).map(function (mapTo) {
    var part = query.map[mapTo];

    part.url = query.url;
    part.cache = query.cache;

    return typeModule.fetch(query.url, part)
      .then(function (result) {
        mappedContainer[mapTo] = result.results;
      })
      .fail(function (error) {
        mappedContainer[mapTo] = {results: [], error: error.message};
      });
  });

  q.all(promises)
    .then(function () {
      return exports._wrapResults(mappedContainer, query);
    })
    // Success and failure handlers are registered on the same step so an
    // exception thrown by the callback itself can not cause the callback
    // to be invoked a second time with that exception.
    .then(function (wrapped) {
      callback(null, wrapped);
    }, function (error) {
      callback(error);
    });
}
// Function called from exports._wrapResults()
//
// Passed in a query and returns the full page headers
// or specific page headers as specified by the query.
// Picks page headers for a result set.
//
// query - the query; `headers` is either 'all' or an array of header
//         names, which are matched case-insensitively.
//
// Returns an object keyed by the requested names when `headers` is an
// array, otherwise the complete page headers.
//
// NOTE(review): pageHeaders is declared but never assigned in this
// function, so as written the array form yields {} and every other form
// yields undefined.
function getHeadersForResultSet (query) {
  var pageHeaders,
      selected = {},
      headerName;

  if (query.headers === 'all' || !_.isArray(query.headers)) {
    return pageHeaders;
  }

  for (headerName in pageHeaders) {
    query.headers.forEach(function (wanted) {
      if (headerName.toLowerCase() === wanted.toLowerCase()) {
        selected[wanted] = pageHeaders[headerName];
      }
    });
  }

  return selected;
}
// Function called from exports._wrapResults()
//
// Passed in a query this function returns a parsed representation
// of the Link header values (intended to aid people with navigation).
// Parses a Link header value into an object that maps each rel name to
// its url.
//
// Returns the parsed links, or false when there is no header to parse.
//
// NOTE(review): `header` is declared but never assigned in this function,
// so as written the guard below always returns false and the parsing
// code is never reached.
function getLinkHeaders (query) {
  var header,
      links = {};

  if (!header) {
    return false;
  }

  // Each comma separated part has the form `<url>; rel="name"`
  header.split(',').forEach(function (part) {
    var pieces = part.split(';'),
        target = pieces[0].replace(/<(.*)>/, '$1').trim(),
        rel = pieces[1].replace(/rel="(.*)"/, '$1').trim();

    links[rel] = target;
  });

  return links;
}
// Function called from exports.query
//
// Will return a query parameter string from an object.
// Turns an object into a query parameter string: every enumerable
// property becomes `key=value` with both sides passed through
// encodeURIComponent, and the pairs are joined with `&`.
//
// Returns the encoded string; an object without enumerable properties
// gives an empty string.
function serialize (obj) {
  var pairs = [],
      key;

  for (key in obj) {
    pairs[pairs.length] = [key, obj[key]].map(function (piece) {
      return encodeURIComponent(piece);
    }).join("=");
  }

  return pairs.join("&");
}
// .---------------------------.
// |noodle initialization stuff|
// '---------------------------'
// Initialize supported document types
// Every file in ./types/ is registered on exports under its base name
// (the text before the last dot) and is handed this module through its
// _init() hook.
fs.readdirSync(__dirname + '/types/').forEach(function (file) {
  var dot = file.lastIndexOf('.'),
      name;

  // Entries without a base name (dotfiles such as .DS_Store, or names
  // with no extension at all) would otherwise turn into
  // require('./types/') and abort the module load.
  if (dot <= 0) {
    return;
  }

  name = file.slice(0, dot);
  exports[name] = require('./types/' + name);
  exports[name]._init(exports);
});
// Start the logger.
// The logger will output to terminal if config.debug is set
// to true.
require('./logger')(exports);