zombie-globbies
Version:
A very quick fix for [**Zombie**](https://github.com/assaf/zombie) that lets it correctly crawl web pages whose html tag carries attributes (e.g. html lang="en").
574 lines (536 loc) • 18.6 kB
JavaScript
var File, HTML, HTTP, MATCH_CHARSET, Path, Promise, QS, Request, Resources, URL, Zlib, assert, iconv,
__hasProp = {}.hasOwnProperty,
__extends = function(child, parent) { for (var key in parent) { if (__hasProp.call(parent, key)) child[key] = parent[key]; } function ctor() { this.constructor = child; } ctor.prototype = parent.prototype; child.prototype = new ctor(); child.__super__ = parent.prototype; return child; },
__slice = [].slice;
iconv = require("iconv-lite");
File = require("fs");
HTML = require("jsdom").defaultLevel;
Path = require("path");
QS = require("querystring");
Request = require("request");
URL = require("url");
HTTP = require('http');
Zlib = require("zlib");
assert = require("assert");
Promise = require("bluebird").Promise;
Resources = (function(_super) {
__extends(Resources, _super);
/**
 * Array-like collection of the resources requested on behalf of a browser.
 *
 * @param {Object} browser - owning browser; pipeline handlers are invoked
 *   with it as `this`
 */
function Resources(browser) {
  this.browser = browser;
  // Private copy: handlers added to this instance must not leak into the
  // shared default pipeline.
  this.pipeline = Resources.pipeline.slice(0);
  // [url, handler] pairs registered through fail(), delay() and mock().
  this.urlMatchers = [];
}
/**
 * Sends a request through the handler pipeline and records it (request,
 * target and eventually response or error) in this collection.
 *
 * @param {string} method - HTTP method in any case; upper-cased before use
 * @param {string} url - URL to request
 * @param {Object} [options] - may carry headers, params, body, timeout, target
 * @param {Function} [callback] - called with (error, response); may be passed
 *   in place of `options`
 * @returns {Promise|*} the response promise when no callback is given,
 *   otherwise whatever `promise.done(...)` returns
 */
Resources.prototype.request = function(method, url, options, callback) {
  var self = this;
  var request, resource, promise;

  if (options == null) {
    options = {};
  }
  // request(method, url, callback): the third argument is really the callback.
  if (!callback && typeof options === 'function') {
    callback = options;
    options = {};
  }

  request = {
    method: method.toUpperCase(),
    url: url,
    headers: options.headers || {},
    params: options.params,
    body: options.body,
    time: Date.now(),
    timeout: options.timeout || 0,
    strictSSL: this.browser.strictSSL,
    localAddress: this.browser.localAddress || 0
  };
  resource = { request: request, target: options.target };
  this.push(resource);
  this.browser.emit("request", request);

  promise = new Promise(function(resolve, reject) {
    return self.runPipeline(request, function(error, response) {
      if (error) {
        resource.error = error;
        return reject(error);
      }
      // Fill in whatever the handlers left unset before exposing the response.
      if (!response.url) {
        response.url = request.url;
      }
      if (!response.statusCode) {
        response.statusCode = 200;
      }
      response.statusText = HTTP.STATUS_CODES[response.statusCode] || "Unknown";
      if (!response.headers) {
        response.headers = {};
      }
      if (!response.redirects) {
        response.redirects = 0;
      }
      response.time = Date.now();
      resource.response = response;
      self.browser.emit("response", request, response);
      return resolve(resource.response);
    });
  });

  if (!callback) {
    return promise;
  }
  return promise.done(function(response) {
    return callback(null, response);
  }, callback);
};
/**
 * Shorthand for request() with the "get" method.
 *
 * @param {string} url
 * @param {Object|Function} [options]
 * @param {Function} [callback]
 * @returns {*} whatever request() returns
 */
Resources.prototype.get = function(url, options, callback) {
  var verb = "get";
  return this.request(verb, url, options, callback);
};
/**
 * Shorthand for request() with the "post" method.
 *
 * @param {string} url
 * @param {Object|Function} [options]
 * @param {Function} [callback]
 * @returns {*} whatever request() returns
 */
Resources.prototype.post = function(url, options, callback) {
  var verb = "post";
  return this.request(verb, url, options, callback);
};
/**
 * Registers a matcher that makes every request for `url` fail with an error.
 *
 * @param {string|RegExp} url - URL (or pattern) to match
 * @param {string} [message] - error message; a default is used when omitted
 */
Resources.prototype.fail = function(url, message) {
  var text = message || "This request was intended to fail";
  this.urlMatchers.push([
    url,
    function failTheRequest(request, next) {
      return next(new Error(text));
    }
  ]);
};
/**
 * Registers a matcher that postpones every request for `url`.
 *
 * @param {string|RegExp} url - URL (or pattern) to match
 * @param {number} [delay=10] - timeout passed to setTimeout
 */
Resources.prototype.delay = function(url, delay) {
  var wait = delay == null ? 10 : delay;
  this.urlMatchers.push([
    url,
    function delayTheResponse(request, next) {
      // `next` fires without arguments, so the pipeline simply continues.
      return setTimeout(next, wait);
    }
  ]);
};
/**
 * Registers a matcher that answers every request for `url` with a canned
 * response instead of letting the pipeline continue.
 *
 * @param {string|RegExp} url - URL (or pattern) to match
 * @param {Object} [result={}] - response object handed to the pipeline
 */
Resources.prototype.mock = function(url, result) {
  var canned = result == null ? {} : result;
  this.urlMatchers.push([
    url,
    function mockTheResponse(request, next) {
      return next(null, canned);
    }
  ]);
};
/**
 * Removes every fail/delay/mock matcher that was registered for exactly this
 * `url` value (compared with `!==`, so a RegExp must be the same object).
 *
 * @param {string|RegExp} url - the value originally passed when registering
 */
Resources.prototype.restore = function(url) {
  var current = this.urlMatchers;
  var kept = [];
  var i;
  for (i = 0; i < current.length; i++) {
    if (current[i][0] !== url) {
      kept.push(current[i]);
    }
  }
  this.urlMatchers = kept;
};
/**
 * Writes a human-readable summary of every recorded resource: the request
 * line, how it was loaded, and then either the response (redirects, headers,
 * first 250 units of the body), the error, or a "pending" note.
 *
 * @param {Object} [output=process.stdout] - anything with a write(string) method
 * @returns {Array} the return values of the final write() for each resource
 */
Resources.prototype.dump = function(output) {
  var results = [];
  var count = this.length;
  var i, resource, request, response, error, target, headers, name, sample;
  if (output == null) {
    output = process.stdout;
  }
  for (i = 0; i < count; i++) {
    resource = this[i];
    request = resource.request;
    response = resource.response;
    error = resource.error;
    target = resource.target;

    if (response) {
      output.write("" + request.method + " " + response.url + " - " + response.statusCode + " " + response.statusText + " - " + (response.time - request.time) + "ms\n");
    } else {
      output.write("" + request.method + " " + request.url + "\n");
    }

    // Only look at the target when there is one, so resources without a
    // target never need the DOM implementation.
    if (target) {
      if (target instanceof HTML.Document) {
        output.write(" Loaded as HTML document\n");
      } else if (target.id) {
        output.write(" Loading by element #" + target.id + "\n");
      } else {
        output.write(" Loading as " + target.tagName + " element\n");
      }
    }

    if (response) {
      if (response.redirects) {
        output.write(" Followed " + response.redirects + " redirects\n");
      }
      headers = response.headers;
      for (name in headers) {
        output.write(" " + name + ": " + headers[name] + "\n");
      }
      output.write("\n");
      // Mocked responses and the file: 404 response carry no body at all;
      // skip the sample instead of throwing on `undefined.slice`.
      if (response.body != null) {
        sample = response.body.slice(0, 250).toString("utf8").split("\n").map(function(line) {
          return " " + line;
        }).join("\n");
        output.write(sample);
      }
    } else if (error) {
      output.write(" Error: " + error.message + "\n");
    } else {
      output.write(" Pending since " + (new Date(request.time)) + "\n");
    }
    results.push(output.write("\n\n"));
  }
  return results;
};
/**
 * Appends a handler to this instance's pipeline.
 *
 * A handler declaring 2 parameters is treated as a request handler and one
 * declaring 3 parameters as a response handler.
 *
 * @param {Function} handler - function of arity 2 or 3
 * @returns {number} the new pipeline length
 * @throws {AssertionError} when `handler` is not callable or has another arity
 */
Resources.prototype.addHandler = function(handler) {
  assert(handler.call, "Handler must be a function");
  // Message spelling fixed ("response"), matching the static Resources.addHandler.
  assert(handler.length === 2 || handler.length === 3, "Handler function takes 2 (request handler) or 3 (response handler) arguments");
  return this.pipeline.push(handler);
};
/**
 * Runs `request` through the pipeline and reports the outcome.
 *
 * Handlers are split by declared arity: 2-parameter functions are request
 * handlers (with Resources.makeHTTPRequest appended as the last one) and
 * 3-parameter functions are response handlers. Request handlers run in order
 * until one supplies a response; the response handlers then run in order,
 * each able to replace the response. Every handler is called with the
 * browser as `this`. An error passed to `next`, or thrown synchronously by a
 * handler, ends the run.
 *
 * @param {Object} request - request object handed to every handler
 * @param {Function} callback - called with (error) or (null, response)
 */
Resources.prototype.runPipeline = function(request, callback) {
  var self = this;
  var response = null;
  var requestHandlers, responseHandlers;

  function withArity(arity) {
    return self.pipeline.filter(function(fn) {
      return fn.length === arity;
    });
  }

  function nextResponseHandler(error, responseFromHandler) {
    var handler;
    if (error) {
      return callback(error);
    }
    if (responseFromHandler) {
      response = responseFromHandler;
    }
    handler = responseHandlers.shift();
    if (!handler) {
      // No response handlers left: the pipeline is complete.
      return callback(null, response);
    }
    try {
      return handler.call(self.browser, request, response, nextResponseHandler);
    } catch (thrown) {
      return callback(thrown);
    }
  }

  function nextRequestHandler(error, responseFromHandler) {
    var handler;
    if (error) {
      return callback(error);
    }
    if (responseFromHandler) {
      // A request handler produced the response; switch to response handlers.
      response = responseFromHandler;
      if (!response.url) {
        response.url = request.url;
      }
      return nextResponseHandler();
    }
    handler = requestHandlers.shift();
    try {
      return handler.call(self.browser, request, nextRequestHandler);
    } catch (thrown) {
      return callback(thrown);
    }
  }

  requestHandlers = withArity(2);
  requestHandlers.push(Resources.makeHTTPRequest);
  responseHandlers = withArity(3);
  nextRequestHandler();
};
return Resources;
})(Array);
/**
 * Appends a handler to the shared default pipeline (Resources.pipeline).
 *
 * @param {Function} handler - function declaring 2 parameters (request
 *   handler) or 3 parameters (response handler)
 * @returns {number} the new pipeline length
 * @throws {AssertionError} when `handler` is not callable or has another arity
 */
Resources.addHandler = function(handler) {
  var arity;
  assert(handler.call, "Handler must be a function");
  arity = handler.length;
  assert(arity === 2 || arity === 3, "Handler function takes 2 (request handler) or 3 (response handler) arguments");
  return this.pipeline.push(handler);
};
/**
 * Request handler: turns `request.url` into an absolute URL and, for
 * GET/HEAD/DELETE, folds `request.params` into its query string.
 *
 * Runs with the browser as `this`. `file:` URLs are normalized to the
 * `file:///` form; other URLs are resolved against the current document when
 * there is one, else against `this.site`, else against "http://localhost".
 *
 * @param {Object} request - mutated in place (`url`)
 * @param {Function} next - called with no arguments when done
 */
Resources.normalizeURL = function(request, next) {
  var method, name, params, uri;
  if (/^file:/.test(request.url)) {
    request.url = request.url.replace(/^file:\/{1,3}/, "file:///");
  } else if (this.document) {
    request.url = HTML.resourceLoader.resolve(this.document, request.url);
  } else {
    request.url = URL.resolve(this.site || "http://localhost", request.url);
  }

  params = request.params;
  if (params) {
    method = request.method;
    if (method === "GET" || method === "HEAD" || method === "DELETE") {
      uri = URL.parse(request.url, true);
      for (name in params) {
        uri.query[name] = params[name];
      }
      // URL.format() uses `search` in preference to `query`, so a URL that
      // already had a query string would silently lose the merged params
      // unless the stale `search` is cleared first.
      uri.search = undefined;
      request.url = URL.format(uri);
    }
  }
  next();
};
/**
 * Request handler: builds the final header set for a request.
 *
 * Runs with the browser as `this`. Starts from the browser's user agent,
 * layers the browser-wide headers and then the request's own headers on top
 * (all names lower-cased, later sources winning), sets `host` from the
 * request URL and lets any credentials for that host add their headers.
 *
 * @param {Object} request - mutated in place (`headers` is replaced)
 * @param {Function} next - called with no arguments when done
 */
Resources.mergeHeaders = function(request, next) {
  var merged = { "user-agent": this.userAgent };
  var sources = [this.headers];
  var credentials, host, i, key, source;

  if (request.headers) {
    sources.push(request.headers);
  }
  for (i = 0; i < sources.length; i++) {
    source = sources[i];
    for (key in source) {
      merged[key.toLowerCase()] = source[key];
    }
  }

  host = URL.parse(request.url).host;
  merged.host = host;
  credentials = this.authenticate(host, false);
  if (credentials) {
    credentials.apply(merged);
  }
  request.headers = merged;
  next();
};
/**
 * Request handler: for POST/PUT requests without an explicit body, builds
 * the body from `request.params` according to the content type.
 *
 * - application/x-www-form-urlencoded (the default): URL-encoded string body
 *   plus a content-length header.
 * - multipart/form-data: `request.multipart` parts, one per value; values
 *   with a `read` method are sent as files. With no params the request
 *   degrades to an empty text/plain body.
 * - text/plain: left untouched.
 * - anything else: `next` receives an "Unsupported content type" error.
 *
 * @param {Object} request - mutated in place (headers, body, multipart)
 * @param {Function} next - called once, with an error for unsupported types
 */
Resources.createBody = function(request, next) {
  var method = request.method;
  var binary, boundary, disp, headers, i, mimeType, multipart, name, params, value, values;

  if (method !== "POST" && method !== "PUT") {
    next();
    return;
  }
  headers = request.headers;
  headers["content-type"] || (headers["content-type"] = "application/x-www-form-urlencoded");
  mimeType = headers["content-type"].split(";")[0];
  if (request.body) {
    next();
    return;
  }

  switch (mimeType) {
    case "application/x-www-form-urlencoded":
      request.body = QS.stringify(request.params || {});
      headers["content-length"] = request.body.length;
      break;
    case "multipart/form-data":
      params = request.params || {};
      if (Object.keys(params).length === 0) {
        headers["content-type"] = "text/plain";
        request.body = "";
      } else {
        boundary = "" + (new Date().getTime()) + "." + (Math.random());
        headers["content-type"] += "; boundary=" + boundary;
        multipart = [];
        for (name in params) {
          values = params[name];
          for (i = 0; i < values.length; i++) {
            value = values[i];
            disp = "form-data; name=\"" + name + "\"";
            if (value.read) {
              binary = value.read();
              multipart.push({
                "Content-Disposition": "" + disp + "; filename=\"" + value + "\"",
                "Content-Type": value.mime || "application/octet-stream",
                "Content-Length": binary.length,
                body: binary
              });
            } else {
              multipart.push({
                "Content-Disposition": disp,
                "Content-Type": "text/plain; charset=utf8",
                // Content-Length counts bytes: a string's `length` counts
                // UTF-16 units and under-reports non-ASCII text.
                "Content-Length": typeof value === "string" ? Buffer.byteLength(value, "utf8") : value.length,
                body: value
              });
            }
          }
        }
        request.multipart = multipart;
      }
      break;
    case "text/plain":
      break;
    default:
      next(new Error("Unsupported content type " + mimeType));
      return;
  }
  next();
};
/**
 * Request handler: hands the request to the first registered fail/delay/mock
 * matcher whose URL matches, or continues the pipeline when none does.
 *
 * Runs with the browser as `this`. A RegExp matcher is tested against the
 * request URL; any other matcher matches when resolving it against the
 * request URL yields the request URL itself.
 *
 * @param {Object} request - request being processed
 * @param {Function} next - passed on to the matching handler, or called
 *   directly when nothing matches
 */
Resources.specialURLHandlers = function(request, next) {
  var matchers = this.resources.urlMatchers;
  var total = matchers.length;
  var index, handler, isMatch, pattern;

  for (index = 0; index < total; index++) {
    pattern = matchers[index][0];
    handler = matchers[index][1];
    isMatch = pattern instanceof RegExp
      ? pattern.test(request.url)
      : URL.resolve(request.url, pattern) === request.url;
    if (isMatch) {
      handler(request, next);
      return;
    }
  }
  return next();
};
/**
 * Response handler: stores cookies from the response and follows redirects.
 *
 * Runs with the browser as `this`. Only http:/https: requests are processed.
 * 301 and 307 are followed for GET/HEAD requests, 302 and 303 for any
 * method; the follow-up is always a GET, run through the pipeline again with
 * the body-related headers removed and `referer` set to the original URL.
 * A redirect status without a Location header is passed through as an
 * ordinary response.
 *
 * @param {Object} request - the request that produced `response`
 * @param {Object} response - mutated in place (`url` or `redirects`)
 * @param {Function} callback - called with an error once the redirect limit
 *   is exceeded, with the redirected pipeline's outcome when a redirect is
 *   followed, and with no arguments otherwise
 */
Resources.handleHTTPResponse = function(request, response, callback) {
  var parsed = URL.parse(request.url);
  var protocol = parsed.protocol;
  var hostname = parsed.hostname;
  var pathname = parsed.pathname;
  var location, name, redirectHeaders, redirectRequest, redirectUrl, redirects, setCookie;

  if (!(protocol === "http:" || protocol === "https:")) {
    callback();
    return;
  }

  setCookie = response.headers && response.headers["set-cookie"];
  if (setCookie) {
    this.cookies.update(setCookie, hostname, pathname);
  }

  redirects = request.redirects || 0;
  redirectUrl = null;
  // Without a Location there is nothing to follow; resolving `undefined`
  // would throw instead of delivering the response.
  location = response.headers && response.headers.location;
  if (location) {
    switch (response.statusCode) {
      case 301:
      case 307:
        if (request.method === "GET" || request.method === "HEAD") {
          redirectUrl = URL.resolve(request.url, location);
        }
        break;
      case 302:
      case 303:
        redirectUrl = URL.resolve(request.url, location);
    }
  }

  if (!redirectUrl) {
    response.redirects = redirects;
    callback();
    return;
  }

  response.url = redirectUrl;
  ++redirects;
  if (redirects > this.maxRedirects) {
    callback(new Error("More than " + this.maxRedirects + " redirects, giving up"));
    return;
  }

  redirectHeaders = {};
  for (name in request.headers) {
    redirectHeaders[name] = request.headers[name];
  }
  redirectHeaders.referer = request.url;
  // The follow-up GET has no body, so drop the headers describing one.
  delete redirectHeaders["content-type"];
  delete redirectHeaders["content-length"];
  delete redirectHeaders["content-transfer-encoding"];

  redirectRequest = {
    method: "GET",
    url: response.url,
    headers: redirectHeaders,
    redirects: redirects,
    strictSSL: request.strictSSL,
    // Keep using the same local interface as the original request.
    localAddress: request.localAddress,
    time: request.time,
    timeout: request.timeout
  };
  this.emit("redirect", request, response, redirectRequest);
  this.resources.runPipeline(redirectRequest, callback);
};
/**
 * Response handler: inflates or gunzips the response body when the
 * content-encoding or transfer-encoding header says "deflate" or "gzip".
 *
 * The headers are only consulted when the response has both a body and
 * headers; otherwise the response passes through untouched.
 *
 * @param {Object} request - unused
 * @param {Object} response - mutated in place (`body`) on success
 * @param {Function} next - called with the zlib error, if any
 */
Resources.decompressBody = function(request, response, next) {
  var encodings = [];
  var zlibMethod = null;

  if (response.body && response.headers) {
    encodings.push(response.headers["content-encoding"], response.headers["transfer-encoding"]);
  }
  // "deflate" takes precedence over "gzip" when both are announced.
  if (encodings.indexOf("deflate") !== -1) {
    zlibMethod = "inflate";
  } else if (encodings.indexOf("gzip") !== -1) {
    zlibMethod = "gunzip";
  }

  if (zlibMethod === null) {
    next();
    return;
  }
  Zlib[zlibMethod](response.body, function(error, buffer) {
    if (!error) {
      response.body = buffer;
    }
    return next(error);
  });
};
// Finds the charset announced by a <meta> tag (either the charset attribute
// or a charset inside content="..."), skipping tags whose first attribute is
// name= or value=.
MATCH_CHARSET = /<meta(?!\s*(?:name|value)\s*=)[^>]*?charset\s*=[\s"']*([^\s"'\/>]*)/i;

/**
 * Response handler: decodes a Buffer body into a string for text content.
 *
 * Non-text content types are left as Buffers. The charset comes from the
 * content-type header, then (for HTML only) from a <meta> tag in the body,
 * then (for HTML only) defaults to windows-1252. Text that is not HTML and
 * announces no charset stays a Buffer.
 *
 * @param {Object} request - its `headers.accept` helps decide whether the
 *   response is HTML
 * @param {Object} response - mutated in place (`body`) when decoded
 * @param {Function} next - called with no arguments when done
 */
Resources.decodeBody = function(request, response, next) {
  var contentType = response.headers && response.headers["content-type"];
  var charset, i, isHTML, metaMatch, mimeType, pieces, subtype, type, typeOptions;

  if (contentType && Buffer.isBuffer(response.body)) {
    pieces = contentType.split(/;\s*/);
    mimeType = pieces[0];
    typeOptions = pieces.slice(1);
    pieces = contentType.split(/\//, 2);
    type = pieces[0];
    subtype = pieces[1];
  }
  if (type && type !== "text") {
    next();
    return;
  }
  if (!Buffer.isBuffer(response.body)) {
    return next();
  }

  if (mimeType) {
    for (i = 0; i < typeOptions.length; i++) {
      if (/^charset=/i.test(typeOptions[i])) {
        charset = typeOptions[i].split("=")[1];
        break;
      }
    }
  }

  isHTML = /html/.test(subtype) || /\bhtml\b/.test(request.headers.accept);
  if (!charset && isHTML) {
    metaMatch = response.body.toString().match(MATCH_CHARSET);
    charset = metaMatch && metaMatch[1];
  }
  if (!charset && isHTML) {
    charset = "windows-1252";
  }
  if (charset) {
    response.body = iconv.decode(response.body, charset);
  }
  return next();
};
// Default pipeline, copied into every Resources instance. runPipeline splits
// these by declared arity: 2-parameter functions are request handlers,
// 3-parameter functions are response handlers; order is preserved within
// each group.
Resources.pipeline = [
  Resources.normalizeURL,
  Resources.mergeHeaders,
  Resources.createBody,
  Resources.specialURLHandlers,
  Resources.handleHTTPResponse,
  Resources.decompressBody,
  Resources.decodeBody
];
/**
 * Final request handler: actually fetches the resource.
 *
 * Runs with the browser as `this`. `file:` URLs are read from disk (GET
 * only); a missing file yields a 404 response rather than an error. All
 * other URLs are fetched over HTTP with the browser's cookies and proxy,
 * without following redirects and with the body delivered as a Buffer.
 *
 * @param {Object} request - normalized request (url, method, headers, ...)
 * @param {Function} callback - called with (error) or (null, response)
 */
Resources.makeHTTPRequest = function(request, callback) {
  var parsed = URL.parse(request.url);
  var protocol = parsed.protocol;
  var hostname = parsed.hostname;
  var pathname = parsed.pathname;
  var filename, httpRequest;

  if (protocol === "file:") {
    if (request.method !== "GET") {
      // Previously this branch referenced an undefined `resource` variable
      // and crashed with a ReferenceError; report a real error instead.
      callback(new Error("Cannot " + request.method + " " + request.url + ": only GET is supported for file: URLs"));
      return;
    }
    filename = Path.normalize(decodeURI(pathname));
    // Read directly instead of exists()+readFile(): one system call fewer
    // and no window for the file to disappear in between.
    File.readFile(filename, function(error, buffer) {
      if (!error) {
        callback(null, {
          body: buffer
        });
      } else if (error.code === "ENOENT" || error.code === "ENOTDIR") {
        callback(null, {
          statusCode: 404
        });
      } else {
        request.error = error;
        callback(error);
      }
    });
    return;
  }

  request.headers.cookie = this.cookies.serialize(hostname, pathname);
  httpRequest = {
    method: request.method,
    url: request.url,
    headers: request.headers,
    body: request.body,
    multipart: request.multipart,
    proxy: this.proxy,
    jar: false,
    followRedirect: false,
    encoding: null,
    strictSSL: request.strictSSL,
    localAddress: request.localAddress || 0,
    timeout: request.timeout || 0
  };
  Request(httpRequest, function(error, response) {
    if (error) {
      callback(error);
      return;
    }
    // Hand the pipeline a plain object rather than the library's response.
    callback(null, {
      url: request.url,
      statusCode: response.statusCode,
      headers: response.headers,
      body: response.body,
      redirects: request.redirects || 0
    });
  });
};
module.exports = Resources;