UNPKG

simplecrawler

Version:

Very straightforward, event driven web crawler. Features a flexible queue interface and a basic cache mechanism with extensible backend.

414 lines (345 loc) 13.2 kB
/** * @file simplecrawler's cookie jar module */ var EventEmitter = require("events").EventEmitter, util = require("util"); /** * Creates a new cookie jar * @class */ var CookieJar = function() { EventEmitter.call(this); /** * The actual jar that holds the cookies * @private * @type {Array} */ this.cookies = []; }; util.inherits(CookieJar, EventEmitter); /** * Called when {@link CookieJar#add} returns a result * @callback CookieJar~addCallback * @param {Error|null} error If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. * @param {Cookie|null} cookie The cookie that was added to the jar */ /** * Adds a new cookie to the jar, either by creating a new {@link Cookie} object * from specific details such as name, value, etc., accepting a string from a * Set-Cookie header, or by passing in an existing {@link Cookie} object. * @fires CookieJar#addcookie * @param {String} name Name of the new cookie * @param {String} value Value of the new cookie * @param {String|Number} expiry Expiry timestamp of the new cookie in milliseconds * @param {String} [path="/"] Limits cookie to a path * @param {String} [domain="*"] Limits cookie to a domain * @param {Boolean} [httponly=false] Specifies whether to include the HttpOnly flag * @param {CookieJar~addCallback} [callback] * @return {CookieJar} Returns the cookie jar instance to enable chained API calls */ CookieJar.prototype.add = function(name, value, expiry, path, domain, httponly, callback) { var existingIndex = -1, newCookie; if (arguments.length > 1) { newCookie = new Cookie(name, value, expiry, path, domain, httponly); } else if (name instanceof Cookie) { newCookie = name; } else { newCookie = Cookie.fromString(name); } // Are we updating an existing cookie or adding a new one? this.cookies.forEach(function(cookie, index) { if (cookie.name === newCookie.name && cookie.matchDomain(newCookie.domain)) { existingIndex = index; } }); if (existingIndex === -1) { this.cookies.push(newCookie); } else { this.cookies[existingIndex] = newCookie; } /** * Fired when a cookie has been added to the jar * @event CookieJar#addcookie * @param {Cookie} cookie The cookie that has been added */ this.emit("addcookie", newCookie); if (callback instanceof Function) { callback(null, newCookie); } return this; }; /** * Called when {@link CookieJar#remove} returns a result * @callback CookieJar~removeCallback * @param {Error|null} error If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. * @param {Cookie[]|null} cookiesRemoved An array of the cookies that were removed from the cookie jar */ /** * Removes cookies from the cookie jar. If no domain and name are specified, all * cookies in the jar are removed. * @fires CookieJar#removecookie * @param {String} [name] Name of the cookie to be removed * @param {String} [domain] The domain that the cookie applies to * @param {CookieJar~removeCallback} [callback] * @return {Cookie[]} Returns an array of the cookies that were removed from the cookie jar */ CookieJar.prototype.remove = function(name, domain, callback) { var cookiesRemoved = [], jar = this; jar.cookies.forEach(function(cookie, index) { // If the names don't match, we're not removing this cookie if (Boolean(name) && cookie.name !== name) { return false; } // If the domains don't match, we're not removing this cookie if (Boolean(domain) && !cookie.matchDomain(domain)) { return false; } // Matched. Remove! cookiesRemoved.push(jar.cookies.splice(index, 1)); }); /** * Fired when one or multiple cookie have been removed from the jar * @event CookieJar#removecookie * @param {Cookie[]} cookie The cookies that have been removed */ jar.emit("removecookie", cookiesRemoved); if (callback instanceof Function) { callback(null, cookiesRemoved); } return cookiesRemoved; }; /** * Called when {@link CookieJar#get} returns a result * @callback CookieJar~getCallback * @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. * @param {Cookie[]} [cookies] An array of cookies that matched the name and/or domain. */ /** * Gets an array of cookies based on name and domain * @param {String} [name] Name of the cookie to retrieve * @param {String} [domain] Domain to retrieve the cookies from * @param {CookieJar~getCallback} [callback] * @return {Cookie[]} Returns an array of cookies that matched the name and/or domain */ CookieJar.prototype.get = function(name, domain, callback) { var cookies = this.cookies.filter(function(cookie) { // If the names don't match, we're not returning this cookie if (Boolean(name) && cookie.name !== name) { return false; } // If the domains don't match, we're not returning this cookie if (Boolean(domain) && !cookie.matchDomain(domain)) { return false; } return true; }); if (callback instanceof Function) { callback(null, cookies); } return cookies; }; /** * Called when {@link CookieJar#getAsHeader} returns a result * @callback CookieJar~getAsHeaderCallback * @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. * @param {String[]} [cookies] An array of HTTP header formatted cookies. */ /** * Generates an array of headers based on the value of the cookie jar * @param {String} [domain] The domain from which to generate cookies * @param {String} [path] Filter headers to cookies applicable to this path * @param {CookieJar~getAsHeaderCallback} [callback] * @return {String[]} Returns an array of HTTP header formatted cookies */ CookieJar.prototype.getAsHeader = function(domain, path, callback) { var headers = this.cookies.filter(function(cookie) { if (cookie.isExpired()) { return false; } if (!domain && !path) { return true; } if (domain) { return cookie.matchDomain(domain); } if (path) { return cookie.matchPath(path); } }).map(function(cookie) { return cookie.toOutboundString(); }); if (callback instanceof Function) { callback(null, headers); } return headers; }; /** * Called when {@link CookieJar#addFromHeaders} returns a result * @callback CookieJar~addFromHeadersCallback * @param {Error} [error] If the operation was successful, this will be `null`. Otherwise it will be the error that was encountered. */ /** * Adds cookies to the cookie jar based on an array of 'Set-Cookie' headers * provided by a web server. Duplicate cookies are overwritten. * @fires CookieJar#addcookie * @param {String|String[]} headers One or multiple Set-Cookie headers to be added to the cookie jar * @param {CookieJar~addFromHeadersCallback} [callback] * @return {CookieJar} Returns the cookie jar instance to enable chained API calls */ CookieJar.prototype.addFromHeaders = function(headers, callback) { var jar = this; if (!Array.isArray(headers)) { headers = [headers]; } headers.forEach(function(header) { jar.add(header); }); if (callback instanceof Function) { callback(null); } return jar; }; /** * Generates a newline-separated list of all cookies in the jar * @return {String} Returns stringified versions of all cookies in the jar in a newline separated string */ CookieJar.prototype.toString = function() { return this.getAsHeader().join("\n"); }; /** * Creates a new cookies * @class * @param {String} name Name of the new cookie * @param {String} value Value of the new cookie * @param {String|Number} expires Expiry timestamp of the new cookie in milliseconds * @param {String} [path="/"] Limits cookie to a path * @param {String} [domain="*"] Limits cookie to a domain * @param {Boolean} [httponly=false] Specifies whether to include the HttpOnly flag */ var Cookie = function(name, value, expires, path, domain, httponly) { if (!name) { throw new Error("A name is required to create a cookie."); } // Parse date to timestamp - consider it never expiring if timestamp is not // passed to the function if (expires) { if (typeof expires !== "number") { expires = new Date(expires).getTime(); } } else { expires = -1; } this.name = name; this.value = value || ""; this.expires = expires; this.path = path || "/"; this.domain = domain || "*"; this.httponly = Boolean(httponly); }; /** * Creates a new {@link Cookie} based on a header string * @param {String} string A Set-Cookie header string * @return {Cookie} Returns a newly created Cookie object */ Cookie.fromString = function(string) { if (!string || typeof string !== "string") { throw new Error("String must be supplied to generate a cookie."); } function parseKeyVal(input) { var key = input.split(/=/).shift(), val = input.split(/=/).slice(1).join("="); return [key, val]; } string = string.replace(/^\s*set-cookie\s*:\s*/i, ""); var parts = string.split(/\s*;\s*/i), name = parseKeyVal(parts.shift()), keyValParts = {}; keyValParts.name = name[0]; keyValParts.value = name[1]; parts .filter(function(input) { return Boolean(input.replace(/\s+/ig, "").length); }) .map(parseKeyVal) .forEach(function(keyval) { var key = String(keyval[0]).toLowerCase().replace(/[^a-z0-9]/ig, ""); keyValParts[key] = keyval[1]; }); return new Cookie( keyValParts.name, keyValParts.value, keyValParts.expires || keyValParts.expiry, keyValParts.path, keyValParts.domain, keyValParts.hasOwnProperty("httponly") ); }; /** * Outputs the cookie as a string, in the form of an outbound Cookie header * @return {String} Stringified version of the cookie */ Cookie.prototype.toOutboundString = function() { return this.name + "=" + this.value; }; /** * Outputs the cookie as a string, in the form of a Set-Cookie header * @param {Boolean} [includeHeader] Controls whether to include the 'Set-Cookie: ' header name at the beginning of the string. * @return {String} Stringified version of the cookie */ Cookie.prototype.toString = function(includeHeader) { var string = ""; if (includeHeader) { string = "Set-Cookie: "; } string += this.name + "=" + this.value + "; "; if (this.expires > 0) { string += "Expires=" + new Date(this.expires).toGMTString() + "; "; } if (this.path) { string += "Path=" + this.path + "; "; } if (this.domain) { string += "Domain=" + this.domain + "; "; } if (this.httponly) { string += "Httponly; "; } return string; }; /** * Determines whether a cookie has expired or not * @return {Boolean} Returns true if the cookie has expired. Otherwise, it returns false. */ Cookie.prototype.isExpired = function() { if (this.expires < 0) { return false; } return this.expires < Date.now(); }; /** * Determines whether a cookie matches a given domain * @param {String} domain The domain to match against * @return {Boolean} Returns true if the provided domain matches the cookie's domain. Otherwise, it returns false. */ Cookie.prototype.matchDomain = function(domain) { if (this.domain === "*") { return true; } var reverseDomain = this.domain.split("").reverse().join(""), reverseDomainComp = domain.split("").reverse().join(""); return reverseDomain.indexOf(reverseDomainComp) === 0; }; /** * Determines whether a cookie matches a given path * @param {String} path The path to match against * @return {Boolean} Returns true if the provided path matches the cookie's path. Otherwise, it returns false. */ Cookie.prototype.matchPath = function(path) { if (!this.path) { return true; } return path.indexOf(this.path) === 0; }; module.exports = CookieJar; module.exports.Cookie = Cookie;