UNPKG

simplecrawler

Version:

Very straightforward, event driven web crawler. Features a flexible queue interface and a basic cache mechanism with extensible backend.

251 lines (209 loc) 8.45 kB
/*
 * Simplecrawler - FS cache backend
 * https://github.com/simplecrawler/simplecrawler
 *
 * Copyright (c) 2011-2015, Christopher Giffard
 *
 */

// Tries to ensure a local 'cache' of a website is as close as possible to a mirror of the website itself.
// The idea is that it is then possible to re-serve the website just using the cache.

const fs = require("fs");
const crypto = require("crypto");

/**
 * Factory for FSBackend.
 *
 * @param {string} [loadParameter] Directory the cache is stored in.
 * @returns {FSBackend} A new, not yet loaded, filesystem cache backend.
 */
const backend = function backend(loadParameter) {
    return new FSBackend(loadParameter);
};

module.exports = backend;

/**
 * Constructor for the filesystem cache backend.
 *
 * @param {string} [loadParameter] Directory the cache is stored in. Anything
 *     other than a non-empty string falls back to `<cwd>/cache/`.
 */
const FSBackend = function FSBackend(loadParameter) {
    this.loaded = false;
    this.index = [];

    const location = typeof loadParameter === "string" && loadParameter.length > 0 ?
        loadParameter :
        process.cwd() + "/cache/";

    // Every other method concatenates relative paths straight onto
    // `location`, so it must always end with a slash.
    this.location = location.endsWith("/") ? location : location + "/";
};

/**
 * Function for sanitising paths.
 *
 * We try to get the most understandable, file-system friendly paths we can.
 * An extension is added if not present or inappropriate - if a better one can
 * be determined. Querystrings are hashed to truncate without (hopefully)
 * collision.
 *
 * @param {string} path Path of the resource, optionally with a querystring.
 * @param {Object} queueObject Queue item; `stateData.headers` is read to
 *     find the response content type.
 * @returns {string} Relative path (no leading slash) to store the resource at.
 */
function sanitisePath(path, queueObject) {
    // Remove first slash (as we set one later.)
    const relativePath = path.replace(/^\//, "");

    // Trim trailing whitespace. If no path is present - assume index.html.
    let sanitisedPath = relativePath.length ? relativePath.replace(/\s*$/ig, "") : "index.html";

    const headers = queueObject.stateData.headers;
    const contentType = headers["content-type"];

    if (sanitisedPath.match(/\?/)) {
        // Keep the resource name readable, and replace the whole querystring
        // with its SHA-1 digest.
        const sanitisedPathParts = sanitisedPath.split(/\?/g);
        const resource = sanitisedPathParts.shift();
        const hashedQS = crypto.createHash("sha1").update(sanitisedPathParts.join("?")).digest("hex");
        sanitisedPath = resource + "?" + hashedQS;
    }

    // Path segments of 250 characters or more are replaced by their digest.
    sanitisedPath = sanitisedPath
        .split(/\//g)
        .map(function(pathChunk) {
            if (pathChunk.length >= 250) {
                return crypto.createHash("sha1").update(pathChunk).digest("hex");
            }

            return pathChunk;
        })
        .join("/");

    // Try to get a file extension for the file - for ease of identification
    // We run through this if we either:
    //  1) haven't got a file extension at all, or:
    //  2) have an HTML file without an HTML file extension
    //     (might be .php, .aspx, .do, or some other server-processed type)
    const isHTML = Boolean(contentType && contentType.match(/text\/html/i));
    const hasExtension = Boolean(sanitisedPath.match(/\.[a-z0-9]{1,6}$/i));
    const hasHTMLExtension = Boolean(sanitisedPath.match(/\.htm[l]?$/i));

    if (!hasExtension || isHTML && !hasHTMLExtension) {
        let mimeParts;

        if (isHTML) {
            sanitisedPath += sanitisedPath.match(/\/$/) ? "index.html" : ".html";
        } else if (contentType && (mimeParts = contentType.match(/(image|video|audio|application)\/([a-z0-9]+)/i))) {
            // Use the MIME subtype as the extension, e.g. image/png -> .png
            sanitisedPath += "." + mimeParts[2];
        }
    }

    return sanitisedPath;
}

/**
 * Checks whether anything exists at the given path.
 *
 * @param {string} location Filesystem path to test.
 * @returns {boolean} `true` if `fs.statSync` succeeds, `false` if it throws.
 */
FSBackend.prototype.fileExists = function(location) {
    try {
        fs.statSync(location);
        return true;
    } catch (er) {
        return false;
    }
};

/**
 * Checks whether the given path is a directory.
 *
 * @param {string} location Filesystem path to test.
 * @returns {boolean} `true` if the path can be stat'ed and is a directory,
 *     `false` otherwise (including when `fs.statSync` throws).
 */
FSBackend.prototype.isDirectory = function(location) {
    try {
        return fs.statSync(location).isDirectory();
    } catch (er) {
        return false;
    }
};

/**
 * Loads the cache index from `cacheindex.json` in the cache location and
 * registers a process "exit" handler that writes the index back to disk.
 *
 * @throws {Error} If the cache location is not an existing directory.
 * @throws {Error} Any error from reading or parsing the index file, other
 *     than the file not existing (which is treated as a new, empty cache).
 */
FSBackend.prototype.load = function() {
    const backend = this;

    // isDirectory() is false both for a missing path and for a path that is
    // not a directory, so this single check covers both failure cases.
    if (!backend.isDirectory(backend.location)) {
        throw new Error("Unable to verify cache location exists.");
    }

    try {
        const fileData = fs.readFileSync(backend.location + "cacheindex.json");

        if (fileData && fileData.length) {
            backend.index = JSON.parse(fileData.toString("utf8"));
            backend.loaded = true;
        }
    } catch (error) {
        if (error.code === "ENOENT") {
            // Cache index doesn't exist. Assume this is a new cache.
            // Just leave the memory index empty for now.
            backend.loaded = true;
        } else {
            throw error;
        }
    }

    // Flush store to disk when closing.
    process.on("exit", function() {
        backend.saveCache();
    });
};

/**
 * Writes the in-memory index to `cacheindex.json` in the cache location.
 *
 * @param {Function} [callback] If given, the write is asynchronous and the
 *     callback is handed to `fs.writeFile`. If omitted, the write is
 *     synchronous.
 */
FSBackend.prototype.saveCache = function(callback) {
    const indexFile = this.location + "cacheindex.json";
    const serialisedIndex = JSON.stringify(this.index);

    if (callback) {
        fs.writeFile(indexFile, serialisedIndex, callback);
    } else {
        fs.writeFileSync(indexFile, serialisedIndex);
    }
};

/**
 * Stores a resource and its metadata on disk and records it in the index.
 *
 * The resource is written below the cache location at
 * `<protocol>/<host>/<port>/<sanitised path>`, with the serialised queue item
 * next to it in `<file>.cacheData.json`. Missing directories are created
 * synchronously; the two files are written asynchronously.
 *
 * @param {Object} queueObject Queue item. `protocol`, `host`, `port`, `path`,
 *     `url` and `stateData.headers` are read.
 * @param {Buffer|string} data Resource body, passed to `fs.writeFile`.
 * @param {Function} [callback] Called with the new index entry once both
 *     files have been written.
 * @throws {Error} If a path segment that has to be a directory already exists
 *     as a regular file.
 */
FSBackend.prototype.setItem = function(queueObject, data, callback) {
    callback = callback instanceof Function ? callback : function() {};

    const backend = this;
    const pathStack = [queueObject.protocol, queueObject.host, queueObject.port]
        .concat(sanitisePath(queueObject.path, queueObject).split(/\/+/g));

    const writeFileData = function(currentPath, fileData) {
        const metaPath = currentPath + ".cacheData.json";

        // Write failures are thrown rather than reported, because the
        // callback contract only carries the cache object.
        fs.writeFile(currentPath, fileData, function(error) {
            if (error) {
                throw error;
            }

            fs.writeFile(metaPath, JSON.stringify(queueObject), function(error) {
                if (error) {
                    throw error;
                }

                const cacheObject = {
                    url: queueObject.url,
                    etag: queueObject.stateData.headers.etag,
                    lastModified: queueObject.stateData.headers["last-modified"],
                    dataFile: currentPath,
                    metaFile: metaPath
                };

                // Replace the entry for this URL if there is one, otherwise
                // append. The lookup happens here, after the writes, so that
                // overlapping setItem calls for one URL don't add duplicates.
                const existingIndex = backend.index.findIndex(function(item) {
                    return item.url === queueObject.url;
                });

                if (existingIndex !== -1) {
                    backend.index[existingIndex] = cacheObject;
                } else {
                    backend.index.push(cacheObject);
                }

                callback(cacheObject);
            });
        });
    };

    // Walk the path one segment at a time, creating directories on the way
    // down and writing the file once the last segment is reached.
    pathStack.forEach(function(pathChunk, count) {
        const currentPath = backend.location + pathStack.slice(0, count + 1).join("/");
        const isLastChunk = count === pathStack.length - 1;

        if (backend.fileExists(currentPath)) {
            // NOTE(review): when the final segment already exists as a
            // directory nothing is written and the callback is never called.
            if (!backend.isDirectory(currentPath)) {
                if (isLastChunk) {
                    // Just overwrite the file...
                    writeFileData(currentPath, data);
                } else {
                    throw new Error(`Cache storage of resource (${queueObject.url}) blocked by file: ${currentPath}`);
                }
            }
        } else if (isLastChunk) {
            // Write the file data in
            writeFileData(currentPath, data);
        } else {
            fs.mkdirSync(currentPath);
        }
    });
};

/**
 * Looks up the cached copy of a resource by URL.
 *
 * @param {Object} queueObject Queue item; only `url` is read.
 * @param {Function} callback Called with `null` when the URL is not in the
 *     index. Otherwise called with an object holding `url`, `etag`,
 *     `lastModified`, and two lazy readers: `getData(cb)` and
 *     `getMetadata(cb)`. Both readers call `cb(error)` on failure;
 *     on success `getData` calls `cb(null, data)` with the file contents and
 *     `getMetadata` calls `cb(null, metadata)` with the parsed JSON.
 * @returns {boolean} Always `false`.
 */
FSBackend.prototype.getItem = function(queueObject, callback) {
    const cacheItem = this.index.find(function(item) {
        return item.url === queueObject.url;
    });

    if (!cacheItem) {
        callback(null);
        return false;
    }

    callback({
        url: cacheItem.url,
        etag: cacheItem.etag,
        lastModified: cacheItem.lastModified,
        getData: function(callback) {
            fs.readFile(cacheItem.dataFile, function(error, data) {
                if (error) {
                    callback(error);
                    return;
                }

                callback(null, data);
            });
        },
        getMetadata: function(callback) {
            fs.readFile(cacheItem.metaFile, function(error, data) {
                if (error) {
                    callback(error);
                    return;
                }

                // Parse before invoking the callback, and keep the callback
                // outside the try block: a corrupt metadata file is reported
                // as an error, while an exception thrown by the callback
                // itself is not swallowed or reported back to it.
                let metadata;

                try {
                    metadata = JSON.parse(data.toString("utf8"));
                } catch (parseError) {
                    callback(parseError);
                    return;
                }

                callback(null, metadata);
            });
        }
    });

    return false;
};