UNPKG

file-duplicates

Version:

Given a target file, recursively search for duplicated files.

341 lines (279 loc) 10.8 kB
/** * --------------------------------------------------------------------------------------- * index.js * --------------------------------------------------------------------------------------- */ var fs = require("fs"), path = require("path"), crypto = require("crypto"), minimatch = require("minimatch"); //region private function _generateSHA1digest(data) { return crypto.createHash("sha1").update(data).digest("hex"); } function _filter(files, dirPath, ignorePatterns) { files = files.map(function(f) { return path.join(dirPath, f) } ); files = files.filter(function(f) { return !ignorePatterns.some(function(p) { // pattern is a path if (p.dir !== "") { return minimatch(f, p.source); } // else { // file if (p.ext !== "") { return minimatch(path.basename(f), p.source); } // folder else { return minimatch(path.basename(f), p.source); } } }); }); return files; } function _find(size, digest, dirPath, ignorePatterns, cb) { var result = []; fs.readdir(dirPath, function(err, files) { if (err) { return cb(err); } var filePaths = _filter(files, dirPath, ignorePatterns); var pending = filePaths.length; // empty folder, return cb if (pending === 0) { return cb(null, result); } filePaths.forEach(function(filePath) { fs.stat(filePath, function(err, stats) { if (err) { return cb(err); } // if directory if (stats.isDirectory()) { _find(size, digest, filePath, ignorePatterns, function(err, res) { result = result.concat(res); pending--; if (pending === 0) { return cb(null, result); } }); } // if file else { // if equal size, compute checksum if (stats.size === size) { var hash = crypto.createHash("sha1"); var stream = fs.createReadStream(filePath); stream.on("data", function(chunk) { hash.update(chunk); }); stream.on("error", function(err) { return cb(err); }); stream.on("end", function() { if (hash.digest("hex") === digest) { // equal file found result.push(filePath); } pending--; if (pending === 0) { return cb(null, result); } }); } // different size, go ahead else { pending--; if (pending === 0) { return cb(null, result); } } } }); }); }); } function _find_sync(size, digest, dirPath, ignorePatterns) { var result = []; var files = fs.readdirSync(dirPath); var queue = _filter(files, dirPath, ignorePatterns); while(queue.length > 0) { var filePath = queue.pop(); var stats = fs.statSync(filePath); if (stats.isDirectory()) { files = fs.readdirSync(filePath); files = _filter(files, filePath, ignorePatterns); queue = queue.concat(files); } else { var fileBuffer = fs.readFileSync(filePath); if (stats.size === size && _generateSHA1digest(fileBuffer) === digest) { result.push(filePath); } } } return result; } //endregion //region public /** * Recursively search for duplicates of the target file or buffer in the specified directory, returning the corresponding absolute paths (ASYNC). * @param {string or Buffer} pathOrBuffer - Path or buffer of the file to search. * @param {string} [dirPath] - Directory which represents the starting point of the search. Default is the working directory. * @param {Array} [ignorePatterns] - An array of patterns that will be excluded from the search (e.g. ["*.", "node_modules", "*.txt", "path/to/file", "path/to/directory"]). * @param {function} [cb] - Callback of type function(err, result). If not provided a Promise will be returned instead. * @return {} - Callback or Promise fulfilled with an array of absolute paths to duplicated files. */ function find(pathOrBuffer, dirPath, ignorePatterns, cb) { if (!(typeof pathOrBuffer === "string") && !(pathOrBuffer instanceof Buffer)) { throw Error("First argument must be a file path or a buffer"); } if (!dirPath) { dirPath = process.cwd(); } if (dirPath instanceof Array) { if (typeof ignorePatterns === "function") { cb = ignorePatterns; } ignorePatterns = dirPath; dirPath = process.cwd(); } if (typeof dirPath === "function") { cb = dirPath; dirPath = process.cwd(); ignorePatterns = [] } if (dirPath && !path.isAbsolute(dirPath)) { dirPath = path.join(process.cwd(), dirPath); } if (typeof ignorePatterns === "function") { cb = ignorePatterns; ignorePatterns = []; } if (!ignorePatterns || !ignorePatterns instanceof Array) { ignorePatterns = []; } if (typeof pathOrBuffer === "string") { if (!path.isAbsolute(pathOrBuffer)) { ignorePatterns.push(path.join(process.cwd(), pathOrBuffer)); } else { ignorePatterns.push(pathOrBuffer); } } ignorePatterns = ignorePatterns.map(function (p){ var parsed = path.parse(p); parsed.source = p; return parsed; }); // return promise if (!cb || typeof cb !== "function") { return new Promise(function(resolve, reject){ // file path if (typeof pathOrBuffer === "string") { var hash = crypto.createHash("sha1"); var stream = fs.createReadStream(pathOrBuffer); var size = 0; stream.on("data", function(chunk) { hash.update(chunk); size += chunk.length; }); stream.on("error", function(err) { reject(err); }); stream.on("end", function() { _find(size, hash.digest("hex"), dirPath, ignorePatterns, function(err, res){ if (err) { reject(err); } else { resolve(res); } }); }); } // file buffer else { _find(pathOrBuffer.length, _generateSHA1digest(pathOrBuffer), dirPath, ignorePatterns, function(err, res){ if (err) { reject(err); } else { resolve(res); } }); } }); } // use callback else { // file path if (typeof pathOrBuffer === "string") { var hash = crypto.createHash("sha1"); var stream = fs.createReadStream(pathOrBuffer); var size = 0; stream.on("data", function(chunk) { hash.update(chunk); size += chunk.length; }); stream.on("error", function(err) { return cb(err); }); stream.on("end", function() { return _find(size, hash.digest("hex"), dirPath, ignorePatterns, cb); }); } // file buffer else { return _find(pathOrBuffer.length, _generateSHA1digest(pathOrBuffer), dirPath, ignorePatterns, cb); } } } /** * Recursively search for duplicates of the target file or buffer in the specified directory, returning the corresponding absolute paths (SYNC). * @param {string or Buffer} pathOrBuffer - Path or buffer of the file to search. * @param {string} [dirPath] - Directory which represents the starting point of the search. Default is the working directory. * @param {Array} [ignorePatterns] - An array of patterns that will be excluded from the search (e.g. ["*.", "node_modules", "*.txt", "path/to/file", "path/to/directory"]). * @return {Array} - An array of absolute paths to duplicated files. */ function findSync(pathOrBuffer, dirPath, ignorePatterns) { if (!(typeof pathOrBuffer === "string") && !(pathOrBuffer instanceof Buffer)) { throw Error("First argument must be a file path or a buffer"); } if (!dirPath) { dirPath = process.cwd(); } if (dirPath instanceof Array) { ignorePatterns = dirPath; dirPath = process.cwd(); } if (dirPath && !path.isAbsolute(dirPath)) { dirPath = path.join(process.cwd(), dirPath); } if (!ignorePatterns || !ignorePatterns instanceof Array) { ignorePatterns = []; } if (typeof pathOrBuffer === "string") { if (!path.isAbsolute(pathOrBuffer)) { ignorePatterns.push(path.join(process.cwd(), pathOrBuffer)); } else { ignorePatterns.push(pathOrBuffer); } pathOrBuffer = fs.readFileSync(pathOrBuffer); } ignorePatterns = ignorePatterns.map(function (p){ var parsed = path.parse(p); parsed.source = p; return parsed; }); return _find_sync(pathOrBuffer.length, _generateSHA1digest(pathOrBuffer), dirPath, ignorePatterns); } //endregion module.exports = { find: find, findSync: findSync };