UNPKG

dep-copy-file

Version:

Blazing-fast file and directory copy for Node.js

328 lines (284 loc) 11.1 kB
'use strict';

const {
  readdir,
  mkdir,
  copyFile,
  symlink,
  readlink,
  stat,
  lstat,
  lutimes,
  utimes,
  unlink,
} = require('fs/promises');
const { constants } = require('fs');
const path = require('path');
const os = require('os');

// ---------------------------------------------------------------------------
// Platform-specific flags for maximum throughput
// ---------------------------------------------------------------------------

/**
 * Copy-on-write flag passed to `copyFile`. Node attempts a reflink clone and
 * falls back to a regular copy when the platform or file system does not
 * support one. Falls back to `0` (no flag) if the constant is not defined.
 */
const COPYFILE_FICLONE = constants.COPYFILE_FICLONE || 0;

// ---------------------------------------------------------------------------
// Concurrency helpers
// ---------------------------------------------------------------------------

/**
 * Resolve the size of the copy pool.
 *
 * - `undefined` / `null`: `cpu * 2` (min 4).
 * - `'auto'`:             `cpu * 4` (max 128).
 * - anything else:        coerced with `Number()`, rounded up, and clamped to
 *                         at least 1. A value that is not a number (`NaN`)
 *                         falls back to the default instead of silently
 *                         disabling throttling.
 *
 * @param {number|'auto'|null|undefined} value - Requested concurrency.
 * @returns {number} Maximum number of copy operations allowed in flight.
 */
function resolveConcurrency(value) {
  // `os.cpus()` may return an empty array; treat that as a single core.
  const cpu = Math.max(os.cpus().length, 1);

  if (value === 'auto') {
    return Math.min(cpu * 4, 128);
  }

  const fallback = Math.max(cpu * 2, 4);
  if (value === undefined || value === null) {
    return fallback;
  }

  const requested = Number(value);
  if (Number.isNaN(requested)) {
    return fallback;
  }

  // The pool admits one task before it checks its size, so the effective
  // limit was always "round up, at least one"; make that explicit.
  return Math.max(Math.ceil(requested), 1);
}

// ---------------------------------------------------------------------------
// Pipelined walk + mkdir + copy
// ---------------------------------------------------------------------------

/**
 * Copy a directory tree using a sequential depth-first walk combined with a
 * bounded pool of concurrent file/symlink copies.
 *
 * Directories are read one at a time, so only one `readdir` result is alive
 * at any moment. Files and symlinks share a single pool; once the pool holds
 * `concurrency` tasks the walk waits for one of them to settle before it
 * continues. Copies start as soon as entries are discovered.
 *
 * Per-entry copy failures and `readdir` failures are collected in `errors`
 * rather than thrown. A failing `mkdir`, a throwing `filter`, or a throwing
 * `onProgress` rejects the returned promise, but only after every in-flight
 * copy has settled.
 *
 * @param {string} srcPath - Absolute source directory.
 * @param {string} destPath - Absolute destination directory.
 * @param {object} options
 * @param {number} options.concurrency - Maximum copies in flight.
 * @param {boolean} options.overwrite - Replace existing destination entries.
 * @param {?function(string, import('fs').Dirent): boolean} options.filter
 * @param {?function(object): void} options.onProgress - Called after each
 *   regular-file copy settles (symlinks do not report progress).
 * @param {boolean} options.excludeSymlinks - Skip symbolic links entirely.
 * @param {boolean} options.preserveTimestamps - Copy atime/mtime to the destination.
 * @returns {Promise<{files: number, dirs: number, symlinks: number,
 *   succeeded: number, failed: number,
 *   errors: {src: string, dest: string, error: Error}[]}>}
 */
async function pipelinedCopy(srcPath, destPath, {
  concurrency,
  overwrite,
  filter,
  onProgress,
  excludeSymlinks,
  preserveTimestamps,
}) {
  let totalFiles = 0;
  let totalDirs = 1; // root
  let totalSymlinks = 0;
  let completed = 0; // settled file-copy operations (for progress)
  let succeeded = 0;
  let failed = 0;
  const errors = [];

  // First error thrown by `onProgress`; re-thrown once the pool has drained.
  let callbackFailed = false;
  let callbackError;

  const flags = overwrite
    ? COPYFILE_FICLONE
    : (COPYFILE_FICLONE | constants.COPYFILE_EXCL);

  /** @type {Set<Promise<void>>} */
  const pool = new Set();

  // ------------------------------------------------------------------
  // Helpers
  // ------------------------------------------------------------------

  /** Record one failed entry. */
  function recordFailure(src, dst, err) {
    failed++;
    errors.push({ src, dest: dst, error: err });
  }

  /** Invoke `onProgress`, capturing (not propagating) anything it throws. */
  function reportProgress() {
    if (!onProgress) return;
    try {
      onProgress({ completed, total: totalFiles, succeeded, failed });
    } catch (err) {
      if (!callbackFailed) {
        callbackFailed = true;
        callbackError = err;
      }
    }
  }

  /** Copy one regular file; never rejects. */
  async function copyOneFile(src, dst) {
    try {
      await copyFile(src, dst, flags);
      if (preserveTimestamps) {
        const { atime, mtime } = await stat(src);
        await utimes(dst, atime, mtime);
      }
      // Counted only after every step succeeded, so a timestamp failure
      // cannot make one file count as both succeeded and failed.
      succeeded++;
    } catch (err) {
      recordFailure(src, dst, err);
    }
    completed++;
    reportProgress();
  }

  /** Recreate one symbolic link at the destination; never rejects. */
  async function copyOneSymlink(src, dst) {
    try {
      const target = await readlink(src);
      try {
        await symlink(target, dst);
      } catch (err) {
        if (err.code !== 'EEXIST') throw err;
        if (!overwrite) {
          // An existing destination is left untouched and counted as done.
          succeeded++;
          return;
        }
        // overwrite: replace whatever currently occupies the destination.
        await unlink(dst);
        await symlink(target, dst);
      }
      if (preserveTimestamps) {
        // lstat reads the link itself; stat would follow it and fail on a
        // dangling link.
        const { atime, mtime } = await lstat(src);
        await lutimes(dst, atime, mtime);
      }
      succeeded++;
    } catch (err) {
      recordFailure(src, dst, err);
    }
  }

  /** Start `work`, track it in the pool, and wait while the pool is full. */
  async function submit(work) {
    const task = work().finally(() => {
      pool.delete(task);
    });
    pool.add(task);
    if (pool.size >= concurrency) {
      await Promise.race(pool);
    }
  }

  // ------------------------------------------------------------------
  // DFS walk: one directory at a time, depth-first
  // ------------------------------------------------------------------

  /** @type {Array<[string, string]>} */
  const stack = [[srcPath, destPath]];

  try {
    while (stack.length > 0 && !callbackFailed) {
      const [srcDir, destDir] = stack.pop();

      // Create the destination directory (recursive in case ancestors are missing).
      await mkdir(destDir, { recursive: true }).catch((err) => {
        if (err.code !== 'EEXIST') throw err;
      });

      let entries;
      try {
        entries = await readdir(srcDir, { withFileTypes: true });
      } catch (err) {
        errors.push({ src: srcDir, dest: destDir, error: err });
        continue;
      }

      // Push subdirectories in reverse so they are popped in listing order
      // once the current directory's files have been scheduled.
      for (let i = entries.length - 1; i >= 0; i--) {
        const entry = entries[i];
        if (!entry.isDirectory()) continue;
        const src = path.join(srcDir, entry.name);
        if (filter && !filter(src, entry)) continue;
        totalDirs++;
        stack.push([src, path.join(destDir, entry.name)]);
      }

      // Schedule files and symlinks; `submit` throttles on the shared pool.
      for (const entry of entries) {
        if (callbackFailed) break;

        const src = path.join(srcDir, entry.name);
        const dst = path.join(destDir, entry.name);

        if (entry.isSymbolicLink()) {
          if (excludeSymlinks) continue;
          if (filter && !filter(src, entry)) continue;
          totalSymlinks++;
          await submit(() => copyOneSymlink(src, dst));
        } else if (entry.isFile()) {
          if (filter && !filter(src, entry)) continue;
          totalFiles++;
          await submit(() => copyOneFile(src, dst));
        }
      }
    }
  } finally {
    // Drain in-flight copies even when the walk aborts, so no work keeps
    // running after the returned promise has settled. Tasks never reject.
    await Promise.all(pool);
  }

  if (callbackFailed) {
    throw callbackError;
  }

  return {
    files: totalFiles,
    dirs: totalDirs,
    symlinks: totalSymlinks,
    succeeded,
    failed,
    errors,
  };
}

// ---------------------------------------------------------------------------
// Validation helpers
// ---------------------------------------------------------------------------

/**
 * Throw when `dest` lies strictly inside `source`.
 *
 * @param {string} source - Resolved source directory.
 * @param {string} dest - Resolved destination path.
 * @throws {Error} If the destination is a descendant of the source.
 */
function validateDestNotInsideSource(source, dest) {
  const rel = path.relative(source, dest);

  // Only a whole `..` segment leaves the source directory; a child whose
  // name merely begins with two dots (e.g. `..cache`) is still inside it.
  const escapesSource =
    rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel);

  if (rel !== '' && !escapesSource) {
    throw new Error(
      `Destination is inside the source directory: "${dest}" is inside "${source}". This would cause an infinite loop.`
    );
  }
}

/**
 * Throw when both resolved paths are identical strings.
 *
 * @param {string} source - Resolved source path.
 * @param {string} dest - Resolved destination path.
 * @throws {Error} If the two paths are equal.
 */
function validateNotSame(source, dest) {
  if (source === dest) {
    throw new Error('Source and destination are the same path.');
  }
}

// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------

/**
 * Copy a file or directory from `source` to `dest`.
 *
 * ## Design
 * - Tree traversal uses `readdir` with `withFileTypes: true`, so entry types
 *   come from the directory listing without a separate `stat` per entry.
 * - Directories are walked sequentially, depth-first; file and symlink copies
 *   run concurrently in a bounded pool and start as entries are discovered.
 * - Concurrency defaults to `cpu * 2` (min 4); pass `'auto'` for `cpu * 4` (max 128).
 * - `copyFile` is called with `COPYFILE_FICLONE`, requesting a copy-on-write
 *   clone where the platform supports it.
 *
 * @param {string} source - Source file or directory path.
 * @param {string} dest - Destination path.
 * @param {object} [options]
 * @param {number|'auto'} [options.concurrency] - Max parallel copies. Defaults to `cpu * 2` (min 4). Pass `'auto'` for `cpu * 4` (max 128).
 * @param {boolean} [options.overwrite=true] - Overwrite existing files and symlinks.
 * @param {(src: string, entry: import('fs').Dirent) => boolean} [options.filter] - Return `false` to skip an entry.
 * @param {(stats: { completed: number, total: number, succeeded: number, failed: number }) => void} [options.onProgress]
 * @param {boolean} [options.excludeSymlinks=false] - Skip symbolic links.
 * @param {boolean} [options.preserveTimestamps=false] - Copy atime/mtime from source to destination.
 * @returns {Promise<{
 *   files: number, dirs: number, symlinks: number,
 *   succeeded: number, failed: number, errors: { src: string, dest: string, error: Error }[]
 * }>}
 * @throws {Error} If a path is missing, both paths are equal, the source
 *   cannot be stat'ed, the source is neither file nor directory, or the
 *   destination lies inside the source directory.
 */
async function copyDep(source, dest, options = {}) {
  if (!source || !dest) {
    throw new Error('Source and destination paths are required.');
  }

  const {
    concurrency: rawConcurrency, // default resolved by CPU count
    overwrite = true,
    filter = null,
    onProgress = null,
    excludeSymlinks = false,
    preserveTimestamps = false,
  } = options;

  const concurrency = resolveConcurrency(rawConcurrency);
  const srcPath = path.resolve(source);
  const destPath = path.resolve(dest);

  validateNotSame(srcPath, destPath);

  let srcStat;
  try {
    srcStat = await stat(srcPath);
  } catch (err) {
    // Same message as before; the underlying reason is kept as `cause`.
    throw new Error(`Source does not exist: ${srcPath}`, { cause: err });
  }

  // --- single file fast-path ----------------------------------------------
  if (srcStat.isFile()) {
    await mkdir(path.dirname(destPath), { recursive: true }).catch((err) => {
      if (err.code !== 'EEXIST') throw err;
    });

    const flags = overwrite
      ? COPYFILE_FICLONE
      : COPYFILE_FICLONE | constants.COPYFILE_EXCL;

    try {
      await copyFile(srcPath, destPath, flags);
    } catch (err) {
      if (err.code === 'EEXIST' && !overwrite) {
        return {
          files: 1,
          dirs: 1,
          symlinks: 0,
          succeeded: 0,
          failed: 1,
          errors: [{ src: srcPath, dest: destPath, error: err }],
        };
      }
      throw err;
    }

    if (preserveTimestamps) {
      // Honour the option here too; reuse the stat taken above.
      await utimes(destPath, srcStat.atime, srcStat.mtime);
    }

    return {
      files: 1,
      dirs: 1,
      symlinks: 0,
      succeeded: 1,
      failed: 0,
      errors: [],
    };
  }

  if (!srcStat.isDirectory()) {
    throw new Error(`Source is neither a file nor a directory: ${srcPath}`);
  }

  validateDestNotInsideSource(srcPath, destPath);

  // --- directory copy (pipelined) -----------------------------------------
  return pipelinedCopy(srcPath, destPath, {
    concurrency,
    overwrite,
    filter,
    onProgress,
    excludeSymlinks,
    preserveTimestamps,
  });
}

// ---------------------------------------------------------------------------
// Exports
// ---------------------------------------------------------------------------

module.exports = copyDep;
copyDep.copy = copyDep;
copyDep.copyDep = copyDep;