// dep-copy-file
// Version:
// Blazing-fast file and directory copy for Node.js
// 328 lines (284 loc) • 11.1 kB
// JavaScript
;
const { readdir, mkdir, copyFile, symlink, readlink, stat, lstat, lutimes, utimes, unlink } = require('fs/promises');
const { constants } = require('fs');
const path = require('path');
const os = require('os');
// ---------------------------------------------------------------------------
// Platform-specific flags for maximum throughput
// ---------------------------------------------------------------------------
/**
 * Flag passed to `copyFile` to request a copy-on-write clone (e.g. on macOS
 * APFS). Evaluates to 0 (no flag) when `fs.constants` does not define it.
 */
const COPYFILE_FICLONE = constants.COPYFILE_FICLONE ? constants.COPYFILE_FICLONE : 0;
// ---------------------------------------------------------------------------
// Concurrency helpers
// ---------------------------------------------------------------------------
/**
 * Resolve the number of copy operations allowed in flight at once.
 *
 * - `undefined` / `null`: `cpu * 2` (min 4) — conservative default.
 * - `'auto'`: `cpu * 4` (max 128) — higher throughput for fast storage.
 * - anything else: coerced with `Number()`, floored, and clamped to at least 1.
 *   Values that are not numeric (`NaN`) fall back to the default instead of
 *   being returned as-is, because a `NaN` limit would never throttle the pool.
 *   `Infinity` is passed through and means "no limit".
 *
 * A concurrency pool larger than the I/O subsystem can service only adds
 * memory pressure and file-descriptor contention without throughput gains.
 *
 * @param {number|'auto'|null|undefined} value - Requested concurrency.
 * @returns {number} A usable concurrency limit (an integer >= 1, or `Infinity`).
 */
function resolveConcurrency(value) {
  // os.cpus() can return an empty array when CPU info is unavailable;
  // treat that as a single core so the formulas below never yield 0.
  const cpu = Math.max(os.cpus().length, 1);

  if (value === 'auto') {
    return Math.min(cpu * 4, 128);
  }

  const fallback = Math.max(cpu * 2, 4);
  if (value === undefined || value === null) {
    return fallback;
  }

  const requested = Number(value);
  if (Number.isNaN(requested)) {
    return fallback;
  }

  // Floor fractions and clamp to 1: a limit below 1 cannot make progress
  // any faster than a limit of exactly 1.
  return Math.max(Math.floor(requested), 1);
}
// ---------------------------------------------------------------------------
// Pipelined walk + mkdir + copy (stream architecture)
// ---------------------------------------------------------------------------
/**
 * Copy a directory tree using a **sequential DFS walk + parallel file I/O**
 * architecture.
 *
 * ## Memory design
 *
 * Directory traversal is depth-first and sequential — only ONE `readdir`
 * result is held at a time, plus a stack of pending directory paths.
 *
 * File and symlink copies share a single concurrency pool so at most
 * `concurrency` copy operations are in flight at once. Copies start as soon
 * as entries are discovered (no deferred "collect then copy" phase).
 *
 * ## Accounting
 *
 * Every scheduled file or symlink is counted exactly once, as either
 * `succeeded` or `failed`. A copy whose timestamp preservation fails is
 * reported as failed. Directories that cannot be read are recorded in
 * `errors` and skipped without touching `failed`.
 *
 * ## Failure handling
 *
 * Per-entry errors never reject the returned promise; they are collected in
 * `errors`. Fatal errors (`mkdir` failure, a throwing `filter`, a throwing
 * `onProgress`) reject it, but only after all in-flight copies have settled.
 *
 * @param {string} srcPath - Absolute source directory.
 * @param {string} destPath - Absolute destination directory.
 * @param {object} options
 * @param {number} options.concurrency - Max copies in flight.
 * @param {boolean} options.overwrite - Replace existing files and symlinks.
 * @param {?function(string, import('fs').Dirent): boolean} options.filter - Return false to skip an entry.
 * @param {?function(object): void} options.onProgress - Called after each file copy settles.
 * @param {boolean} options.excludeSymlinks - Skip symbolic links entirely.
 * @param {boolean} options.preserveTimestamps - Copy atime/mtime onto the destination.
 * @returns {Promise<{files: number, dirs: number, symlinks: number, succeeded: number, failed: number, errors: Array<{src: string, dest: string, error: Error}>}>}
 */
async function pipelinedCopy(srcPath, destPath, { concurrency, overwrite, filter, onProgress, excludeSymlinks, preserveTimestamps }) {
  let totalFiles = 0;
  let totalDirs = 1; // the root directory itself
  let totalSymlinks = 0;
  let completed = 0; // settled file copies, reported through onProgress
  let succeeded = 0;
  let failed = 0;
  const errors = [];

  // First error thrown by onProgress, boxed so that any thrown value
  // (even a falsy one) is distinguishable from "no failure".
  let progressFailure = null;

  const flags = overwrite ? COPYFILE_FICLONE : (COPYFILE_FICLONE | constants.COPYFILE_EXCL);

  /** @type {Set<Promise<void>>} */
  const pool = new Set();

  // ------------------------------------------------------------------
  // Helpers
  // ------------------------------------------------------------------

  /** Copy one regular file, then optionally mirror its timestamps. */
  async function copyOneFile(src, dst) {
    await copyFile(src, dst, flags);
    if (preserveTimestamps) {
      const { atime, mtime } = await stat(src);
      await utimes(dst, atime, mtime);
    }
  }

  /** Recreate one symlink at `dst` pointing at the same target as `src`. */
  async function copyOneSymlink(src, dst) {
    const target = await readlink(src);
    try {
      await symlink(target, dst);
    } catch (err) {
      if (err.code !== 'EEXIST') throw err;
      // Without overwrite an existing destination is left alone and the
      // entry is treated as done.
      if (!overwrite) return;
      // With overwrite, replace whatever non-directory entry is in the way.
      await unlink(dst);
      await symlink(target, dst);
    }
    if (preserveTimestamps) {
      // lstat reads the link itself; stat would follow it and throw on a
      // dangling link.
      const { atime, mtime } = await lstat(src);
      await lutimes(dst, atime, mtime);
    }
  }

  /** Invoke onProgress, capturing (rather than leaking) anything it throws. */
  function reportProgress() {
    completed++;
    if (!onProgress || progressFailure !== null) return;
    try {
      onProgress({ completed, total: totalFiles, succeeded, failed });
    } catch (err) {
      progressFailure = { error: err };
    }
  }

  /**
   * Start `work`, record its outcome exactly once, and apply back-pressure
   * when the pool is full. The pooled promise never rejects.
   */
  async function schedule(work, src, dst, onSettled) {
    const task = work()
      .then(
        () => { succeeded++; },
        (err) => {
          failed++;
          errors.push({ src, dest: dst, error: err });
        },
      )
      .finally(() => {
        pool.delete(task);
        if (onSettled) onSettled();
      });
    pool.add(task);
    if (pool.size >= concurrency) {
      await Promise.race(pool);
    }
  }

  // ------------------------------------------------------------------
  // DFS walk — one directory at a time, depth-first
  // ------------------------------------------------------------------
  /** @type {Array<[string, string]>} */
  const stack = [[srcPath, destPath]];

  try {
    while (stack.length > 0 && progressFailure === null) {
      const [srcDir, destDir] = stack.pop();

      // Create destination directory (recursive in case ancestors don't exist yet)
      await mkdir(destDir, { recursive: true }).catch((err) => {
        if (err.code !== 'EEXIST') throw err;
      });

      let entries;
      try {
        entries = await readdir(srcDir, { withFileTypes: true });
      } catch (err) {
        errors.push({ src: srcDir, dest: destDir, error: err });
        continue;
      }

      // Queue subdirectories in reverse so the first one is popped first,
      // after this directory's own files have been scheduled.
      for (let i = entries.length - 1; i >= 0; i--) {
        const entry = entries[i];
        if (!entry.isDirectory()) continue;
        const src = path.join(srcDir, entry.name);
        if (filter && !filter(src, entry)) continue;
        totalDirs++;
        stack.push([src, path.join(destDir, entry.name)]);
      }

      // Schedule files and symlinks; each await throttles via the pool.
      for (const entry of entries) {
        if (progressFailure !== null) break;
        const isLink = entry.isSymbolicLink();
        if (!isLink && !entry.isFile()) continue;
        if (isLink && excludeSymlinks) continue;
        const src = path.join(srcDir, entry.name);
        const dst = path.join(destDir, entry.name);
        if (filter && !filter(src, entry)) continue;
        if (isLink) {
          totalSymlinks++;
          await schedule(() => copyOneSymlink(src, dst), src, dst);
        } else {
          totalFiles++;
          await schedule(() => copyOneFile(src, dst), src, dst, reportProgress);
        }
      }
    }
  } finally {
    // Always drain in-flight copies, even when the walk aborts with an
    // error, so nothing is still writing after this function settles.
    await Promise.all(pool);
  }

  if (progressFailure !== null) {
    throw progressFailure.error;
  }

  return {
    files: totalFiles,
    dirs: totalDirs,
    symlinks: totalSymlinks,
    succeeded,
    failed,
    errors,
  };
}
// ---------------------------------------------------------------------------
// Validation helpers
// ---------------------------------------------------------------------------
/**
 * Reject a destination that lies inside the source directory, since copying
 * into it would keep discovering freshly copied entries.
 *
 * Both arguments are expected to be resolved absolute paths. Equal paths are
 * not rejected here.
 *
 * @param {string} source - Resolved source directory.
 * @param {string} dest - Resolved destination directory.
 * @throws {Error} When `dest` is a descendant of `source`.
 */
function validateDestNotInsideSource(source, dest) {
  const rel = path.relative(source, dest);

  // Only a leading ".." path SEGMENT means the destination is outside the
  // source. A plain prefix test would misread a child named "..cache" as
  // being outside.
  const leavesSource = rel === '..' || rel.startsWith(`..${path.sep}`);

  // An absolute result means there is no relative route at all
  // (e.g. a different drive on Windows).
  if (rel !== '' && !leavesSource && !path.isAbsolute(rel)) {
    throw new Error(
      `Destination is inside the source directory: "${dest}" is inside "${source}". This would cause an infinite loop.`
    );
  }
}
/**
 * Guard against copying a path onto itself.
 *
 * The comparison is a plain string equality check on the two arguments.
 *
 * @param {string} source - Source path.
 * @param {string} dest - Destination path.
 * @throws {Error} When both paths are the same string.
 */
function validateNotSame(source, dest) {
  const pathsDiffer = source !== dest;
  if (pathsDiffer) {
    return;
  }
  throw new Error('Source and destination are the same path.');
}
// ---------------------------------------------------------------------------
// Main entry point
// ---------------------------------------------------------------------------
/**
 * Copy a file or directory from `source` to `dest`.
 *
 * - A regular file is copied directly with `copyFile`; the parent directory
 *   of `dest` is created first. `filter` and `onProgress` are not consulted
 *   on this path.
 * - A directory is copied by `pipelinedCopy`, after checking that `dest` is
 *   not located inside `source`.
 * - Concurrency defaults to `cpu * 2` (min 4); pass `'auto'` for `cpu * 4` (max 128).
 * - The `COPYFILE_FICLONE` flag is always passed to `copyFile` so that
 *   copy-on-write clones are used where available.
 *
 * @param {string} source - Source file or directory path.
 * @param {string} dest - Destination path.
 * @param {object} [options]
 * @param {number|'auto'} [options.concurrency] - Max parallel copies. Defaults to `cpu * 2` (min 4). Pass `'auto'` for `cpu * 4` (max 128).
 * @param {boolean} [options.overwrite=true] - Overwrite existing files.
 * @param {(src: string, entry: import('fs').Dirent) => boolean} [options.filter]
 * @param {(stats: { completed: number, total: number, succeeded: number, failed: number }) => void} [options.onProgress]
 * @param {boolean} [options.excludeSymlinks=false]
 * @param {boolean} [options.preserveTimestamps=false] - Copy the source's atime/mtime onto the destination.
 * @returns {Promise<{
 *   files: number, dirs: number, symlinks: number,
 *   succeeded: number, failed: number, errors: { src: string, dest: string, error: Error }[]
 * }>}
 * @throws {Error} When a path is missing, both paths are equal, the source
 *   does not exist or is neither a file nor a directory, or the destination
 *   lies inside the source directory.
 */
async function copyDep(source, dest, options = {}) {
  if (!source || !dest) {
    throw new Error('Source and destination paths are required.');
  }

  const {
    concurrency: rawConcurrency, // default resolved by CPU count
    overwrite = true,
    filter = null,
    onProgress = null,
    excludeSymlinks = false,
    preserveTimestamps = false,
  } = options || {}; // tolerate an explicit `null` options argument

  const concurrency = resolveConcurrency(rawConcurrency);
  const srcPath = path.resolve(source);
  const destPath = path.resolve(dest);
  validateNotSame(srcPath, destPath);

  let srcStat;
  try {
    srcStat = await stat(srcPath);
  } catch (err) {
    // Only a genuinely missing path is reported as "does not exist"; other
    // failures (e.g. permissions) are rethrown untouched so the real cause
    // is not hidden.
    if (err.code === 'ENOENT' || err.code === 'ENOTDIR') {
      throw new Error(`Source does not exist: ${srcPath}`, { cause: err });
    }
    throw err;
  }

  // --- single file fast-path ----------------------------------------------
  if (srcStat.isFile()) {
    await mkdir(path.dirname(destPath), { recursive: true }).catch((err) => {
      if (err.code !== 'EEXIST') throw err;
    });

    const flags = overwrite ? COPYFILE_FICLONE : COPYFILE_FICLONE | constants.COPYFILE_EXCL;
    try {
      await copyFile(srcPath, destPath, flags);
    } catch (err) {
      // An existing destination with overwrite disabled is reported as a
      // failed entry rather than thrown.
      if (err.code === 'EEXIST' && !overwrite) {
        return {
          files: 1, dirs: 1, symlinks: 0,
          succeeded: 0, failed: 1,
          errors: [{ src: srcPath, dest: destPath, error: err }],
        };
      }
      throw err;
    }

    if (preserveTimestamps) {
      // Reuse the stat taken above; it predates the copy.
      await utimes(destPath, srcStat.atime, srcStat.mtime);
    }

    return {
      files: 1, dirs: 1, symlinks: 0,
      succeeded: 1, failed: 0, errors: [],
    };
  }

  if (!srcStat.isDirectory()) {
    throw new Error(`Source is neither a file nor a directory: ${srcPath}`);
  }

  validateDestNotInsideSource(srcPath, destPath);

  // --- directory copy (pipelined) -----------------------------------------
  return pipelinedCopy(srcPath, destPath, {
    concurrency, overwrite, filter, onProgress, excludeSymlinks, preserveTimestamps,
  });
}
// ---------------------------------------------------------------------------
// Exports
// ---------------------------------------------------------------------------
// The function itself is the module export; `copy` and `copyDep` are
// self-referencing aliases attached to it as properties.
module.exports = Object.assign(copyDep, { copy: copyDep, copyDep });