dbgate-api
Version:
Allows running DbGate data-manipulation scripts.
367 lines (334 loc) • 12.5 kB
JavaScript
const crypto = require('crypto');
const fs = require('fs');
const { pipeline } = require('stream');
const os = require('os');
const readline = require('readline');
const path = require('path');
const { getLogger, extractErrorLogData } = require('dbgate-tools');
const logger = getLogger('externalSort');
// Number of rows accumulated in memory before they are sorted and written out
// as one temporary "run" file during the external sort.
// NOTE(review): the original author states this keeps a single chunk at
// roughly ~50 MB for typical row sizes; nothing in the code enforces a byte
// limit, only this row count.
const SORT_CHUNK_SIZE = 50_000;
// Maximum number of chunk files merged simultaneously in one pass.
// Limits the number of concurrently open input file descriptors during merge.
const MAX_MERGE_FAN_IN = 16;
/**
 * Lazily reads a JSON-lines chunk file and yields one parsed value per
 * non-blank line.
 *
 * Blank / whitespace-only lines are skipped. A line that is not valid JSON
 * makes the generator throw (no recovery is attempted), because chunk files
 * are produced by this module and a parse failure means the file is damaged.
 *
 * The readline interface and the underlying file stream are always released,
 * whether iteration finishes, throws, or is abandoned early by the consumer.
 *
 * @param {string} file - Path of the JSON-lines file to read.
 * @yields {*} The value produced by JSON.parse for each non-blank line.
 */
async function* readChunkLines(file) {
  const input = fs.createReadStream(file);
  const lineReader = readline.createInterface({ input, crlfDelay: Infinity });
  try {
    for await (const text of lineReader) {
      const isBlank = text.trim().length === 0;
      if (!isBlank) {
        // Deliberately unguarded: a malformed line must surface as an error.
        yield JSON.parse(text);
      }
    }
  } finally {
    lineReader.close();
    input.destroy();
  }
}
/**
 * Moves a file from `src` to `dest`, working across filesystems.
 *
 * A plain synchronous rename is attempted first. Only when that fails with
 * EXDEV (source and destination are on different devices) does the function
 * fall back to streaming the contents into `dest` and then deleting `src`.
 * Any other rename error is re-thrown unchanged.
 *
 * During the fallback copy, stream.pipeline() tears down both streams on
 * failure; the partially written destination is then removed (best effort)
 * before the error is propagated.
 *
 * @param {string} src - Existing file to move.
 * @param {string} dest - Target path.
 * @returns {Promise<void>} Resolves once the file is at `dest` and gone from `src`.
 */
async function safeRename(src, dest) {
  try {
    fs.renameSync(src, dest);
    return;
  } catch (renameErr) {
    // Anything except a cross-device failure is a genuine error.
    if (renameErr.code !== 'EXDEV') throw renameErr;
  }

  await new Promise((resolve, reject) => {
    const onCopyDone = copyErr => {
      if (!copyErr) {
        resolve();
        return;
      }
      try { fs.unlinkSync(dest); } catch { /* best-effort */ }
      reject(copyErr);
    };
    pipeline(fs.createReadStream(src), fs.createWriteStream(dest), onCopyDone);
  });

  // The copy succeeded, so the source is no longer needed.
  fs.unlinkSync(src);
}
/**
 * Serializes `rows` to `filePath` as JSON lines (one JSON document per line,
 * each terminated by '\n'), honoring write-stream backpressure.
 *
 * Whenever write() reports a full buffer, writing is suspended until the
 * stream emits 'drain', so the whole chunk is never buffered inside the
 * stream at once.
 *
 * @param {string} filePath - File to create or overwrite.
 * @param {Array<*>} rows - Values to serialize with JSON.stringify, in order.
 * @returns {Promise<void>} Resolves on the stream's 'finish' event, rejects on
 *   its 'error' event.
 */
function writeChunkFile(filePath, rows) {
  return new Promise((resolve, reject) => {
    const out = fs.createWriteStream(filePath);
    out.on('error', reject);
    out.on('finish', resolve);

    // Index of the next row that still has to be handed to the stream.
    let cursor = 0;

    const pump = () => {
      while (cursor < rows.length) {
        const serialized = JSON.stringify(rows[cursor]) + '\n';
        cursor += 1;
        if (!out.write(serialized)) {
          // The row was accepted but the buffer is full: continue after 'drain'.
          out.once('drain', pump);
          return;
        }
      }
      out.end();
    };

    pump();
  });
}
/**
 * Array-backed binary min-heap used by the k-way merge.
 *
 * Every stored item is expected to look like `{ row, iter }`; ordering is
 * decided solely by calling the comparator on the items' `row` fields. The
 * item for which the comparator reports the smallest row sits at the root.
 */
class _SortMinHeap {
  /**
   * @param {(a: *, b: *) => number} comparator - Negative result means `a`
   *   sorts before `b`.
   */
  constructor(comparator) {
    this._data = [];
    this._cmp = comparator;
  }

  /** Number of items currently held. */
  get size() {
    return this._data.length;
  }

  /**
   * Adds an item and sifts it up until its parent is not larger.
   * @param {{row: *}} item
   */
  push(item) {
    const heap = this._data;
    const isBefore = (x, y) => this._cmp(heap[x].row, heap[y].row) < 0;

    heap.push(item);
    let child = heap.length - 1;
    while (child > 0) {
      const parent = (child - 1) >> 1;
      if (!isBefore(child, parent)) break;
      const tmp = heap[child];
      heap[child] = heap[parent];
      heap[parent] = tmp;
      child = parent;
    }
  }

  /**
   * Removes and returns the smallest item.
   * @returns {{row: *}|undefined} `undefined` when the heap is empty.
   */
  pop() {
    const heap = this._data;
    const isBefore = (x, y) => this._cmp(heap[x].row, heap[y].row) < 0;

    const root = heap[0];
    const tail = heap.pop();
    if (heap.length === 0) return root;

    // Move the former last item to the root and sift it down.
    heap[0] = tail;
    const count = heap.length;
    let parent = 0;
    while (true) {
      const left = 2 * parent + 1;
      const right = left + 1;
      let smallest = parent;
      if (left < count && isBefore(left, smallest)) smallest = left;
      if (right < count && isBefore(right, smallest)) smallest = right;
      if (smallest === parent) break;
      const tmp = heap[parent];
      heap[parent] = heap[smallest];
      heap[smallest] = tmp;
      parent = smallest;
    }
    return root;
  }
}
/**
 * Merges already-sorted JSON-lines chunk files into a single sorted output
 * file using a k-way heap merge.
 *
 * One reader is created per input file (callers keep inputFiles.length at or
 * below MAX_MERGE_FAN_IN) plus one write stream for the output. An empty
 * `inputFiles` array produces an empty output file.
 *
 * Failure handling: the first error from any reader, from the comparator, or
 * from the output stream wins. It destroys the output stream, asks every
 * reader to finish (which releases its file handle), stops the merge loop
 * and rejects the returned promise. The output file may be left partially
 * written in that case; removing it is the caller's responsibility.
 *
 * @param {string[]} inputFiles - Sorted chunk files to merge.
 * @param {string} outfile - File to create or overwrite with the merged rows.
 * @param {(a: *, b: *) => number} comparator - Row comparator used for ordering.
 * @returns {Promise<void>} Resolves after the output stream emitted 'finish'.
 */
async function mergeBatch(inputFiles, outfile, comparator) {
  const ws = fs.createWriteStream(outfile);
  const iters = inputFiles.map(f => readChunkLines(f));
  const heap = new _SortMinHeap(comparator);

  await new Promise((resolve, reject) => {
    let settled = false;
    // Resolver of the drain wait currently in progress (if any), so that a
    // failure can release the merge loop instead of leaving it parked on a
    // 'drain' event that a destroyed stream will never emit.
    let wakeDrainWaiter = null;

    const fail = err => {
      if (settled) return;
      settled = true;
      ws.destroy();
      for (const it of iters) {
        // return() on an async generator yields a promise; never let a
        // failure during reader shutdown surface as an unhandled rejection.
        Promise.resolve(it.return()).catch(() => {});
      }
      if (wakeDrainWaiter) wakeDrainWaiter();
      reject(err);
    };

    // Output-stream errors go through the same single-shot failure path as
    // every other error.
    ws.on('error', fail);

    // Pull the next row from one reader and queue it for merging.
    const advance = async iter => {
      const { value, done } = await iter.next();
      if (!done) heap.push({ row: value, iter });
    };

    const waitForDrain = () =>
      new Promise(wake => {
        wakeDrainWaiter = wake;
        ws.once('drain', wake);
      });

    const run = async () => {
      // Seed the heap with the first row of every input.
      await Promise.all(iters.map(advance));

      while (heap.size > 0) {
        if (settled) return;
        const { row, iter } = heap.pop();
        if (!ws.write(JSON.stringify(row) + '\n')) {
          await waitForDrain();
          wakeDrainWaiter = null;
          if (settled) return;
        }
        // Replace the emitted row with the next one from the same input.
        await advance(iter);
      }

      if (settled) return;
      ws.once('finish', () => {
        settled = true;
        resolve();
      });
      ws.end();
    };

    run().catch(fail);
  });
}
/**
 * Multi-pass k-way merge of sorted chunk files into `outfile`.
 *
 * While more than MAX_MERGE_FAN_IN files remain, they are merged in groups of
 * at most MAX_MERGE_FAN_IN into intermediate files obtained from
 * nextTmpFile() (so the caller can track and clean them up). Once the count
 * fits into a single batch, the final merge writes straight to `outfile`.
 *
 * A group that contains only one file (the leftover at the end of a pass) is
 * carried into the next pass as-is instead of being copied through
 * mergeBatch, which would re-read and re-serialize every row for no benefit.
 *
 * Input files and intermediates are not deleted here.
 *
 * @param {string[]} inputFiles - Sorted chunk files.
 * @param {string} outfile - Final merged output path.
 * @param {(a: *, b: *) => number} comparator - Row comparator.
 * @param {() => string} nextTmpFile - Allocates the path of a new intermediate file.
 * @returns {Promise<void>}
 */
async function multiPassMerge(inputFiles, outfile, comparator, nextTmpFile) {
  let current = inputFiles;
  while (current.length > MAX_MERGE_FAN_IN) {
    const next = [];
    for (let i = 0; i < current.length; i += MAX_MERGE_FAN_IN) {
      const batch = current.slice(i, i + MAX_MERGE_FAN_IN);
      if (batch.length === 1) {
        // Already a single sorted run: nothing to merge, reuse it directly.
        next.push(batch[0]);
        continue;
      }
      const merged = nextTmpFile();
      // Record the path before merging so it is tracked even if the merge fails.
      next.push(merged);
      await mergeBatch(batch, merged, comparator);
    }
    current = next;
  }
  await mergeBatch(current, outfile, comparator);
}
/**
 * Externally sorts a JSON-lines file without loading it fully into memory.
 *
 * Phase 1 reads `infile` line by line, collects rows into chunks of about
 * SORT_CHUNK_SIZE rows, sorts each chunk in memory and writes it to a
 * temporary "run" file. Phase 2 merges the runs into `outfile`.
 *
 * Input handling:
 *  - blank lines are ignored;
 *  - lines that are not valid JSON are logged and skipped;
 *  - if the first successfully parsed line has a truthy `__isStreamHeader`
 *    property it is kept out of the sort and written back as the first line
 *    of `outfile`.
 *
 * All temporary files live in a fresh directory under os.tmpdir() and are
 * removed (best effort) in the `finally` block, on success and on failure.
 *
 * @param {string} infile - JSON-lines file to sort.
 * @param {string} outfile - Destination file; created or overwritten.
 * @param {Array<{uniqueName: string, order: string}>} sort - Sort keys in
 *   priority order. `order === 'ASC'` sorts ascending; any other value sorts
 *   descending.
 * @returns {Promise<void>} Rejects on I/O errors and on errors raised while
 *   processing a row.
 */
async function sortFile(infile, outfile, sort) {
  // NOTE(review): values are compared with plain `<` / `>`. Pairs for which
  // neither holds (e.g. undefined against anything, or null against a
  // non-numeric string) are treated as equal on that key and fall through to
  // the next one - confirm that this is the intended ordering for NULLs.
  const comparator = (a, b) => {
    for (const { uniqueName, order } of sort) {
      const av = a[uniqueName];
      const bv = b[uniqueName];
      if (av < bv) return order === 'ASC' ? -1 : 1;
      if (av > bv) return order === 'ASC' ? 1 : -1;
    }
    return 0;
  };

  const tmpDir = path.join(os.tmpdir(), `dbgate-sort-${crypto.randomUUID()}`);
  fs.mkdirSync(tmpDir, { recursive: true });

  // All tmp paths are registered here BEFORE any write attempt so that the
  // finally block can unlink partial files even when a write fails mid-way.
  const createdTmpFiles = new Set();
  let tmpCounter = 0;
  const nextTmpFile = () => {
    const f = path.join(tmpDir, `es-${tmpCounter++}.jsonl`);
    createdTmpFiles.add(f);
    return f;
  };

  try {
    // ── Phase 1: generate sorted runs ──────────────────────────────────────
    let chunk = [];
    const runFiles = [];
    let headerRow = null;
    let isFirstNonEmptyLine = true;

    // Sort the rows collected so far, persist them as one run, start a new chunk.
    const flushChunk = async () => {
      if (chunk.length === 0) return;
      chunk.sort(comparator);
      // Register the path BEFORE writing so the finally block can always
      // clean it up even if writeChunkFile fails partway through.
      const tmpFile = nextTmpFile();
      runFiles.push(tmpFile);
      await writeChunkFile(tmpFile, chunk);
      chunk = [];
    };

    await new Promise((resolve, reject) => {
      const inputStream = fs.createReadStream(infile);
      const rl = readline.createInterface({ input: inputStream, crlfDelay: Infinity });
      // Serializes flushes: each new flush is chained behind the previous one.
      let pendingFlush = Promise.resolve();
      let settled = false;
      // Set once readline reported 'close'; the interface must not be
      // resumed afterwards.
      let inputClosed = false;

      const fail = err => {
        if (settled) return;
        settled = true;
        // Release both the readline interface and the underlying stream so
        // no file descriptor stays open when rejecting while paused.
        rl.close();
        inputStream.destroy();
        reject(err);
      };

      // Listen on the stream itself as well; the original author notes that
      // readline does not reliably forward the stream's 'error' event.
      inputStream.on('error', fail);
      rl.on('error', fail);

      const handleLine = line => {
        if (!line.trim()) return;
        let parsed;
        try {
          parsed = JSON.parse(line);
        } catch (e) {
          logger.warn(extractErrorLogData(e), 'DBGM-00000 Skipping invalid JSON line during sort');
          return;
        }
        // Detect and capture the stream header; do not include it in the sort.
        if (isFirstNonEmptyLine) {
          isFirstNonEmptyLine = false;
          if (parsed.__isStreamHeader) {
            headerRow = parsed;
            return;
          }
        }
        chunk.push(parsed);
        if (chunk.length >= SORT_CHUNK_SIZE) {
          // pause() stops the input stream, but lines already buffered by
          // readline may still be delivered; they simply join this chunk.
          rl.pause();
          pendingFlush = pendingFlush
            .then(() => flushChunk())
            .then(() => {
              // NOTE(review): the last line of a file without a trailing
              // newline is delivered right before readline closes; resuming
              // a closed interface is pointless and, presumably, rejected by
              // newer Node.js releases - so skip it.
              if (!inputClosed) rl.resume();
            })
            .catch(fail);
        }
      };

      rl.on('line', line => {
        // Ignore anything that still arrives after a failure.
        if (settled) return;
        try {
          handleLine(line);
        } catch (e) {
          // An exception thrown inside an event listener would otherwise
          // escape as an uncaught exception (e.g. a literal `null` row makes
          // the header check throw); turn it into a rejection instead.
          fail(e);
        }
      });

      rl.on('close', () => {
        inputClosed = true;
        if (settled) return;
        pendingFlush
          .then(() => flushChunk())
          .then(() => {
            settled = true;
            resolve();
          })
          .catch(fail);
      });
    });

    // ── Phase 2: k-way streaming merge ─────────────────────────────────────
    if (headerRow !== null) {
      // There is a stream header that must appear as the first line of outfile.
      // Merge all data runs into a single intermediate file (or use the single
      // run directly), then write outfile = header + merged data.
      let mergedDataFile = null;
      if (runFiles.length === 1) {
        mergedDataFile = runFiles[0]; // cleaned up by the finally block
      } else if (runFiles.length > 1) {
        mergedDataFile = nextTmpFile();
        await multiPassMerge(runFiles, mergedDataFile, comparator, nextTmpFile);
      }

      await new Promise((resolve, reject) => {
        const ws = fs.createWriteStream(outfile);
        let settled = false;
        const fail = err => {
          if (settled) return;
          settled = true;
          ws.destroy();
          // Do not leave a truncated output behind.
          try { fs.unlinkSync(outfile); } catch { /* best-effort */ }
          reject(err);
        };
        const succeed = () => {
          if (settled) return;
          settled = true;
          resolve();
        };
        ws.on('error', fail);

        const headerLine = JSON.stringify(headerRow) + '\n';
        if (!mergedDataFile) {
          // Header only — no data rows.
          ws.once('finish', succeed);
          ws.end(headerLine);
        } else {
          ws.write(headerLine, writeErr => {
            if (writeErr) return fail(writeErr);
            // pipeline() (unlike pipe()) also destroys the read stream when
            // the write side fails, so its file descriptor is not leaked.
            pipeline(fs.createReadStream(mergedDataFile), ws, copyErr => {
              if (copyErr) fail(copyErr);
              else succeed();
            });
          });
        }
      });
    } else if (runFiles.length === 0) {
      // No rows and no header: produce an empty output file.
      fs.writeFileSync(outfile, '');
    } else if (runFiles.length === 1) {
      // safeRename handles EXDEV (cross-filesystem) by falling back to
      // stream-copy + unlink, so outfile is always populated correctly.
      await safeRename(runFiles[0], outfile);
      // The run no longer exists at its temporary path, so drop it from the
      // cleanup set.
      createdTmpFiles.delete(runFiles[0]);
    } else {
      // multiPassMerge batches the fan-in to MAX_MERGE_FAN_IN files per pass,
      // bounding the number of concurrently open file descriptors.
      // Intermediate files are allocated via nextTmpFile() so they are always
      // tracked in createdTmpFiles for cleanup.
      await multiPassMerge(runFiles, outfile, comparator, nextTmpFile);
    }
  } finally {
    for (const f of createdTmpFiles) {
      try { fs.unlinkSync(f); } catch { /* best-effort cleanup */ }
    }
    try { fs.rmdirSync(tmpDir); } catch { /* best-effort cleanup */ }
  }
}
// Public surface of this module: only sortFile is exported; every other
// function and class in this file stays module-private.
module.exports = { sortFile };