@naturalcycles/db-lib
Version:
Lowest Common Denominator API to supported Databases
99 lines (98 loc) • 4.72 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.dbPipelineRestore = dbPipelineRestore;
const js_lib_1 = require("@naturalcycles/js-lib");
const nodejs_lib_1 = require("@naturalcycles/nodejs-lib");
/**
 * Restores tables into a CommonDB from NDJSON files located in `opt.inputDirPath`.
 *
 * Every `<table>.ndjson` or `<table>.ndjson.gz` file in the folder is treated as one table
 * (optionally narrowed down to `opt.tables`). Each file is streamed through a pipeline
 * (which handles backpressure): rows are counted, progress is logged, rows are optionally
 * filtered by `updated >= sinceUpdated`, passed through the table's mapper from
 * `mapperPerTable` (or a passthrough mapper), grouped into chunks of `chunkSize`
 * and written with `db.saveBatch`.
 *
 * When `recreateTables` is set, each table that has a `<table>.schema.json` file next to
 * its data is re-created via `db.createTable(..., { dropIfExists: true })` before any
 * rows are restored; tables without a schema file only produce a warning.
 *
 * `transformMapOptions` is a single options object shared by the mappers of all tables.
 *
 * @returns combined NDJsonStats over all tables that were restored.
 */
async function dbPipelineRestore(opt) {
    // Pull the helpers out once; calling them unbound matches how they were invoked before.
    const { ErrorMode, localTime, pMap, _hb, _mapValues, _passthroughMapper } = js_lib_1;
    const { NDJsonStats, boldWhite, dimWhite, fs2, grey, yellow, transformChunk, transformFilterSync, transformLogProgress, transformMap, transformTap, writableForEach, _pipeline, } = nodejs_lib_1;
    const {
        db,
        concurrency = 16,
        chunkSize = 100,
        limit,
        sinceUpdated,
        inputDirPath,
        mapperPerTable = {},
        saveOptionsPerTable = {},
        transformMapOptions,
        errorMode = ErrorMode.SUPPRESS,
        recreateTables = false,
    } = opt;
    // Optional allowlist of table names; falsy when no restriction was requested.
    const onlyTables = opt.tables && new Set(opt.tables);
    let sinceUpdatedStr = '';
    if (sinceUpdated) {
        sinceUpdatedStr = ' since ' + grey(localTime(sinceUpdated).toPretty());
    }
    console.log(`>> ${dimWhite('dbPipelineRestore')} started in ${grey(inputDirPath)}...${sinceUpdatedStr}`);
    fs2.ensureDir(inputDirPath);
    const gzippedTables = new Set();
    const bytesByTable = {};
    const statsByTable = {};
    const tables = [];
    // Discover tables from the data files present in the input folder.
    for (const fileName of fs2.readdir(inputDirPath)) {
        const isGzipped = fileName.endsWith('.ndjson.gz');
        if (!isGzipped && !fileName.endsWith('.ndjson')) {
            continue; // not a data file
        }
        const suffix = isGzipped ? '.ndjson.gz' : '.ndjson';
        const table = fileName.slice(0, fileName.length - suffix.length);
        if (onlyTables && !onlyTables.has(table)) {
            continue; // skip table
        }
        tables.push(table);
        if (isGzipped) {
            gzippedTables.add(table);
        }
        bytesByTable[table] = fs2.stat(`${inputDirPath}/${fileName}`).size;
    }
    const sizeStrByTable = _mapValues(bytesByTable, (_k, b) => _hb(b));
    console.log(`${yellow(tables.length)} ${boldWhite('table(s)')}:\n`, sizeStrByTable);
    if (recreateTables) {
        await pMap(tables, async (table) => {
            const schemaFilePath = `${inputDirPath}/${table}.schema.json`;
            if (fs2.pathExists(schemaFilePath)) {
                const schema = await fs2.readJsonAsync(schemaFilePath);
                await db.createTable(table, schema, { dropIfExists: true });
            }
            else {
                console.warn(`${schemaFilePath} does not exist!`);
            }
        });
    }
    // Streams one table's file into the database and records its stats.
    const restoreTable = async (table) => {
        const extension = gzippedTables.has(table) ? '.ndjson.gz' : '.ndjson';
        const filePath = `${inputDirPath}/${table}${extension}`;
        const saveOptions = saveOptionsPerTable[table] || {};
        const started = Date.now();
        let rows = 0;
        const sizeBytes = bytesByTable[table];
        console.log(`<< ${grey(filePath)} ${dimWhite(_hb(sizeBytes))} started...`);
        // Stages are created in the same order in which they run.
        const stages = [
            fs2.createReadStreamAsNDJSON(filePath).take(limit || Number.POSITIVE_INFINITY),
            transformTap(() => rows++),
            transformLogProgress({
                logEvery: 1000,
                ...opt,
                metric: table,
            }),
        ];
        if (sinceUpdated) {
            stages.push(transformFilterSync(r => r.updated >= sinceUpdated));
        }
        stages.push(transformMap(mapperPerTable[table] || _passthroughMapper, {
            errorMode,
            flattenArrayOutput: true,
            ...transformMapOptions,
            metric: table,
        }));
        stages.push(transformChunk({ chunkSize }));
        stages.push(writableForEach(async (batch) => {
            await db.saveBatch(table, batch, saveOptions);
        }));
        await _pipeline(stages);
        const stats = NDJsonStats.create({
            tookMillis: Date.now() - started,
            rows,
            sizeBytes,
        });
        console.log(`<< ${grey(filePath)}\n` + stats.toPretty());
        statsByTable[table] = stats;
    };
    await pMap(tables, restoreTable, { concurrency, errorMode });
    const statsTotal = NDJsonStats.createCombined(Object.values(statsByTable));
    console.log(statsTotal.toPretty('total'));
    return statsTotal;
}