@naturalcycles/db-lib
Version:
Lowest Common Denominator API to supported Databases
100 lines (99 loc) • 4.42 kB
JavaScript
import { _hb } from '@naturalcycles/js-lib';
import { localTime } from '@naturalcycles/js-lib/datetime/localTime.js';
import { ErrorMode } from '@naturalcycles/js-lib/error/errorMode.js';
import { _mapValues } from '@naturalcycles/js-lib/object';
import { pMap } from '@naturalcycles/js-lib/promise/pMap.js';
import { _passthroughMapper } from '@naturalcycles/js-lib/types';
import { boldWhite, dimWhite, grey, yellow } from '@naturalcycles/nodejs-lib/colors';
import { fs2 } from '@naturalcycles/nodejs-lib/fs2';
import { NDJsonStats, Pipeline } from '@naturalcycles/nodejs-lib/stream';
/**
 * Pipeline from NDJSON files in a folder (optionally zstd-compressed, `.ndjson.zst`) to CommonDB.
 * Allows to define a mapper per table to map objects between input and output,
 * and a `sinceUpdated` threshold to filter rows by their `updated` field.
 * Rows are streamed through `Pipeline` (the original author notes that backpressure is handled).
 *
 * Table discovery: every `<table>.ndjson` / `<table>.ndjson.zst` file in `inputDirPath`
 * is treated as one table. If both variants exist for the same table, the table is
 * restored ONCE, from the `.zst` file.
 *
 * @param opt restore options. Fields read here:
 *  - `db` - target database; `saveBatch` (and `createTable` when `recreateTables`) are called on it.
 *  - `inputDirPath` - folder to scan for dump files.
 *  - `tables` - optional allow-list of table names; other files are skipped.
 *  - `concurrency` (default 16) - how many tables are restored in parallel.
 *  - `chunkSize` (default 100) - number of rows per `db.saveBatch` call.
 *  - `limit` - passed to `limitSource` for each table's file.
 *  - `sinceUpdated` - when set, only rows with `updated >= sinceUpdated` are kept.
 *  - `mapperPerTable` - optional mapper per table name (passthrough when absent).
 *  - `saveOptionsPerTable` - optional `saveBatch` options per table name.
 *  - `transformMapOptions` - extra options for the mapping step (one for all mappers).
 *  - `errorMode` (default `ErrorMode.SUPPRESS`) - used both by the mapping step and by the per-table `pMap`.
 *  - `recreateTables` (default false) - when true, each table is re-created from
 *    `<table>.schema.json` (with `dropIfExists: true`) before loading; tables without a schema file
 *    only produce a warning.
 * @returns combined `NDJsonStats` over all tables whose restore completed.
 */
export async function dbPipelineRestore(opt) {
  const {
    db,
    concurrency = 16,
    chunkSize = 100,
    limit,
    sinceUpdated,
    inputDirPath,
    mapperPerTable = {},
    saveOptionsPerTable = {},
    transformMapOptions,
    errorMode = ErrorMode.SUPPRESS,
    recreateTables = false,
  } = opt;
  const onlyTables = opt.tables && new Set(opt.tables);
  const sinceUpdatedStr = sinceUpdated ? ' since ' + grey(localTime(sinceUpdated).toPretty()) : '';
  console.log(`>> ${dimWhite('dbPipelineRestore')} started in ${grey(inputDirPath)}...${sinceUpdatedStr}`);
  // NOTE(review): presumably creates the folder when it is missing - confirm against fs2.
  fs2.ensureDir(inputDirPath);

  // Tables whose dump file is the `.ndjson.zst` variant.
  const compressedTables = new Set();
  // Guards against registering the same table twice (both `.ndjson` and `.ndjson.zst` present).
  const registeredTables = new Set();
  const sizeByTable = {};
  const statsPerTable = {};
  const tables = [];

  for (const f of fs2.readdir(inputDirPath)) {
    let table;
    let zst = false;
    if (f.endsWith('.ndjson')) {
      table = f.slice(0, f.length - '.ndjson'.length);
    } else if (f.endsWith('.ndjson.zst')) {
      table = f.slice(0, f.length - '.ndjson.zst'.length);
      zst = true;
    } else {
      continue; // not a dump file (e.g. *.schema.json)
    }
    if (onlyTables && !onlyTables.has(table)) continue; // skip table
    // The `.zst` variant takes precedence; don't let a plain file seen later
    // overwrite the recorded size of the file that will actually be read.
    if (!zst && compressedTables.has(table)) continue;
    if (!registeredTables.has(table)) {
      registeredTables.add(table);
      tables.push(table);
    }
    if (zst) compressedTables.add(table);
    sizeByTable[table] = fs2.stat(`${inputDirPath}/${f}`).size;
  }

  const sizeStrByTable = _mapValues(sizeByTable, (_k, b) => _hb(b));
  console.log(`${yellow(tables.length)} ${boldWhite('table(s)')}:\n`, sizeStrByTable);

  if (recreateTables) {
    await pMap(tables, async (table) => {
      const schemaFilePath = `${inputDirPath}/${table}.schema.json`;
      if (!fs2.pathExists(schemaFilePath)) {
        console.warn(`${schemaFilePath} does not exist!`);
        return;
      }
      const schema = await fs2.readJsonAsync(schemaFilePath);
      await db.createTable(table, schema, { dropIfExists: true });
    });
  }

  await pMap(
    tables,
    async (table) => {
      const zst = compressedTables.has(table);
      const filePath = `${inputDirPath}/${table}.ndjson` + (zst ? '.zst' : '');
      const saveOptions = saveOptionsPerTable[table] || {};
      const started = Date.now();
      let rows = 0;
      const sizeBytes = sizeByTable[table];
      console.log(`<< ${grey(filePath)} ${dimWhite(_hb(sizeBytes))} started...`);

      await Pipeline.fromNDJsonFile(filePath)
        .limitSource(limit)
        // Counts rows as read from the file, i.e. before the `sinceUpdated` filter and the mapper.
        .tapSync(() => rows++)
        .logProgress({
          // `opt` is spread after `logEvery`, so the caller may override it; `metric` is always the table.
          logEvery: 1000,
          ...opt,
          metric: table,
        })
        .filterSync(r => !sinceUpdated || r.updated >= sinceUpdated)
        .map(mapperPerTable[table] || _passthroughMapper, {
          errorMode,
          ...transformMapOptions,
          metric: table,
        })
        .flattenIfNeeded()
        .chunk(chunkSize)
        .forEach(async (dbms) => {
          await db.saveBatch(table, dbms, saveOptions);
        });

      const stats = NDJsonStats.create({
        tookMillis: Date.now() - started,
        rows,
        sizeBytes,
      });
      console.log(`<< ${grey(filePath)}\n` + stats.toPretty());
      statsPerTable[table] = stats;
    },
    { concurrency, errorMode },
  );

  const statsTotal = NDJsonStats.createCombined(Object.values(statsPerTable));
  console.log(statsTotal.toPretty('total'));
  return statsTotal;
}