UNPKG

@naturalcycles/db-lib

Version:

Lowest Common Denominator API to supported Databases

100 lines (99 loc) 4.42 kB
import { _hb } from '@naturalcycles/js-lib';
import { localTime } from '@naturalcycles/js-lib/datetime/localTime.js';
import { ErrorMode } from '@naturalcycles/js-lib/error/errorMode.js';
import { _mapValues } from '@naturalcycles/js-lib/object';
import { pMap } from '@naturalcycles/js-lib/promise/pMap.js';
import { _passthroughMapper } from '@naturalcycles/js-lib/types';
import { boldWhite, dimWhite, grey, yellow } from '@naturalcycles/nodejs-lib/colors';
import { fs2 } from '@naturalcycles/nodejs-lib/fs2';
import { NDJsonStats, Pipeline } from '@naturalcycles/nodejs-lib/stream';

/**
 * Restores tables into a CommonDB from NDJSON files located in a folder.
 *
 * Every `<table>.ndjson` or `<table>.ndjson.zst` file found in `opt.inputDirPath` is treated
 * as one table. When `opt.tables` is given, only the listed tables are restored.
 *
 * Per table, rows are read via `Pipeline.fromNDJsonFile`, capped by `opt.limit`, filtered by
 * `opt.sinceUpdated` (rows whose `updated` is >= that value are kept), passed through
 * `opt.mapperPerTable[table]` (or a passthrough mapper when none is defined), grouped into
 * chunks of `opt.chunkSize` and written with `db.saveBatch`.
 *
 * `opt.transformMapOptions` is shared by the mappers of all tables.
 *
 * When `opt.recreateTables` is true, each table that has a `<table>.schema.json` file next to
 * its data file is (re)created via `db.createTable(..., { dropIfExists: true })` before any
 * rows are saved; tables without a schema file only produce a warning.
 *
 * Defaults: `concurrency` 16, `chunkSize` 100, `errorMode` `ErrorMode.SUPPRESS`,
 * `recreateTables` false.
 *
 * @param opt restore options (see above)
 * @returns combined NDJsonStats over all restored tables
 */
export async function dbPipelineRestore(opt) {
  const {
    db,
    concurrency = 16,
    chunkSize = 100,
    limit,
    sinceUpdated,
    inputDirPath,
    mapperPerTable = {},
    saveOptionsPerTable = {},
    transformMapOptions,
    errorMode = ErrorMode.SUPPRESS,
    recreateTables = false,
  } = opt;

  const plainExt = '.ndjson';
  const zstExt = '.ndjson.zst';

  // Optional allow-list of table names; stays falsy when `opt.tables` is not provided
  const onlyTables = opt.tables ? new Set(opt.tables) : undefined;

  let sinceLabel = '';
  if (sinceUpdated) {
    sinceLabel = ` since ${grey(localTime(sinceUpdated).toPretty())}`;
  }
  console.log(`>> ${dimWhite('dbPipelineRestore')} started in ${grey(inputDirPath)}...${sinceLabel}`);

  fs2.ensureDir(inputDirPath);

  const zstTables = new Set(); // tables whose data file carries the `.zst` suffix
  const sizeByTable = {};
  const statsPerTable = {};
  const tables = [];

  // Discover tables from the file names in the input folder
  for (const fileName of fs2.readdir(inputDirPath)) {
    const isZst = fileName.endsWith(zstExt);
    if (!isZst && !fileName.endsWith(plainExt)) continue; // not a data file

    const ext = isZst ? zstExt : plainExt;
    const table = fileName.slice(0, fileName.length - ext.length);
    if (onlyTables && !onlyTables.has(table)) continue; // skip table

    tables.push(table);
    if (isZst) {
      zstTables.add(table);
    }
    sizeByTable[table] = fs2.stat(`${inputDirPath}/${fileName}`).size;
  }

  const prettySizeByTable = _mapValues(sizeByTable, (_k, bytes) => _hb(bytes));
  console.log(`${yellow(tables.length)} ${boldWhite('table(s)')}:\n`, prettySizeByTable);

  if (recreateTables) {
    await pMap(tables, async (table) => {
      const schemaFilePath = `${inputDirPath}/${table}.schema.json`;
      if (fs2.pathExists(schemaFilePath)) {
        const schema = await fs2.readJsonAsync(schemaFilePath);
        await db.createTable(table, schema, { dropIfExists: true });
      } else {
        console.warn(`${schemaFilePath} does not exist!`);
      }
    });
  }

  await pMap(
    tables,
    async (table) => {
      const fileExt = zstTables.has(table) ? zstExt : plainExt;
      const filePath = `${inputDirPath}/${table}${fileExt}`;
      const saveOptions = saveOptionsPerTable[table] || {};
      const started = Date.now();
      let rows = 0;
      const sizeBytes = sizeByTable[table];

      console.log(`<< ${grey(filePath)} ${dimWhite(_hb(sizeBytes))} started...`);

      // Rows pass when no `sinceUpdated` is set, or when they are at least that recent
      const isRecentEnough = (row) => !sinceUpdated || row.updated >= sinceUpdated;
      const saveChunk = async (dbms) => {
        await db.saveBatch(table, dbms, saveOptions);
      };

      const pipeline = Pipeline.fromNDJsonFile(filePath)
        .limitSource(limit)
        .tapSync(() => rows++) // counts rows before the sinceUpdated filter is applied
        .logProgress({
          logEvery: 1000,
          ...opt,
          metric: table,
        })
        .filterSync(isRecentEnough)
        .map(mapperPerTable[table] || _passthroughMapper, {
          errorMode,
          ...transformMapOptions,
          metric: table,
        })
        .flattenIfNeeded()
        .chunk(chunkSize);

      await pipeline.forEach(saveChunk);

      const tableStats = NDJsonStats.create({
        tookMillis: Date.now() - started,
        rows,
        sizeBytes,
      });
      console.log(`<< ${grey(filePath)}\n` + tableStats.toPretty());
      statsPerTable[table] = tableStats;
    },
    { concurrency, errorMode },
  );

  const statsTotal = NDJsonStats.createCombined(Object.values(statsPerTable));
  console.log(statsTotal.toPretty('total'));
  return statsTotal;
}