
@naturalcycles/db-lib

Lowest Common Denominator API to supported Databases

import { _hb } from '@naturalcycles/js-lib'
import { localTime } from '@naturalcycles/js-lib/datetime/localTime.js'
import { ErrorMode } from '@naturalcycles/js-lib/error/errorMode.js'
import { _mapValues } from '@naturalcycles/js-lib/object'
import { pMap } from '@naturalcycles/js-lib/promise/pMap.js'
import type { AsyncMapper, BaseDBEntity, UnixTimestamp } from '@naturalcycles/js-lib/types'
import { _passthroughMapper } from '@naturalcycles/js-lib/types'
import type { JsonSchema } from '@naturalcycles/nodejs-lib/ajv'
import { boldWhite, dimWhite, grey, yellow } from '@naturalcycles/nodejs-lib/colors'
import { fs2 } from '@naturalcycles/nodejs-lib/fs2'
import {
  NDJsonStats,
  Pipeline,
  type TransformLogProgressOptions,
  type TransformMapOptions,
} from '@naturalcycles/nodejs-lib/stream'
import type { CommonDB } from '../commondb/common.db.js'
import type { CommonDBSaveOptions } from '../db.model.js'

export interface DBPipelineRestoreOptions extends TransformLogProgressOptions {
  /**
   * DB to save data to.
   */
  db: CommonDB

  /**
   * Directory path to read dumped files from. Expects `${tableName}.ndjson` (or `.ndjson.zst`) files.
   */
  inputDirPath: string

  /**
   * List of tables to restore. If undefined - will restore all files that end with the `.ndjson` (or `.ndjson.zst`) extension.
   */
  tables?: string[]

  /**
   * How many tables to restore in parallel.
   *
   * @default 16
   * Set to `1` for serial (1 at a time) processing or debugging.
   */
  concurrency?: number

  /**
   * @default 100
   *
   * Determines the size of .saveBatch()
   */
  chunkSize?: number

  /**
   * @default ErrorMode.SUPPRESS
   *
   * Used in the high-level pMap(tables, ...)
   * Also used as the default option for TransformMapOptions
   */
  errorMode?: ErrorMode

  /**
   * @default undefined
   * If set - will read at most that number of rows per table
   */
  limit?: number

  /**
   * If set - will do an "incremental restore" (not full), only for entities with `updated` >= `sinceUpdated`
   *
   * @default undefined
   */
  sinceUpdated?: UnixTimestamp

  /**
   * @default false
   * If true - will read `${table}.schema.json` files and recreate tables before importing.
   * Caution! Will do `drop table if exists`!!!
   */
  recreateTables?: boolean

  /**
   * Optionally you can provide a mapper that is going to run for each table.
   *
   * @default `{}`
   * The default mapper is "passthroughMapper" (passes all data as-is).
   */
  mapperPerTable?: Record<string, AsyncMapper>

  /**
   * You can alter the default `transformMapOptions` here.
   *
   * @default (see the code)
   * The goal is to have default values that are reasonable for such a job and provide resilient output (forgiving individual errors).
   * `metric` will be set to the table name.
   */
  transformMapOptions?: TransformMapOptions

  saveOptionsPerTable?: Record<string, CommonDBSaveOptions<any>>
}

/**
 * Pipeline from NDJSON files in a folder (optionally zstd-compressed) to CommonDB.
 * Allows to define a mapper and a predicate to map/filter objects between input and output.
 * Handles backpressure.
 *
 * Optionally you can provide `mapperPerTable` and `transformMapOptions` (one for all mappers) - they will be applied per table.
 */
export async function dbPipelineRestore(opt: DBPipelineRestoreOptions): Promise<NDJsonStats> {
  const {
    db,
    concurrency = 16,
    chunkSize = 100,
    limit,
    sinceUpdated,
    inputDirPath,
    mapperPerTable = {},
    saveOptionsPerTable = {},
    transformMapOptions,
    errorMode = ErrorMode.SUPPRESS,
    recreateTables = false,
  } = opt
  const onlyTables = opt.tables && new Set(opt.tables)

  const sinceUpdatedStr = sinceUpdated ? ' since ' + grey(localTime(sinceUpdated).toPretty()) : ''

  console.log(
    `>> ${dimWhite('dbPipelineRestore')} started in ${grey(inputDirPath)}...${sinceUpdatedStr}`,
  )

  fs2.ensureDir(inputDirPath)

  const tablesToCompress = new Set<string>()
  const sizeByTable: Record<string, number> = {}
  const statsPerTable: Record<string, NDJsonStats> = {}
  const tables: string[] = []

  // Discover `${table}.ndjson` / `${table}.ndjson.zst` input files and their sizes
  fs2.readdir(inputDirPath).forEach(f => {
    let table: string
    let zst = false

    if (f.endsWith('.ndjson')) {
      table = f.slice(0, f.length - '.ndjson'.length)
    } else if (f.endsWith('.ndjson.zst')) {
      table = f.slice(0, f.length - '.ndjson.zst'.length)
      zst = true
    } else {
      return
    }

    if (onlyTables && !onlyTables.has(table)) return // skip table

    tables.push(table)
    if (zst) tablesToCompress.add(table)
    sizeByTable[table] = fs2.stat(`${inputDirPath}/${f}`).size
  })

  const sizeStrByTable = _mapValues(sizeByTable, (_k, b) => _hb(b))

  console.log(`${yellow(tables.length)} ${boldWhite('table(s)')}:\n`, sizeStrByTable)

  // const schemaByTable: Record<string, CommonSchema> = {}

  // Optionally recreate tables from `${table}.schema.json` before importing
  if (recreateTables) {
    await pMap(tables, async table => {
      const schemaFilePath = `${inputDirPath}/${table}.schema.json`
      if (!fs2.pathExists(schemaFilePath)) {
        console.warn(`${schemaFilePath} does not exist!`)
        return
      }

      const schema = await fs2.readJsonAsync<JsonSchema<any>>(schemaFilePath)
      await db.createTable(table, schema, { dropIfExists: true })
    })
  }

  // Restore each table: stream NDJSON rows, filter/map them, and save in chunks
  await pMap(
    tables,
    async table => {
      const zst = tablesToCompress.has(table)
      const filePath = `${inputDirPath}/${table}.ndjson` + (zst ? '.zst' : '')
      const saveOptions: CommonDBSaveOptions<any> = saveOptionsPerTable[table] || {}

      const started = Date.now()
      let rows = 0

      const sizeBytes = sizeByTable[table]

      console.log(`<< ${grey(filePath)} ${dimWhite(_hb(sizeBytes))} started...`)

      await Pipeline.fromNDJsonFile<BaseDBEntity>(filePath)
        .limitSource(limit)
        .tapSync(() => rows++)
        .logProgress({
          logEvery: 1000,
          ...opt,
          metric: table,
        })
        .filterSync(r => !sinceUpdated || r.updated >= sinceUpdated)
        .map(mapperPerTable[table] || _passthroughMapper, {
          errorMode,
          ...transformMapOptions,
          metric: table,
        })
        .flattenIfNeeded()
        .chunk(chunkSize)
        .forEach(async dbms => {
          await db.saveBatch(table, dbms, saveOptions)
        })

      const stats = NDJsonStats.create({
        tookMillis: Date.now() - started,
        rows,
        sizeBytes,
      })

      console.log(`<< ${grey(filePath)}\n` + stats.toPretty())

      statsPerTable[table] = stats
    },
    { concurrency, errorMode },
  )

  const statsTotal = NDJsonStats.createCombined(Object.values(statsPerTable))

  console.log(statsTotal.toPretty('total'))

  return statsTotal
}
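
For context, here is a minimal usage sketch (not part of the file above). The import paths from the package root are an assumption and may differ per version; `db` stands for whatever CommonDB implementation you have configured, and `./backup` and the `USERS` table are hypothetical names for a directory of `${table}.ndjson` files produced by the corresponding backup pipeline.

import { ErrorMode } from '@naturalcycles/js-lib/error/errorMode.js'
// Assumed export path - adjust to this package version's actual entry points
import { dbPipelineRestore, type CommonDB } from '@naturalcycles/db-lib'

// `db` is whatever CommonDB implementation you use (in-memory, Datastore adapter, SQL adapter, ...)
declare const db: CommonDB

const stats = await dbPipelineRestore({
  db,
  inputDirPath: './backup', // hypothetical dir with `${table}.ndjson` (or `.ndjson.zst`) files
  tables: ['USERS'], // hypothetical table name; omit to restore every .ndjson file found
  concurrency: 4, // restore up to 4 tables in parallel
  chunkSize: 500, // rows per saveBatch() call
  errorMode: ErrorMode.THROW_IMMEDIATELY, // fail fast instead of the default SUPPRESS
  recreateTables: false,
})

console.log(stats.toPretty('total'))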