
@naturalcycles/db-lib

Lowest Common Denominator API to supported Databases

245 lines (202 loc) 7.19 kB
import { localTime } from '@naturalcycles/js-lib/datetime/localTime.js'
import { AppError, ErrorMode } from '@naturalcycles/js-lib/error'
import { pMap } from '@naturalcycles/js-lib/promise/pMap.js'
import type { AsyncMapper, StringMap, UnixTimestamp } from '@naturalcycles/js-lib/types'
import { _passthroughMapper } from '@naturalcycles/js-lib/types'
import { boldWhite, dimWhite, grey, yellow } from '@naturalcycles/nodejs-lib/colors'
import { fs2 } from '@naturalcycles/nodejs-lib/fs2'
import type {
  TransformLogProgressOptions,
  TransformMapOptions,
} from '@naturalcycles/nodejs-lib/stream'
import { NDJsonStats } from '@naturalcycles/nodejs-lib/stream'
import type { CommonDB } from '../commondb/common.db.js'
import { DBQuery } from '../query/dbQuery.js'

export interface DBPipelineBackupOptions extends TransformLogProgressOptions {
  /**
   * DB to dump data from.
   */
  db: CommonDB

  /**
   * List of tables to dump. If undefined - will call CommonDB.getTables() and dump ALL tables returned.
   */
  tables?: string[]

  /**
   * How many tables to dump in parallel.
   *
   * @default 16
   * Set to `1` for serial (1 at a time) processing or debugging.
   */
  concurrency?: number

  /**
   * @default ErrorMode.SUPPRESS
   *
   * Used in the high-level pMap(tables, ...).
   * Also used as the default option for TransformMapOptions.
   */
  errorMode?: ErrorMode

  /**
   * @default undefined
   * If set - will dump at most that number of rows per table.
   */
  limit?: number

  /**
   * If set - will do an "incremental backup" (not full), only for entities that were updated >= `sinceUpdated`.
   */
  sinceUpdated?: UnixTimestamp

  /**
   * Maps each table to a `sinceUpdated` timestamp, or `undefined`.
   * If set - will do an "incremental backup" (not full), only for entities that were updated >= `sinceUpdated` (on a per-table basis).
   */
  sinceUpdatedPerTable?: StringMap<UnixTimestamp>

  /**
   * By default, dbPipelineBackup creates a Query based on sinceUpdated.
   * But if queryPerTable is set for a table - it will override the Query that is run for that table
   * (and ignore sinceUpdated, sinceUpdatedPerTable, limit, and any other properties that modify the query).
   */
  queryPerTable?: StringMap<DBQuery<any>>

  /**
   * Directory path to store dumped files. Will create `${tableName}.ndjson` (or .ndjson.zst if zst=true) files.
   * All parent directories will be created.
   *
   * @default process.cwd()
   */
  outputDirPath: string

  /**
   * @default false
   * If true - will fail if the output file already exists.
   */
  protectFromOverwrite?: boolean

  /**
   * Compress as .zst
   * @default true
   */
  zst?: boolean

  /**
   * Only applicable if `gzip` is enabled.
   * Currently not available.
   */
  // zlibOptions?: ZlibOptions

  /**
   * Optionally you can provide a mapper that is going to run for each table.
   *
   * @default `{}`
   * The default mapper is "passthroughMapper" (passes all data as-is).
   */
  mapperPerTable?: StringMap<AsyncMapper>

  /**
   * If defined - it'll use that `logEvery` for that table.
   * Default logEvery is 1000.
   */
  logEveryPerTable?: StringMap<number>

  /**
   * You can alter the default `transformMapOptions` here.
   *
   * @default (see the code)
   * The goal is to have default values that are reasonable for such a job, to provide resilient output (forgiving individual errors).
   * `metric` will be set to the table name.
   */
  transformMapOptions?: TransformMapOptions

  /**
   * @default false
   * If true - will use CommonSchemaGenerator to detect schema from input data.
   */
  // emitSchemaFromData?: boolean

  /**
   * @default false
   * If true - will use CommonDB.getTableSchema() and emit schema.
   */
  emitSchemaFromDB?: boolean
}

/**
 * Pipeline from input stream(s) to an NDJSON file (optionally compressed).
 * File is overwritten (by default).
 * Input stream can be a stream from CommonDB.streamQuery().
 * Allows defining a mapper and a predicate to map/filter objects between input and output.
 * Handles backpressure.
 *
 * Optionally you can provide mapperPerTable and @param transformMapOptions (one for all mappers) - it will run for each table.
 */
export async function dbPipelineBackup(opt: DBPipelineBackupOptions): Promise<NDJsonStats> {
  const {
    db,
    concurrency = 16,
    limit = 0,
    outputDirPath,
    protectFromOverwrite = false,
    mapperPerTable = {},
    queryPerTable = {},
    logEveryPerTable = {},
    transformMapOptions,
    errorMode = ErrorMode.SUPPRESS,
    emitSchemaFromDB = false,
    zst = true,
  } = opt
  let { tables } = opt

  console.log(`>> ${dimWhite('dbPipelineBackup')} started in ${grey(outputDirPath)}...`)

  fs2.ensureDir(outputDirPath)

  tables ||= await db.getTables()

  console.log(`${yellow(tables.length)} ${boldWhite('table(s)')}:\n` + tables.join('\n'))

  const statsPerTable: Record<string, NDJsonStats> = {}

  await pMap(
    tables,
    async table => {
      let q = DBQuery.create<any>(table).limit(limit)

      const sinceUpdated = opt.sinceUpdatedPerTable?.[table] ?? opt.sinceUpdated

      if (sinceUpdated) {
        q = q.filter('updated', '>=', sinceUpdated)
      }

      if (queryPerTable[table]) {
        // Override the Query with this Query, completely ignoring any of the other query-related options
        q = queryPerTable[table]!
        console.log(`>> ${grey(table)} ${q.pretty()}`)
      } else {
        const sinceUpdatedStr = sinceUpdated
          ? ' since ' + grey(localTime(sinceUpdated).toPretty())
          : ''
        console.log(`>> ${grey(table)}${sinceUpdatedStr}`)
      }

      const filePath = `${outputDirPath}/${table}.ndjson` + (zst ? '.zst' : '')
      const schemaFilePath = `${outputDirPath}/${table}.schema.json`

      if (protectFromOverwrite && fs2.pathExists(filePath)) {
        throw new AppError(`dbPipelineBackup: output file exists: ${filePath}`)
      }

      const started = Date.now()
      let rows = 0

      fs2.ensureFile(filePath)
      // console.log(`>> ${grey(filePath)} started...`)

      if (emitSchemaFromDB) {
        const schema = await db.getTableSchema(table)
        await fs2.writeJsonAsync(schemaFilePath, schema, { spaces: 2 })
        console.log(`>> ${grey(schemaFilePath)} saved (generated from DB)`)
      }

      await db
        .streamQuery(q)
        .logProgress({
          ...opt,
          logEvery: logEveryPerTable[table] ?? opt.logEvery ?? 1000,
          metric: table,
        })
        .map(mapperPerTable[table] || _passthroughMapper, {
          errorMode,
          ...transformMapOptions,
          metric: table,
        })
        .flattenIfNeeded()
        .tapSync(() => rows++)
        .toNDJsonFile(filePath)

      const { size: sizeBytes } = await fs2.statAsync(filePath)

      const stats = NDJsonStats.create({
        tookMillis: Date.now() - started,
        rows,
        sizeBytes,
      })

      console.log(`>> ${grey(filePath)}\n` + stats.toPretty())

      statsPerTable[table] = stats
    },
    { concurrency, errorMode },
  )

  const statsTotal = NDJsonStats.createCombined(Object.values(statsPerTable))

  console.log(statsTotal.toPretty('total'))

  return statsTotal
}
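A minimal usage sketch of the function above. The exact export paths for `dbPipelineBackup` and a concrete `CommonDB` implementation are assumptions (they may differ between versions of the package); the options used are the ones defined in DBPipelineBackupOptions.

// Assumed import paths - adjust to the actual package exports
import { dbPipelineBackup } from '@naturalcycles/db-lib'
import { InMemoryDB } from '@naturalcycles/db-lib/adapter/inmemory' // hypothetical CommonDB implementation

const db = new InMemoryDB()

const stats = await dbPipelineBackup({
  db,
  outputDirPath: './backup', // one `${table}.ndjson` file per table is written here
  concurrency: 2, // dump 2 tables at a time instead of the default 16
  zst: false, // write plain .ndjson for easier inspection
  sinceUpdated: 1700000000 as any, // incremental backup: only rows with updated >= this UnixTimestamp
})

console.log(stats.toPretty('total'))

Any CommonDB implementation (in-memory, Datastore, SQL, etc.) can be passed as `db`, since the pipeline only relies on the lowest-common-denominator methods getTables(), getTableSchema() and streamQuery().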