UNPKG

mongo2crate

Version:

Sync MongoDB to CrateDB and Convert JSON schema to SQL DDL

174 lines (173 loc) 6.94 kB
import _debug from 'debug';
import _ from 'lodash/fp.js';
import * as mongoChangeStream from 'mongochangestream';
import { renameKeys } from 'mongochangestream';
import { mapLeaves } from 'obj-walker';
import { convertSchema } from './convertSchema.js';
import {
  getFailedRecords,
  partitionEvents,
  setDefaults,
  sumByRowcount,
} from './util.js';

const debug = _debug('mongo2crate:sync');

/**
 * Log the given error and rethrow it, unless its message mentions
 * `DuplicateKeyException`, in which case it is only logged.
 *
 * A duplicate key can occur when a document is updated and handled by
 * processChangeStream before runInitialScan has processed the record:
 * DuplicateKeyException[A document with the same primary key exists already]
 *
 * @param {*} error - The caught error, or the `error` field of a Crate result.
 * @throws Rethrows `error` as-is when it is not a duplicate-key error.
 */
const maybeThrow = (error) => {
  debug('Error caught %O', error);
  const isDuplicateKey = error?.message?.includes('DuplicateKeyException');
  if (isDuplicateKey) {
    return;
  }
  throw error;
};

/**
 * Set up syncing of a MongoDB collection into a CrateDB table.
 *
 * Wraps `mongoChangeStream.initSync` and returns its result extended with
 * Crate-specific `processChangeStream`, `runInitialScan` and
 * `createTableFromSchema` functions plus the resolved table names.
 *
 * @param redis - Passed through to `mongoChangeStream.initSync`.
 * @param collection - MongoDB collection; its lowercased `collectionName` is
 *   the default table name.
 * @param crate - Crate client; this module calls `query`, `insert`, `upsert`,
 *   `deleteById` and `bulkInsert` on it.
 * @param {object} [options] - Reads `mapper`, `rename`, `schemaName` and
 *   `tableName`; the whole object is also forwarded to
 *   `mongoChangeStream.initSync`.
 * @returns {object} The underlying sync object's properties, overridden and
 *   extended with the members documented at the return statement.
 */
export const initSync = (redis, collection, crate, options = {}) => {
  /**
   * Prepare a MongoDB document for Crate: apply `options.mapper` to leaf
   * values (when provided), then rename keys per `options.rename`, always
   * mapping `_id` to `id`. The same object is returned.
   * NOTE(review): relies on `renameKeys` modifying `doc` in place, since its
   * return value is discarded - confirm against mongochangestream.
   */
  const mapper = (doc) => {
    const leafMapper = options.mapper;
    if (leafMapper) {
      mapLeaves(doc, leafMapper, { modifyInPlace: true });
    }
    const renames = { ...options.rename, _id: 'id' };
    renameKeys(doc, renames);
    debug('Mapped doc %o', doc);
    return doc;
  };

  const schemaName = options.schemaName || 'doc';
  const tableName =
    options.tableName || collection.collectionName.toLowerCase();
  const qualifiedName = `"${schemaName}"."${tableName}"`;

  // Initialize sync
  const sync = mongoChangeStream.initSync(redis, collection, options);
  // Use emitter from mongochangestream
  const { emitter } = sync;

  /** Emit `event` on the shared emitter, tagging the payload with `type`. */
  const emit = (event, data) => {
    const payload = { type: event, ...data };
    emitter.emit(event, payload);
  };

  /**
   * Convert the given JSON schema to a CREATE TABLE statement for the
   * qualified table name and run it against Crate.
   */
  const createTableFromSchema = async (jsonSchema, schemaOptions = {}) => {
    const convertOptions = { omit: schemaOptions.omit, ...schemaOptions };
    const ddl = convertSchema(jsonSchema, qualifiedName, convertOptions);
    return crate.query(ddl);
  };

  /**
   * Handle the result of a single change stream operation: emit a `process`
   * event when the result carries a `rowcount`, otherwise hand the result's
   * `error` to `maybeThrow`.
   */
  const handleChangeStreamResult = (result, operationType) => {
    debug('Change stream result %O', result);
    if (!('rowcount' in result)) {
      maybeThrow(result.error);
      return;
    }
    emit('process', {
      success: result.rowcount,
      fail: 0,
      changeStream: true,
      operationCounts: { [operationType]: 1 },
    });
  };

  /**
   * Process change stream events.
   */
  const processChangeStreamRecords = async (docs) => {
    // Assume batchSize is always 1
    const doc = docs[0];
    try {
      const operationType = doc.operationType;
      switch (operationType) {
        case 'insert': {
          const row = mapper(doc.fullDocument);
          const result = await crate.insert(qualifiedName, row);
          handleChangeStreamResult(result, operationType);
          break;
        }
        case 'update': {
          const row = doc.fullDocument ? mapper(doc.fullDocument) : {};
          const { updatedFields, removedFields } = doc.updateDescription;
          // Removed fields are written as explicit nulls
          const nulled = removedFields && setDefaults(removedFields, null);
          const changes = mapper({ ...updatedFields, ...nulled });
          // Nothing to write when the update touched no fields
          if (_.size(changes)) {
            const result = await crate.upsert(qualifiedName, row, changes);
            handleChangeStreamResult(result, operationType);
          }
          break;
        }
        case 'replace': {
          const id = doc.documentKey._id;
          // Delete
          await crate.deleteById(qualifiedName, id.toString());
          // Insert
          const row = mapper(doc.fullDocument);
          const result = await crate.insert(qualifiedName, row);
          handleChangeStreamResult(result, operationType);
          break;
        }
        case 'delete': {
          const id = doc.documentKey._id;
          const result = await crate.deleteById(qualifiedName, id.toString());
          handleChangeStreamResult(result, operationType);
          break;
        }
        default:
          // Other operation types are ignored
          break;
      }
    } catch (e) {
      maybeThrow(e);
    }
  };

  /**
   * Process insert documents in bulk.
   */
  const processInsertRecords = async (docs, type = 'initialScan') => {
    try {
      const rows = docs.map(({ fullDocument }) => mapper(fullDocument));
      const result = await crate.bulkInsert(qualifiedName, rows);
      debug('Bulk insert result %O', result);
      if ('results' in result) {
        const { results } = result;
        // 1 indicates success
        const numInserted = sumByRowcount(1)(results);
        // -2 indicates failure
        const numFailed = sumByRowcount(-2)(results);
        const failedDocs = getFailedRecords(results, docs);
        emit('process', {
          success: numInserted,
          fail: numFailed,
          // Only attach failedDocs when there is at least one
          ...(failedDocs.length ? { failedDocs } : {}),
          [type]: true,
          operationCounts: { insert: docs.length },
        });
      }
      if ('error' in result) {
        maybeThrow(result.error);
      }
    } catch (e) {
      maybeThrow(e);
    }
  };

  /**
   * Route a batch of change stream events: each partition produced by
   * `partitionEvents` is bulk inserted when it holds more than one event,
   * otherwise it is processed as a single change stream record.
   */
  const processPartitionedEvents = async (docs) => {
    for (const partition of partitionEvents(docs)) {
      // We have more than one event so this is a grouped set of inserts
      if (partition.length > 1) {
        debug('Change stream insert batch of length %d', partition.length);
        // We know these are going to be insert events
        await processInsertRecords(partition, 'changeStream');
      } else {
        await processChangeStreamRecords(partition);
      }
    }
  };

  const processChangeStream = (options) => {
    if (options?.autoOptimizeInserts) {
      return sync.processChangeStream(processPartitionedEvents, options);
    }
    return sync.processChangeStream(processChangeStreamRecords, {
      ...options,
      // We can only handle one record at a time
      batchSize: 1,
    });
  };

  const runInitialScan = (options) =>
    sync.runInitialScan(processInsertRecords, options);

  return {
    ...sync,
    /**
     * Process MongoDB change stream for the given collection.
     */
    processChangeStream,
    /**
     * Run initial collection scan. `options.batchSize` defaults to 500.
     * Sorting defaults to `_id`.
     */
    runInitialScan,
    /**
     * Convert the given JSON schema to CrateDB table DDL.
     */
    createTableFromSchema,
    schemaName,
    tableName,
    qualifiedName,
    emitter,
  };
};