/*
 * mongo2crate
 * Version:
 * Sync MongoDB to CrateDB and Convert JSON schema to SQL DDL
 * 174 lines (173 loc) • 6.94 kB
 * JavaScript
 */
import _debug from 'debug';
import _ from 'lodash/fp.js';
import * as mongoChangeStream from 'mongochangestream';
import { renameKeys } from 'mongochangestream';
import { mapLeaves } from 'obj-walker';
import { convertSchema } from './convertSchema.js';
import { getFailedRecords, partitionEvents, setDefaults, sumByRowcount, } from './util.js';
// Module-level debug logger created with the 'mongo2crate:sync' namespace.
const debug = _debug('mongo2crate:sync');
/**
 * Log the given error and rethrow it unless it is a duplicate-key error.
 *
 * A duplicate-key error can occur when a document is updated and handled
 * by processChangeStream before runInitialScan has processed the record:
 * DuplicateKeyException[A document with the same primary key exists already]
 *
 * @param {*} error - The caught value; its `message` (if any) is inspected.
 * @throws {*} The same `error` value when its message does not mention
 *   `DuplicateKeyException`.
 */
const maybeThrow = (error) => {
  debug('Error caught %O', error);
  const isDuplicateKey = error?.message?.includes('DuplicateKeyException');
  if (isDuplicateKey) {
    // Benign race between the initial scan and the change stream; ignore.
    return;
  }
  throw error;
};
/**
 * Set up syncing of a MongoDB collection into a CrateDB table.
 *
 * Creates the underlying sync via `mongoChangeStream.initSync(redis,
 * collection, options)` and supplies record processors that write to CrateDB
 * through the given `crate` client.
 *
 * @param redis - Passed through to `mongoChangeStream.initSync`.
 * @param collection - MongoDB collection. Its `collectionName`, lowercased,
 *   is the default table name.
 * @param crate - CrateDB client. The methods used here are `query`, `insert`,
 *   `upsert`, `deleteById` and `bulkInsert`.
 * @param [options] - Also forwarded to `mongoChangeStream.initSync`.
 * @param [options.mapper] - Leaf mapper applied in place to every document
 *   via `mapLeaves`.
 * @param [options.rename] - Map of keys to rename; `_id` is always renamed
 *   to `id`.
 * @param [options.schemaName] - Schema name; falls back to `'doc'`.
 * @param [options.tableName] - Table name; falls back to the lowercased
 *   collection name.
 * @returns The object returned by `mongoChangeStream.initSync`, extended with
 *   `processChangeStream`, `runInitialScan`, `createTableFromSchema`,
 *   `schemaName`, `tableName`, `qualifiedName` and `emitter`.
 */
export const initSync = (redis, collection, crate, options = {}) => {
  /**
   * Prepare a document for CrateDB and return it. The optional leaf mapper is
   * applied in place. The return value of `renameKeys` is not used, so it is
   * presumably expected to rename in place as well.
   * NOTE(review): confirm against mongochangestream.
   */
  const mapper = (doc) => {
    if (options.mapper) {
      mapLeaves(doc, options.mapper, { modifyInPlace: true });
    }
    renameKeys(doc, { ...options.rename, _id: 'id' });
    debug('Mapped doc %o', doc);
    return doc;
  };
  const schemaName = options.schemaName || 'doc';
  const tableName =
    options.tableName || collection.collectionName.toLowerCase();
  const qualifiedName = `"${schemaName}"."${tableName}"`;
  // Initialize sync
  const sync = mongoChangeStream.initSync(redis, collection, options);
  // Use emitter from mongochangestream
  const emitter = sync.emitter;
  /** Emit `event` with a payload that also carries the event name as `type`. */
  const emit = (event, data) => {
    emitter.emit(event, { type: event, ...data });
  };
  /**
   * Convert the given JSON schema to a CREATE TABLE statement for the
   * qualified table name and run it through `crate.query`.
   */
  const createTableFromSchema = async (jsonSchema, schemaOptions = {}) => {
    const createTableStmt = convertSchema(jsonSchema, qualifiedName, {
      omit: schemaOptions.omit,
      ...schemaOptions,
    });
    return crate.query(createTableStmt);
  };
  /**
   * Emit a `process` event for a result carrying `rowcount`; otherwise hand
   * `result.error` to `maybeThrow`.
   */
  const handleChangeStreamResult = (result, operationType) => {
    debug('Change stream result %O', result);
    if ('rowcount' in result) {
      const event = {
        success: result.rowcount,
        fail: 0,
        changeStream: true,
        operationCounts: { [operationType]: 1 },
      };
      emit('process', event);
    } else {
      maybeThrow(result.error);
    }
  };
  /**
   * Apply a single change stream event (insert, update, replace or delete)
   * to CrateDB. Other operation types are ignored.
   */
  const processChangeStreamRecord = async (doc) => {
    try {
      const { operationType } = doc;
      if (operationType === 'insert') {
        const document = mapper(doc.fullDocument);
        const result = await crate.insert(qualifiedName, document);
        handleChangeStreamResult(result, operationType);
      } else if (operationType === 'update') {
        const document = doc.fullDocument ? mapper(doc.fullDocument) : {};
        const { updatedFields, removedFields } = doc.updateDescription;
        // Removed fields are written as explicit nulls.
        const removed = removedFields && setDefaults(removedFields, null);
        const update = mapper({ ...updatedFields, ...removed });
        if (_.size(update)) {
          const result = await crate.upsert(qualifiedName, document, update);
          handleChangeStreamResult(result, operationType);
        }
      } else if (operationType === 'replace') {
        const id = doc.documentKey._id.toString();
        // Delete
        const deleteResult = await crate.deleteById(qualifiedName, id);
        // A failed delete must not be ignored: the insert below would then
        // fail with DuplicateKeyException, which maybeThrow swallows, and the
        // replacement would be lost silently.
        if (deleteResult?.error != null) {
          throw deleteResult.error;
        }
        // Insert
        const document = mapper(doc.fullDocument);
        const result = await crate.insert(qualifiedName, document);
        handleChangeStreamResult(result, operationType);
      } else if (operationType === 'delete') {
        const id = doc.documentKey._id.toString();
        const result = await crate.deleteById(qualifiedName, id);
        handleChangeStreamResult(result, operationType);
      }
    } catch (e) {
      maybeThrow(e);
    }
  };
  /**
   * Process change stream events.
   *
   * Events are applied one at a time, in order. Callers in this module pass
   * a single event per batch, but any batch size (including empty) is handled.
   */
  const processChangeStreamRecords = async (docs) => {
    for (const doc of docs) {
      await processChangeStreamRecord(doc);
    }
  };
  /**
   * Process insert documents in bulk.
   *
   * @param docs - Records whose `fullDocument` is inserted.
   * @param [type='initialScan'] - Key set to `true` on the emitted event to
   *   mark where the batch came from.
   */
  const processInsertRecords = async (docs, type = 'initialScan') => {
    try {
      const documents = docs.map(({ fullDocument }) => mapper(fullDocument));
      const result = await crate.bulkInsert(qualifiedName, documents);
      debug('Bulk insert result %O', result);
      if ('results' in result) {
        // 1 indicates success
        const numInserted = sumByRowcount(1)(result.results);
        // -2 indicates failure
        const numFailed = sumByRowcount(-2)(result.results);
        const failedDocs = getFailedRecords(result.results, docs);
        const event = {
          success: numInserted,
          fail: numFailed,
          // Only include failedDocs when there is at least one.
          ...(failedDocs.length && { failedDocs }),
          [type]: true,
          operationCounts: { insert: docs.length },
        };
        emit('process', event);
      }
      if ('error' in result) {
        maybeThrow(result.error);
      }
    } catch (e) {
      maybeThrow(e);
    }
  };
  /**
   * Start processing the change stream. With `options.autoOptimizeInserts`,
   * each batch is split by `partitionEvents` and multi-event partitions are
   * bulk inserted; otherwise events are handled one at a time.
   */
  const processChangeStream = (options) =>
    options?.autoOptimizeInserts
      ? sync.processChangeStream(async (docs) => {
          const partitions = partitionEvents(docs);
          for (const partition of partitions) {
            // We have more than one event so this is a grouped set of inserts
            if (partition.length > 1) {
              debug(
                'Change stream insert batch of length %d',
                partition.length
              );
              await processInsertRecords(
                // We know these are going to be insert events
                partition,
                'changeStream'
              );
            } else {
              await processChangeStreamRecords(partition);
            }
          }
        }, options)
      : sync.processChangeStream(processChangeStreamRecords, {
          ...options,
          // We can only handle one record at a time
          batchSize: 1,
        });
  /** Run the initial scan, bulk inserting each batch of records. */
  const runInitialScan = (options) =>
    sync.runInitialScan(processInsertRecords, options);
  return {
    ...sync,
    /**
     * Process MongoDB change stream for the given collection.
     */
    processChangeStream,
    /**
     * Run initial collection scan. `options.batchSize` defaults to 500.
     * Sorting defaults to `_id`.
     */
    runInitialScan,
    /**
     * Convert the given JSON schema to CrateDB table DDL.
     */
    createTableFromSchema,
    schemaName,
    tableName,
    qualifiedName,
    emitter,
  };
};