/*
 * @flatfile/implementation-utils-dedupe-worker
 * Version: (not recorded)
 * Provides functionality for easily implementing record deduplication logic.
 * 149 lines (144 loc) • 5.91 kB
 * JavaScript
 */
//#region rolldown:runtime
// Short aliases for the built-ins used by the interop helpers below.
var {
  create: __create,
  defineProperty: __defProp,
  getOwnPropertyDescriptor: __getOwnPropDesc,
  getOwnPropertyNames: __getOwnPropNames,
  getPrototypeOf: __getProtoOf
} = Object;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Mirrors the own properties of `from` onto `to` as getter-only accessors.
 *
 * Properties already owned by `to`, and the one named by `except`, are skipped.
 * Each accessor reads through to `from`, so later changes on `from` stay visible.
 *
 * @param to - Object receiving the accessors.
 * @param from - Source object or function; any other value leaves `to` untouched.
 * @param except - Optional property name to leave out.
 * @param desc - Scratch slot for the current property descriptor; callers do not pass it.
 * @returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = (from && typeof from === "object") || typeof from === "function";
  if (!isCopyable) return to;
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) continue;
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      // Keep the source's enumerability; default to enumerable when no descriptor is found.
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wraps a required module in an object shaped like an ES module namespace.
 *
 * A `default` property holding `mod` itself is added when `isNodeMode` is truthy,
 * when `mod` is falsy, or when `mod` is not flagged with `__esModule`.
 *
 * @param mod - The value returned by `require`.
 * @param isNodeMode - Forces the `default` property even for `__esModule` modules.
 * @param target - Scratch slot for the wrapper object; callers do not pass it.
 * @returns The wrapper, whose prototype is `mod`'s prototype when `mod` is not null/undefined.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  if (needsDefault) {
    __defProp(target, "default", {
      value: mod,
      enumerable: true
    });
  }
  return __copyProps(target, mod);
};
//#endregion
const __flatfile_implementation_utils_job_worker = __toESM(require("@flatfile/implementation-utils-job-worker"));
//#region src/dedupe.ts
/**
 * Orders records from least to most conflicted within the given set.
 *
 * For every record, counts how many other records (compared by `id`) it reports
 * a conflict with via `hasConflict`, then sorts ascending by that count.
 *
 * @param records - The records to rank.
 * @returns A new array of `{ record, conflicts }` entries, fewest conflicts first.
 */
const rankRecordsByConflicts = (records) => {
  const countConflictsFor = (candidate) => {
    let total = 0;
    for (const other of records) {
      // A record is never compared against itself (or anything sharing its id).
      if (candidate.id === other.id) continue;
      if (candidate.hasConflict(other)) total += 1;
    }
    return total;
  };
  const ranked = records.map((candidate) => ({
    record: candidate,
    conflicts: countConflictsFor(candidate)
  }));
  ranked.sort((left, right) => left.conflicts - right.conflicts);
  return ranked;
};
/**
 * Sorts a collection of records into pristine, merged, duplicate and conflicting buckets.
 *
 * Records are grouped by `record.hash(...dedupeKeys)`. A group holding a single record is
 * pristine. For a larger group, the record with the fewest conflicts is repeatedly chosen
 * as the base; every other record that neither conflicts with the base nor is rejected by
 * `isMergeable` is merged into the base and deleted.
 *
 * @param records - Collection of records to deduplicate; must provide `groupBy`.
 * @param dedupeKeys - Array of field keys passed to `record.hash` to build the grouping key.
 * @param isMergeable - Optional predicate called as `isMergeable(source, target)`; a falsy
 *   result keeps `source` from being merged into `target`.
 * @returns Object with `pristineRecords`, `conflictingRecords`, `duplicateRecords` and
 *   `mergedRecords` arrays.
 */
const deduplicateRecords = (records, dedupeKeys, isMergeable) => {
  const pristineRecords = [];
  const conflictingRecords = [];
  const duplicateRecords = [];
  const mergedRecords = [];

  const groups = records.groupBy((record) => record.hash(...dedupeKeys));
  groups.each((group) => {
    let pending = [...group.all()];

    // A hash shared with nobody else: nothing to dedupe.
    if (pending.length === 1) {
      const [only] = pending;
      if (only) pristineRecords.push(only);
      return;
    }

    while (pending.length > 0) {
      const [top] = rankRecordsByConflicts(pending);
      const base = top?.record;

      // When even the least-conflicted record conflicts with every other one,
      // nothing left in this group can be merged.
      if (!base || (top && top.conflicts === pending.length - 1)) {
        conflictingRecords.push(...pending);
        break;
      }

      const others = pending.filter((candidate) => candidate.id !== base.id);
      const stillConflicting = [];
      const vetoed = [];
      let mergedAny = false;

      // Order matters: each merge changes `base` before the next conflict check.
      for (const other of others) {
        if (base.hasConflict(other)) {
          stillConflicting.push(other);
          continue;
        }
        if (isMergeable && !isMergeable(other, base)) {
          vetoed.push(other);
          continue;
        }
        mergedAny = true;
        base.merge(other);
        other.delete();
        duplicateRecords.push(other);
      }

      (mergedAny ? mergedRecords : pristineRecords).push(base);

      if (vetoed.length > 0) {
        // NOTE(review): with `stillConflicting` empty this check is vacuously false, and with it
        // non-empty the length test below already fails, so the check never changes the outcome.
        // Kept as written; confirm the intended condition.
        const anyVetoedConflicts = vetoed.some((blocked) => stillConflicting.some((remaining) => blocked.hasConflict(remaining)));
        if (!anyVetoedConflicts && stillConflicting.length === 0) pristineRecords.push(...vetoed);
        else stillConflicting.push(...vetoed);
      }

      pending = stillConflicting;
    }
  });

  return {
    pristineRecords,
    conflictingRecords,
    duplicateRecords,
    mergedRecords
  };
};
//#endregion
//#region src/constants.ts
/** Key of the job-input form field that carries the field key chosen to dedupe on. */
const ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON = "dedupeOn";
//#endregion
//#region src/dedupe.worker.ts
/**
 * Abstract sheet job that deduplicates the records of a sheet.
 *
 * Records are matched on the single field key supplied in the job input under
 * `dedupeOn`, or on every field of the sheet when no key was supplied.
 *
 * Subclasses must supply `fetchRecords()` and `writeRecords(records)`.
 */
var DedupeJobWorker = class extends __flatfile_implementation_utils_job_worker.SheetJobWorker {
  /**
   * Runs the dedupe pass and persists the records only when they report changes.
   *
   * @returns Object whose `message` summarises unchanged, merged-away and conflicting record counts.
   */
  async execute() {
    const { fields } = await this.sheet();
    const allFieldKeys = fields.map((field) => field.key);
    const selectedKey = this.job.input?.[ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON];
    const dedupeOn = selectedKey ? [selectedKey] : allFieldKeys;

    const records = await this.fetchRecords();
    const outcome = deduplicateRecords(records, dedupeOn);

    // Skip the write entirely when deduplication left the collection untouched.
    const hasChanges = records.changes().count() > 0;
    if (hasChanges) await this.writeRecords(records);

    const unchangedCount = outcome.pristineRecords.length;
    const mergedAwayCount = outcome.duplicateRecords.length;
    const conflictCount = outcome.conflictingRecords.length;
    return { message: `Successfully deduped records.\nUnchanged: ${unchangedCount}\nMerged into existing records: ${mergedAwayCount}\nConflicts: ${conflictCount}` };
  }
};
/**
 * Builds the input form for the dedupe action: a single enum field whose options
 * are the given sheet fields, letting the user pick which field to dedupe on.
 *
 * @param fields - Sheet fields; each contributes its `label` and `key` to one option.
 * @returns A "simple" form definition containing the one enum field.
 */
const getDedupeActionInputForm = (fields) => {
  const dedupeOnField = {
    key: ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON,
    label: "Dedupe On (Defaults to Exact Match Across All Fields)",
    type: "enum",
    config: {
      options: fields.map(({ label, key }) => ({ label, value: key }))
    }
  };
  return {
    type: "simple",
    fields: [dedupeOnField]
  };
};
//#endregion
// CommonJS exports: the names this bundle makes available to `require` callers.
exports.DedupeJobWorker = DedupeJobWorker;
exports.deduplicateRecords = deduplicateRecords;
exports.getDedupeActionInputForm = getDedupeActionInputForm;