UNPKG

@flatfile/implementation-utils-dedupe-worker

Version:

Provides functionality for easily implementing record deduplication logic.

149 lines (144 loc) 5.91 kB
//#region rolldown:runtime
// Interop helpers from the bundler's `rolldown:runtime` region. They are
// generated code, so they are only reformatted here, not rewritten.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Defines, on `to`, a getter for every own property name of `from` that `to`
 * does not already own and that is not equal to `except`. Each getter reads
 * the live value from `from`. Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
      key = keys[i];
      if (!__hasOwnProp.call(to, key) && key !== except) {
        __defProp(to, key, {
          get: ((k) => from[k]).bind(null, key),
          // Enumerable unless the source descriptor exists and says otherwise.
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
        });
      }
    }
  }
  return to;
};

/**
 * Wraps `mod` in a new object that inherits from `mod`'s prototype (or a plain
 * object when `mod` is null/undefined). An enumerable `default` property
 * pointing at `mod` is added when `isNodeMode` is truthy, `mod` is falsy, or
 * `mod.__esModule` is falsy. The own properties of `mod` are then exposed
 * through `__copyProps`.
 */
var __toESM = (mod, isNodeMode, target) => (
  target = mod != null ? __create(__getProtoOf(mod)) : {},
  __copyProps(
    isNodeMode || !mod || !mod.__esModule
      ? __defProp(target, "default", { value: mod, enumerable: true })
      : target,
    mod
  )
);

//#endregion
const __flatfile_implementation_utils_job_worker = __toESM(require("@flatfile/implementation-utils-job-worker"));

//#region src/dedupe.ts
/**
 * Orders records by how many of the other records in the same set they
 * conflict with, fewest conflicts first.
 *
 * A record is never compared against an entry carrying the same `id`.
 *
 * @param records - The records to rank.
 * @returns A new array of `{ record, conflicts }` entries sorted ascending by `conflicts`.
 */
const rankRecordsByConflicts = (records) => {
  const countConflicts = (candidate) => {
    let total = 0;
    for (const other of records) {
      if (candidate.id === other.id) continue;
      if (candidate.hasConflict(other)) total += 1;
    }
    return total;
  };
  const ranked = records.map((candidate) => ({
    record: candidate,
    conflicts: countConflicts(candidate)
  }));
  ranked.sort((left, right) => left.conflicts - right.conflicts);
  return ranked;
};

/**
 * Groups records by the hash of their dedupe keys and resolves each group into
 * pristine, merged, duplicate and conflicting records.
 *
 * Within a group, the record with the fewest conflicts is repeatedly picked as
 * the base. Every non-conflicting record that `isMergeable` allows is merged
 * into the base and then deleted.
 *
 * @param records - Collection of records to deduplicate (must offer `groupBy`).
 * @param dedupeKeys - Array of field keys passed to `record.hash(...)` for grouping.
 * @param isMergeable - Optional predicate `(source, target)`; when it returns a
 *   falsy value the source record is not merged into the target.
 * @returns Object with `pristineRecords`, `conflictingRecords`,
 *   `duplicateRecords` and `mergedRecords` arrays.
 */
const deduplicateRecords = (records, dedupeKeys, isMergeable) => {
  const pristineRecords = [];
  const conflictingRecords = [];
  const duplicateRecords = [];
  const mergedRecords = [];

  const resolveGroup = (group) => {
    let pending = [...group.all()];

    // A hash bucket holding a single record has nothing to dedupe against.
    if (pending.length === 1) {
      const [only] = pending;
      if (only) pristineRecords.push(only);
      return;
    }

    while (pending.length > 0) {
      const [best] = rankRecordsByConflicts(pending);
      const base = best?.record;

      // When even the least-conflicting record conflicts with all the others,
      // nothing in `pending` can be merged. A single leftover record also lands
      // here (0 === 1 - 1) and is therefore reported as conflicting.
      if (!base || best.conflicts === pending.length - 1) {
        conflictingRecords.push(...pending);
        break;
      }

      pending = pending.filter((candidate) => candidate.id !== base.id);

      const stillConflicting = [];
      const vetoed = [];
      let absorbedAny = false;

      for (const candidate of pending) {
        if (base.hasConflict(candidate)) {
          stillConflicting.push(candidate);
          continue;
        }
        if (isMergeable && !isMergeable(candidate, base)) {
          vetoed.push(candidate);
          continue;
        }
        absorbedAny = true;
        base.merge(candidate);
        candidate.delete();
        duplicateRecords.push(candidate);
      }

      (absorbedAny ? mergedRecords : pristineRecords).push(base);

      if (vetoed.length > 0) {
        const vetoedClashWithRest = vetoed.some((blocked) =>
          stillConflicting.some((remaining) => blocked.hasConflict(remaining))
        );
        // Vetoed records are kept as pristine only when nothing else is left
        // to process; otherwise they go through another round.
        const destination = !vetoedClashWithRest && stillConflicting.length === 0
          ? pristineRecords
          : stillConflicting;
        destination.push(...vetoed);
      }

      pending = stillConflicting;
    }
  };

  records.groupBy((record) => record.hash(...dedupeKeys)).each(resolveGroup);

  return { pristineRecords, conflictingRecords, duplicateRecords, mergedRecords };
};

//#endregion
//#region src/constants.ts
// Key of the job input / action form field that names the field to dedupe on.
const ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON = "dedupeOn";

//#endregion
//#region src/dedupe.worker.ts
/**
 * Abstract job that deduplicates the records of a sheet, keyed either on the
 * field supplied through the `dedupeOn` job input or, when that input is
 * absent, on every field of the sheet.
 *
 * `execute` calls `this.fetchRecords()` and `this.writeRecords(records)`,
 * neither of which is defined here, so subclasses have to supply them.
 */
class DedupeJobWorker extends __flatfile_implementation_utils_job_worker.SheetJobWorker {
  /**
   * Runs the dedupe and persists the records when any of them changed.
   *
   * @returns An object whose `message` summarises the unchanged, merged-away
   *   and conflicting record counts.
   */
  async execute() {
    const sheet = await this.sheet();
    const allFieldKeys = sheet.fields.map(({ key }) => key);

    const requestedKey = this.job.input?.[ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON];
    const dedupeOn = requestedKey ? [requestedKey] : allFieldKeys;

    const records = await this.fetchRecords();
    const { pristineRecords, duplicateRecords, conflictingRecords } = deduplicateRecords(records, dedupeOn);

    // Skip the write entirely when deduplication touched nothing.
    if (records.changes().count() > 0) {
      await this.writeRecords(records);
    }

    return {
      message: `Successfully deduped records.\nUnchanged: ${pristineRecords.length}\nMerged into existing records: ${duplicateRecords.length}\nConflicts: ${conflictingRecords.length}`
    };
  }
}

/**
 * Builds the input form for the dedupe action: a single enum field that lets
 * the user choose which of the given fields to dedupe on.
 *
 * @param fields - Fields to offer; each contributes its `label` and `key`.
 * @returns A `simple` form definition with one `dedupeOn` enum field.
 */
const getDedupeActionInputForm = (fields) => {
  const options = fields.map(({ label, key }) => ({ label, value: key }));
  const dedupeOnField = {
    key: ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON,
    label: "Dedupe On (Defaults to Exact Match Across All Fields)",
    type: "enum",
    config: { options }
  };
  return { type: "simple", fields: [dedupeOnField] };
};

//#endregion
exports.DedupeJobWorker = DedupeJobWorker;
exports.deduplicateRecords = deduplicateRecords;
exports.getDedupeActionInputForm = getDedupeActionInputForm;