UNPKG

@flatfile/implementation-utils-dedupe-worker

Version:

Provides functionality for easily implementing record deduplication logic.

124 lines (120 loc) 4.77 kB
import { SheetJobWorker } from "@flatfile/implementation-utils-job-worker";

//#region src/dedupe.ts
/**
 * Ranks records by the number of conflicts they have with other records in the set.
 * The record with the fewest conflicts is ranked highest.
 *
 * A conflict is counted for each other record (distinguished by `id`) for which
 * `candidateRecord.hasConflict(otherRecord)` returns a truthy value.
 *
 * @param records - The records to rank. The input array is not reordered.
 * @returns A new array of `{ record, conflicts }` entries sorted ascending by `conflicts`.
 */
const rankRecordsByConflicts = (records) => {
  return records
    .map((candidateRecord) => {
      let conflicts = 0;
      for (const otherRecord of records) {
        if (candidateRecord.id !== otherRecord.id && candidateRecord.hasConflict(otherRecord)) {
          conflicts += 1;
        }
      }
      return { record: candidateRecord, conflicts };
    })
    .sort((a, b) => a.conflicts - b.conflicts);
};

/**
 * Deduplicates a collection of records based on specified dedupe keys.
 *
 * Records are grouped by `record.hash(...dedupeKeys)`. A group holding a single record is
 * reported as pristine. For larger groups, the least-conflicting record is repeatedly picked
 * as the base; every other record that does not conflict with it (and is allowed by
 * `isMergeable`, when given) is merged into the base and deleted. Records that conflict with
 * the base are carried over to the next round.
 *
 * NOTE: this function mutates the records it is given (`merge` on the base record, `delete`
 * on each merged-away record).
 *
 * @param records - Collection of records to deduplicate. Must expose `groupBy(fn)` whose
 *   result exposes `each(fn)`, with each group exposing `all()`.
 * @param dedupeKeys - Array of field keys to use for deduplication.
 * @param isMergeable - Optional function to determine if a source record can be merged into a
 *   target record. Called as `isMergeable(sourceRecord, targetRecord)`.
 * @returns Result object containing categorized records: `pristineRecords`,
 *   `conflictingRecords`, `duplicateRecords` (merged away and deleted) and `mergedRecords`
 *   (bases that absorbed at least one duplicate).
 */
const deduplicateRecords = (records, dedupeKeys, isMergeable) => {
  const result = {
    pristineRecords: [],
    conflictingRecords: [],
    duplicateRecords: [],
    mergedRecords: [],
  };
  const recordIndex = records.groupBy((record) => record.hash(...dedupeKeys));

  recordIndex.each((recordsByHash) => {
    let recordsToProcess = [...recordsByHash.all()];

    // A lone record in its hash group has nothing to be deduplicated against.
    if (recordsToProcess.length === 1) {
      const record = recordsToProcess[0];
      if (record) result.pristineRecords.push(record);
      return;
    }

    while (recordsToProcess.length > 0) {
      const bestRanked = rankRecordsByConflicts(recordsToProcess)[0];
      const baseRecord = bestRanked?.record;

      // When even the best-ranked record conflicts with every other record left, nothing
      // more can be merged: the whole remainder is reported as conflicting.
      if (!baseRecord || bestRanked.conflicts === recordsToProcess.length - 1) {
        result.conflictingRecords.push(...recordsToProcess);
        break;
      }

      const remainingRecords = [];
      const blockedByMergeable = [];
      recordsToProcess = recordsToProcess.filter((r) => r.id !== baseRecord.id);

      let isMerged = false;
      for (const otherRecord of recordsToProcess) {
        if (baseRecord.hasConflict(otherRecord)) {
          // Conflicts with the base: retry against a different base in the next round.
          remainingRecords.push(otherRecord);
        } else if (isMergeable && !isMergeable(otherRecord, baseRecord)) {
          // Compatible with the base, but the caller vetoed the merge.
          blockedByMergeable.push(otherRecord);
        } else {
          isMerged = true;
          baseRecord.merge(otherRecord);
          otherRecord.delete();
          result.duplicateRecords.push(otherRecord);
        }
      }

      if (isMerged) result.mergedRecords.push(baseRecord);
      else result.pristineRecords.push(baseRecord);

      if (blockedByMergeable.length > 0) {
        // With nothing left to process, the vetoed records stand on their own; otherwise
        // they are re-queued alongside the conflicting records. Scanning the vetoed records
        // for conflicts against `remainingRecords` cannot change this outcome (it is
        // vacuously clean when the list is empty and irrelevant when it is not), so only
        // the length is checked.
        if (remainingRecords.length === 0) result.pristineRecords.push(...blockedByMergeable);
        else remainingRecords.push(...blockedByMergeable);
      }

      recordsToProcess = remainingRecords;
    }
  });

  return result;
};

//#endregion
//#region src/constants.ts
/** Key of the job input / action form field that selects which field to dedupe on. */
const ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON = "dedupeOn";

//#endregion
//#region src/dedupe.worker.ts
/**
 * This abstract job is responsible for deduplicating records in a sheet that have the exact
 * same values based on the dedupeOn field.
 *
 * It must be extended to provide record fetching and write functionality
 * (`fetchRecords()` and `writeRecords(records)` are called by `execute`).
 */
class DedupeJobWorker extends SheetJobWorker {
  /**
   * Runs the dedupe job.
   *
   * Dedupes on the single field key supplied in the job input under `dedupeOn`; when that
   * input is absent (or falsy), dedupes on every field key of the sheet. Records are written
   * back only when `records.changes().count()` reports at least one change.
   *
   * @returns An object whose `message` summarizes the unchanged, merged and conflicting counts.
   */
  async execute() {
    const sheet = await this.sheet();
    const sheetFieldKeys = sheet.fields.map((field) => field.key);
    const dedupeOnInput = this.job.input?.[ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON];
    const dedupeOn = dedupeOnInput ? [dedupeOnInput] : sheetFieldKeys;

    const records = await this.fetchRecords();
    const dedupeResult = deduplicateRecords(records, dedupeOn);

    const changesCount = records.changes().count();
    if (changesCount > 0) await this.writeRecords(records);

    return {
      message: `Successfully deduped records.\nUnchanged: ${dedupeResult.pristineRecords.length}\nMerged into existing records: ${dedupeResult.duplicateRecords.length}\nConflicts: ${dedupeResult.conflictingRecords.length}`,
    };
  }
}

/**
 * Returns the dedupe action input form for a sheet with an option to select which field to dedupe on.
 *
 * @param fields - Sheet fields; each contributes one enum option built from its `label` and `key`.
 * @returns A "simple" input form containing a single enum field keyed `dedupeOn`.
 */
const getDedupeActionInputForm = (fields) => {
  return {
    type: "simple",
    fields: [
      {
        key: ACTION_DEDUPE_INPUT_FORM_FIELD_KEY_DEDUPE_ON,
        label: "Dedupe On (Defaults to Exact Match Across All Fields)",
        type: "enum",
        config: {
          options: fields.map((field) => ({ label: field.label, value: field.key })),
        },
      },
    ],
  };
};

//#endregion
export { DedupeJobWorker, deduplicateRecords, getDedupeActionInputForm };