UNPKG

tabular-data-differ

Version:

A very efficient library for diffing two sorted streams of tabular data, such as CSV files.

570 lines 21.3 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DifferContext = exports.Differ = exports.SourceStats = exports.UniqueKeyViolationError = exports.UnorderedStreamsError = void 0;
exports.diff = diff;
exports.sameArrays = sameArrays;
const streams_1 = require("./streams");
const formats_1 = require("./formats");
/**
 * Thrown when two consecutive rows of the same source are not in ascending key order.
 */
class UnorderedStreamsError extends Error {
}
exports.UnorderedStreamsError = UnorderedStreamsError;
/**
 * Thrown when two consecutive rows of the same source share the same key
 * (only detected when duplicateKeyHandling is 'fail').
 */
class UniqueKeyViolationError extends Error {
}
exports.UniqueKeyViolationError = UniqueKeyViolationError;
/**
 * Counters accumulated while reading one source (old or new), plus ratios derived from them by calcStats().
 */
class SourceStats {
    constructor() {
        this.rows = 0;
        this.duplicateRows = 0;
        this.uniqueRows = 0;
        this.uniqueRowsWithDuplicates = 0;
        this.duplicationPercent = 0;
        this.uniqueRowDuplicationPercent = 0;
        this.maxDuplicatesPerUniqueKey = 0;
        this.minDuplicatesPerUniqueKey = 0;
        this.averageDuplicatesPerUniqueKey = 0;
    }
    incRows() {
        this.rows += 1;
    }
    incDuplicateRows() {
        this.duplicateRows += 1;
    }
    incUniqueRows() {
        this.uniqueRows += 1;
    }
    incUniqueRowsWithDuplicates() {
        this.uniqueRowsWithDuplicates += 1;
    }
    /**
     * Records the number of duplicates found for one unique key and updates the min/max trackers.
     * @param value the number of duplicate rows found for a key
     */
    incDuplicates(value) {
        this.maxDuplicatesPerUniqueKey = Math.max(this.maxDuplicatesPerUniqueKey, value);
        // 0 doubles as the "nothing recorded yet" marker for the minimum
        if (this.minDuplicatesPerUniqueKey === 0) {
            this.minDuplicatesPerUniqueKey = value;
        } else {
            this.minDuplicatesPerUniqueKey = Math.min(this.minDuplicatesPerUniqueKey, value);
        }
    }
    /**
     * Recomputes the derived ratios from the current counters.
     * Each ratio is left untouched while its denominator is still zero.
     */
    calcStats() {
        if (this.uniqueRowsWithDuplicates) {
            this.averageDuplicatesPerUniqueKey = (0, formats_1.roundDecimals)(this.duplicateRows / this.uniqueRowsWithDuplicates, 4);
        }
        if (this.rows) {
            this.duplicationPercent = (0, formats_1.roundDecimals)((this.duplicateRows / this.rows) * 100, 4);
        }
        if (this.uniqueRows) {
            this.uniqueRowDuplicationPercent = (0, formats_1.roundDecimals)((this.uniqueRowsWithDuplicates / this.uniqueRows) * 100, 4);
        }
    }
}
exports.SourceStats = SourceStats;
/**
 * Creates a new differ object allowing you to compare two input streams and eventually send the changes to a specific output.
 * @param options the options required to compare two streams
 * @returns a Differ instance
 * @example
 * import { diff } from 'tabular-data-differ';
 * const stats = await diff({
 *   oldSource: './tests/a.csv',
 *   newSource: './tests/b.csv',
 *   keys: ['id'],
 * }).to('console');
 * console.log(stats);
 */
function diff(options) {
    return new Differ(options);
}
/**
 * Builds the format reader matching options.format ('csv', 'tsv', 'json', 'iterable' or 'custom').
 * @param options the source options; for 'custom' the reader is taken from options.reader
 * @returns a format reader
 * @throws {Error} when the format is not one of the supported values
 */
function createFormatReader(options) {
    const unknownFormat = options.format;
    if (options.format === 'csv') {
        return new formats_1.CsvFormatReader(options);
    }
    if (options.format === 'tsv') {
        // TSV is read with the CSV reader and a tab delimiter
        return new formats_1.CsvFormatReader({
            ...options,
            delimiter: '\t',
        });
    }
    if (options.format === 'json') {
        return new formats_1.JsonFormatReader(options);
    }
    if (options.format === 'iterable') {
        return new formats_1.IterableFormatReader(options);
    }
    if (options.format === 'custom') {
        return options.reader;
    }
    throw new Error(`Unknown source format '${unknownFormat}'`);
}
/**
 * Builds a format reader from a source shorthand: a string or a URL is read as CSV,
 * anything else is treated as source options and passed to createFormatReader.
 * @param value a string, a URL or source options
 * @returns a format reader
 */
function createSource(value) {
    if (typeof value === 'string' || value instanceof URL) {
        return createFormatReader({ format: 'csv', stream: value });
    }
    return createFormatReader(value);
}
/**
 * Builds the format writer matching the destination: 'console', 'null', a string/URL (CSV),
 * or destination options whose format is 'csv', 'tsv', 'json' or 'custom'.
 * @param options the destination shorthand or destination options
 * @returns a format writer
 * @throws {Error} when the format is not one of the supported values
 */
function createFormatWriter(options) {
    if (options === 'console') {
        return new formats_1.CsvFormatWriter({ stream: 'console' });
    }
    if (options === 'null') {
        return new formats_1.NullFormatWriter();
    }
    if (typeof options === 'string' || options instanceof URL) {
        return new formats_1.CsvFormatWriter({ stream: options });
    }
    const unknownFormat = options.format;
    if (options.format === 'csv') {
        return new formats_1.CsvFormatWriter(options);
    }
    if (options.format === 'tsv') {
        // TSV is written with the CSV writer and a tab delimiter
        return new formats_1.CsvFormatWriter({
            ...options,
            delimiter: '\t',
        });
    }
    if (options.format === 'json') {
        return new formats_1.JsonFormatWriter(options);
    }
    if (options.format === 'custom') {
        return options.writer;
    }
    throw new Error(`Unknown destination format '${unknownFormat}'`);
}
/**
 * Normalizes the argument of "to" into an object holding the format writer and the optional
 * filter, keepSameRows, changeLimit and labels settings.
 * @param value 'console', 'null', a string filename, a URL or output options
 * @returns the normalized output description
 */
function createOutput(value) {
    if (value === 'console') {
        return { format: new formats_1.CsvFormatWriter({ stream: new streams_1.ConsoleOutputStream() }) };
    }
    if (value === 'null') {
        return { format: new formats_1.NullFormatWriter() };
    }
    if (typeof value === 'string' || value instanceof URL) {
        return { format: new formats_1.CsvFormatWriter({ stream: new streams_1.FileOutputStream(value) }) };
    }
    return {
        format: createFormatWriter(value.destination),
        filter: value.filter,
        keepSameRows: value.keepSameRows,
        changeLimit: value.changeLimit,
        labels: value.labels,
    };
}
/**
 * Entry point holding the diff options; each call to start() or to() creates a fresh DifferContext.
 */
class Differ {
    constructor(options) {
        this.options = options;
    }
    /**
     * Creates a new context, opens both sources and reads their headers.
     * @returns the opened DifferContext
     */
    async start() {
        const ctx = new DifferContext(this.options);
        await ctx[OpenSymbol]();
        return ctx;
    }
    /**
     * Iterates over the changes and sends them to the submitted output.
     * @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
     * @returns the change stats once all the changes have been processed.
     * Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
     * as the context stats are updated by the iterator which doesn't have any filter.
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff } from 'tabular-data-differ';
     * const stats = await diff({
     *   oldSource: './tests/a.csv',
     *   newSource: './tests/b.csv',
     *   keys: ['id'],
     * }).to('console');
     * console.log(stats);
     */
    async to(options) {
        const ctx = await this.start();
        return ctx.to(options);
    }
}
exports.Differ = Differ;
// Module-private key of the DifferContext "open" method, so that only Differ.start can call it.
const OpenSymbol = Symbol('open');
/**
 * Holds the state of one comparison: the two buffered sources, the resolved columns and keys,
 * the diff stats and the per-source stats.
 */
class DifferContext {
    constructor(options) {
        this.options = options;
        this._stats = new formats_1.DiffStats();
        this._columnNames = [];
        this._isOpen = false;
        this._isClosed = false;
        this.comparer = formats_1.defaultRowComparer;
        this.keys = [];
        this._columns = [];
        this.columnsWithoutKeys = [];
        // identity until extractHeaders() finds that a source's columns differ from the normalized ones
        this.normalizeOldRow = row => row;
        this.normalizeNewRow = row => row;
        this._oldSourceStats = new SourceStats();
        this._newSourceStats = new SourceStats();
        this.oldSource = new formats_1.BufferedFormatReader(createSource(options.oldSource));
        this.newSource = new formats_1.BufferedFormatReader(createSource(options.newSource));
        this.comparer = options.rowComparer ?? formats_1.defaultRowComparer;
        this.duplicateKeyHandling = options.duplicateKeyHandling ?? 'fail';
        // the buffer never holds fewer than 5 rows, whatever the option says
        this.duplicateRowBufferSize = Math.max(5, options.duplicateRowBufferSize ?? 1000);
    }
    /**
     * Opens the input streams (old and new) and reads the headers.
     * This is an internal method that will be automatically called by "Differ.start" method.
     */
    async [OpenSymbol]() {
        if (!this._isOpen) {
            this._isOpen = true;
            this._oldSourceStats = new SourceStats();
            this._newSourceStats = new SourceStats();
            await this.oldSource.open();
            await this.newSource.open();
            await this.extractHeaders();
        }
    }
    /**
     * Closes the input streams.
     * This will be automatically called by the "diffs" or "to" methods.
     * This does nothing if the streams are not open.
     */
    close() {
        if (this._isOpen) {
            this.newSource.close();
            this.oldSource.close();
            this._isOpen = false;
        }
        this._isClosed = true;
    }
    /**
     * tells if the input streams are open or not
     */
    get isOpen() {
        return this._isOpen;
    }
    /**
     * gets the normalized column names from the old and new streams, according to the includedColumns/excludedColumns constraints.
     * @returns a list of column names
     */
    get columns() {
        return this._columnNames;
    }
    /**
     * gets the diff stats
     * @returns the diff stats
     */
    get stats() {
        return this._stats;
    }
    /**
     * gets the stats accumulated while parsing the old source
     * @returns the source stats
     */
    get oldSourceStats() {
        return this._oldSourceStats;
    }
    /**
     * gets the stats accumulated while parsing the new source
     * @returns the source stats
     */
    get newSourceStats() {
        return this._newSourceStats;
    }
    /**
     * Iterates over the changes and sends them to the submitted output.
     * The input streams are always closed when this method settles, even when the output
     * could not be created, opened or written before the iteration started.
     * @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
     * @returns the change stats once all the changes have been processed.
     * Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
     * as the context stats are updated by the iterator which doesn't have any filter.
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff } from 'tabular-data-differ';
     * const stats = await diff({
     *   oldSource: './tests/a.csv',
     *   newSource: './tests/b.csv',
     *   keys: ['id'],
     * }).to('console');
     * console.log(stats);
     */
    async to(options) {
        const stats = new formats_1.DiffStats();
        try {
            const output = createOutput(options);
            await output.format.open();
            try {
                await output.format.writeHeader({
                    columns: this.columns,
                    labels: output.labels,
                });
                for await (const rowDiff of this.diffs()) {
                    const isValidDiff = output.filter?.(rowDiff) ?? true;
                    if (isValidDiff) {
                        stats.add(rowDiff);
                    }
                    const canWriteDiff = output.keepSameRows === true || rowDiff.status !== 'same';
                    if (isValidDiff && canWriteDiff) {
                        await output.format.writeDiff(rowDiff);
                    }
                    if (typeof output.changeLimit === 'number' && stats.totalChanges >= output.changeLimit) {
                        break;
                    }
                }
                await output.format.writeFooter({ stats: stats });
            } finally {
                await output.format.close();
            }
        } finally {
            // diffs() closes the inputs itself once it has started; this covers failures
            // that occur earlier (output creation, open or header write). close() is idempotent.
            this.close();
        }
        return stats;
    }
    /**
     * Enumerates the differences between two input streams (old and new).
     * The input streams are closed when the iteration ends, whether normally, early or on error.
     * @yields {RowDiff}
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff, ArrayInputStream } from 'tabular-data-differ';
     * const ctx = await diff({
     *   oldSource: {
     *     stream: new ArrayInputStream([
     *       'id,name',
     *       '1,john',
     *       '2,mary',
     *     ]),
     *   },
     *   newSource: {
     *     stream: new ArrayInputStream([
     *       'id,name',
     *       '1,john',
     *       '3,sarah',
     *     ]),
     *   },
     *   keys: ['id'],
     * }).start();
     * console.log('columns:', ctx.columns);
     * for await (const rowDiff of ctx.diffs()) {
     *   console.log(rowDiff);
     * }
     * console.log('stats:', ctx.stats);
     */
    async *diffs() {
        if (this._isClosed) {
            throw new Error('Cannot get diffs on closed streams. You should call "Differ.start()" again.');
        }
        try {
            let pairProvider = () => this.getNextPair();
            let previousPair = {};
            while (true) {
                const pair = await pairProvider();
                if (pair.oldRow === undefined && pair.newRow === undefined) {
                    break;
                }
                const rowDiff = this.evalPair(pair);
                this.ensurePairsAreInAscendingOrder(previousPair, pair);
                this.stats.add(rowDiff);
                yield rowDiff;
                if (rowDiff.delta === 0) {
                    // keys matched: advance both sides
                    pairProvider = () => this.getNextPair();
                } else if (rowDiff.delta > 0) {
                    // the new row was reported as added: keep the old row and advance the new side only
                    pairProvider = async () => ({ oldRow: pair.oldRow, newRow: await this.getNextNewRow() });
                } else {
                    // the old row was reported as deleted: keep the new row and advance the old side only
                    pairProvider = async () => ({ oldRow: await this.getNextOldRow(), newRow: pair.newRow });
                }
                previousPair = pair;
            }
        } finally {
            this.oldSourceStats.calcStats();
            this.newSourceStats.calcStats();
            this.close();
        }
    }
    /**
     * Reads both headers, resolves the compared columns and the keys, and installs row
     * normalizers for any source whose header differs from the normalized column list.
     * @throws {Error} when a source has no columns or a key cannot be found
     */
    async extractHeaders() {
        const oldHeader = await this.oldSource.readHeader();
        const newHeader = await this.newSource.readHeader();
        if (oldHeader.columns.length === 0) {
            throw new Error('Expected to find columns in old source');
        }
        if (newHeader.columns.length === 0) {
            throw new Error('Expected to find columns in new source');
        }
        this._columns = this.normalizeColumns(oldHeader.columns, newHeader.columns);
        this.keys = this.extractKeys(this._columns, this.options.keys.map(asColumnDefinition));
        this.columnsWithoutKeys = this._columns.filter(col => !this.keys.some(key => key.name === col.name));
        this._columnNames = this._columns.map(col => col.name);
        if (!sameArrays(oldHeader.columns, this._columns.map(col => col.name))) {
            // a column missing from the old source has oldIndex -1, hence the '' fallback
            this.normalizeOldRow = row => row ? this._columns.map(col => row[col.oldIndex] ?? '') : undefined;
        }
        if (!sameArrays(newHeader.columns, this._columns.map(col => col.name))) {
            this.normalizeNewRow = row => row ? this._columns.map(col => row[col.newIndex]) : undefined;
        }
    }
    /**
     * Builds the list of compared columns, in new-source order, honoring the
     * includedColumns/excludedColumns options.
     * @param oldColumns the column names of the old source
     * @param newColumns the column names of the new source
     * @returns the columns with their index in each source (oldIndex is -1 when absent from the old source)
     */
    normalizeColumns(oldColumns, newColumns) {
        const includedColumns = new Set(this.options.includedColumns);
        const excludedColumns = new Set(this.options.excludedColumns);
        const columns = [];
        for (let newIndex = 0; newIndex < newColumns.length; newIndex++) {
            const name = newColumns[newIndex];
            // an empty include list means "include everything"
            const isIncluded = includedColumns.size === 0 || includedColumns.has(name);
            if (isIncluded) {
                const isExcluded = excludedColumns.has(name);
                if (!isExcluded) {
                    const oldIndex = oldColumns.indexOf(name);
                    columns.push({
                        name,
                        newIndex,
                        oldIndex,
                    });
                }
            }
        }
        return columns;
    }
    /**
     * Resolves the key definitions against the normalized columns.
     * @param columns the normalized columns
     * @param keys the key definitions (name, optional comparer and order)
     * @returns the key columns with their comparer and sort direction
     * @throws {Error} when a key is missing from the old or the new stream
     */
    extractKeys(columns, keys) {
        const result = [];
        for (const key of keys) {
            const column = columns.find(col => col.name === key.name);
            if (column) {
                if (column.oldIndex < 0) {
                    throw new Error(`Could not find key '${key.name}' in old stream`);
                }
                result.push({
                    ...column,
                    comparer: asColumnComparer(key.comparer),
                    sortDirection: key.order,
                });
            } else {
                throw new Error(`Could not find key '${key.name}' in new stream`);
            }
        }
        return result;
    }
    /**
     * Consumes all the rows following "row" that share its key and returns the rows retained
     * according to duplicateKeyHandling.
     * @param source the buffered reader to consume
     * @param stats the stats of that source, updated while reading
     * @param row the first row of the duplicate group (already read)
     * @returns the retained rows of the group
     * @throws {Error} when the group exceeds duplicateRowBufferSize and overflow is not allowed
     */
    async readDuplicatesOf(source, stats, row) {
        const duplicateRows = [];
        duplicateRows.push(row);
        stats.incUniqueRowsWithDuplicates();
        let duplicateCount = 0;
        let isDuplicate = true;
        while (isDuplicate) {
            const duplicateRow = await source.readRow();
            if (duplicateRow) {
                duplicateCount += 1;
                stats.incRows();
                stats.incDuplicateRows();
                if (this.duplicateKeyHandling !== 'keepFirstRow') {
                    // we don't need to accumulate duplicate rows when we just have to return the first row!
                    duplicateRows.push(duplicateRow);
                }
                if (this.duplicateKeyHandling === 'keepLastRow') {
                    // we don't need to accumulate the previous rows when we just have to return the last row!
                    duplicateRows.shift();
                }
                if (duplicateRows.length > this.duplicateRowBufferSize) {
                    if (this.options.duplicateRowBufferOverflow) {
                        // remove the first entry when we can overflow
                        duplicateRows.shift();
                    } else {
                        throw new Error('Too many duplicate rows');
                    }
                }
            }
            const nextRow = await source.peekRow();
            isDuplicate = !!nextRow && this.comparer(this.keys, nextRow, row) === 0;
        }
        stats.incDuplicates(duplicateCount);
        stats.calcStats();
        return duplicateRows;
    }
    /**
     * Reads the next row of a source, resolving duplicate keys according to duplicateKeyHandling.
     * @param source the buffered reader to consume
     * @param stats the stats of that source, updated while reading
     * @returns the next row, or the falsy value returned by the reader when the source is exhausted
     */
    async getNextRow(source, stats) {
        const row = await source.readRow();
        if (!row) {
            return row;
        }
        stats.incRows();
        stats.incUniqueRows();
        if (this.duplicateKeyHandling === 'fail') {
            // Note that it will be further processed in ensureRowsAreInAscendingOrder and throw a UniqueKeyViolationError exception
            return row;
        }
        const nextRow = await source.peekRow();
        if (!nextRow) {
            return row;
        }
        const isDuplicate = this.comparer(this.keys, nextRow, row) === 0;
        if (isDuplicate) {
            const duplicateRows = await this.readDuplicatesOf(source, stats, row);
            if (this.duplicateKeyHandling === 'keepFirstRow') {
                return duplicateRows[0];
            }
            if (this.duplicateKeyHandling === 'keepLastRow') {
                return duplicateRows[duplicateRows.length - 1];
            }
            // any other value is called as a function that picks/merges the row to keep
            return this.duplicateKeyHandling(duplicateRows);
        }
        return row;
    }
    getNextOldRow() {
        return this.getNextRow(this.oldSource, this._oldSourceStats);
    }
    getNextNewRow() {
        return this.getNextRow(this.newSource, this._newSourceStats);
    }
    /**
     * Advances both sources by one row.
     * @returns the pair made of the next old row and the next new row
     */
    async getNextPair() {
        const oldRow = await this.getNextOldRow();
        const newRow = await this.getNextNewRow();
        return { oldRow, newRow };
    }
    /**
     * Compares the keys of a pair and builds the corresponding row diff.
     * @param pair the old and new rows to compare
     * @returns a diff whose status is 'same', 'modified', 'deleted' or 'added'
     */
    evalPair(pair) {
        const delta = this.comparer(this.keys, pair.oldRow, pair.newRow);
        const newRow = this.normalizeNewRow(pair.newRow);
        const oldRow = this.normalizeOldRow(pair.oldRow);
        if (delta === 0) {
            const areSame = this.columnsWithoutKeys.length === 0 ||
                this.comparer(this.columnsWithoutKeys, pair.oldRow, pair.newRow) === 0;
            return { delta, status: areSame ? 'same' : 'modified', oldRow, newRow };
        } else if (delta < 0) {
            return { delta, status: 'deleted', oldRow };
        }
        return { delta, status: 'added', newRow };
    }
    /**
     * Checks that two consecutive rows of one source are strictly ascending by key.
     * Nothing is checked when either row is missing or when both are the same object
     * (the row was carried over from the previous pair).
     * @param source the label of the source ('old' or 'new'), used in the error message
     * @param previous the previously returned row
     * @param current the current row
     * @throws {UniqueKeyViolationError} when both rows have the same key
     * @throws {UnorderedStreamsError} when the previous key is greater than the current key
     */
    ensureRowsAreInAscendingOrder(source, previous, current) {
        if (previous && current && previous !== current) {
            const oldDelta = this.comparer(this.keys, previous, current);
            if (oldDelta === 0) {
                const cols = this.keys.map(key => key.name);
                throw new UniqueKeyViolationError(`Expected rows to be unique by "${cols}" in ${source} source but received:\n previous=${previous}\n current=${current}\nNote that you can resolve this conflict automatically using the duplicateKeyHandling option.`);
            }
            if (oldDelta > 0) {
                const colOrder = this.keys.map(key => `${key.name} ${key.sortDirection ?? 'ASC'}`);
                throw new UnorderedStreamsError(`Expected rows to be ordered by "${colOrder}" in ${source} source but received:\n previous=${previous}\n current=${current}`);
            }
        }
    }
    ensurePairsAreInAscendingOrder(previous, current) {
        this.ensureRowsAreInAscendingOrder('old', previous.oldRow, current.oldRow);
        this.ensureRowsAreInAscendingOrder('new', previous.newRow, current.newRow);
    }
}
exports.DifferContext = DifferContext;
/**
 * Turns a key given as a plain column name into a column definition; definitions are returned as is.
 */
function asColumnDefinition(value) {
    if (typeof value === 'string') {
        return { name: value };
    }
    return value;
}
/**
 * Resolves the 'string' and 'number' comparer shorthands; any other value is returned as is.
 */
function asColumnComparer(comparer) {
    if (comparer === 'string') {
        return formats_1.stringComparer;
    }
    if (comparer === 'number') {
        return formats_1.numberComparer;
    }
    return comparer;
}
/**
 * Tells whether two arrays have the same length and strictly equal items at every index.
 * @param a the first array
 * @param b the second array
 * @returns true when both arrays hold the same items in the same order
 */
function sameArrays(a, b) {
    if (a.length !== b.length) {
        return false;
    }
    for (let i = 0; i < a.length; i++) {
        if (a[i] !== b[i]) {
            return false;
        }
    }
    return true;
}
//# sourceMappingURL=differ.js.map