tabular-data-differ
Version:
A very efficient library for diffing two sorted streams of tabular data, such as CSV files.
570 lines • 21.3 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.DifferContext = exports.Differ = exports.SourceStats = exports.UniqueKeyViolationError = exports.UnorderedStreamsError = void 0;
exports.diff = diff;
exports.sameArrays = sameArrays;
const streams_1 = require("./streams");
const formats_1 = require("./formats");
/**
 * Error raised when the rows of a source are not sorted by the configured keys.
 * The `name` property is set explicitly so that logs and `toString()` report
 * "UnorderedStreamsError" instead of the generic "Error".
 */
class UnorderedStreamsError extends Error {
    /**
     * @param args the standard Error constructor arguments (message, options)
     */
    constructor(...args) {
        super(...args);
        this.name = 'UnorderedStreamsError';
    }
}
exports.UnorderedStreamsError = UnorderedStreamsError;
/**
 * Error raised when two consecutive rows of a source share the same key values
 * while duplicates are not allowed.
 * The `name` property is set explicitly so that logs and `toString()` report
 * "UniqueKeyViolationError" instead of the generic "Error".
 */
class UniqueKeyViolationError extends Error {
    /**
     * @param args the standard Error constructor arguments (message, options)
     */
    constructor(...args) {
        super(...args);
        this.name = 'UniqueKeyViolationError';
    }
}
exports.UniqueKeyViolationError = UniqueKeyViolationError;
/**
 * Counters describing the rows read from one input source, with a focus on
 * rows that share the same key (duplicates).
 * The counters are updated through the `inc*` methods; the derived ratios are
 * only refreshed when `calcStats()` is called.
 */
class SourceStats {
    constructor() {
        Object.assign(this, {
            rows: 0,
            duplicateRows: 0,
            uniqueRows: 0,
            uniqueRowsWithDuplicates: 0,
            duplicationPercent: 0,
            uniqueRowDuplicationPercent: 0,
            maxDuplicatesPerUniqueKey: 0,
            minDuplicatesPerUniqueKey: 0,
            averageDuplicatesPerUniqueKey: 0,
        });
    }
    /** Counts one more row read from the source. */
    incRows() {
        this.rows = this.rows + 1;
    }
    /** Counts one more row whose key was already seen. */
    incDuplicateRows() {
        this.duplicateRows = this.duplicateRows + 1;
    }
    /** Counts one more row introducing a new key. */
    incUniqueRows() {
        this.uniqueRows = this.uniqueRows + 1;
    }
    /** Counts one more key that has at least one duplicate row. */
    incUniqueRowsWithDuplicates() {
        this.uniqueRowsWithDuplicates = this.uniqueRowsWithDuplicates + 1;
    }
    /**
     * Records the number of duplicates found for one key and updates the min/max.
     * @param value the number of duplicate rows found for a single key
     */
    incDuplicates(value) {
        const currentMin = this.minDuplicatesPerUniqueKey;
        this.maxDuplicatesPerUniqueKey = Math.max(this.maxDuplicatesPerUniqueKey, value);
        // a minimum of 0 means that nothing has been recorded yet
        this.minDuplicatesPerUniqueKey = currentMin === 0 ? value : Math.min(currentMin, value);
    }
    /**
     * Refreshes the derived figures (average and percentages, rounded to 4 decimals).
     * A figure is left untouched when its divisor is zero.
     */
    calcStats() {
        const round4 = (value) => (0, formats_1.roundDecimals)(value, 4);
        const { rows, duplicateRows, uniqueRows, uniqueRowsWithDuplicates } = this;
        if (uniqueRowsWithDuplicates) {
            this.averageDuplicatesPerUniqueKey = round4(duplicateRows / uniqueRowsWithDuplicates);
        }
        if (rows) {
            this.duplicationPercent = round4((duplicateRows / rows) * 100);
        }
        if (uniqueRows) {
            this.uniqueRowDuplicationPercent = round4((uniqueRowsWithDuplicates / uniqueRows) * 100);
        }
    }
}
exports.SourceStats = SourceStats;
/**
 * Creates a new differ object allowing you to compare two input streams and eventually send the changes to a specific output.
 * Nothing is opened here: the options are only stored in the returned Differ, and the sources are opened
 * when "start" or "to" is called on it.
 * @param options the options required to compare two streams (old/new sources, keys, ...)
 * @returns a Differ instance
 * @example
 * import { diff } from 'tabular-data-differ';
 * // "to" is asynchronous, so its result must be awaited
 * const stats = await diff({
 *     oldSource: './tests/a.csv',
 *     newSource: './tests/b.csv',
 *     keys: ['id'],
 * }).to('console');
 * console.log(stats);
 */
function diff(options) {
return new Differ(options);
}
/**
 * Builds the format reader matching the "format" property of the source options.
 * @param options the source options; "format" selects the reader, and a 'custom' format returns "options.reader" as is
 * @returns a format reader
 * @throws {Error} when the format is not one of csv, tsv, json, iterable or custom
 */
function createFormatReader(options) {
    const { format } = options;
    switch (format) {
        case 'csv':
            return new formats_1.CsvFormatReader(options);
        case 'tsv': {
            // TSV is handled by the CSV reader with a tab delimiter
            const tsvOptions = { ...options, delimiter: '\t' };
            return new formats_1.CsvFormatReader(tsvOptions);
        }
        case 'json':
            return new formats_1.JsonFormatReader(options);
        case 'iterable':
            return new formats_1.IterableFormatReader(options);
        case 'custom':
            return options.reader;
        default:
            throw new Error(`Unknown source format '${format}'`);
    }
}
/**
 * Builds the format reader of a source.
 * A string or a URL is read as CSV; any other value is used directly as source options.
 * @param value a string, a URL or source options
 * @returns a format reader
 */
function createSource(value) {
    const isLocation = typeof value === 'string' || value instanceof URL;
    const sourceOptions = isLocation ? { format: 'csv', stream: value } : value;
    return createFormatReader(sourceOptions);
}
/**
 * Builds the format writer matching the submitted destination.
 * @param options 'console', 'null', a string, a URL, or destination options whose "format" selects the writer
 * (a 'custom' format returns "options.writer" as is)
 * @returns a format writer
 * @throws {Error} when the format is not one of csv, tsv, json or custom
 */
function createFormatWriter(options) {
    if (typeof options === 'string' || options instanceof URL) {
        if (options === 'null') {
            return new formats_1.NullFormatWriter();
        }
        // 'console', any other string and URLs are all handed over to the CSV writer as its stream
        return new formats_1.CsvFormatWriter({ stream: options });
    }
    const { format } = options;
    switch (format) {
        case 'csv':
            return new formats_1.CsvFormatWriter(options);
        case 'tsv': {
            // TSV is handled by the CSV writer with a tab delimiter
            const tsvOptions = { ...options, delimiter: '\t' };
            return new formats_1.CsvFormatWriter(tsvOptions);
        }
        case 'json':
            return new formats_1.JsonFormatWriter(options);
        case 'custom':
            return options.writer;
        default:
            throw new Error(`Unknown destination format '${format}'`);
    }
}
/**
 * Converts the submitted output into a normalized output description.
 * @param value 'console', 'null', a string, a URL or output options
 * @returns an object holding the format writer and, for output options, the filter, keepSameRows, changeLimit and labels settings
 */
function createOutput(value) {
    if (typeof value === 'string' || value instanceof URL) {
        if (value === 'null') {
            return { format: new formats_1.NullFormatWriter() };
        }
        // 'console' writes CSV to the console, anything else is written as CSV to a file
        const stream = value === 'console'
            ? new streams_1.ConsoleOutputStream()
            : new streams_1.FileOutputStream(value);
        return { format: new formats_1.CsvFormatWriter({ stream }) };
    }
    const { destination, filter, keepSameRows, changeLimit, labels } = value;
    return {
        format: createFormatWriter(destination),
        filter,
        keepSameRows,
        changeLimit,
        labels,
    };
}
/**
 * Entry point of a comparison: stores the options and creates a context when the comparison is started.
 */
class Differ {
    /**
     * @param options the options required to compare two streams
     */
    constructor(options) {
        this.options = options;
    }
    /**
     * Creates a new context from the stored options and opens it.
     * @returns the opened context
     */
    async start() {
        const context = new DifferContext(this.options);
        await context[OpenSymbol]();
        return context;
    }
    /**
     * Starts a new context and sends all its changes to the submitted output.
     * @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
     * @returns the stats returned by the context once the output has been written.
     * @example
     * import { diff } from 'tabular-data-differ';
     * const stats = await diff({
     *     oldSource: './tests/a.csv',
     *     newSource: './tests/b.csv',
     *     keys: ['id'],
     * }).to('console');
     * console.log(stats);
     */
    async to(options) {
        return (await this.start()).to(options);
    }
}
exports.Differ = Differ;
// Key of the internal "open" method, kept as a symbol so that it is not part of the public surface.
const OpenSymbol = Symbol('open');
/**
 * State of one comparison between an old and a new source.
 * A context reads both sources in lockstep, expects them to be sorted by the configured keys,
 * and produces one row diff per key.
 * A context is single-use: once closed, "diffs" refuses to run again.
 */
class DifferContext {
    /**
     * @param options the options required to compare two streams. The properties read by this class are
     * oldSource, newSource, keys, includedColumns, excludedColumns, rowComparer, duplicateKeyHandling,
     * duplicateRowBufferSize and duplicateRowBufferOverflow.
     */
    constructor(options) {
        this.options = options;
        this._stats = new formats_1.DiffStats();
        this._columnNames = [];
        this._isOpen = false;
        this._isClosed = false;
        this.comparer = formats_1.defaultRowComparer;
        this.keys = [];
        this._columns = [];
        this.columnsWithoutKeys = [];
        // identity normalizers, replaced in extractHeaders when a header differs from the normalized columns
        this.normalizeOldRow = row => row;
        this.normalizeNewRow = row => row;
        this._oldSourceStats = new SourceStats();
        this._newSourceStats = new SourceStats();
        this.oldSource = new formats_1.BufferedFormatReader(createSource(options.oldSource));
        this.newSource = new formats_1.BufferedFormatReader(createSource(options.newSource));
        this.comparer = options.rowComparer ?? formats_1.defaultRowComparer;
        this.duplicateKeyHandling = options.duplicateKeyHandling ?? 'fail';
        // the buffer can never hold less than 5 rows
        this.duplicateRowBufferSize = Math.max(5, options.duplicateRowBufferSize ?? 1000);
    }
    /**
     * Opens the input streams (old and new) and reads the headers.
     * This is an internal method that will be automatically called by "Differ.start" method.
     * When opening a source or extracting the headers fails, the context is closed before the error is
     * rethrown, because the caller never receives the context and could not close it.
     */
    async [OpenSymbol]() {
        if (this._isOpen) {
            return;
        }
        this._isOpen = true;
        this._oldSourceStats = new SourceStats();
        this._newSourceStats = new SourceStats();
        try {
            await this.oldSource.open();
            await this.newSource.open();
            await this.extractHeaders();
        }
        catch (error) {
            try {
                this.close();
            }
            catch {
                // the original failure is more useful to the caller than a secondary failure while closing
            }
            throw error;
        }
    }
    /**
     * Closes the input streams.
     * This will be automatically called by the "diffs" or "to" methods.
     * This does nothing if the streams are not open.
     * NOTE(review): the results of the sources' close() are not awaited; if those methods are
     * asynchronous, a failure while closing would not be reported here - confirm against the readers.
     */
    close() {
        if (this._isOpen) {
            this.newSource.close();
            this.oldSource.close();
            this._isOpen = false;
        }
        this._isClosed = true;
    }
    /**
     * tells if the input streams are open or not
     */
    get isOpen() {
        return this._isOpen;
    }
    /**
     * gets the normalized column names from the old and new streams, according to the includedColumns/excludedColumns constraints.
     * @returns a list of column names
     */
    get columns() {
        return this._columnNames;
    }
    /**
     * gets the diff stats
     * @returns the diff stats
     */
    get stats() {
        return this._stats;
    }
    /**
     * gets the stats accumulated while parsing the old source
     * @returns the source stats
     */
    get oldSourceStats() {
        return this._oldSourceStats;
    }
    /**
     * gets the stats accumulated while parsing the new source
     * @returns the source stats
     */
    get newSourceStats() {
        return this._newSourceStats;
    }
    /**
     * Iterates over the changes and sends them to the submitted output.
     * The input streams are always closed when this method completes, whether it succeeds or fails,
     * including when the output cannot be created, opened or written to.
     * @param options a standard output such as console or null, a string filename, A URL or a custom OutputOptions.
     * @returns the change stats once all the changes have been processed.
     * Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
     * as the context stats are updated by the iterator which doesn't have any filter.
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff } from 'tabular-data-differ';
     * const stats = await diff({
     *     oldSource: './tests/a.csv',
     *     newSource: './tests/b.csv',
     *     keys: ['id'],
     * }).to('console');
     * console.log(stats);
     */
    async to(options) {
        const stats = new formats_1.DiffStats();
        try {
            const output = createOutput(options);
            await output.format.open();
            try {
                await output.format.writeHeader({
                    columns: this.columns,
                    labels: output.labels,
                });
                for await (const rowDiff of this.diffs()) {
                    // without a filter every diff is valid
                    const isValidDiff = output.filter?.(rowDiff) ?? true;
                    if (isValidDiff) {
                        stats.add(rowDiff);
                    }
                    const canWriteDiff = output.keepSameRows === true || rowDiff.status !== 'same';
                    if (isValidDiff && canWriteDiff) {
                        await output.format.writeDiff(rowDiff);
                    }
                    if (typeof output.changeLimit === 'number' && stats.totalChanges >= output.changeLimit) {
                        // leaving the loop early ends the iterator, which closes the input streams
                        break;
                    }
                }
                await output.format.writeFooter({ stats: stats });
            }
            finally {
                await output.format.close();
            }
        }
        finally {
            // "diffs" closes the inputs itself once it has started iterating; this covers the failures
            // happening before that point (bad output options, output open/header errors).
            this.close();
        }
        return stats;
    }
    /**
     * Enumerates the differences between two input streams (old and new).
     * The context stats are updated with each yielded diff, and the input streams are closed
     * when the iteration ends, is abandoned or fails.
     * @yields {RowDiff}
     * @throws {Error} when the context has already been closed
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff, ArrayInputStream } from 'tabular-data-differ';
     * const ctx = await diff({
     *     oldSource: {
     *         format: 'csv',
     *         stream: new ArrayInputStream([
     *             'id,name',
     *             '1,john',
     *             '2,mary',
     *         ]),
     *     },
     *     newSource: {
     *         format: 'csv',
     *         stream: new ArrayInputStream([
     *             'id,name',
     *             '1,john',
     *             '3,sarah',
     *         ]),
     *     },
     *     keys: ['id'],
     * }).start();
     * console.log('columns:', ctx.columns);
     * for await (const rowDiff of ctx.diffs()) {
     *     console.log(rowDiff);
     * }
     * console.log('stats:', ctx.stats);
     */
    async *diffs() {
        if (this._isClosed) {
            throw new Error('Cannot get diffs on closed streams. You should call "Differ.start()" again.');
        }
        try {
            let pairProvider = () => this.getNextPair();
            let previousPair = {};
            while (true) {
                const pair = await pairProvider();
                if (pair.oldRow === undefined && pair.newRow === undefined) {
                    // both sources are exhausted
                    break;
                }
                const rowDiff = this.evalPair(pair);
                this.ensurePairsAreInAscendingOrder(previousPair, pair);
                this.stats.add(rowDiff);
                yield rowDiff;
                if (rowDiff.delta === 0) {
                    // same key on both sides: advance both sources
                    pairProvider = () => this.getNextPair();
                }
                else if (rowDiff.delta > 0) {
                    // the new row was reported as added: keep the old row and advance the new source only
                    pairProvider = async () => ({ oldRow: pair.oldRow, newRow: await this.getNextNewRow() });
                }
                else {
                    // the old row was reported as deleted: keep the new row and advance the old source only
                    pairProvider = async () => ({ oldRow: await this.getNextOldRow(), newRow: pair.newRow });
                }
                previousPair = pair;
            }
        }
        finally {
            this.oldSourceStats.calcStats();
            this.newSourceStats.calcStats();
            this.close();
        }
    }
    /**
     * Reads the headers of both sources, then computes the normalized columns, the keys,
     * the non-key columns and the row normalizers.
     * @throws {Error} when a source has no column or when a key cannot be found
     */
    async extractHeaders() {
        const oldHeader = await this.oldSource.readHeader();
        const newHeader = await this.newSource.readHeader();
        if (oldHeader.columns.length === 0) {
            throw new Error('Expected to find columns in old source');
        }
        if (newHeader.columns.length === 0) {
            throw new Error('Expected to find columns in new source');
        }
        this._columns = this.normalizeColumns(oldHeader.columns, newHeader.columns);
        this.keys = this.extractKeys(this._columns, this.options.keys.map(asColumnDefinition));
        this.columnsWithoutKeys = this._columns.filter(col => !this.keys.some(key => key.name === col.name));
        this._columnNames = this._columns.map(col => col.name);
        // rows only need to be remapped when the header of a source differs from the normalized columns
        if (!sameArrays(oldHeader.columns, this._columnNames)) {
            // a column missing from the old source has an oldIndex of -1, hence the '' fallback
            this.normalizeOldRow = row => row ? this._columns.map(col => row[col.oldIndex] ?? '') : undefined;
        }
        if (!sameArrays(newHeader.columns, this._columnNames)) {
            this.normalizeNewRow = row => row ? this._columns.map(col => row[col.newIndex]) : undefined;
        }
    }
    /**
     * Builds the list of compared columns, following the order of the new header.
     * A column is kept when includedColumns is empty or contains it, and when excludedColumns does not contain it.
     * @param oldColumns the column names of the old header
     * @param newColumns the column names of the new header
     * @returns the kept columns with their name, newIndex and oldIndex (-1 when missing from the old header)
     */
    normalizeColumns(oldColumns, newColumns) {
        const includedColumns = new Set(this.options.includedColumns);
        const excludedColumns = new Set(this.options.excludedColumns);
        const columns = [];
        for (let newIndex = 0; newIndex < newColumns.length; newIndex++) {
            const name = newColumns[newIndex];
            const isIncluded = includedColumns.size === 0 || includedColumns.has(name);
            if (isIncluded) {
                const isExcluded = excludedColumns.has(name);
                if (!isExcluded) {
                    const oldIndex = oldColumns.indexOf(name);
                    columns.push({
                        name,
                        newIndex,
                        oldIndex,
                    });
                }
            }
        }
        return columns;
    }
    /**
     * Resolves the key definitions against the normalized columns.
     * @param columns the normalized columns
     * @param keys the key definitions (name, comparer, order)
     * @returns the key columns, enriched with their comparer and sort direction
     * @throws {Error} when a key is not part of the normalized columns or is missing from the old header
     */
    extractKeys(columns, keys) {
        const result = [];
        for (const key of keys) {
            const column = columns.find(col => col.name === key.name);
            if (column) {
                if (column.oldIndex < 0) {
                    throw new Error(`Could not find key '${key.name}' in old stream`);
                }
                result.push({
                    ...column,
                    comparer: asColumnComparer(key.comparer),
                    sortDirection: key.order,
                });
            }
            else {
                throw new Error(`Could not find key '${key.name}' in new stream`);
            }
        }
        return result;
    }
    /**
     * Reads all the rows following the submitted row that share its key.
     * It is called once the next row is known to be a duplicate of the submitted row.
     * @param source the source to read from
     * @param stats the stats of the source, updated with the duplicates found
     * @param row the first row of the group of duplicates
     * @returns the rows retained for the key: only the first row for 'keepFirstRow', only the last row
     * for 'keepLastRow', otherwise the buffered rows
     * @throws {Error} when the buffer is full and the duplicateRowBufferOverflow option is not set
     */
    async readDuplicatesOf(source, stats, row) {
        const duplicateRows = [];
        duplicateRows.push(row);
        stats.incUniqueRowsWithDuplicates();
        let duplicateCount = 0;
        let isDuplicate = true;
        while (isDuplicate) {
            const duplicateRow = await source.readRow();
            if (duplicateRow) {
                duplicateCount += 1;
                stats.incRows();
                stats.incDuplicateRows();
                if (this.duplicateKeyHandling !== 'keepFirstRow') {
                    // we don't need to accumulate duplicate rows when we just have to return the first row!
                    duplicateRows.push(duplicateRow);
                }
                if (this.duplicateKeyHandling === 'keepLastRow') {
                    // we don't need to accumulate the previous rows when we just have to return the last row!
                    duplicateRows.shift();
                }
                if (duplicateRows.length > this.duplicateRowBufferSize) {
                    if (this.options.duplicateRowBufferOverflow) {
                        // remove the first entry when we can overflow
                        duplicateRows.shift();
                    }
                    else {
                        throw new Error('Too many duplicate rows');
                    }
                }
            }
            const nextRow = await source.peekRow();
            isDuplicate = !!nextRow && this.comparer(this.keys, nextRow, row) === 0;
        }
        stats.incDuplicates(duplicateCount);
        stats.calcStats();
        return duplicateRows;
    }
    /**
     * Reads the next row of a source and resolves its duplicates according to the duplicateKeyHandling option.
     * @param source the source to read from
     * @param stats the stats of the source, updated with the rows read
     * @returns the next row, or the falsy value returned by the source when it is exhausted
     */
    async getNextRow(source, stats) {
        const row = await source.readRow();
        if (!row) {
            return row;
        }
        stats.incRows();
        stats.incUniqueRows();
        if (this.duplicateKeyHandling === 'fail') {
            // Note that it will be further processed in ensureRowsAreInAscendingOrder and throw a UniqueKeyViolationError exception
            return row;
        }
        const nextRow = await source.peekRow();
        if (!nextRow) {
            return row;
        }
        const isDuplicate = this.comparer(this.keys, nextRow, row) === 0;
        if (isDuplicate) {
            const duplicateRows = await this.readDuplicatesOf(source, stats, row);
            if (this.duplicateKeyHandling === 'keepFirstRow') {
                return duplicateRows[0];
            }
            if (this.duplicateKeyHandling === 'keepLastRow') {
                return duplicateRows[duplicateRows.length - 1];
            }
            // any other value is expected to be a function choosing the row to keep
            return this.duplicateKeyHandling(duplicateRows);
        }
        return row;
    }
    /**
     * @returns the next row of the old source
     */
    getNextOldRow() {
        return this.getNextRow(this.oldSource, this._oldSourceStats);
    }
    /**
     * @returns the next row of the new source
     */
    getNextNewRow() {
        return this.getNextRow(this.newSource, this._newSourceStats);
    }
    /**
     * Reads the next row of each source, the old one first.
     * @returns the pair of rows
     */
    async getNextPair() {
        const oldRow = await this.getNextOldRow();
        const newRow = await this.getNextNewRow();
        return { oldRow, newRow };
    }
    /**
     * Compares the two rows of a pair and builds the matching row diff.
     * The comparison uses the raw rows, whereas the returned rows are normalized.
     * @param pair the old and new rows to compare
     * @returns a diff whose status is 'same' or 'modified' when the keys match (delta of 0),
     * 'deleted' when the delta is negative, and 'added' otherwise
     */
    evalPair(pair) {
        const delta = this.comparer(this.keys, pair.oldRow, pair.newRow);
        const newRow = this.normalizeNewRow(pair.newRow);
        const oldRow = this.normalizeOldRow(pair.oldRow);
        if (delta === 0) {
            const areSame = this.columnsWithoutKeys.length === 0 ||
                this.comparer(this.columnsWithoutKeys, pair.oldRow, pair.newRow) === 0;
            return { delta, status: areSame ? 'same' : 'modified', oldRow, newRow };
        }
        else if (delta < 0) {
            return { delta, status: 'deleted', oldRow };
        }
        return { delta, status: 'added', newRow };
    }
    /**
     * Checks that two consecutive rows of the same source have distinct keys in ascending order.
     * Nothing is checked when a row is missing or when both parameters are the same row object.
     * NOTE(review): both rows come from the same source while the key columns carry distinct
     * oldIndex/newIndex positions; whether the comparer reads the right index for each side is not
     * visible from here - confirm against the comparer implementation.
     * @param source the label of the source ('old' or 'new') used in the error message
     * @param previous the previously read row
     * @param current the row that has just been read
     * @throws {UniqueKeyViolationError} when both rows have the same key
     * @throws {UnorderedStreamsError} when the previous row is greater than the current one
     */
    ensureRowsAreInAscendingOrder(source, previous, current) {
        if (previous && current && previous !== current) {
            const oldDelta = this.comparer(this.keys, previous, current);
            if (oldDelta === 0) {
                const cols = this.keys.map(key => key.name);
                throw new UniqueKeyViolationError(`Expected rows to be unique by "${cols}" in ${source} source but received:\n previous=${previous}\n current=${current}\nNote that you can resolve this conflict automatically using the duplicateKeyHandling option.`);
            }
            if (oldDelta > 0) {
                const colOrder = this.keys.map(key => `${key.name} ${key.sortDirection ?? 'ASC'}`);
                throw new UnorderedStreamsError(`Expected rows to be ordered by "${colOrder}" in ${source} source but received:\n previous=${previous}\n current=${current}`);
            }
        }
    }
    /**
     * Checks the ordering of both sources between two consecutive pairs.
     * @param previous the previous pair of rows
     * @param current the current pair of rows
     * @throws {UniqueKeyViolationError}
     * @throws {UnorderedStreamsError}
     */
    ensurePairsAreInAscendingOrder(previous, current) {
        this.ensureRowsAreInAscendingOrder('old', previous.oldRow, current.oldRow);
        this.ensureRowsAreInAscendingOrder('new', previous.newRow, current.newRow);
    }
}
exports.DifferContext = DifferContext;
/**
 * Converts a key given as a plain column name into a column definition.
 * @param value a column name or a column definition
 * @returns "{ name: value }" for a string, otherwise the submitted value as is
 */
function asColumnDefinition(value) {
    return typeof value === 'string' ? { name: value } : value;
}
/**
 * Resolves the comparer of a column.
 * @param comparer 'string', 'number' or any other value (such as a custom comparer function)
 * @returns the built-in string or number comparer for the matching name, otherwise the submitted value as is
 */
function asColumnComparer(comparer) {
    switch (comparer) {
        case 'string':
            return formats_1.stringComparer;
        case 'number':
            return formats_1.numberComparer;
        default:
            return comparer;
    }
}
/**
 * Tells if two arrays have the same length and strictly equal items at every position.
 * @param a the first array
 * @param b the second array
 * @returns true when both arrays match item by item, false otherwise
 */
function sameArrays(a, b) {
    const size = a.length;
    if (size !== b.length) {
        return false;
    }
    // walk forward until the first mismatch
    let index = 0;
    while (index < size && a[index] === b[index]) {
        index += 1;
    }
    return index === size;
}
//# sourceMappingURL=differ.js.map