tabular-data-differ
Version:
A very efficient library for diffing two sorted streams of tabular data, such as CSV files.
340 lines (339 loc) • 11.7 kB
TypeScript
import { Filename } from "./streams";
import { Row, RowDiffFilter, ColumnComparer, SortDirection, RowComparer, FormatReader, FormatWriter, DiffStats, RowDiff, CsvFormatReaderOptions, CsvFormatWriterOptions, JsonFormatReaderOptions, JsonFormatWriterOptions, IterableFormatReaderOptions, BufferedFormatReader } from "./formats";
/**
* Error type listed under `@throws` by `Differ.to`, `DifferContext.to` and `DifferContext.diffs`.
* NOTE(review): presumably raised when a source is not sorted according to the configured keys — confirm against the implementation.
*/
export declare class UnorderedStreamsError extends Error {
}
/**
* Error thrown when duplicate rows are found in a source and `DifferOptions.duplicateKeyHandling`
* is left to its default ('fail').
* Listed under `@throws` by `DifferContext.to` and `DifferContext.diffs`.
*/
export declare class UniqueKeyViolationError extends Error {
}
/**
* A pair made of one row from the old source and one row from the new source.
* Both members are optional.
* NOTE(review): presumably a side is left undefined when the row exists only in the other source
* or when that source is exhausted — confirm against the implementation.
*/
export interface RowPair {
/**
* the row coming from the old source, if any.
*/
oldRow?: Row;
/**
* the row coming from the new source, if any.
*/
newRow?: Row;
}
/**
* An asynchronous function that takes no argument and resolves to a RowPair.
*/
export type RowPairProvider = () => Promise<RowPair>;
/**
* Options for configuring a source stream as a CSV stream.
* The `format` property must be 'csv'; all the other settings come from CsvFormatReaderOptions.
*/
export type CsvSource = {
format: 'csv';
} & CsvFormatReaderOptions;
/**
* Options for configuring a destination stream as a CSV stream.
* The `format` property must be 'csv'; all the other settings come from CsvFormatWriterOptions.
*/
export type CsvDestination = {
format: 'csv';
} & CsvFormatWriterOptions;
/**
* Options for configuring a source stream as a TSV stream.
* The `format` property must be 'tsv'; the other settings are the same CsvFormatReaderOptions
* used by CsvSource.
*/
export type TsvSource = {
format: 'tsv';
} & CsvFormatReaderOptions;
/**
* Options for configuring a destination stream as a TSV stream.
* The `format` property must be 'tsv'; the other settings are the same CsvFormatWriterOptions
* used by CsvDestination.
*/
export type TsvDestination = {
format: 'tsv';
} & CsvFormatWriterOptions;
/**
* Options for configuring a source stream as a JSON stream.
* The `format` property must be 'json'; all the other settings come from JsonFormatReaderOptions.
*/
export type JsonSource = {
format: 'json';
} & JsonFormatReaderOptions;
/**
* Options for configuring a destination stream as a JSON stream.
* The `format` property must be 'json'; all the other settings come from JsonFormatWriterOptions.
*/
export type JsonDestination = {
format: 'json';
} & JsonFormatWriterOptions;
/**
* Options for configuring a source as an iterable generator.
* The `format` property must be 'iterable'; all the other settings come from IterableFormatReaderOptions.
* Note that there is no matching destination type: 'iterable' is only available for sources.
*/
export type IterableSource = {
format: 'iterable';
} & IterableFormatReaderOptions;
/**
* Options for configuring a source as a custom format.
* The `format` property must be 'custom' and the caller supplies the FormatReader to use.
*/
export type CustomSource = {
format: 'custom';
/**
* the caller-provided reader used for this source.
*/
reader: FormatReader;
};
/**
* Options for configuring a destination as a custom format.
* The `format` property must be 'custom' and the caller supplies the FormatWriter to use.
*/
export type CustomDestination = {
format: 'custom';
/**
* the caller-provided writer used for this destination.
*/
writer: FormatWriter;
};
/**
* Options for configuring a source of data.
* This is a union discriminated by the `format` property ('csv', 'tsv', 'json', 'iterable' or 'custom').
*/
export type SourceOptions = CsvSource | TsvSource | JsonSource | IterableSource | CustomSource;
/**
* Options for configuring a destination of data.
* This is a union discriminated by the `format` property ('csv', 'tsv', 'json' or 'custom').
*/
export type DestinationOptions = CsvDestination | TsvDestination | JsonDestination | CustomDestination;
/**
* Options for configuring the output destination of the changes emitted by the Differ object.
*/
export interface OutputOptions {
/**
* Specifies where the changes should be sent: one of the standard outputs ('console' or 'null'),
* a Filename or a complete DestinationOptions object.
*/
destination: 'console' | 'null' | Filename | DestinationOptions;
/**
* Specifies if the output should also contain the rows that haven't changed.
*/
keepSameRows?: boolean;
/**
* Specifies the maximum number of differences that should be written to the output.
*/
changeLimit?: number;
/**
* Specifies a filter to select which changes should be sent to the output stream.
*/
filter?: RowDiffFilter;
/**
* Specifies a dictionary of key/value pairs that can provide custom metadata to the generated file.
*/
labels?: Record<string, string>;
}
/**
* Describes a column together with the way its values are compared and ordered.
* It can be used in place of a plain column name in `DifferOptions.keys`.
*/
export interface ColumnDefinition {
/**
* The name of the column.
*/
name: string;
/**
* Either a standard comparer ('string' or 'number') or a custom comparer.
*/
comparer?: 'string' | 'number' | ColumnComparer;
/**
* Specifies if the column is in ascending (ASC) or descending (DESC) order.
*/
order?: SortDirection;
}
/**
* A custom function that receives the accumulated duplicate rows and returns the row to keep.
* @see DifferOptions.duplicateKeyHandling
*/
export type DuplicateKeyHandler = (rows: Row[]) => Row;
/**
* The strategy applied when duplicate rows are found in a source: fail, keep the first row,
* keep the last row, or delegate the choice to a custom DuplicateKeyHandler.
* @see DifferOptions.duplicateKeyHandling
*/
export type DuplicateKeyHandling = 'fail' | 'keepFirstRow' | 'keepLastRow' | DuplicateKeyHandler;
/**
* Stats accumulated while parsing a source.
* Instances are exposed by `DifferContext.oldSourceStats` and `DifferContext.newSourceStats`.
* NOTE(review): only the declarations are visible here; the meaning of each member is inferred
* from its name and should be confirmed against the implementation.
*/
export declare class SourceStats {
// Counters, presumably maintained by the inc*() methods below — TODO confirm.
rows: number;
duplicateRows: number;
uniqueRows: number;
uniqueRowsWithDuplicates: number;
// Derived figures, presumably computed by calcStats() — TODO confirm.
duplicationPercent: number;
uniqueRowDuplicationPercent: number;
maxDuplicatesPerUniqueKey: number;
minDuplicatesPerUniqueKey: number;
averageDuplicatesPerUniqueKey: number;
// Presumably each of these increments the counter of the same name — TODO confirm.
incRows(): void;
incDuplicateRows(): void;
incUniqueRows(): void;
incUniqueRowsWithDuplicates(): void;
// NOTE(review): `value` looks like the number of duplicates found for one unique key — confirm.
incDuplicates(value: number): void;
// Presumably refreshes the derived figures from the counters — TODO confirm.
calcStats(): void;
}
/**
* Options for configuring the Differ object that will traverse two input streams in parallel in order to compare their rows
* and produce a change set.
*/
export interface DifferOptions {
/**
* Configures the old source, either with a Filename or with a complete SourceOptions object.
*/
oldSource: Filename | SourceOptions;
/**
* Configures the new source, either with a Filename or with a complete SourceOptions object.
*/
newSource: Filename | SourceOptions;
/**
* Configures the primary keys used to compare the rows between the old and new sources.
* Each key is either a column name or a ColumnDefinition.
*/
keys: (string | ColumnDefinition)[];
/**
* The list of columns to keep from the input sources. If not specified, all columns are selected.
*/
includedColumns?: string[];
/**
* The list of columns to exclude from the input sources.
*/
excludedColumns?: string[];
/**
* Specifies a custom row comparer.
*/
rowComparer?: RowComparer;
/**
* Specifies how to handle duplicate rows in a source.
* It will fail by default and throw a UniqueKeyViolationError exception.
* But you can keep the first or last row, or even provide your own function that will receive the duplicates and select the best candidate.
* @default fail
* @see duplicateRowBufferSize
*/
duplicateKeyHandling?: DuplicateKeyHandling;
/**
* Specifies the maximum size of the buffer used to accumulate duplicate rows.
* Note that the buffer size matters only when you provide a custom function to the duplicateKeyHandling, since it will receive the accumulated duplicates
* as an input parameter.
* @default 1000
* @see duplicateKeyHandling
*/
duplicateRowBufferSize?: number;
/**
* Specifies if we can remove the first entries of the buffer to continue adding new duplicate entries when reaching maximum capacity,
* to avoid throwing an error and halting the process.
* Note that the buffer size matters only when you provide a custom function to the duplicateKeyHandling, since it will receive the accumulated duplicates
* as an input parameter.
* @default false
* @see duplicateRowBufferSize
* @see duplicateKeyHandling
*/
duplicateRowBufferOverflow?: boolean;
}
/**
* Creates a new differ object allowing you to compare two input streams and eventually send the changes to a specific output.
* @param options the options required to compare two streams
* @returns a Differ instance
* @example
* import { diff } from 'tabular-data-differ';
* const stats = await diff({
* oldSource: './tests/a.csv',
* newSource: './tests/b.csv',
* keys: ['id'],
* }).to('console');
* console.log(stats);
*/
export declare function diff(options: DifferOptions): Differ;
/**
* Entry point for comparing two sources, configured with a DifferOptions object.
* Use "start" to get a DifferContext, or "to" to send the changes directly to an output.
*/
export declare class Differ {
private options;
/**
* @param options the options required to compare two streams
*/
constructor(options: DifferOptions);
/**
* Starts a diff session.
* The "DifferContext[OpenSymbol]" documentation states that it is automatically called by this method
* in order to open the input streams (old and new) and read the headers.
* @returns a promise resolving to the DifferContext of the session
*/
start(): Promise<DifferContext>;
/**
* Iterates over the changes and sends them to the submitted output.
* @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
* @returns the change stats once all the changes have been processed.
* Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
* as the differ stats are updated by the iterator which doesn't have any filter.
* @throws {UnorderedStreamsError}
* @example
* import { diff } from 'tabular-data-differ';
* const stats = await diff({
* oldSource: './tests/a.csv',
* newSource: './tests/b.csv',
* keys: ['id'],
* }).to('console');
* console.log(stats);
*/
to(options: 'console' | 'null' | Filename | OutputOptions): Promise<DiffStats>;
}
/**
* Unique symbol used as the name of the DifferContext method that opens the input streams.
* It is not exported from this module.
*/
declare const OpenSymbol: unique symbol;
/**
* Holds the state of a diff session between the old and new sources.
* An instance is obtained from "Differ.start"; it exposes the column names, the stats
* and the methods used to enumerate the changes ("diffs") or send them to an output ("to").
*/
export declare class DifferContext {
private options;
private _stats;
private _columnNames;
private _isOpen;
private _isClosed;
private oldSource;
private newSource;
private comparer;
private keys;
private _columns;
private columnsWithoutKeys;
private normalizeOldRow;
private normalizeNewRow;
private duplicateKeyHandling;
private duplicateRowBufferSize;
private _oldSourceStats;
private _newSourceStats;
/**
* @param options the options required to compare two streams
*/
constructor(options: DifferOptions);
/**
* Opens the input streams (old and new) and reads the headers.
* This is an internal method that will be automatically called by "Differ.start" method.
*/
[OpenSymbol](): Promise<void>;
/**
* Closes the input streams.
* This will be automatically called by the "diffs" or "to" methods.
* This does nothing if the streams are not open.
*/
close(): void;
/**
* Tells if the input streams are open or not.
*/
get isOpen(): boolean;
/**
* Gets the normalized column names from the old and new streams, according to the includedColumns/excludedColumns constraints.
* @returns a list of column names
*/
get columns(): string[];
/**
* Gets the diff stats.
* @returns the diff stats
*/
get stats(): DiffStats;
/**
* Gets the stats accumulated while parsing the old source.
* @returns the source stats
*/
get oldSourceStats(): SourceStats;
/**
* Gets the stats accumulated while parsing the new source.
* @returns the source stats
*/
get newSourceStats(): SourceStats;
/**
* Iterates over the changes and sends them to the submitted output.
* @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
* @returns the change stats once all the changes have been processed.
* Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
* as the context stats are updated by the iterator which doesn't have any filter.
* @throws {UnorderedStreamsError}
* @throws {UniqueKeyViolationError}
* @example
* import { diff } from 'tabular-data-differ';
* const stats = await diff({
* oldSource: './tests/a.csv',
* newSource: './tests/b.csv',
* keys: ['id'],
* }).to('console');
* console.log(stats);
*/
to(options: 'console' | 'null' | Filename | OutputOptions): Promise<DiffStats>;
/**
* Enumerates the differences between two input streams (old and new).
* This is an async generator: it must be consumed with "for await".
* @yields {RowDiff}
* @throws {UnorderedStreamsError}
* @throws {UniqueKeyViolationError}
* @example
* import { diff, ArrayInputStream } from 'tabular-data-differ';
* const ctx = await diff({
* oldSource: {
* format: 'csv',
* stream: new ArrayInputStream([
* 'id,name',
* '1,john',
* '2,mary',
* ]),
* },
* newSource: {
* format: 'csv',
* stream: new ArrayInputStream([
* 'id,name',
* '1,john',
* '3,sarah',
* ]),
* },
* keys: ['id'],
* }).start();
* console.log('columns:', ctx.columns);
* for await (const rowDiff of ctx.diffs()) {
* console.log(rowDiff);
* }
* console.log('stats:', ctx.stats);
*/
diffs(): AsyncGenerator<RowDiff, void, unknown>;
private extractHeaders;
private normalizeColumns;
private extractKeys;
/**
* NOTE(review): not marked private, but this looks like an internal helper.
* Presumably reads from the source the rows sharing the same key as the submitted row
* and updates the submitted stats — confirm against the implementation.
* @param source the buffered reader to read from
* @param stats the source stats associated with this reader
* @param row the reference row
* @returns a promise resolving to a list of rows
*/
readDuplicatesOf(source: BufferedFormatReader, stats: SourceStats, row: Row): Promise<Row[]>;
/**
* NOTE(review): not marked private, but this looks like an internal helper.
* Presumably reads the next row of the source and applies the duplicate key handling — confirm against the implementation.
* @param source the buffered reader to read from
* @param stats the source stats associated with this reader
* @returns a promise resolving to a row, or to undefined (presumably once the source is exhausted)
*/
getNextRow(source: BufferedFormatReader, stats: SourceStats): Promise<Row | undefined>;
private getNextOldRow;
private getNextNewRow;
private getNextPair;
private evalPair;
private ensureRowsAreInAscendingOrder;
private ensurePairsAreInAscendingOrder;
}
/**
* Compares two arrays of strings.
* NOTE(review): presumably returns true when both arrays hold the same items; whether the order
* of the items matters cannot be told from the declaration — confirm against the implementation.
* @param a the first array
* @param b the second array
* @returns a boolean telling if the arrays are considered the same
*/
export declare function sameArrays(a: string[], b: string[]): boolean;
// Empty export: keeps this file a module, so non-exported declarations stay local to it.
export {};