UNPKG

tabular-data-differ

Version:

A very efficient library for diffing two sorted streams of tabular data, such as CSV files.

340 lines (339 loc) 11.7 kB
import { Filename } from "./streams";
import { Row, RowDiffFilter, ColumnComparer, SortDirection, RowComparer, FormatReader, FormatWriter, DiffStats, RowDiff, CsvFormatReaderOptions, CsvFormatWriterOptions, JsonFormatReaderOptions, JsonFormatWriterOptions, IterableFormatReaderOptions, BufferedFormatReader } from "./formats";
/**
 * Error listed in the `@throws` clauses of `Differ.to`, `DifferContext.to` and `DifferContext.diffs`.
 * NOTE(review): presumably raised when a source is not sorted according to the configured keys - confirm against the implementation.
 */
export declare class UnorderedStreamsError extends Error {
}
/**
 * Error thrown when duplicate rows are found in a source and `DifferOptions.duplicateKeyHandling` is 'fail' (the default).
 */
export declare class UniqueKeyViolationError extends Error {
}
/**
 * A pair made of an optional row from the old source and an optional row from the new source.
 * NOTE(review): presumably a missing side means that the corresponding source has no row for this step - confirm against the implementation.
 */
export interface RowPair {
    oldRow?: Row;
    newRow?: Row;
}
/**
 * An asynchronous function returning a pair of old/new rows.
 */
export type RowPairProvider = () => Promise<RowPair>;
/**
 * Options for configuring a source stream as a CSV stream
 */
export type CsvSource = {
    format: 'csv';
} & CsvFormatReaderOptions;
/**
 * Options for configuring a destination stream as a CSV stream
 */
export type CsvDestination = {
    format: 'csv';
} & CsvFormatWriterOptions;
/**
 * Options for configuring a source stream as a TSV stream
 */
export type TsvSource = {
    format: 'tsv';
} & CsvFormatReaderOptions;
/**
 * Options for configuring a destination stream as a TSV stream
 */
export type TsvDestination = {
    format: 'tsv';
} & CsvFormatWriterOptions;
/**
 * Options for configuring a source stream as a JSON stream
 */
export type JsonSource = {
    format: 'json';
} & JsonFormatReaderOptions;
/**
 * Options for configuring a destination stream as a JSON stream
 */
export type JsonDestination = {
    format: 'json';
} & JsonFormatWriterOptions;
/**
 * Options for configuring a source as an iterable generator
 */
export type IterableSource = {
    format: 'iterable';
} & IterableFormatReaderOptions;
/**
 * Options for configuring a source as a custom format
 */
export type CustomSource = {
    format: 'custom';
    reader: FormatReader;
};
/**
 * Options for configuring a destination as a custom format
 */
export type CustomDestination = {
    format: 'custom';
    writer: FormatWriter;
};
/**
 * Options for configuring a source of data.
 * This is a union discriminated by the "format" property.
 */
export type SourceOptions = CsvSource | TsvSource | JsonSource | IterableSource | CustomSource;
/**
 * Options for configuring a destination of data.
 * This is a union discriminated by the "format" property.
 */
export type DestinationOptions = CsvDestination | TsvDestination | JsonDestination | CustomDestination;
/**
 * Options for configuring the output destination of the changes emitted by the Differ object
 */
export interface OutputOptions {
    /**
     * Specifies where the changes should be sent: a standard output ('console' or 'null'), a filename or some custom destination options.
     */
    destination: 'console' | 'null' | Filename | DestinationOptions;
    /**
     * Specifies if the output should also contain the rows that haven't changed.
     */
    keepSameRows?: boolean;
    /**
     * Specifies a maximum number of differences that should be outputted.
     */
    changeLimit?: number;
    /**
     * Specifies a filter to select which changes should be sent to the output stream.
     */
    filter?: RowDiffFilter;
    /**
     * Specifies a dictionary of key/value pairs that can provide custom metadata to the generated file.
     */
    labels?: Record<string, string>;
}
/**
 * Describes a column and how its values should be compared and ordered.
 */
export interface ColumnDefinition {
    /**
     * the name of the column.
     */
    name: string;
    /**
     * either a standard comparer ('string' or 'number') or a custom comparer.
     */
    comparer?: 'string' | 'number' | ColumnComparer;
    /**
     * specifies if the column is in ascending (ASC) or descending (DESC) order.
     */
    order?: SortDirection;
}
/**
 * A custom function that receives the accumulated duplicate rows and returns the row that should be kept.
 * @see DifferOptions.duplicateKeyHandling
 */
export type DuplicateKeyHandler = (rows: Row[]) => Row;
/**
 * The strategy applied to duplicate rows found in a source: fail, keep the first row, keep the last row,
 * or delegate the choice to a custom function.
 * @see DifferOptions.duplicateKeyHandling
 */
export type DuplicateKeyHandling = 'fail' | 'keepFirstRow' | 'keepLastRow' | DuplicateKeyHandler;
/**
 * Stats accumulated while parsing a source (see `DifferContext.oldSourceStats` and `DifferContext.newSourceStats`).
 * NOTE(review): the exact definition of each counter and of the derived percent/min/max/average values
 * is not visible from this declaration - confirm against the implementation.
 */
export declare class SourceStats {
    rows: number;
    duplicateRows: number;
    uniqueRows: number;
    uniqueRowsWithDuplicates: number;
    duplicationPercent: number;
    uniqueRowDuplicationPercent: number;
    maxDuplicatesPerUniqueKey: number;
    minDuplicatesPerUniqueKey: number;
    averageDuplicatesPerUniqueKey: number;
    incRows(): void;
    incDuplicateRows(): void;
    incUniqueRows(): void;
    incUniqueRowsWithDuplicates(): void;
    incDuplicates(value: number): void;
    /**
     * NOTE(review): presumably refreshes the derived fields from the accumulated counters - confirm against the implementation.
     */
    calcStats(): void;
}
/**
 * Options for configuring the Differ object that will traverse two input streams in parallel in order to compare their rows
 * and produce a change set.
 */
export interface DifferOptions {
    /**
     * Configures the old source
     */
    oldSource: Filename | SourceOptions;
    /**
     * Configures the new source
     */
    newSource: Filename | SourceOptions;
    /**
     * Configures the primary keys used to compare the rows between the old and new sources
     */
    keys: (string | ColumnDefinition)[];
    /**
     * the list of columns to keep from the input sources. If not specified, all columns are selected.
     */
    includedColumns?: string[];
    /**
     * the list of columns to exclude from the input sources.
     */
    excludedColumns?: string[];
    /**
     * Specifies a custom row comparer
     */
    rowComparer?: RowComparer;
    /**
     * specifies how to handle duplicate rows in a source.
     * It will fail by default and throw a UniqueKeyViolationError exception.
     * But you can keep the first or last row, or even provide your own function that will receive the duplicates and select the best candidate.
     * @default fail
     * @see duplicateRowBufferSize
     */
    duplicateKeyHandling?: DuplicateKeyHandling;
    /**
     * specifies the maximum size of the buffer used to accumulate duplicate rows.
     * Note that the buffer size matters only when you provide a custom function to the duplicateKeyHandling, since it will receive the accumulated duplicates
     * as an input parameter.
     * @default 1000
     * @see duplicateKeyHandling
     */
    duplicateRowBufferSize?: number;
    /**
     * specifies if we can remove the first entries of the buffer to continue adding new duplicate entries when reaching maximum capacity,
     * to avoid throwing an error and halting the process.
     * Note that the buffer size matters only when you provide a custom function to the duplicateKeyHandling, since it will receive the accumulated duplicates
     * as an input parameter.
     * @default false
     * @see duplicateRowBufferSize
     */
    duplicateRowBufferOverflow?: boolean;
}
/**
 * Creates a new differ object allowing you to compare two input streams and eventually send the changes to a specific output.
 * @param options the options required to compare two streams
 * @returns a Differ instance
 * @example
 * import { diff } from 'tabular-data-differ';
 * const stats = await diff({
 *   oldSource: './tests/a.csv',
 *   newSource: './tests/b.csv',
 *   keys: ['id'],
 * }).to('console');
 * console.log(stats);
 */
export declare function diff(options: DifferOptions): Differ;
/**
 * Entry point object created by the "diff" function.
 * It can either start a DifferContext for iterating over the changes, or send the changes directly to an output.
 */
export declare class Differ {
    private options;
    constructor(options: DifferOptions);
    /**
     * Creates a DifferContext for the configured sources.
     * According to the DifferContext documentation, the input streams are opened and the headers are read by this method.
     * @returns a promise resolving to the context
     */
    start(): Promise<DifferContext>;
    /**
     * Iterates over the changes and sends them to the submitted output.
     * @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
     * @returns the change stats once all the changes have been processed.
     * Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
     * as the differ stats are updated by the iterator which doesn't have any filter.
     * @throws {UnorderedStreamsError}
     * @example
     * import { diff } from 'tabular-data-differ';
     * const stats = await diff({
     *   oldSource: './tests/a.csv',
     *   newSource: './tests/b.csv',
     *   keys: ['id'],
     * }).to('console');
     * console.log(stats);
     */
    to(options: 'console' | 'null' | Filename | OutputOptions): Promise<DiffStats>;
}
/**
 * Symbol used as the name of the method of DifferContext that opens the input streams.
 * It is declared but not exported by this module.
 */
declare const OpenSymbol: unique symbol;
/**
 * The context of a diff between an old source and a new source, as returned by "Differ.start".
 * It exposes the column names, the stats, and the methods for enumerating or outputting the changes.
 */
export declare class DifferContext {
    private options;
    private _stats;
    private _columnNames;
    private _isOpen;
    private _isClosed;
    private oldSource;
    private newSource;
    private comparer;
    private keys;
    private _columns;
    private columnsWithoutKeys;
    private normalizeOldRow;
    private normalizeNewRow;
    private duplicateKeyHandling;
    private duplicateRowBufferSize;
    private _oldSourceStats;
    private _newSourceStats;
    constructor(options: DifferOptions);
    /**
     * Opens the input streams (old and new) and reads the headers.
     * This is an internal method that will be automatically called by "Differ.start" method.
     */
    [OpenSymbol](): Promise<void>;
    /**
     * Closes the input streams.
     * This will be automatically called by the "diffs" or "to" methods.
     * This does nothing if the streams are not open.
     */
    close(): void;
    /**
     * tells if the input streams are open or not
     */
    get isOpen(): boolean;
    /**
     * gets the normalized column names from the old and new streams, according to the includedColumns/excludedColumns constraints.
     * @returns a list of column names
     */
    get columns(): string[];
    /**
     * gets the diff stats
     * @returns the diff stats
     */
    get stats(): DiffStats;
    /**
     * gets the stats accumulated while parsing the old source
     * @returns the source stats
     */
    get oldSourceStats(): SourceStats;
    /**
     * gets the stats accumulated while parsing the new source
     * @returns the source stats
     */
    get newSourceStats(): SourceStats;
    /**
     * Iterates over the changes and sends them to the submitted output.
     * @param options a standard output such as console or null, a string filename, a URL or a custom OutputOptions.
     * @returns the change stats once all the changes have been processed.
     * Note that the stats might be different from "DifferContext.stats" when there is a filter in the output options,
     * as the context stats are updated by the iterator which doesn't have any filter.
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff } from 'tabular-data-differ';
     * const ctx = await diff({
     *   oldSource: './tests/a.csv',
     *   newSource: './tests/b.csv',
     *   keys: ['id'],
     * }).start();
     * const stats = await ctx.to('console');
     * console.log(stats);
     */
    to(options: 'console' | 'null' | Filename | OutputOptions): Promise<DiffStats>;
    /**
     * Enumerates the differences between two input streams (old and new).
     * Since this is an async generator, it must be consumed with "for await".
     * @yields {RowDiff}
     * @throws {UnorderedStreamsError}
     * @throws {UniqueKeyViolationError}
     * @example
     * import { diff, ArrayInputStream } from 'tabular-data-differ';
     * const ctx = await diff({
     *   oldSource: {
     *     format: 'csv',
     *     stream: new ArrayInputStream([
     *       'id,name',
     *       '1,john',
     *       '2,mary',
     *     ]),
     *   },
     *   newSource: {
     *     format: 'csv',
     *     stream: new ArrayInputStream([
     *       'id,name',
     *       '1,john',
     *       '3,sarah',
     *     ]),
     *   },
     *   keys: ['id'],
     * }).start();
     * console.log('columns:', ctx.columns);
     * for await (const rowDiff of ctx.diffs()) {
     *   console.log(rowDiff);
     * }
     * console.log('stats:', ctx.stats);
     */
    diffs(): AsyncGenerator<RowDiff, void, unknown>;
    private extractHeaders;
    private normalizeColumns;
    private extractKeys;
    /**
     * NOTE(review): not marked private, but presumably an internal helper that reads from the given source
     * the rows duplicating the given row - confirm against the implementation before relying on it.
     */
    readDuplicatesOf(source: BufferedFormatReader, stats: SourceStats, row: Row): Promise<Row[]>;
    /**
     * NOTE(review): not marked private, but presumably an internal helper that reads the next row of the given source,
     * resolving to undefined when there is no more row - confirm against the implementation before relying on it.
     */
    getNextRow(source: BufferedFormatReader, stats: SourceStats): Promise<Row | undefined>;
    private getNextOldRow;
    private getNextNewRow;
    private getNextPair;
    private evalPair;
    private ensureRowsAreInAscendingOrder;
    private ensurePairsAreInAscendingOrder;
}
/**
 * Compares two arrays of strings.
 * NOTE(review): presumably returns true when both arrays hold the same items; whether the order of the items matters
 * cannot be determined from this declaration - confirm against the implementation.
 */
export declare function sameArrays(a: string[], b: string[]): boolean;
export {};