@asmartbear/diff-merge
Version:
Text and arbitrary-array diff and merge, fast, multiple algorithms
510 lines (509 loc) • 24.6 kB
TypeScript
/**
* Returns the length of the common prefix of the two strings, i.e. the number of leading characters that
* both strings share. Returns 0 if there is no common prefix, or if at least one argument is null or undefined
* (which includes omitting `b`, since it is optional).
*
* Contains a few optimizations (that benchmarks indicate are, in fact, useful), and is Unicode-safe.
*/
export declare function getCommonPrefixLength(a: string | null | undefined, b?: string | null | undefined): number;
/**
* Holds an array with offset and length, so we can create a pseudo-array "view" into an array without
* making large-array copies or allocations. The "end" is inclusive!
*/
export declare class ArrayView<T> {
/** The underlying array that this view looks into. */
readonly a: T[];
/** NOTE(review): presumably the index within `a` of the first element of this view -- confirm. */
readonly start: number;
/** Index within `a` of the end of this view; per the class comment, the end is inclusive. */
readonly end: number;
/**
* Creates a view over `a`. Both `start` and `end` are optional.
* NOTE(review): the defaults are not visible in this declaration; presumably they cover the whole array -- confirm.
*/
constructor(a: T[], start?: number, end?: number);
/**
* Creates a view for element type `T` that, going by the name, is empty (see `empty`).
*/
static createEmpty<T>(): ArrayView<T>;
/**
* The number of elements in this view.
*/
get length(): number;
/**
* True if this view is empty, i.e. represents the empty array, and has length 0.
*/
get empty(): boolean;
/**
* Like Array.indexOf(), but accepts an optional start and end, both of which are relative to the actual start
* of the underlying array (not to this view); the end is inclusive.
* NOTE(review): presumably the result is also an absolute index, or -1 if not found, as with `indexOfSubstring()` -- confirm.
*/
indexOf(target: T, start?: number, end?: number): number;
/**
* Same behavior as String#indexOf: Find another ArrayView as a consecutive substring on this array, returning the
* (absolute) index of the first such position, or -1 if not found.
*/
indexOfSubstring(target: ArrayView<T>, start?: number, end?: number): number;
/**
* Creates a copy of this subsection of the array, which is therefore modifiable, and allocates memory.
*/
getCopy(): T[];
/**
* Like Array.forEach(), but on this view of the array, and more restrictions on the callback function:
* the callback receives only the element (no index or array argument).
*/
forEach(f: (el: T) => void): void;
/**
* Like Array.map(), but on this view of the array, and more restrictions on the callback function:
* the callback receives only the element, and must return a value of the same element type `T`.
*/
map(f: (el: T) => T): T[];
/**
* Like Array.map(), but on this view of the array, and the mapping function converts to a string.
*/
mapToString(f: (el: T) => string): string[];
/**
* Pushes all of the elements from this view onto the end of the given array.
*/
pushAll(arry: T[]): void;
/**
* Retrieves an element from the array, relative to the start of this view.
*/
getElement(relative_index: number): T;
/**
* Return an ArrayView that is a subsequence, starting at an offset relative to `this`, and with a given length.
* If the result would be identical to `this`, then `this` itself is returned without allocating a new view.
* If length is missing, the subsequence goes to the end.
*/
getSubsequence(relative_start: number, relative_length?: number): ArrayView<T>;
/**
* Gets an array in reverse order, just the elements inside this view, as a new view.
* This allocates memory.
*/
getReverse(): ArrayView<T>;
/**
* Returns a string representation of this view. The exact format is not visible in this declaration.
*/
toString(): string;
/**
* True if this subsequence is equal to the other subsequence, false otherwise.
*/
equals(that: ArrayView<T>): boolean;
/**
* Returns -1, 0, or 1, indicating whether `this` is less, equal, or greater than `that`, comparing like strings, e.g. one element at a time,
* returning the answer if they're unequal, and if the shorter is equal to the prefix of the longer, the shorter is deemed smaller.
*/
compare(that: ArrayView<T>): -1 | 0 | 1;
/**
* Concatenates a sequence onto this one, NOT changing the current one, but rather returning a new result.
* This is akin to Array#concat() with just one parameter.
* Various optimizations prevent allocating memory whenever possible.
*/
concat(that: ArrayView<T>): ArrayView<T>;
/**
* Returns the number of elements that are common to the first elements in both arrays.
* Each array can be any size, including empty, but cannot be null or undefined.
*/
getLengthOfCommonPrefix(that: ArrayView<T>): number;
/**
* Returns the number of elements that are common to the last elements in both arrays.
* Each array can be any size, including empty, but cannot be null or undefined.
*/
getLengthOfCommonSuffix(that: ArrayView<T>): number;
/**
* Find the longest length L of a suffix of `this` which overlaps with a prefix of `that` also of length L.
* Each array can be any size, including empty, but not null or undefined. Returns 0 if there is no commonality.
*/
getLengthOfOverlapAtMyEnd(that: ArrayView<T>): number;
/**
* Returns a pair of ArrayViews containing the longest common substring in `ths` and `tht` respectively.
* Both views in the pair will be empty if there's no common substring.
*
* This particular algorithm is slow -- O(N*M) -- but always finds the right answer, is simple, it approaches
* O(N) if the longest substring is long, and it uses only O(1) additional memory.
* It uses a few techniques to speed up, for finding the first element of a substring,
* and by anticipating that a found-substring might be extended in-place to speed up the best-so-far length.
* In short, the N-loop is slow but the M-loop is fast, and further, it orders strings such that N <= M.
*/
static getLongestCommonSubstringGrowingLinearScan<T>(ths: ArrayView<T>, tht: ArrayView<T>): [ArrayView<T>, ArrayView<T>];
/**
* Returns a pair of ArrayViews containing the longest common substring in `ths` and `tht` respectively.
* Both views in the pair will be empty if there's no common substring.
*
* This algorithm uses an "optimistic" method, in which if there are long common substrings (relative to the
* shortest string), it will be found quickly, in O(M) comparisons. It is still O(NM) in the worst case,
* which is the lack of a common substring, or a common substring of 1 character.
*/
static getLongestCommonSubstringOptimisticBisect<T>(ths: ArrayView<T>, tht: ArrayView<T>): [ArrayView<T>, ArrayView<T>];
/**
* Finds the longest common substring between two strings, returning the offset in the two input strings
* (`a_offset` into `a`, `b_offset` into `b`), and the length (`len`).
*
* This algorithm uses an "optimistic" method, in which if there are long common substrings (relative to the
* shortest string), it will be found quickly, in O(M) comparisons. It is still O(NM) in the worst case,
* which is the lack of a common substring, or a common substring of 1 character.
*/
static getLongestCommonSubstringOptimisticBisectString(a: string, b: string): {
a_offset: number;
b_offset: number;
len: number;
};
/**
* Returns the index coordinates of the middle of the longest common subsequence between the two arrays.
* The longest subsequence is the longest set of ordered (but not necessarily consecutive) equal elements
* from both arrays. The middle is often but not always along a common substring. Even when it is a common
* substring, it is often not the globally-longest common substring.
*
* The index is relative to the start of each ArrayView, not relative to the absolute position inside the
* underlying array.
*
* If there is no common subsequence whatsoever, `null` is returned.
*/
getLongestCommonSubsequenceMiddleMyers(that: ArrayView<T>): [number, number] | null;
/**
* Breaks a string into individual characters, and returns an array view of those characters.
*/
static fromCharacters(str: string): ArrayView<string>;
/**
* Breaks a string into tokens based on any regular expression, and returns an array view of those tokens.
*/
static fromTokens(str: string, re: RegExp): ArrayView<string>;
/**
* Tokenizes a string assuming the use-case is plain-text prose.
*/
static fromPlainProse(str: string): ArrayView<string>;
/**
* Tokenizes a string by lines. Includes the line-ending character as well.
*/
static fromLines(str: string): ArrayView<string>;
}
/**
* The per-symbol record stored in a `Histogram`, which counts the number of times each token appears and
* the location of its first appearance.
*/
declare type SymbolInformation = {
/** The symbol's ID; the same kind of key (`string | number`) that indexes `Histogram.histogram`. */
id: string | number;
/** NOTE(review): presumably the number of times the symbol appears in the input -- confirm. */
count: number;
/** NOTE(review): presumably the location of the first instance of the symbol in the input -- confirm. */
first_offset: number;
};
/**
* Generates a histogram from a set of tokens, counting the number of times each appears, and the location
* that the first instance of that token appears.
*/
export declare class Histogram<T> {
/**
* Map of each unique symbol's ID to its information record.
*/
readonly histogram: Record<string | number, SymbolInformation>;
/**
* Map of the relative-position in the original input, to the symbol ID, plus an optional `other_offset`
* field that can be used in algorithms.
* NOTE(review): the original comment was truncated mid-sentence; presumably `other_offset` holds a matching
* position in another sequence -- confirm against the implementation.
*/
readonly ordered: {
id: string | number;
other_offset?: number;
}[];
/**
* Builds the histogram from the elements of the given view.
*/
constructor(input: ArrayView<T>);
/**
* Given an element from the original input, returns its histogram record, or `undefined` if it's not present in the original input
*/
lookupElement(el: T): SymbolInformation | undefined;
/**
* Converts any type of element into a unique ID that is used as a key in the histogram array.
*/
private getIdForElement;
}
/**
* A single edit, pairing a subsequence of the "previous" sequence (`prev`) with a subsequence of the "next"
* sequence (`next`). The predicates below classify an edit as an equality, a pure insertion, a pure deletion,
* or a modification.
*/
export declare class Edit<T> {
/** The content on the "previous" side of this edit. */
readonly prev: ArrayView<T>;
/** The content on the "next" side of this edit. */
readonly next: ArrayView<T>;
constructor(prev: ArrayView<T>, next: ArrayView<T>);
/**
* Creates an edit representing an equality over `keep`.
* NOTE(review): presumably `keep` is used for both `prev` and `next` -- confirm.
*/
static createEquality<T>(keep: ArrayView<T>): Edit<T>;
/**
* Creates an edit representing the insertion of `ins`, with no deletion.
* NOTE(review): presumably `prev` is an empty view -- confirm.
*/
static createPureInsertion<T>(ins: ArrayView<T>): Edit<T>;
/**
* Creates an edit representing the deletion of `del`, with no insertion.
* NOTE(review): presumably `next` is an empty view -- confirm.
*/
static createPureDeletion<T>(del: ArrayView<T>): Edit<T>;
/**
* True if this edit represents equal subsequences in the previous and next.
*
* @readonly
*/
isEquality(): boolean;
/**
* True if this edit is an insertion, with no deletion.
*
* @readonly
*/
isPureInsertion(): boolean;
/**
* True if this edit is a deletion, with no insertion.
*
* @readonly
*/
isPureDeletion(): boolean;
/**
* True if this edit is a modification, i.e. both a non-trivial delete and a non-trivial insert at the same location.
*
* @readonly
*/
isModification(): boolean;
/**
* Creates and returns a new Edit object, that is the same as this one, but the opposite.
* Inserts become deletes. Equalities are returned without creating a new Edit object.
*
* @readonly
*/
getConverse(): Edit<T>;
/**
* Returns a human-readable, but not machine-usable, representation of this edit
*
* @param matched {boolean} if true, "equality" operations are surrounded by parentheses to be explicit, otherwise they are plain for easier readability
*/
toString(matched?: boolean): string;
}
/**
* An ordered list of `Edit` objects, from which both the "previous" and the "next" sequence can be
* recreated (see `getPrev()` and `getNext()`), together with operations that visit, rewrite, and
* normalize the edits in place.
*/
export declare class EditScript<T> {
/** The list of edits that makes up this script. */
private readonly edits;
constructor();
/** NOTE(review): presumably the number of edits in the script -- confirm. */
get length(): number;
/** NOTE(review): presumably true when the script contains no edits -- confirm. */
get empty(): boolean;
/**
* Adds an edit to the end of the script.
* NOTE(review): the returned script is presumably `this`, for chaining -- confirm.
*/
append(e: Edit<T>): EditScript<T>;
/**
* Adds an edit to the start of the script.
* NOTE(review): the returned script is presumably `this`, for chaining -- confirm.
*/
prepend(e: Edit<T>): EditScript<T>;
/**
* Returns a human-readable, but not machine-usable, representation of the entire edit script.
*
* @param matched {boolean} if true, "equality" operations are surrounded by parentheses to be explicit, otherwise they are plain for easier readability
*/
toString(matched?: boolean): string;
/**
* Given the output of `EditScript<T>.toString()`, parses and returns the result as a string-typed EditScript.
*/
static fromString(s: string): EditScript<string>;
/* NOTE(review): the three `re_*` members are presumably the patterns used by `fromString()` -- confirm. */
private static re_delete;
private static re_insert;
private static re_equal;
/**
* Recreate the "previous" sequence using only edits, concatenating back to an array.
*/
getPrev(): T[];
/**
* Recreate the "next" sequence using only edits, concatenating back to an array.
*/
getNext(): T[];
/**
* Creates and returns a new EditScript, that is the same as this one, but goes the opposite direction.
* Inserts become deletes.
*/
getConverse(): EditScript<T>;
/**
* Creates a string, assuming the script represents strings of text, emitting lines with a gutter
* of '+' for insertion, '-' for deletion, or ' ' for equality.
*/
getScriptAsFormattedLines(): string;
/**
* Visits all Edits, in forward order (which disallows changing the edit list while iterating).
*/
visitEditsForward(f_visit: (edit: Edit<T>) => void): void;
/**
* Visits all Edits, in reverse order (which allows for changes to the underlying edit script).
*
* The callback function can return null to indicate that no change should be made in the edit script,
* or it can return an array which replaces the visited edit completely. If the edit should be
* preserved, just include it in the array. It is legal for the array to be any length, including empty.
*/
visitEdits(f_visit: (edit: Edit<T>) => Edit<T>[] | null): void;
/**
* Visits all pairs of Edits, in reverse order (which allows for changes to the underlying edit script).
* Won't visit anything if there's just one Edit.
*
* The callback function can return null to indicate that no change should be made in the edit script,
* or it can return an array which replaces the two edits completely. If some of the edits should be
* preserved, just include them in the array. It is legal for the array to be any length, including empty.
*/
visitEditPairs(f_visit: (left: Edit<T>, right: Edit<T>) => Edit<T>[] | null): void;
/**
* Visits all trios of Edits, in reverse order (which allows for changes to the underlying edit script).
* Won't visit anything if there are fewer than three Edits.
*
* The callback function can return null to indicate that no change should be made in the edit script,
* or it can return an array which replaces the three edits completely. If some of the edits should be
* preserved, just include them in the array. It is legal for the array to be any length, including empty.
*/
visitEditTrios(f_visit: (left: Edit<T>, middle: Edit<T>, right: Edit<T>) => Edit<T>[] | null): void;
/**
* Scans for an Equality edit (the "middle"), surrounded on both sides by either two insertions, two deletions,
* or one modification and any change on the other side. These are the conditions in which it is
* legal to join the left, middle, and right edits into a single edit. While the resulting script is
* identical, it reduces the total number of edits in a way that might be preferable, usually for semantic
* reasons. For example, a single space separating changes to words on either side, probably should be
* folded into a single semantically-meaningful change.
*
* A callback function is consulted to determine whether this transformation should actually be executed.
* The callback is provided the Middle content, and is called only if the surrounding Edits are valid for
* this transformation.
*/
collapseMiddleEquality(f_should_collapse: (middle: ArrayView<T>) => boolean): void;
/**
* Attempts to further simplify all non-pure modifications, changing the script in-place.
* Returns true if any were transformed, otherwise returns false.
*
* Supply a preconfigured engine to execute simplifications.
*/
reeditModifications(simplification_engine: Engine<T>): boolean;
/**
* Reduces edits until they alternate between equality and modification.
* Returns true if there was at least one coalescing of a modification-style edit.
*/
coalesce(): boolean;
/**
* Rewrites Edit records to shift rightward when it is possible to do so without changing the result.
* This is a way to normalize output, as well as often semantically better, as it makes insertions closer
* to an append. For example, with "Hello." vs "Hello...", you could report "Hello{..}." but "Hello.{..}"
* is more likely to be what is intended.
*
* It is possible that an equality is completely eliminated as a result of this. If that happens, this will
* automatically run a coalesce() to restore the system to its normal state.
*
* This will also shift edits leftward in the special case that doing so would cause one "equality" Edit to
* completely disappear, thus collecting into fewer total edits. In all other cases, shifting is rightward-only.
*
* Returns `true` if any changes were made, `false` otherwise.
*/
shiftEdits(): boolean;
}
/**
* Selects how a `Merge` engine treats conflicting inserts; this is the type of
* `Merge.opt_combine_overlapping_inserts`.
*
* NOTE(review): the per-member notes below are inferred from the member names only -- confirm against the implementation.
*/
export declare enum MergeConflictingInsertsAlgorithm {
/** Presumably: keep only the "status quo" side's insert. */
KEEP_STATUS_QUO = 0,
/** Presumably: keep the inserts from both sides. */
KEEP_BOTH = 1,
/** Presumably: combine the two inserts, including duplicated content only once. */
MERGE = 2
}
/**
* An engine that can perform two-way or three-way merge, with configuration settings.
*/
export declare class Merge<T> {
/**
* Controls whether to examine inserts that happen at exactly the same place, combining common prefix, suffix, or when a prefix
* of one matches a suffix of the other.
*
* When diffs might have already been applied, or in the case where the alphabet contains unique items, this will merge
* better. Otherwise it can incorrectly merge things that are truly separate, for example if both sides append a new
* bullet point to a section, this setting will incorrectly "merge" the two bullet points, when in fact they are logically
* completely separate in the minds of the authors. On the other hand, if one person pasted part of a change that another
* person made, this option will resolve that properly, instead of actually having two copies of the pasted part.
*
* NOTE(review): this comment was originally written for a boolean ("If true ...", "Default: False."), but the
* declared type is the `MergeConflictingInsertsAlgorithm` enum. Which enum member is the default is not visible
* in this declaration -- confirm against the implementation.
*/
opt_combine_overlapping_inserts: MergeConflictingInsertsAlgorithm;
/**
* If we are combining overlapping inserts (see other options), this is the algorithm used to merge.
*/
opt_algorithm_combined_overlapping_inserts: DiffAlgorithm;
/**
* When one side makes an insertion, but the other side deletes the region surrounding the insertion point,
* the default behavior is to silently ignore the insertion. Reason: Imagine that Alice makes a typo correction
* within a sentence, while Bob deletes the entire sentence. It makes sense for Alice's change to be ignored.
*
* However, it can be useful to preserve the insertion, either in a separate "exceptions" list, or in-line in the
* content, perhaps formatting in a special way to indicate that something exceptional happened.
*
* This function will be called at the moment that an insertion is being rejected. The output array will have been
* fully populated to this point, and it can be altered; not just append, but any alteration. Of course the caller
* can also update some arbitrary external state.
*/
opt_f_handle_deleted_insert: (output: T[], insertion: ArrayView<T>) => void;
/* NOTE(review): presumably the diff engine used internally by `takeBoth()` -- confirm. */
private take_both_diff_engine;
constructor();
/**
* Given two arrays, which are theoretically "matched up" (e.g. two insertions at the same location), create a
* new array that "merges" both, meaning taking content from both, but including duplicated content only once.
* There is no such thing as a "deletion." All changes are assumed to be content that needs to be kept.
*
* For example, ("a","b") => "ab", but ("hi there","hi you") => "hi thereyou".
*/
takeBoth(a: ArrayView<T>, b: ArrayView<T>, algorithm: DiffAlgorithm): T[];
/**
* Three-way merge between a common state and the "status quo" (which is the default "winner," when we need a tie-breaker), and
* a set of differences between the same common state and a state to "apply."
*
* Most situations are symmetrical, but not all, hence the difference in semantics.
*
* If the common state is available, it can be passed in. Otherwise, it is computed from the differences.
*/
merge3(status_quo: EditScript<T>, apply: EditScript<T>, common?: T[] | null): T[];
}
/**
* Identifies a diff algorithm. Used as the type of `Engine.opt_algorithm`, `Engine.opt_secondary_algorithm`,
* `Merge.opt_algorithm_combined_overlapping_inserts`, and the `algorithm` argument of `Merge.takeBoth()`.
*
* NOTE(review): most members appear, by name, to correspond to private `Engine` methods (`executePreprocessor`,
* `executeSimple`, `executeLongestCommonSubstring`, `executeMyers`, `executeHeckel`); no counterpart for
* `PATIENCE` is visible in this declaration -- confirm against the implementation.
*/
export declare enum DiffAlgorithm {
PREPROCESSOR_ONLY = 0,
SIMPLE = 1,
LCS = 2,
MYERS = 3,
HECKEL = 4,
PATIENCE = 5
}
/**
* A configurable difference engine: given a "previous" and a "next" sequence, `getEdits()` computes the
* `EditScript` that transforms one into the other, according to the `opt_*` settings.
*
* NOTE(review): default values of the options are not visible in this declaration.
*/
export declare class Engine<T> {
/** The algorithm to use. */
opt_algorithm: DiffAlgorithm;
/** NOTE(review): presumably an optional follow-up algorithm, with `null` meaning none -- confirm. */
opt_secondary_algorithm: DiffAlgorithm | null;
/** NOTE(review): presumably controls whether `EditScript.shiftEdits()` is applied to the result -- confirm. */
opt_shift_rightward: boolean;
/**
* Optional callback with the same signature as the one accepted by `EditScript.collapseMiddleEquality()`.
* NOTE(review): presumably passed to that method, with `null` disabling the step -- confirm.
*/
opt_f_collapse_equalities: ((middle: ArrayView<T>) => boolean) | null;
/**
* Function that returns a pair of views for two inputs; the signature matches the static
* `ArrayView.getLongestCommonSubstring*` functions.
* NOTE(review): presumably the longest-common-substring implementation used by the LCS algorithm -- confirm.
*/
opt_f_lcs: (a: ArrayView<T>, b: ArrayView<T>) => [ArrayView<T>, ArrayView<T>];
constructor();
/**
* Computes and returns the edit sequence that transforms `prev` into `next`, according to the configured options.
*/
getEdits(prev: ArrayView<T>, next: ArrayView<T>): EditScript<T>;
/**
* Computes the edit sequence that transforms `prev` into `next`, using a few fast operations, taking O(1), O(min(N,M)), or O(N) speed
* and O(1) space. In simple cases, the entire edit sequence is determined; in complex cases, the simpler aspects are removed, leaving only
* a challenge.
*
* The processor is called only if there is a remaining non-trivial difference. The arguments will be non-empty, and they will mis-match
* at both their start and end, and they will be at least 2 elements in length; it can be useful for the subsequence processor to make use
* of these facts to simplify its algorithm.
*/
private executePreprocessor;
/**
* A trivial execution processor, that just emits a single "modification" edit, sending `prev` to `next`.
*/
private executeSingleModification;
/**
* Attempts to completely evaluate the differences using only simple and fast algorithms, i.e. when worst-case is
* O(1) or O(N). If those cases are handled, the subprocessor will not be invoked; otherwise it will be invoked
* as the simpler cases were unsuccessful.
*/
private executeEasyCases;
/**
* Identifies a few straightforward cases, such as when there's an overlap at the start or end, or when the smaller is a
* subset of the longer. If no straightforward case is successful, the given process is called to continue working on the
* problem, otherwise it is not invoked.
*/
private executeSimple;
/**
* Finds the longest common substring between two sides, keeping that as an "equality" edit, and recursively
* computing the edit script for the two surrounding pieces.
*/
private executeLongestCommonSubstring;
/**
* Retains the longest common subsequence as equalities, producing minimal Edits for insert/delete.
* This can miss the longest common substring, or "interesting" common elements, but does maximize
* how much total material is kept.
*/
private executeMyers;
/**
* Runs a version of the algorithm of Heckel 1978, in which a histogram is used to locate the longest substrings
* that are not only common to both, but appear only once in both, then taking an equality there.
*
* This can create suboptimal modifications as it deals with "move" operations. If you want to preserve the concept
* of a move, then those aren't suboptimal! If you don't, they can be coalesced into something simpler.
*/
private executeHeckel;
}
/**
* An "insert" instruction against a string; one of the two variants of `StringOp`, distinguished by `op`.
*/
export interface StringInsert {
/** Discriminant tag identifying this instruction as an insertion. */
op: "insert";
/** NOTE(review): presumably the string index at which to insert -- confirm. */
idx: number;
/** NOTE(review): presumably the text to insert at `idx` -- confirm. */
txt: string;
}
/**
* A "delete" instruction against a string; one of the two variants of `StringOp`, distinguished by `op`.
*/
export interface StringDelete {
/** Discriminant tag identifying this instruction as a deletion. */
op: "delete";
/** NOTE(review): presumably the string index at which deletion starts -- confirm. */
idx: number;
/** NOTE(review): presumably the number of characters to delete starting at `idx` -- confirm. */
len: number;
}
/**
* A single string instruction, either an insertion or a deletion; narrow on the `op` field to tell them apart.
* This is the element type of the list returned by `StringEngine.getStringOffsets()`.
*/
export declare type StringOp = StringInsert | StringDelete;
/**
* An `Engine` specialized to strings, with convenience methods that take the "previous" and "next" strings
* directly and compare them at different granularities (character, token, line, prose).
*/
export declare class StringEngine extends Engine<string> {
/**
* Returns edits, with differences on a per-character basis. This is most-detailed, but often not what a human might expect to see.
*/
getEditsByCharacter(prev: string, next: string): EditScript<string>;
/**
* Returns edits, first splitting the string into tokens, treating anything not matching a token as an individual character.
*
* @param re the regular expression that defines what a token is
*/
getEditsByToken(prev: string, next: string, re: RegExp): EditScript<string>;
/**
* Returns edits, using the line as the unit of comparison.
*/
getEditsByLine(prev: string, next: string): EditScript<string>;
/**
* Returns edits, assuming configuration that is better for prose.
*/
getEditsByProse(prev: string, next: string): EditScript<string>;
/**
* Converts an `EditScript<string>` that was generated by a difference engine into a list
* of `insert` and `delete` instructions relative to the "previous" string used in the
* original diff.
*
* @param script the edit script to convert
* @param rollingUpdates if true, indexes are listed as if the underlying string is being transformed as we go, otherwise they are relative to the original string
*/
static getStringOffsets(script: EditScript<string>, rollingUpdates: boolean): StringOp[];
}
// Empty export: marks this file as an ES module. It adds no exported names of its own.
export {};