/*
* Package: @fishan/myers-core-diff
*
* A high-performance core diff engine based on Myers' algorithm, with plugin
* support for custom strategies (e.g., Patience, Preserve Structure).
*/
/**
* Enumerates the types of operations in a diff result.
*
* Each member has an explicit numeric value (0, 1, 2); the value appears as the
* first element of every `DiffResult` tuple.
*/
export declare enum DiffOperation {
/** Represents a part of the sequence that is unchanged (present in both old and new). */
EQUAL = 0,
/** Represents a part of the sequence that was added (present only in new). */
ADD = 1,
/** Represents a part of the sequence that was removed (present only in old). */
REMOVE = 2
}
/**
* Represents a single operation in the diff result.
*
* It is a two-element tuple: the first element is the operation type
* (`DiffOperation`) and the second is the string content (token) the
* operation applies to.
*
* @example [DiffOperation.EQUAL, 'some text']
*/
export type DiffResult = [DiffOperation, string];
/**
* Data structure for the result of the middle snake search.
* Represents the overlapping region found by the forward and backward searches.
*
* This interface is not exported from the module, although it is the return
* type of the public `MyersCoreDiff._findMiddleSnake` method.
*
* @internal
*/
interface MiddleSnake {
/** Start X coordinate (position in oldTokens) of the snake. */
x: number;
/** Start Y coordinate (position in newTokens) of the snake. */
y: number;
/** End U coordinate (position in oldTokens) of the snake. */
u: number;
/** End V coordinate (position in newTokens) of the snake. */
v: number;
}
/**
* Configuration options for the diff algorithm.
*
* Every field is optional. Strategy plugins and Toolbox methods receive a
* `Required<DiffOptions>`, i.e. a fully resolved configuration.
* NOTE(review): missing fields are presumably filled in from
* `MyersCoreDiff.defaultOptions`; the concrete default values are not visible
* in this declaration — confirm against the implementation.
*/
export interface DiffOptions {
/** The name of the diffing strategy plugin to use (as passed to `MyersCoreDiff.registerStrategy`). */
diffStrategyName?: string;
/** The minimum length of a match to be considered a valid anchor. */
minMatchLength?: number;
/** The threshold (N+M) for switching to a faster, less precise diff algorithm for small changes. */
quickDiffThreshold?: number;
/** The threshold (N+M) for using optimizations (like _guidedCalculateDiff) for very large differences. */
hugeDiffThreshold?: number;
/** How far ahead to look for potential matches when guiding the diff algorithm (_guidedCalculateDiff). */
lookahead?: number;
/** The width of the "corridor" to search within around the main diagonal (_guidedCalculateDiff). */
corridorWidth?: number;
/** If true, skips the initial trimming of common prefixes and suffixes. */
skipTrimming?: boolean;
/** (For _findAnchors) Scan step when searching for anchors. */
jumpStep?: number;
/** (For _findAnchors) Chunk size for hashing. */
huntChunkSize?: number;
/** (For _findAnchors) Minimum anchor confidence (0.0–1.0). */
minAnchorConfidence?: number;
/** Whether to use L1 anchors (global search). */
useAnchors?: boolean;
/** If true, the diff algorithm will prioritize preserving the positions of equal tokens. (Used by strategies) */
preservePositions?: boolean;
/** (For stable diff) Threshold for using full diff on small gaps vs. simple add/remove. */
localgap?: number;
/** (For stable diff) How far to search for L2 (positional) anchors. */
localLookahead?: number;
/** (For _findAnchors) L1 anchor search mode: one of 'floating', 'positional' or 'combo'. */
anchorSearchMode?: 'floating' | 'positional' | 'combo';
/** (For 'positional' mode) Max drift for an L1 positional anchor. */
positionalAnchorMaxDrift?: number;
}
/**
* Defines the interface (contract) for a diff strategy plugin.
* A plugin receives the diff engine instance to access its "Toolbox" of algorithms.
*
* A function of this type is what `MyersCoreDiff.registerStrategy` accepts.
* It operates on token IDs (`Uint32Array`), not on the original strings, and
* uses `idToString` to turn IDs back into strings for the returned tuples.
*
* @param engine The engine instance for accessing the Toolbox.
* @param oldTokens The tokenized 'old' sequence.
* @param oldStart The start index for diffing in oldTokens.
* @param oldEnd The end index (exclusive) for diffing in oldTokens.
* @param newTokens The tokenized 'new' sequence.
* @param newStart The start index for diffing in newTokens.
* @param newEnd The end index (exclusive) for diffing in newTokens.
* @param idToString A map to convert token IDs back to strings.
* @param config The fully resolved diff configuration (every option present).
* @param debug A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
*/
export type DiffStrategyPlugin = (engine: MyersCoreDiff, // The engine instance for accessing the Toolbox
oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean) => DiffResult[];
/**
* Represents an anchor, which is a significant, identical block of tokens
* between the old and new sequences. Anchors guide the diffing process.
*
* Anchors are produced by `MyersCoreDiff._findAnchors`, filtered by
* `_mergeAndFilterAnchors`, and consumed by `_processWithAnchors`.
*
* @internal
*/
export interface Anchor {
/** The starting position in the 'old' sequence. */
oldPos: number;
/** The starting position in the 'new' sequence. */
newPos: number;
/** The length of the matching block. */
length: number;
/** The absolute positional difference (Math.abs(newPos - oldPos)). */
driftDistance: number;
/** The drift distance relative to the anchor length. */
driftRatio: number;
/** A confidence score (0.0 - 1.0) for this anchor. */
confidence: number;
}
/**
* Represents a "gap" between two anchors, which needs to be diffed.
*
* Both ranges are half-open: the start index is inclusive and the end index is
* exclusive. This interface is not exported from the module, although it is
* the type of the first parameter of the public `MyersCoreDiff._processGap`.
*
* @internal
*/
interface GapInfo {
/** The start index of the gap in the 'old' sequence. */
oldStart: number;
/** The end index (exclusive) of the gap in the 'old' sequence. */
oldEnd: number;
/** The start index of the gap in the 'new' sequence. */
newStart: number;
/** The end index (exclusive) of the gap in the 'new' sequence. */
newEnd: number;
}
/**
* An advanced, high-performance implementation of the Myers diff algorithm.
*
* [v6.0] This class is implemented as an "Engine" (Toolbox) and a "Dispatcher".
* It provides a "Toolbox" of core diffing algorithms (e.g., _findAnchors,
* _recursiveDiff) and a "Registry" for "Strategy Plugins".
*
* The `diff()` method is a "Dispatcher" that performs tokenization and trimming,
* then delegates the core diffing logic to the selected "Strategy Plugin"
* (e.g., 'commonSES' or an external 'preserveStructure' plugin).
*
* ### Key Features & Techniques
*
* - **Token-Based Approach**: (Core) Converts string tokens to integer IDs
* for blazing-fast comparisons.
*
* - **Prefix/Suffix Trimming**: (Core) Strips common prefixes and suffixes
* before diffing.
*
* - **Strategy Registry (Plugins)**: Allows external code to register new
* diffing strategies (e.g., `registerStrategy('preserveStructure', ...)`).
* This makes the engine highly extensible for specialized tasks (like
* genetic analysis) without modifying the core.
*
* - **Toolbox of Algorithms**: Provides all core algorithms as public methods
* (e.g., `_findAnchors`, `_recursiveDiff`, `_guidedCalculateDiff`) for use
* by external strategy plugins. The leading underscore on these methods is a
* naming convention only; they are declared public.
*
* ### Default Strategy: 'commonSES'
*
* The default built-in strategy, 'commonSES', implements the logic
* optimized for finding the Shortest Edit Script (SES):
*
* - **Anchor-Based Guided Diff**: Uses `_findAnchors` (L1) to find
* global floating anchors.
* - **Recursive Myers**: Uses `_recursiveDiff` (with "middle snake")
* to process the "gaps" between anchors, falling back to
* `_guidedCalculateDiff` for very large gaps.
*
* @example
* ```typescript
* // `oldTokens` and `newTokens` are arrays of string tokens (string[]).
*
* // 1. Using the default 'commonSES' strategy
* const differ = new MyersCoreDiff();
* const result = differ.diff(oldTokens, newTokens);
*
* // 2. Using a custom (externally registered) strategy
* // (Assuming 'preserveStructure' was registered)
* const options = { diffStrategyName: 'preserveStructure' };
* const customResult = differ.diff(oldTokens, newTokens, false, options);
* ```
*/
export declare class MyersCoreDiff {
/**
* Development-mode flag.
* NOTE(review): its effect is not visible in this declaration — presumably it
* gates development-only checks or logging; confirm against the implementation.
*/
static __DEV__: boolean;
/**
* Internal registry of strategy plugins.
* NOTE(review): the container type is not visible in this declaration —
* presumably keyed by the name passed to `registerStrategy`; confirm.
*/
private static strategyRegistry;
/**
* Internal flag.
* NOTE(review): presumably records whether `ensureDefaultStrategyRegistered`
* has already registered the default strategy; confirm.
*/
private static isDefaultRegistered;
/**
* A complete option set in which every `DiffOptions` field is present.
* NOTE(review): presumably the defaults merged with caller-supplied options in
* `diff()`; the concrete values are not visible in this declaration.
*/
static readonly defaultOptions: Required<DiffOptions>;
/**
* Ensures that the default 'commonSES' strategy is registered.
* This method is idempotent and will only register the strategy once,
* using the provided instance to correctly bind 'this' for the method.
*
* @param instance - The MyersCoreDiff instance to which the strategy function will be bound.
* @private
* @static
*/
private static ensureDefaultStrategyRegistered;
/**
* Registers a new diffing strategy plugin with the Core Engine.
* The strategy is then selectable through `DiffOptions.diffStrategyName`.
*
* @param name The name of the strategy (e.g., 'preserveStructure').
* @param strategyFn The function implementing the DiffStrategyPlugin interface.
* @public
* @static
*/
static registerStrategy(name: string, strategyFn: DiffStrategyPlugin): void;
/**
* Initializes the Core Engine and registers built-in strategies.
* @public
*/
constructor();
/**
* Computes the difference using the "Dispatcher" logic.
*
* This method performs setup (tokenization, trimming) and then delegates
* the core diffing logic to the selected "Strategy Plugin" from the
* registry (based on `options.diffStrategyName`).
*
* @param oldTokens - The original array of strings.
* @param newTokens - The new array of strings.
* @param debug - (Internal, optional) Enables verbose logging for debugging purposes.
* @param options - Optional configuration, including `diffStrategyName`.
* @returns An array of DiffResult tuples representing the edit script.
* @public
*/
diff(oldTokens: string[], newTokens: string[], debug?: boolean, options?: DiffOptions): DiffResult[];
/**
* Built-in plugin strategy "commonSES".
* Implements the classic cdiff logic optimized for SES,
* but *retains* the ability to use _calculateStableDiff if
* config.preservePositions is true.
* @param engine - The engine instance (unused, `this` is used).
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @private
*/
private _strategycommonSES;
/**
* [TOOLBOX] Finds anchors (significant matching blocks) between old and new token sequences.
* These anchors help guide the diffing process by identifying stable regions.
*
* @param oldTokens - The original array of token IDs.
* @param oldStart - The starting index in the oldTokens array.
* @param oldEnd - The ending index (exclusive) in the oldTokens array.
* @param newTokens - The new array of token IDs.
* @param newStart - The starting index in the newTokens array.
* @param newEnd - The ending index (exclusive) in the newTokens array.
* @param config - The diff options configuration.
* @param debug - Enables verbose logging for debugging purposes.
* @returns An array of Anchor objects representing the found anchors.
* @public
*/
_findAnchors(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, config: Required<DiffOptions>, debug: boolean): Anchor[];
/**
* [TOOLBOX] Merges anchors, filters conflicts, and sorts them
* to produce a final, monotonic chain (Longest Common Subsequence of anchors).
*
* @param anchors - The raw array of anchors found by `_findAnchors`.
* @param config - The diff options configuration.
* @param debug - Enables verbose logging for debugging purposes.
* @returns A sorted and filtered array of Anchors forming a valid chain.
* @public
*/
_mergeAndFilterAnchors(anchors: Anchor[], config: Required<DiffOptions>, debug: boolean): Anchor[];
/**
* [TOOLBOX] Processes the diff by iterating through the anchor chain
* and calling `_processGap` for regions between them.
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param anchors - The sorted and filtered chain of anchors.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @param depth - (Optional) Recursion depth, for debugging.
* @returns An array of DiffResult tuples.
* @public
*/
_processWithAnchors(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, anchors: Anchor[], idToString: string[], config: Required<DiffOptions>, debug: boolean, depth?: number): DiffResult[];
/**
* [TOOLBOX] A dispatcher that chooses the appropriate diffing strategy
* for a gap, optimized for 'commonSES' (SES).
*
* @param gap - The GapInfo object defining the region to diff.
* @param oldTokens - The tokenized 'old' sequence.
* @param newTokens - The tokenized 'new' sequence.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
_processGap(gap: GapInfo, oldTokens: Uint32Array, newTokens: Uint32Array, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];
/**
* [TOOLBOX] The core recursive implementation of the Myers diff algorithm
* with the "middle snake" optimization (SES).
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
_recursiveDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];
/**
* Internal buffer.
* NOTE(review): the name suggests it backs the forward search of
* `_findMiddleSnake`; its type and lifetime are not visible in this
* declaration — confirm against the implementation.
*/
private forwardBuffer;
/**
* Internal buffer.
* NOTE(review): the name suggests it backs the backward search of
* `_findMiddleSnake`; its type and lifetime are not visible in this
* declaration — confirm against the implementation.
*/
private backwardBuffer;
/**
* Validates that the input ranges (start/end indices) are sane
* and within the bounds of the token arrays.
*
* @param oldTokens - The 'old' token array.
* @param oldStart - The start index for the 'old' range.
* @param oldEnd - The end index (exclusive) for the 'old' range.
* @param newTokens - The 'new' token array.
* @param newStart - The start index for the 'new' range.
* @param newEnd - The end index (exclusive) for the 'new' range.
* @returns `true` if the ranges are valid, `false` otherwise.
* @private
*/
private _validateInputs;
/**
* [TOOLBOX] Finds the "middle snake" for linear-memory Myers.
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param debug - A flag to enable verbose logging.
* @returns A MiddleSnake object, or undefined if no overlap is found.
* @public
*/
_findMiddleSnake(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, debug: boolean): MiddleSnake | undefined;
/**
* [TOOLBOX] A fast, heuristic-based diff algorithm ("corridor diff").
* Does not guarantee SES, but stays close to the diagonal.
* Used as a fallback for very large or complex gaps.
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
_guidedCalculateDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];
/**
* [TOOLBOX] The basic (O(ND)) Myers diff algorithm.
* Finds the SES. Used for small gaps where recursion is overhead.
*
* Unlike most other Toolbox methods, `config` and `debug` are optional here.
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param config - (Optional) The fully resolved diff configuration.
* @param debug - (Optional) A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
calculateDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config?: Required<DiffOptions>, debug?: boolean): DiffResult[];
/**
* [TOOLBOX] (Legacy) A stable diff algorithm that prioritizes
* finding positional anchors (L2 anchors).
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
_calculateStableDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];
/**
* [TOOLBOX] Finds the next nearby positional anchor (L2 anchor).
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param lookahead - How far to search for a positional match.
* @param debug - A flag to enable verbose logging.
* @returns A simple object { oldPos, newPos } or null if no anchor is found.
* @public
*/
_findNextLocalAnchor(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, lookahead: number, debug: boolean): {
oldPos: number;
newPos: number;
} | null;
/**
* [TOOLBOX] (Legacy) Processes a gap for `_calculateStableDiff`.
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param config - The fully resolved diff configuration.
* @param debug - A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
_processLocalGap(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];
/**
* Efficiently finds and separates common prefixes and suffixes from two token arrays.
* This preprocessing step reduces the problem size for the main diff algorithm.
*
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param debug - A flag to enable verbose logging.
* @returns An object containing the prefix/suffix arrays and new trimmed indices.
* @private
*/
private _trimCommonPrefixSuffix;
/**
* Converts arrays of string tokens into numerical IDs to speed up comparisons.
* This is a critical performance optimization, as integer comparisons are much
* faster than string comparisons.
*
* @param oldTokens - Array of 'old' string tokens.
* @param newTokens - Array of 'new' string tokens.
* @param debug - A flag to enable verbose logging.
* @returns An object containing hashed arrays and the ID-to-string map.
* @private
*/
private _tokenize;
/**
* Helper method to determine if a token is rare within a given range.
* This is used as a heuristic in the guided diff algorithm.
*
* @param token - The token ID to check.
* @param tokens - The array to search within.
* @param startPos - The start index of the range.
* @param endPos - The end index (exclusive) of the range.
* @param maxOccurrences - The threshold to be considered "rare".
* @param debug - A flag to enable verbose logging.
* @returns True if the token count is <= maxOccurrences, false otherwise.
* @private
*/
private _isTokenRare;
/**
* [TOOLBOX] Helper function to create an array of ADD operations.
*
* @param tokens - The token array to read from.
* @param start - The start index.
* @param end - The end index (exclusive).
* @param idToString - A map to convert token IDs back to strings.
* @param debug - (Optional) A flag to enable verbose logging.
* @returns An array of ADD DiffResult tuples.
* @public
*/
_createAdditions(tokens: Uint32Array, start: number, end: number, idToString: string[], debug?: boolean): DiffResult[];
/**
* [TOOLBOX] Helper function to create an array of REMOVE operations.
*
* @param tokens - The token array to read from.
* @param start - The start index.
* @param end - The end index (exclusive).
* @param idToString - A map to convert token IDs back to strings.
* @param debug - (Optional) A flag to enable verbose logging.
* @returns An array of REMOVE DiffResult tuples.
* @public
*/
_createDeletions(tokens: Uint32Array, start: number, end: number, idToString: string[], debug?: boolean): DiffResult[];
/**
* [TOOLBOX] Reconstructs the diff from the trace generated by `calculateDiff`.
*
* @param trace - The array of O(ND) trace buffers.
* @param oldTokens - The tokenized 'old' sequence.
* @param oldStart - The start index for diffing in oldTokens.
* @param oldEnd - The end index (exclusive) for diffing in oldTokens.
* @param newTokens - The tokenized 'new' sequence.
* @param newStart - The start index for diffing in newTokens.
* @param newEnd - The end index (exclusive) for diffing in newTokens.
* @param idToString - A map to convert token IDs back to strings.
* @param debug - (Optional) A flag to enable verbose logging.
* @returns An array of DiffResult tuples.
* @public
*/
buildValues(trace: Int32Array[], oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], debug?: boolean): DiffResult[];
}
export {};