UNPKG

@fishan/myers-core-diff

Version:

A high-performance core diff engine based on Myers' algorithm, with plugin support for custom strategies (e.g., Patience, Preserve Structure).

514 lines (513 loc) 25.3 kB
/**
 * Enumerates the types of operations in a diff result.
 */
export declare enum DiffOperation {
    /** Represents a part of the sequence that is unchanged. */
    EQUAL = 0,
    /** Represents a part of the sequence that was added. */
    ADD = 1,
    /** Represents a part of the sequence that was removed. */
    REMOVE = 2
}

/**
 * Represents a single operation in the diff result.
 * It's a tuple where the first element is the operation type
 * and the second is the string content (token).
 * @example [DiffOperation.EQUAL, 'some text']
 */
export type DiffResult = [DiffOperation, string];

/**
 * Data structure for the result of the middle snake search.
 * Represents the overlapping region found by the forward and backward searches.
 * @internal
 */
interface MiddleSnake {
    /** Start X coordinate (position in oldTokens) of the snake. */
    x: number;
    /** Start Y coordinate (position in newTokens) of the snake. */
    y: number;
    /** End U coordinate (position in oldTokens) of the snake. */
    u: number;
    /** End V coordinate (position in newTokens) of the snake. */
    v: number;
}

/**
 * Configuration options for the diff algorithm.
 * Every field is optional; strategies receive the fully resolved
 * `Required<DiffOptions>` form.
 */
export interface DiffOptions {
    /** The name of the diffing strategy plugin to use. */
    diffStrategyName?: string;
    /** The minimum length of a match to be considered a valid anchor. */
    minMatchLength?: number;
    /** The threshold (N+M) for switching to a faster, less precise diff algorithm for small changes. */
    quickDiffThreshold?: number;
    /** The threshold (N+M) for using optimizations (like _guidedCalculateDiff) for very large differences. */
    hugeDiffThreshold?: number;
    /** How far ahead to look for potential matches when guiding the diff algorithm (_guidedCalculateDiff). */
    lookahead?: number;
    /** The width of the "corridor" to search within around the main diagonal (_guidedCalculateDiff). */
    corridorWidth?: number;
    /** If true, skips the initial trimming of common prefixes and suffixes. */
    skipTrimming?: boolean;
    /** (For _findAnchors) Scan step when searching for anchors. */
    jumpStep?: number;
    /** (For _findAnchors) Chunk size for hashing. */
    huntChunkSize?: number;
    /** (For _findAnchors) Minimum anchor confidence (0.0–1.0). */
    minAnchorConfidence?: number;
    /** Whether to use L1 anchors (global search). */
    useAnchors?: boolean;
    /** If true, the diff algorithm will prioritize preserving the positions of equal tokens. (Used by strategies) */
    preservePositions?: boolean;
    /** (For stable diff) Threshold for using full diff on small gaps vs. simple add/remove. */
    localgap?: number;
    /** (For stable diff) How far to search for L2 (positional) anchors. */
    localLookahead?: number;
    /** (For _findAnchors) L1 anchor search mode. */
    anchorSearchMode?: 'floating' | 'positional' | 'combo';
    /** (For 'positional' mode) Max drift for an L1 positional anchor. */
    positionalAnchorMaxDrift?: number;
}

/**
 * Defines the interface (contract) for a diff strategy plugin.
 * A plugin receives the diff engine instance to access its "Toolbox" of algorithms.
 *
 * @param engine The engine instance for accessing the Toolbox.
 * @param oldTokens The tokenized 'old' sequence.
 * @param oldStart The start index for diffing in oldTokens.
 * @param oldEnd The end index (exclusive) for diffing in oldTokens.
 * @param newTokens The tokenized 'new' sequence.
 * @param newStart The start index for diffing in newTokens.
 * @param newEnd The end index (exclusive) for diffing in newTokens.
 * @param idToString A map to convert token IDs back to strings.
 * @param config The fully resolved diff configuration.
 * @param debug A flag to enable verbose logging.
 * @returns An array of DiffResult tuples.
 */
export type DiffStrategyPlugin = (
    /* The engine instance for accessing the Toolbox. (Block comment on purpose:
       a `//` comment here hides the remaining parameters if lines are joined.) */
    engine: MyersCoreDiff,
    oldTokens: Uint32Array,
    oldStart: number,
    oldEnd: number,
    newTokens: Uint32Array,
    newStart: number,
    newEnd: number,
    idToString: string[],
    config: Required<DiffOptions>,
    debug: boolean
) => DiffResult[];

/**
 * Represents an anchor, which is a significant, identical block of tokens
 * between the old and new sequences. Anchors guide the diffing process.
 * @internal
 */
export interface Anchor {
    /** The starting position in the 'old' sequence. */
    oldPos: number;
    /** The starting position in the 'new' sequence. */
    newPos: number;
    /** The length of the matching block. */
    length: number;
    /** The absolute positional difference (Math.abs(newPos - oldPos)). */
    driftDistance: number;
    /** The drift distance relative to the anchor length. */
    driftRatio: number;
    /** A confidence score (0.0 - 1.0) for this anchor. */
    confidence: number;
}

/**
 * Represents a "gap" between two anchors, which needs to be diffed.
 * @internal
 */
interface GapInfo {
    /** The start index of the gap in the 'old' sequence. */
    oldStart: number;
    /** The end index (exclusive) of the gap in the 'old' sequence. */
    oldEnd: number;
    /** The start index of the gap in the 'new' sequence. */
    newStart: number;
    /** The end index (exclusive) of the gap in the 'new' sequence. */
    newEnd: number;
}

/**
 * An advanced, high-performance implementation of the Myers diff algorithm.
 *
 * [v6.0] This class is implemented as an "Engine" (Toolbox) and a "Dispatcher".
 * It provides a "Toolbox" of core diffing algorithms (e.g., _findAnchors,
 * _recursiveDiff) and a "Registry" for "Strategy Plugins".
 *
 * The `diff()` method is a "Dispatcher" that performs tokenization and trimming,
 * then delegates the core diffing logic to the selected "Strategy Plugin"
 * (e.g., 'commonSES' or an external 'preserveStructure' plugin).
 *
 * ### Key Features & Techniques
 *
 * - **Token-Based Approach**: (Core) Converts string tokens to integer IDs
 *   for blazing-fast comparisons.
 *
 * - **Prefix/Suffix Trimming**: (Core) Strips common prefixes and suffixes
 *   before diffing.
 *
 * - **Strategy Registry (Plugins)**: Allows external code to register new
 *   diffing strategies (e.g., `registerStrategy('preserveStructure', ...)`).
 *   This makes the engine highly extensible for specialized tasks (like
 *   genetic analysis) without modifying the core.
 *
 * - **Toolbox of Algorithms**: Provides all core algorithms as public methods
 *   (e.g., `_findAnchors`, `_recursiveDiff`, `_guidedCalculateDiff`) for use
 *   by external strategy plugins.
 *
 * ### Default Strategy: 'commonSES'
 *
 * The default built-in strategy, 'commonSES', implements the logic
 * optimized for finding the Shortest Edit Script (SES):
 *
 * - **Anchor-Based Guided Diff**: Uses `_findAnchors` (L1) to find
 *   global floating anchors.
 * - **Recursive Myers**: Uses `_recursiveDiff` (with "middle snake")
 *   to process the "gaps" between anchors, falling back to
 *   `_guidedCalculateDiff` for very large gaps.
 *
 * @example
 * ```typescript
 * // `diff()` takes arrays of string tokens, not raw text.
 * const differ = new MyersCoreDiff();
 *
 * // 1. Using the default 'commonSES' strategy
 * const sesResult = differ.diff(oldTokens, newTokens);
 *
 * // 2. Using a custom (externally registered) strategy
 * //    (Assuming 'preserveStructure' was registered)
 * const options = { diffStrategyName: 'preserveStructure' };
 * const structuredResult = differ.diff(oldTokens, newTokens, false, options);
 * ```
 */
export declare class MyersCoreDiff {
    static __DEV__: boolean;
    private static strategyRegistry;
    private static isDefaultRegistered;
    static readonly defaultOptions: Required<DiffOptions>;

    /**
     * Ensures that the default 'commonSES' strategy is registered.
     * This method is idempotent and will only register the strategy once,
     * using the provided instance to correctly bind 'this' for the method.
     *
     * @param instance - The MyersCoreDiff instance to which the strategy function will be bound.
     * @private
     * @static
     */
    private static ensureDefaultStrategyRegistered;

    /**
     * Registers a new diffing strategy plugin with the Core Engine.
     * @param name The name of the strategy (e.g., 'preserveStructure').
     * @param strategyFn The function implementing the DiffStrategyPlugin interface.
     * @public
     * @static
     */
    static registerStrategy(name: string, strategyFn: DiffStrategyPlugin): void;

    /**
     * Initializes the Core Engine and registers built-in strategies.
     * @public
     */
    constructor();

    /**
     * Computes the difference using the "Dispatcher" logic.
     *
     * This method performs setup (tokenization, trimming) and then delegates
     * the core diffing logic to the selected "Strategy Plugin" from the
     * registry (based on `options.diffStrategyName`).
     *
     * @param oldTokens - The original array of strings.
     * @param newTokens - The new array of strings.
     * @param debug - (Internal) Enables verbose logging for debugging purposes.
     * @param options - Optional configuration, including `diffStrategyName`.
     * @returns An array of DiffResult tuples representing the edit script.
     * @public
     */
    diff(oldTokens: string[], newTokens: string[], debug?: boolean, options?: DiffOptions): DiffResult[];

    /**
     * Built-in plugin strategy "commonSES".
     * Implements the classic cdiff logic optimized for SES,
     * but *retains* the ability to use _calculateStableDiff if
     * config.preservePositions is true.
     * @param engine - The engine instance (unused, `this` is used).
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @private
     */
    private _strategycommonSES;

    /**
     * [TOOLBOX] Finds anchors (significant matching blocks) between old and new token sequences.
     * These anchors help guide the diffing process by identifying stable regions.
     *
     * @param oldTokens - The original array of token IDs.
     * @param oldStart - The starting index in the oldTokens array.
     * @param oldEnd - The ending index (exclusive) in the oldTokens array.
     * @param newTokens - The new array of token IDs.
     * @param newStart - The starting index in the newTokens array.
     * @param newEnd - The ending index (exclusive) in the newTokens array.
     * @param config - The diff options configuration.
     * @param debug - Enables verbose logging for debugging purposes.
     * @returns An array of Anchor objects representing the found anchors.
     * @public
     */
    _findAnchors(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, config: Required<DiffOptions>, debug: boolean): Anchor[];

    /**
     * [TOOLBOX] Merges anchors, filters conflicts, and sorts them
     * to produce a final, monotonic chain (Longest Common Subsequence of anchors).
     *
     * @param anchors - The raw array of anchors found by `_findAnchors`.
     * @param config - The diff options configuration.
     * @param debug - Enables verbose logging for debugging purposes.
     * @returns A sorted and filtered array of Anchors forming a valid chain.
     * @public
     */
    _mergeAndFilterAnchors(anchors: Anchor[], config: Required<DiffOptions>, debug: boolean): Anchor[];

    /**
     * [TOOLBOX] Processes the diff by iterating through the anchor chain
     * and calling `_processGap` for regions between them.
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param anchors - The sorted and filtered chain of anchors.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @param depth - Recursion depth, for debugging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    _processWithAnchors(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, anchors: Anchor[], idToString: string[], config: Required<DiffOptions>, debug: boolean, depth?: number): DiffResult[];

    /**
     * [TOOLBOX] A dispatcher that chooses the appropriate diffing strategy
     * for a gap, optimized for 'commonSES' (SES).
     *
     * @param gap - The GapInfo object defining the region to diff.
     * @param oldTokens - The tokenized 'old' sequence.
     * @param newTokens - The tokenized 'new' sequence.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    _processGap(gap: GapInfo, oldTokens: Uint32Array, newTokens: Uint32Array, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];

    /**
     * [TOOLBOX] The core recursive implementation of the Myers diff algorithm
     * with the "middle snake" optimization (SES).
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    _recursiveDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];

    /**
     * Private buffer fields; their types and usage are not visible in this declaration.
     * NOTE(review): presumably reusable scratch buffers for the forward and backward
     * passes of `_findMiddleSnake` — confirm against the implementation.
     */
    private forwardBuffer;
    private backwardBuffer;

    /**
     * Validates that the input ranges (start/end indices) are sane
     * and within the bounds of the token arrays.
     *
     * @param oldTokens - The 'old' token array.
     * @param oldStart - The start index for the 'old' range.
     * @param oldEnd - The end index (exclusive) for the 'old' range.
     * @param newTokens - The 'new' token array.
     * @param newStart - The start index for the 'new' range.
     * @param newEnd - The end index (exclusive) for the 'new' range.
     * @returns `true` if the ranges are valid, `false` otherwise.
     * @private
     */
    private _validateInputs;

    /**
     * [TOOLBOX] Finds the "middle snake" for linear-memory Myers.
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param debug - A flag to enable verbose logging.
     * @returns A MiddleSnake object, or undefined if no overlap is found.
     * @public
     */
    _findMiddleSnake(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, debug: boolean): MiddleSnake | undefined;

    /**
     * [TOOLBOX] A fast, heuristic-based diff algorithm ("corridor diff").
     * Does not guarantee SES, but stays close to the diagonal.
     * Used as a fallback for very large or complex gaps.
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    _guidedCalculateDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];

    /**
     * [TOOLBOX] The basic (O(ND)) Myers diff algorithm.
     * Finds the SES. Used for small gaps where recursion is overhead.
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration (optional).
     * @param debug - A flag to enable verbose logging (optional).
     * @returns An array of DiffResult tuples.
     * @public
     */
    calculateDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config?: Required<DiffOptions>, debug?: boolean): DiffResult[];

    /**
     * [TOOLBOX] (Legacy) A stable diff algorithm that prioritizes
     * finding positional anchors (L2 anchors).
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    _calculateStableDiff(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];

    /**
     * [TOOLBOX] Finds the next nearby positional anchor (L2 anchor).
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param lookahead - How far to search for a positional match.
     * @param debug - A flag to enable verbose logging.
     * @returns A simple object { oldPos, newPos } or null if no anchor is found.
     * @public
     */
    _findNextLocalAnchor(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, lookahead: number, debug: boolean): {
        oldPos: number;
        newPos: number;
    } | null;

    /**
     * [TOOLBOX] (Legacy) Processes a gap for `_calculateStableDiff`.
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param config - The fully resolved diff configuration.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    _processLocalGap(oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], config: Required<DiffOptions>, debug: boolean): DiffResult[];

    /**
     * Efficiently finds and separates common prefixes and suffixes from two token arrays.
     * This preprocessing step reduces the problem size for the main diff algorithm.
     *
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param debug - A flag to enable verbose logging.
     * @returns An object containing the prefix/suffix arrays and new trimmed indices.
     * @private
     */
    private _trimCommonPrefixSuffix;

    /**
     * Converts arrays of string tokens into numerical IDs to speed up comparisons.
     * This is a critical performance optimization, as integer comparisons are much
     * faster than string comparisons.
     *
     * @param oldTokens - Array of 'old' string tokens.
     * @param newTokens - Array of 'new' string tokens.
     * @param debug - A flag to enable verbose logging.
     * @returns An object containing hashed arrays and the ID-to-string map.
     * @private
     */
    private _tokenize;

    /**
     * Helper method to determine if a token is rare within a given range.
     * This is used as a heuristic in the guided diff algorithm.
     *
     * @param token - The token ID to check.
     * @param tokens - The array to search within.
     * @param startPos - The start index of the range.
     * @param endPos - The end index (exclusive) of the range.
     * @param maxOccurrences - The threshold to be considered "rare".
     * @param debug - A flag to enable verbose logging.
     * @returns True if the token count is <= maxOccurrences, false otherwise.
     * @private
     */
    private _isTokenRare;

    /**
     * [TOOLBOX] Helper function to create an array of ADD operations.
     *
     * @param tokens - The token array to read from.
     * @param start - The start index.
     * @param end - The end index (exclusive).
     * @param idToString - A map to convert token IDs back to strings.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of ADD DiffResult tuples.
     * @public
     */
    _createAdditions(tokens: Uint32Array, start: number, end: number, idToString: string[], debug?: boolean): DiffResult[];

    /**
     * [TOOLBOX] Helper function to create an array of REMOVE operations.
     *
     * @param tokens - The token array to read from.
     * @param start - The start index.
     * @param end - The end index (exclusive).
     * @param idToString - A map to convert token IDs back to strings.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of REMOVE DiffResult tuples.
     * @public
     */
    _createDeletions(tokens: Uint32Array, start: number, end: number, idToString: string[], debug?: boolean): DiffResult[];

    /**
     * [TOOLBOX] Reconstructs the diff from the trace generated by `calculateDiff`.
     *
     * @param trace - The array of O(ND) trace buffers.
     * @param oldTokens - The tokenized 'old' sequence.
     * @param oldStart - The start index for diffing in oldTokens.
     * @param oldEnd - The end index (exclusive) for diffing in oldTokens.
     * @param newTokens - The tokenized 'new' sequence.
     * @param newStart - The start index for diffing in newTokens.
     * @param newEnd - The end index (exclusive) for diffing in newTokens.
     * @param idToString - A map to convert token IDs back to strings.
     * @param debug - A flag to enable verbose logging.
     * @returns An array of DiffResult tuples.
     * @public
     */
    buildValues(trace: Int32Array[], oldTokens: Uint32Array, oldStart: number, oldEnd: number, newTokens: Uint32Array, newStart: number, newEnd: number, idToString: string[], debug?: boolean): DiffResult[];
}

export {};