UNPKG

@asmartbear/diff-merge

Version:

Text and arbitrary-array diff and merge, fast, multiple algorithms

github.com/asmartbear/diff-merge

asmartbear/diff-merge

1,116 lines • 96.3 kB

JavaScript

'use strict'; Object.defineProperty(exports, "__esModule", { value: true }); // A dependency-free, type-safe difference algorithm. // Operates on arrays of values, which just need to support the concept of equality (like strings). // Generally tries to find minimal edits, but trades off for speed especially with longer sequences. // Terminology: // "prev" -- the previous array; the "before" // "next" -- the next array; the "after" // true if we're running in various environments const is_node = typeof process !== "undefined"; const is_chrome = !is_node && typeof window !== "undefined" && "chrome" in window; // const is_safari: boolean = !is_node && typeof window !== "undefined" && "safari" in window; // true if we're running the V8 Javascript Engine const is_v8 = is_node || is_chrome; // when to use various native functions versus our own functions, depending on speeds of various environments const should_use_native_indexof = is_v8; // const supports_backwards_match_regex: boolean = !is_safari; /** * Like Math.max(), but executes faster in some Javascript browsers and contexts. */ function imax(a, b) { return a > b ? a : b; } /** * Like Math.min(), but executes faster in some Javascript browsers and contexts. */ function imin(a, b) { return a < b ? a : b; } /** * Returns the number of characters that both strings have in common, or 0 if none or at least one is null or undefined. * * Contains a few optimizations (that benchmarks indicate are, in fact, useful), and is Unicode-safe. */ function getCommonPrefixLength(a, b) { if (!a || !b || a.charAt(0) !== b.charAt(0)) return 0; // trivial, quick cases if (a.charAt(1) !== b.charAt(1)) return 1; // since both strings are non-empty, this is valid, and quick in the common case that there's nothing else let max = a.length < b.length ? a.length : b.length; for (let k = 2; k < max; ++k) { if (a.charAt(k) !== b.charAt(k)) { return k; } } return max; } exports.getCommonPrefixLength = getCommonPrefixLength; /** * Holds an array with offset and length, so we can create a pseudo-array "view" into an array without * making large-array copies or allocations. The "end" is inclusive! */ class ArrayView { constructor(a, start = 0, end = a.length - 1) { this.a = a; this.start = start; this.end = end; // nothing else to do } static createEmpty() { return new ArrayView([], 0, -1); // FIXME: share single empty array object since it's immutable? } /** * The number of elements in this view. */ get length() { return this.end - this.start + 1; } /** * True if this view is empty, i.e. represents the empty array, and has length 0. */ get empty() { return this.end < this.start; } /** * Like Array.indexOf(), but includes start (relative to the actual start of the underlying array) and end (inclusive * and relative to the actual underlying array). */ indexOf(target, start = this.start, end = this.end) { // In most browsers, Array#indexOf() is very slow. Therefore, we scan it ourselves. // Plus Array#indexOf() doesn't understand the end of the array, which we could correct for, but it's slower anyway. // Benchmark: https://run.perf.zone/view/for-vs-while-vs-indexof-1000-strings-1516293164213 // The native indexOf() can be faster, even though it can do "extra work" by running off the end of the array. if (should_use_native_indexof) { const result = this.a.indexOf(target, start); return result > end ? -1 : result; } // Manual scan. const a = this.a; for (let k = start; k <= end; ++k) { if (a[k] === target) { return k; } } return -1; } /** * Same behavior as String#indexOf: Find another ArrayView as a consecutive substring on this array, returning the * (absolute) index of the first such position, or -1 if not found. */ indexOfSubstring(target, start = this.start, end = this.end) { // Trivial cases if (target.empty) return 0; if (this.empty) return -1; // The following implementation is fine if the underlying array is short, but if it's long, it would be worth computing // a table for something like KMP or Boyer-Moore searching. Be sure to add lots more unit tests before doing that though! const target_first_element = target.a[target.start]; // cache this one, since we're scanning for it first const target_length = target.length; if (target_length === 1) return this.indexOf(target_first_element, start, end); // a faster algorithm in the case of finding a single element const a = this.a; end = imin(end, this.end - target_length + 1); // don't run off the end of our array looking for the target for (let k = start; k <= end; ++k) { // if the target is longer than us, this loop will end trivially, so we don't need to check that case separately if (a[k] === target_first_element) { let matched = true; for (let len = 1; len < target_length; ++len) { if (a[k + len] !== target.a[target.start + len]) { matched = false; break; } } if (matched) { return k; } } } return -1; } /** * Creates a copy of this subsection of the array, which is therefore modifable, and allocates memory. */ getCopy() { /** * Creates shallow copy of an array, using whatever is fastest in the current environment. * Ref: https://jsperf.com/new-array-vs-splice-vs-slice/31 * * In Safari desktop it's fastest to while-loop; slice is 11% slower. * In Chrome desktop it's fastest to slice(). * In Firefox desktop it's an order of magnitude faster to slice(). */ return this.a.slice(this.start, this.end + 1); } /** * Like Array.forEach(), but on this view of the array, and more restrictions on the callback function. */ forEach(f) { const a = this.a; const end = this.end; for (let k = this.start; k <= end; ++k) { // because of starting and ending f(a[k]); } } /** * Like Array.map(), but on this view of the array, and more restrictions on the callback function. */ map(f) { const result = []; const a = this.a; const end = this.end; for (let k = this.start; k <= end; ++k) { // because of starting and ending result[result.length] = f(a[k]); } return result; } /** * Like Array.map(), but on this view of the array, and the mapping function converts to a string. */ mapToString(f) { const result = []; const a = this.a; const end = this.end; for (let k = this.start; k <= end; ++k) { // because of starting and ending result[result.length] = f(a[k]); } return result; } /** * Pushes all of the elements from this view onto the end of the given array. */ pushAll(arry) { const a = this.a; const end = this.end; for (let k = this.start; k <= end; ++k) { arry[arry.length] = a[k]; } } /** * Retrieves an element from the array, relative to the start of this view. */ getElement(relative_index) { return this.a[this.start + relative_index]; } /** * Return an ArrayView that is a subsequence, starting at an offset relative to `this,` and with a given length. * If the result would be identical to `this`, then `this` itself is returned without allocating a new view. * If length is missing, the subsequence goes to the end. */ getSubsequence(relative_start, relative_length) { if (relative_length === undefined) { if (relative_start === 0) { // save an allocation if we can just return ourselves return this; } return new ArrayView(this.a, this.start + relative_start, this.end); // end is fixed, so no need to compute length and so forth } if (relative_length <= 0) { // clamp relative_length = 0; } if (relative_start == 0 && relative_length == this.length) { // happens to be identical to self return this; } const offset = this.start + relative_start; // slight optimization, since we use it twice return new ArrayView(this.a, offset, offset + relative_length - 1); } /** * Gets an array in reverse order, just the elements inside this view, as a new view. * This allocates memory. */ getReverse() { return new ArrayView(this.getCopy().reverse()); } toString() { return this.getCopy().join(''); } /** * True if this subsequence is equal to the other subsequence, false otherwise. */ equals(that) { // We intentionally use duplicate objects, e.g. for "equality" Edits, so if we're lucky, this super-fast check short-circuits "true" if (this === that) return true; // Length-check short-circuits "false" in many cases const len = this.length; if (len !== that.length) return false; // Reuse other code to compare all characters in both arrays return this.getLengthOfCommonPrefix(that) == len; } /** * Returns -1, 0, or 1, indicating whether `this` is less, equal, or greater than `that`, comparing like strings, e.g. one element at a time, * returning the answer if they're unequal, and if the shorter is equal to the prefix of the latter, the shorter is deemed smaller. */ compare(that) { const a = this.a; const b = that.a; const a_end = this.end; const b_end = that.end; let ai = this.start; let bi = that.start; while (ai <= a_end && bi <= b_end) { const a_el = a[ai]; const b_el = b[bi]; if (a_el < b_el) return -1; if (a_el > b_el) return 1; ++ai; ++bi; } return ai <= a_end ? 1 : bi <= b_end ? -1 : 0; } /** * Concatenates a sequence onto this one, NOT changing the current one, but rather returning a new result. * This is akin to Array#concat() with just one parameter. * Various optimizations prevent allocating memory whenever possible. */ concat(that) { // Trivial cases with no allocations; yay! if (this.empty) return that; if (that.empty) return this; // If exactly consecutive, we can get away with a new view on the existing underlying array. if (this.a === that.a && this.end + 1 === that.start) { return new ArrayView(this.a, this.start, that.end); } // We have to create a new array. const result = this.getCopy(); that.pushAll(result); return new ArrayView(result); } /** * Returns the number of elements that are common to the first elements in both arrays. * Each array can be any size, including empty, but cannot be null or undefined. */ getLengthOfCommonPrefix(that) { const a = this.a; const b = that.a; const a_end = this.end; const b_end = that.end; let ai = this.start; let bi = that.start; let n = 0; while (ai <= a_end && bi <= b_end) { if (a[ai] !== b[bi]) { break; } ++n; ++ai; ++bi; } return n; } /** * Returns the number of elements that are common to the last elements in both arrays. * Each array can be any size, including empty, but cannot be null or undefined. */ getLengthOfCommonSuffix(that) { const a = this.a; const b = that.a; const a_start = this.start; const b_start = that.start; let ai = this.end; let bi = that.end; let n = 0; while (ai >= a_start && bi >= b_start) { if (a[ai] !== b[bi]) { break; } ++n; --ai; --bi; } return n; } /** * Find the longest length L of a suffix of `this` which overlaps with a prefix of `that` also of length L. * Each array can be any size, including empty, but not null or undefined. Returns 0 if there is no commonality. */ getLengthOfOverlapAtMyEnd(that) { // FIXME: Better algorithm, maybe something like taking the first char of B, and scanning in A for it. // Anything in A that matches the whole prefix/suffix has to start by matching this way, but we reuse a fast algorithm for it. const a = this.a; const b = that.a; const a_start = this.start; const b_start = that.start; const a_end = this.end; const b_end = that.end; for (let n_inclusive = imin(a_end - a_start, b_end - b_start); n_inclusive >= 0; --n_inclusive) { let matches = true; for (let k = n_inclusive; k >= 0; --k) { if (a[a_end - k] !== b[b_start + n_inclusive - k]) { matches = false; break; } } if (matches) { return n_inclusive + 1; } } return 0; } /** * Returns a pair of ArrayViews containing the longest common substring in `this` and `that` respectively. * Both views in the pair will be empty if there's no common substring. * * This particular algorithm is slow -- O(N*M) -- but always finds the right answer, is simple, it approaches * O(N) if the longest substring is long, and it uses only O(1) additional memory. * It uses a few techniques to speed up, for finding the first element of a substring, * and by anticipating that a found-substring might be extended in-place to speed up the best-so-far length. * In short, the N-loop is slow but the M-loop is fast, and further, it orders strings such that N <= M. */ static getLongestCommonSubstringGrowingLinearScan(ths, tht) { // a is shorter, b is longer // Put the shorter string on the outer loop for faster execution. // As a special case, when sequence length is longer than the shorter string, it will automatically terminate; not so if longer were on the outside. // As a special case, if the shorter string is embedded completely in the longer, we'll find that very quickly and terminate. const this_is_shorter = ths.length <= tht.length; const shorter = (this_is_shorter ? ths : tht); const longer = (this_is_shorter ? tht : ths); const a = shorter.a; const b = longer.a; const a_start = shorter.start; const b_start = longer.start; const a_end_p1 = shorter.end + 1; const b_end_p1 = longer.end + 1; let subsequence_len = 1; // the current length of the subsequence that we're scanning for let best_a_start = 0; let best_b_start = 0; let best_length = 0; for (let ai = a_start; ai + subsequence_len <= a_end_p1;) { // Does the current subsequence in a, appear anywhere in b? // If yes, place the index in `bi_found`, else that will be -1 at the end of the loop. let bi_found = -1; const bi_end_search = b_end_p1 - subsequence_len; // don't scan off the end of the array, given the substring length we will require const target = a[ai]; // speedup: cache the target value let bi = b_start; while ((bi = longer.indexOf(target, bi, bi_end_search)) >= 0) { let found = true; for (let k = subsequence_len; --k > 0;) { // don't have to check when k==0 because the equality test above already did that if (a[ai + k] !== b[bi + k]) { found = false; break; } } if (found) { bi_found = bi; break; } ++bi; } // If we found something, save it as "best so far," and increment the length and try again, // otherwise keep the length the same and try the next index. if (bi_found >= 0) { // Attempt to extend the match we found. Don't run off the end of the array! for (let k = subsequence_len; ai + k < a_end_p1 && bi_found + k < b_end_p1 && a[ai + k] === b[bi_found + k];) { subsequence_len = ++k; // match! new length, and increment our test-length in k } // Save this as the "best so far," and continue with the next-larger subsequence length to beat it best_a_start = ai; best_b_start = bi_found; best_length = subsequence_len; ++subsequence_len; } else { ++ai; // next! } } const result = [ new ArrayView(a, best_a_start, best_a_start + best_length - 1), new ArrayView(b, best_b_start, best_b_start + best_length - 1), ]; if (!this_is_shorter) { result.reverse(); } return result; } /** * Returns a pair of ArrayViews containing the longest common substring in `this` and `that` respectively. * Both views in the pair will be empty if there's no common substring. * * This algorithm uses an "optimistic" method, in which if there are long common substrings (relative to the * shortest string), it will be found quickly, in O(M) comparisons. It is still O(NM) in the worst case, * which is the lack of a common substring, or a common of 1 character. */ static getLongestCommonSubstringOptimisticBisect(ths, tht) { // The shorter string must come first. If not, compute the converse, then reverse the order of the result. const is_swapped = ths.length > tht.length; const short = is_swapped ? tht : ths; const long = is_swapped ? ths : tht; // console.log(`LCS OB: [${short.toString()}] -> [${long.toString()}]`); // Load up the remaining variables const short_a = short.a; const long_a = long.a; const short_start = short.start; const long_start = long.start; const short_end = short.end; const long_end = long.end; const f_floor = Math.floor; // Initialize the state of the "best so far" // best x, y starting locations, and best length let best_so_far_x = -1; let best_so_far_y = -1; let best_so_far_len = 0; // Initialize bisection scanning queue. Must use a queue for scanning, to do breadth-first, attempting to // find the longest strings as soon as possible. This is the "optimism" of finding something long, and cuts off // all smaller scans in case we do. The format is [min,max] inclusive, the interval in the y direction, the smaller string. const scan_queue = [[short_start, short_end]]; while (scan_queue.length > 0) { // Dequeue the next interval to scan, and leave if it's too small to result in a longer substring than we've already found. const [y_min, y_max] = scan_queue.shift(); const interval_len = y_max - y_min + 1; if (interval_len <= best_so_far_len) { // once we start scanning things that are smaller than best-so-far, we can stop; all else enqueued are equal or smaller continue; } // Bisect the interval, thus finding all substrings that are at least half the lengths of this interval. const y = y_min + f_floor(interval_len / 2); const target = short_a[y]; let x = long_start; // console.log(` OB horizontal: scan '${target}' at y=${y}∈[${y_min},${y_max}], x∈[${long_start},${long_end}], seeking S > ${best_so_far_len}`); while ((x = long.indexOf(target, x, long_end)) >= 0) { // scan, using indexOf() to leap to the next match // console.log(` HIT: (${x},${y})`); // Find the head and tail of the snake in both strings, by extending in both directions let x_snake_start = x; let y_snake_start = y; let snake_length = 1; while (y_snake_start > short_start && x_snake_start > long_start && short_a[y_snake_start - 1] === long_a[x_snake_start - 1]) { // check backward, and track that starting point ++snake_length; --x_snake_start; --y_snake_start; } const max_long_length = long_end - x_snake_start; const max_short_length = short_end - y_snake_start; const max_snake_length = max_long_length < max_short_length ? max_long_length : max_short_length; // the snake cannot be longer than this, since the head is fixed while (snake_length <= max_snake_length && short_a[y_snake_start + snake_length] === long_a[x_snake_start + snake_length]) { ++snake_length; } // console.log(` HIT RESULT: (${x_snake_start},${y_snake_start}) for ${snake_length}`); // If this is the biggest snake we've seen so far, record it. if (snake_length > best_so_far_len) { // console.log(` LONGEST: (${x_snake_start},${y_snake_start}) for ${snake_length}`); best_so_far_x = x_snake_start; best_so_far_y = y_snake_start; best_so_far_len = snake_length; } // Next time through, start scanning at the following character ++x; } // Enqueue the recursive intervals if (y - y_min > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance! scan_queue.push([y_min, y - 1]); } if (y_max - y > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance! scan_queue.push([y + 1, y_max]); } } // If nothing found, return an empty result if (best_so_far_len == 0) { const empty = ArrayView.createEmpty(); return [empty, empty]; } // Return the result in the form of ArrayViews. // Reorder them based on whether we originally swapped them for the short/long requirement. const short_result = new ArrayView(short_a, best_so_far_y, best_so_far_y + best_so_far_len - 1); const long_result = new ArrayView(long_a, best_so_far_x, best_so_far_x + best_so_far_len - 1); return is_swapped ? [long_result, short_result] : [short_result, long_result]; } /** * Finds the longest common substring between two strings, returning the offset in the two input strings, * and the length. * * This algorithm uses an "optimistic" method, in which if there are long common substrings (relative to the * shortest string), it will be found quickly, in O(M) comparisons. It is still O(NM) in the worst case, * which is the lack of a common substring, or a common of 1 character. */ static getLongestCommonSubstringOptimisticBisectString(a, b) { // The shorter string must come first. If not, compute the converse, then reverse the order of the result. const is_swapped = a.length > b.length; const short_a = is_swapped ? b : a; const long_a = is_swapped ? a : b; const f_floor = Math.floor; // console.log(`LCS OB: [${short_a}] -> [${long_a}]`); // Load up the remaining variables const short_end = short_a.length - 1; const long_end = long_a.length - 1; // Initialize the state of the "best so far" // best x, y starting locations, and best length let best_so_far_x = -1; let best_so_far_y = -1; let best_so_far_len = 0; // Initialize bisection scanning queue. Must use a queue for scanning, to do breadth-first, attempting to // find the longest strings as soon as possible. This is the "optimism" of finding something long, and cuts off // all smaller scans in case we do. The format is [min,max] inclusive, the interval in the y direction, the smaller string. const scan_queue = [[0, short_end]]; while (scan_queue.length > 0) { // Dequeue the next interval to scan, and leave if it's too small to result in a longer substring than we've already found. const [y_min, y_max] = scan_queue.shift(); const interval_len = y_max - y_min + 1; if (interval_len <= best_so_far_len) { // once we start scanning things that are smaller than best-so-far, we can stop; all else enqueued are equal or smaller continue; } // Bisect the interval, thus finding all substrings that are at least half the lengths of this interval. const y = y_min + f_floor(interval_len / 2); const target = short_a.charAt(y); let x = 0; // console.log(` OB horizontal: scan '${target}' at y=${y}∈[${y_min},${y_max}], x∈[${long_start},${long_end}], seeking S > ${best_so_far_len}`); while ((x = long_a.indexOf(target, x)) >= 0) { // scan, using indexOf() to efficiently leap to the next match // console.log(` HIT: (${x},${y})`); // Find the head and tail of the snake in both strings, by extending in both directions let x_snake_start = x; let y_snake_start = y; let snake_length = 1; while (y_snake_start > 0 && x_snake_start > 0 && short_a.charAt(y_snake_start - 1) === long_a.charAt(x_snake_start - 1)) { // check backward, and track that starting point ++snake_length; --x_snake_start; --y_snake_start; } const max_long_length = long_end - x_snake_start; const max_short_length = short_end - y_snake_start; const max_snake_length = max_long_length < max_short_length ? max_long_length : max_short_length; // the snake cannot be longer than this, since the head is fixed while (snake_length <= max_snake_length && short_a.charAt(y_snake_start + snake_length) === long_a.charAt(x_snake_start + snake_length)) { ++snake_length; } // console.log(` HIT RESULT: (${x_snake_start},${y_snake_start}) for ${snake_length}`); // If this is the biggest snake we've seen so far, record it. if (snake_length > best_so_far_len) { // console.log(` LONGEST: (${x_snake_start},${y_snake_start}) for ${snake_length}`); best_so_far_x = x_snake_start; best_so_far_y = y_snake_start; best_so_far_len = snake_length; } // Next time through, start scanning at the following character ++x; } // Enqueue the recursive intervals if (y - y_min > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance! scan_queue.push([y_min, y - 1]); } if (y_max - y > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance! scan_queue.push([y + 1, y_max]); } } // Return the result, making sure to swap back a/b for short/long return { a_offset: is_swapped ? best_so_far_x : best_so_far_y, b_offset: is_swapped ? best_so_far_y : best_so_far_x, len: best_so_far_len, }; } /** * Returns the index coordinates of the middle of the longest common subsequence between the two arrays. * The longest subsequence is the longest set of ordered (but not necessarily consecutive) equal elements * from both arrays. The middle is often but not always along a common substring. Even when it is a common * substring, it is often not the globally-longest common substring. * * The index is relative to the start of each ArrayView, not relative to the absolute position inside the * underlying array. * * If there is no common subsequence whatsoever, `null` is returned. */ getLongestCommonSubsequenceMiddleMyers(that) { // HT to fraser@google.com (Neil Fraser); adapted this algorithm from his diff-merge-patch Javascript code. // console.log(`LCS Myers: [${this.toString()}] -> [${that.toString()}]`); const a = this.a; const b = that.a; const a_start = this.start; const b_start = that.start; // Cache the text lengths to prevent multiple calls. let text1_length = this.length; let text2_length = that.length; let max_d = Math.ceil((text1_length + text2_length) / 2); let v_offset = max_d; let v_length = 2 * max_d; let v1 = new Array(v_length); let v2 = new Array(v_length); // Setting all elements to -1 is faster in Chrome & Firefox than mixing // integers and undefined. for (let i = 0; i < v_length; i++) { v1[i] = -1; v2[i] = -1; } v1[v_offset + 1] = 0; v2[v_offset + 1] = 0; let delta = text1_length - text2_length; // If the total number of characters is odd, then the front path will collide // with the reverse path. let front = (delta % 2 != 0); // Offsets for start and end of k loop. // Prevents mapping of space beyond the grid. let k1start = 0; let k1end = 0; let k2start = 0; let k2end = 0; for (let d = 0; d < max_d; d++) { // Walk the front path one step. for (let k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { let k1_offset = v_offset + k1; let x1; if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) { x1 = v1[k1_offset + 1]; } else { x1 = v1[k1_offset - 1] + 1; } let y1 = x1 - k1; // let snake_length = 0; while (x1 < text1_length && y1 < text2_length && a[a_start + x1] === b[b_start + y1]) { ++x1; ++y1; // ++snake_length; } v1[k1_offset] = x1; if (x1 > text1_length) { // Ran off the right of the graph. k1end += 2; } else if (y1 > text2_length) { // Ran off the bottom of the graph. k1start += 2; } else if (front) { let k2_offset = v_offset + delta - k1; if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) { // Mirror x2 onto top-left coordinate system. const x2 = text1_length - v2[k2_offset]; // const k2 = k2_offset - v_offset; // const y2 = text2_length - (v2[k2_offset] - k2); if (x1 >= x2) { // Overlap detected. Return the head of the snake that we might have just traversed // console.log(`>>> LCS from forwards, S=${snake_length}, (x1,y2)=(${x1},${y1}) vs (x2,y2)=(${x2},${y2})`); return [x1, y1]; } } } } // Walk the reverse path one step. for (let k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { let k2_offset = v_offset + k2; let x2; if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) { x2 = v2[k2_offset + 1]; } else { x2 = v2[k2_offset - 1] + 1; } let y2 = x2 - k2; // let snake_length = 0; while (x2 < text1_length && y2 < text2_length && a[a_start + text1_length - x2 - 1] === b[b_start + text2_length - y2 - 1]) { ++x2; ++y2; // ++snake_length; } v2[k2_offset] = x2; if (x2 > text1_length) { // Ran off the left of the graph. k2end += 2; } else if (y2 > text2_length) { // Ran off the top of the graph. k2start += 2; } else if (!front) { let k1_offset = v_offset + delta - k2; if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) { const x1 = v1[k1_offset]; const y1 = v_offset + x1 - k1_offset; // Mirror x2 onto top-left coordinate system. x2 = text1_length - x2; if (x1 >= x2) { // Overlap detected. If there was a snake, we already traversed it, and these coordinates are the head // console.log(`>>> LCS from backwards, S=${snake_length}, (x1,y2)=(${x1},${y1}) vs (x2,y2)=(${x2},${text2_length-y2})`); return [x1, y1]; } } } } } // No overlap; completely different strings, no common substring. return null; } /** * Breaks a string into individual characters, and returns an array view of those characters. */ static fromCharacters(str) { return new ArrayView(str.split('')); } /** * Breaks a string into tokens based on any regular expression, and returns an array view of those tokens. */ static fromTokens(str, re) { const tokens = []; if (str) { // odd case of empty string const str_length = str.length; let m; let last_non_token_idx = 0; while ((m = re.exec(str)) !== null) { // console.log(`Found ${m[0]} at ${m.index}. Next starts at ${re.lastIndex}.`); const m_index = m.index; while (m_index > last_non_token_idx) { // add all non-token characters as their own token tokens[tokens.length] = str.substring(last_non_token_idx, last_non_token_idx + 1); ++last_non_token_idx; } tokens[tokens.length] = str.substring(last_non_token_idx, re.lastIndex); // add this entire token last_non_token_idx = re.lastIndex; // remember where we are } while (last_non_token_idx < str_length) { // drain any remaining non-tokens tokens[tokens.length] = str.substring(last_non_token_idx, last_non_token_idx + 1); ++last_non_token_idx; } } return new ArrayView(tokens); } /** * Tokenizes a string assuming the use-case is plain-text prose. */ static fromPlainProse(str) { return ArrayView.fromTokens(str, /\w+/g); // inline constant regex is faster than loading from a private static class variable } /** * Tokenizes a string by lines. Includes the line-ending character as well. */ static fromLines(str) { if (!str) return ArrayView.createEmpty(); // special case return new ArrayView(str.split(/([^\r\n]*(?:\r?\n|$))/g).filter(str => str.length > 0)); // FIXME: Slower, but Safari dies when we try the better one below // return new ArrayView( str.split(/(?<=\r?\n)/g) ); // splitting with non-consuming pattern (is this slow? is there a better way?) } } exports.ArrayView = ArrayView; /** * Generates a histogram from a set of tokens, counting the number of times each appears, and the location * that the first instance of that token appears. */ class Histogram { constructor(input) { /** * Map of each unique symbol's ID to its information record. */ this.histogram = {}; /** * Map of the relative-position in the original input, to the symbol ID, and a field that can be used in algorithms to */ this.ordered = []; const hist = this.histogram; const a = input.a; const a_start = input.start; const len = input.length; for (let k = 0; k < len; ++k) { const id = this.getIdForElement(a[a_start + k]); let r = hist[id]; if (!r) { // if the first time, create the record r = { id: id, count: 1, first_offset: k, }; hist[id] = r; } else { // not the first time, increment the count ++r.count; } this.ordered.push({ id: id, }); } } /** * Given an element from the original input, returns its histogram record, or `undefined` if it's not present in the original input */ lookupElement(el) { return this.histogram[this.getIdForElement(el)]; } /** * Converts any type of element into a unique ID that is used as a key in the histogram array. */ getIdForElement(el) { if (typeof el == "string" || typeof el == "number") { return el; } return JSON.stringify(el); // FIXME: need a different algorithm that orders keys and such; should be in another project } } exports.Histogram = Histogram; // Describes a step in transforming "prev" to "next." class Edit { constructor(prev, next) { this.prev = prev; this.next = next; // nothing to do except set the fields } static createEquality(keep) { return new Edit(keep, keep); // must be the same object, not just duplicate data, for fast-check of equality later } static createPureInsertion(ins) { return new Edit(ArrayView.createEmpty(), ins); // must be the same object, not just duplicate data, for fast-check of equality later } static createPureDeletion(del) { return new Edit(del, ArrayView.createEmpty()); // must be the same object, not just duplicate data, for fast-check of equality later } /** * True if this edit represents equal subsequences in the previous and next. * * @readonly */ isEquality() { return this.prev.equals(this.next); } /** * True if this edit is an insertion, with no deletion. * * @readonly */ isPureInsertion() { return this.prev.empty && !this.next.empty; } /** * True if this edit is a deletion, with no insertion. * * @readonly */ isPureDeletion() { return !this.prev.empty && this.next.empty; } /** * True if this edit is a modification, i.e. both a non-trivial delete and a non-trivial insert at the same location. * * @readonly */ isModification() { return !this.next.empty && !this.prev.empty && !this.isEquality(); } /** * Creates and returns a new Edit object, that is the same as this one, but the opposite. * Inserts become deletes. Equalities are returned without creating a new Edit object. * * @readonly */ getConverse() { return this.isEquality() ? this : new Edit(this.next, this.prev); } /** * Returns a human-readable, but not machine-usable, representation of this edit * * @param matched {boolean} if true, "equality" operations are surrounded by parenthesis to be explicit, otherwise they are plain for easier readability */ toString(matched = false) { if (this.isEquality()) { return matched ? `(${this.prev.toString()})` : this.prev.toString(); } if (this.isPureDeletion()) { return `[${this.prev.toString()}]`; } if (this.isPureInsertion()) { return `{${this.next.toString()}}`; } return `[${this.prev.toString()}]{${this.next.toString()}}`; } } exports.Edit = Edit; // A list of edits, with various algorithms for that list. class EditScript { constructor() { this.edits = []; // nothing more to do } get length() { return this.edits.length; } get empty() { return this.edits.length == 0; } append(e) { this.edits.push(e); return this; } prepend(e) { this.edits.unshift(e); return this; } /** * Returns a human-readable, but not machine-usable, representation of the entire edit script. * * @param matched {boolean} if true, "equality" operations are surrounded by parenthesis to be explicit, otherwise they are plain for easier readability */ toString(matched = false) { return this.edits.map((e) => e.toString(matched)).join(''); } /** * Given the output of `EditScript<T>.toString()`, parses and returns the result as a string-typed EditScript. */ static fromString(s) { var _a, _b, _c, _d, _e, _f; const result = new EditScript(); let m; // Find all deletions let deletions = []; while ((m = EditScript.re_delete.exec(s)) !== null) { deletions.push({ edit: new Edit(ArrayView.fromCharacters(m[1]), ArrayView.createEmpty()), offset: m.index, }); } // Find all insertions let insertions = []; while ((m = EditScript.re_insert.exec(s)) !== null) { insertions.push({ edit: new Edit(ArrayView.createEmpty(), ArrayView.fromCharacters(m[1])), offset: m.index, }); } // Find all equalities let equals = []; while ((m = EditScript.re_equal.exec(s)) !== null) { const av = ArrayView.fromCharacters(m[1]); equals.push({ edit: new Edit(av, av), offset: m.index, }); } // Splice it all together let di = 0; let ii = 0; let ei = 0; while (di < deletions.length || ii < insertions.length || ei < equals.length) { const d_offset = (_b = (_a = deletions[di]) === null || _a === void 0 ? void 0 : _a.offset) !== null && _b !== void 0 ? _b : Number.MAX_SAFE_INTEGER; const i_offset = (_d = (_c = insertions[ii]) === null || _c === void 0 ? void 0 : _c.offset) !== null && _d !== void 0 ? _d : Number.MAX_SAFE_INTEGER; const e_offset = (_f = (_e = equals[ei]) === null || _e === void 0 ? void 0 : _e.offset) !== null && _f !== void 0 ? _f : Number.MAX_SAFE_INTEGER; if (e_offset < d_offset && e_offset < i_offset) { result.append(equals[ei++].edit); } else if (d_offset < e_offset && d_offset < i_offset) { result.append(deletions[di++].edit); } else if (i_offset < e_offset && i_offset < e_offset) { result.append(insertions[ii++].edit); } else { throw new Error("Shouldn't get here."); } } // Join insert/delete pairs into a single modification result.visitEditPairs((left, right) => { if (left.isPureDeletion() && right.isPureInsertion()) { return [new Edit(left.prev, right.next)]; } return null; }); return result; } /** * Recreate the "previous" sequence using only edits, concatenating back to an array. */ getPrev() { const result = []; for (let k = 0; k < this.edits.length; ++k) { this.edits[k].prev.pushAll(result); } return result; } /** * Recreate the "next" sequence using only edits, concatenating back to an array. */ getNext() { const result = []; for (let k = 0; k < this.edits.length; ++k) { this.edits[k].next.pushAll(result); } return result; } /** * Creates and returns a new EditScript, that is the same as this one, but goes the opposite direction. * Inserts become deletes. */ getConverse() { const converse = new EditScript(); this.edits.forEach((ed) => { converse.edits[converse.length] = ed.getConverse(); }); return converse; } /** * Creates a string, assuming the script represents strings of text, emitting lines with a gutter * of '+' for insertion, '-' for deletion, or ' ' for equality. */ getScriptAsFormattedLines() { const result = []; const appendLines = (av, prefix) => { result[result.length] = av.mapToString((el) => prefix + el).join(''); }; this.visitEditsForward((ed) => { if (ed.isEquality()) { appendLines(ed.prev, ' '); } else { appendLines(ed.prev, '-'); appendLines(ed.next, '+'); } }); return result.join(''); } /** * Visits all Edits, in forward order (which disallows changing the edit list while iterating). */ visitEditsForward(f_visit) { const edits = this.edits; // speedup const len = edits.length; for (let ei = 0; ei < len; ++ei) { f_visit(edits[ei]); } } /** * Visits all Edits, in reverse order (which allows for changes to the underlying edit script). * * The callback function can return null to indicate that no change should be made in the edit script, * or it can return an array which replaces the two edits completely. If some of the edits should be * preserved, just include them in the array. It is legal for the array to be any length, including empty. */ visitEdits(f_visit) { const edits = this.edits; // speedup for (let ei = edits.length; --ei >= 0;) { const replacement = f_visit(edits[ei]); if (replacement !== null) { this.edits.splice(ei, 1, ...replacement); } } } /** * Visits all pairs of Edits, in reverse order (which allows for changes to the underlying edit script). * Won't visit anything if there's just one Edit. * * The callback function can return null to indicate that no change should be made in the edit script, * or it can return an array which replaces the two edits completely. If some of the edits should be * preserved, just include them in the array. It is legal for the array to be any length, including empty. */ visitEditPairs(f_visit) { const edits = this.edits; // speedup for (let ei = edits.length - 1; --ei >= 0;) { const replacement = f_visit(edits[ei], edits[ei + 1]); if (replacement !== null) { this.edits.splice(ei, 2, ...replacement); } } } /** * Visits all trios of Edits, in reverse order (which allows for changes to the underlying edit script). * Won't visit anything if there are fewer than three Edits. * * The callback function can return null to indicate that no change should be made in the edit script, * or it can return an array which replaces the three edits completely. If some of the edits should be * preserved, just include them in the array. It is legal for the array to be any length, including empty. */ visitEditTrios(f_visit) { const edits = this.edits; // speedup for (let ei = edits.length - 1; --ei > 0;) { const replacement = f_visit(edits[ei - 1], edits[ei], edits[ei + 1]); if (replacement !== null) { this.edits.splice(ei - 1, 3, ...replacement); } } } /** * Scans for an Equality edit (the "middle"), surrounded on both sides by either two insertions, two deletions, * or one modification and any change on the other side. These are the conditions in which it is * legal to join the left, middle, and right edits into a single edit. While the resulting script is * identical, it reduces the total number of edits in a way that might be preferable, usually for semantic * reasons. For example, a single space separating changes to words on either side, probably should be * folded into a sing