@asmartbear/diff-merge
Version:
Text and arbitrary-array diff and merge, fast, multiple algorithms
1,116 lines • 96.3 kB
JavaScript
'use strict';
Object.defineProperty(exports, "__esModule", { value: true });
// A dependency-free, type-safe difference algorithm.
// Operates on arrays of values, which just need to support the concept of equality (like strings).
// Generally tries to find minimal edits, but trades off for speed especially with longer sequences.
// Terminology:
// "prev" -- the previous array; the "before"
// "next" -- the next array; the "after"
// Runtime-environment detection, evaluated once when the module loads.
// Node is recognized by a global `process`; Chrome by a `chrome` property on the global `window`.
const is_node = "undefined" !== typeof process;
const is_chrome = is_node ? false : ("undefined" !== typeof window && "chrome" in window);
// const is_safari: boolean = !is_node && typeof window !== "undefined" && "safari" in window;
// Node and Chrome are the environments treated here as running the V8 Javascript engine.
const is_v8 = is_chrome || is_node;
// Whether to prefer native functions over our own hand-written loops; decided per environment for speed.
const should_use_native_indexof = is_v8;
// const supports_backwards_match_regex: boolean = !is_safari;
/**
 * Two-argument stand-in for Math.max(), which the original author found faster in some Javascript environments.
 *
 * Unlike Math.max() it performs no coercion and simply applies `>`: when the comparison is false
 * (including when either argument is NaN) the second argument is returned.
 *
 * @param a first value
 * @param b second value
 * @returns `a` if `a > b`, otherwise `b`
 */
function imax(a, b) {
    if (a > b) {
        return a;
    }
    return b;
}
/**
 * Two-argument stand-in for Math.min(), which the original author found faster in some Javascript environments.
 *
 * Unlike Math.min() it performs no coercion and simply applies `<`: when the comparison is false
 * (including when either argument is NaN) the second argument is returned.
 *
 * @param a first value
 * @param b second value
 * @returns `a` if `a < b`, otherwise `b`
 */
function imin(a, b) {
    if (a < b) {
        return a;
    }
    return b;
}
/**
 * Returns the length, in UTF-16 code units, of the longest prefix that both strings share.
 * Returns 0 if they share nothing, or if either argument is empty, null, or undefined.
 *
 * Unicode-safe: the returned length never falls between the two halves of a surrogate pair. If the shared
 * code units end with a high surrogate that is followed, in either string, by a low surrogate, that high
 * surrogate is excluded, so slicing either string at the returned length cannot create a lone surrogate.
 *
 * @param a first string
 * @param b second string
 * @returns number of leading UTF-16 code units common to `a` and `b`
 */
function getCommonPrefixLength(a, b) {
    if (!a || !b || a.charCodeAt(0) !== b.charCodeAt(0))
        return 0; // trivial, quick cases
    const max = a.length < b.length ? a.length : b.length;
    let k = 1; // the first code unit is already known to match
    while (k < max && a.charCodeAt(k) === b.charCodeAt(k)) {
        ++k;
    }
    // The shared prefix is [0, k). If it ends on a high surrogate whose partner follows in either string,
    // back off by one so the pair is kept whole. charCodeAt() past the end yields NaN, which fails both tests.
    const last_shared = a.charCodeAt(k - 1);
    if (last_shared >= 0xD800 && last_shared <= 0xDBFF) {
        const next_a = a.charCodeAt(k);
        const next_b = b.charCodeAt(k);
        const a_continues_pair = next_a >= 0xDC00 && next_a <= 0xDFFF;
        const b_continues_pair = next_b >= 0xDC00 && next_b <= 0xDFFF;
        if (a_continues_pair || b_continues_pair) {
            return k - 1;
        }
    }
    return k;
}
exports.getCommonPrefixLength = getCommonPrefixLength;
/**
* Holds an array with offset and length, so we can create a pseudo-array "view" into an array without
* making large-array copies or allocations. The "end" is inclusive!
*/
class ArrayView {
/**
* Creates a view over `a` without copying it; the array is stored by reference.
*
* @param a the underlying array
* @param start absolute index in `a` of the first element of the view; defaults to 0
* @param end absolute index in `a` of the last element of the view (inclusive); defaults to the last index of `a`.
*            An `end` of `start - 1` therefore describes an empty view.
*/
constructor(a, start = 0, end = a.length - 1) {
this.a = a;
this.start = start;
this.end = end;
// nothing else to do: a view is just these three fields
}
static createEmpty() {
return new ArrayView([], 0, -1); // FIXME: share single empty array object since it's immutable?
}
/**
* The number of elements in this view.
*/
get length() {
return this.end - this.start + 1;
}
/**
* True if this view is empty, i.e. represents the empty array, and has length 0.
*/
get empty() {
return this.end < this.start;
}
/**
* Like Array.indexOf(), but includes start (relative to the actual start of the underlying array) and end (inclusive
* and relative to the actual underlying array).
*/
indexOf(target, start = this.start, end = this.end) {
// In most browsers, Array#indexOf() is very slow. Therefore, we scan it ourselves.
// Plus Array#indexOf() doesn't understand the end of the array, which we could correct for, but it's slower anyway.
// Benchmark: https://run.perf.zone/view/for-vs-while-vs-indexof-1000-strings-1516293164213
// The native indexOf() can be faster, even though it can do "extra work" by running off the end of the array.
if (should_use_native_indexof) {
const result = this.a.indexOf(target, start);
return result > end ? -1 : result;
}
// Manual scan.
const a = this.a;
for (let k = start; k <= end; ++k) {
if (a[k] === target) {
return k;
}
}
return -1;
}
/**
* Same behavior as String#indexOf: Find another ArrayView as a consecutive substring on this array, returning the
* (absolute) index of the first such position, or -1 if not found.
*/
indexOfSubstring(target, start = this.start, end = this.end) {
// Trivial cases
if (target.empty)
return 0;
if (this.empty)
return -1;
// The following implementation is fine if the underlying array is short, but if it's long, it would be worth computing
// a table for something like KMP or Boyer-Moore searching. Be sure to add lots more unit tests before doing that though!
const target_first_element = target.a[target.start]; // cache this one, since we're scanning for it first
const target_length = target.length;
if (target_length === 1)
return this.indexOf(target_first_element, start, end); // a faster algorithm in the case of finding a single element
const a = this.a;
end = imin(end, this.end - target_length + 1); // don't run off the end of our array looking for the target
for (let k = start; k <= end; ++k) { // if the target is longer than us, this loop will end trivially, so we don't need to check that case separately
if (a[k] === target_first_element) {
let matched = true;
for (let len = 1; len < target_length; ++len) {
if (a[k + len] !== target.a[target.start + len]) {
matched = false;
break;
}
}
if (matched) {
return k;
}
}
}
return -1;
}
/**
* Creates a copy of this subsection of the array, which is therefore modifable, and allocates memory.
*/
getCopy() {
/**
* Creates shallow copy of an array, using whatever is fastest in the current environment.
* Ref: https://jsperf.com/new-array-vs-splice-vs-slice/31
*
* In Safari desktop it's fastest to while-loop; slice is 11% slower.
* In Chrome desktop it's fastest to slice().
* In Firefox desktop it's an order of magnitude faster to slice().
*/
return this.a.slice(this.start, this.end + 1);
}
/**
* Like Array.forEach(), but on this view of the array, and more restrictions on the callback function.
*/
forEach(f) {
const a = this.a;
const end = this.end;
for (let k = this.start; k <= end; ++k) { // because of starting and ending
f(a[k]);
}
}
/**
* Like Array.map(), but on this view of the array, and more restrictions on the callback function.
*/
map(f) {
const result = [];
const a = this.a;
const end = this.end;
for (let k = this.start; k <= end; ++k) { // because of starting and ending
result[result.length] = f(a[k]);
}
return result;
}
/**
* Like Array.map(), but on this view of the array, and the mapping function converts to a string.
*/
mapToString(f) {
const result = [];
const a = this.a;
const end = this.end;
for (let k = this.start; k <= end; ++k) { // because of starting and ending
result[result.length] = f(a[k]);
}
return result;
}
/**
* Pushes all of the elements from this view onto the end of the given array.
*/
pushAll(arry) {
const a = this.a;
const end = this.end;
for (let k = this.start; k <= end; ++k) {
arry[arry.length] = a[k];
}
}
/**
* Retrieves an element from the array, relative to the start of this view.
*/
getElement(relative_index) {
return this.a[this.start + relative_index];
}
/**
* Return an ArrayView that is a subsequence, starting at an offset relative to `this,` and with a given length.
* If the result would be identical to `this`, then `this` itself is returned without allocating a new view.
* If length is missing, the subsequence goes to the end.
*/
getSubsequence(relative_start, relative_length) {
if (relative_length === undefined) {
if (relative_start === 0) { // save an allocation if we can just return ourselves
return this;
}
return new ArrayView(this.a, this.start + relative_start, this.end); // end is fixed, so no need to compute length and so forth
}
if (relative_length <= 0) { // clamp
relative_length = 0;
}
if (relative_start == 0 && relative_length == this.length) { // happens to be identical to self
return this;
}
const offset = this.start + relative_start; // slight optimization, since we use it twice
return new ArrayView(this.a, offset, offset + relative_length - 1);
}
/**
* Gets an array in reverse order, just the elements inside this view, as a new view.
* This allocates memory.
*/
getReverse() {
return new ArrayView(this.getCopy().reverse());
}
toString() {
return this.getCopy().join('');
}
/**
* True if this subsequence is equal to the other subsequence, false otherwise.
*/
equals(that) {
// We intentionally use duplicate objects, e.g. for "equality" Edits, so if we're lucky, this super-fast check short-circuits "true"
if (this === that)
return true;
// Length-check short-circuits "false" in many cases
const len = this.length;
if (len !== that.length)
return false;
// Reuse other code to compare all characters in both arrays
return this.getLengthOfCommonPrefix(that) == len;
}
/**
* Returns -1, 0, or 1, indicating whether `this` is less, equal, or greater than `that`, comparing like strings, e.g. one element at a time,
* returning the answer if they're unequal, and if the shorter is equal to the prefix of the latter, the shorter is deemed smaller.
*/
compare(that) {
const a = this.a;
const b = that.a;
const a_end = this.end;
const b_end = that.end;
let ai = this.start;
let bi = that.start;
while (ai <= a_end && bi <= b_end) {
const a_el = a[ai];
const b_el = b[bi];
if (a_el < b_el)
return -1;
if (a_el > b_el)
return 1;
++ai;
++bi;
}
return ai <= a_end ? 1 : bi <= b_end ? -1 : 0;
}
/**
* Concatenates a sequence onto this one, NOT changing the current one, but rather returning a new result.
* This is akin to Array#concat() with just one parameter.
* Various optimizations prevent allocating memory whenever possible.
*/
concat(that) {
// Trivial cases with no allocations; yay!
if (this.empty)
return that;
if (that.empty)
return this;
// If exactly consecutive, we can get away with a new view on the existing underlying array.
if (this.a === that.a && this.end + 1 === that.start) {
return new ArrayView(this.a, this.start, that.end);
}
// We have to create a new array.
const result = this.getCopy();
that.pushAll(result);
return new ArrayView(result);
}
/**
* Returns the number of elements that are common to the first elements in both arrays.
* Each array can be any size, including empty, but cannot be null or undefined.
*/
getLengthOfCommonPrefix(that) {
const a = this.a;
const b = that.a;
const a_end = this.end;
const b_end = that.end;
let ai = this.start;
let bi = that.start;
let n = 0;
while (ai <= a_end && bi <= b_end) {
if (a[ai] !== b[bi]) {
break;
}
++n;
++ai;
++bi;
}
return n;
}
/**
* Returns the number of elements that are common to the last elements in both arrays.
* Each array can be any size, including empty, but cannot be null or undefined.
*/
getLengthOfCommonSuffix(that) {
const a = this.a;
const b = that.a;
const a_start = this.start;
const b_start = that.start;
let ai = this.end;
let bi = that.end;
let n = 0;
while (ai >= a_start && bi >= b_start) {
if (a[ai] !== b[bi]) {
break;
}
++n;
--ai;
--bi;
}
return n;
}
/**
* Find the longest length L of a suffix of `this` which overlaps with a prefix of `that` also of length L.
* Each array can be any size, including empty, but not null or undefined. Returns 0 if there is no commonality.
*/
getLengthOfOverlapAtMyEnd(that) {
// FIXME: Better algorithm, maybe something like taking the first char of B, and scanning in A for it.
// Anything in A that matches the whole prefix/suffix has to start by matching this way, but we reuse a fast algorithm for it.
const a = this.a;
const b = that.a;
const a_start = this.start;
const b_start = that.start;
const a_end = this.end;
const b_end = that.end;
for (let n_inclusive = imin(a_end - a_start, b_end - b_start); n_inclusive >= 0; --n_inclusive) {
let matches = true;
for (let k = n_inclusive; k >= 0; --k) {
if (a[a_end - k] !== b[b_start + n_inclusive - k]) {
matches = false;
break;
}
}
if (matches) {
return n_inclusive + 1;
}
}
return 0;
}
/**
* Finds the longest run of consecutive elements that appears in both views.
*
* How it works: the shorter view drives the outer loop. For the current starting index in it, the longer view
* is searched for a run of `subsequence_len` matching elements. A hit is extended in place for as long as both
* views keep matching, recorded as the best so far, and the required length is then raised by one, so any later
* candidate only counts if it is strictly longer. A miss moves on to the next starting index instead.
*
* Original author's notes: this algorithm is slow -- O(N*M) -- but always finds the right answer, is simple,
* approaches O(N) if the longest substring is long, and uses only O(1) additional memory.
*
* @param ths the first view
* @param tht the second view
* @returns a two-element array `[view on ths.a, view on tht.a]` locating the common run in each input.
*          Both views have the same length; if nothing is shared, both are empty (start 0, end -1).
*/
static getLongestCommonSubstringGrowingLinearScan(ths, tht) {
// a is shorter, b is longer
// Put the shorter string on the outer loop for faster execution.
// As a special case, when sequence length is longer than the shorter string, it will automatically terminate; not so if longer were on the outside.
// As a special case, if the shorter string is embedded completely in the longer, we'll find that very quickly and terminate.
const this_is_shorter = ths.length <= tht.length;
const shorter = (this_is_shorter ? ths : tht);
const longer = (this_is_shorter ? tht : ths);
const a = shorter.a;
const b = longer.a;
const a_start = shorter.start;
const b_start = longer.start;
// "p1" = "plus one": exclusive upper bounds, which simplify the length arithmetic below
const a_end_p1 = shorter.end + 1;
const b_end_p1 = longer.end + 1;
let subsequence_len = 1; // the current length of the subsequence that we're scanning for
// Best match so far, as absolute indexes into `a` and `b`; a length of 0 means "nothing found yet"
let best_a_start = 0;
let best_b_start = 0;
let best_length = 0;
// Note there is no increment clause: each pass either grows `subsequence_len` (on a hit) or advances `ai` (on a miss).
for (let ai = a_start; ai + subsequence_len <= a_end_p1;) {
// Does the current subsequence in a, appear anywhere in b?
// If yes, place the index in `bi_found`, else that will be -1 at the end of the loop.
let bi_found = -1;
const bi_end_search = b_end_p1 - subsequence_len; // don't scan off the end of the array, given the substring length we will require
const target = a[ai]; // speedup: cache the target value
let bi = b_start;
while ((bi = longer.indexOf(target, bi, bi_end_search)) >= 0) {
// The first element matches at `bi`; verify the remaining elements, from the far end backward
let found = true;
for (let k = subsequence_len; --k > 0;) { // don't have to check when k==0 because the equality test above already did that
if (a[ai + k] !== b[bi + k]) {
found = false;
break;
}
}
if (found) {
bi_found = bi;
break;
}
++bi; // resume the search just after this false start
}
// If we found something, save it as "best so far," and increment the length and try again,
// otherwise keep the length the same and try the next index.
if (bi_found >= 0) {
// Attempt to extend the match we found. Don't run off the end of the array!
for (let k = subsequence_len; ai + k < a_end_p1 && bi_found + k < b_end_p1 && a[ai + k] === b[bi_found + k];) {
subsequence_len = ++k; // match! new length, and increment our test-length in k
}
// Save this as the "best so far," and continue with the next-larger subsequence length to beat it
best_a_start = ai;
best_b_start = bi_found;
best_length = subsequence_len;
++subsequence_len;
}
else {
++ai; // next!
}
}
// Build the result as [shorter, longer], then restore the caller's [ths, tht] order if we swapped them
const result = [
new ArrayView(a, best_a_start, best_a_start + best_length - 1),
new ArrayView(b, best_b_start, best_b_start + best_length - 1),
];
if (!this_is_shorter) {
result.reverse();
}
return result;
}
/**
* Finds the longest run of consecutive elements that appears in both views.
*
* How it works: intervals of the shorter view are processed from a first-in-first-out queue, starting with the
* whole view. For each interval its middle element is looked up everywhere in the longer view; every hit is grown
* backward and forward into the full matching run (the "snake") through that point, and the longest snake seen is
* remembered. The two halves on either side of the middle are queued only if they are long enough to still contain
* something longer than the best so far.
*
* Original author's notes: this is an "optimistic" method, in which long common substrings (relative to the
* shortest string) are found quickly, in O(M) comparisons. It is still O(NM) in the worst case, which is the
* lack of a common substring, or a common substring of 1 character.
*
* @param ths the first view
* @param tht the second view
* @returns a two-element array `[view on ths.a, view on tht.a]` locating the common run in each input.
*          If nothing is shared, both entries are the same newly-created empty view.
*/
static getLongestCommonSubstringOptimisticBisect(ths, tht) {
// The shorter string must come first. If not, compute the converse, then reverse the order of the result.
const is_swapped = ths.length > tht.length;
const short = is_swapped ? tht : ths;
const long = is_swapped ? ths : tht;
// console.log(`LCS OB: [${short.toString()}] -> [${long.toString()}]`);
// Load up the remaining variables
const short_a = short.a;
const long_a = long.a;
const short_start = short.start;
const long_start = long.start;
const short_end = short.end;
const long_end = long.end;
const f_floor = Math.floor;
// Initialize the state of the "best so far"
// best x, y starting locations, and best length
// (x indexes the longer array, y indexes the shorter array; both are absolute indexes)
let best_so_far_x = -1;
let best_so_far_y = -1;
let best_so_far_len = 0;
// Initialize bisection scanning queue. Must use a queue for scanning, to do breadth-first, attempting to
// find the longest strings as soon as possible. This is the "optimism" of finding something long, and cuts off
// all smaller scans in case we do. The format is [min,max] inclusive, the interval in the y direction, the smaller string.
const scan_queue = [[short_start, short_end]];
while (scan_queue.length > 0) {
// Dequeue the next interval to scan, and skip it if it's too small to result in a longer substring than we've already found.
const [y_min, y_max] = scan_queue.shift();
const interval_len = y_max - y_min + 1;
if (interval_len <= best_so_far_len) { // this interval cannot contain anything longer than best-so-far, so skip it and move on to the next one in the queue
continue;
}
// Bisect the interval, thus finding all substrings that are at least half the lengths of this interval.
const y = y_min + f_floor(interval_len / 2);
const target = short_a[y];
let x = long_start;
// console.log(` OB horizontal: scan '${target}' at y=${y}∈[${y_min},${y_max}], x∈[${long_start},${long_end}], seeking S > ${best_so_far_len}`);
while ((x = long.indexOf(target, x, long_end)) >= 0) { // scan, using indexOf() to leap to the next match
// console.log(` HIT: (${x},${y})`);
// Find the head and tail of the snake in both strings, by extending in both directions
let x_snake_start = x;
let y_snake_start = y;
let snake_length = 1;
while (y_snake_start > short_start && x_snake_start > long_start && short_a[y_snake_start - 1] === long_a[x_snake_start - 1]) { // check backward, and track that starting point
++snake_length;
--x_snake_start;
--y_snake_start;
}
// These are distances from the snake's start to the last element of each view, i.e. the largest offset that may still be read
const max_long_length = long_end - x_snake_start;
const max_short_length = short_end - y_snake_start;
const max_snake_length = max_long_length < max_short_length ? max_long_length : max_short_length; // the snake cannot be longer than this, since the head is fixed
while (snake_length <= max_snake_length && short_a[y_snake_start + snake_length] === long_a[x_snake_start + snake_length]) {
++snake_length;
}
// console.log(` HIT RESULT: (${x_snake_start},${y_snake_start}) for ${snake_length}`);
// If this is the biggest snake we've seen so far, record it.
if (snake_length > best_so_far_len) {
// console.log(` LONGEST: (${x_snake_start},${y_snake_start}) for ${snake_length}`);
best_so_far_x = x_snake_start;
best_so_far_y = y_snake_start;
best_so_far_len = snake_length;
}
// Next time through, start scanning at the following character
++x;
}
// Enqueue the recursive intervals: the parts of this interval strictly before and strictly after y
if (y - y_min > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance!
scan_queue.push([y_min, y - 1]);
}
if (y_max - y > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance!
scan_queue.push([y + 1, y_max]);
}
}
// If nothing found, return an empty result
if (best_so_far_len == 0) {
const empty = ArrayView.createEmpty();
return [empty, empty];
}
// Return the result in the form of ArrayViews.
// Reorder them based on whether we originally swapped them for the short/long requirement.
const short_result = new ArrayView(short_a, best_so_far_y, best_so_far_y + best_so_far_len - 1);
const long_result = new ArrayView(long_a, best_so_far_x, best_so_far_x + best_so_far_len - 1);
return is_swapped ? [long_result, short_result] : [short_result, long_result];
}
/**
* Finds the longest common substring between two plain strings, returning where it starts in each input
* and how long it is. Positions and lengths are those used by String#charAt / String#indexOf.
*
* This is the same bisecting scan as getLongestCommonSubstringOptimisticBisect(), operating directly on strings:
* intervals of the shorter string are taken from a first-in-first-out queue, the middle character of each is
* looked up throughout the longer string, every hit is grown in both directions into the full matching run
* (the "snake"), and the sub-intervals on either side are queued only if they could still beat the best so far.
*
* Original author's notes: this is an "optimistic" method, in which long common substrings (relative to the
* shortest string) are found quickly, in O(M) comparisons. It is still O(NM) in the worst case, which is the
* lack of a common substring, or a common substring of 1 character.
*
* @param a the first string
* @param b the second string
* @returns an object `{ a_offset, b_offset, len }`: the start of the common substring in `a`, its start in `b`,
*          and its length. If nothing is shared, `len` is 0 and both offsets are -1.
*/
static getLongestCommonSubstringOptimisticBisectString(a, b) {
// The shorter string must come first. If not, compute the converse, then reverse the order of the result.
const is_swapped = a.length > b.length;
const short_a = is_swapped ? b : a;
const long_a = is_swapped ? a : b;
const f_floor = Math.floor;
// console.log(`LCS OB: [${short_a}] -> [${long_a}]`);
// Load up the remaining variables
const short_end = short_a.length - 1;
const long_end = long_a.length - 1;
// Initialize the state of the "best so far"
// best x, y starting locations, and best length
// (x indexes the longer string, y indexes the shorter string)
let best_so_far_x = -1;
let best_so_far_y = -1;
let best_so_far_len = 0;
// Initialize bisection scanning queue. Must use a queue for scanning, to do breadth-first, attempting to
// find the longest strings as soon as possible. This is the "optimism" of finding something long, and cuts off
// all smaller scans in case we do. The format is [min,max] inclusive, the interval in the y direction, the smaller string.
const scan_queue = [[0, short_end]];
while (scan_queue.length > 0) {
// Dequeue the next interval to scan, and skip it if it's too small to result in a longer substring than we've already found.
const [y_min, y_max] = scan_queue.shift();
const interval_len = y_max - y_min + 1;
if (interval_len <= best_so_far_len) { // this interval cannot contain anything longer than best-so-far, so skip it and move on to the next one in the queue
continue;
}
// Bisect the interval, thus finding all substrings that are at least half the lengths of this interval.
const y = y_min + f_floor(interval_len / 2);
const target = short_a.charAt(y);
let x = 0;
// console.log(` OB horizontal: scan '${target}' at y=${y}∈[${y_min},${y_max}], x∈[${long_start},${long_end}], seeking S > ${best_so_far_len}`);
while ((x = long_a.indexOf(target, x)) >= 0) { // scan, using indexOf() to efficiently leap to the next match
// console.log(` HIT: (${x},${y})`);
// Find the head and tail of the snake in both strings, by extending in both directions
let x_snake_start = x;
let y_snake_start = y;
let snake_length = 1;
while (y_snake_start > 0 && x_snake_start > 0 && short_a.charAt(y_snake_start - 1) === long_a.charAt(x_snake_start - 1)) { // check backward, and track that starting point
++snake_length;
--x_snake_start;
--y_snake_start;
}
// These are distances from the snake's start to the last character of each string, i.e. the largest offset that may still be read
const max_long_length = long_end - x_snake_start;
const max_short_length = short_end - y_snake_start;
const max_snake_length = max_long_length < max_short_length ? max_long_length : max_short_length; // the snake cannot be longer than this, since the head is fixed
while (snake_length <= max_snake_length && short_a.charAt(y_snake_start + snake_length) === long_a.charAt(x_snake_start + snake_length)) {
++snake_length;
}
// console.log(` HIT RESULT: (${x_snake_start},${y_snake_start}) for ${snake_length}`);
// If this is the biggest snake we've seen so far, record it.
if (snake_length > best_so_far_len) {
// console.log(` LONGEST: (${x_snake_start},${y_snake_start}) for ${snake_length}`);
best_so_far_x = x_snake_start;
best_so_far_y = y_snake_start;
best_so_far_len = snake_length;
}
// Next time through, start scanning at the following character
++x;
}
// Enqueue the recursive intervals: the parts of this interval strictly before and strictly after y
if (y - y_min > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance!
scan_queue.push([y_min, y - 1]);
}
if (y_max - y > best_so_far_len) { // we'll check when we pop too, but might as well not even make an array and push and pop; nearly 2x's performance!
scan_queue.push([y + 1, y_max]);
}
}
// Return the result, making sure to swap back a/b for short/long
return {
a_offset: is_swapped ? best_so_far_x : best_so_far_y,
b_offset: is_swapped ? best_so_far_y : best_so_far_x,
len: best_so_far_len,
};
}
/**
* Returns the index coordinates of the middle of the longest common subsequence between the two arrays.
* The longest subsequence is the longest set of ordered (but not necessarily consecutive) equal elements
* from both arrays. The middle is often but not always along a common substring. Even when it is a common
* substring, it is often not the globally-longest common substring.
*
* The search advances one path forward from the start of both views and one path backward from their ends,
* one edit-step at a time, and stops as soon as the two paths overlap.
*
* The index is relative to the start of each ArrayView, not relative to the absolute position inside the
* underlying array.
*
* @param that the view to compare `this` against
* @returns a pair `[x, y]`, where `x` is an offset into `this` and `y` an offset into `that`, or `null` if the
*          two paths never overlap (which includes the case where both views are empty, since no step is taken)
*/
getLongestCommonSubsequenceMiddleMyers(that) {
// HT to fraser@google.com (Neil Fraser); adapted this algorithm from his diff-merge-patch Javascript code.
// console.log(`LCS Myers: [${this.toString()}] -> [${that.toString()}]`);
const a = this.a;
const b = that.a;
const a_start = this.start;
const b_start = that.start;
// Cache the text lengths to prevent multiple calls.
let text1_length = this.length;
let text2_length = that.length;
// Upper bound on the number of steps taken by each of the two paths
let max_d = Math.ceil((text1_length + text2_length) / 2);
// v1 (forward path) and v2 (reverse path) store, per diagonal k, the furthest x reached so far.
// Diagonals can be negative, so diagonal k is stored at index v_offset + k.
let v_offset = max_d;
let v_length = 2 * max_d;
let v1 = new Array(v_length);
let v2 = new Array(v_length);
// Setting all elements to -1 is faster in Chrome & Firefox than mixing
// integers and undefined.
for (let i = 0; i < v_length; i++) {
v1[i] = -1;
v2[i] = -1;
}
// Seed diagonal 1 so that the first step (d=0, k=0) starts from x=0
v1[v_offset + 1] = 0;
v2[v_offset + 1] = 0;
let delta = text1_length - text2_length;
// If the total number of characters is odd, then the front path will collide
// with the reverse path.
let front = (delta % 2 != 0);
// Offsets for start and end of k loop.
// Prevents mapping of space beyond the grid.
let k1start = 0;
let k1end = 0;
let k2start = 0;
let k2end = 0;
for (let d = 0; d < max_d; d++) {
// Walk the front path one step.
for (let k1 = -d + k1start; k1 <= d - k1end; k1 += 2) {
let k1_offset = v_offset + k1;
let x1;
// Pick whichever neighboring diagonal reached further: take x as-is from diagonal k+1, or x+1 from diagonal k-1
if (k1 == -d || (k1 != d && v1[k1_offset - 1] < v1[k1_offset + 1])) {
x1 = v1[k1_offset + 1];
}
else {
x1 = v1[k1_offset - 1] + 1;
}
let y1 = x1 - k1;
// let snake_length = 0;
// Follow the run of equal elements ("snake") as far as it goes
while (x1 < text1_length && y1 < text2_length && a[a_start + x1] === b[b_start + y1]) {
++x1;
++y1;
// ++snake_length;
}
v1[k1_offset] = x1;
if (x1 > text1_length) {
// Ran off the right of the graph.
k1end += 2;
}
else if (y1 > text2_length) {
// Ran off the bottom of the graph.
k1start += 2;
}
else if (front) {
// Look up the reverse path's diagonal that corresponds to this forward diagonal
let k2_offset = v_offset + delta - k1;
if (k2_offset >= 0 && k2_offset < v_length && v2[k2_offset] != -1) {
// Mirror x2 onto top-left coordinate system.
const x2 = text1_length - v2[k2_offset];
// const k2 = k2_offset - v_offset;
// const y2 = text2_length - (v2[k2_offset] - k2);
if (x1 >= x2) {
// Overlap detected. Return the head of the snake that we might have just traversed
// console.log(`>>> LCS from forwards, S=${snake_length}, (x1,y2)=(${x1},${y1}) vs (x2,y2)=(${x2},${y2})`);
return [x1, y1];
}
}
}
}
// Walk the reverse path one step.
for (let k2 = -d + k2start; k2 <= d - k2end; k2 += 2) {
let k2_offset = v_offset + k2;
let x2;
// Same neighbor selection as the forward path, using the reverse path's own table
if (k2 == -d || (k2 != d && v2[k2_offset - 1] < v2[k2_offset + 1])) {
x2 = v2[k2_offset + 1];
}
else {
x2 = v2[k2_offset - 1] + 1;
}
let y2 = x2 - k2;
// let snake_length = 0;
// Follow the snake; x2 and y2 count elements from the END of each view, hence the mirrored indexes
while (x2 < text1_length && y2 < text2_length && a[a_start + text1_length - x2 - 1] === b[b_start + text2_length - y2 - 1]) {
++x2;
++y2;
// ++snake_length;
}
v2[k2_offset] = x2;
if (x2 > text1_length) {
// Ran off the left of the graph.
k2end += 2;
}
else if (y2 > text2_length) {
// Ran off the top of the graph.
k2start += 2;
}
else if (!front) {
// Look up the forward path's diagonal that corresponds to this reverse diagonal
let k1_offset = v_offset + delta - k2;
if (k1_offset >= 0 && k1_offset < v_length && v1[k1_offset] != -1) {
const x1 = v1[k1_offset];
const y1 = v_offset + x1 - k1_offset; // y = x - k, where k = k1_offset - v_offset
// Mirror x2 onto top-left coordinate system.
x2 = text1_length - x2;
if (x1 >= x2) {
// Overlap detected. If there was a snake, we already traversed it, and these coordinates are the head
// console.log(`>>> LCS from backwards, S=${snake_length}, (x1,y2)=(${x1},${y1}) vs (x2,y2)=(${x2},${text2_length-y2})`);
return [x1, y1];
}
}
}
}
}
// No overlap; completely different strings, no common substring.
return null;
}
/**
* Breaks a string into individual characters, and returns an array view of those characters.
*/
static fromCharacters(str) {
return new ArrayView(str.split(''));
}
/**
* Breaks a string into tokens based on any regular expression, and returns an array view of those tokens.
*/
static fromTokens(str, re) {
const tokens = [];
if (str) { // odd case of empty string
const str_length = str.length;
let m;
let last_non_token_idx = 0;
while ((m = re.exec(str)) !== null) {
// console.log(`Found ${m[0]} at ${m.index}. Next starts at ${re.lastIndex}.`);
const m_index = m.index;
while (m_index > last_non_token_idx) { // add all non-token characters as their own token
tokens[tokens.length] = str.substring(last_non_token_idx, last_non_token_idx + 1);
++last_non_token_idx;
}
tokens[tokens.length] = str.substring(last_non_token_idx, re.lastIndex); // add this entire token
last_non_token_idx = re.lastIndex; // remember where we are
}
while (last_non_token_idx < str_length) { // drain any remaining non-tokens
tokens[tokens.length] = str.substring(last_non_token_idx, last_non_token_idx + 1);
++last_non_token_idx;
}
}
return new ArrayView(tokens);
}
/**
* Tokenizes a string assuming the use-case is plain-text prose.
*/
static fromPlainProse(str) {
return ArrayView.fromTokens(str, /\w+/g); // inline constant regex is faster than loading from a private static class variable
}
/**
* Tokenizes a string by lines. Includes the line-ending character as well.
*/
static fromLines(str) {
if (!str)
return ArrayView.createEmpty(); // special case
return new ArrayView(str.split(/([^\r\n]*(?:\r?\n|$))/g).filter(str => str.length > 0)); // FIXME: Slower, but Safari dies when we try the better one below
// return new ArrayView( str.split(/(?<=\r?\n)/g) ); // splitting with non-consuming pattern (is this slow? is there a better way?)
}
}
exports.ArrayView = ArrayView;
/**
 * Generates a histogram from a set of tokens, counting the number of times each appears, and the location
 * that the first instance of that token appears.
 */
class Histogram {
    /**
     * Builds the histogram in a single pass over the input.
     *
     * @param input the tokens to count; its `a`, `start` and `length` are read (as on an ArrayView)
     */
    constructor(input) {
        /**
         * Map of each unique symbol's ID to its information record: `{ id, count, first_offset }`, where
         * `first_offset` is relative to the start of the input.
         *
         * Created without a prototype, so that IDs such as "constructor", "toString" or "__proto__" are
         * ordinary keys instead of colliding with members inherited from Object.prototype.
         */
        this.histogram = Object.create(null);
        /**
         * One entry per input position, in input order; each entry is an object holding the `id` of the
         * symbol at that relative position.
         */
        this.ordered = [];
        const hist = this.histogram;
        const a = input.a;
        const a_start = input.start;
        const len = input.length;
        for (let k = 0; k < len; ++k) {
            const id = this.getIdForElement(a[a_start + k]);
            const r = hist[id];
            if (r === undefined) { // if the first time, create the record
                hist[id] = {
                    id: id,
                    count: 1,
                    first_offset: k,
                };
            }
            else { // not the first time, increment the count
                ++r.count;
            }
            this.ordered.push({
                id: id,
            });
        }
    }
    /**
     * Given an element from the original input, returns its histogram record, or `undefined` if it's not present in the original input
     */
    lookupElement(el) {
        return this.histogram[this.getIdForElement(el)];
    }
    /**
     * Converts any type of element into an ID that is used as a key in the histogram.
     * Strings and numbers are used as-is; anything else is converted with JSON.stringify().
     * Because keys end up as strings, a number and its string form (1 and "1") share one ID.
     */
    getIdForElement(el) {
        if (typeof el == "string" || typeof el == "number") {
            return el;
        }
        return JSON.stringify(el); // FIXME: need a different algorithm that orders keys and such; should be in another project
    }
}
exports.Histogram = Histogram;
// One step in transforming "prev" into "next": the elements in `prev` are replaced by the elements in `next`.
class Edit {
    /**
     * @param prev the elements before the edit (an ArrayView; empty for a pure insertion)
     * @param next the elements after the edit (an ArrayView; empty for a pure deletion)
     */
    constructor(prev, next) {
        this.prev = prev;
        this.next = next;
    }
    /**
     * Creates an edit that keeps `keep` unchanged.
     * Both sides are the very same object, not just duplicate data, which lets equality be detected cheaply later.
     */
    static createEquality(keep) {
        const shared = keep;
        return new Edit(shared, shared);
    }
    /**
     * Creates an edit that inserts `ins` where there was nothing before.
     */
    static createPureInsertion(ins) {
        const nothing = ArrayView.createEmpty();
        return new Edit(nothing, ins);
    }
    /**
     * Creates an edit that removes `del` and puts nothing in its place.
     */
    static createPureDeletion(del) {
        const nothing = ArrayView.createEmpty();
        return new Edit(del, nothing);
    }
    /**
     * True if this edit represents equal subsequences in the previous and next.
     *
     * @readonly
     */
    isEquality() {
        const { prev, next } = this;
        return prev.equals(next);
    }
    /**
     * True if this edit only adds elements: nothing before, something after.
     *
     * @readonly
     */
    isPureInsertion() {
        const { prev, next } = this;
        return prev.empty && !next.empty;
    }
    /**
     * True if this edit only removes elements: something before, nothing after.
     *
     * @readonly
     */
    isPureDeletion() {
        const { prev, next } = this;
        return !prev.empty && next.empty;
    }
    /**
     * True if this edit is a modification, i.e. both sides are non-empty and they differ.
     *
     * @readonly
     */
    isModification() {
        return !(this.next.empty || this.prev.empty || this.isEquality());
    }
    /**
     * Returns the opposite edit, with `prev` and `next` exchanged, so inserts become deletes and vice versa.
     * An equality is its own opposite, so it is returned as-is without creating a new Edit object.
     *
     * @readonly
     */
    getConverse() {
        if (this.isEquality()) {
            return this;
        }
        return new Edit(this.next, this.prev);
    }
    /**
     * Returns a human-readable, but not machine-usable, representation of this edit:
     * deletions appear in square brackets, insertions in curly braces, and a modification as both, in that order.
     *
     * @param matched {boolean} if true, "equality" operations are surrounded by parenthesis to be explicit, otherwise they are plain for easier readability
     */
    toString(matched = false) {
        if (this.isEquality()) {
            const kept = this.prev.toString();
            return matched ? `(${kept})` : kept;
        }
        if (this.isPureDeletion()) {
            const removed = this.prev.toString();
            return `[${removed}]`;
        }
        if (this.isPureInsertion()) {
            const added = this.next.toString();
            return `{${added}}`;
        }
        const removed = this.prev.toString();
        const added = this.next.toString();
        return `[${removed}]{${added}}`;
    }
}
exports.Edit = Edit;
// A list of edits, with various algorithms for that list.
class EditScript {
/**
 * Creates an edit script that contains no edits yet.
 */
constructor() {
// The edits of this script, in order; starts out empty.
this.edits = [];
// nothing more to do
}
/**
 * The number of edits currently in this script.
 */
get length() {
return this.edits.length;
}
get empty() {
return this.edits.length == 0;
}
append(e) {
this.edits.push(e);
return this;
}
prepend(e) {
this.edits.unshift(e);
return this;
}
/**
* Returns a human-readable, but not machine-usable, representation of the entire edit script.
*
* @param matched {boolean} if true, "equality" operations are surrounded by parenthesis to be explicit, otherwise they are plain for easier readability
*/
toString(matched = false) {
return this.edits.map((e) => e.toString(matched)).join('');
}
/**
* Given the output of `EditScript<T>.toString()`, parses and returns the result as a string-typed EditScript.
*/
static fromString(s) {
var _a, _b, _c, _d, _e, _f;
const result = new EditScript();
let m;
// Find all deletions
let deletions = [];
while ((m = EditScript.re_delete.exec(s)) !== null) {
deletions.push({
edit: new Edit(ArrayView.fromCharacters(m[1]), ArrayView.createEmpty()),
offset: m.index,
});
}
// Find all insertions
let insertions = [];
while ((m = EditScript.re_insert.exec(s)) !== null) {
insertions.push({
edit: new Edit(ArrayView.createEmpty(), ArrayView.fromCharacters(m[1])),
offset: m.index,
});
}
// Find all equalities
let equals = [];
while ((m = EditScript.re_equal.exec(s)) !== null) {
const av = ArrayView.fromCharacters(m[1]);
equals.push({
edit: new Edit(av, av),
offset: m.index,
});
}
// Splice it all together
let di = 0;
let ii = 0;
let ei = 0;
while (di < deletions.length || ii < insertions.length || ei < equals.length) {
const d_offset = (_b = (_a = deletions[di]) === null || _a === void 0 ? void 0 : _a.offset) !== null && _b !== void 0 ? _b : Number.MAX_SAFE_INTEGER;
const i_offset = (_d = (_c = insertions[ii]) === null || _c === void 0 ? void 0 : _c.offset) !== null && _d !== void 0 ? _d : Number.MAX_SAFE_INTEGER;
const e_offset = (_f = (_e = equals[ei]) === null || _e === void 0 ? void 0 : _e.offset) !== null && _f !== void 0 ? _f : Number.MAX_SAFE_INTEGER;
if (e_offset < d_offset && e_offset < i_offset) {
result.append(equals[ei++].edit);
}
else if (d_offset < e_offset && d_offset < i_offset) {
result.append(deletions[di++].edit);
}
else if (i_offset < e_offset && i_offset < e_offset) {
result.append(insertions[ii++].edit);
}
else {
throw new Error("Shouldn't get here.");
}
}
// Join insert/delete pairs into a single modification
result.visitEditPairs((left, right) => {
if (left.isPureDeletion() && right.isPureInsertion()) {
return [new Edit(left.prev, right.next)];
}
return null;
});
return result;
}
/**
* Recreate the "previous" sequence using only edits, concatenating back to an array.
*/
getPrev() {
const result = [];
for (let k = 0; k < this.edits.length; ++k) {
this.edits[k].prev.pushAll(result);
}
return result;
}
/**
* Recreate the "next" sequence using only edits, concatenating back to an array.
*/
getNext() {
const result = [];
for (let k = 0; k < this.edits.length; ++k) {
this.edits[k].next.pushAll(result);
}
return result;
}
/**
* Creates and returns a new EditScript, that is the same as this one, but goes the opposite direction.
* Inserts become deletes.
*/
getConverse() {
const converse = new EditScript();
this.edits.forEach((ed) => { converse.edits[converse.length] = ed.getConverse(); });
return converse;
}
/**
* Creates a string, assuming the script represents strings of text, emitting lines with a gutter
* of '+' for insertion, '-' for deletion, or ' ' for equality.
*/
getScriptAsFormattedLines() {
const result = [];
const appendLines = (av, prefix) => {
result[result.length] = av.mapToString((el) => prefix + el).join('');
};
this.visitEditsForward((ed) => {
if (ed.isEquality()) {
appendLines(ed.prev, ' ');
}
else {
appendLines(ed.prev, '-');
appendLines(ed.next, '+');
}
});
return result.join('');
}
/**
* Visits all Edits, in forward order (which disallows changing the edit list while iterating).
*/
visitEditsForward(f_visit) {
const edits = this.edits; // speedup
const len = edits.length;
for (let ei = 0; ei < len; ++ei) {
f_visit(edits[ei]);
}
}
/**
* Visits all Edits, in reverse order (which allows for changes to the underlying edit script).
*
* The callback function can return null to indicate that no change should be made in the edit script,
* or it can return an array which replaces the two edits completely. If some of the edits should be
* preserved, just include them in the array. It is legal for the array to be any length, including empty.
*/
visitEdits(f_visit) {
const edits = this.edits; // speedup
for (let ei = edits.length; --ei >= 0;) {
const replacement = f_visit(edits[ei]);
if (replacement !== null) {
this.edits.splice(ei, 1, ...replacement);
}
}
}
/**
* Visits all pairs of Edits, in reverse order (which allows for changes to the underlying edit script).
* Won't visit anything if there's just one Edit.
*
* The callback function can return null to indicate that no change should be made in the edit script,
* or it can return an array which replaces the two edits completely. If some of the edits should be
* preserved, just include them in the array. It is legal for the array to be any length, including empty.
*/
visitEditPairs(f_visit) {
const edits = this.edits; // speedup
for (let ei = edits.length - 1; --ei >= 0;) {
const replacement = f_visit(edits[ei], edits[ei + 1]);
if (replacement !== null) {
this.edits.splice(ei, 2, ...replacement);
}
}
}
/**
* Visits all trios of Edits, in reverse order (which allows for changes to the underlying edit script).
* Won't visit anything if there are fewer than three Edits.
*
* The callback function can return null to indicate that no change should be made in the edit script,
* or it can return an array which replaces the three edits completely. If some of the edits should be
* preserved, just include them in the array. It is legal for the array to be any length, including empty.
*/
visitEditTrios(f_visit) {
const edits = this.edits; // speedup
for (let ei = edits.length - 1; --ei > 0;) {
const replacement = f_visit(edits[ei - 1], edits[ei], edits[ei + 1]);
if (replacement !== null) {
this.edits.splice(ei - 1, 3, ...replacement);
}
}
}
/**
* Scans for an Equality edit (the "middle"), surrounded on both sides by either two insertions, two deletions,
* or one modification and any change on the other side. These are the conditions in which it is
* legal to join the left, middle, and right edits into a single edit. While the resulting script is
* identical, it reduces the total number of edits in a way that might be preferable, usually for semantic
* reasons. For example, a single space separating changes to words on either side, probably should be
* folded into a sing