UNPKG

diff

Version:

A JavaScript text diff implementation.

277 lines (276 loc) 13.3 kB
import Diff from './base.js'; import { longestCommonPrefix, longestCommonSuffix, replacePrefix, replaceSuffix, removePrefix, removeSuffix, maximumOverlap, leadingWs, trailingWs } from '../util/string.js'; // Based on https://en.wikipedia.org/wiki/Latin_script_in_Unicode // // Ranges and exceptions: // Latin-1 Supplement, 0080–00FF // - U+00D7 × Multiplication sign // - U+00F7 ÷ Division sign // Latin Extended-A, 0100–017F // Latin Extended-B, 0180–024F // IPA Extensions, 0250–02AF // Spacing Modifier Letters, 02B0–02FF // - U+02C7 ˇ ˇ Caron // - U+02D8 ˘ ˘ Breve // - U+02D9 ˙ ˙ Dot Above // - U+02DA ˚ ˚ Ring Above // - U+02DB ˛ ˛ Ogonek // - U+02DC ˜ ˜ Small Tilde // - U+02DD ˝ ˝ Double Acute Accent // Latin Extended Additional, 1E00–1EFF const extendedWordChars = 'a-zA-Z0-9_\\u{C0}-\\u{FF}\\u{D8}-\\u{F6}\\u{F8}-\\u{2C6}\\u{2C8}-\\u{2D7}\\u{2DE}-\\u{2FF}\\u{1E00}-\\u{1EFF}'; // Each token is one of the following: // - A punctuation mark plus the surrounding whitespace // - A word plus the surrounding whitespace // - Pure whitespace (but only in the special case where this the entire text // is just whitespace) // // We have to include surrounding whitespace in the tokens because the two // alternative approaches produce horribly broken results: // * If we just discard the whitespace, we can't fully reproduce the original // text from the sequence of tokens and any attempt to render the diff will // get the whitespace wrong. // * If we have separate tokens for whitespace, then in a typical text every // second token will be a single space character. But this often results in // the optimal diff between two texts being a perverse one that preserves // the spaces between words but deletes and reinserts actual common words. // See https://github.com/kpdecker/jsdiff/issues/160#issuecomment-1866099640 // for an example. // // Keeping the surrounding whitespace of course has implications for .equals // and .join, not just .tokenize. // This regex does NOT fully implement the tokenization rules described above. // Instead, it gives runs of whitespace their own "token". The tokenize method // then handles stitching whitespace tokens onto adjacent word or punctuation // tokens. const tokenizeIncludingWhitespace = new RegExp(`[${extendedWordChars}]+|\\s+|[^${extendedWordChars}]`, 'ug'); class WordDiff extends Diff { equals(left, right, options) { if (options.ignoreCase) { left = left.toLowerCase(); right = right.toLowerCase(); } return left.trim() === right.trim(); } tokenize(value, options = {}) { let parts; if (options.intlSegmenter) { const segmenter = options.intlSegmenter; if (segmenter.resolvedOptions().granularity != 'word') { throw new Error('The segmenter passed must have a granularity of "word"'); } parts = Array.from(segmenter.segment(value), segment => segment.segment); } else { parts = value.match(tokenizeIncludingWhitespace) || []; } const tokens = []; let prevPart = null; parts.forEach(part => { if ((/\s/).test(part)) { if (prevPart == null) { tokens.push(part); } else { tokens.push(tokens.pop() + part); } } else if (prevPart != null && (/\s/).test(prevPart)) { if (tokens[tokens.length - 1] == prevPart) { tokens.push(tokens.pop() + part); } else { tokens.push(prevPart + part); } } else { tokens.push(part); } prevPart = part; }); return tokens; } join(tokens) { // Tokens being joined here will always have appeared consecutively in the // same text, so we can simply strip off the leading whitespace from all the // tokens except the first (and except any whitespace-only tokens - but such // a token will always be the first and only token anyway) and then join them // and the whitespace around words and punctuation will end up correct. return tokens.map((token, i) => { if (i == 0) { return token; } else { return token.replace((/^\s+/), ''); } }).join(''); } postProcess(changes, options) { if (!changes || options.oneChangePerToken) { return changes; } let lastKeep = null; // Change objects representing any insertion or deletion since the last // "keep" change object. There can be at most one of each. let insertion = null; let deletion = null; changes.forEach(change => { if (change.added) { insertion = change; } else if (change.removed) { deletion = change; } else { if (insertion || deletion) { // May be false at start of text dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, change); } lastKeep = change; insertion = null; deletion = null; } }); if (insertion || deletion) { dedupeWhitespaceInChangeObjects(lastKeep, deletion, insertion, null); } return changes; } } export const wordDiff = new WordDiff(); export function diffWords(oldStr, newStr, options) { // This option has never been documented and never will be (it's clearer to // just call `diffWordsWithSpace` directly if you need that behavior), but // has existed in jsdiff for a long time, so we retain support for it here // for the sake of backwards compatibility. if ((options === null || options === void 0 ? void 0 : options.ignoreWhitespace) != null && !options.ignoreWhitespace) { return diffWordsWithSpace(oldStr, newStr, options); } return wordDiff.diff(oldStr, newStr, options); } function dedupeWhitespaceInChangeObjects(startKeep, deletion, insertion, endKeep) { // Before returning, we tidy up the leading and trailing whitespace of the // change objects to eliminate cases where trailing whitespace in one object // is repeated as leading whitespace in the next. // Below are examples of the outcomes we want here to explain the code. // I=insert, K=keep, D=delete // 1. diffing 'foo bar baz' vs 'foo baz' // Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz' // After cleanup, we want: K:'foo ' D:'bar ' K:'baz' // // 2. Diffing 'foo bar baz' vs 'foo qux baz' // Prior to cleanup, we have K:'foo ' D:' bar ' I:' qux ' K:' baz' // After cleanup, we want K:'foo ' D:'bar' I:'qux' K:' baz' // // 3. Diffing 'foo\nbar baz' vs 'foo baz' // Prior to cleanup, we have K:'foo ' D:'\nbar ' K:' baz' // After cleanup, we want K'foo' D:'\nbar' K:' baz' // // 4. Diffing 'foo baz' vs 'foo\nbar baz' // Prior to cleanup, we have K:'foo\n' I:'\nbar ' K:' baz' // After cleanup, we ideally want K'foo' I:'\nbar' K:' baz' // but don't actually manage this currently (the pre-cleanup change // objects don't contain enough information to make it possible). // // 5. Diffing 'foo bar baz' vs 'foo baz' // Prior to cleanup, we have K:'foo ' D:' bar ' K:' baz' // After cleanup, we want K:'foo ' D:' bar ' K:'baz' // // Our handling is unavoidably imperfect in the case where there's a single // indel between keeps and the whitespace has changed. For instance, consider // diffing 'foo\tbar\nbaz' vs 'foo baz'. Unless we create an extra change // object to represent the insertion of the space character (which isn't even // a token), we have no way to avoid losing information about the texts' // original whitespace in the result we return. Still, we do our best to // output something that will look sensible if we e.g. print it with // insertions in green and deletions in red. // Between two "keep" change objects (or before the first or after the last // change object), we can have either: // * A "delete" followed by an "insert" // * Just an "insert" // * Just a "delete" // We handle the three cases separately. if (deletion && insertion) { const oldWsPrefix = leadingWs(deletion.value); const oldWsSuffix = trailingWs(deletion.value); const newWsPrefix = leadingWs(insertion.value); const newWsSuffix = trailingWs(insertion.value); if (startKeep) { const commonWsPrefix = longestCommonPrefix(oldWsPrefix, newWsPrefix); startKeep.value = replaceSuffix(startKeep.value, newWsPrefix, commonWsPrefix); deletion.value = removePrefix(deletion.value, commonWsPrefix); insertion.value = removePrefix(insertion.value, commonWsPrefix); } if (endKeep) { const commonWsSuffix = longestCommonSuffix(oldWsSuffix, newWsSuffix); endKeep.value = replacePrefix(endKeep.value, newWsSuffix, commonWsSuffix); deletion.value = removeSuffix(deletion.value, commonWsSuffix); insertion.value = removeSuffix(insertion.value, commonWsSuffix); } } else if (insertion) { // The whitespaces all reflect what was in the new text rather than // the old, so we essentially have no information about whitespace // insertion or deletion. We just want to dedupe the whitespace. // We do that by having each change object keep its trailing // whitespace and deleting duplicate leading whitespace where // present. if (startKeep) { const ws = leadingWs(insertion.value); insertion.value = insertion.value.substring(ws.length); } if (endKeep) { const ws = leadingWs(endKeep.value); endKeep.value = endKeep.value.substring(ws.length); } // otherwise we've got a deletion and no insertion } else if (startKeep && endKeep) { const newWsFull = leadingWs(endKeep.value), delWsStart = leadingWs(deletion.value), delWsEnd = trailingWs(deletion.value); // Any whitespace that comes straight after startKeep in both the old and // new texts, assign to startKeep and remove from the deletion. const newWsStart = longestCommonPrefix(newWsFull, delWsStart); deletion.value = removePrefix(deletion.value, newWsStart); // Any whitespace that comes straight before endKeep in both the old and // new texts, and hasn't already been assigned to startKeep, assign to // endKeep and remove from the deletion. const newWsEnd = longestCommonSuffix(removePrefix(newWsFull, newWsStart), delWsEnd); deletion.value = removeSuffix(deletion.value, newWsEnd); endKeep.value = replacePrefix(endKeep.value, newWsFull, newWsEnd); // If there's any whitespace from the new text that HASN'T already been // assigned, assign it to the start: startKeep.value = replaceSuffix(startKeep.value, newWsFull, newWsFull.slice(0, newWsFull.length - newWsEnd.length)); } else if (endKeep) { // We are at the start of the text. Preserve all the whitespace on // endKeep, and just remove whitespace from the end of deletion to the // extent that it overlaps with the start of endKeep. const endKeepWsPrefix = leadingWs(endKeep.value); const deletionWsSuffix = trailingWs(deletion.value); const overlap = maximumOverlap(deletionWsSuffix, endKeepWsPrefix); deletion.value = removeSuffix(deletion.value, overlap); } else if (startKeep) { // We are at the END of the text. Preserve all the whitespace on // startKeep, and just remove whitespace from the start of deletion to // the extent that it overlaps with the end of startKeep. const startKeepWsSuffix = trailingWs(startKeep.value); const deletionWsPrefix = leadingWs(deletion.value); const overlap = maximumOverlap(startKeepWsSuffix, deletionWsPrefix); deletion.value = removePrefix(deletion.value, overlap); } } class WordsWithSpaceDiff extends Diff { tokenize(value) { // Slightly different to the tokenizeIncludingWhitespace regex used above in // that this one treats each individual newline as a distinct tokens, rather // than merging them into other surrounding whitespace. This was requested // in https://github.com/kpdecker/jsdiff/issues/180 & // https://github.com/kpdecker/jsdiff/issues/211 const regex = new RegExp(`(\\r?\\n)|[${extendedWordChars}]+|[^\\S\\n\\r]+|[^${extendedWordChars}]`, 'ug'); return value.match(regex) || []; } } export const wordsWithSpaceDiff = new WordsWithSpaceDiff(); export function diffWordsWithSpace(oldStr, newStr, options) { return wordsWithSpaceDiff.diff(oldStr, newStr, options); }