@pawanosman/textdiff
Unicode-aware multilingual text diff library for Node.js that diffs by whole words and merges adjacent changes.
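A minimal usage sketch (the input strings are illustrative; the API is the getTextDiffs export defined in the source below):

const { getTextDiffs } = require("@pawanosman/textdiff");

const diffs = getTextDiffs("The quick brown fox", "The quikc brown cat");
// Each diff has the shape:
//   { oldText, position: { startIndex, endIndex }, newText, changeType }
// with changeType one of "insert" | "delete" | "replace" | "spell-correction".
// Here the library should report "quick" -> "quikc" as a spell-correction
// and "fox" -> "cat" as a replace; position indices refer to the
// NFC-normalized old text.
for (const diff of diffs) {
  console.log(diff.changeType, JSON.stringify(diff.oldText), "->", JSON.stringify(diff.newText), diff.position);
}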
"use strict";
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __export = (target, all) => {
for (var name in all)
__defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
if (from && typeof from === "object" || typeof from === "function") {
for (let key of __getOwnPropNames(from))
if (!__hasOwnProp.call(to, key) && key !== except)
__defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
}
return to;
};
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
// src/index.ts
var index_exports = {};
__export(index_exports, {
default: () => index_default,
getTextDiffs: () => getTextDiffs
});
module.exports = __toCommonJS(index_exports);
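// Fallback tokenizer pattern: alternates between runs of "word" characters
// (Unicode letters, digits, combining marks) and runs of everything else,
// so successive matches tile the entire input.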
var WORD_REGEX = /([\p{L}\p{N}\p{M}]+)|([^\p{L}\p{N}\p{M}]+)/gu;
function normalize(input) {
return input.normalize("NFC");
}
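// Split text into word/separator tokens, each carrying its absolute
// [start, end) offsets in the normalized string. Prefers Intl.Segmenter
// with word granularity when available (better boundaries for languages
// written without spaces); otherwise falls back to WORD_REGEX.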
function tokenize(text) {
const normalized = normalize(text);
const tokens = [];
const useSegmenter = typeof Intl.Segmenter !== "undefined";
if (useSegmenter) {
try {
const segmenter = new Intl.Segmenter(void 0, { granularity: "word" });
const segments = Array.from(segmenter.segment(normalized));
let cursor = 0;
for (const seg of segments) {
const { segment, index } = seg;
if (index > cursor) {
tokens.push({ kind: "sep", value: normalized.slice(cursor, index), start: cursor, end: index });
}
const isWord = seg.isWordLike ?? /[\p{L}\p{N}\p{M}]/u.test(segment);
tokens.push({ kind: isWord ? "word" : "sep", value: segment, start: index, end: index + segment.length });
cursor = index + segment.length;
}
if (cursor < normalized.length) {
tokens.push({ kind: "sep", value: normalized.slice(cursor), start: cursor, end: normalized.length });
}
return tokens;
    } catch {
      // Intl.Segmenter failed mid-scan: discard any partially collected
      // tokens so the regex fallback below starts from a clean slate.
      tokens.length = 0;
    }
}
  // Regex fallback. WORD_REGEX is module-level shared state with the "g"
  // flag, so reset lastIndex defensively before scanning.
  WORD_REGEX.lastIndex = 0;
  let match;
  while ((match = WORD_REGEX.exec(normalized)) !== null) {
const value = match[0];
const start = match.index;
const end = start + value.length;
const isWord = /[\p{L}\p{N}\p{M}]/u.test(value);
tokens.push({ kind: isWord ? "word" : "sep", value, start, end });
}
return tokens;
}
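// Longest common subsequence over token values: an O(n*m)
// dynamic-programming table filled back-to-front, then traced forward to
// yield the matched (oldIndex, newIndex) anchor pairs in order.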
function lcsIndicesTokens(a, b) {
const n = a.length;
const m = b.length;
const dp = Array.from({ length: n + 1 }, () => new Array(m + 1).fill(0));
for (let i2 = n - 1; i2 >= 0; i2--) {
for (let j2 = m - 1; j2 >= 0; j2--) {
if (a[i2].value === b[j2].value) dp[i2][j2] = dp[i2 + 1][j2 + 1] + 1;
else dp[i2][j2] = Math.max(dp[i2 + 1][j2], dp[i2][j2 + 1]);
}
}
const pairs = [];
let i = 0;
let j = 0;
while (i < n && j < m) {
if (a[i].value === b[j].value) {
pairs.push([i, j]);
i++;
j++;
} else if (dp[i + 1][j] >= dp[i][j + 1]) {
i++;
} else {
j++;
}
}
return pairs;
}
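// Classic Levenshtein edit distance with two rolling rows (O(m) extra
// memory). Compares UTF-16 code units via charCodeAt, which is adequate
// for the near-match test below even though astral code points count as
// two units.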
function levenshtein(a, b) {
if (a === b) return 0;
const n = a.length;
const m = b.length;
if (n === 0) return m;
if (m === 0) return n;
const prev = new Array(m + 1);
const curr = new Array(m + 1);
for (let j = 0; j <= m; j++) prev[j] = j;
for (let i = 1; i <= n; i++) {
curr[0] = i;
const ai = a.charCodeAt(i - 1);
for (let j = 1; j <= m; j++) {
const cost = ai === b.charCodeAt(j - 1) ? 0 : 1;
curr[j] = Math.min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost);
}
for (let j = 0; j <= m; j++) prev[j] = curr[j];
}
return prev[m];
}
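// Label a change region: empty-to-text is an insert, text-to-empty a
// delete. A single word replaced by a single word within a small edit
// distance (at most 2 edits, or 30% of the longer word's length) is
// flagged as a spell-correction; everything else is a replace.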
function classifyChange(oldTokSlice, newTokSlice, oldSlice, newSlice) {
if (oldSlice.length === 0 && newSlice.length > 0) return "insert";
if (newSlice.length === 0 && oldSlice.length > 0) return "delete";
const oldWords = oldTokSlice.filter((t) => t.kind === "word");
const newWords = newTokSlice.filter((t) => t.kind === "word");
if (oldWords.length === 1 && newWords.length === 1) {
const d = levenshtein(oldWords[0].value, newWords[0].value);
const maxLen = Math.max(oldWords[0].value.length, newWords[0].value.length);
if (d > 0 && d <= Math.max(2, Math.ceil(0.3 * maxLen))) return "spell-correction";
}
return "replace";
}
function sliceFromOriginal(original, start, end) {
return original.slice(start, end);
}
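// Main entry point. Pipeline: NFC-normalize both inputs, tokenize them,
// anchor matching tokens with an LCS, collect the unmatched gaps as change
// regions, merge regions separated only by separators, then map each
// region back to character offsets (in the normalized old text) and
// classify it.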
function getTextDiffs(oldTextInput, newTextInput) {
const oldText = normalize(oldTextInput);
const newText = normalize(newTextInput);
if (oldText === newText) return [];
const oldTokens = tokenize(oldText);
const newTokens = tokenize(newText);
const lcs = lcsIndicesTokens(oldTokens, newTokens);
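  // Walk the LCS anchors; the token span between consecutive anchors (on
  // either side) is a change region. The final iteration, where pair is
  // undefined, flushes any trailing change.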
const changes = [];
let prevOld = 0;
let prevNew = 0;
for (let k = 0; k <= lcs.length; k++) {
const pair = lcs[k];
const nextOld = pair ? pair[0] : oldTokens.length;
const nextNew = pair ? pair[1] : newTokens.length;
if (nextOld > prevOld || nextNew > prevNew) {
changes.push({ oldTokStart: prevOld, oldTokEnd: nextOld, newTokStart: prevNew, newTokEnd: nextNew });
}
prevOld = nextOld + 1;
prevNew = nextNew + 1;
}
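  // Merge consecutive change regions whose intervening matched tokens are
  // all separators, so edits to neighboring words in one phrase surface as
  // a single diff instead of several fragments.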
const mergedChanges = [];
for (const ch of changes) {
const last = mergedChanges[mergedChanges.length - 1];
if (last) {
const oldBetweenStart = last.oldTokEnd;
const oldBetweenEnd = ch.oldTokStart;
const newBetweenStart = last.newTokEnd;
const newBetweenEnd = ch.newTokStart;
const onlySepsBetweenOld = oldBetweenStart <= oldBetweenEnd ? oldTokens.slice(oldBetweenStart, oldBetweenEnd).every((t) => t.kind === "sep") : true;
const onlySepsBetweenNew = newBetweenStart <= newBetweenEnd ? newTokens.slice(newBetweenStart, newBetweenEnd).every((t) => t.kind === "sep") : true;
if (onlySepsBetweenOld && onlySepsBetweenNew) {
last.oldTokEnd = ch.oldTokEnd;
last.newTokEnd = ch.newTokEnd;
continue;
}
}
mergedChanges.push({ ...ch });
}
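  // Map merged token spans back to character offsets. A pure insertion
  // (no old tokens) anchors at the end of the token to its left, or at
  // offset 0; newStart/newEnd stay undefined when there are no new tokens
  // and are never read in that case.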
const diffs = [];
for (const ch of mergedChanges) {
const hasOld = ch.oldTokEnd > ch.oldTokStart;
const hasNew = ch.newTokEnd > ch.newTokStart;
let oldStart;
let oldEnd;
if (hasOld) {
oldStart = oldTokens[ch.oldTokStart].start;
oldEnd = oldTokens[ch.oldTokEnd - 1].end;
} else {
const leftIdx = ch.oldTokStart - 1;
oldStart = leftIdx >= 0 ? oldTokens[leftIdx].end : 0;
oldEnd = oldStart;
}
let newStart;
let newEnd;
if (hasNew) {
newStart = newTokens[ch.newTokStart].start;
newEnd = newTokens[ch.newTokEnd - 1].end;
}
const oldSlice = sliceFromOriginal(oldText, oldStart, oldEnd);
const newSlice = hasNew ? sliceFromOriginal(newText, newStart, newEnd) : "";
if (oldSlice === newSlice) continue;
const changeType = classifyChange(oldTokens.slice(ch.oldTokStart, ch.oldTokEnd), newTokens.slice(ch.newTokStart, ch.newTokEnd), oldSlice, newSlice);
diffs.push({
oldText: oldSlice,
position: { startIndex: oldStart, endIndex: oldEnd },
newText: newSlice,
changeType
});
}
return diffs;
}
var index_default = {
getTextDiffs
};
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
getTextDiffs
});