@pawanosman/textdiff

Unicode-aware multilingual text diff library for Node.js that diffs by whole words and merges adjacent changes.

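The bundle below exposes a single entry point, getTextDiffs(oldText, newText), as both a named and a default CommonJS export. A minimal usage sketch (the require specifier assumes the package's main entry resolves to this file, and the expected output is hand-traced from the source below, so treat it as illustrative):

const { getTextDiffs } = require("@pawanosman/textdiff");

// A single edited word whose Levenshtein distance (2, from the r/o
// transposition) falls within the threshold max(2, ceil(0.3 * 5)) = 2,
// so it is classified as a spell correction rather than a replace.
const diffs = getTextDiffs("Hello wrold", "Hello world");
console.log(diffs);
// [
//   {
//     oldText: "wrold",
//     position: { startIndex: 6, endIndex: 11 },
//     newText: "world",
//     changeType: "spell-correction"
//   }
// ]
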
"use strict"; var __defProp = Object.defineProperty; var __getOwnPropDesc = Object.getOwnPropertyDescriptor; var __getOwnPropNames = Object.getOwnPropertyNames; var __hasOwnProp = Object.prototype.hasOwnProperty; var __export = (target, all) => { for (var name in all) __defProp(target, name, { get: all[name], enumerable: true }); }; var __copyProps = (to, from, except, desc) => { if (from && typeof from === "object" || typeof from === "function") { for (let key of __getOwnPropNames(from)) if (!__hasOwnProp.call(to, key) && key !== except) __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable }); } return to; }; var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod); // src/index.ts var index_exports = {}; __export(index_exports, { default: () => index_default, getTextDiffs: () => getTextDiffs }); module.exports = __toCommonJS(index_exports); var WORD_REGEX = /([\p{L}\p{N}\p{M}]+)|([^\p{L}\p{N}\p{M}]+)/gu; function normalize(input) { return input.normalize("NFC"); } function tokenize(text) { const normalized = normalize(text); const tokens = []; const useSegmenter = typeof Intl.Segmenter !== "undefined"; if (useSegmenter) { try { const segmenter = new Intl.Segmenter(void 0, { granularity: "word" }); const segments = Array.from(segmenter.segment(normalized)); let cursor = 0; for (const seg of segments) { const { segment, index } = seg; if (index > cursor) { tokens.push({ kind: "sep", value: normalized.slice(cursor, index), start: cursor, end: index }); } const isWord = seg.isWordLike ?? /[\p{L}\p{N}\p{M}]/u.test(segment); tokens.push({ kind: isWord ? "word" : "sep", value: segment, start: index, end: index + segment.length }); cursor = index + segment.length; } if (cursor < normalized.length) { tokens.push({ kind: "sep", value: normalized.slice(cursor), start: cursor, end: normalized.length }); } return tokens; } catch { } } let match; while ((match = WORD_REGEX.exec(normalized)) !== null) { const value = match[0]; const start = match.index; const end = start + value.length; const isWord = /[\p{L}\p{N}\p{M}]/u.test(value); tokens.push({ kind: isWord ? "word" : "sep", value, start, end }); } return tokens; } function lcsIndicesTokens(a, b) { const n = a.length; const m = b.length; const dp = Array.from({ length: n + 1 }, () => new Array(m + 1).fill(0)); for (let i2 = n - 1; i2 >= 0; i2--) { for (let j2 = m - 1; j2 >= 0; j2--) { if (a[i2].value === b[j2].value) dp[i2][j2] = dp[i2 + 1][j2 + 1] + 1; else dp[i2][j2] = Math.max(dp[i2 + 1][j2], dp[i2][j2 + 1]); } } const pairs = []; let i = 0; let j = 0; while (i < n && j < m) { if (a[i].value === b[j].value) { pairs.push([i, j]); i++; j++; } else if (dp[i + 1][j] >= dp[i][j + 1]) { i++; } else { j++; } } return pairs; } function levenshtein(a, b) { if (a === b) return 0; const n = a.length; const m = b.length; if (n === 0) return m; if (m === 0) return n; const prev = new Array(m + 1); const curr = new Array(m + 1); for (let j = 0; j <= m; j++) prev[j] = j; for (let i = 1; i <= n; i++) { curr[0] = i; const ai = a.charCodeAt(i - 1); for (let j = 1; j <= m; j++) { const cost = ai === b.charCodeAt(j - 1) ? 
0 : 1; curr[j] = Math.min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + cost); } for (let j = 0; j <= m; j++) prev[j] = curr[j]; } return prev[m]; } function classifyChange(oldTokSlice, newTokSlice, oldSlice, newSlice) { if (oldSlice.length === 0 && newSlice.length > 0) return "insert"; if (newSlice.length === 0 && oldSlice.length > 0) return "delete"; const oldWords = oldTokSlice.filter((t) => t.kind === "word"); const newWords = newTokSlice.filter((t) => t.kind === "word"); if (oldWords.length === 1 && newWords.length === 1) { const d = levenshtein(oldWords[0].value, newWords[0].value); const maxLen = Math.max(oldWords[0].value.length, newWords[0].value.length); if (d > 0 && d <= Math.max(2, Math.ceil(0.3 * maxLen))) return "spell-correction"; } return "replace"; } function sliceFromOriginal(original, start, end) { return original.slice(start, end); } function getTextDiffs(oldTextInput, newTextInput) { const oldText = normalize(oldTextInput); const newText = normalize(newTextInput); if (oldText === newText) return []; const oldTokens = tokenize(oldText); const newTokens = tokenize(newText); const lcs = lcsIndicesTokens(oldTokens, newTokens); const changes = []; let prevOld = 0; let prevNew = 0; for (let k = 0; k <= lcs.length; k++) { const pair = lcs[k]; const nextOld = pair ? pair[0] : oldTokens.length; const nextNew = pair ? pair[1] : newTokens.length; if (nextOld > prevOld || nextNew > prevNew) { changes.push({ oldTokStart: prevOld, oldTokEnd: nextOld, newTokStart: prevNew, newTokEnd: nextNew }); } prevOld = nextOld + 1; prevNew = nextNew + 1; } const mergedChanges = []; for (const ch of changes) { const last = mergedChanges[mergedChanges.length - 1]; if (last) { const oldBetweenStart = last.oldTokEnd; const oldBetweenEnd = ch.oldTokStart; const newBetweenStart = last.newTokEnd; const newBetweenEnd = ch.newTokStart; const onlySepsBetweenOld = oldBetweenStart <= oldBetweenEnd ? oldTokens.slice(oldBetweenStart, oldBetweenEnd).every((t) => t.kind === "sep") : true; const onlySepsBetweenNew = newBetweenStart <= newBetweenEnd ? newTokens.slice(newBetweenStart, newBetweenEnd).every((t) => t.kind === "sep") : true; if (onlySepsBetweenOld && onlySepsBetweenNew) { last.oldTokEnd = ch.oldTokEnd; last.newTokEnd = ch.newTokEnd; continue; } } mergedChanges.push({ ...ch }); } const diffs = []; for (const ch of mergedChanges) { const hasOld = ch.oldTokEnd > ch.oldTokStart; const hasNew = ch.newTokEnd > ch.newTokStart; let oldStart; let oldEnd; if (hasOld) { oldStart = oldTokens[ch.oldTokStart].start; oldEnd = oldTokens[ch.oldTokEnd - 1].end; } else { const leftIdx = ch.oldTokStart - 1; oldStart = leftIdx >= 0 ? oldTokens[leftIdx].end : 0; oldEnd = oldStart; } let newStart; let newEnd; if (hasNew) { newStart = newTokens[ch.newTokStart].start; newEnd = newTokens[ch.newTokEnd - 1].end; } const oldSlice = sliceFromOriginal(oldText, oldStart, oldEnd); const newSlice = hasNew ? sliceFromOriginal(newText, newStart, newEnd) : ""; if (oldSlice === newSlice) continue; const changeType = classifyChange(oldTokens.slice(ch.oldTokStart, ch.oldTokEnd), newTokens.slice(ch.newTokStart, ch.newTokEnd), oldSlice, newSlice); diffs.push({ oldText: oldSlice, position: { startIndex: oldStart, endIndex: oldEnd }, newText: newSlice, changeType }); } return diffs; } var index_default = { getTextDiffs }; // Annotate the CommonJS export names for ESM import in node: 0 && (module.exports = { getTextDiffs });
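
A second sketch, showing the merge step that the package description advertises: two replaced words separated only by whitespace are folded into one change object rather than reported separately (again hand-traced from the source above, so the exact indices are illustrative and assume the input strings shown):

const { getTextDiffs } = require("@pawanosman/textdiff");

// "quick" -> "fast" and "brown" -> "red" are separate LCS gaps, but the
// only token between them is a space separator, so the merge loop
// collapses them into a single replacement spanning "quick brown".
console.log(getTextDiffs("the quick brown fox", "the fast red fox"));
// [
//   {
//     oldText: "quick brown",
//     position: { startIndex: 4, endIndex: 15 },
//     newText: "fast red",
//     changeType: "replace"
//   }
// ]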