@sanity/diff-match-patch
Version:
Robust diff, match and patch algorithms to perform operations required for synchronizing plain text
110 lines (98 loc) • 3.11 kB
text/typescript
import {cloneDiff} from '../diff/clone.js'
import {type Patch} from '../index.js'
/**
 * Counts the number of bytes needed to encode a string as UTF-8.
 *
 * Note that while this approach may seem heavy-handed, it is actually
 * significantly faster than both `new Blob([str]).size` and `TextEncoder`.
 *
 * Characters outside the Basic Multilingual Plane are stored as a surrogate
 * pair (two UTF-16 code units) but are counted once, as a single 4-byte
 * sequence. A lone (unpaired) surrogate is counted as 3 bytes.
 *
 * @param str - String to count
 * @returns Number of bytes
 * @throws Error if a code point cannot be read from the string
 */
export function countUtf8Bytes(str: string): number {
  let bytes = 0
  for (let i = 0; i < str.length; i++) {
    const codePoint = str.codePointAt(i)
    if (typeof codePoint === 'undefined') {
      throw new Error('Failed to get codepoint')
    }
    bytes += utf8len(codePoint)
    // `codePointAt` already consumed both halves of a surrogate pair, so skip
    // the low surrogate - otherwise it would be counted again as 3 extra bytes.
    if (codePoint > 0xffff) {
      i++
    }
  }
  return bytes
}
/**
* Options for the index adjustment operations, such as `adjustIndiciesToUcs2`.
*
* @public
*/
export interface AdjustmentOptions {
/**
* When converting indices between UTF-8 and UCS-2, certain scenarios can occur
* where we go beyond the target offset. This can happen in particular with
* surrogate pairs/high codepoints, when the base string we are applying the
* patch to does not fully match the one that was used to generate the patch.
*
* When this is `true`, the index that was reached is used as-is. Otherwise
* the adjustment throws an error if the byte offset reached does not exactly
* match the requested offset.
*
* Defaults to `false`.
*/
allowExceedingIndices?: boolean
}
/**
 * Converts the `start1` and `start2` offsets of diff-match-patch patches from
 * UTF-8 byte offsets into UCS-2 (UTF-16 code unit) indices, measured against
 * the `base` string.
 *
 * The input patches are not modified: each returned patch gets cloned diffs
 * and adjusted `start1`/`start2`, while every other property is copied over
 * unchanged.
 *
 * The base string is scanned once, front to back, and the scan position is
 * shared by all lookups (`start1` then `start2`, patch by patch). An offset
 * that lies before the position already reached is not re-scanned: it either
 * resolves to the current index (when `allowExceedingIndices` is set) or
 * triggers the error described below.
 *
 * @param patches - The patches to adjust
 * @param base - The base string to use for counting bytes
 * @param options - Options for the adjustment of indices
 * @returns A new array of patches with adjusted indices
 * @throws Error if the scan cannot land exactly on a requested byte offset
 *   and `options.allowExceedingIndices` is not enabled
 * @beta
 */
export function adjustIndiciesToUcs2(
  patches: Patch[],
  base: string,
  options: AdjustmentOptions = {},
): Patch[] {
  // Scanner state, shared by every lookup below.
  let bytesConsumed = 0
  let ucs2Index = 0

  const resolveIndex = (utf8Target: number): number => {
    while (bytesConsumed < utf8Target) {
      const codePoint = base.codePointAt(ucs2Index)
      if (typeof codePoint === 'undefined') {
        // Ran off the end of the base string: the result will not be exact,
        // but there is nothing further to scan, so settle for where we are.
        return ucs2Index
      }

      bytesConsumed += utf8len(codePoint)
      // Code points above 0xffff take two UTF-16 code units (a surrogate pair).
      ucs2Index += codePoint > 0xffff ? 2 : 1
    }

    if (options.allowExceedingIndices || bytesConsumed === utf8Target) {
      return ucs2Index
    }

    throw new Error('Failed to determine byte offset')
  }

  return patches.map(
    (patch): Patch => ({
      diffs: patch.diffs.map((diff) => cloneDiff(diff)),
      start1: resolveIndex(patch.start1),
      start2: resolveIndex(patch.start2),
      utf8Start1: patch.utf8Start1,
      utf8Start2: patch.utf8Start2,
      length1: patch.length1,
      length2: patch.length2,
      utf8Length1: patch.utf8Length1,
      utf8Length2: patch.utf8Length2,
    }),
  )
}
/**
 * Returns how many bytes a single code point occupies when encoded as UTF-8.
 * See the encoding table at https://en.wikipedia.org/wiki/UTF-8
 *
 * @param codePoint - The code point to measure
 * @returns The encoded length in bytes: 1, 2, 3 or 4
 */
function utf8len(codePoint: number): 1 | 2 | 3 | 4 {
  return codePoint <= 0x007f
    ? 1 // up to U+007F: single byte
    : codePoint <= 0x07ff
      ? 2 // up to U+07FF: two bytes
      : codePoint <= 0xffff
        ? 3 // up to U+FFFF: three bytes
        : 4 // everything above: four bytes
}