UNPKG

baburchi

Version:

A lightweight TypeScript library designed to fix typos in OCR post-processing.

github.com/ragaeeb/baburchi

ragaeeb/baburchi

1,393 lines (1,387 loc) • 96.3 kB

JavaScript

//#region src/utils/sanitize.ts
/**
 * Named option bundles accepted by the sanitizer entry points.
 * Each preset assigns a value to every sanitizer option.
 */
const PRESETS = {
  aggressive: {
    collapseWhitespace: true,
    keepOnlyArabicLetters: false,
    lettersAndSpacesOnly: true,
    nfc: true,
    normalizeAlif: true,
    removeHijriMarker: true,
    replaceAlifMaqsurah: true,
    replaceTaMarbutahWithHa: true,
    stripDiacritics: true,
    stripFootnotes: true,
    stripLatinAndSymbols: true,
    stripTatweel: "all",
    stripZeroWidth: true,
    trim: true,
    zeroWidthToSpace: false
  },
  light: {
    collapseWhitespace: true,
    keepOnlyArabicLetters: false,
    lettersAndSpacesOnly: false,
    nfc: true,
    normalizeAlif: false,
    removeHijriMarker: false,
    replaceAlifMaqsurah: false,
    replaceTaMarbutahWithHa: false,
    stripDiacritics: false,
    stripFootnotes: false,
    stripLatinAndSymbols: false,
    stripTatweel: false,
    stripZeroWidth: true,
    trim: true,
    zeroWidthToSpace: false
  },
  search: {
    collapseWhitespace: true,
    keepOnlyArabicLetters: false,
    lettersAndSpacesOnly: false,
    nfc: true,
    normalizeAlif: true,
    removeHijriMarker: true,
    replaceAlifMaqsurah: true,
    replaceTaMarbutahWithHa: false,
    stripDiacritics: true,
    stripFootnotes: true,
    stripLatinAndSymbols: false,
    stripTatweel: "all",
    stripZeroWidth: true,
    trim: true,
    zeroWidthToSpace: false
  }
};

/** Option bundle with every transformation switched off. */
const PRESET_NONE = {
  collapseWhitespace: false,
  keepOnlyArabicLetters: false,
  lettersAndSpacesOnly: false,
  nfc: false,
  normalizeAlif: false,
  removeHijriMarker: false,
  replaceAlifMaqsurah: false,
  replaceTaMarbutahWithHa: false,
  stripDiacritics: false,
  stripFootnotes: false,
  stripLatinAndSymbols: false,
  stripTatweel: false,
  stripZeroWidth: false,
  trim: false,
  zeroWidthToSpace: false
};

// UTF-16 code units used by the sanitizer (decimal value, Unicode code point in the comment).
const CHAR_SPACE = 32; // U+0020
const CHAR_TATWEEL = 1600; // U+0640
const CHAR_HA = 1607; // U+0647
const CHAR_YA = 1610; // U+064A
const CHAR_WAW = 1608; // U+0648
const CHAR_ALIF = 1575; // U+0627
const CHAR_ALIF_MADDA = 1570; // U+0622
const CHAR_ALIF_HAMZA_ABOVE = 1571; // U+0623
const CHAR_WAW_HAMZA_ABOVE = 1572; // U+0624
const CHAR_ALIF_HAMZA_BELOW = 1573; // U+0625
const CHAR_YEH_HAMZA_ABOVE = 1574; // U+0626
const CHAR_ALIF_WASLA = 1649; // U+0671
const CHAR_ALIF_MAQSURAH = 1609; // U+0649
const CHAR_TA_MARBUTAH = 1577; // U+0629
const CHAR_MADDA_ABOVE = 1619; // U+0653 (combining)
const CHAR_HAMZA_ABOVE_MARK = 1620; // U+0654 (combining)
const CHAR_HAMZA_BELOW_MARK = 1621; // U+0655 (combining)

/** Scratch output buffer shared by all calls; replaced by a larger one when an input does not fit. */
let sharedBuffer = new Uint16Array(2048);
/** Turns the filled part of the scratch buffer back into a string. */
const decoder = new TextDecoder("utf-16le");

/** True for code units in U+064B–U+065F, U+0610–U+061A, U+0670 or U+06D6–U+06ED. */
const isDiacritic = (code) => {
  return code >= 1611 && code <= 1631 || code >= 1552 && code <= 1562 || code === 1648 || code >= 1750 && code <= 1773;
};

/** True for code units in U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, or U+FEFF. */
const isZeroWidth = (code) => {
  return code >= 8203 && code <= 8207 || code >= 8234 && code <= 8238 || code >= 8288 && code <= 8292 || code === 65279;
};

/** True for ASCII letters A–Z / a–z and ASCII digits 0–9. */
const isLatinOrDigit = (code) => {
  return code >= 65 && code <= 90 || code >= 97 && code <= 122 || code >= 48 && code <= 57;
};

/** True for U+00AC, U+00A7, U+0060, U+003D, U+0026 and U+FDFA. */
const isSymbol = (code) => {
  return code === 172 || code === 167 || code === 96 || code === 61 || code === 38 || code === 65018;
};

/**
 * True for the code units this module treats as Arabic letters:
 * U+0621–U+063A, U+0641–U+064A, U+0671, U+067E, U+0686, U+06A4–U+06AF, U+06CC, U+06D2, U+06D3.
 */
const isArabicLetter = (code) => {
  return code >= 1569 && code <= 1594 || code >= 1601 && code <= 1610 || code === 1649 || code === 1662 || code === 1670 || code >= 1700 && code <= 1711 || code === 1740 || code === 1746 || code === 1747;
};

/**
 * Checks whether a code point represents a Western or Arabic-Indic digit.
 *
 * @param code - The numeric code point to evaluate.
 * @returns True when the code point is a digit in either numeral system.
 */
const isDigit = (code) => code >= 48 && code <= 57 || code >= 1632 && code <= 1641;

/**
 * Resolves a boolean by taking an optional override over a preset value.
 *
 * @param presetValue - The value defined by the preset.
 * @param override - Optional override provided by the caller.
 * @returns The preset value when no override is given, otherwise the override coerced to a boolean.
 */
const resolveBoolean = (presetValue, override) => override === void 0 ? presetValue : !!override;

/**
 * Resolves the tatweel mode by taking an optional override over a preset mode.
 * An override of `true` maps to `'safe'` for convenience.
 *
 * @param presetValue - The mode specified by the preset.
 * @param override - Optional override provided by the caller.
 * @returns The resolved tatweel mode.
 */
const resolveTatweelMode = (presetValue, override) => {
  if (override === void 0) return presetValue;
  if (override === true) return "safe";
  if (override === false) return false;
  return override;
};

/**
 * Internal sanitization logic that applies all transformations to a single string
 * in one pass over its UTF-16 code units, writing the result into a shared buffer.
 * All options must already be resolved (see the destructured flag names below).
 *
 * @param input - Text to sanitize; any falsy value yields an empty string.
 * @param options - Pre-resolved sanitizer flags.
 * @returns The sanitized text.
 */
const applySanitization = (input, options) => {
  if (!input) return "";
  const { nfc, stripZW, zwAsSpace, removeHijri, removeDia, tatweelMode, normAlif, maqToYa, taToHa, removeFootnotes, lettersSpacesOnly, stripNoise, lettersOnly, collapseWS, doTrim } = options;
  /*
   * NFC normalization (fast path).
   *
   * Instead of calling `String.prototype.normalize('NFC')`, the loop below composes
   * only these sequences inline:
   *   alif + U+0653 -> U+0622, alif + U+0654 -> U+0623, alif + U+0655 -> U+0625,
   *   waw  + U+0654 -> U+0624, ya   + U+0654 -> U+0626.
   */
  const text = input;
  const len = text.length;
  // Every input unit produces at most one output unit, so `len` slots always suffice.
  if (len > sharedBuffer.length) sharedBuffer = new Uint16Array(len + 1024);
  const buffer = sharedBuffer;
  let bufIdx = 0;
  let lastWasSpace = false;
  let start = 0;
  // Cheap up-front skip of leading whitespace; the final trim below handles everything else.
  if (doTrim) while (start < len && text.charCodeAt(start) <= 32) start++;
  for (let i = start; i < len; i++) {
    const code = text.charCodeAt(i);

    // Whitespace and control characters.
    if (code <= 32) {
      if (lettersOnly) continue;
      if (collapseWS) {
        if (!lastWasSpace && bufIdx > 0) {
          buffer[bufIdx++] = CHAR_SPACE;
          lastWasSpace = true;
        }
      } else {
        buffer[bufIdx++] = code;
        lastWasSpace = false;
      }
      continue;
    }

    // Inline canonical composition of a combining mark onto the previously written letter.
    if (nfc && (code === CHAR_MADDA_ABOVE || code === CHAR_HAMZA_ABOVE_MARK || code === CHAR_HAMZA_BELOW_MARK)) {
      const prevIdx = bufIdx - 1;
      if (prevIdx >= 0) {
        const prev = buffer[prevIdx];
        let composed = 0;
        if (prev === CHAR_ALIF) {
          if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
          else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
          else composed = CHAR_ALIF_HAMZA_BELOW;
        } else if (code === CHAR_HAMZA_ABOVE_MARK) {
          if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
          else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
        }
        if (composed !== 0) {
          // With alif normalization on, the composed alif variant would be mapped straight
          // back to plain alif, so keep the alif and just consume the mark. This makes the
          // decomposed and precomposed spellings sanitize to the same output.
          buffer[prevIdx] = normAlif && prev === CHAR_ALIF ? CHAR_ALIF : composed;
          continue;
        }
      }
    }

    // Zero-width and bidi control characters: drop, or turn into a space.
    if (stripZW && isZeroWidth(code)) {
      if (zwAsSpace) {
        if (collapseWS) {
          if (!lastWasSpace && bufIdx > 0) {
            buffer[bufIdx++] = CHAR_SPACE;
            lastWasSpace = true;
          }
        } else {
          buffer[bufIdx++] = CHAR_SPACE;
          lastWasSpace = false;
        }
      }
      continue;
    }

    // Hijri marker: a ha (optionally followed by a tatweel) that ends at a boundary
    // and is preceded, ignoring whitespace and zero-width characters, by a digit.
    if (removeHijri && code === CHAR_HA) {
      let nextIdx = i + 1;
      if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL) nextIdx++;
      let isBoundary = false;
      if (nextIdx >= len) isBoundary = true;
      else {
        const nextCode = text.charCodeAt(nextIdx);
        // 47 is '/', 45 is '-'.
        if (nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45) isBoundary = true;
      }
      if (isBoundary) {
        let backIdx = i - 1;
        while (backIdx >= 0) {
          const c = text.charCodeAt(backIdx);
          if (c <= 32 || isZeroWidth(c)) backIdx--;
          else break;
        }
        if (backIdx >= 0 && isDigit(text.charCodeAt(backIdx))) {
          // Also consume the tatweel that followed the ha, if there was one.
          if (nextIdx > i + 1) i++;
          continue;
        }
      }
    }

    if (removeDia && isDiacritic(code)) continue;

    // Tatweel: "all" always drops it; "safe" keeps it only when the last
    // non-space output character is a digit or a ha.
    if (code === CHAR_TATWEEL) {
      if (tatweelMode === "all") continue;
      if (tatweelMode === "safe") {
        let backIdx = bufIdx - 1;
        while (backIdx >= 0 && buffer[backIdx] === CHAR_SPACE) backIdx--;
        if (backIdx < 0) continue;
        const prev = buffer[backIdx];
        if (!isDigit(prev) && prev !== CHAR_HA) continue;
      }
    }

    // Latin letters, ASCII digits, listed symbols and runs of '/' become a single space.
    if (stripNoise && !lettersSpacesOnly && !lettersOnly) {
      if (isLatinOrDigit(code) || isSymbol(code)) {
        if (collapseWS) {
          if (!lastWasSpace && bufIdx > 0) {
            buffer[bufIdx++] = CHAR_SPACE;
            lastWasSpace = true;
          }
        } else {
          buffer[bufIdx++] = CHAR_SPACE;
          lastWasSpace = false;
        }
        continue;
      }
      if (code === 47 && i + 1 < len && text.charCodeAt(i + 1) === 47) {
        while (i + 1 < len && text.charCodeAt(i + 1) === 47) i++;
        if (collapseWS) {
          if (!lastWasSpace && bufIdx > 0) {
            buffer[bufIdx++] = CHAR_SPACE;
            lastWasSpace = true;
          }
        } else {
          buffer[bufIdx++] = CHAR_SPACE;
          lastWasSpace = false;
        }
        continue;
      }
    }

    // Footnote markers that start with '(' (40) and end with ')' (41).
    if (removeFootnotes && !lettersSpacesOnly && !lettersOnly && code === 40) {
      let nextIdx = i + 1;
      if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
      if (nextIdx < len) {
        const c1 = text.charCodeAt(nextIdx);
        if (c1 === 172) {
          // Form: '(' [space] U+00AC, one or more Arabic-Indic digits, [space] ')'.
          nextIdx++;
          let hasDigits = false;
          while (nextIdx < len) {
            const c = text.charCodeAt(nextIdx);
            if (c >= 1632 && c <= 1641) {
              hasDigits = true;
              nextIdx++;
            } else break;
          }
          if (hasDigits && nextIdx < len) {
            if (text.charCodeAt(nextIdx) === 41) {
              i = nextIdx;
              if (collapseWS) {
                if (!lastWasSpace && bufIdx > 0) {
                  buffer[bufIdx++] = CHAR_SPACE;
                  lastWasSpace = true;
                }
              } else {
                buffer[bufIdx++] = CHAR_SPACE;
                lastWasSpace = false;
              }
              continue;
            }
            if (text.charCodeAt(nextIdx) === CHAR_SPACE) {
              nextIdx++;
              if (nextIdx < len && text.charCodeAt(nextIdx) === 41) {
                i = nextIdx;
                if (collapseWS) {
                  if (!lastWasSpace && bufIdx > 0) {
                    buffer[bufIdx++] = CHAR_SPACE;
                    lastWasSpace = true;
                  }
                } else {
                  buffer[bufIdx++] = CHAR_SPACE;
                  lastWasSpace = false;
                }
                continue;
              }
            }
          }
        } else if (c1 >= 1632 && c1 <= 1641) {
          // Form: '(' [space] one Arabic-Indic digit ')' or
          //       '(' [space] one Arabic-Indic digit, space, one U+0600–U+06FF character ')'.
          let tempIdx = nextIdx + 1;
          let matched = false;
          if (tempIdx < len) {
            const c2 = text.charCodeAt(tempIdx);
            if (c2 === 41) {
              matched = true;
              tempIdx++;
            } else if (c2 === CHAR_SPACE) {
              tempIdx++;
              if (tempIdx < len) {
                const c3 = text.charCodeAt(tempIdx);
                if (c3 >= 1536 && c3 <= 1791) {
                  tempIdx++;
                  if (tempIdx < len && text.charCodeAt(tempIdx) === 41) {
                    matched = true;
                    tempIdx++;
                  }
                }
              }
            }
          }
          if (matched) {
            i = tempIdx - 1;
            if (collapseWS) {
              if (!lastWasSpace && bufIdx > 0) {
                buffer[bufIdx++] = CHAR_SPACE;
                lastWasSpace = true;
              }
            } else {
              buffer[bufIdx++] = CHAR_SPACE;
              lastWasSpace = false;
            }
            continue;
          }
        }
      }
    }

    // Letter-only modes: anything that is not an Arabic letter is dropped or becomes a space.
    if ((lettersSpacesOnly || lettersOnly) && !isArabicLetter(code)) {
      if (lettersOnly) continue;
      if (collapseWS) {
        if (!lastWasSpace && bufIdx > 0) {
          buffer[bufIdx++] = CHAR_SPACE;
          lastWasSpace = true;
        }
      } else {
        buffer[bufIdx++] = CHAR_SPACE;
        lastWasSpace = false;
      }
      continue;
    }

    // Letter-level normalizations, then emit.
    let outCode = code;
    if (normAlif && (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA)) outCode = CHAR_ALIF;
    if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode = CHAR_YA;
    if (taToHa && code === CHAR_TA_MARBUTAH) outCode = CHAR_HA;
    buffer[bufIdx++] = outCode;
    lastWasSpace = false;
  }

  // Trim both ends of the OUTPUT. Without whitespace collapsing, trailing whitespace is
  // written verbatim, and characters removed by other options can leave whitespace at
  // the front, so trimming must not depend on `lastWasSpace` or on the input prefix.
  let head = 0;
  if (doTrim) {
    while (bufIdx > 0 && buffer[bufIdx - 1] <= 32) bufIdx--;
    while (head < bufIdx && buffer[head] <= 32) head++;
  }
  if (head >= bufIdx) return "";
  return decoder.decode(buffer.subarray(head, bufIdx));
};
/**
 * Turns a preset name or a custom options object into the flat set of
 * resolved flags consumed by the sanitizer core.
 *
 * A string selects the preset of that name. An object selects the preset named by its
 * `base` property (`"light"` when absent, `"none"` for the all-off preset) and
 * may override any individual option.
 *
 * @param optionsOrPreset - Preset name, or options object with optional `base`.
 * @returns The resolved flags, reusable across many inputs.
 */
const resolveOptions = (optionsOrPreset) => {
  const givenPresetName = typeof optionsOrPreset === "string";
  const overrides = givenPresetName ? null : optionsOrPreset;
  let preset;
  if (givenPresetName) {
    preset = PRESETS[optionsOrPreset];
  } else {
    const baseName = optionsOrPreset.base ?? "light";
    preset = baseName === "none" ? PRESET_NONE : PRESETS[baseName];
  }
  // Caller-supplied value wins over the preset value for the same public option name.
  const flag = (optionName) => resolveBoolean(preset[optionName], overrides?.[optionName]);
  return {
    collapseWS: flag("collapseWhitespace"),
    doTrim: flag("trim"),
    lettersOnly: flag("keepOnlyArabicLetters"),
    lettersSpacesOnly: flag("lettersAndSpacesOnly"),
    maqToYa: flag("replaceAlifMaqsurah"),
    nfc: flag("nfc"),
    normAlif: flag("normalizeAlif"),
    removeDia: flag("stripDiacritics"),
    removeFootnotes: flag("stripFootnotes"),
    removeHijri: flag("removeHijriMarker"),
    stripNoise: flag("stripLatinAndSymbols"),
    stripZW: flag("stripZeroWidth"),
    taToHa: flag("replaceTaMarbutahWithHa"),
    tatweelMode: resolveTatweelMode(preset.stripTatweel, overrides?.stripTatweel),
    zwAsSpace: flag("zeroWidthToSpace")
  };
};
/**
 * Creates a reusable sanitizer function with pre-resolved options.
 * Use this when you need to sanitize many strings with the same options,
 * so the options are resolved once instead of per call.
 *
 * @param optionsOrPreset - Preset name or options object; defaults to `"search"`.
 * @returns A function that sanitizes a single string with the resolved options.
 * @example
 * ```ts
 * const sanitize = createArabicSanitizer('search');
 * const results = texts.map(sanitize);
 * ```
 */
const createArabicSanitizer = (optionsOrPreset = "search") => {
  const resolved = resolveOptions(optionsOrPreset);
  return (input) => applySanitization(input, resolved);
};

/**
 * Sanitizes one string or an array of strings.
 *
 * @param input - A string, or an array of strings processed with one shared option resolution.
 * @param optionsOrPreset - Preset name or options object; defaults to `"search"`.
 * @returns A sanitized string for string input (empty string for falsy input),
 *          or an array of sanitized strings for array input.
 */
function sanitizeArabic(input, optionsOrPreset = "search") {
  if (Array.isArray(input)) {
    if (input.length === 0) return [];
    const resolved = resolveOptions(optionsOrPreset);
    const results = new Array(input.length);
    for (let i = 0; i < input.length; i++) results[i] = applySanitization(input[i], resolved);
    return results;
  }
  if (!input) return "";
  return applySanitization(input, resolveOptions(optionsOrPreset));
}
//#endregion

//#region src/utils/levenshthein.ts
/**
 * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
 * The Levenshtein distance is the minimum number of single-character edits (insertions,
 * deletions, or substitutions) required to change one string into another.
 * Characters are compared as UTF-16 code units.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Minimum edit distance between the two strings
 * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths
 * @example
 * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
 * calculateLevenshteinDistance('', 'hello') // Returns 5
 */
const calculateLevenshteinDistance = (textA, textB) => {
  const lengthA = textA.length;
  const lengthB = textB.length;
  if (lengthA === 0) return lengthB;
  if (lengthB === 0) return lengthA;
  // Keep the DP row as short as possible by indexing columns with the shorter string.
  const [shorter, longer] = lengthA <= lengthB ? [textA, textB] : [textB, textA];
  const shortLen = shorter.length;
  const longLen = longer.length;
  let previousRow = Array.from({ length: shortLen + 1 }, (_, index) => index);
  for (let i = 1; i <= longLen; i++) {
    const currentRow = [i];
    for (let j = 1; j <= shortLen; j++) {
      const substitutionCost = longer[i - 1] === shorter[j - 1] ? 0 : 1;
      const minCost = Math.min(previousRow[j] + 1, currentRow[j - 1] + 1, previousRow[j - 1] + substitutionCost);
      currentRow.push(minCost);
    }
    previousRow = currentRow;
  }
  return previousRow[shortLen];
};

/**
 * Early exit check for bounded Levenshtein distance.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance the caller cares about.
 * @returns `maxDist + 1` when the length difference alone exceeds the bound, the
 *          exact (or capped) distance when either string is empty, otherwise `null`.
 */
const shouldEarlyExit = (a, b, maxDist) => {
  if (Math.abs(a.length - b.length) > maxDist) return maxDist + 1;
  if (a.length === 0) return b.length <= maxDist ? b.length : maxDist + 1;
  if (b.length === 0) return a.length <= maxDist ? a.length : maxDist + 1;
  return null;
};

/**
 * Initializes the two DP rows for bounded Levenshtein calculation.
 *
 * 32-bit rows are used because the rows store column indices and distances up to `m`;
 * a 16-bit row would wrap to negative values once those exceed 32767.
 *
 * @param m - Number of columns minus one (length of the column string).
 * @returns `[prev, curr]`, where `prev[j] === j` and `curr` is zero-filled.
 */
const initializeBoundedArrays = (m) => {
  const prev = new Int32Array(m + 1);
  const curr = new Int32Array(m + 1);
  for (let j = 0; j <= m; j++) prev[j] = j;
  return [prev, curr];
};

/**
 * Calculates the inclusive column range evaluated for row `i` in bounded Levenshtein.
 *
 * @param i - 1-based row index.
 * @param maxDist - Distance bound (half-width of the band).
 * @param m - Last valid column index.
 * @returns `{ from, to }`, clamped to `[1, m]`.
 */
const getRowBounds = (i, maxDist, m) => ({
  from: Math.max(1, i - maxDist),
  to: Math.min(m, i + maxDist)
});

/**
 * Processes a single cell in the bounded Levenshtein matrix.
 *
 * @returns The minimum of the deletion, insertion and substitution costs for cell (i, j).
 */
const processBoundedCell = (a, b, i, j, prev, curr) => {
  const cost = a[i - 1] === b[j - 1] ? 0 : 1;
  const del = prev[j] + 1;
  const ins = curr[j - 1] + 1;
  const sub = prev[j - 1] + cost;
  return Math.min(del, ins, sub);
};

/**
 * Processes a single row in bounded Levenshtein calculation, filling `curr`.
 * Columns outside the band are set to `maxDist + 1`.
 *
 * @returns The smallest value in the row (column 0 included), used for early termination.
 */
const processBoundedRow = (a, b, i, maxDist, prev, curr) => {
  const m = b.length;
  const big = maxDist + 1;
  const { from, to } = getRowBounds(i, maxDist, m);
  curr[0] = i;
  let rowMin = i;
  for (let j = 1; j < from; j++) curr[j] = big;
  for (let j = to + 1; j <= m; j++) curr[j] = big;
  for (let j = from; j <= to; j++) {
    const val = processBoundedCell(a, b, i, j, prev, curr);
    curr[j] = val;
    if (val < rowMin) rowMin = val;
  }
  return rowMin;
};
/**
 * Levenshtein distance capped at `maxDist`, with early termination.
 * Cheaper than the unbounded version when only distances up to a threshold matter.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance of interest.
 * @returns The exact distance when it is at most `maxDist`, otherwise `maxDist + 1`.
 */
const boundedLevenshtein = (a, b, maxDist) => {
  const overLimit = maxDist + 1;
  const shortcut = shouldEarlyExit(a, b, maxDist);
  if (shortcut !== null) return shortcut;
  // Iterate rows over the shorter string.
  if (a.length > b.length) return boundedLevenshtein(b, a, maxDist);
  let [previous, current] = initializeBoundedArrays(b.length);
  for (let row = 1; row <= a.length; row++) {
    const smallestInRow = processBoundedRow(a, b, row, maxDist, previous, current);
    if (smallestInRow > maxDist) return overLimit;
    // The row just filled becomes the "previous" row of the next iteration.
    [previous, current] = [current, previous];
  }
  const distance = previous[b.length];
  return distance <= maxDist ? distance : overLimit;
};
//#endregion

//#region src/utils/similarity.ts
/** Scores used when aligning two token sequences. */
const ALIGNMENT_SCORES = {
  GAP_PENALTY: -1,
  MISMATCH_PENALTY: -2,
  PERFECT_MATCH: 2,
  SOFT_MATCH: 1
};

/**
 * Similarity ratio of two strings: one minus the Levenshtein distance divided by
 * the length of the longer string. Two empty strings yield 1.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
 * @example
 * calculateSimilarity('hello', 'hello') // Returns 1.0
 * calculateSimilarity('hello', 'help') // Returns 0.6
 */
const calculateSimilarity = (textA, textB) => {
  // `|| 1` avoids dividing by zero when both strings are empty.
  const denominator = Math.max(textA.length, textB.length) || 1;
  const distance = calculateLevenshteinDistance(textA, textB);
  return (denominator - distance) / denominator;
};
/**
 * Checks if two texts are similar after Arabic normalization.
 * Both texts are passed through `sanitizeArabic` with its default preset and the
 * similarity of the results is compared against the threshold.
 *
 * @param textA - First text to compare
 * @param textB - Second text to compare
 * @param threshold - Similarity threshold (0.0 to 1.0); defaults to 0.6
 * @returns True if normalized texts meet the similarity threshold
 */
const areSimilarAfterNormalization = (textA, textB, threshold = .6) => {
  const normalizedA = sanitizeArabic(textA);
  const normalizedB = sanitizeArabic(textB);
  return calculateSimilarity(normalizedA, normalizedB) >= threshold;
};

/**
 * Scores a pair of tokens for sequence alignment.
 *
 * - Tokens equal after normalization score `PERFECT_MATCH`.
 * - Otherwise, a pair in which either raw token is a typo symbol, or whose normalized
 *   forms reach the similarity threshold, scores `SOFT_MATCH`.
 * - Everything else scores `MISMATCH_PENALTY`.
 *
 * @param tokenA - First token to score
 * @param tokenB - Second token to score
 * @param typoSymbols - Array of special symbols that get preferential treatment
 * @param similarityThreshold - Threshold for considering tokens highly similar
 * @returns Alignment score (higher is better match)
 * @example
 * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
 * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity
 */
const calculateAlignmentScore = (tokenA, tokenB, typoSymbols, similarityThreshold) => {
  const normalizedA = sanitizeArabic(tokenA);
  const normalizedB = sanitizeArabic(tokenB);
  if (normalizedA === normalizedB) return ALIGNMENT_SCORES.PERFECT_MATCH;
  if (typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB)) return ALIGNMENT_SCORES.SOFT_MATCH;
  const similarity = calculateSimilarity(normalizedA, normalizedB);
  return similarity >= similarityThreshold ? ALIGNMENT_SCORES.SOFT_MATCH : ALIGNMENT_SCORES.MISMATCH_PENALTY;
};
* * @param matrix - Scoring matrix with directional information from alignment * @param tokensA - First sequence of tokens * @param tokensB - Second sequence of tokens * @returns Array of aligned token pairs, where null indicates a gap * @throws Error if invalid alignment direction is encountered */ const backtrackAlignment = (matrix, tokensA, tokensB) => { const alignment = []; let i = tokensA.length; let j = tokensB.length; while (i > 0 || j > 0) switch (matrix[i][j].direction) { case "diagonal": alignment.push([tokensA[--i], tokensB[--j]]); break; case "left": alignment.push([null, tokensB[--j]]); break; case "up": alignment.push([tokensA[--i], null]); break; default: throw new Error("Invalid alignment direction"); } return alignment.reverse(); }; /** * Initializes the scoring matrix with gap penalties. * * @param lengthA - Length of the first token sequence. * @param lengthB - Length of the second token sequence. * @returns A matrix seeded with gap penalties for alignment. */ const initializeScoringMatrix = (lengthA, lengthB) => { const matrix = Array.from({ length: lengthA + 1 }, () => Array.from({ length: lengthB + 1 }, () => ({ direction: null, score: 0 }))); for (let i = 1; i <= lengthA; i++) matrix[i][0] = { direction: "up", score: i * ALIGNMENT_SCORES.GAP_PENALTY }; for (let j = 1; j <= lengthB; j++) matrix[0][j] = { direction: "left", score: j * ALIGNMENT_SCORES.GAP_PENALTY }; return matrix; }; /** * Determines the best alignment direction and score for a cell. * * @param diagonalScore - Score achieved by aligning tokens diagonally. * @param upScore - Score achieved by inserting a gap in the second sequence. * @param leftScore - Score achieved by inserting a gap in the first sequence. * @returns The direction and score that maximize the alignment. 
/**
 * Picks the best of the three candidate moves for a matrix cell.
 * Ties are resolved in the order diagonal, up, left.
 *
 * @param diagonalScore - Score achieved by aligning tokens diagonally.
 * @param upScore - Score achieved by inserting a gap in the second sequence.
 * @param leftScore - Score achieved by inserting a gap in the first sequence.
 * @returns The direction and score that maximize the alignment.
 */
const getBestAlignment = (diagonalScore, upScore, leftScore) => {
  const best = Math.max(diagonalScore, upScore, leftScore);
  let direction = "left";
  if (best === diagonalScore) direction = "diagonal";
  else if (best === upScore) direction = "up";
  return { direction, score: best };
};

/**
 * Performs global sequence alignment using the Needleman-Wunsch algorithm.
 * Tokens are compared after `sanitizeArabic` normalization; typo-symbol lookups
 * use the raw tokens.
 *
 * @param tokensA - First sequence of tokens to align
 * @param tokensB - Second sequence of tokens to align
 * @param typoSymbols - Special symbols that affect scoring
 * @param similarityThreshold - Threshold for high similarity scoring
 * @returns Array of aligned token pairs, with null indicating gaps
 * @example
 * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
 * // Returns [['a', 'a'], ['b', 'c']]
 */
const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold) => {
  const rowCount = tokensA.length;
  const colCount = tokensB.length;
  const matrix = initializeScoringMatrix(rowCount, colCount);
  const typoLookup = new Set(typoSymbols);
  // Normalize each token once instead of once per matrix cell.
  const normalizedA = tokensA.map((token) => sanitizeArabic(token));
  const normalizedB = tokensB.map((token) => sanitizeArabic(token));
  const { GAP_PENALTY, MISMATCH_PENALTY, PERFECT_MATCH, SOFT_MATCH } = ALIGNMENT_SCORES;

  // Score for pairing the token of row `row` with the token of column `col`.
  const scorePair = (row, col) => {
    const left = normalizedA[row - 1];
    const right = normalizedB[col - 1];
    if (left === right) return PERFECT_MATCH;
    const involvesTypoSymbol = typoLookup.has(tokensA[row - 1]) || typoLookup.has(tokensB[col - 1]);
    const closeEnough = calculateSimilarity(left, right) >= similarityThreshold;
    return involvesTypoSymbol || closeEnough ? SOFT_MATCH : MISMATCH_PENALTY;
  };

  for (let row = 1; row <= rowCount; row++) {
    for (let col = 1; col <= colCount; col++) {
      const pairScore = scorePair(row, col);
      matrix[row][col] = getBestAlignment(
        matrix[row - 1][col - 1].score + pairScore,
        matrix[row - 1][col].score + GAP_PENALTY,
        matrix[row][col - 1].score + GAP_PENALTY
      );
    }
  }
  return backtrackAlignment(matrix, tokensA, tokensB);
};
//#endregion

//#region src/alignment.ts
/**
 * Aligns split text segments to match target lines by finding the best order.
 *
 * Walks the targets in order while consuming segments. A truthy target is handed to
 * `processAlignmentTarget`, which may consume one or two segments. A falsy target
 * passes the current segment through unchanged. Segments left over after the
 * targets are exhausted are appended as-is.
 *
 * @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment
 * @param segmentLines - Array of text segments that may represent split versions of target lines.
 * @returns Array of aligned text lines
 */
const alignTextSegments = (targetLines, segmentLines) => {
  const alignedLines = [];
  let cursor = 0;
  for (const targetLine of targetLines) {
    if (cursor >= segmentLines.length) break;
    if (!targetLine) {
      alignedLines.push(segmentLines[cursor]);
      cursor += 1;
      continue;
    }
    const { result, segmentsConsumed } = processAlignmentTarget(targetLine, segmentLines, cursor);
    if (result) alignedLines.push(result);
    cursor += segmentsConsumed;
  }
  for (let rest = cursor; rest < segmentLines.length; rest++) alignedLines.push(segmentLines[rest]);
  return alignedLines;
};
/**
 * Joins two candidate segments in both possible orders and keeps whichever
 * joined text is more similar to the target after normalization.
 * The forward order wins ties.
 *
 * @param targetLine - The line we are trying to reconstruct.
 * @param partA - The first candidate segment to evaluate.
 * @param partB - The second candidate segment to evaluate.
 * @returns The merged segment that best matches the target line after normalization.
 */
const findBestSegmentMerge = (targetLine, partA, partB) => {
  const forward = `${partA} ${partB}`;
  const reversed = `${partB} ${partA}`;
  const normalizedTarget = sanitizeArabic(targetLine);
  const forwardScore = calculateSimilarity(normalizedTarget, sanitizeArabic(forward));
  const reversedScore = calculateSimilarity(normalizedTarget, sanitizeArabic(reversed));
  return forwardScore >= reversedScore ? forward : reversed;
};

/**
 * Processes a single target line that needs alignment.
 *
 * - If the current segment is already similar to the target, it is used alone.
 * - Otherwise, when both the current and the next segment are truthy, they are
 *   merged in the better of the two orders.
 * - Otherwise the current segment is used alone if truthy, and nothing is consumed
 *   if it is not.
 *
 * @param targetLine - The line we are attempting to align to.
 * @param segmentLines - The collection of available text segments.
 * @param segmentIndex - The current index within {@link segmentLines} to consider.
 * @returns An object containing the resulting aligned text and how many segments were consumed.
 */
const processAlignmentTarget = (targetLine, segmentLines, segmentIndex) => {
  const current = segmentLines[segmentIndex];
  if (areSimilarAfterNormalization(targetLine, current)) {
    return { result: current, segmentsConsumed: 1 };
  }
  const following = segmentLines[segmentIndex + 1];
  if (current && following) {
    return { result: findBestSegmentMerge(targetLine, current, following), segmentsConsumed: 2 };
  }
  if (current) return { result: current, segmentsConsumed: 1 };
  return { result: "", segmentsConsumed: 0 };
};
//#endregion

//#region src/balance.ts
/**
 * Checks if all double quotes in a string are balanced and returns detailed error information.
 *
 * Quotes count as balanced when the string contains an even number of `"` characters.
 * With an odd count, the last quote in the string is reported as unmatched.
 *
 * @param str - The string to check for quote balance
 * @returns An object containing balance status and any errors found
 *
 * @example
 * ```typescript
 * checkQuoteBalance('Hello "world"') // { errors: [], isBalanced: true }
 * checkQuoteBalance('Hello "world') // { errors: [{ char: '"', index: 6, reason: 'unmatched', type: 'quote' }], isBalanced: false }
 * ```
 */
const checkQuoteBalance = (str) => {
  let total = 0;
  let lastSeenAt = -1;
  for (let at = str.indexOf("\""); at !== -1; at = str.indexOf("\"", at + 1)) {
    total += 1;
    lastSeenAt = at;
  }
  const balanced = total % 2 === 0;
  const errors = [];
  if (!balanced && lastSeenAt !== -1) {
    errors.push({ char: "\"", index: lastSeenAt, reason: "unmatched", type: "quote" });
  }
  return { errors, isBalanced: balanced };
};

/** Mapping of opening brackets to their corresponding closing brackets */
const BRACKETS = {
  "«": "»",
  "(": ")",
  "[": "]",
  "{": "}"
};

/** Set of all opening bracket characters */
const OPEN_BRACKETS = new Set(Object.keys(BRACKETS));

/** Set of all closing bracket characters */
const CLOSE_BRACKETS = new Set(Object.values(BRACKETS));

/**
 * Checks if all brackets in a string are properly balanced and returns detailed error information.
 *
 * Supported pairs: (), [], {}, «». The string is scanned once with a stack of open brackets:
 * - a closing bracket with no open bracket on the stack is reported as "unmatched";
 * - a closing bracket that does not pair with the most recent open bracket causes BOTH
 *   brackets to be reported as "mismatched" (the open bracket is still removed from the stack);
 * - brackets still open at the end are reported as "unclosed", after all other errors.
 *
 * @param str - The string to check for bracket balance
 * @returns An object containing balance status and any errors found
 *
 * @example
 * ```typescript
 * checkBracketBalance('(hello [world])') // { errors: [], isBalanced: true }
 * checkBracketBalance('(hello [world)')
 * // { errors: [
 * //     { char: '[', index: 7, reason: 'mismatched', type: 'bracket' },
 * //     { char: ')', index: 13, reason: 'mismatched', type: 'bracket' },
 * //     { char: '(', index: 0, reason: 'unclosed', type: 'bracket' }
 * //   ], isBalanced: false }
 * checkBracketBalance('(hello ]world[') // { errors: [...], isBalanced: false }
 * ```
 */
const checkBracketBalance = (str) => {
  const errors = [];
  const openStack = [];
  for (let position = 0; position < str.length; position++) {
    const char = str[position];
    if (OPEN_BRACKETS.has(char)) {
      openStack.push({ char, index: position });
      continue;
    }
    if (!CLOSE_BRACKETS.has(char)) continue;
    const opener = openStack.pop();
    if (!opener) {
      errors.push({ char, index: position, reason: "unmatched", type: "bracket" });
    } else if (BRACKETS[opener.char] !== char) {
      errors.push({ char: opener.char, index: opener.index, reason: "mismatched", type: "bracket" });
      errors.push({ char, index: position, reason: "mismatched", type: "bracket" });
    }
  }
  for (const leftover of openStack) {
    errors.push({ char: leftover.char, index: leftover.index, reason: "unclosed", type: "bracket" });
  }
  return { errors, isBalanced: errors.length === 0 };
};

/**
 * Checks if both quotes and brackets are balanced in a string and returns detailed error information.
 *
 * Combines the quote and bracket checks; the merged errors are sorted by their
 * position in the string.
 *
 * @param str - The string to check for overall balance
 * @returns An object containing combined balance status and all errors found, sorted by position
 *
 * @example
 * ```typescript
 * checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
 * checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
 * ```
 */
const checkBalance = (str) => {
  const quotes = checkQuoteBalance(str);
  const brackets = checkBracketBalance(str);
  const errors = quotes.errors.concat(brackets.errors);
  errors.sort((first, second) => first.index - second.index);
  return { errors, isBalanced: quotes.isBalanced && brackets.isBalanced };
};

/**
 * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
 *
 * The text is split on "\n" and each line is checked independently; lines of
 * 10 characters or fewer are not checked. Reported positions are offsets into the
 * original text, counting one character for each newline between lines.
 *
 * @param text - The multi-line text to analyze for balance errors
 * @returns Array of character errors with absolute positioning information
 *
 * @example
 * ```typescript
 * const text = 'Line 1 with "quote\nLine 2 with (bracket';
 * const errors = getUnbalancedErrors(text);
 * // Returns errors with absoluteIndex pointing to exact character positions
 * ```
 */
const getUnbalancedErrors = (text) => {
  const characterErrors = [];
  const lines = text.split("\n");
  const lastLine = lines.length - 1;
  let lineStart = 0;
  for (let lineNo = 0; lineNo < lines.length; lineNo++) {
    const line = lines[lineNo];
    if (line.length > 10) {
      const { errors, isBalanced: lineIsBalanced } = checkBalance(line);
      if (!lineIsBalanced) {
        for (const error of errors) {
          characterErrors.push({
            absoluteIndex: lineStart + error.index,
            char: error.char,
            reason: error.reason,
            type: error.type
          });
        }
      }
    }
    // Every line except the last is followed by the newline that split() removed.
    lineStart += line.length + (lineNo < lastLine ? 1 : 0);
  }
  return characterErrors;
};

/**
 * Checks if all double quotes in a string are balanced.
 * Convenience wrapper returning only the boolean result of `checkQuoteBalance`.
 *
 * @param str - The string to check for quote balance
 * @returns True if quotes are balanced, false otherwise
 *
 * @example
 * ```typescript
 * areQuotesBalanced('Hello "world"') // true
 * areQuotesBalanced('Hello "world') // false
 * ```
 */
const areQuotesBalanced = (str) => {
  const { isBalanced: quotesOk } = checkQuoteBalance(str);
  return quotesOk;
};

/**
 * Checks if all brackets in a string are properly balanced.
 * Convenience wrapper returning only the boolean result of `checkBracketBalance`.
 *
 * @param str - The string to check for bracket balance
 * @returns True if brackets are balanced, false otherwise
 *
 * @example
 * ```typescript
 * areBracketsBalanced('(hello [world])') // true
 * areBracketsBalanced('(hello [world') // false
 * ```
 */
const areBracketsBalanced = (str) => {
  const { isBalanced: bracketsOk } = checkBracketBalance(str);
  return bracketsOk;
};
* * This is a convenience function that returns only the boolean result * without detailed error information. * * @param str - The string to check for overall balance * @returns True if both quotes and brackets are balanced, false otherwise * * @example * ```typescript * isBalanced('Hello "world" and (test)') // true * isBalanced('Hello "world and (test') // false * ``` */ const isBalanced = (str) => { return checkBalance(str).isBalanced; }; //#endregion //#region src/utils/textUtils.ts const INTAHA_ACTUAL = "اهـ"; /** * Collection of regex patterns used throughout the library for text processing */ const PATTERNS = { arabicCharacters: /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/, arabicDigits: /[0-9\u0660-\u0669]+/, arabicFootnoteReferenceRegex: /^\([\u0660-\u0669]+\)/g, arabicLettersAndDigits: /[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g, arabicPunctuationAndWhitespace: /[\s\u060C\u061B\u061F\u06D4]+/, arabicReferenceRegex: /\([\u0660-\u0669]+\)/g, footnoteEmbedded: /\([0-9\u0660-\u0669]+\)/, footnoteStandalone: /^\(?[0-9\u0660-\u0669]+\)?[،.]?$/, invalidReferenceRegex: /\(\)|\([.1OV9]+\)/g, ocrConfusedFootnoteReferenceRegex: /^\([.1OV9]+\)/g, ocrConfusedReferenceRegex: /\([.1OV9]+\)/g, whitespace: /\s+/ }; /** * Extracts the first sequence of Arabic or Western digits from text. * Used primarily for footnote number comparison to match related footnote elements. * * @param text - Text containing digits to extract * @returns First digit sequence found, or empty string if none found * @example * extractDigits('(٥)أخرجه البخاري') // Returns '٥' * extractDigits('See note (123)') // Returns '123' */ const extractDigits = (text) => { const match = text.match(PATTERNS.arabicDigits); return match ? match[0] : ""; }; /** * Tokenizes text into individual words while preserving special symbols. * Adds spacing around preserved symbols to ensure they are tokenized separately, * then splits on whitespace. 
* * @param text - Text to tokenize * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens * @returns Array of tokens, or empty array if input is empty/whitespace * @example * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world'] */ const tokenizeText = (text, preserveSymbols = []) => { let processedText = text; for (const symbol of preserveSymbols) { const symbolRegex = new RegExp(symbol, "g"); processedText = processedText.replace(symbolRegex, ` ${symbol} `); } return processedText.trim().split(PATTERNS.whitespace).filter(Boolean); }; /** * Handles fusion of standalone and embedded footnotes during token processing. * Detects patterns where standalone footnotes should be merged with embedded ones * or where trailing standalone footnotes should be skipped. * * @param result - Current result array being built * @param previousToken - The previous token in the sequence * @param currentToken - The current token being processed * @returns True if the current token was handled (fused or skipped), false otherwise * @example * // (٥) + (٥)أخرجه → result gets (٥)أخرجه * // (٥)أخرجه + (٥) → (٥) is skipped */ const handleFootnoteFusion = (result, previousToken, currentToken) => { const prevIsStandalone = PATTERNS.footnoteStandalone.test(previousToken); const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken); const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken); const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken); const prevDigits = extractDigits(previousToken); const currDigits = extractDigits(currentToken); if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) { result[result.length - 1] = currentToken; return true; } if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) return true; return false; }; /** * Handles selection logic for tokens with embedded footnotes during alignment. 
* Prefers tokens that contain embedded footnotes over plain text, and among * tokens with embedded footnotes, prefers the shorter one. * * @param tokenA - First token to compare * @param tokenB - Second token to compare * @returns Array containing selected token(s), or null if no special handling needed * @example * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text'] * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text'] */ const handleFootnoteSelection = (tokenA, tokenB) => { const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA); const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB); if (aHasEmbedded && !bHasEmbedded) return [tokenA]; if (bHasEmbedded && !aHasEmbedded) return [tokenB]; if (aHasEmbedded && bHasEmbedded) return [tokenA.length <= tokenB.length ? tokenA : tokenB]; return null; }; /** * Handles selection logic for standalone footnote tokens during alignment. * Manages cases where one or both tokens are standalone footnotes, preserving * both tokens when one is a footnote and the other is regular text. * * @param tokenA - First token to compare * @param tokenB - Second token to compare * @returns Array containing selected token(s), or null if no special handling needed * @example * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text'] * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one) */ const handleStandaloneFootnotes = (tokenA, tokenB) => { const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA); const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB); if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB]; if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA]; if (aIsFootnote && bIsFootnote) return [tokenA.length <= tokenB.length ? tokenA : tokenB]; return null; }; /** * Removes simple footnote references from Arabic text. * Handles footnotes in the format (¬[Arabic numerals]) where ¬ is the not symbol (U+00AC). 
* * @param text - The input text containing footnote references to remove * @returns The text with footnote references removed and extra spaces normalized * * @example * ```typescript * removeFootnoteReferencesSimple("هذا النص (¬١٢٣) يحتوي على حاشية") * // Returns: "هذا النص يحتوي على حاشية" * ``` */ const removeFootnoteReferencesSimple = (text) => { return text.replace(/ ?\(\u00AC[\u0660-\u0669]+\) ?/g, " ").replace(/ +/g, " ").trim(); }; /** * Removes single digit footnote references and extended footnote formats from Arabic text. * Handles footnotes in the format: * - ([single Arabic digit]) - e.g., (٣) * - ([single Arabic digit] [single Arabic letter]) - e.g., (٣ م), (٥ ه), (٧ ب) * * @param text - The input text containing footnote references to remove * @returns The text with footnote references removed and extra spaces normalized * * @example * ```typescript * removeSingleDigitFootnoteReferences("هذا النص (٣) والآخر (٥ م) والثالث (٧ ه) يحتوي على حواشي") * // Returns: "هذا النص والآخر والثالث يحتوي على حواشي" * ``` */ const removeSingleDigitFootnoteReferences = (text) => { return text.replace(/ ?\([٠-٩]{1}(\s+[\u0600-\u06FF])?\) ?/g, " ").replace(/ +/g, " ").trim(); }; /** * Standardizes standalone Hijri symbol ه to هـ when following Arabic digits * @param text - Input text to process * @returns Text with standardized Hijri symbols */ const standardizeHijriSymbol = (text) => { return text.replace(/([0-9\u0660-\u0669])\s*ه(?=\s|$|[^\u0621-\u063A\u0641-\u064A\u0660-\u0669])/gu, "$1 هـ"); }; /** * Standardizes standalone اه to اهـ when appearing as whole word * @param text - Input text to process * @returns Text with standardized AH Hijri symbols */ const standardizeIntahaSymbol = (text) => { return text.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu, `$1${INTAHA_ACTUAL}`); }; //#endregion //#region src/footnotes.ts const INVALID_FOOTNOTE = "()"; /** * Checks if the given text contains invalid footnote references. 
* Invalid footnotes include empty parentheses "()" or OCR-confused characters * like ".1OV9" that were misrecognized instead of Arabic numerals. * * @param text - Text to check for invalid footnote patterns * @returns True if text contains invalid footnote references, false otherwise * @example * hasInvalidFootnotes('This text has ()') // Returns true * hasInvalidFootnotes('This text has (١)') // Returns false * hasInvalidFootnotes('OCR mistake (O)') // Returns true */ const hasInvalidFootnotes = (text) => { return PATTERNS.invalidReferenceRegex.test(text); }; const arabicFormatter = new Intl.NumberFormat("ar-SA"); /** * Converts a number to Arabic-Indic numerals using the Intl.NumberFormat API. * Uses the 'ar-SA' locale to ensure proper Arabic numeral formatting. * * @param num - The number to convert to Arabic numerals * @returns String representation using Arabic-Indic digits (٠-٩) * @example * numberToArabic(123) // Returns '١٢٣' * numberToArabic(5) // Returns '٥' */ const numberToArabic = (num) => { return arabicFormatter.format(num); }; /** * Converts OCR-confused characters to their corresponding Arabic-Indic numerals. * Handles common OCR misrecognitions where Latin characters are mistaken for Arabic digits. * * @param char - Single character that may be an OCR mistake * @returns Corresponding Arabic-Indic numeral or original character if no mapping exists * @example * ocrToArabic('O') // Returns '٥' (O often confused with ٥) * ocrToArabic('1') // Returns '١' (1 often confused with ١) * ocrToArabic('.') // Returns '٠' (dot often confused with ٠) */ const ocrToArabic = (char) => { return { "1": "١", "9": "٩", ".": "٠", O: "٥", o: "٥", V: "٧", v: "٧" }[char] || char; }; /** * Parses Arabic-Indic numerals from a reference string and converts to a JavaScript number. * Removes parentheses and converts each Arabic-Indic digit to its Western equivalent. 
* * @param arabicStr - String containing Arabic-Indic numerals, typically in format '(١٢٣)' * @returns Parsed number, or 0 if parsing fails * @example * arabicToNumber('(١٢٣)') // Returns 123 * arabicToNumber('(٥)') // Returns 5 * arabicToNumber('invalid') // Returns 0 */ const arabicToNumber = (arabicStr) => { const lookup = { "٠": "0", "١": "1", "٢": "2", "٣": "3", "٤": "4", "٥": "5", "٦": "6", "٧": "7", "٨": "8", "٩": "9" }; const digits = arabicStr.replace(/[()]/g, ""); let numStr = ""; for (const char of digits) numStr += lookup[char]; const parsed = parseInt(numStr, 10); return Number.isNaN(parsed) ? 0 : parsed; }; /** * Extracts all footnote references from text lines, categorizing them by type and location. * Handles both Arabic-Indic numerals and OCR-confused characters in body text and footnotes. * * @param lines - Array of text line objects with optional isFootnote flag * @returns Object containing categorized reference arrays: * - bodyReferences: All valid references found in body text * - footnoteReferences: All valid references found in footnotes * - ocrConfusedInBody: OCR-confused references in body text (for tracking) * - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking) * @example * const lines = [ * { text: 'Body with (١) and (O)', isFootnote: false }, * { text: '(١) Footnote text', isFootnote: true } * ]; * const refs = extractReferences(lines); * // refs.bodyReferences contains ['(١)', '(٥)'] - OCR 'O' converted to '٥' */ const extractReferences = (lines) => { const arabicReferencesInBody = lines.filter((b) => !b.isFootnote).flatMap((b) => b.text.match(PATTERNS.arabicReferenceRegex) || []); const ocrConfusedReferencesInBody = lines.filter((b) => !b.isFootnote).flatMap((b) => b.text.match(PATTERNS.ocrConfusedReferenceRegex) || []); const arabicReferencesInFootnotes = lines.filter((b) => b.isFootnote).flatMap((b) => b.text.match(PATTERNS.arabicFootnoteReferenceRegex) || []); const ocrConfusedReferencesInFootnotes = 
lines.filter((b) => b.isFootnote).flatMap((b) => b.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || []); const convertedOcrBodyRefs = ocrConfusedReferencesInBody.map((ref) => ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char))); const convertedOcrFootnoteRefs = ocrConfusedReferencesInFootnotes.map((ref) => ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char))); return { bodyReferences: [...arabicReferencesInBody, ...convertedOcrBodyRefs], footnoteReferences: [...arabicReferencesInFootnotes, ...convertedOcrFootnoteRefs], ocrConfusedInBody: ocrConfusedReferencesInBody, ocrConfusedInFootnotes: ocrConfusedReferencesInFootnotes }; }; /** * Determines if footnote reference correction is needed by checking for: * 1. Invalid footnote patterns (empty parentheses, OCR mistakes) * 2. Mismatched sets of references between body text and footnotes * 3. Different counts of references in body vs footnotes * * @param lines - Array of text line objects to analyze * @param references - Extracted reference data from extractReferences() * @returns True if correction is needed, false if references are already correct * @example * const lines = [{ text: 'Text with ()', isFootnote: false }]; * const refs = extractReferences(lines); * needsCorrection(lines, refs) // Returns true due to invalid "()" reference */ const needsCorrection = (lines, references) => { if (lines.some((line) => hasInvalidFootnotes(line.text))) return true; const bodySet = new Set(references.bodyReferences); const footnoteSet = new Set(references.footnoteReferences); if (bodySet.size !== footnoteSet.size) return true; for (const ref of bodySet) if (!footnoteSet.has(ref)) return true; return false; }; /** * Corrects footnote references in an array of text lines by: * 1. Converting OCR-confused characters to proper Arabic numerals * 2. Filling in empty "()" references with appropriate numbers * 3. Ensuring footnote references in body text match those in footnotes * 4. 
Generating new reference numbers when needed * * @param lines - Array of text line objects, each with optional isFootnote flag * @returns Array of corrected text lines with proper footnote references * @example * const lines = [ * { text: 'Main text with ()', isFootnote: false }, * { text: '() This is a footnote', isFootnote: true } * ]; * const corrected = correctReferences(lines); * // Returns lines with "()" replaced by proper Arabic numerals like "(١)" */ const correctReferences = (lines) => { if (!needsCorrection(lines, extractReferences(lines))) return lines; const sanitizedLines = lines.map((line) => { let updatedText = line.text; updatedText = updatedText.replace(/\([.1OV9]+\)/g, (match) => { return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char)); }); return { ...line, text: updatedText }; }); const cleanReferences = extractReferences(sanitizedLines); const bodyRefSet = new Set(cleanReferences.bodyReferences); const footnoteRefSet = new Set(cleanReferences.footnot