baburchi
Version:
A lightweight TypeScript library designed to fix typos in OCR post-processing.
1,393 lines (1,387 loc) • 96.3 kB
JavaScript
//#region src/utils/sanitize.ts
/**
 * Built-in sanitizer presets, keyed by preset name. Every preset assigns a value to
 * every option flag; `resolveOptions` looks a preset up by name and lets per-call
 * options override individual flags.
 */
const PRESETS = {
// Strongest preset in this table: every normalization flag is on and the output is
// restricted to Arabic letters and spaces (`lettersAndSpacesOnly`).
aggressive: {
collapseWhitespace: true,
keepOnlyArabicLetters: false,
lettersAndSpacesOnly: true,
nfc: true,
normalizeAlif: true,
removeHijriMarker: true,
replaceAlifMaqsurah: true,
replaceTaMarbutahWithHa: true,
stripDiacritics: true,
stripFootnotes: true,
stripLatinAndSymbols: true,
stripTatweel: "all",
stripZeroWidth: true,
trim: true,
zeroWidthToSpace: false
},
// Minimal clean-up: NFC composition, whitespace collapsing, zero-width stripping and
// trimming only. This is the base used when an options object omits `base`.
light: {
collapseWhitespace: true,
keepOnlyArabicLetters: false,
lettersAndSpacesOnly: false,
nfc: true,
normalizeAlif: false,
removeHijriMarker: false,
replaceAlifMaqsurah: false,
replaceTaMarbutahWithHa: false,
stripDiacritics: false,
stripFootnotes: false,
stripLatinAndSymbols: false,
stripTatweel: false,
stripZeroWidth: true,
trim: true,
zeroWidthToSpace: false
},
// Default preset of `sanitizeArabic` / `createArabicSanitizer`. Differs from
// `aggressive` in three flags: ta marbutah is kept, Latin/symbols are kept, and the
// output is not restricted to letters and spaces.
search: {
collapseWhitespace: true,
keepOnlyArabicLetters: false,
lettersAndSpacesOnly: false,
nfc: true,
normalizeAlif: true,
removeHijriMarker: true,
replaceAlifMaqsurah: true,
replaceTaMarbutahWithHa: false,
stripDiacritics: true,
stripFootnotes: true,
stripLatinAndSymbols: false,
stripTatweel: "all",
stripZeroWidth: true,
trim: true,
zeroWidthToSpace: false
}
};
/**
 * Baseline with every transformation switched off. `resolveOptions` selects it when an
 * options object specifies `base: "none"`, so only explicitly enabled flags take effect.
 */
const PRESET_NONE = {
collapseWhitespace: false,
keepOnlyArabicLetters: false,
lettersAndSpacesOnly: false,
nfc: false,
normalizeAlif: false,
removeHijriMarker: false,
replaceAlifMaqsurah: false,
replaceTaMarbutahWithHa: false,
stripDiacritics: false,
stripFootnotes: false,
stripLatinAndSymbols: false,
stripTatweel: false,
stripZeroWidth: false,
trim: false,
zeroWidthToSpace: false
};
// UTF-16 code units used by the sanitizer, written as Unicode code points.
const CHAR_SPACE = 0x0020; // SPACE
const CHAR_TATWEEL = 0x0640; // ARABIC TATWEEL (kashida)
const CHAR_HA = 0x0647; // ARABIC LETTER HEH
const CHAR_YA = 0x064a; // ARABIC LETTER YEH
const CHAR_WAW = 0x0648; // ARABIC LETTER WAW
const CHAR_ALIF = 0x0627; // ARABIC LETTER ALEF
const CHAR_ALIF_MADDA = 0x0622; // ALEF WITH MADDA ABOVE
const CHAR_ALIF_HAMZA_ABOVE = 0x0623; // ALEF WITH HAMZA ABOVE
const CHAR_WAW_HAMZA_ABOVE = 0x0624; // WAW WITH HAMZA ABOVE
const CHAR_ALIF_HAMZA_BELOW = 0x0625; // ALEF WITH HAMZA BELOW
const CHAR_YEH_HAMZA_ABOVE = 0x0626; // YEH WITH HAMZA ABOVE
const CHAR_ALIF_WASLA = 0x0671; // ALEF WASLA
const CHAR_ALIF_MAQSURAH = 0x0649; // ALEF MAKSURA
const CHAR_TA_MARBUTAH = 0x0629; // TEH MARBUTA
const CHAR_MADDA_ABOVE = 0x0653; // combining MADDAH ABOVE
const CHAR_HAMZA_ABOVE_MARK = 0x0654; // combining HAMZA ABOVE
const CHAR_HAMZA_BELOW_MARK = 0x0655; // combining HAMZA BELOW
// Scratch output buffer reused across sanitizer calls; replaced by a larger one on demand.
let sharedBuffer = new Uint16Array(2048);
// Converts the UTF-16 code units collected in the scratch buffer back into a string.
const decoder = new TextDecoder("utf-16le");
/**
 * Tells whether a code unit lies in one of the mark ranges treated as diacritics:
 * U+064B–U+065F, U+0610–U+061A, U+0670 and U+06D6–U+06ED.
 *
 * @param code - UTF-16 code unit to classify.
 * @returns True when the code unit is in one of those ranges.
 */
const isDiacritic = (code) => {
    if (code >= 0x064b && code <= 0x065f) return true;
    if (code >= 0x0610 && code <= 0x061a) return true;
    if (code === 0x0670) return true;
    return code >= 0x06d6 && code <= 0x06ed;
};
/**
 * Tells whether a code unit is one of the invisible formatting characters handled by
 * the zero-width options: U+200B–U+200F, U+202A–U+202E, U+2060–U+2064 or U+FEFF.
 *
 * @param code - UTF-16 code unit to classify.
 * @returns True when the code unit is in one of those ranges.
 */
const isZeroWidth = (code) => {
    if (code === 0xfeff) return true;
    if (code >= 0x200b && code <= 0x200f) return true;
    if (code >= 0x202a && code <= 0x202e) return true;
    return code >= 0x2060 && code <= 0x2064;
};
/**
 * Tells whether a code unit is an ASCII letter (A–Z, a–z) or an ASCII digit (0–9).
 *
 * @param code - UTF-16 code unit to classify.
 * @returns True for ASCII letters and digits only.
 */
const isLatinOrDigit = (code) => {
    const isUpper = code >= 0x41 && code <= 0x5a;
    const isLower = code >= 0x61 && code <= 0x7a;
    const isAsciiDigit = code >= 0x30 && code <= 0x39;
    return isUpper || isLower || isAsciiDigit;
};
/**
 * Tells whether a code unit is one of the symbols the sanitizer treats as noise:
 * `¬` (U+00AC), `§` (U+00A7), backtick, `=`, `&`, or U+FDFA.
 *
 * @param code - UTF-16 code unit to classify.
 * @returns True when the code unit is one of the listed symbols.
 */
const isSymbol = (code) => {
    switch (code) {
        case 0x00ac: // ¬
        case 0x00a7: // §
        case 0x0060: // `
        case 0x003d: // =
        case 0x0026: // &
        case 0xfdfa: // Arabic ligature U+FDFA
            return true;
        default:
            return false;
    }
};
/**
 * Tells whether a code unit counts as an Arabic letter for the letters-only modes:
 * U+0621–U+063A, U+0641–U+064A, U+0671, U+067E, U+0686, U+06A4–U+06AF, U+06CC,
 * U+06D2 and U+06D3.
 *
 * @param code - UTF-16 code unit to classify.
 * @returns True when the code unit is in the accepted letter set.
 */
const isArabicLetter = (code) => {
    if (code >= 0x0621 && code <= 0x063a) return true;
    if (code >= 0x0641 && code <= 0x064a) return true;
    if (code >= 0x06a4 && code <= 0x06af) return true;
    switch (code) {
        case 0x0671:
        case 0x067e:
        case 0x0686:
        case 0x06cc:
        case 0x06d2:
        case 0x06d3:
            return true;
        default:
            return false;
    }
};
/**
 * Tells whether a code unit is a decimal digit in either the ASCII range (0–9) or the
 * Arabic-Indic range (U+0660–U+0669).
 *
 * @param code - The numeric code unit to evaluate.
 * @returns True when the code unit is a digit in either numeral system.
 */
const isDigit = (code) => {
    if (code >= 0x30 && code <= 0x39) return true;
    return code >= 0x0660 && code <= 0x0669;
};
/**
 * Picks the effective value of a boolean flag: the caller's override when one was
 * supplied (coerced to a boolean), otherwise the preset's value unchanged.
 *
 * @param presetValue - The value defined by the preset.
 * @param override - Optional override provided by the caller; `undefined` means "not set".
 * @returns The resolved flag value.
 */
const resolveBoolean = (presetValue, override) => {
    if (override === undefined) return presetValue;
    return Boolean(override);
};
/**
 * Picks the effective tatweel-stripping mode. With no override the preset's mode is
 * used; an override of `true` is shorthand for `"safe"`; any other override
 * (including `false`) is returned as given.
 *
 * @param presetValue - The mode specified by the preset.
 * @param override - Optional override provided by the caller.
 * @returns The resolved tatweel mode.
 */
const resolveTatweelMode = (presetValue, override) => {
    switch (override) {
        case undefined:
            return presetValue;
        case true:
            return "safe";
        default:
            return override;
    }
};
/**
 * Internal sanitization routine: applies every enabled transformation to one string in
 * a single pass over its UTF-16 code units, writing the result into the module-level
 * `sharedBuffer` (which is replaced by a larger buffer when the input does not fit).
 *
 * Per-character steps, in order: whitespace handling, inline NFC composition,
 * zero-width handling, Hijri marker removal, diacritic stripping, tatweel stripping,
 * Latin/symbol noise removal, footnote-reference removal, letters-only filtering, and
 * finally letter normalization (alif variants, alif maqsurah, ta marbutah).
 *
 * @param input - Text to sanitize; any falsy value yields an empty string.
 * @param options - Pre-resolved flags in the shape returned by `resolveOptions`.
 * @returns The sanitized text.
 */
const applySanitization = (input, options) => {
    if (!input) return "";
    const { nfc, stripZW, zwAsSpace, removeHijri, removeDia, tatweelMode, normAlif, maqToYa, taToHa, removeFootnotes, lettersSpacesOnly, stripNoise, lettersOnly, collapseWS, doTrim } = options;
    /**
     * NFC Normalization (Fast Path)
     *
     * `String.prototype.normalize('NFC')` is extremely expensive under high throughput.
     * For Arabic OCR text, the main canonical compositions we care about are:
     * - ا + ◌ٓ (U+0653) → آ
     * - ا + ◌ٔ (U+0654) → أ
     * - ا + ◌ٕ (U+0655) → إ
     * - و + ◌ٔ (U+0654) → ؤ
     * - ي + ◌ٔ (U+0654) → ئ
     *
     * We implement these compositions inline during the main loop, avoiding full NFC
     * normalization in the common case while preserving behavior needed by our sanitizer.
     */
    const text = input;
    const len = text.length;
    // The output never has more code units than the input, so `len` slots always suffice.
    if (len > sharedBuffer.length) sharedBuffer = new Uint16Array(len + 1024);
    const buffer = sharedBuffer;
    let bufIdx = 0;
    let lastWasSpace = false;
    let start = 0;
    // Skip leading whitespace/control characters up front when trimming.
    if (doTrim) while (start < len && text.charCodeAt(start) <= 32) start++;
    for (let i = start; i < len; i++) {
        const code = text.charCodeAt(i);
        // Whitespace and control characters.
        if (code <= 32) {
            if (lettersOnly) continue;
            if (collapseWS) {
                if (!lastWasSpace && bufIdx > 0) {
                    buffer[bufIdx++] = CHAR_SPACE;
                    lastWasSpace = true;
                }
            } else {
                buffer[bufIdx++] = code;
                lastWasSpace = false;
            }
            continue;
        }
        // Inline composition of a combining madda/hamza with the previously emitted letter.
        if (nfc) {
            if (code === CHAR_MADDA_ABOVE || code === CHAR_HAMZA_ABOVE_MARK || code === CHAR_HAMZA_BELOW_MARK) {
                const prevIdx = bufIdx - 1;
                if (prevIdx >= 0) {
                    const prev = buffer[prevIdx];
                    let composed = 0;
                    if (prev === CHAR_ALIF) {
                        if (code === CHAR_MADDA_ABOVE) composed = CHAR_ALIF_MADDA;
                        else if (code === CHAR_HAMZA_ABOVE_MARK) composed = CHAR_ALIF_HAMZA_ABOVE;
                        else composed = CHAR_ALIF_HAMZA_BELOW;
                        // A composed alif variant must still honour alif normalization; otherwise
                        // decomposed input (ا + mark) would survive as آ/أ/إ while the precomposed
                        // form of the same character is folded to a bare alif.
                        if (normAlif) composed = CHAR_ALIF;
                    } else if (code === CHAR_HAMZA_ABOVE_MARK) {
                        if (prev === CHAR_WAW) composed = CHAR_WAW_HAMZA_ABOVE;
                        else if (prev === CHAR_YA) composed = CHAR_YEH_HAMZA_ABOVE;
                    }
                    if (composed !== 0) {
                        buffer[prevIdx] = composed;
                        continue;
                    }
                }
            }
        }
        // Zero-width characters: dropped, or turned into a space when requested.
        if (stripZW && isZeroWidth(code)) {
            if (zwAsSpace) {
                if (collapseWS) {
                    if (!lastWasSpace && bufIdx > 0) {
                        buffer[bufIdx++] = CHAR_SPACE;
                        lastWasSpace = true;
                    }
                } else {
                    buffer[bufIdx++] = CHAR_SPACE;
                    lastWasSpace = false;
                }
            }
            continue;
        }
        // Hijri marker: a heh (optionally followed by a tatweel) that comes right after a
        // digit (whitespace/zero-width characters in between are ignored) and is followed
        // by end of input, whitespace, a symbol, "/" or "-".
        if (removeHijri && code === CHAR_HA) {
            let nextIdx = i + 1;
            if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_TATWEEL) nextIdx++;
            let isBoundary = false;
            if (nextIdx >= len) isBoundary = true;
            else {
                const nextCode = text.charCodeAt(nextIdx);
                if (nextCode <= 32 || isSymbol(nextCode) || nextCode === 47 || nextCode === 45) isBoundary = true;
            }
            if (isBoundary) {
                let backIdx = i - 1;
                while (backIdx >= 0) {
                    const c = text.charCodeAt(backIdx);
                    if (c <= 32 || isZeroWidth(c)) backIdx--;
                    else break;
                }
                if (backIdx >= 0 && isDigit(text.charCodeAt(backIdx))) {
                    // Also consume the tatweel that followed the heh, if any.
                    if (nextIdx > i + 1) i++;
                    continue;
                }
            }
        }
        if (removeDia && isDiacritic(code)) continue;
        if (code === CHAR_TATWEEL) {
            if (tatweelMode === "all") continue;
            if (tatweelMode === "safe") {
                // "safe" keeps a tatweel only when the last non-space output character is a
                // digit or a heh; every other tatweel is dropped.
                let backIdx = bufIdx - 1;
                while (backIdx >= 0 && buffer[backIdx] === CHAR_SPACE) backIdx--;
                if (backIdx >= 0) {
                    const prev = buffer[backIdx];
                    if (isDigit(prev) || prev === CHAR_HA) {} else continue;
                } else continue;
            }
        }
        // Noise removal: Latin letters, ASCII digits and listed symbols become a space, and
        // a run of two or more "/" becomes a single space.
        if (stripNoise && !lettersSpacesOnly && !lettersOnly) {
            if (isLatinOrDigit(code) || isSymbol(code)) {
                if (collapseWS) {
                    if (!lastWasSpace && bufIdx > 0) {
                        buffer[bufIdx++] = CHAR_SPACE;
                        lastWasSpace = true;
                    }
                } else {
                    buffer[bufIdx++] = CHAR_SPACE;
                    lastWasSpace = false;
                }
                continue;
            }
            if (code === 47 && i + 1 < len && text.charCodeAt(i + 1) === 47) {
                while (i + 1 < len && text.charCodeAt(i + 1) === 47) i++;
                if (collapseWS) {
                    if (!lastWasSpace && bufIdx > 0) {
                        buffer[bufIdx++] = CHAR_SPACE;
                        lastWasSpace = true;
                    }
                } else {
                    buffer[bufIdx++] = CHAR_SPACE;
                    lastWasSpace = false;
                }
                continue;
            }
        }
        // Footnote references starting at "(" (code 40); a matched reference becomes a space.
        if (removeFootnotes && !lettersSpacesOnly && !lettersOnly && code === 40) {
            let nextIdx = i + 1;
            if (nextIdx < len && text.charCodeAt(nextIdx) === CHAR_SPACE) nextIdx++;
            if (nextIdx < len) {
                const c1 = text.charCodeAt(nextIdx);
                if (c1 === 172) {
                    // Form 1: "(" [space] "¬" Arabic-Indic digits [space] ")".
                    nextIdx++;
                    let hasDigits = false;
                    while (nextIdx < len) {
                        const c = text.charCodeAt(nextIdx);
                        if (c >= 1632 && c <= 1641) {
                            hasDigits = true;
                            nextIdx++;
                        } else break;
                    }
                    if (hasDigits && nextIdx < len) {
                        if (text.charCodeAt(nextIdx) === 41) {
                            i = nextIdx;
                            if (collapseWS) {
                                if (!lastWasSpace && bufIdx > 0) {
                                    buffer[bufIdx++] = CHAR_SPACE;
                                    lastWasSpace = true;
                                }
                            } else {
                                buffer[bufIdx++] = CHAR_SPACE;
                                lastWasSpace = false;
                            }
                            continue;
                        }
                        if (text.charCodeAt(nextIdx) === CHAR_SPACE) {
                            nextIdx++;
                            if (nextIdx < len && text.charCodeAt(nextIdx) === 41) {
                                i = nextIdx;
                                if (collapseWS) {
                                    if (!lastWasSpace && bufIdx > 0) {
                                        buffer[bufIdx++] = CHAR_SPACE;
                                        lastWasSpace = true;
                                    }
                                } else {
                                    buffer[bufIdx++] = CHAR_SPACE;
                                    lastWasSpace = false;
                                }
                                continue;
                            }
                        }
                    }
                } else if (c1 >= 1632 && c1 <= 1641) {
                    // Form 2: "(" [space] one Arabic-Indic digit, then either ")" or a space,
                    // one character from U+0600–U+06FF, and ")".
                    let tempIdx = nextIdx + 1;
                    let matched = false;
                    if (tempIdx < len) {
                        const c2 = text.charCodeAt(tempIdx);
                        if (c2 === 41) {
                            matched = true;
                            tempIdx++;
                        } else if (c2 === CHAR_SPACE) {
                            tempIdx++;
                            if (tempIdx < len) {
                                const c3 = text.charCodeAt(tempIdx);
                                if (c3 >= 1536 && c3 <= 1791) {
                                    tempIdx++;
                                    if (tempIdx < len && text.charCodeAt(tempIdx) === 41) {
                                        matched = true;
                                        tempIdx++;
                                    }
                                }
                            }
                        }
                    }
                    if (matched) {
                        i = tempIdx - 1;
                        if (collapseWS) {
                            if (!lastWasSpace && bufIdx > 0) {
                                buffer[bufIdx++] = CHAR_SPACE;
                                lastWasSpace = true;
                            }
                        } else {
                            buffer[bufIdx++] = CHAR_SPACE;
                            lastWasSpace = false;
                        }
                        continue;
                    }
                }
            }
        }
        // Letters-only modes: non-letters are dropped (`lettersOnly`) or replaced by a space.
        if (lettersSpacesOnly || lettersOnly) {
            if (!isArabicLetter(code)) {
                if (lettersOnly) continue;
                if (collapseWS) {
                    if (!lastWasSpace && bufIdx > 0) {
                        buffer[bufIdx++] = CHAR_SPACE;
                        lastWasSpace = true;
                    }
                } else {
                    buffer[bufIdx++] = CHAR_SPACE;
                    lastWasSpace = false;
                }
                continue;
            }
            let outCode$1 = code;
            if (normAlif) {
                if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode$1 = CHAR_ALIF;
            }
            if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode$1 = CHAR_YA;
            if (taToHa && code === CHAR_TA_MARBUTAH) outCode$1 = CHAR_HA;
            buffer[bufIdx++] = outCode$1;
            lastWasSpace = false;
            continue;
        }
        // Default path: emit the character after letter normalization.
        let outCode = code;
        if (normAlif) {
            if (code === CHAR_ALIF_MADDA || code === CHAR_ALIF_HAMZA_ABOVE || code === CHAR_ALIF_HAMZA_BELOW || code === CHAR_ALIF_WASLA) outCode = CHAR_ALIF;
        }
        if (maqToYa && code === CHAR_ALIF_MAQSURAH) outCode = CHAR_YA;
        if (taToHa && code === CHAR_TA_MARBUTAH) outCode = CHAR_HA;
        buffer[bufIdx++] = outCode;
        lastWasSpace = false;
    }
    // Trim the produced output rather than relying on `lastWasSpace`: when whitespace is
    // not collapsed, several trailing whitespace units (or leading ones that followed a
    // stripped character) can be present in the buffer.
    let outStart = 0;
    if (doTrim) {
        while (bufIdx > 0 && buffer[bufIdx - 1] <= CHAR_SPACE) bufIdx--;
        while (outStart < bufIdx && buffer[outStart] <= CHAR_SPACE) outStart++;
    }
    if (bufIdx === outStart) return "";
    return decoder.decode(buffer.subarray(outStart, bufIdx));
};
/**
 * Resolves a preset name or an options object into the flat flag set consumed by
 * `applySanitization`. The result can be reused across many strings.
 *
 * - A string selects that preset from `PRESETS`.
 * - An object starts from its `base` preset (`"light"` when omitted, `PRESET_NONE` for
 *   `"none"`) and overrides individual flags with any property that is not `undefined`.
 *
 * @param optionsOrPreset - Preset name or options object.
 * @returns The resolved flags.
 * @throws TypeError when the preset name (or `base`) is not a known preset. Previously
 *   this surfaced as an opaque property-access error, and names that collide with
 *   `Object.prototype` members (e.g. `"toString"`) silently produced `undefined` flags.
 */
const resolveOptions = (optionsOrPreset) => {
    // Own-property lookup so inherited keys are never mistaken for presets.
    const lookupPreset = (name) => {
        if (typeof name !== "string" || !Object.prototype.hasOwnProperty.call(PRESETS, name)) {
            throw new TypeError(`Unknown sanitizer preset: ${String(name)}`);
        }
        return PRESETS[name];
    };
    let preset;
    let opts = null;
    if (typeof optionsOrPreset === "string") preset = lookupPreset(optionsOrPreset);
    else {
        const base = optionsOrPreset.base ?? "light";
        preset = base === "none" ? PRESET_NONE : lookupPreset(base);
        opts = optionsOrPreset;
    }
    return {
        collapseWS: resolveBoolean(preset.collapseWhitespace, opts?.collapseWhitespace),
        doTrim: resolveBoolean(preset.trim, opts?.trim),
        lettersOnly: resolveBoolean(preset.keepOnlyArabicLetters, opts?.keepOnlyArabicLetters),
        lettersSpacesOnly: resolveBoolean(preset.lettersAndSpacesOnly, opts?.lettersAndSpacesOnly),
        maqToYa: resolveBoolean(preset.replaceAlifMaqsurah, opts?.replaceAlifMaqsurah),
        nfc: resolveBoolean(preset.nfc, opts?.nfc),
        normAlif: resolveBoolean(preset.normalizeAlif, opts?.normalizeAlif),
        removeDia: resolveBoolean(preset.stripDiacritics, opts?.stripDiacritics),
        removeFootnotes: resolveBoolean(preset.stripFootnotes, opts?.stripFootnotes),
        removeHijri: resolveBoolean(preset.removeHijriMarker, opts?.removeHijriMarker),
        stripNoise: resolveBoolean(preset.stripLatinAndSymbols, opts?.stripLatinAndSymbols),
        stripZW: resolveBoolean(preset.stripZeroWidth, opts?.stripZeroWidth),
        taToHa: resolveBoolean(preset.replaceTaMarbutahWithHa, opts?.replaceTaMarbutahWithHa),
        tatweelMode: resolveTatweelMode(preset.stripTatweel, opts?.stripTatweel),
        zwAsSpace: resolveBoolean(preset.zeroWidthToSpace, opts?.zeroWidthToSpace)
    };
};
/**
 * Builds a sanitizer bound to one set of options. The options are resolved once, when
 * the sanitizer is created, so the returned function is cheap to call repeatedly.
 *
 * @param optionsOrPreset - Preset name or options object; defaults to `"search"`.
 * @returns A function that sanitizes a single string with the resolved options.
 *
 * @example
 * ```ts
 * const sanitize = createArabicSanitizer('search');
 * const results = texts.map(sanitize);
 * ```
 */
const createArabicSanitizer = (optionsOrPreset = "search") => {
    const frozenFlags = resolveOptions(optionsOrPreset);
    return (input) => {
        return applySanitization(input, frozenFlags);
    };
};
/**
 * Sanitizes one string or an array of strings.
 *
 * A single falsy input returns `""` and an empty array returns `[]`, in both cases
 * without resolving the options. For a non-empty array the options are resolved once
 * and reused for every element.
 *
 * @param input - String, or array of strings, to sanitize.
 * @param optionsOrPreset - Preset name or options object; defaults to `"search"`.
 * @returns The sanitized string, or an array of sanitized strings in input order.
 */
function sanitizeArabic(input, optionsOrPreset = "search") {
    if (!Array.isArray(input)) {
        return input ? applySanitization(input, resolveOptions(optionsOrPreset)) : "";
    }
    const total = input.length;
    if (total === 0) return [];
    const flags = resolveOptions(optionsOrPreset);
    const sanitized = new Array(total);
    let at = 0;
    while (at < total) {
        sanitized[at] = applySanitization(input[at], flags);
        at++;
    }
    return sanitized;
}
//#endregion
//#region src/utils/levenshthein.ts
/**
 * Computes the Levenshtein (edit) distance between two strings: the smallest number of
 * single-unit insertions, deletions and substitutions that turns one into the other.
 * Only two rows of the dynamic-programming table are kept, sized by the shorter string.
 * Strings are compared by index, i.e. per UTF-16 code unit.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Minimum edit distance between the two strings
 * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths
 * @example
 * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
 * calculateLevenshteinDistance('', 'hello') // Returns 5
 */
const calculateLevenshteinDistance = (textA, textB) => {
    if (textA.length === 0) return textB.length;
    if (textB.length === 0) return textA.length;
    // Columns follow the shorter string so each row is as small as possible.
    const aIsShorter = textA.length <= textB.length;
    const columns = aIsShorter ? textA : textB;
    const rows = aIsShorter ? textB : textA;
    const width = columns.length;
    let above = [];
    for (let col = 0; col <= width; col++) above.push(col);
    for (let row = 1; row <= rows.length; row++) {
        const rowChar = rows[row - 1];
        const current = new Array(width + 1);
        current[0] = row;
        for (let col = 1; col <= width; col++) {
            const viaDeletion = above[col] + 1;
            const viaInsertion = current[col - 1] + 1;
            const viaSubstitution = above[col - 1] + (rowChar === columns[col - 1] ? 0 : 1);
            current[col] = Math.min(viaDeletion, viaInsertion, viaSubstitution);
        }
        above = current;
    }
    return above[width];
};
/**
 * Cheap pre-check for the bounded Levenshtein computation.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance the caller cares about.
 * @returns `maxDist + 1` when the length difference alone exceeds the bound; the exact
 *   distance (capped at `maxDist + 1`) when either string is empty; otherwise `null`,
 *   meaning the full computation is required.
 */
const shouldEarlyExit = (a, b, maxDist) => {
    const lenA = a.length;
    const lenB = b.length;
    const overLimit = maxDist + 1;
    if (Math.abs(lenA - lenB) > maxDist) return overLimit;
    if (lenA === 0) {
        return lenB <= maxDist ? lenB : overLimit;
    }
    if (lenB === 0) {
        return lenA <= maxDist ? lenA : overLimit;
    }
    return null;
};
/**
 * Allocates the two rolling rows used by the bounded Levenshtein computation.
 *
 * The rows are `Int32Array`s: 16-bit storage wraps around once an index or distance
 * exceeds 32767, which corrupts results for long strings or large bounds.
 *
 * @param m - Length of the string laid out along the columns.
 * @returns `[prev, curr]`, each of length `m + 1`; `prev[j]` is initialised to `j`
 *   (the cost of building the first `j` characters from nothing) and `curr` is zeroed.
 */
const initializeBoundedArrays = (m) => {
    const prev = new Int32Array(m + 1);
    const curr = new Int32Array(m + 1);
    for (let j = 0; j <= m; j++) prev[j] = j;
    return [prev, curr];
};
/**
 * Computes the inclusive column window that must be evaluated for row `i` of the
 * bounded Levenshtein table: columns within `maxDist` of the diagonal, clamped to
 * the range `1..m`.
 *
 * @param i - 1-based row index.
 * @param maxDist - Largest distance the caller cares about.
 * @param m - Number of columns (length of the column string).
 * @returns The first (`from`) and last (`to`) column of the window.
 */
const getRowBounds = (i, maxDist, m) => {
    const from = Math.max(1, i - maxDist);
    const to = Math.min(m, i + maxDist);
    return { from, to };
};
/**
 * Computes one cell of the bounded Levenshtein table from its three neighbours.
 *
 * @param a - String laid out along the rows.
 * @param b - String laid out along the columns.
 * @param i - 1-based row index.
 * @param j - 1-based column index.
 * @param prev - Previous row of the table.
 * @param curr - Row being filled; `curr[j - 1]` must already be set.
 * @returns The cheapest of deletion, insertion and substitution/match for this cell.
 */
const processBoundedCell = (a, b, i, j, prev, curr) => {
    const sameChar = a[i - 1] === b[j - 1];
    const viaDeletion = prev[j] + 1;
    const viaInsertion = curr[j - 1] + 1;
    const viaSubstitution = prev[j - 1] + (sameChar ? 0 : 1);
    return Math.min(viaDeletion, viaInsertion, viaSubstitution);
};
/**
 * Fills row `i` of the bounded Levenshtein table into `curr`.
 *
 * Columns outside the diagonal window are set to `maxDist + 1` so they can never be
 * part of an accepted path; columns inside the window are computed normally.
 *
 * @param a - String laid out along the rows.
 * @param b - String laid out along the columns.
 * @param i - 1-based row index.
 * @param maxDist - Largest distance the caller cares about.
 * @param prev - Previous row (read only).
 * @param curr - Row to fill (mutated).
 * @returns The smallest value among `curr[0]` and the cells inside the window.
 */
const processBoundedRow = (a, b, i, maxDist, prev, curr) => {
    const columnCount = b.length;
    const outOfBand = maxDist + 1;
    const window = getRowBounds(i, maxDist, columnCount);
    curr[0] = i;
    // Left of the window first: the first in-window cell reads curr[from - 1].
    let col = 1;
    while (col < window.from) {
        curr[col] = outOfBand;
        col++;
    }
    col = window.to + 1;
    while (col <= columnCount) {
        curr[col] = outOfBand;
        col++;
    }
    let smallest = i;
    for (col = window.from; col <= window.to; col++) {
        const cell = processBoundedCell(a, b, i, col, prev, curr);
        curr[col] = cell;
        if (cell < smallest) smallest = cell;
    }
    return smallest;
};
/**
 * Levenshtein distance with an upper bound. Returns the exact distance when it is at
 * most `maxDist`, and `maxDist + 1` otherwise. Stops as soon as a whole row exceeds
 * the bound, which makes it cheaper than the unbounded computation when only small
 * distances matter.
 *
 * @param a - First string.
 * @param b - Second string.
 * @param maxDist - Largest distance the caller cares about.
 * @returns The distance, or `maxDist + 1` when it exceeds `maxDist`.
 */
const boundedLevenshtein = (a, b, maxDist) => {
    const quickAnswer = shouldEarlyExit(a, b, maxDist);
    if (quickAnswer !== null) return quickAnswer;
    // Rows follow the shorter string, columns the longer one.
    const rowText = a.length > b.length ? b : a;
    const colText = a.length > b.length ? a : b;
    const exceeded = maxDist + 1;
    const rows = initializeBoundedArrays(colText.length);
    let above = rows[0];
    let current = rows[1];
    for (let row = 1; row <= rowText.length; row++) {
        const rowMinimum = processBoundedRow(rowText, colText, row, maxDist, above, current);
        if (rowMinimum > maxDist) return exceeded;
        // Swap the rolling rows.
        const finished = current;
        current = above;
        above = finished;
    }
    const distance = above[colText.length];
    return distance <= maxDist ? distance : exceeded;
};
//#endregion
//#region src/utils/similarity.ts
/**
 * Scores used by the token alignment code in this module (higher is better).
 */
const ALIGNMENT_SCORES = {
// Added once for every gap (a token paired with nothing).
GAP_PENALTY: -1,
// Tokens that are neither equal after normalization nor a soft match.
MISMATCH_PENALTY: -2,
// Tokens that are identical after normalization.
PERFECT_MATCH: 2,
// Tokens involving a typo symbol, or similar enough to meet the similarity threshold.
SOFT_MATCH: 1
};
/**
 * Similarity of two strings on a 0.0–1.0 scale: the Levenshtein distance subtracted
 * from the longer string's length, divided by that length. Two empty strings are
 * treated as identical (the divisor falls back to 1).
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
 * @example
 * calculateSimilarity('hello', 'hello') // Returns 1.0
 * calculateSimilarity('hello', 'help') // Returns 0.6
 */
const calculateSimilarity = (textA, textB) => {
    const longerLength = Math.max(textA.length, textB.length);
    const divisor = longerLength || 1;
    const editDistance = calculateLevenshteinDistance(textA, textB);
    return (divisor - editDistance) / divisor;
};
/**
 * Decides whether two texts are similar once both have been passed through
 * `sanitizeArabic` with its default options.
 *
 * @param textA - First text to compare
 * @param textB - Second text to compare
 * @param threshold - Minimum similarity (0.0 to 1.0) required; defaults to 0.6
 * @returns True if the similarity of the normalized texts is at least `threshold`
 * @example
 * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true
 */
const areSimilarAfterNormalization = (textA, textB, threshold = .6) => {
    const cleanA = sanitizeArabic(textA);
    const cleanB = sanitizeArabic(textB);
    const similarity = calculateSimilarity(cleanA, cleanB);
    return similarity >= threshold;
};
/**
 * Scores how well two tokens align.
 *
 * - Tokens that are equal after `sanitizeArabic` normalization get the perfect-match score.
 * - Otherwise, if either raw token is listed in `typoSymbols`, or the normalized tokens
 *   reach `similarityThreshold`, they get the soft-match score.
 * - Everything else gets the mismatch penalty.
 *
 * @param tokenA - First token to score
 * @param tokenB - Second token to score
 * @param typoSymbols - Array of special symbols that get preferential treatment
 * @param similarityThreshold - Threshold for considering tokens highly similar
 * @returns Alignment score (higher is better match)
 * @example
 * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
 * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns -2 (similarity 0.6 is below 0.8)
 */
const calculateAlignmentScore = (tokenA, tokenB, typoSymbols, similarityThreshold) => {
    const cleanA = sanitizeArabic(tokenA);
    const cleanB = sanitizeArabic(tokenB);
    if (cleanA === cleanB) {
        return ALIGNMENT_SCORES.PERFECT_MATCH;
    }
    const involvesTypoSymbol = typoSymbols.includes(tokenA) || typoSymbols.includes(tokenB);
    const closeEnough = calculateSimilarity(cleanA, cleanB) >= similarityThreshold;
    if (involvesTypoSymbol || closeEnough) {
        return ALIGNMENT_SCORES.SOFT_MATCH;
    }
    return ALIGNMENT_SCORES.MISMATCH_PENALTY;
};
/**
 * Reconstructs the alignment recorded in a filled scoring matrix by walking from the
 * bottom-right cell back to the origin, following each cell's `direction`.
 *
 * @param matrix - Scoring matrix whose cells carry a `direction` field
 * @param tokensA - First sequence of tokens (matrix rows)
 * @param tokensB - Second sequence of tokens (matrix columns)
 * @returns Array of aligned token pairs in sequence order, where null indicates a gap
 * @throws Error if a visited cell has a direction other than "diagonal", "left" or "up"
 */
const backtrackAlignment = (matrix, tokensA, tokensB) => {
    const pairs = [];
    let row = tokensA.length;
    let col = tokensB.length;
    while (row > 0 || col > 0) {
        const step = matrix[row][col].direction;
        if (step === "diagonal") {
            row--;
            col--;
            pairs.push([tokensA[row], tokensB[col]]);
        } else if (step === "left") {
            col--;
            pairs.push([null, tokensB[col]]);
        } else if (step === "up") {
            row--;
            pairs.push([tokensA[row], null]);
        } else {
            throw new Error("Invalid alignment direction");
        }
    }
    // Pairs were collected end-to-start.
    pairs.reverse();
    return pairs;
};
/**
 * Builds the `(lengthA + 1) x (lengthB + 1)` scoring matrix for sequence alignment.
 *
 * The first column (below the origin) is seeded with "up" moves and the first row
 * (right of the origin) with "left" moves, each scored as an accumulated gap penalty.
 * Every other cell, including the origin, starts as `{ direction: null, score: 0 }`.
 *
 * @param lengthA - Length of the first token sequence.
 * @param lengthB - Length of the second token sequence.
 * @returns A matrix seeded with gap penalties for alignment.
 */
const initializeScoringMatrix = (lengthA, lengthB) => {
    const matrix = [];
    for (let row = 0; row <= lengthA; row++) {
        const cells = [];
        for (let col = 0; col <= lengthB; col++) {
            if (col === 0 && row > 0) {
                cells.push({ direction: "up", score: row * ALIGNMENT_SCORES.GAP_PENALTY });
            } else if (row === 0 && col > 0) {
                cells.push({ direction: "left", score: col * ALIGNMENT_SCORES.GAP_PENALTY });
            } else {
                cells.push({ direction: null, score: 0 });
            }
        }
        matrix.push(cells);
    }
    return matrix;
};
/**
 * Chooses the move with the highest score for one matrix cell. Ties are resolved in
 * the order diagonal, then up, then left.
 *
 * @param diagonalScore - Score achieved by aligning tokens diagonally.
 * @param upScore - Score achieved by inserting a gap in the second sequence.
 * @param leftScore - Score achieved by inserting a gap in the first sequence.
 * @returns The direction and score that maximize the alignment.
 */
const getBestAlignment = (diagonalScore, upScore, leftScore) => {
    const best = Math.max(diagonalScore, upScore, leftScore);
    let direction = "left";
    if (best === diagonalScore) direction = "diagonal";
    else if (best === upScore) direction = "up";
    return { direction, score: best };
};
/**
 * Globally aligns two token sequences with the Needleman-Wunsch algorithm.
 *
 * Each token is normalized once with `sanitizeArabic`. Token pairs score as a perfect
 * match when their normalized forms are equal, as a soft match when either raw token
 * is a typo symbol or the normalized forms reach `similarityThreshold`, and as a
 * mismatch otherwise. Gaps cost the gap penalty.
 *
 * @param tokensA - First sequence of tokens to align
 * @param tokensB - Second sequence of tokens to align
 * @param typoSymbols - Special symbols that affect scoring
 * @param similarityThreshold - Threshold for high similarity scoring
 * @returns Array of aligned token pairs, with null indicating gaps
 * @example
 * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
 * // Returns [['a', 'a'], ['b', 'c']]
 */
const alignTokenSequences = (tokensA, tokensB, typoSymbols, similarityThreshold) => {
    const rowCount = tokensA.length;
    const colCount = tokensB.length;
    const matrix = initializeScoringMatrix(rowCount, colCount);
    const typoLookup = new Set(typoSymbols);
    const cleanA = tokensA.map((token) => sanitizeArabic(token));
    const cleanB = tokensB.map((token) => sanitizeArabic(token));
    const gap = ALIGNMENT_SCORES.GAP_PENALTY;
    for (let row = 1; row <= rowCount; row++) {
        for (let col = 1; col <= colCount; col++) {
            const left = cleanA[row - 1];
            const right = cleanB[col - 1];
            let pairScore = ALIGNMENT_SCORES.PERFECT_MATCH;
            if (left !== right) {
                const hasTypoSymbol = typoLookup.has(tokensA[row - 1]) || typoLookup.has(tokensB[col - 1]);
                const closeEnough = calculateSimilarity(left, right) >= similarityThreshold;
                pairScore = hasTypoSymbol || closeEnough ? ALIGNMENT_SCORES.SOFT_MATCH : ALIGNMENT_SCORES.MISMATCH_PENALTY;
            }
            const viaDiagonal = matrix[row - 1][col - 1].score + pairScore;
            const viaUp = matrix[row - 1][col].score + gap;
            const viaLeft = matrix[row][col - 1].score + gap;
            const chosen = getBestAlignment(viaDiagonal, viaUp, viaLeft);
            matrix[row][col] = {
                direction: chosen.direction,
                score: chosen.score
            };
        }
    }
    return backtrackAlignment(matrix, tokensA, tokensB);
};
//#endregion
//#region src/alignment.ts
/**
 * Rebuilds lines from text segments by walking the target lines in order.
 *
 * For each truthy target line, `processAlignmentTarget` decides which segment(s) to
 * use and how many were consumed; a non-empty result is appended to the output. For a
 * falsy target line the current segment is copied through unchanged. Processing stops
 * once the segments run out, and any segments left after the last target line are
 * appended as-is.
 *
 * @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment
 * @param segmentLines - Array of text segments that may represent split versions of target lines.
 * @returns Array of aligned text lines
 */
const alignTextSegments = (targetLines, segmentLines) => {
    const output = [];
    let cursor = 0;
    for (const target of targetLines) {
        if (cursor >= segmentLines.length) break;
        if (!target) {
            output.push(segmentLines[cursor]);
            cursor += 1;
            continue;
        }
        const outcome = processAlignmentTarget(target, segmentLines, cursor);
        if (outcome.result) output.push(outcome.result);
        cursor += outcome.segmentsConsumed;
    }
    for (let rest = cursor; rest < segmentLines.length; rest++) {
        output.push(segmentLines[rest]);
    }
    return output;
};
/**
 * Joins two segments with a single space in both possible orders and keeps the order
 * whose normalized form is more similar to the normalized target line. On a tie the
 * forward order (`partA` then `partB`) wins.
 *
 * @param targetLine - The line we are trying to reconstruct.
 * @param partA - The first candidate segment to evaluate.
 * @param partB - The second candidate segment to evaluate.
 * @returns The merged segment that best matches the target line after normalization.
 */
const findBestSegmentMerge = (targetLine, partA, partB) => {
    const forward = `${partA} ${partB}`;
    const backward = `${partB} ${partA}`;
    const cleanTarget = sanitizeArabic(targetLine);
    const forwardScore = calculateSimilarity(cleanTarget, sanitizeArabic(forward));
    const backwardScore = calculateSimilarity(cleanTarget, sanitizeArabic(backward));
    if (forwardScore >= backwardScore) return forward;
    return backward;
};
/**
 * Resolves one target line against the segments starting at `segmentIndex`.
 *
 * - If the current segment is already similar to the target, it is used alone.
 * - Otherwise, if there is no current segment, nothing is consumed and the result is "".
 * - Otherwise, if there is no following segment, the current segment is used alone.
 * - Otherwise the current and following segments are merged in whichever order best
 *   matches the target, consuming both.
 *
 * @param targetLine - The line we are attempting to align to.
 * @param segmentLines - The collection of available text segments.
 * @param segmentIndex - The current index within {@link segmentLines} to consider.
 * @returns An object containing the resulting aligned text and how many segments were consumed.
 */
const processAlignmentTarget = (targetLine, segmentLines, segmentIndex) => {
    const first = segmentLines[segmentIndex];
    if (areSimilarAfterNormalization(targetLine, first)) {
        return { result: first, segmentsConsumed: 1 };
    }
    if (!first) {
        return { result: "", segmentsConsumed: 0 };
    }
    const second = segmentLines[segmentIndex + 1];
    if (!second) {
        return { result: first, segmentsConsumed: 1 };
    }
    const merged = findBestSegmentMerge(targetLine, first, second);
    return { result: merged, segmentsConsumed: 2 };
};
//#endregion
//#region src/balance.ts
/**
 * Checks whether the double quotes (`"`) in a string are balanced.
 *
 * Quotes are considered balanced when their total count is even. When the count is
 * odd, a single error is reported that points at the last quote in the string.
 *
 * @param str - The string to check for quote balance
 * @returns An object containing balance status and any errors found
 *
 * @example
 * ```typescript
 * checkQuoteBalance('Hello "world"') // { errors: [], isBalanced: true }
 * checkQuoteBalance('Hello "world') // { errors: [{ char: '"', index: 6, reason: 'unmatched', type: 'quote' }], isBalanced: false }
 * ```
 */
const checkQuoteBalance = (str) => {
    let total = 0;
    let lastSeenAt = -1;
    let position = 0;
    while (position < str.length) {
        if (str[position] === "\"") {
            total += 1;
            lastSeenAt = position;
        }
        position += 1;
    }
    const balanced = total % 2 === 0;
    const errors = [];
    if (!balanced && lastSeenAt !== -1) {
        errors.push({
            char: "\"",
            index: lastSeenAt,
            reason: "unmatched",
            type: "quote"
        });
    }
    return { errors, isBalanced: balanced };
};
/** Opening bracket → matching closing bracket, for every supported pair. */
const BRACKETS = {
    "«": "»",
    "(": ")",
    "[": "]",
    "{": "}"
};
/** All opening bracket characters (the keys of {@link BRACKETS}). */
const OPEN_BRACKETS = new Set(Object.keys(BRACKETS));
/** All closing bracket characters (the values of {@link BRACKETS}). */
const CLOSE_BRACKETS = new Set(Object.values(BRACKETS));
/**
 * Checks whether the brackets in a string are balanced and reports every problem found.
 *
 * Supported pairs: (), [], {}, «». Opening brackets are tracked on a stack; each closing
 * bracket is compared with the most recent unmatched opening bracket:
 * - no opening bracket available → the closing bracket is reported as `unmatched`;
 * - the pair does not match → both brackets are reported as `mismatched` (the opening
 *   bracket is still removed from the stack);
 * - opening brackets left on the stack at the end are reported as `unclosed`.
 *
 * Errors are listed in the order they are detected, with `unclosed` entries last.
 *
 * @param str - The string to check for bracket balance
 * @returns An object containing balance status and any errors found
 *
 * @example
 * ```typescript
 * checkBracketBalance('(hello [world])') // { errors: [], isBalanced: true }
 * checkBracketBalance('(hello [world)')
 * // errors: '[' at 7 and ')' at 13 (mismatched), then '(' at 0 (unclosed); isBalanced: false
 * checkBracketBalance('(hello ]world[') // { errors: [...], isBalanced: false }
 * ```
 */
const checkBracketBalance = (str) => {
    const problems = [];
    const openStack = [];
    const report = (char, index, reason) => {
        problems.push({ char, index, reason, type: "bracket" });
    };
    for (let position = 0; position < str.length; position++) {
        const symbol = str[position];
        if (OPEN_BRACKETS.has(symbol)) {
            openStack.push({ char: symbol, index: position });
            continue;
        }
        if (!CLOSE_BRACKETS.has(symbol)) continue;
        const opener = openStack.pop();
        if (!opener) {
            report(symbol, position, "unmatched");
        } else if (BRACKETS[opener.char] !== symbol) {
            report(opener.char, opener.index, "mismatched");
            report(symbol, position, "mismatched");
        }
    }
    for (const leftover of openStack) {
        report(leftover.char, leftover.index, "unclosed");
    }
    return {
        errors: problems,
        isBalanced: problems.length === 0
    };
};
/**
 * Runs both the quote check and the bracket check on a string and merges the results.
 *
 * The merged error list is sorted by position in the string; the string is balanced
 * only when both individual checks report it as balanced.
 *
 * @param str - The string to check for overall balance
 * @returns An object containing combined balance status and all errors found, sorted by position
 *
 * @example
 * ```typescript
 * checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
 * checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
 * ```
 */
const checkBalance = (str) => {
    const quotes = checkQuoteBalance(str);
    const brackets = checkBracketBalance(str);
    const merged = [];
    for (const problem of quotes.errors) merged.push(problem);
    for (const problem of brackets.errors) merged.push(problem);
    merged.sort((first, second) => first.index - second.index);
    return {
        errors: merged,
        isBalanced: quotes.isBalanced && brackets.isBalanced
    };
};
/**
 * Collects quote/bracket balance errors for multi-line text, with positions expressed
 * as offsets into the whole text.
 *
 * The text is split on `\n` and each line is checked independently; lines of 10
 * characters or fewer are not checked. Each reported error carries an `absoluteIndex`,
 * computed as the line's starting offset (counting one character per preceding
 * newline) plus the error's index within the line.
 *
 * @param text - The multi-line text to analyze for balance errors
 * @returns Array of character errors with absolute positioning information
 *
 * @example
 * ```typescript
 * const text = 'Line 1 with "quote\nLine 2 with (bracket';
 * const errors = getUnbalancedErrors(text);
 * // Returns errors with absoluteIndex pointing to exact character positions
 * ```
 */
const getUnbalancedErrors = (text) => {
    const collected = [];
    let lineStart = 0;
    for (const line of text.split("\n")) {
        if (line.length > 10) {
            const outcome = checkBalance(line);
            if (!outcome.isBalanced) {
                for (const problem of outcome.errors) {
                    collected.push({
                        absoluteIndex: lineStart + problem.index,
                        char: problem.char,
                        reason: problem.reason,
                        type: problem.type
                    });
                }
            }
        }
        // +1 for the newline that separated this line from the next one.
        lineStart += line.length + 1;
    }
    return collected;
};
/**
* Checks if all double quotes in a string are balanced.
*
* This is a convenience function that returns only the boolean result
* without detailed error information.
*
* @param str - The string to check for quote balance
* @returns True if quotes are balanced, false otherwise
*
* @example
* ```typescript
* areQuotesBalanced('Hello "world"') // true
* areQuotesBalanced('Hello "world') // false
* ```
*/
const areQuotesBalanced = (str) => {
return checkQuoteBalance(str).isBalanced;
};
/**
* Checks if all brackets in a string are properly balanced.
*
* This is a convenience function that returns only the boolean result
* without detailed error information.
*
* @param str - The string to check for bracket balance
* @returns True if brackets are balanced, false otherwise
*
* @example
* ```typescript
* areBracketsBalanced('(hello [world])') // true
* areBracketsBalanced('(hello [world') // false
* ```
*/
const areBracketsBalanced = (str) => {
return checkBracketBalance(str).isBalanced;
};
/**
* Checks if both quotes and brackets are balanced in a string.
*
* This is a convenience function that returns only the boolean result
* without detailed error information.
*
* @param str - The string to check for overall balance
* @returns True if both quotes and brackets are balanced, false otherwise
*
* @example
* ```typescript
* isBalanced('Hello "world" and (test)') // true
* isBalanced('Hello "world and (test') // false
* ```
*/
const isBalanced = (str) => {
return checkBalance(str).isBalanced;
};
//#endregion
//#region src/utils/textUtils.ts
/**
 * Standardized spelling of the standalone "اه" abbreviation: alif, heh and a trailing tatweel.
 * Used as the replacement text by standardizeIntahaSymbol.
 */
const INTAHA_ACTUAL = "اهـ";
/**
 * Collection of regex patterns used throughout the library for text processing.
 *
 * Note: the patterns declared with the `g` flag keep a `lastIndex` between calls when
 * used with `.test()` / `.exec()`. Use them with `String.prototype.match` / `replace`,
 * or reset `lastIndex` before testing.
 */
const PATTERNS = {
// A single character from the Arabic, Arabic Supplement, Arabic Extended-A or Arabic Presentation Forms A/B ranges
arabicCharacters: /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]/,
// A run of Western (0-9) or Arabic-Indic (U+0660-U+0669) digits
arabicDigits: /[0-9\u0660-\u0669]+/,
// A parenthesized Arabic-Indic number at the very start of the text, e.g. "(١٢)..." (global)
arabicFootnoteReferenceRegex: /^\([\u0660-\u0669]+\)/g,
// Runs of Arabic letters (U+0621-U+063A, U+0641-U+064A) and Western/Arabic-Indic digits (global)
arabicLettersAndDigits: /[0-9\u0621-\u063A\u0641-\u064A\u0660-\u0669]+/g,
// Runs of whitespace and/or Arabic comma, semicolon, question mark and full stop
arabicPunctuationAndWhitespace: /[\s\u060C\u061B\u061F\u06D4]+/,
// A parenthesized Arabic-Indic number anywhere in the text (global)
arabicReferenceRegex: /\([\u0660-\u0669]+\)/g,
// A parenthesized Western or Arabic-Indic number anywhere inside a token
footnoteEmbedded: /\([0-9\u0660-\u0669]+\)/,
// A token that consists only of a number, optionally parenthesized and optionally followed by "،" or "."
footnoteStandalone: /^\(?[0-9\u0660-\u0669]+\)?[،.]?$/,
// Empty parentheses "()" or parentheses containing only the characters . 1 O V 9 (global)
invalidReferenceRegex: /\(\)|\([.1OV9]+\)/g,
// Parentheses containing only the characters . 1 O V 9 at the very start of the text (global)
ocrConfusedFootnoteReferenceRegex: /^\([.1OV9]+\)/g,
// Parentheses containing only the characters . 1 O V 9 anywhere in the text (global)
ocrConfusedReferenceRegex: /\([.1OV9]+\)/g,
// One or more whitespace characters
whitespace: /\s+/
};
/**
* Extracts the first sequence of Arabic or Western digits from text.
* Used primarily for footnote number comparison to match related footnote elements.
*
* @param text - Text containing digits to extract
* @returns First digit sequence found, or empty string if none found
* @example
* extractDigits('(٥)أخرجه البخاري') // Returns '٥'
* extractDigits('See note (123)') // Returns '123'
*/
const extractDigits = (text) => {
const match = text.match(PATTERNS.arabicDigits);
return match ? match[0] : "";
};
/**
* Tokenizes text into individual words while preserving special symbols.
* Adds spacing around preserved symbols to ensure they are tokenized separately,
* then splits on whitespace.
*
* @param text - Text to tokenize
* @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
* @returns Array of tokens, or empty array if input is empty/whitespace
* @example
* tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
*/
const tokenizeText = (text, preserveSymbols = []) => {
let processedText = text;
for (const symbol of preserveSymbols) {
const symbolRegex = new RegExp(symbol, "g");
processedText = processedText.replace(symbolRegex, ` ${symbol} `);
}
return processedText.trim().split(PATTERNS.whitespace).filter(Boolean);
};
/**
* Handles fusion of standalone and embedded footnotes during token processing.
* Detects patterns where standalone footnotes should be merged with embedded ones
* or where trailing standalone footnotes should be skipped.
*
* @param result - Current result array being built
* @param previousToken - The previous token in the sequence
* @param currentToken - The current token being processed
* @returns True if the current token was handled (fused or skipped), false otherwise
* @example
* // (٥) + (٥)أخرجه → result gets (٥)أخرجه
* // (٥)أخرجه + (٥) → (٥) is skipped
*/
const handleFootnoteFusion = (result, previousToken, currentToken) => {
const prevIsStandalone = PATTERNS.footnoteStandalone.test(previousToken);
const currHasEmbedded = PATTERNS.footnoteEmbedded.test(currentToken);
const currIsStandalone = PATTERNS.footnoteStandalone.test(currentToken);
const prevHasEmbedded = PATTERNS.footnoteEmbedded.test(previousToken);
const prevDigits = extractDigits(previousToken);
const currDigits = extractDigits(currentToken);
if (prevIsStandalone && currHasEmbedded && prevDigits === currDigits) {
result[result.length - 1] = currentToken;
return true;
}
if (prevHasEmbedded && currIsStandalone && prevDigits === currDigits) return true;
return false;
};
/**
* Handles selection logic for tokens with embedded footnotes during alignment.
* Prefers tokens that contain embedded footnotes over plain text, and among
* tokens with embedded footnotes, prefers the shorter one.
*
* @param tokenA - First token to compare
* @param tokenB - Second token to compare
* @returns Array containing selected token(s), or null if no special handling needed
* @example
* handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
* handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
*/
const handleFootnoteSelection = (tokenA, tokenB) => {
const aHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenA);
const bHasEmbedded = PATTERNS.footnoteEmbedded.test(tokenB);
if (aHasEmbedded && !bHasEmbedded) return [tokenA];
if (bHasEmbedded && !aHasEmbedded) return [tokenB];
if (aHasEmbedded && bHasEmbedded) return [tokenA.length <= tokenB.length ? tokenA : tokenB];
return null;
};
/**
* Handles selection logic for standalone footnote tokens during alignment.
* Manages cases where one or both tokens are standalone footnotes, preserving
* both tokens when one is a footnote and the other is regular text.
*
* @param tokenA - First token to compare
* @param tokenB - Second token to compare
* @returns Array containing selected token(s), or null if no special handling needed
* @example
* handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
* handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
*/
const handleStandaloneFootnotes = (tokenA, tokenB) => {
const aIsFootnote = PATTERNS.footnoteStandalone.test(tokenA);
const bIsFootnote = PATTERNS.footnoteStandalone.test(tokenB);
if (aIsFootnote && !bIsFootnote) return [tokenA, tokenB];
if (bIsFootnote && !aIsFootnote) return [tokenB, tokenA];
if (aIsFootnote && bIsFootnote) return [tokenA.length <= tokenB.length ? tokenA : tokenB];
return null;
};
/**
 * Strips footnote references of the form (¬[Arabic-Indic digits]) from text, where ¬ is
 * the not sign (U+00AC).
 *
 * Each reference, together with at most one space on either side, is replaced by a
 * single space. Afterwards runs of spaces are collapsed and the result is trimmed.
 *
 * @param text - The input text containing footnote references to remove
 * @returns The text without those references and with spaces normalized
 *
 * @example
 * ```typescript
 * removeFootnoteReferencesSimple("هذا النص (¬١٢٣) يحتوي على حاشية")
 * // Returns: "هذا النص يحتوي على حاشية"
 * ```
 */
const removeFootnoteReferencesSimple = (text) => {
  const withoutReferences = text.replace(/ ?\(\u00AC[\u0660-\u0669]+\) ?/g, " ");
  const singleSpaced = withoutReferences.replace(/ +/g, " ");
  return singleSpaced.trim();
};
/**
 * Strips single-digit footnote references, with or without a trailing letter, from Arabic text.
 * Recognized formats:
 * - ([single Arabic-Indic digit]) - e.g., (٣)
 * - ([single Arabic-Indic digit] [single Arabic character]) - e.g., (٣ م), (٥ ه), (٧ ب)
 *
 * Each reference, together with at most one space on either side, is replaced by a
 * single space. Afterwards runs of spaces are collapsed and the result is trimmed.
 *
 * @param text - The input text containing footnote references to remove
 * @returns The text without those references and with spaces normalized
 *
 * @example
 * ```typescript
 * removeSingleDigitFootnoteReferences("هذا النص (٣) والآخر (٥ م) والثالث (٧ ه) يحتوي على حواشي")
 * // Returns: "هذا النص والآخر والثالث يحتوي على حواشي"
 * ```
 */
const removeSingleDigitFootnoteReferences = (text) => {
  const withoutReferences = text.replace(/ ?\([٠-٩]{1}(\s+[\u0600-\u06FF])?\) ?/g, " ");
  const singleSpaced = withoutReferences.replace(/ +/g, " ");
  return singleSpaced.trim();
};
/**
 * Standardizes the standalone Hijri symbol ه to هـ when it follows a Western or
 * Arabic-Indic digit, separated from the digit by exactly one space.
 *
 * The heh must stand alone: it has to be followed by whitespace, the end of the text,
 * or a character that is not an Arabic letter, tatweel, or Arabic-Indic digit.
 * Tatweels already attached to the heh are consumed, so text that is already in the
 * standard form (e.g. "١٤٤٥ هـ") is left unchanged and the function is idempotent.
 *
 * @param text - Input text to process
 * @returns Text with standardized Hijri symbols
 * @example
 * standardizeHijriSymbol('١٤٤٥ه') // Returns '١٤٤٥ هـ'
 * standardizeHijriSymbol('١٤٤٥ هـ') // Returns '١٤٤٥ هـ' (unchanged)
 */
const standardizeHijriSymbol = (text) => {
  // \u0640* swallows existing tatweels; U+0640 is also excluded in the lookahead so a
  // heh followed by tatweel + letters (a stretched word) is not mistaken for the symbol.
  return text.replace(/([0-9\u0660-\u0669])\s*ه\u0640*(?=\s|$|[^\u0621-\u063A\u0640-\u064A\u0660-\u0669])/gu, "$1 هـ");
};
/**
* Standardizes standalone اه to اهـ when appearing as whole word
* @param text - Input text to process
* @returns Text with standardized AH Hijri symbols
*/
const standardizeIntahaSymbol = (text) => {
return text.replace(/(^|\s|[^\u0600-\u06FF])اه(?=\s|$|[^\u0600-\u06FF])/gu, `$1${INTAHA_ACTUAL}`);
};
//#endregion
//#region src/footnotes.ts
/** Literal text of an empty footnote reference: a pair of parentheses with no number inside. */
const INVALID_FOOTNOTE = "()";
/**
* Checks if the given text contains invalid footnote references.
* Invalid footnotes include empty parentheses "()" or OCR-confused characters
* like ".1OV9" that were misrecognized instead of Arabic numerals.
*
* @param text - Text to check for invalid footnote patterns
* @returns True if text contains invalid footnote references, false otherwise
* @example
* hasInvalidFootnotes('This text has ()') // Returns true
* hasInvalidFootnotes('This text has (١)') // Returns false
* hasInvalidFootnotes('OCR mistake (O)') // Returns true
*/
const hasInvalidFootnotes = (text) => {
return PATTERNS.invalidReferenceRegex.test(text);
};
/**
 * Shared formatter that renders numbers with Arabic-Indic digits only.
 * - `numberingSystem: "arab"` pins the digit set instead of relying on the locale default.
 * - `useGrouping: false` suppresses thousands separators, so 1234 becomes "١٢٣٤"
 *   rather than "١٬٢٣٤".
 */
const arabicFormatter = new Intl.NumberFormat("ar-SA", {
  numberingSystem: "arab",
  useGrouping: false
});
/**
 * Converts a number to Arabic-Indic numerals using the Intl.NumberFormat API.
 * The output never contains grouping separators, so it can be used directly as a
 * footnote number.
 *
 * @param num - The number to convert to Arabic numerals
 * @returns String representation using Arabic-Indic digits (٠-٩)
 * @example
 * numberToArabic(123) // Returns '١٢٣'
 * numberToArabic(5) // Returns '٥'
 * numberToArabic(1234) // Returns '١٢٣٤'
 */
const numberToArabic = (num) => {
  return arabicFormatter.format(num);
};
/**
 * Converts OCR-confused characters to their corresponding Arabic-Indic numerals.
 * Handles common OCR misrecognitions where Latin characters are mistaken for Arabic digits.
 *
 * Anything without a mapping is returned unchanged. This includes strings that happen
 * to name built-in object properties such as "constructor".
 *
 * @param char - Single character that may be an OCR mistake
 * @returns Corresponding Arabic-Indic numeral or original character if no mapping exists
 * @example
 * ocrToArabic('O') // Returns '٥' (O often confused with ٥)
 * ocrToArabic('1') // Returns '١' (1 often confused with ١)
 * ocrToArabic('.') // Returns '٠' (dot often confused with ٠)
 */
const ocrToArabic = (char) => {
  // A switch avoids allocating a lookup object per call and cannot fall through to
  // inherited Object.prototype members.
  switch (char) {
    case "1":
      return "١";
    case "9":
      return "٩";
    case ".":
      return "٠";
    case "O":
    case "o":
      return "٥";
    case "V":
    case "v":
      return "٧";
    default:
      return char;
  }
};
/**
 * Parses Arabic-Indic numerals from a reference string and converts to a JavaScript number.
 * Parentheses are removed first. What remains must consist solely of Arabic-Indic
 * digits (٠-٩); otherwise parsing is considered failed.
 *
 * @param arabicStr - String containing Arabic-Indic numerals, typically in format '(١٢٣)'
 * @returns Parsed number, or 0 if parsing fails (empty input or any other character present)
 * @example
 * arabicToNumber('(١٢٣)') // Returns 123
 * arabicToNumber('(٥)') // Returns 5
 * arabicToNumber('invalid') // Returns 0
 * arabicToNumber('(١x٢)') // Returns 0
 */
const arabicToNumber = (arabicStr) => {
  const digits = arabicStr.replace(/[()]/g, "");
  // Reject up front instead of letting unknown characters leak into the numeric string.
  if (!/^[\u0660-\u0669]+$/.test(digits)) return 0;
  let numStr = "";
  // U+0660 is ARABIC-INDIC DIGIT ZERO; the ten digits are contiguous code points.
  for (const char of digits) numStr += String(char.charCodeAt(0) - 0x0660);
  return parseInt(numStr, 10);
};
/**
* Extracts all footnote references from text lines, categorizing them by type and location.
* Handles both Arabic-Indic numerals and OCR-confused characters in body text and footnotes.
*
* @param lines - Array of text line objects with optional isFootnote flag
* @returns Object containing categorized reference arrays:
* - bodyReferences: All valid references found in body text
* - footnoteReferences: All valid references found in footnotes
* - ocrConfusedInBody: OCR-confused references in body text (for tracking)
* - ocrConfusedInFootnotes: OCR-confused references in footnotes (for tracking)
* @example
* const lines = [
* { text: 'Body with (١) and (O)', isFootnote: false },
* { text: '(١) Footnote text', isFootnote: true }
* ];
* const refs = extractReferences(lines);
* // refs.bodyReferences contains ['(١)', '(٥)'] - OCR 'O' converted to '٥'
*/
const extractReferences = (lines) => {
const arabicReferencesInBody = lines.filter((b) => !b.isFootnote).flatMap((b) => b.text.match(PATTERNS.arabicReferenceRegex) || []);
const ocrConfusedReferencesInBody = lines.filter((b) => !b.isFootnote).flatMap((b) => b.text.match(PATTERNS.ocrConfusedReferenceRegex) || []);
const arabicReferencesInFootnotes = lines.filter((b) => b.isFootnote).flatMap((b) => b.text.match(PATTERNS.arabicFootnoteReferenceRegex) || []);
const ocrConfusedReferencesInFootnotes = lines.filter((b) => b.isFootnote).flatMap((b) => b.text.match(PATTERNS.ocrConfusedFootnoteReferenceRegex) || []);
const convertedOcrBodyRefs = ocrConfusedReferencesInBody.map((ref) => ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)));
const convertedOcrFootnoteRefs = ocrConfusedReferencesInFootnotes.map((ref) => ref.replace(/[.1OV9]/g, (char) => ocrToArabic(char)));
return {
bodyReferences: [...arabicReferencesInBody, ...convertedOcrBodyRefs],
footnoteReferences: [...arabicReferencesInFootnotes, ...convertedOcrFootnoteRefs],
ocrConfusedInBody: ocrConfusedReferencesInBody,
ocrConfusedInFootnotes: ocrConfusedReferencesInFootnotes
};
};
/**
* Determines if footnote reference correction is needed by checking for:
* 1. Invalid footnote patterns (empty parentheses, OCR mistakes)
* 2. Mismatched sets of references between body text and footnotes
* 3. Different counts of references in body vs footnotes
*
* @param lines - Array of text line objects to analyze
* @param references - Extracted reference data from extractReferences()
* @returns True if correction is needed, false if references are already correct
* @example
* const lines = [{ text: 'Text with ()', isFootnote: false }];
* const refs = extractReferences(lines);
* needsCorrection(lines, refs) // Returns true due to invalid "()" reference
*/
const needsCorrection = (lines, references) => {
if (lines.some((line) => hasInvalidFootnotes(line.text))) return true;
const bodySet = new Set(references.bodyReferences);
const footnoteSet = new Set(references.footnoteReferences);
if (bodySet.size !== footnoteSet.size) return true;
for (const ref of bodySet) if (!footnoteSet.has(ref)) return true;
return false;
};
/**
* Corrects footnote references in an array of text lines by:
* 1. Converting OCR-confused characters to proper Arabic numerals
* 2. Filling in empty "()" references with appropriate numbers
* 3. Ensuring footnote references in body text match those in footnotes
* 4. Generating new reference numbers when needed
*
* @param lines - Array of text line objects, each with optional isFootnote flag
* @returns Array of corrected text lines with proper footnote references
* @example
* const lines = [
* { text: 'Main text with ()', isFootnote: false },
* { text: '() This is a footnote', isFootnote: true }
* ];
* const corrected = correctReferences(lines);
* // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
*/
const correctReferences = (lines) => {
if (!needsCorrection(lines, extractReferences(lines))) return lines;
const sanitizedLines = lines.map((line) => {
let updatedText = line.text;
updatedText = updatedText.replace(/\([.1OV9]+\)/g, (match) => {
return match.replace(/[.1OV9]/g, (char) => ocrToArabic(char));
});
return {
...line,
text: updatedText
};
});
const cleanReferences = extractReferences(sanitizedLines);
const bodyRefSet = new Set(cleanReferences.bodyReferences);
const footnoteRefSet = new Set(cleanReferences.footnot