baburchi
Version:
A lightweight TypeScript library designed to fix typos in OCR post-processing.
814 lines • 36.9 kB
TypeScript
//#region src/alignment.d.ts
/**
* Aligns split text segments to match target lines by finding the best order.
*
* This function handles cases where text lines have been split into segments
* and need to be merged back together in the correct order. It compares
* different arrangements of the segments against target lines to find the
* best match based on similarity scores.
*
* @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment.
* With the declared `string[]` type, the only falsy element that type-checks under `strictNullChecks`
* is the empty string `''`.
* @param segmentLines - Array of text segments that may represent split versions of target lines.
* @returns Array of aligned text lines
*/
declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
//#endregion
//#region src/balance.d.ts
/**
* Represents an error found when checking balance of quotes or brackets in text.
*/
type BalanceError = {
/** The character that caused the error */
char: string;
/** The position of the character in the string */
index: number;
/** The reason for the error */
reason: 'mismatched' | 'unclosed' | 'unmatched';
/** The type of character that caused the error */
type: 'bracket' | 'quote';
};
/**
* Result of a balance check operation.
*/
type BalanceResult = {
/** Array of errors found during balance checking */
errors: BalanceError[];
/** Whether the text is properly balanced */
isBalanced: boolean;
};
/**
* Mapping of opening brackets to their corresponding closing brackets.
*
* The keys are the four opening characters: `«` (U+00AB, left-pointing double angle
* quotation mark, written here as the escape `'\u00AB'`), `(`, `[` and `{`.
* The values are typed as plain `string`, so the exact closing characters are not
* visible in this declaration.
*/
declare const BRACKETS: {
'\u00AB': string;
'(': string;
'[': string;
'{': string;
};
/** Set of all opening bracket characters */
declare const OPEN_BRACKETS: Set<string>;
/** Set of all closing bracket characters */
declare const CLOSE_BRACKETS: Set<string>;
/**
* Checks if both quotes and brackets are balanced in a string and returns detailed error information.
*
* This function combines the results of both quote and bracket balance checking,
* providing a comprehensive analysis of all balance issues in the text.
* The errors are sorted by their position in the string for easier debugging.
*
* @param str - The string to check for overall balance
* @returns An object containing combined balance status and all errors found, sorted by position
*
* @example
* ```typescript
* checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
* checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
* ```
*/
declare const checkBalance: (str: string) => BalanceResult;
/**
* A balance error located by its absolute character position, for use with HighlightableTextarea.
*
* This interface is a standalone declaration (it has no `extends` clause). It carries the same
* `char`, `reason` and `type` fields as `BalanceError`, but replaces the `index` field with
* `absoluteIndex`, which positions the character across multiple lines of text. That makes it
* suitable for text editors and syntax highlighters that need precise character positioning.
*/
interface CharacterError {
/** Absolute character position from the start of the entire text */
absoluteIndex: number;
/** The character that caused the error */
char: string;
/** The reason for the error */
reason: 'mismatched' | 'unclosed' | 'unmatched';
/** The type of character that caused the error */
type: 'bracket' | 'quote';
}
/**
* Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
*
* This function processes text line by line, but only checks lines longer than 10 characters
* for balance issues. It returns absolute positions that can be used with text editors
* or highlighting components that need precise character positioning across the entire text.
*
* The absolute index accounts for newline characters between lines, providing accurate
* positioning for the original text string.
*
* @param text - The multi-line text to analyze for balance errors
* @returns Array of character errors with absolute positioning information
*
* @example
* ```typescript
* const text = 'Line 1 with "quote\nLine 2 with (bracket';
* const errors = getUnbalancedErrors(text);
* // Returns errors with absoluteIndex pointing to exact character positions
* ```
*/
declare const getUnbalancedErrors: (text: string) => CharacterError[];
/**
* Checks if all double quotes in a string are balanced.
*
* This is a convenience function that returns only the boolean result
* without detailed error information.
*
* @param str - The string to check for quote balance
* @returns True if quotes are balanced, false otherwise
*
* @example
* ```typescript
* areQuotesBalanced('Hello "world"') // true
* areQuotesBalanced('Hello "world') // false
* ```
*/
declare const areQuotesBalanced: (str: string) => boolean;
/**
* Checks if all brackets in a string are properly balanced.
*
* This is a convenience function that returns only the boolean result
* without detailed error information.
*
* @param str - The string to check for bracket balance
* @returns True if brackets are balanced, false otherwise
*
* @example
* ```typescript
* areBracketsBalanced('(hello [world])') // true
* areBracketsBalanced('(hello [world') // false
* ```
*/
declare const areBracketsBalanced: (str: string) => boolean;
/**
* Checks if both quotes and brackets are balanced in a string.
*
* This is a convenience function that returns only the boolean result
* without detailed error information.
*
* @param str - The string to check for overall balance
* @returns True if both quotes and brackets are balanced, false otherwise
*
* @example
* ```typescript
* isBalanced('Hello "world" and (test)') // true
* isBalanced('Hello "world and (test') // false
* ```
*/
declare const isBalanced: (str: string) => boolean;
//#endregion
//#region src/footnotes.d.ts
/**
* Checks if the given text contains invalid footnote references.
* Invalid footnotes include empty parentheses "()" or OCR-confused characters
* like ".1OV9" that were misrecognized instead of Arabic numerals.
*
* @param text - Text to check for invalid footnote patterns
* @returns True if text contains invalid footnote references, false otherwise
* @example
* hasInvalidFootnotes('This text has ()') // Returns true
* hasInvalidFootnotes('This text has (١)') // Returns false
* hasInvalidFootnotes('OCR mistake (O)') // Returns true
*/
declare const hasInvalidFootnotes: (text: string) => boolean;
/**
* A single line of text as accepted by `correctReferences`.
*/
type TextLine = {
/** Optional flag marking this line as a footnote line; may be omitted. */
isFootnote?: boolean;
/** The text content of the line. */
text: string;
};
/**
* Corrects footnote references in an array of text lines by:
* 1. Converting OCR-confused characters to proper Arabic numerals
* 2. Filling in empty "()" references with appropriate numbers
* 3. Ensuring footnote references in body text match those in footnotes
* 4. Generating new reference numbers when needed
*
* The function is generic over the line type `T` (any type assignable to `TextLine`), and the
* result is typed as `T[]`, so extra properties on the caller's line objects remain visible
* in the return type.
*
* NOTE(review): this declaration does not show whether the input array or its line objects
* are mutated — confirm against the implementation before reusing `lines` afterwards.
*
* @param lines - Array of text line objects, each with optional isFootnote flag
* @returns Array of corrected text lines with proper footnote references
* @example
* const lines = [
* { text: 'Main text with ()', isFootnote: false },
* { text: '() This is a footnote', isFootnote: true }
* ];
* const corrected = correctReferences(lines);
* // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
*/
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
//#endregion
//#region src/types.d.ts
/**
* Configuration options for fixing typos in OCR text using alignment algorithms.
* These options control how text tokens are compared, aligned, and merged during typo correction.
*
* All three fields are required in this type, so `processTextAlignment` callers must supply
* every value. `fixTypo` accepts a `Partial` of this type (with `typoSymbols` still required);
* the `@default` values below presumably apply only through that wrapper — confirm against
* the implementation.
*/
type FixTypoOptions = {
/**
* High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
* Used in post-processing to eliminate redundant tokens that are nearly identical.
* Should typically be higher than similarityThreshold to catch only very similar duplicates.
* @default 0.9
* @example 0.95 // Removes tokens that are 95% or more similar
*/
readonly highSimilarityThreshold: number;
/**
* Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
* Higher values require closer matches, lower values are more permissive.
* Used in the Needleman-Wunsch alignment algorithm for token matching.
* @default 0.7
* @example 0.8 // Requires 80% similarity for token alignment
*/
readonly similarityThreshold: number;
/**
* Array of special symbols that should be preserved during typo correction.
* These symbols (like honorifics or religious markers) take precedence in token selection.
* No default is documented; this field stays required even in `fixTypo`.
* @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
*/
readonly typoSymbols: string[];
};
/**
* Optional tuning settings for `findMatches` and `findMatchesAll`.
* Every field may be omitted; each field comment states its documented default.
*/
type MatchPolicy = {
/** Try approximate matches for leftovers (default true). */
enableFuzzy?: boolean;
/** Max absolute edit distance accepted in fuzzy (default 3). */
maxEditAbs?: number;
/** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
maxEditRel?: number;
/** q-gram length for candidate generation (default 4). */
q?: number;
/** Max rare grams to seed candidates per excerpt (default 5). */
gramsPerExcerpt?: number;
/** Max candidate windows verified per excerpt (default 40). */
maxCandidatesPerExcerpt?: number;
/** Seam length for bleed windows (default 512). */
seamLen?: number;
/**
* Optional logging function for debugging.
* Takes an optional message followed by any number of extra arguments and returns nothing,
* so a `console.log`-style function can be passed directly.
*/
log?(message?: any, ...optionalParams: any[]): void;
};
//#endregion
//#region src/fuzzy.d.ts
/**
* Main function to find the single best match per excerpt.
* Combines exact matching with fuzzy matching for comprehensive text search.
*
* @param pages - Array of page texts to search within
* @param excerpts - Array of text excerpts to find matches for
* @param policy - Optional matching policy configuration
* @returns Array of page indices (one per excerpt, -1 if no match found)
*
* @example
* ```typescript
* const pages = ['Hello world', 'Goodbye world'];
* const excerpts = ['Hello', 'Good bye']; // Note the typo
* const matches = findMatches(pages, excerpts, { enableFuzzy: true });
* // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
* ```
*/
declare function findMatches(pages: string[], excerpts: string[], policy?: MatchPolicy): number[];
/**
* Main function to find all matches per excerpt, ranked by quality.
* Returns comprehensive results with both exact and fuzzy matches for each excerpt.
*
* NOTE(review): the summary and `@returns` say results are ranked by match quality, while the
* example below describes them as sorted by page order. Presumably page order is the tie-breaker
* among equally good matches — confirm against the implementation.
*
* @param pages - Array of page texts to search within
* @param excerpts - Array of text excerpts to find matches for
* @param policy - Optional matching policy configuration
* @returns Array of page index arrays (one array per excerpt, sorted by match quality)
*
* @example
* ```typescript
* const pages = ['Hello world', 'Hello there', 'Goodbye world'];
* const excerpts = ['Hello'];
* const matches = findMatchesAll(pages, excerpts);
* // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
* ```
*/
declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
//#endregion
//#region src/noise.d.ts
/**
* Character statistics for analyzing text content and patterns
*/
type CharacterStats = {
/** Number of Arabic script characters in the text */
arabicCount: number;
/** Map of character frequencies for repetition analysis */
charFreq: Map<string, number>;
/** Number of digit characters (0-9) in the text */
digitCount: number;
/** Number of Latin alphabet characters (a-z, A-Z) in the text */
latinCount: number;
/** Number of punctuation characters in the text */
punctuationCount: number;
/** Number of whitespace characters in the text */
spaceCount: number;
/** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
symbolCount: number;
};
/**
* Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
* This function performs comprehensive analysis to identify patterns commonly associated
* with OCR errors, formatting artifacts, or meaningless content in Arabic text processing.
*
* @param text - The input string to analyze for noise patterns
* @returns true if the text is likely noise or unwanted content, false if it appears to be valid Arabic content
*
* @example
* ```typescript
* import { isArabicTextNoise } from 'baburchi';
*
* console.log(isArabicTextNoise('---')); // true (formatting artifact)
* console.log(isArabicTextNoise('السلام عليكم')); // false (valid Arabic)
* console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)
* ```
*/
declare const isArabicTextNoise: (text: string) => boolean;
/**
* Analyzes character composition and frequency statistics for the input text.
* Categorizes characters by type (Arabic, Latin, digits, spaces, punctuation, symbols)
* and tracks character frequency for pattern analysis.
*
* @param text - The text string to analyze
* @returns CharacterStats object containing detailed character analysis
*
* @example
* ```typescript
* import { analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats('مرحبا 123!');
* console.log(stats.arabicCount); // 5
* console.log(stats.digitCount); // 3
* console.log(stats.symbolCount); // 1
* ```
*/
declare function analyzeCharacterStats(text: string): CharacterStats;
/**
* Detects excessive repetition of specific characters that commonly indicate noise.
* Focuses on repetitive characters like exclamation marks, dots, dashes, equals signs,
* and underscores that often appear in OCR artifacts or formatting elements.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param textLength - Total length of the original text
* @returns true if excessive repetition is detected, false otherwise
*
* @example
* ```typescript
* import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats('!!!!!');
* console.log(hasExcessiveRepetition(stats, 5)); // true
*
* const normalStats = analyzeCharacterStats('hello world');
* console.log(hasExcessiveRepetition(normalStats, 11)); // false
* ```
*/
declare function hasExcessiveRepetition(charStats: CharacterStats, textLength: number): boolean;
/**
* Identifies text that matches common noise patterns using regular expressions.
* Detects patterns like repeated dashes, dot sequences, uppercase-only text,
* digit-dash combinations, and other formatting artifacts commonly found in OCR output.
*
* @param text - The text string to check against noise patterns
* @returns true if the text matches a basic noise pattern, false otherwise
*
* @example
* ```typescript
* import { isBasicNoisePattern } from 'baburchi';
*
* console.log(isBasicNoisePattern('---')); // true
* console.log(isBasicNoisePattern('...')); // true
* console.log(isBasicNoisePattern('ABC')); // true
* console.log(isBasicNoisePattern('- 77')); // true
* console.log(isBasicNoisePattern('hello world')); // false
* ```
*/
declare function isBasicNoisePattern(text: string): boolean;
/**
* Determines if non-Arabic content should be classified as noise based on various heuristics.
* Analyzes symbol-to-content ratios, text length, spacing patterns, and content composition
* to identify unwanted OCR artifacts or meaningless content.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param textLength - Total length of the original text
* @param text - The original text string for additional pattern matching
* @returns true if the content is likely noise, false if it appears to be valid content
*
* @example
* ```typescript
* import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats('!!!');
* console.log(isNonArabicNoise(stats, 3, '!!!')); // true
*
* const validStats = analyzeCharacterStats('2023');
* console.log(isNonArabicNoise(validStats, 4, '2023')); // false
* ```
*/
declare function isNonArabicNoise(charStats: CharacterStats, textLength: number, text: string): boolean;
/**
* Detects problematic spacing patterns that indicate noise or OCR artifacts.
* Identifies cases where spacing is excessive relative to content, or where
* single characters are surrounded by spaces in a way that suggests OCR errors.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param contentChars - Number of meaningful content characters (Arabic + Latin + digits)
* @param textLength - Total length of the original text
* @returns true if spacing patterns indicate noise, false otherwise
*
* @example
* ```typescript
* import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats(' a ');
* const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
* console.log(isSpacingNoise(stats, contentChars, 3)); // true
*
* const normalStats = analyzeCharacterStats('hello world');
* const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;
* console.log(isSpacingNoise(normalStats, normalContent, 11)); // false
* ```
*/
declare function isSpacingNoise(charStats: CharacterStats, contentChars: number, textLength: number): boolean;
/**
* Validates whether Arabic content is substantial enough to be considered meaningful.
* Uses character counts and text length to determine if Arabic text contains
* sufficient content or if it's likely to be a fragment or OCR artifact.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param textLength - Total length of the original text
* @returns true if the Arabic content appears valid, false if it's likely noise
*
* @example
* ```typescript
* import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
*
* const validStats = analyzeCharacterStats('السلام عليكم');
* console.log(isValidArabicContent(validStats, 12)); // true
*
* const shortStats = analyzeCharacterStats('ص');
* console.log(isValidArabicContent(shortStats, 1)); // false
*
* const withDigitsStats = analyzeCharacterStats('ص 5');
* console.log(isValidArabicContent(withDigitsStats, 3)); // true
* ```
*/
declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
//#endregion
//#region src/typos.d.ts
/**
* Processes text alignment between original and alternate OCR results to fix typos.
* Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
* then selects the best tokens and performs post-processing.
*
* @param originalText - Original OCR text that may contain typos
* @param altText - Reference text from alternate OCR for comparison
* @param options - Configuration options for alignment and selection
* @returns Corrected text with typos fixed
*/
declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
/**
* Convenience wrapper around {@link processTextAlignment} that accepts partial options.
*
* @param original - The source text that may contain typographical errors.
* @param correction - The reference text used to correct the {@link original} text.
* @param options - Partial typo correction options combined with required typo symbols.
* @returns The corrected text generated from the alignment process.
*/
declare const fixTypo: (original: string, correction: string, {
highSimilarityThreshold,
similarityThreshold,
typoSymbols
}: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
//#endregion
//#region src/utils/levenshthein.d.ts
/**
* Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
* The Levenshtein distance is the minimum number of single-character edits (insertions,
* deletions, or substitutions) required to change one string into another.
*
* @param textA - First string to compare
* @param textB - Second string to compare
* @returns Minimum edit distance between the two strings
* @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths
* @example
* calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
* calculateLevenshteinDistance('', 'hello') // Returns 5
*/
declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
/**
* Calculates bounded Levenshtein distance with early termination.
* More efficient when you only care about distances up to a threshold.
*
* @param a - First string to compare
* @param b - Second string to compare
* @param maxDist - The threshold: the largest edit distance the caller cares about
* @returns The edit distance as a number. NOTE(review): the value returned once the true
* distance exceeds `maxDist` is not visible in this declaration (presumably some value greater
* than `maxDist`) — confirm against the implementation before relying on it.
*/
declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
//#endregion
//#region src/utils/sanitize.d.ts
/**
* Module overview: ultra-fast Arabic text sanitizer for search/indexing/display.
* Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
* Options can merge over a base preset or `'none'` to apply exactly the rules you request.
*/
/** Names of the built-in presets accepted by `sanitizeArabic` and `createArabicSanitizer`. */
type SanitizePreset = 'light' | 'search' | 'aggressive';
/**
* Value for `SanitizeOptions.base`: either a preset name to overlay options on,
* or `'none'` so that only the explicitly specified options apply.
*/
type SanitizeBase = 'none' | SanitizePreset;
/**
* Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
* `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
* directly into local booleans for speed.
*/
type SanitizeOptions = {
/** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
base?: SanitizeBase;
/**
* NFC normalization (fast-path).
*
* For performance, this sanitizer avoids calling `String.prototype.normalize('NFC')` and instead
* applies the key Arabic canonical compositions inline (hamza/madda combining marks).
* This preserves the NFC behavior that matters for typical Arabic OCR text while keeping throughput high.
*
* Default: `true` in all presets.
*/
nfc?: boolean;
/** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
stripZeroWidth?: boolean;
/** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
zeroWidthToSpace?: boolean;
/** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
stripDiacritics?: boolean;
/** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
stripFootnotes?: boolean;
/**
* Remove tatweel (ـ).
* - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
* - `'safe'` or `'all'` explicitly
* - `false` to keep tatweel
* Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
*/
stripTatweel?: boolean | 'safe' | 'all';
/** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
normalizeAlif?: boolean;
/** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
replaceAlifMaqsurah?: boolean;
/** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
replaceTaMarbutahWithHa?: boolean;
/** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
stripLatinAndSymbols?: boolean;
/** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
keepOnlyArabicLetters?: boolean;
/** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
lettersAndSpacesOnly?: boolean;
/** Collapse runs of whitespace to a single space. Default: `true`. */
collapseWhitespace?: boolean;
/** Trim leading/trailing whitespace. Default: `true`. */
trim?: boolean;
/**
* Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
* (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
* Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
*/
removeHijriMarker?: boolean;
};
/**
* Creates a reusable sanitizer function with pre-resolved options.
* Use this when you need to sanitize many strings with the same options
* for maximum performance.
*
* @param optionsOrPreset - A preset name (`'light'`, `'search'`, `'aggressive'`) or a
* `SanitizeOptions` object. NOTE(review): the behavior when this argument is omitted is not
* visible in this declaration — confirm against the implementation.
* @returns A function that takes a single string and returns the sanitized string.
*
* @example
* ```ts
* const sanitize = createArabicSanitizer('search');
* const results = texts.map(sanitize);
* ```
*/
declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | SanitizeOptions) => ((input: string) => string);
/**
* Sanitizes Arabic text according to a preset or custom options.
*
* Presets:
* - `'light'`: NFC, zero-width removal, collapse/trim spaces.
* - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
* - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
*
* Custom options:
* - Passing an options object overlays the selected `base` preset (default `'light'`).
* - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
*
* **Batch processing**: Pass an array of strings for optimized batch processing.
* Options are resolved once and applied to all strings, providing significant
* performance gains over calling the function in a loop.
*
* Examples:
* ```ts
* sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
* sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
* sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
*
* // Batch processing (optimized):
* sanitizeArabic(['text1', 'text2', 'text3'], 'search'); // ['result1', 'result2', 'result3']
* ```
*
* NOTE(review): the `'aggressive'` example above keeps the digits, whereas
* `SanitizeOptions.lettersAndSpacesOnly` (documented as on by default in `'aggressive'`) is
* described as dropping digits. One of the two descriptions is presumably inaccurate —
* confirm against the implementation.
*
* @param input - A single string, or an array of strings for batch processing
* @param optionsOrPreset - Optional preset name or `SanitizeOptions` object
* @returns A string when `input` is a string, or a `string[]` when `input` is an array (see the two overloads)
*/
declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
//#endregion
//#region src/utils/similarity.d.ts
/**
* Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
* Uses Levenshtein distance normalized by the length of the longer string.
* A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.
*
* @param textA - First string to compare
* @param textB - Second string to compare
* @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
* @example
* calculateSimilarity('hello', 'hello') // Returns 1.0
* calculateSimilarity('hello', 'help') // Returns 0.6
*/
declare const calculateSimilarity: (textA: string, textB: string) => number;
/**
* Checks if two texts are similar after Arabic normalization.
* Normalizes both texts by removing diacritics and decorative elements,
* then compares their similarity against the provided threshold.
*
* @param textA - First text to compare
* @param textB - Second text to compare
* @param threshold - Similarity threshold (0.0 to 1.0). Optional in the signature;
* NOTE(review): the default used when it is omitted is not visible in this declaration — confirm
* against the implementation, or pass it explicitly.
* @returns True if normalized texts meet the similarity threshold
* @example
* areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true
*/
declare const areSimilarAfterNormalization: (textA: string, textB: string, threshold?: number) => boolean;
/**
* Calculates alignment score for two tokens in sequence alignment.
* Uses different scoring criteria: perfect match after normalization gets highest score,
* typo symbols or highly similar tokens get soft match score, mismatches get penalty.
*
* @param tokenA - First token to score
* @param tokenB - Second token to score
* @param typoSymbols - Array of special symbols that get preferential treatment
* @param similarityThreshold - Threshold for considering tokens highly similar
* @returns Alignment score (higher is better match)
* @example
* calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
* calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity
*/
declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
/**
* One aligned position produced by sequence alignment, as a two-element tuple:
* `[token from the first sequence, token from the second sequence]`.
* A `null` on either side indicates a gap.
*/
type AlignedTokenPair = [null | string, null | string];
/**
* A single cell of the scoring matrix consumed by `backtrackAlignment`.
*/
type AlignmentCell = {
/** Directional indicator followed while backtracking through the matrix; may be `null`. */
direction: 'diagonal' | 'left' | 'up' | null;
/** Alignment score stored in this cell. */
score: number;
};
/**
* Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
* Follows the directional indicators in the matrix to build the sequence of aligned
* token pairs from the Needleman-Wunsch algorithm.
*
* @param matrix - Scoring matrix with directional information from alignment
* @param tokensA - First sequence of tokens
* @param tokensB - Second sequence of tokens
* @returns Array of aligned token pairs, where null indicates a gap
* @throws Error if invalid alignment direction is encountered
*/
declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[], tokensB: string[]) => AlignedTokenPair[];
/**
* Performs global sequence alignment using the Needleman-Wunsch algorithm.
* Aligns two token sequences to find the optimal pairing that maximizes
* the total alignment score, handling insertions, deletions, and substitutions.
*
* @param tokensA - First sequence of tokens to align
* @param tokensB - Second sequence of tokens to align
* @param typoSymbols - Special symbols that affect scoring
* @param similarityThreshold - Threshold for high similarity scoring
* @returns Array of aligned token pairs, with null indicating gaps
* @example
* alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
* // Returns [['a', 'a'], ['b', 'c']]
*/
declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
//#endregion
//#region src/utils/textUtils.d.ts
/**
* The three-character string `اهـ`: alif (U+0627), heh (U+0647), tatweel (U+0640).
* This is the same form that `standardizeIntahaSymbol` is documented as producing.
*/
declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
/**
* Collection of regex patterns used throughout the library for text processing
*/
declare const PATTERNS: {
/** Matches Arabic characters across all Unicode blocks */
arabicCharacters: RegExp;
/** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
arabicDigits: RegExp;
/** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
arabicFootnoteReferenceRegex: RegExp;
/** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
arabicLettersAndDigits: RegExp;
/** Matches Arabic punctuation marks and whitespace characters */
arabicPunctuationAndWhitespace: RegExp;
/** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
arabicReferenceRegex: RegExp;
/** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
footnoteEmbedded: RegExp;
/** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
footnoteStandalone: RegExp;
/** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
invalidReferenceRegex: RegExp;
/** Matches OCR-confused footnote references at line start with characters like .1OV9 */
ocrConfusedFootnoteReferenceRegex: RegExp;
/** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
ocrConfusedReferenceRegex: RegExp;
/** Matches one or more whitespace characters */
whitespace: RegExp;
};
/**
* Extracts the first sequence of Arabic or Western digits from text.
* Used primarily for footnote number comparison to match related footnote elements.
*
* @param text - Text containing digits to extract
* @returns First digit sequence found, or empty string if none found
* @example
* extractDigits('(٥)أخرجه البخاري') // Returns '٥'
* extractDigits('See note (123)') // Returns '123'
*/
declare const extractDigits: (text: string) => string;
/**
* Tokenizes text into individual words while preserving special symbols.
* Adds spacing around preserved symbols to ensure they are tokenized separately,
* then splits on whitespace.
*
* @param text - Text to tokenize
* @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
* @returns Array of tokens, or empty array if input is empty/whitespace
* @example
* tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
*/
declare const tokenizeText: (text: string, preserveSymbols?: string[]) => string[];
/**
* Handles fusion of standalone and embedded footnotes during token processing.
* Detects patterns where standalone footnotes should be merged with embedded ones
* or where trailing standalone footnotes should be skipped.
*
* @param result - Current result array being built
* @param previousToken - The previous token in the sequence
* @param currentToken - The current token being processed
* @returns True if the current token was handled (fused or skipped), false otherwise
* @example
* // (٥) + (٥)أخرجه → result gets (٥)أخرجه
* // (٥)أخرجه + (٥) → (٥) is skipped
*/
declare const handleFootnoteFusion: (result: string[], previousToken: string, currentToken: string) => boolean;
/**
* Handles selection logic for tokens with embedded footnotes during alignment.
* Prefers tokens that contain embedded footnotes over plain text, and among
* tokens with embedded footnotes, prefers the shorter one.
*
* @param tokenA - First token to compare
* @param tokenB - Second token to compare
* @returns Array containing selected token(s), or null if no special handling needed
* @example
* handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
* handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
*/
declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null | string[];
/**
* Handles selection logic for standalone footnote tokens during alignment.
* Manages cases where one or both tokens are standalone footnotes, preserving
* both tokens when one is a footnote and the other is regular text.
*
* NOTE(review): in the second example both tokens have the same length, so "(shorter one)"
* does not explain why the first is chosen; presumably the first token wins a tie — confirm
* against the implementation.
*
* @param tokenA - First token to compare
* @param tokenB - Second token to compare
* @returns Array containing selected token(s), or null if no special handling needed
* @example
* handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
* handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
*/
declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
/**
* Removes simple footnote references from Arabic text.
* Handles footnotes in the format (¬[Arabic numerals]) where ¬ is the not symbol (U+00AC).
*
* @param text - The input text containing footnote references to remove
* @returns The text with footnote references removed and extra spaces normalized
*
* @example
* ```typescript
* removeFootnoteReferencesSimple("هذا النص (¬١٢٣) يحتوي على حاشية")
* // Returns: "هذا النص يحتوي على حاشية"
* ```
*/
declare const removeFootnoteReferencesSimple: (text: string) => string;
/**
* Removes single digit footnote references and extended footnote formats from Arabic text.
* Handles footnotes in the format:
* - ([single Arabic digit]) - e.g., (٣)
* - ([single Arabic digit] [single Arabic letter]) - e.g., (٣ م), (٥ ه), (٧ ب)
*
* @param text - The input text containing footnote references to remove
* @returns The text with footnote references removed and extra spaces normalized
*
* @example
* ```typescript
* removeSingleDigitFootnoteReferences("هذا النص (٣) والآخر (٥ م) والثالث (٧ ه) يحتوي على حواشي")
* // Returns: "هذا النص والآخر والثالث يحتوي على حواشي"
* ```
*/
declare const removeSingleDigitFootnoteReferences: (text: string) => string;
/**
* Standardizes standalone Hijri symbol ه to هـ when following Arabic digits
* @param text - Input text to process
* @returns Text with standardized Hijri symbols
*/
declare const standardizeHijriSymbol: (text: string) => string;
/**
* Standardizes standalone اه to اهـ when appearing as whole word
* @param text - Input text to process
* @returns Text with standardized AH Hijri symbols
*/
declare const standardizeIntahaSymbol: (text: string) => string;
//#endregion
// NOTE(review): the types BalanceError, BalanceResult, TextLine, FixTypoOptions, MatchPolicy,
// CharacterStats, AlignedTokenPair and AlignmentCell appear in the signatures of values exported
// below but are not in this export list, so consumers cannot import them by name. This file looks
// generated (it ends with a sourceMappingURL), so any change presumably belongs in the source
// entry point rather than here.
export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
//# sourceMappingURL=index.d.ts.map