UNPKG

baburchi

Version:

A lightweight TypeScript library designed to fix typos in OCR post-processing.

814 lines 36.9 kB
//#region src/alignment.d.ts
/**
 * Aligns split text segments to match target lines by finding the best order.
 *
 * This function handles cases where text lines have been split into segments
 * and need to be merged back together in the correct order. It compares
 * different arrangements of the segments against target lines to find the
 * best match based on similarity scores.
 *
 * @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment
 * @param segmentLines - Array of text segments that may represent split versions of target lines.
 * @returns Array of aligned text lines
 */
declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
//#endregion
//#region src/balance.d.ts
/**
 * Represents an error found when checking balance of quotes or brackets in text.
 */
type BalanceError = {
  /** The character that caused the error */
  char: string;
  /** The position of the character in the string */
  index: number;
  /** The reason for the error */
  reason: 'mismatched' | 'unclosed' | 'unmatched';
  /** The type of character that caused the error */
  type: 'bracket' | 'quote';
};
/**
 * Result of a balance check operation.
 */
type BalanceResult = {
  /** Array of errors found during balance checking */
  errors: BalanceError[];
  /** Whether the text is properly balanced */
  isBalanced: boolean;
};
/** Mapping of opening brackets to their corresponding closing brackets */
declare const BRACKETS: {
  '\u00AB': string;
  '(': string;
  '[': string;
  '{': string;
};
/** Set of all opening bracket characters */
declare const OPEN_BRACKETS: Set<string>;
/** Set of all closing bracket characters */
declare const CLOSE_BRACKETS: Set<string>;
/**
 * Checks if both quotes and brackets are balanced in a string and returns detailed error information.
 *
 * This function combines the results of both quote and bracket balance checking,
 * providing a comprehensive analysis of all balance issues in the text.
 * The errors are sorted by their position in the string for easier debugging.
 *
 * @param str - The string to check for overall balance
 * @returns An object containing combined balance status and all errors found, sorted by position
 *
 * @example
 * ```typescript
 * checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
 * checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
 * ```
 */
declare const checkBalance: (str: string) => BalanceResult;
/**
 * Enhanced error detection that returns absolute character positions for use with HighlightableTextarea.
 *
 * This interface mirrors {@link BalanceError} but carries an absolute position
 * across multiple lines of text instead of a per-string index, making it suitable
 * for text editors and syntax highlighters that need precise character positioning.
 */
interface CharacterError {
  /** Absolute character position from the start of the entire text */
  absoluteIndex: number;
  /** The character that caused the error */
  char: string;
  /** The reason for the error */
  reason: 'mismatched' | 'unclosed' | 'unmatched';
  /** The type of character that caused the error */
  type: 'bracket' | 'quote';
}
/**
 * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
 *
 * This function processes text line by line, but only checks lines longer than 10 characters
 * for balance issues. It returns absolute positions that can be used with text editors
 * or highlighting components that need precise character positioning across the entire text.
 *
 * The absolute index accounts for newline characters between lines, providing accurate
 * positioning for the original text string.
 *
 * @param text - The multi-line text to analyze for balance errors
 * @returns Array of character errors with absolute positioning information
 *
 * @example
 * ```typescript
 * const text = 'Line 1 with "quote\nLine 2 with (bracket';
 * const errors = getUnbalancedErrors(text);
 * // Returns errors with absoluteIndex pointing to exact character positions
 * ```
 */
declare const getUnbalancedErrors: (text: string) => CharacterError[];
/**
 * Checks if all double quotes in a string are balanced.
 *
 * This is a convenience function that returns only the boolean result
 * without detailed error information.
 *
 * @param str - The string to check for quote balance
 * @returns True if quotes are balanced, false otherwise
 *
 * @example
 * ```typescript
 * areQuotesBalanced('Hello "world"') // true
 * areQuotesBalanced('Hello "world') // false
 * ```
 */
declare const areQuotesBalanced: (str: string) => boolean;
/**
 * Checks if all brackets in a string are properly balanced.
 *
 * This is a convenience function that returns only the boolean result
 * without detailed error information.
 *
 * @param str - The string to check for bracket balance
 * @returns True if brackets are balanced, false otherwise
 *
 * @example
 * ```typescript
 * areBracketsBalanced('(hello [world])') // true
 * areBracketsBalanced('(hello [world') // false
 * ```
 */
declare const areBracketsBalanced: (str: string) => boolean;
/**
 * Checks if both quotes and brackets are balanced in a string.
 *
 * This is a convenience function that returns only the boolean result
 * without detailed error information.
 *
 * @param str - The string to check for overall balance
 * @returns True if both quotes and brackets are balanced, false otherwise
 *
 * @example
 * ```typescript
 * isBalanced('Hello "world" and (test)') // true
 * isBalanced('Hello "world and (test') // false
 * ```
 */
declare const isBalanced: (str: string) => boolean;
//#endregion
//#region src/footnotes.d.ts
/**
 * Checks if the given text contains invalid footnote references.
 * Invalid footnotes include empty parentheses "()" or OCR-confused characters
 * like ".1OV9" that were misrecognized instead of Arabic numerals.
 *
 * @param text - Text to check for invalid footnote patterns
 * @returns True if text contains invalid footnote references, false otherwise
 * @example
 * hasInvalidFootnotes('This text has ()') // Returns true
 * hasInvalidFootnotes('This text has (١)') // Returns false
 * hasInvalidFootnotes('OCR mistake (O)') // Returns true
 */
declare const hasInvalidFootnotes: (text: string) => boolean;
/**
 * A single line of text, optionally flagged as belonging to the footnotes.
 * This is the element shape accepted (and returned) by {@link correctReferences}.
 */
type TextLine = {
  /** Whether this line is a footnote line rather than body text */
  isFootnote?: boolean;
  /** The textual content of the line */
  text: string;
};
/**
 * Corrects footnote references in an array of text lines by:
 * 1. Converting OCR-confused characters to proper Arabic numerals
 * 2. Filling in empty "()" references with appropriate numbers
 * 3. Ensuring footnote references in body text match those in footnotes
 * 4. Generating new reference numbers when needed
 *
 * @param lines - Array of text line objects, each with optional isFootnote flag
 * @returns Array of corrected text lines with proper footnote references
 * @example
 * const lines = [
 *   { text: 'Main text with ()', isFootnote: false },
 *   { text: '() This is a footnote', isFootnote: true }
 * ];
 * const corrected = correctReferences(lines);
 * // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
 */
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
//#endregion
//#region src/types.d.ts
/**
 * Configuration options for fixing typos in OCR text using alignment algorithms.
 * These options control how text tokens are compared, aligned, and merged during typo correction.
 */
type FixTypoOptions = {
  /**
   * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
   * Used in post-processing to eliminate redundant tokens that are nearly identical.
   * Should typically be higher than similarityThreshold to catch only very similar duplicates.
   * @default 0.9
   * @example 0.95 // Removes tokens that are 95% or more similar
   */
  readonly highSimilarityThreshold: number;
  /**
   * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
   * Higher values require closer matches, lower values are more permissive.
   * Used in the Needleman-Wunsch alignment algorithm for token matching.
   * @default 0.7
   * @example 0.8 // Requires 80% similarity for token alignment
   */
  readonly similarityThreshold: number;
  /**
   * Array of special symbols that should be preserved during typo correction.
   * These symbols (like honorifics or religious markers) take precedence in token selection.
   * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
   */
  readonly typoSymbols: string[];
};
/**
 * Tuning options for excerpt matching, accepted as the optional `policy`
 * argument of {@link findMatches} and {@link findMatchesAll}.
 * Every field is optional; the defaults are listed on each field.
 */
type MatchPolicy = {
  /** Try approximate matches for leftovers (default true). */
  enableFuzzy?: boolean;
  /** Max absolute edit distance accepted in fuzzy (default 3). */
  maxEditAbs?: number;
  /** Max relative edit distance (fraction of excerpt length). Default 0.1 (10%). */
  maxEditRel?: number;
  /** q-gram length for candidate generation (default 4). */
  q?: number;
  /** Max rare grams to seed candidates per excerpt (default 5). */
  gramsPerExcerpt?: number;
  /** Max candidate windows verified per excerpt (default 40). */
  maxCandidatesPerExcerpt?: number;
  /** Seam length for bleed windows (default 512). */
  seamLen?: number;
  /**
   * Optional logging function for debugging.
   */
  log?(message?: any, ...optionalParams: any[]): void;
};
//#endregion
//#region src/fuzzy.d.ts
/**
 * Main function to find the single best match per excerpt.
 * Combines exact matching with fuzzy matching for comprehensive text search.
 *
 * @param pages - Array of page texts to search within
 * @param excerpts - Array of text excerpts to find matches for
 * @param policy - Optional matching policy configuration
 * @returns Array of page indices (one per excerpt, -1 if no match found)
 *
 * @example
 * ```typescript
 * const pages = ['Hello world', 'Goodbye world'];
 * const excerpts = ['Hello', 'Good bye']; // Note the typo
 * const matches = findMatches(pages, excerpts, { enableFuzzy: true });
 * // Returns [0, 1] - exact match on page 0, fuzzy match on page 1
 * ```
 */
declare function findMatches(pages: string[], excerpts: string[], policy?: MatchPolicy): number[];
/**
 * Main function to find all matches per excerpt, ranked by quality.
 * Returns comprehensive results with both exact and fuzzy matches for each excerpt.
 *
 * @param pages - Array of page texts to search within
 * @param excerpts - Array of text excerpts to find matches for
 * @param policy - Optional matching policy configuration
 * @returns Array of page index arrays (one array per excerpt, sorted by match quality)
 *
 * @example
 * ```typescript
 * const pages = ['Hello world', 'Hello there', 'Goodbye world'];
 * const excerpts = ['Hello'];
 * const matches = findMatchesAll(pages, excerpts);
 * // Returns [[0, 1]] - both pages 0 and 1 contain "Hello", sorted by page order
 * ```
 */
declare function findMatchesAll(pages: string[], excerpts: string[], policy?: MatchPolicy): number[][];
//#endregion
//#region src/noise.d.ts
/**
 * Character statistics for analyzing text content and patterns
 */
type CharacterStats = {
  /** Number of Arabic script characters in the text */
  arabicCount: number;
  /** Map of character frequencies for repetition analysis */
  charFreq: Map<string, number>;
  /** Number of digit characters (0-9) in the text */
  digitCount: number;
  /** Number of Latin alphabet characters (a-z, A-Z) in the text */
  latinCount: number;
  /** Number of punctuation characters in the text */
  punctuationCount: number;
  /** Number of whitespace characters in the text */
  spaceCount: number;
  /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
  symbolCount: number;
};
/**
 * Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
 * This function performs comprehensive analysis to identify patterns commonly associated
 * with OCR errors, formatting artifacts, or meaningless content in Arabic text processing.
 *
 * @param text - The input string to analyze for noise patterns
 * @returns true if the text is likely noise or unwanted content, false if it appears to be valid Arabic content
 *
 * @example
 * ```typescript
 * import { isArabicTextNoise } from 'baburchi';
 *
 * console.log(isArabicTextNoise('---')); // true (formatting artifact)
 * console.log(isArabicTextNoise('السلام عليكم')); // false (valid Arabic)
 * console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)
 * ```
 */
declare const isArabicTextNoise: (text: string) => boolean;
/**
 * Analyzes character composition and frequency statistics for the input text.
 * Categorizes characters by type (Arabic, Latin, digits, spaces, punctuation, symbols)
 * and tracks character frequency for pattern analysis.
 *
 * @param text - The text string to analyze
 * @returns CharacterStats object containing detailed character analysis
 *
 * @example
 * ```typescript
 * import { analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('مرحبا 123!');
 * console.log(stats.arabicCount); // 5
 * console.log(stats.digitCount); // 3
 * console.log(stats.symbolCount); // 1
 * ```
 */
declare function analyzeCharacterStats(text: string): CharacterStats;
/**
 * Detects excessive repetition of specific characters that commonly indicate noise.
 * Focuses on repetitive characters like exclamation marks, dots, dashes, equals signs,
 * and underscores that often appear in OCR artifacts or formatting elements.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param textLength - Total length of the original text
 * @returns true if excessive repetition is detected, false otherwise
 *
 * @example
 * ```typescript
 * import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('!!!!!');
 * console.log(hasExcessiveRepetition(stats, 5)); // true
 *
 * const normalStats = analyzeCharacterStats('hello world');
 * console.log(hasExcessiveRepetition(normalStats, 11)); // false
 * ```
 */
declare function hasExcessiveRepetition(charStats: CharacterStats, textLength: number): boolean;
/**
 * Identifies text that matches common noise patterns using regular expressions.
 * Detects patterns like repeated dashes, dot sequences, uppercase-only text,
 * digit-dash combinations, and other formatting artifacts commonly found in OCR output.
 *
 * @param text - The text string to check against noise patterns
 * @returns true if the text matches a basic noise pattern, false otherwise
 *
 * @example
 * ```typescript
 * import { isBasicNoisePattern } from 'baburchi';
 *
 * console.log(isBasicNoisePattern('---')); // true
 * console.log(isBasicNoisePattern('...')); // true
 * console.log(isBasicNoisePattern('ABC')); // true
 * console.log(isBasicNoisePattern('- 77')); // true
 * console.log(isBasicNoisePattern('hello world')); // false
 * ```
 */
declare function isBasicNoisePattern(text: string): boolean;
/**
 * Determines if non-Arabic content should be classified as noise based on various heuristics.
 * Analyzes symbol-to-content ratios, text length, spacing patterns, and content composition
 * to identify unwanted OCR artifacts or meaningless content.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param textLength - Total length of the original text
 * @param text - The original text string for additional pattern matching
 * @returns true if the content is likely noise, false if it appears to be valid content
 *
 * @example
 * ```typescript
 * import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('!!!');
 * console.log(isNonArabicNoise(stats, 3, '!!!')); // true
 *
 * const validStats = analyzeCharacterStats('2023');
 * console.log(isNonArabicNoise(validStats, 4, '2023')); // false
 * ```
 */
declare function isNonArabicNoise(charStats: CharacterStats, textLength: number, text: string): boolean;
/**
 * Detects problematic spacing patterns that indicate noise or OCR artifacts.
 * Identifies cases where spacing is excessive relative to content, or where
 * single characters are surrounded by spaces in a way that suggests OCR errors.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param contentChars - Number of meaningful content characters (Arabic + Latin + digits)
 * @param textLength - Total length of the original text
 * @returns true if spacing patterns indicate noise, false otherwise
 *
 * @example
 * ```typescript
 * import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats(' a ');
 * const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
 * console.log(isSpacingNoise(stats, contentChars, 3)); // true
 *
 * const normalStats = analyzeCharacterStats('hello world');
 * const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;
 * console.log(isSpacingNoise(normalStats, normalContent, 11)); // false
 * ```
 */
declare function isSpacingNoise(charStats: CharacterStats, contentChars: number, textLength: number): boolean;
/**
 * Validates whether Arabic content is substantial enough to be considered meaningful.
 * Uses character counts and text length to determine if Arabic text contains
 * sufficient content or if it's likely to be a fragment or OCR artifact.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param textLength - Total length of the original text
 * @returns true if the Arabic content appears valid, false if it's likely noise
 *
 * @example
 * ```typescript
 * import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
 *
 * const validStats = analyzeCharacterStats('السلام عليكم');
 * console.log(isValidArabicContent(validStats, 12)); // true
 *
 * const shortStats = analyzeCharacterStats('ص');
 * console.log(isValidArabicContent(shortStats, 1)); // false
 *
 * const withDigitsStats = analyzeCharacterStats('ص 5');
 * console.log(isValidArabicContent(withDigitsStats, 3)); // true
 * ```
 */
declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
//#endregion
//#region src/typos.d.ts
/**
 * Processes text alignment between original and alternate OCR results to fix typos.
 * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
 * then selects the best tokens and performs post-processing.
 *
 * @param originalText - Original OCR text that may contain typos
 * @param altText - Reference text from alternate OCR for comparison
 * @param options - Configuration options for alignment and selection
 * @returns Corrected text with typos fixed
 */
declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
/**
 * Convenience wrapper around {@link processTextAlignment} that accepts partial options.
 * Only `typoSymbols` is required; both thresholds may be omitted
 * (see {@link FixTypoOptions} for their documented defaults).
 *
 * @param original - The source text that may contain typographical errors.
 * @param correction - The reference text used to correct the `original` text.
 * @param options - Partial typo correction options combined with required typo symbols.
 * @returns The corrected text generated from the alignment process.
 */
declare const fixTypo: (original: string, correction: string, {
  highSimilarityThreshold,
  similarityThreshold,
  typoSymbols
}: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
//#endregion
//#region src/utils/levenshthein.d.ts
/**
 * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
 * The Levenshtein distance is the minimum number of single-character edits (insertions,
 * deletions, or substitutions) required to change one string into another.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Minimum edit distance between the two strings
 * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths
 * @example
 * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
 * calculateLevenshteinDistance('', 'hello') // Returns 5
 */
declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
/**
 * Calculates bounded Levenshtein distance with early termination.
 * More efficient when you only care about distances up to a threshold.
 *
 * NOTE(review): the value returned once the distance exceeds `maxDist` is not
 * visible from this declaration — confirm against the implementation before
 * relying on a specific sentinel.
 *
 * @param a - First string to compare
 * @param b - Second string to compare
 * @param maxDist - Largest distance the caller is interested in
 * @returns The edit distance when it is within `maxDist`
 */
declare const boundedLevenshtein: (a: string, b: string, maxDist: number) => number;
//#endregion
//#region src/utils/sanitize.d.ts
/**
 * Names of the built-in sanitizer presets (see {@link sanitizeArabic} for what each preset enables).
 *
 * The sanitizer itself is an ultra-fast Arabic text sanitizer for search/indexing/display.
 * Optimized for very high call rates: avoids per-call object spreads and minimizes allocations.
 * Options can merge over a base preset or `'none'` to apply exactly the rules you request.
 */
type SanitizePreset = 'light' | 'search' | 'aggressive';
/** Rule set that custom {@link SanitizeOptions} are overlaid on: `'none'` or any {@link SanitizePreset}. */
type SanitizeBase = 'none' | SanitizePreset;
/**
 * Public options for {@link sanitizeArabic}. When you pass an options object, it overlays the chosen
 * `base` (default `'light'`) without allocating merged objects on the hot path; flags are resolved
 * directly into local booleans for speed.
 */
type SanitizeOptions = {
  /** Base to merge over. `'none'` applies only the options you specify. Default when passing an object: `'light'`. */
  base?: SanitizeBase;
  /**
   * NFC normalization (fast-path).
   *
   * For performance, this sanitizer avoids calling `String.prototype.normalize('NFC')` and instead
   * applies the key Arabic canonical compositions inline (hamza/madda combining marks).
   * This preserves the NFC behavior that matters for typical Arabic OCR text while keeping throughput high.
   *
   * Default: `true` in all presets.
   */
  nfc?: boolean;
  /** Strip zero-width controls (U+200B–U+200F, U+202A–U+202E, U+2060–U+2064, U+FEFF). Default: `true` in presets. */
  stripZeroWidth?: boolean;
  /** If stripping zero-width, replace them with a space instead of removing. Default: `false`. */
  zeroWidthToSpace?: boolean;
  /** Remove Arabic diacritics (tashkīl). Default: `true` in `'search'`/`'aggressive'`. */
  stripDiacritics?: boolean;
  /** Remove footnote references. Default: `true` in `'search'`/`'aggressive'`. */
  stripFootnotes?: boolean;
  /**
   * Remove tatweel (ـ).
   * - `true` is treated as `'safe'` (preserves tatweel after digits or 'ه' for dates/list markers)
   * - `'safe'` or `'all'` explicitly
   * - `false` to keep tatweel
   * Default: `'all'` in `'search'`/`'aggressive'`, `false` in `'light'`.
   */
  stripTatweel?: boolean | 'safe' | 'all';
  /** Normalize آ/أ/إ → ا. Default: `true` in `'search'`/`'aggressive'`. */
  normalizeAlif?: boolean;
  /** Replace ى → ي. Default: `true` in `'search'`/`'aggressive'`. */
  replaceAlifMaqsurah?: boolean;
  /** Replace ة → ه (lossy). Default: `true` in `'aggressive'` only. */
  replaceTaMarbutahWithHa?: boolean;
  /** Strip Latin letters/digits and common OCR noise into spaces. Default: `true` in `'aggressive'`. */
  stripLatinAndSymbols?: boolean;
  /** Keep only Arabic letters (no whitespace). Use for compact keys, not FTS. */
  keepOnlyArabicLetters?: boolean;
  /** Keep Arabic letters + spaces (drops digits/punct/symbols). Great for FTS. Default: `true` in `'aggressive'`. */
  lettersAndSpacesOnly?: boolean;
  /** Collapse runs of whitespace to a single space. Default: `true`. */
  collapseWhitespace?: boolean;
  /** Trim leading/trailing whitespace. Default: `true`. */
  trim?: boolean;
  /**
   * Remove the Hijri date marker ("هـ" or bare "ه" if tatweel already removed) when it follows a date-like token
   * (digits/slashes/hyphens/spaces). Example: `1435/3/29 هـ` → `1435/3/29`.
   * Default: `true` in `'search'`/`'aggressive'`, `false` in `'light'`.
   */
  removeHijriMarker?: boolean;
};
/**
 * Creates a reusable sanitizer function with pre-resolved options.
 * Use this when you need to sanitize many strings with the same options
 * for maximum performance.
 *
 * @example
 * ```ts
 * const sanitize = createArabicSanitizer('search');
 * const results = texts.map(sanitize);
 * ```
 */
declare const createArabicSanitizer: (optionsOrPreset?: SanitizePreset | SanitizeOptions) => ((input: string) => string);
/**
 * Sanitizes Arabic text according to a preset or custom options.
 *
 * Presets:
 * - `'light'`: NFC, zero-width removal, collapse/trim spaces.
 * - `'search'`: removes diacritics and tatweel, normalizes Alif and ى→ي, removes Hijri marker.
 * - `'aggressive'`: ideal for FTS; keeps letters+spaces only and strips common noise.
 *
 * Custom options:
 * - Passing an options object overlays the selected `base` preset (default `'light'`).
 * - Use `base: 'none'` to apply **only** the rules you specify (e.g., tatweel only).
 *
 * **Batch processing**: Pass an array of strings for optimized batch processing.
 * Options are resolved once and applied to all strings, providing significant
 * performance gains over calling the function in a loop.
 *
 * Examples:
 * ```ts
 * sanitizeArabic('أبـــتِـــكَةُ', { base: 'none', stripTatweel: true }); // 'أبتِكَةُ'
 * sanitizeArabic('1435/3/29 هـ', 'aggressive'); // '1435 3 29'
 * sanitizeArabic('اَلسَّلَامُ عَلَيْكُمْ', 'search'); // 'السلام عليكم'
 *
 * // Batch processing (optimized):
 * sanitizeArabic(['text1', 'text2', 'text3'], 'search'); // ['result1', 'result2', 'result3']
 * ```
 */
declare function sanitizeArabic(input: string, optionsOrPreset?: SanitizePreset | SanitizeOptions): string;
declare function sanitizeArabic(input: string[], optionsOrPreset?: SanitizePreset | SanitizeOptions): string[];
//#endregion
//#region src/utils/similarity.d.ts
/**
 * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
 * Uses Levenshtein distance normalized by the length of the longer string.
 * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
 * @example
 * calculateSimilarity('hello', 'hello') // Returns 1.0
 * calculateSimilarity('hello', 'help') // Returns 0.6
 */
declare const calculateSimilarity: (textA: string, textB: string) => number;
/**
 * Checks if two texts are similar after Arabic normalization.
 * Normalizes both texts by removing diacritics and decorative elements,
 * then compares their similarity against the provided threshold.
 *
 * @param textA - First text to compare
 * @param textB - Second text to compare
 * @param threshold - Similarity threshold (0.0 to 1.0)
 * @returns True if normalized texts meet the similarity threshold
 * @example
 * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true
 */
declare const areSimilarAfterNormalization: (textA: string, textB: string, threshold?: number) => boolean;
/**
 * Calculates alignment score for two tokens in sequence alignment.
 * Uses different scoring criteria: perfect match after normalization gets highest score,
 * typo symbols or highly similar tokens get soft match score, mismatches get penalty.
 *
 * @param tokenA - First token to score
 * @param tokenB - Second token to score
 * @param typoSymbols - Array of special symbols that get preferential treatment
 * @param similarityThreshold - Threshold for considering tokens highly similar
 * @returns Alignment score (higher is better match)
 * @example
 * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
 * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity
 */
declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
/**
 * One column of an alignment: the token from the first sequence paired with the
 * token from the second sequence. `null` on either side marks a gap.
 */
type AlignedTokenPair = [null | string, null | string];
/**
 * One cell of the alignment scoring matrix consumed by {@link backtrackAlignment}.
 */
type AlignmentCell = {
  /** Direction to follow when backtracking from this cell; `null` when no direction is recorded */
  direction: 'diagonal' | 'left' | 'up' | null;
  /** Alignment score stored in this cell */
  score: number;
};
/**
 * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
 * Follows the directional indicators in the matrix to build the sequence of aligned
 * token pairs from the Needleman-Wunsch algorithm.
 *
 * @param matrix - Scoring matrix with directional information from alignment
 * @param tokensA - First sequence of tokens
 * @param tokensB - Second sequence of tokens
 * @returns Array of aligned token pairs, where null indicates a gap
 * @throws Error if invalid alignment direction is encountered
 */
declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[], tokensB: string[]) => AlignedTokenPair[];
/**
 * Performs global sequence alignment using the Needleman-Wunsch algorithm.
 * Aligns two token sequences to find the optimal pairing that maximizes
 * the total alignment score, handling insertions, deletions, and substitutions.
 *
 * @param tokensA - First sequence of tokens to align
 * @param tokensB - Second sequence of tokens to align
 * @param typoSymbols - Special symbols that affect scoring
 * @param similarityThreshold - Threshold for high similarity scoring
 * @returns Array of aligned token pairs, with null indicating gaps
 * @example
 * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
 * // Returns [['a', 'a'], ['b', 'c']]
 */
declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
//#endregion
//#region src/utils/textUtils.d.ts
/**
 * The three-character string اهـ (alif U+0627, ha U+0647, tatweel U+0640).
 * NOTE(review): presumably the canonical form that {@link standardizeIntahaSymbol}
 * rewrites a bare اه into — confirm against the implementation.
 */
declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
/**
 * Collection of regex patterns used throughout the library for text processing
 */
declare const PATTERNS: {
  /** Matches Arabic characters across all Unicode blocks */
  arabicCharacters: RegExp;
  /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
  arabicDigits: RegExp;
  /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
  arabicFootnoteReferenceRegex: RegExp;
  /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
  arabicLettersAndDigits: RegExp;
  /** Matches Arabic punctuation marks and whitespace characters */
  arabicPunctuationAndWhitespace: RegExp;
  /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
  arabicReferenceRegex: RegExp;
  /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
  footnoteEmbedded: RegExp;
  /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
  footnoteStandalone: RegExp;
  /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
  invalidReferenceRegex: RegExp;
  /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
  ocrConfusedFootnoteReferenceRegex: RegExp;
  /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
  ocrConfusedReferenceRegex: RegExp;
  /** Matches one or more whitespace characters */
  whitespace: RegExp;
};
/**
 * Extracts the first sequence of Arabic or Western digits from text.
 * Used primarily for footnote number comparison to match related footnote elements.
 *
 * @param text - Text containing digits to extract
 * @returns First digit sequence found, or empty string if none found
 * @example
 * extractDigits('(٥)أخرجه البخاري') // Returns '٥'
 * extractDigits('See note (123)') // Returns '123'
 */
declare const extractDigits: (text: string) => string;
/**
 * Tokenizes text into individual words while preserving special symbols.
 * Adds spacing around preserved symbols to ensure they are tokenized separately,
 * then splits on whitespace.
 *
 * @param text - Text to tokenize
 * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
 * @returns Array of tokens, or empty array if input is empty/whitespace
 * @example
 * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
 */
declare const tokenizeText: (text: string, preserveSymbols?: string[]) => string[];
/**
 * Handles fusion of standalone and embedded footnotes during token processing.
 * Detects patterns where standalone footnotes should be merged with embedded ones
 * or where trailing standalone footnotes should be skipped.
 *
 * @param result - Current result array being built
 * @param previousToken - The previous token in the sequence
 * @param currentToken - The current token being processed
 * @returns True if the current token was handled (fused or skipped), false otherwise
 * @example
 * // (٥) + (٥)أخرجه → result gets (٥)أخرجه
 * // (٥)أخرجه + (٥) → (٥) is skipped
 */
declare const handleFootnoteFusion: (result: string[], previousToken: string, currentToken: string) => boolean;
/**
 * Handles selection logic for tokens with embedded footnotes during alignment.
 * Prefers tokens that contain embedded footnotes over plain text, and among
 * tokens with embedded footnotes, prefers the shorter one.
 *
 * @param tokenA - First token to compare
 * @param tokenB - Second token to compare
 * @returns Array containing selected token(s), or null if no special handling needed
 * @example
 * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
 * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
 */
declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null | string[];
/**
 * Handles selection logic for standalone footnote tokens during alignment.
 * Manages cases where one or both tokens are standalone footnotes, preserving
 * both tokens when one is a footnote and the other is regular text.
 *
 * @param tokenA - First token to compare
 * @param tokenB - Second token to compare
 * @returns Array containing selected token(s), or null if no special handling needed
 * @example
 * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
 * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
 */
declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
/**
 * Removes simple footnote references from Arabic text.
 * Handles footnotes in the format (¬[Arabic numerals]) where ¬ is the not symbol (U+00AC).
 *
 * @param text - The input text containing footnote references to remove
 * @returns The text with footnote references removed and extra spaces normalized
 *
 * @example
 * ```typescript
 * removeFootnoteReferencesSimple("هذا النص (¬١٢٣) يحتوي على حاشية")
 * // Returns: "هذا النص يحتوي على حاشية"
 * ```
 */
declare const removeFootnoteReferencesSimple: (text: string) => string;
/**
 * Removes single digit footnote references and extended footnote formats from Arabic text.
 * Handles footnotes in the format:
 * - ([single Arabic digit]) - e.g., (٣)
 * - ([single Arabic digit] [single Arabic letter]) - e.g., (٣ م), (٥ ه), (٧ ب)
 *
 * @param text - The input text containing footnote references to remove
 * @returns The text with footnote references removed and extra spaces normalized
 *
 * @example
 * ```typescript
 * removeSingleDigitFootnoteReferences("هذا النص (٣) والآخر (٥ م) والثالث (٧ ه) يحتوي على حواشي")
 * // Returns: "هذا النص والآخر والثالث يحتوي على حواشي"
 * ```
 */
declare const removeSingleDigitFootnoteReferences: (text: string) => string;
/**
 * Standardizes a standalone Hijri symbol ه to هـ when it follows Arabic digits.
 *
 * @param text - Input text to process
 * @returns Text with standardized Hijri symbols
 */
declare const standardizeHijriSymbol: (text: string) => string;
/**
 * Standardizes a standalone اه to اهـ when it appears as a whole word.
 *
 * @param text - Input text to process
 * @returns Text with standardized AH Hijri symbols
 */
declare const standardizeIntahaSymbol: (text: string) => string;
//#endregion
export { BRACKETS, CLOSE_BRACKETS, CharacterError, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, SanitizeBase, SanitizeOptions, SanitizePreset, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, boundedLevenshtein, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, createArabicSanitizer, extractDigits, findMatches, findMatchesAll, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, processTextAlignment, removeFootnoteReferencesSimple, removeSingleDigitFootnoteReferences, sanitizeArabic, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };
//# sourceMappingURL=index.d.ts.map