UNPKG

baburchi

Version:

A lightweight TypeScript library designed to fix typos in OCR post-processing.

608 lines (599 loc) 27.6 kB
/**
 * Configuration options for fixing typos in OCR text using alignment algorithms.
 * These options control how text tokens are compared, aligned, and merged during typo correction.
 */
type FixTypoOptions = {
    /**
     * High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
     * Used in post-processing to eliminate redundant tokens that are nearly identical.
     * Should typically be higher than similarityThreshold to catch only very similar duplicates.
     * @default 0.9
     * @example 0.95 // Removes tokens that are 95% or more similar
     */
    readonly highSimilarityThreshold: number;
    /**
     * Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
     * Higher values require closer matches, lower values are more permissive.
     * Used in the Needleman-Wunsch alignment algorithm for token matching.
     * @default 0.7
     * @example 0.8 // Requires 80% similarity for token alignment
     */
    readonly similarityThreshold: number;
    /**
     * Array of special symbols that should be preserved during typo correction.
     * These symbols (like honorifics or religious markers) take precedence in token selection.
     * @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
     */
    readonly typoSymbols: string[];
};

/**
 * Aligns split text segments to match target lines by finding the best order.
 *
 * This function handles cases where text lines have been split into segments
 * and need to be merged back together in the correct order. It compares
 * different arrangements of the segments against target lines to find the
 * best match based on similarity scores.
 *
 * @param targetLines - Array where each element is either a string to align against, or falsy to skip alignment
 * @param segmentLines - Array of text segments that may represent split versions of target lines.
 * @returns Array of aligned text lines
 */
declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];

/**
 * Represents an error found when checking balance of quotes or brackets in text.
 */
type BalanceError = {
    /** The character that caused the error */
    char: string;
    /** The position of the character in the string */
    index: number;
    /** The reason for the error */
    reason: 'mismatched' | 'unclosed' | 'unmatched';
    /** The type of character that caused the error */
    type: 'bracket' | 'quote';
};

/**
 * Result of a balance check operation.
 */
type BalanceResult = {
    /** Array of errors found during balance checking */
    errors: BalanceError[];
    /** Whether the text is properly balanced */
    isBalanced: boolean;
};

/** Mapping of opening brackets to their corresponding closing brackets */
declare const BRACKETS: {
    '\u00AB': string;
    '(': string;
    '[': string;
    '{': string;
};

/** Set of all opening bracket characters */
declare const OPEN_BRACKETS: Set<string>;

/** Set of all closing bracket characters */
declare const CLOSE_BRACKETS: Set<string>;

/**
 * Checks if both quotes and brackets are balanced in a string and returns detailed error information.
 *
 * This function combines the results of both quote and bracket balance checking,
 * providing a comprehensive analysis of all balance issues in the text.
 * The errors are sorted by their position in the string for easier debugging.
 *
 * @param str - The string to check for overall balance
 * @returns An object containing combined balance status and all errors found, sorted by position
 *
 * @example
 * ```typescript
 * checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
 * checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
 * ```
 */
declare const checkBalance: (str: string) => BalanceResult;

/**
 * A balance error carrying an absolute character position, for use with HighlightableTextarea.
 *
 * This interface mirrors the fields of BalanceError (it does not inherit from it) but
 * replaces the per-string `index` with `absoluteIndex`, a position measured
 * across multiple lines of text, making it suitable for text editors and
 * syntax highlighters that need precise character positioning.
 */
interface CharacterError {
    /** Absolute character position from the start of the entire text */
    absoluteIndex: number;
    /** The character that caused the error */
    char: string;
    /** The reason for the error */
    reason: 'mismatched' | 'unclosed' | 'unmatched';
    /** The type of character that caused the error */
    type: 'bracket' | 'quote';
}

/**
 * Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
 *
 * This function processes text line by line, but only checks lines longer than 10 characters
 * for balance issues. It returns absolute positions that can be used with text editors
 * or highlighting components that need precise character positioning across the entire text.
 *
 * The absolute index accounts for newline characters between lines, providing accurate
 * positioning for the original text string.
 *
 * @param text - The multi-line text to analyze for balance errors
 * @returns Array of character errors with absolute positioning information
 *
 * @example
 * ```typescript
 * const text = 'Line 1 with "quote\nLine 2 with (bracket';
 * const errors = getUnbalancedErrors(text);
 * // Returns errors with absoluteIndex pointing to exact character positions
 * ```
 */
declare const getUnbalancedErrors: (text: string) => CharacterError[];

/**
 * Checks if all double quotes in a string are balanced.
 *
 * This is a convenience function that returns only the boolean result
 * without detailed error information.
 *
 * @param str - The string to check for quote balance
 * @returns True if quotes are balanced, false otherwise
 *
 * @example
 * ```typescript
 * areQuotesBalanced('Hello "world"') // true
 * areQuotesBalanced('Hello "world') // false
 * ```
 */
declare const areQuotesBalanced: (str: string) => boolean;

/**
 * Checks if all brackets in a string are properly balanced.
 *
 * This is a convenience function that returns only the boolean result
 * without detailed error information.
 *
 * @param str - The string to check for bracket balance
 * @returns True if brackets are balanced, false otherwise
 *
 * @example
 * ```typescript
 * areBracketsBalanced('(hello [world])') // true
 * areBracketsBalanced('(hello [world') // false
 * ```
 */
declare const areBracketsBalanced: (str: string) => boolean;

/**
 * Checks if both quotes and brackets are balanced in a string.
 *
 * This is a convenience function that returns only the boolean result
 * without detailed error information.
 *
 * @param str - The string to check for overall balance
 * @returns True if both quotes and brackets are balanced, false otherwise
 *
 * @example
 * ```typescript
 * isBalanced('Hello "world" and (test)') // true
 * isBalanced('Hello "world and (test') // false
 * ```
 */
declare const isBalanced: (str: string) => boolean;

/**
 * Checks if the given text contains invalid footnote references.
 * Invalid footnotes include empty parentheses "()" or OCR-confused characters
 * like ".1OV9" that were misrecognized instead of Arabic numerals.
 *
 * @param text - Text to check for invalid footnote patterns
 * @returns True if text contains invalid footnote references, false otherwise
 * @example
 * hasInvalidFootnotes('This text has ()') // Returns true
 * hasInvalidFootnotes('This text has (١)') // Returns false
 * hasInvalidFootnotes('OCR mistake (O)') // Returns true
 */
declare const hasInvalidFootnotes: (text: string) => boolean;

/**
 * A single line of text as accepted by correctReferences.
 */
type TextLine = {
    /** Optional flag marking the line as a footnote rather than body text */
    isFootnote?: boolean;
    /** The text content of the line */
    text: string;
};

/**
 * Corrects footnote references in an array of text lines by:
 * 1. Converting OCR-confused characters to proper Arabic numerals
 * 2. Filling in empty "()" references with appropriate numbers
 * 3. Ensuring footnote references in body text match those in footnotes
 * 4. Generating new reference numbers when needed
 *
 * @param lines - Array of text line objects, each with optional isFootnote flag
 * @returns Array of corrected text lines with proper footnote references
 * @example
 * const lines = [
 *   { text: 'Main text with ()', isFootnote: false },
 *   { text: '() This is a footnote', isFootnote: true }
 * ];
 * const corrected = correctReferences(lines);
 * // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
 */
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];

/**
 * Character statistics for analyzing text content and patterns
 */
type CharacterStats = {
    /** Number of Arabic script characters in the text */
    arabicCount: number;
    /** Map of character frequencies for repetition analysis */
    charFreq: Map<string, number>;
    /** Number of digit characters (0-9) in the text */
    digitCount: number;
    /** Number of Latin alphabet characters (a-z, A-Z) in the text */
    latinCount: number;
    /** Number of punctuation characters in the text */
    punctuationCount: number;
    /** Number of whitespace characters in the text */
    spaceCount: number;
    /** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
    symbolCount: number;
};

/**
 * Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
 * This function performs comprehensive analysis to identify patterns commonly associated
 * with OCR errors, formatting artifacts, or meaningless content in Arabic text processing.
 *
 * @param text - The input string to analyze for noise patterns
 * @returns true if the text is likely noise or unwanted content, false if it appears to be valid Arabic content
 *
 * @example
 * ```typescript
 * import { isArabicTextNoise } from 'baburchi';
 *
 * console.log(isArabicTextNoise('---')); // true (formatting artifact)
 * console.log(isArabicTextNoise('السلام عليكم')); // false (valid Arabic)
 * console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)
 * ```
 */
declare const isArabicTextNoise: (text: string) => boolean;

/**
 * Analyzes character composition and frequency statistics for the input text.
 * Categorizes characters by type (Arabic, Latin, digits, spaces, punctuation, symbols)
 * and tracks character frequency for pattern analysis.
 *
 * @param text - The text string to analyze
 * @returns CharacterStats object containing detailed character analysis
 *
 * @example
 * ```typescript
 * import { analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('مرحبا 123!');
 * console.log(stats.arabicCount); // 5
 * console.log(stats.digitCount); // 3
 * console.log(stats.symbolCount); // 1
 * ```
 */
declare function analyzeCharacterStats(text: string): CharacterStats;

/**
 * Detects excessive repetition of specific characters that commonly indicate noise.
 * Focuses on repetitive characters like exclamation marks, dots, dashes, equals signs,
 * and underscores that often appear in OCR artifacts or formatting elements.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param textLength - Total length of the original text
 * @returns true if excessive repetition is detected, false otherwise
 *
 * @example
 * ```typescript
 * import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('!!!!!');
 * console.log(hasExcessiveRepetition(stats, 5)); // true
 *
 * const normalStats = analyzeCharacterStats('hello world');
 * console.log(hasExcessiveRepetition(normalStats, 11)); // false
 * ```
 */
declare function hasExcessiveRepetition(charStats: CharacterStats, textLength: number): boolean;

/**
 * Identifies text that matches common noise patterns using regular expressions.
 * Detects patterns like repeated dashes, dot sequences, uppercase-only text,
 * digit-dash combinations, and other formatting artifacts commonly found in OCR output.
 *
 * @param text - The text string to check against noise patterns
 * @returns true if the text matches a basic noise pattern, false otherwise
 *
 * @example
 * ```typescript
 * import { isBasicNoisePattern } from 'baburchi';
 *
 * console.log(isBasicNoisePattern('---')); // true
 * console.log(isBasicNoisePattern('...')); // true
 * console.log(isBasicNoisePattern('ABC')); // true
 * console.log(isBasicNoisePattern('- 77')); // true
 * console.log(isBasicNoisePattern('hello world')); // false
 * ```
 */
declare function isBasicNoisePattern(text: string): boolean;

/**
 * Determines if non-Arabic content should be classified as noise based on various heuristics.
 * Analyzes symbol-to-content ratios, text length, spacing patterns, and content composition
 * to identify unwanted OCR artifacts or meaningless content.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param textLength - Total length of the original text
 * @param text - The original text string for additional pattern matching
 * @returns true if the content is likely noise, false if it appears to be valid content
 *
 * @example
 * ```typescript
 * import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats('!!!');
 * console.log(isNonArabicNoise(stats, 3, '!!!')); // true
 *
 * const validStats = analyzeCharacterStats('2023');
 * console.log(isNonArabicNoise(validStats, 4, '2023')); // false
 * ```
 */
declare function isNonArabicNoise(charStats: CharacterStats, textLength: number, text: string): boolean;

/**
 * Detects problematic spacing patterns that indicate noise or OCR artifacts.
 * Identifies cases where spacing is excessive relative to content, or where
 * single characters are surrounded by spaces in a way that suggests OCR errors.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param contentChars - Number of meaningful content characters (Arabic + Latin + digits)
 * @param textLength - Total length of the original text
 * @returns true if spacing patterns indicate noise, false otherwise
 *
 * @example
 * ```typescript
 * import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
 *
 * const stats = analyzeCharacterStats(' a ');
 * const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
 * console.log(isSpacingNoise(stats, contentChars, 3)); // true
 *
 * const normalStats = analyzeCharacterStats('hello world');
 * const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;
 * console.log(isSpacingNoise(normalStats, normalContent, 11)); // false
 * ```
 */
declare function isSpacingNoise(charStats: CharacterStats, contentChars: number, textLength: number): boolean;

/**
 * Validates whether Arabic content is substantial enough to be considered meaningful.
 * Uses character counts and text length to determine if Arabic text contains
 * sufficient content or if it's likely to be a fragment or OCR artifact.
 *
 * @param charStats - Character statistics from analyzeCharacterStats
 * @param textLength - Total length of the original text
 * @returns true if the Arabic content appears valid, false if it's likely noise
 *
 * @example
 * ```typescript
 * import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
 *
 * const validStats = analyzeCharacterStats('السلام عليكم');
 * console.log(isValidArabicContent(validStats, 12)); // true
 *
 * const shortStats = analyzeCharacterStats('ص');
 * console.log(isValidArabicContent(shortStats, 1)); // false
 *
 * const withDigitsStats = analyzeCharacterStats('ص 5');
 * console.log(isValidArabicContent(withDigitsStats, 3)); // true
 * ```
 */
declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;

/**
 * Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
 * The Levenshtein distance is the minimum number of single-character edits (insertions,
 * deletions, or substitutions) required to change one string into another.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Minimum edit distance between the two strings
 * @complexity Time: O(m*n), Space: O(min(m,n)) where m,n are string lengths
 * @example
 * calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
 * calculateLevenshteinDistance('', 'hello') // Returns 5
 */
declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;

/**
 * Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
 * Uses Levenshtein distance normalized by the length of the longer string.
 * A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.
 *
 * @param textA - First string to compare
 * @param textB - Second string to compare
 * @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
 * @example
 * calculateSimilarity('hello', 'hello') // Returns 1.0
 * calculateSimilarity('hello', 'help') // Returns 0.6
 */
declare const calculateSimilarity: (textA: string, textB: string) => number;

/**
 * Checks if two texts are similar after Arabic normalization.
 * Normalizes both texts by removing diacritics and decorative elements,
 * then compares their similarity against the provided threshold.
 *
 * @param textA - First text to compare
 * @param textB - Second text to compare
 * @param threshold - Similarity threshold (0.0 to 1.0); optional — the default value is not
 * visible in this declaration file, check the implementation
 * @returns True if normalized texts meet the similarity threshold
 * @example
 * areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true
 */
declare const areSimilarAfterNormalization: (textA: string, textB: string, threshold?: number) => boolean;

/**
 * Calculates alignment score for two tokens in sequence alignment.
 * Uses different scoring criteria: perfect match after normalization gets highest score,
 * typo symbols or highly similar tokens get soft match score, mismatches get penalty.
 *
 * @param tokenA - First token to score
 * @param tokenB - Second token to score
 * @param typoSymbols - Array of special symbols that get preferential treatment
 * @param similarityThreshold - Threshold for considering tokens highly similar
 * @returns Alignment score (higher is better match)
 * @example
 * calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
 * calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity
 */
declare const calculateAlignmentScore: (
    tokenA: string,
    tokenB: string,
    typoSymbols: string[],
    similarityThreshold: number,
) => number;

/**
 * One aligned position produced by sequence alignment: the token from the first
 * sequence paired with the token from the second, where null indicates a gap.
 */
type AlignedTokenPair = [null | string, null | string];

/**
 * One cell of the scoring matrix consumed by backtrackAlignment.
 */
type AlignmentCell = {
    /**
     * Directional indicator followed while backtracking through the matrix.
     * NOTE(review): null presumably marks a cell with no predecessor (e.g. the origin) — confirm.
     */
    direction: 'diagonal' | 'left' | 'up' | null;
    /** Alignment score stored in this cell */
    score: number;
};

/**
 * Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
 * Follows the directional indicators in the matrix to build the sequence of aligned
 * token pairs from the Needleman-Wunsch algorithm.
 *
 * @param matrix - Scoring matrix with directional information from alignment
 * @param tokensA - First sequence of tokens
 * @param tokensB - Second sequence of tokens
 * @returns Array of aligned token pairs, where null indicates a gap
 * @throws Error if invalid alignment direction is encountered
 */
declare const backtrackAlignment: (
    matrix: AlignmentCell[][],
    tokensA: string[],
    tokensB: string[],
) => AlignedTokenPair[];

/**
 * Performs global sequence alignment using the Needleman-Wunsch algorithm.
 * Aligns two token sequences to find the optimal pairing that maximizes
 * the total alignment score, handling insertions, deletions, and substitutions.
 *
 * @param tokensA - First sequence of tokens to align
 * @param tokensB - Second sequence of tokens to align
 * @param typoSymbols - Special symbols that affect scoring
 * @param similarityThreshold - Threshold for high similarity scoring
 * @returns Array of aligned token pairs, with null indicating gaps
 * @example
 * alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
 * // Returns [['a', 'a'], ['b', 'c']]
 */
declare const alignTokenSequences: (
    tokensA: string[],
    tokensB: string[],
    typoSymbols: string[],
    similarityThreshold: number,
) => AlignedTokenPair[];

/**
 * The literal string "اهـ" (U+0627 U+0647 U+0640: alef, heh, tatweel).
 * NOTE(review): presumably the canonical form that standardizeIntahaSymbol rewrites
 * a standalone "اه" into — confirm against the implementation.
 */
declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";

/**
 * Collection of regex patterns used throughout the library for text processing
 */
declare const PATTERNS: {
    /** Matches Arabic characters across all Unicode blocks */
    arabicCharacters: RegExp;
    /** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
    arabicDigits: RegExp;
    /** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
    arabicFootnoteReferenceRegex: RegExp;
    /** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
    arabicLettersAndDigits: RegExp;
    /** Matches Arabic punctuation marks and whitespace characters */
    arabicPunctuationAndWhitespace: RegExp;
    /** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
    arabicReferenceRegex: RegExp;
    /** Matches Arabic diacritical marks (harakat, tanween, etc.) */
    diacritics: RegExp;
    /** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
    footnoteEmbedded: RegExp;
    /** Matches standalone footnote markers at line start/end: ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
    footnoteStandalone: RegExp;
    /** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
    invalidReferenceRegex: RegExp;
    /** Matches OCR-confused footnote references at line start with characters like .1OV9 */
    ocrConfusedFootnoteReferenceRegex: RegExp;
    /** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
    ocrConfusedReferenceRegex: RegExp;
    /** Matches Arabic tatweel (kashida) character used for text stretching */
    tatweel: RegExp;
    /** Matches one or more whitespace characters */
    whitespace: RegExp;
};

/**
 * Normalizes Arabic text by removing diacritics and tatweel marks.
 * This normalization enables better text comparison by focusing on core characters
 * while ignoring decorative elements that don't affect meaning.
 *
 * NOTE(review): the summary above mentions only diacritics and tatweel, while the
 * return description also mentions "basic tags" — confirm which one matches the implementation.
 *
 * @param text - Arabic text to normalize
 * @returns Normalized text with diacritics, tatweel, and basic tags removed
 * @example
 * normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
 */
declare const normalizeArabicText: (text: string) => string;

/**
 * Extracts the first sequence of Arabic or Western digits from text.
 * Used primarily for footnote number comparison to match related footnote elements.
 *
 * @param text - Text containing digits to extract
 * @returns First digit sequence found, or empty string if none found
 * @example
 * extractDigits('(٥)أخرجه البخاري') // Returns '٥'
 * extractDigits('See note (123)') // Returns '123'
 */
declare const extractDigits: (text: string) => string;

/**
 * Tokenizes text into individual words while preserving special symbols.
 * Removes HTML tags, adds spacing around preserved symbols to ensure they
 * are tokenized separately, then splits on whitespace.
 *
 * @param text - Text to tokenize
 * @param preserveSymbols - Array of symbols that should be tokenized as separate tokens
 * @returns Array of tokens, or empty array if input is empty/whitespace
 * @example
 * tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
 */
declare const tokenizeText: (text: string, preserveSymbols?: string[]) => string[];

/**
 * Handles fusion of standalone and embedded footnotes during token processing.
 * Detects patterns where standalone footnotes should be merged with embedded ones
 * or where trailing standalone footnotes should be skipped.
 *
 * @param result - Current result array being built
 * @param previousToken - The previous token in the sequence
 * @param currentToken - The current token being processed
 * @returns True if the current token was handled (fused or skipped), false otherwise
 * @example
 * // (٥) + (٥)أخرجه → result gets (٥)أخرجه
 * // (٥)أخرجه + (٥) → (٥) is skipped
 */
declare const handleFootnoteFusion: (result: string[], previousToken: string, currentToken: string) => boolean;

/**
 * Handles selection logic for tokens with embedded footnotes during alignment.
 * Prefers tokens that contain embedded footnotes over plain text, and among
 * tokens with embedded footnotes, prefers the shorter one.
 *
 * @param tokenA - First token to compare
 * @param tokenB - Second token to compare
 * @returns Array containing selected token(s), or null if no special handling needed
 * @example
 * handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
 * handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
 */
declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null | string[];

/**
 * Handles selection logic for standalone footnote tokens during alignment.
 * Manages cases where one or both tokens are standalone footnotes, preserving
 * both tokens when one is a footnote and the other is regular text.
 *
 * @param tokenA - First token to compare
 * @param tokenB - Second token to compare
 * @returns Array containing selected token(s), or null if no special handling needed
 * @example
 * handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
 * handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
 */
declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];

/**
 * Standardizes standalone Hijri symbol ه to هـ when following Arabic digits
 * @param text - Input text to process
 * @returns Text with standardized Hijri symbols
 */
declare const standardizeHijriSymbol: (text: string) => string;

/**
 * Standardizes standalone اه to اهـ when appearing as whole word
 * @param text - Input text to process
 * @returns Text with standalone اه standardized to اهـ
 */
declare const standardizeIntahaSymbol: (text: string) => string;

/**
 * Processes text alignment between original and alternate OCR results to fix typos.
 * Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
 * then selects the best tokens and performs post-processing.
 *
 * @param originalText - Original OCR text that may contain typos
 * @param altText - Reference text from alternate OCR for comparison
 * @param options - Configuration options for alignment and selection
 * @returns Corrected text with typos fixed
 */
declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;

/**
 * Entry point for typo fixing: takes an original text and a correction text and returns a string.
 *
 * Accepts the same option fields as FixTypoOptions, except that
 * `highSimilarityThreshold` and `similarityThreshold` may be omitted; only
 * `typoSymbols` is required.
 *
 * NOTE(review): omitted thresholds presumably fall back to the defaults documented on
 * FixTypoOptions (0.9 and 0.7), and the work is presumably delegated to
 * processTextAlignment — neither is visible in this declaration file, confirm in the implementation.
 *
 * @param original - Text that may contain typos
 * @param correction - Text to compare `original` against
 * @param options - Optional thresholds plus the required `typoSymbols` array
 * @returns The resulting text
 */
declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;

// Type-only exports are included for every type that appears in an exported signature,
// so consumers can name option, result and callback types directly.
export {
    type AlignedTokenPair,
    type AlignmentCell,
    BRACKETS,
    type BalanceError,
    type BalanceResult,
    CLOSE_BRACKETS,
    type CharacterError,
    type CharacterStats,
    type FixTypoOptions,
    INTAHA_ACTUAL,
    OPEN_BRACKETS,
    PATTERNS,
    type TextLine,
    alignTextSegments,
    alignTokenSequences,
    analyzeCharacterStats,
    areBracketsBalanced,
    areQuotesBalanced,
    areSimilarAfterNormalization,
    backtrackAlignment,
    calculateAlignmentScore,
    calculateLevenshteinDistance,
    calculateSimilarity,
    checkBalance,
    correctReferences,
    extractDigits,
    fixTypo,
    getUnbalancedErrors,
    handleFootnoteFusion,
    handleFootnoteSelection,
    handleStandaloneFootnotes,
    hasExcessiveRepetition,
    hasInvalidFootnotes,
    isArabicTextNoise,
    isBalanced,
    isBasicNoisePattern,
    isNonArabicNoise,
    isSpacingNoise,
    isValidArabicContent,
    normalizeArabicText,
    processTextAlignment,
    standardizeHijriSymbol,
    standardizeIntahaSymbol,
    tokenizeText,
};