/*
* baburchi
* Version: (not specified)
* A lightweight TypeScript library designed to fix typos in OCR post-processing.
* 608 lines (599 loc) • 27.6 kB • TypeScript
*/
/**
* Configuration options for fixing typos in OCR text using alignment algorithms.
* These options control how text tokens are compared, aligned, and merged during typo correction.
*
* `processTextAlignment` takes the full object; `fixTypo` accepts the two thresholds
* as optional and requires only `typoSymbols`.
*/
type FixTypoOptions = {
/**
* High similarity threshold (0.0 to 1.0) for detecting and removing duplicate tokens.
* Used in post-processing to eliminate redundant tokens that are nearly identical.
* Should typically be higher than similarityThreshold to catch only very similar duplicates.
* @default 0.9
* @example 0.95 // Removes tokens that are 95% or more similar
*/
readonly highSimilarityThreshold: number;
/**
* Similarity threshold (0.0 to 1.0) for determining if two tokens should be aligned.
* Higher values require closer matches, lower values are more permissive.
* Used in the Needleman-Wunsch alignment algorithm for token matching.
* @default 0.7
* @example 0.8 // Requires 80% similarity for token alignment
*/
readonly similarityThreshold: number;
/**
* Array of special symbols that should be preserved during typo correction.
* These symbols (like honorifics or religious markers) take precedence in token selection.
* Note: `readonly` only prevents reassigning this property; the array itself is
* typed as a mutable `string[]`.
* @example ['ﷺ', '﷽', 'ﷻ'] // Common Arabic religious symbols
*/
readonly typoSymbols: string[];
};
/**
* Aligns split text segments to match target lines by finding the best order.
*
* This function handles cases where text lines have been split into segments
* and need to be merged back together in the correct order. It compares
* different arrangements of the segments against target lines to find the
* best match based on similarity scores.
*
* @param targetLines - Array of lines to align against. A falsy entry skips alignment for
* that position; with the declared `string[]` type the only falsy value is the empty string.
* @param segmentLines - Array of text segments that may represent split versions of target lines.
* @returns Array of aligned text lines
*/
declare const alignTextSegments: (targetLines: string[], segmentLines: string[]) => string[];
/**
* Represents an error found when checking balance of quotes or brackets in text.
* Instances are reported through `BalanceResult.errors`.
*/
type BalanceError = {
/** The character that caused the error */
char: string;
/** The position of the character in the string that was checked */
index: number;
/**
* The reason for the error.
* NOTE(review): the exact distinction between 'mismatched', 'unclosed' and 'unmatched'
* is decided by the implementation, which is not visible in this declaration.
*/
reason: 'mismatched' | 'unclosed' | 'unmatched';
/** The type of character that caused the error */
type: 'bracket' | 'quote';
};
/**
* Result of a balance check operation, as returned by `checkBalance`.
*/
type BalanceResult = {
/** Array of errors found during balance checking (empty in the balanced `checkBalance` example) */
errors: BalanceError[];
/** Whether the text is properly balanced */
isBalanced: boolean;
};
/**
* Mapping of opening brackets to their corresponding closing brackets.
* The keys are `«` (written as the escape `\u00AB`), `(`, `[` and `{`.
* The values are typed only as `string`, so the closing characters cannot be read
* from this declaration.
*/
declare const BRACKETS: {
'\u00AB': string;
'(': string;
'[': string;
'{': string;
};
/** Set of all opening bracket characters */
declare const OPEN_BRACKETS: Set<string>;
/** Set of all closing bracket characters */
declare const CLOSE_BRACKETS: Set<string>;
/**
* Checks if both quotes and brackets are balanced in a string and returns detailed error information.
*
* This function combines the results of both quote and bracket balance checking,
* providing a comprehensive analysis of all balance issues in the text.
* The errors are sorted by their position in the string for easier debugging.
* Use `isBalanced` when only the boolean outcome is needed.
*
* @param str - The string to check for overall balance
* @returns An object containing combined balance status and all errors found, sorted by position
*
* @example
* ```typescript
* checkBalance('Hello "world" and (test)') // { errors: [], isBalanced: true }
* checkBalance('Hello "world and (test') // { errors: [...], isBalanced: false }
* ```
*/
declare const checkBalance: (str: string) => BalanceResult;
/**
* A balance error with an absolute character position, for use with HighlightableTextarea.
*
* This interface mirrors `BalanceError` (same `char`, `reason` and `type` fields) but is
* declared independently rather than extending it, and carries `absoluteIndex` instead of
* `index`. The absolute position spans multiple lines of text, making it suitable for
* text editors and syntax highlighters that need precise character positioning.
*/
interface CharacterError {
/** Absolute character position from the start of the entire text */
absoluteIndex: number;
/** The character that caused the error */
char: string;
/** The reason for the error */
reason: 'mismatched' | 'unclosed' | 'unmatched';
/** The type of character that caused the error */
type: 'bracket' | 'quote';
}
/**
* Gets detailed character-level errors for unbalanced quotes and brackets in multi-line text.
*
* This function processes text line by line, but only checks lines longer than 10 characters
* for balance issues; shorter lines are never reported. It returns absolute positions that
* can be used with text editors or highlighting components that need precise character
* positioning across the entire text.
*
* The absolute index accounts for newline characters between lines, providing accurate
* positioning for the original text string.
*
* @param text - The multi-line text to analyze for balance errors
* @returns Array of character errors with absolute positioning information
*
* @example
* ```typescript
* // Both lines are longer than 10 characters, so both are checked.
* const text = 'Line 1 with "quote\nLine 2 with (bracket';
* const errors = getUnbalancedErrors(text);
* // Returns errors with absoluteIndex pointing to exact character positions
* ```
*/
declare const getUnbalancedErrors: (text: string) => CharacterError[];
/**
* Checks if all double quotes in a string are balanced.
*
* This is a convenience function that returns only the boolean result
* without detailed error information. Use `checkBalance` when the individual
* errors are needed.
*
* @param str - The string to check for quote balance
* @returns True if quotes are balanced, false otherwise
*
* @example
* ```typescript
* areQuotesBalanced('Hello "world"') // true
* areQuotesBalanced('Hello "world') // false
* ```
*/
declare const areQuotesBalanced: (str: string) => boolean;
/**
* Checks if all brackets in a string are properly balanced.
*
* This is a convenience function that returns only the boolean result
* without detailed error information. Use `checkBalance` when the individual
* errors are needed.
*
* @param str - The string to check for bracket balance
* @returns True if brackets are balanced, false otherwise
*
* @example
* ```typescript
* areBracketsBalanced('(hello [world])') // true
* areBracketsBalanced('(hello [world') // false
* ```
*/
declare const areBracketsBalanced: (str: string) => boolean;
/**
* Checks if both quotes and brackets are balanced in a string.
*
* This is a convenience function that returns only the boolean result
* without detailed error information. Use `checkBalance` when the individual
* errors are needed.
*
* @param str - The string to check for overall balance
* @returns True if both quotes and brackets are balanced, false otherwise
*
* @example
* ```typescript
* isBalanced('Hello "world" and (test)') // true
* isBalanced('Hello "world and (test') // false
* ```
*/
declare const isBalanced: (str: string) => boolean;
/**
* Checks if the given text contains invalid footnote references.
* Invalid footnotes include empty parentheses "()" and references whose content is an
* OCR-confused character (the documented set is ".1OV9", e.g. "(O)") that was
* misrecognized in place of Arabic numerals.
*
* @param text - Text to check for invalid footnote patterns
* @returns True if text contains invalid footnote references, false otherwise
* @example
* hasInvalidFootnotes('This text has ()') // Returns true
* hasInvalidFootnotes('This text has (١)') // Returns false
* hasInvalidFootnotes('OCR mistake (O)') // Returns true
*/
declare const hasInvalidFootnotes: (text: string) => boolean;
/**
* A single line of text as consumed by `correctReferences`.
*/
type TextLine = {
/** Optional flag marking the line as a footnote line rather than body text */
isFootnote?: boolean;
/** The text content of the line */
text: string;
};
/**
* Corrects footnote references in an array of text lines by:
* 1. Converting OCR-confused characters to proper Arabic numerals
* 2. Filling in empty "()" references with appropriate numbers
* 3. Ensuring footnote references in body text match those in footnotes
* 4. Generating new reference numbers when needed
*
* The function is generic over the line type, so any extra fields callers carry on
* their line objects are preserved in the return type.
* NOTE(review): whether the input array/objects are mutated or copied is not visible
* from this declaration.
*
* @param lines - Array of text line objects, each with optional isFootnote flag
* @returns Array of corrected text lines with proper footnote references
* @example
* const lines = [
* { text: 'Main text with ()', isFootnote: false },
* { text: '() This is a footnote', isFootnote: true }
* ];
* const corrected = correctReferences(lines);
* // Returns lines with "()" replaced by proper Arabic numerals like "(١)"
*/
declare const correctReferences: <T extends TextLine>(lines: T[]) => T[];
/**
* Character statistics for analyzing text content and patterns.
* Produced by `analyzeCharacterStats` and consumed by the noise-detection helpers.
*/
type CharacterStats = {
/** Number of Arabic script characters in the text */
arabicCount: number;
/** Map of character frequencies for repetition analysis */
charFreq: Map<string, number>;
/** Number of digit characters (0-9) in the text */
digitCount: number;
/** Number of Latin alphabet characters (a-z, A-Z) in the text */
latinCount: number;
/**
* Number of punctuation characters in the text.
* NOTE(review): the exact punctuation set is not visible here; the `analyzeCharacterStats`
* example counts '!' under `symbolCount`, so it is evidently not part of that set.
*/
punctuationCount: number;
/** Number of whitespace characters in the text */
spaceCount: number;
/** Number of symbol characters (non-alphanumeric, non-punctuation) in the text */
symbolCount: number;
};
/**
* Determines if a given Arabic text string is likely to be noise or unwanted OCR artifacts.
* This function performs comprehensive analysis to identify patterns commonly associated
* with OCR errors, formatting artifacts, or meaningless content in Arabic text processing.
*
* The individual heuristics (`isBasicNoisePattern`, `hasExcessiveRepetition`,
* `isNonArabicNoise`, `isSpacingNoise`, `isValidArabicContent`) are exported separately;
* presumably they are the building blocks of this check — confirm against the implementation.
*
* @param text - The input string to analyze for noise patterns
* @returns true if the text is likely noise or unwanted content, false if it appears to be valid Arabic content
*
* @example
* ```typescript
* import { isArabicTextNoise } from 'baburchi';
*
* console.log(isArabicTextNoise('---')); // true (formatting artifact)
* console.log(isArabicTextNoise('السلام عليكم')); // false (valid Arabic)
* console.log(isArabicTextNoise('ABC')); // true (uppercase pattern)
* ```
*/
declare const isArabicTextNoise: (text: string) => boolean;
/**
* Analyzes character composition and frequency statistics for the input text.
* Categorizes characters by type (Arabic, Latin, digits, spaces, punctuation, symbols)
* and tracks character frequency for pattern analysis.
*
* @param text - The text string to analyze
* @returns CharacterStats object containing detailed character analysis
*
* @example
* ```typescript
* import { analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats('مرحبا 123!');
* console.log(stats.arabicCount); // 5
* console.log(stats.digitCount); // 3
* console.log(stats.symbolCount); // 1 ('!' is counted as a symbol, not as punctuation)
* ```
*/
declare function analyzeCharacterStats(text: string): CharacterStats;
/**
* Detects excessive repetition of specific characters that commonly indicate noise.
* Focuses on repetitive characters like exclamation marks, dots, dashes, equals signs,
* and underscores that often appear in OCR artifacts or formatting elements.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param textLength - Total length of the original text (the examples pass the length of
* the same string that was given to analyzeCharacterStats)
* @returns true if excessive repetition is detected, false otherwise
*
* @example
* ```typescript
* import { hasExcessiveRepetition, analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats('!!!!!');
* console.log(hasExcessiveRepetition(stats, 5)); // true
*
* const normalStats = analyzeCharacterStats('hello world');
* console.log(hasExcessiveRepetition(normalStats, 11)); // false
* ```
*/
declare function hasExcessiveRepetition(charStats: CharacterStats, textLength: number): boolean;
/**
* Identifies text that matches common noise patterns using regular expressions.
* Detects patterns like repeated dashes, dot sequences, uppercase-only text,
* digit-dash combinations, and other formatting artifacts commonly found in OCR output.
* Unlike the other noise helpers, this check works on the raw text alone and needs no
* CharacterStats.
*
* @param text - The text string to check against noise patterns
* @returns true if the text matches a basic noise pattern, false otherwise
*
* @example
* ```typescript
* import { isBasicNoisePattern } from 'baburchi';
*
* console.log(isBasicNoisePattern('---')); // true
* console.log(isBasicNoisePattern('...')); // true
* console.log(isBasicNoisePattern('ABC')); // true
* console.log(isBasicNoisePattern('- 77')); // true
* console.log(isBasicNoisePattern('hello world')); // false
* ```
*/
declare function isBasicNoisePattern(text: string): boolean;
/**
* Determines if non-Arabic content should be classified as noise based on various heuristics.
* Analyzes symbol-to-content ratios, text length, spacing patterns, and content composition
* to identify unwanted OCR artifacts or meaningless content.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param textLength - Total length of the original text
* @param text - The original text string for additional pattern matching; the examples pass
* the same string that produced `charStats`
* @returns true if the content is likely noise, false if it appears to be valid content
*
* @example
* ```typescript
* import { isNonArabicNoise, analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats('!!!');
* console.log(isNonArabicNoise(stats, 3, '!!!')); // true
*
* const validStats = analyzeCharacterStats('2023');
* console.log(isNonArabicNoise(validStats, 4, '2023')); // false
* ```
*/
declare function isNonArabicNoise(charStats: CharacterStats, textLength: number, text: string): boolean;
/**
* Detects problematic spacing patterns that indicate noise or OCR artifacts.
* Identifies cases where spacing is excessive relative to content, or where
* single characters are surrounded by spaces in a way that suggests OCR errors.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param contentChars - Number of meaningful content characters (Arabic + Latin + digits);
* the caller computes this sum from `charStats`, as shown in the example
* @param textLength - Total length of the original text
* @returns true if spacing patterns indicate noise, false otherwise
*
* @example
* ```typescript
* import { isSpacingNoise, analyzeCharacterStats } from 'baburchi';
*
* const stats = analyzeCharacterStats(' a ');
* const contentChars = stats.arabicCount + stats.latinCount + stats.digitCount;
* console.log(isSpacingNoise(stats, contentChars, 3)); // true
*
* const normalStats = analyzeCharacterStats('hello world');
* const normalContent = normalStats.arabicCount + normalStats.latinCount + normalStats.digitCount;
* console.log(isSpacingNoise(normalStats, normalContent, 11)); // false
* ```
*/
declare function isSpacingNoise(charStats: CharacterStats, contentChars: number, textLength: number): boolean;
/**
* Validates whether Arabic content is substantial enough to be considered meaningful.
* Uses character counts and text length to determine if Arabic text contains
* sufficient content or if it's likely to be a fragment or OCR artifact.
* As the examples show, a single Arabic letter on its own is rejected, while the same
* letter accompanied by a digit is accepted.
*
* @param charStats - Character statistics from analyzeCharacterStats
* @param textLength - Total length of the original text
* @returns true if the Arabic content appears valid, false if it's likely noise
*
* @example
* ```typescript
* import { isValidArabicContent, analyzeCharacterStats } from 'baburchi';
*
* const validStats = analyzeCharacterStats('السلام عليكم');
* console.log(isValidArabicContent(validStats, 12)); // true
*
* const shortStats = analyzeCharacterStats('ص');
* console.log(isValidArabicContent(shortStats, 1)); // false
*
* const withDigitsStats = analyzeCharacterStats('ص 5');
* console.log(isValidArabicContent(withDigitsStats, 3)); // true
* ```
*/
declare function isValidArabicContent(charStats: CharacterStats, textLength: number): boolean;
/**
* Calculates Levenshtein distance between two strings using space-optimized dynamic programming.
* The Levenshtein distance is the minimum number of single-character edits (insertions,
* deletions, or substitutions) required to change one string into another.
*
* @remarks
* Documented complexity — time: O(m*n), space: O(min(m,n)), where m and n are the string lengths.
*
* @param textA - First string to compare
* @param textB - Second string to compare
* @returns Minimum edit distance between the two strings
* @example
* calculateLevenshteinDistance('kitten', 'sitting') // Returns 3
* calculateLevenshteinDistance('', 'hello') // Returns 5
*/
declare const calculateLevenshteinDistance: (textA: string, textB: string) => number;
/**
* Calculates similarity ratio between two strings as a value between 0.0 and 1.0.
* Uses Levenshtein distance normalized by the length of the longer string.
* A ratio of 1.0 indicates identical strings, 0.0 indicates completely different strings.
* NOTE(review): the result for two empty strings (longer length 0) is not documented — confirm.
*
* @param textA - First string to compare
* @param textB - Second string to compare
* @returns Similarity ratio from 0.0 (completely different) to 1.0 (identical)
* @example
* calculateSimilarity('hello', 'hello') // Returns 1.0
* calculateSimilarity('hello', 'help') // Returns 0.6 (distance 2, longer length 5: 1 - 2/5)
*/
declare const calculateSimilarity: (textA: string, textB: string) => number;
/**
* Checks if two texts are similar after Arabic normalization.
* Normalizes both texts by removing diacritics and decorative elements,
* then compares their similarity against the provided threshold.
*
* @param textA - First text to compare
* @param textB - Second text to compare
* @param threshold - Similarity threshold (0.0 to 1.0). Optional; the default used when it is
* omitted is set by the implementation and is not visible in this declaration.
* @returns True if normalized texts meet the similarity threshold
* @example
* areSimilarAfterNormalization('السَّلام', 'السلام', 0.9) // Returns true
*/
declare const areSimilarAfterNormalization: (textA: string, textB: string, threshold?: number) => boolean;
/**
* Calculates alignment score for two tokens in sequence alignment.
* Uses different scoring criteria: perfect match after normalization gets highest score,
* typo symbols or highly similar tokens get soft match score, mismatches get penalty.
* Per the examples below, those three scores are 2, 1 and -2 respectively.
*
* @param tokenA - First token to score
* @param tokenB - Second token to score
* @param typoSymbols - Array of special symbols that get preferential treatment
* @param similarityThreshold - Threshold for considering tokens highly similar
* @returns Alignment score (higher is better match)
* @example
* calculateAlignmentScore('hello', 'hello', [], 0.8) // Returns 2 (perfect match)
* calculateAlignmentScore('hello', 'help', [], 0.8) // Returns 1 or -2 based on similarity
* // NOTE(review): calculateSimilarity('hello', 'help') is documented as 0.6, which is below 0.8,
* // so this particular call should yield the mismatch penalty (-2) — confirm against implementation.
*/
declare const calculateAlignmentScore: (tokenA: string, tokenB: string, typoSymbols: string[], similarityThreshold: number) => number;
/**
* One column of a sequence alignment: a token from the first sequence paired with a token
* from the second. A `null` on either side indicates a gap in that sequence.
*/
type AlignedTokenPair = [null | string, null | string];
/**
* One cell of the alignment scoring matrix consumed by `backtrackAlignment`.
*/
type AlignmentCell = {
/**
* Direction to follow when backtracking from this cell, or null when there is none.
* NOTE(review): which of 'left' / 'up' corresponds to a gap in which sequence is not
* visible in this declaration.
*/
direction: 'diagonal' | 'left' | 'up' | null;
/** Alignment score stored for this cell */
score: number;
};
/**
* Backtracks through the scoring matrix to reconstruct optimal sequence alignment.
* Follows the directional indicators in the matrix to build the sequence of aligned
* token pairs from the Needleman-Wunsch algorithm.
*
* @param matrix - Scoring matrix with directional information from alignment
* (presumably the matrix built inside `alignTokenSequences` for the same two sequences — confirm)
* @param tokensA - First sequence of tokens
* @param tokensB - Second sequence of tokens
* @returns Array of aligned token pairs, where null indicates a gap
* @throws Error if invalid alignment direction is encountered
*/
declare const backtrackAlignment: (matrix: AlignmentCell[][], tokensA: string[], tokensB: string[]) => AlignedTokenPair[];
/**
* Performs global sequence alignment using the Needleman-Wunsch algorithm.
* Aligns two token sequences to find the optimal pairing that maximizes
* the total alignment score, handling insertions, deletions, and substitutions.
*
* @param tokensA - First sequence of tokens to align
* @param tokensB - Second sequence of tokens to align
* @param typoSymbols - Special symbols that affect scoring
* @param similarityThreshold - Threshold for high similarity scoring
* @returns Array of aligned token pairs, with null indicating gaps; each pair is
* `[token from tokensA, token from tokensB]`, as in the example
* @example
* alignTokenSequences(['a', 'b'], ['a', 'c'], [], 0.8)
* // Returns [['a', 'a'], ['b', 'c']]
*/
declare const alignTokenSequences: (tokensA: string[], tokensB: string[], typoSymbols: string[], similarityThreshold: number) => AlignedTokenPair[];
/**
* The string "اهـ": alef (U+0627), heh (U+0647), tatweel (U+0640).
* This is the form that `standardizeIntahaSymbol` is documented to produce from a standalone "اه".
*/
declare const INTAHA_ACTUAL = "\u0627\u0647\u0640";
/**
* Collection of regex patterns used throughout the library for text processing.
* NOTE(review): regex flags are not visible in this declaration. If any pattern carries the
* `g` or `y` flag, `test`/`exec` on the shared instance is stateful via `lastIndex` — confirm
* before reusing these objects directly.
*/
declare const PATTERNS: {
/** Matches Arabic characters across all Unicode blocks */
arabicCharacters: RegExp;
/** Matches Arabic-Indic digits (٠-٩) and Western digits (0-9) */
arabicDigits: RegExp;
/** Matches footnote references at the start of a line with Arabic-Indic digits: ^\([\u0660-\u0669]+\) */
arabicFootnoteReferenceRegex: RegExp;
/** Matches Arabic letters and digits (both Western 0-9 and Arabic-Indic ٠-٩) */
arabicLettersAndDigits: RegExp;
/** Matches Arabic punctuation marks and whitespace characters */
arabicPunctuationAndWhitespace: RegExp;
/** Matches footnote references with Arabic-Indic digits in parentheses: \([\u0660-\u0669]+\) */
arabicReferenceRegex: RegExp;
/** Matches Arabic diacritical marks (harakat, tanween, etc.) */
diacritics: RegExp;
/** Matches embedded footnotes within text: \([0-9\u0660-\u0669]+\) */
footnoteEmbedded: RegExp;
/** Matches a token that consists solely of a footnote marker, with optional parentheses and an optional trailing "،" or ".": ^\(?[0-9\u0660-\u0669]+\)?[،.]?$ */
footnoteStandalone: RegExp;
/** Matches invalid/problematic footnote references: empty "()" or OCR-confused endings */
invalidReferenceRegex: RegExp;
/** Matches OCR-confused footnote references at line start with characters like .1OV9 */
ocrConfusedFootnoteReferenceRegex: RegExp;
/** Matches OCR-confused footnote references with characters commonly misread as Arabic digits */
ocrConfusedReferenceRegex: RegExp;
/** Matches Arabic tatweel (kashida) character used for text stretching */
tatweel: RegExp;
/** Matches one or more whitespace characters */
whitespace: RegExp;
};
/**
* Normalizes Arabic text by removing diacritics and tatweel marks.
* This normalization enables better text comparison by focusing on core characters
* while ignoring decorative elements that don't affect meaning.
*
* @param text - Arabic text to normalize
* @returns Normalized text with diacritics and tatweel removed.
* NOTE(review): earlier documentation also listed "basic tags" as removed, which the summary
* above does not mention — confirm whether tag stripping happens here.
* @example
* normalizeArabicText('اَلسَّلَامُ عَلَيْكُمْ') // Returns 'السلام عليكم'
*/
declare const normalizeArabicText: (text: string) => string;
/**
* Extracts the first sequence of Arabic or Western digits from text.
* Used primarily for footnote number comparison to match related footnote elements.
* Only the digits are returned; surrounding parentheses are not included (see examples).
*
* @param text - Text containing digits to extract
* @returns First digit sequence found, or empty string if none found
* @example
* extractDigits('(٥)أخرجه البخاري') // Returns '٥'
* extractDigits('See note (123)') // Returns '123'
*/
declare const extractDigits: (text: string) => string;
/**
* Tokenizes text into individual words while preserving special symbols.
* Removes HTML tags, adds spacing around preserved symbols to ensure they
* are tokenized separately, then splits on whitespace.
*
* @param text - Text to tokenize
* @param preserveSymbols - Array of symbols that should be tokenized as separate tokens.
* Optional; presumably no symbols are preserved when omitted — confirm.
* @returns Array of tokens, or empty array if input is empty/whitespace
* @example
* tokenizeText('Hello ﷺ world', ['ﷺ']) // Returns ['Hello', 'ﷺ', 'world']
*/
declare const tokenizeText: (text: string, preserveSymbols?: string[]) => string[];
/**
* Handles fusion of standalone and embedded footnotes during token processing.
* Detects patterns where standalone footnotes should be merged with embedded ones
* or where trailing standalone footnotes should be skipped.
*
* @param result - Current result array being built. The examples describe `result` as
* receiving the fused token, so this array appears to be modified in place — confirm.
* @param previousToken - The previous token in the sequence
* @param currentToken - The current token being processed
* @returns True if the current token was handled (fused or skipped), false otherwise;
* on false the caller is presumably expected to append `currentToken` itself — confirm
* @example
* // (٥) + (٥)أخرجه → result gets (٥)أخرجه
* // (٥)أخرجه + (٥) → (٥) is skipped
*/
declare const handleFootnoteFusion: (result: string[], previousToken: string, currentToken: string) => boolean;
/**
* Handles selection logic for tokens with embedded footnotes during alignment.
* Prefers tokens that contain embedded footnotes over plain text, and among
* tokens with embedded footnotes, prefers the shorter one.
*
* @param tokenA - First token to compare
* @param tokenB - Second token to compare
* @returns Array containing selected token(s), or null if no special handling needed
* (i.e. the caller should fall back to its other selection rules)
* @example
* handleFootnoteSelection('text', '(١)text') // Returns ['(١)text']
* handleFootnoteSelection('(١)longtext', '(١)text') // Returns ['(١)text']
*/
declare const handleFootnoteSelection: (tokenA: string, tokenB: string) => null | string[];
/**
* Handles selection logic for standalone footnote tokens during alignment.
* Manages cases where one or both tokens are standalone footnotes, preserving
* both tokens when one is a footnote and the other is regular text.
*
* @param tokenA - First token to compare
* @param tokenB - Second token to compare
* @returns Array containing selected token(s), or null if no special handling needed
* @example
* handleStandaloneFootnotes('(١)', 'text') // Returns ['(١)', 'text']
* handleStandaloneFootnotes('(١)', '(٢)') // Returns ['(١)'] (shorter one)
* // NOTE(review): both tokens in the second example have the same length, so the result
* // suggests tokenA wins ties — confirm against implementation.
*/
declare const handleStandaloneFootnotes: (tokenA: string, tokenB: string) => null | string[];
/**
* Standardizes standalone Hijri symbol ه to هـ (ه followed by a tatweel) when following Arabic digits.
* NOTE(review): whether "Arabic digits" covers Western 0-9, Arabic-Indic ٠-٩, or both is not
* visible in this declaration.
*
* @param text - Input text to process
* @returns Text with standardized Hijri symbols
*/
declare const standardizeHijriSymbol: (text: string) => string;
/**
* Standardizes standalone اه to اهـ when appearing as whole word.
* The target form اهـ is the same string as the exported `INTAHA_ACTUAL` constant.
*
* @param text - Input text to process
* @returns Text with each standalone اه replaced by اهـ
*/
declare const standardizeIntahaSymbol: (text: string) => string;
/**
* Processes text alignment between original and alternate OCR results to fix typos.
* Uses the Needleman-Wunsch sequence alignment algorithm to align tokens,
* then selects the best tokens and performs post-processing.
*
* Unlike `fixTypo`, every field of `options` is required here.
*
* @param originalText - Original OCR text that may contain typos
* @param altText - Reference text from alternate OCR for comparison
* @param options - Configuration options for alignment and selection
* @returns Corrected text with typos fixed
*/
declare const processTextAlignment: (originalText: string, altText: string, options: FixTypoOptions) => string;
/**
* Fixes typos in a piece of OCR text by comparing it against a second version of the same text.
* NOTE(review): the implementation is not visible in this declaration; presumably this is the
* high-level entry point that fills in default thresholds and delegates to
* `processTextAlignment` — confirm.
*
* @param original - Text to correct
* @param correction - Text to compare against
* @param options - `typoSymbols` is required; `highSimilarityThreshold` and
* `similarityThreshold` are optional (see `FixTypoOptions` for their documented defaults)
* @returns The resulting text
*/
declare const fixTypo: (original: string, correction: string, { highSimilarityThreshold, similarityThreshold, typoSymbols, }: Partial<FixTypoOptions> & Pick<FixTypoOptions, "typoSymbols">) => string;
// Public API. The types that appear in exported signatures (options, results, stats and
// alignment shapes) are exported as type-only bindings so consumers can name them directly
// instead of recovering them through ReturnType/Parameters.
export { type AlignedTokenPair, type AlignmentCell, BRACKETS, type BalanceError, type BalanceResult, CLOSE_BRACKETS, type CharacterError, type CharacterStats, type FixTypoOptions, INTAHA_ACTUAL, OPEN_BRACKETS, PATTERNS, type TextLine, alignTextSegments, alignTokenSequences, analyzeCharacterStats, areBracketsBalanced, areQuotesBalanced, areSimilarAfterNormalization, backtrackAlignment, calculateAlignmentScore, calculateLevenshteinDistance, calculateSimilarity, checkBalance, correctReferences, extractDigits, fixTypo, getUnbalancedErrors, handleFootnoteFusion, handleFootnoteSelection, handleStandaloneFootnotes, hasExcessiveRepetition, hasInvalidFootnotes, isArabicTextNoise, isBalanced, isBasicNoisePattern, isNonArabicNoise, isSpacingNoise, isValidArabicContent, normalizeArabicText, processTextAlignment, standardizeHijriSymbol, standardizeIntahaSymbol, tokenizeText };