UNPKG

flappa-doormal

Version:

Arabic text marker pattern library for generating regex from declarative configurations

460 lines 16 kB
//#region src/types.d.ts
/**
 * Numbering styles for markers
 */
type NumberingStyle = 'arabic-indic' | 'latin';

/**
 * Separator styles for markers
 */
type SeparatorStyle = 'dash' | 'dot' | 'paren' | 'colon' | 'none';

/**
 * Marker types for text segmentation
 */
type MarkerType =
  | 'numbered'
  | 'bullet'
  | 'heading'
  | 'pattern'
  | 'bab'
  | 'hadith-chain'
  | 'basmala'
  | 'phrase'
  | 'square-bracket'
  | 'num-letter'
  | 'num-paren'
  | 'num-slash';

/**
 * Configuration for a single marker pattern
 */
type MarkerConfig = {
  /** The type of marker to look for */
  type: MarkerType;
  /** For numbered markers, the digit style */
  numbering?: NumberingStyle;
  /** The separator that follows the marker */
  separator?: SeparatorStyle | string;
  /**
   * Template format for numbered markers using token syntax.
   * Example: '{bullet}+ {num} {dash}'
   * Only valid when type is 'numbered'.
   */
  format?: string;
  /**
   * For 'pattern' type, provide a template using tokens like {num}, {dash}, {bullet}.
   * For raw regex patterns that don't use templates, provide the raw pattern string here.
   * Example: '{bullet}? {num}+ {s}{dash}' or '^[•*°]? ([\\u0660-\\u0669]+\\s?[-–—ـ].*)'
   */
  template?: string;
  /**
   * Alternative to template: raw regex pattern string (for 'pattern' type only).
   * Use this for complex patterns that can't be expressed with templates.
   * The pattern should have a capture group for the content.
   * Example: '^CUSTOM: (.*)'
   */
  pattern?: string;
  /**
   * Custom token map for advanced users.
   * Extends the default TOKENS with additional definitions.
   */
  tokens?: Record<string, string>;
  /**
   * List of phrases for 'phrase' and 'hadith-chain' types.
   * For 'hadith-chain', defaults to common narrator patterns if not provided.
   */
  phrases?: string[];
  /**
   * Optional: Only apply this marker after a specific page number.
   * Useful for books with different formatting in front matter vs main content.
   */
  minPage?: number;
  /**
   * Optional: Arbitrary metadata to attach to entries matched by this marker.
   * This allows for agnostic handling of entry properties.
   * Example: { type: 0, category: 'hadith' }
   *
   * Note: values are typed as `any`, so the compiler performs no checking on
   * them; narrow each value before relying on its shape.
   */
  metadata?: Record<string, any>;
};
//#endregion

//#region src/markers/defaults.d.ts
/**
 * Default numbering style for markers
 */
declare const DEFAULT_NUMBERING: NumberingStyle;

/**
 * Default separator style for markers
 */
declare const DEFAULT_SEPARATOR: SeparatorStyle;

/**
 * Default separator pattern (used when separator is a custom string)
 */
declare const DEFAULT_SEPARATOR_PATTERN = "[-\u2013\u2014\u0640]";

/**
 * Numbering patterns mapped by style
 */
declare const NUMBERING_PATTERNS: Record<NumberingStyle, string>;

/**
 * Separator patterns mapped by style
 */
declare const SEPARATOR_PATTERNS: Record<SeparatorStyle, string>;
//#endregion

//#region src/markers/generator.d.ts
/**
 * Generates a regex pattern from a marker configuration.
 * Always returns a regex with three named capture groups:
 * - full: Complete match including marker
 * - marker: Just the marker part (for metadata/indexing)
 * - content: Clean content without marker (for LLM processing)
 *
 * This function applies all default values before delegating to type-specific generators.
 *
 * @param config - Marker configuration
 * @returns Regular expression with named groups
 *
 * @example
 * const regex = generateRegexFromMarker({ type: 'numbered' });
 * const match = regex.exec('٥ - نص');
 * match.groups.full    // "٥ - نص"
 * match.groups.marker  // "٥ -"
 * match.groups.content // "نص"
 */
declare function generateRegexFromMarker(config: MarkerConfig): RegExp;
//#endregion

//#region src/markers/presets.d.ts
/**
 * Default phrase lists for preset marker types.
 * Export these so users can extend them.
 */

/**
 * Common hadith narrator phrases (diacritic-insensitive)
 * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
 */
declare const DEFAULT_HADITH_PHRASES: readonly ["حَدَّثَنَا", "حدثنا", "أَخْبَرَنَا", "حدثني", "حدَّثني", "وحدثنا", "حُدِّثت عن", "وحَدَّثَنَا"];

/**
 * Common basmala patterns
 * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
 */
declare const DEFAULT_BASMALA_PATTERNS: readonly ["بسم الله", "\\[بسم", "\\[تم"];
//#endregion

//#region src/markers/tokens.d.ts
/**
 * Token definitions for pattern templates.
 * Tokens provide a readable alternative to raw regex patterns.
 */

/**
 * Standard tokens for building marker patterns.
 * Use these in templates like: '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'
 */
declare const TOKENS: {
  readonly bullet: "[•*°]";
  readonly colon: ":";
  readonly comma: "،";
  readonly content: "(.*)";
  readonly dash: "[-–—ـ]";
  readonly dot: "\\.";
  readonly latin: "\\d+";
  readonly letter: "[أ-ي]";
  readonly num: "[\\u0660-\\u0669]+";
  readonly paren: "\\)";
  readonly s: "\\s?";
  readonly slash: "/";
  readonly space: "\\s+";
};

type TokenMap = Record<string, string>;
//#endregion

//#region src/markers/template-parser.d.ts
/**
 * Result of template validation
 */
interface ValidationResult {
  valid: boolean;
  errors?: string[];
}

/**
 * Options for template expansion
 */
interface ExpandOptions {
  /** Custom token map to use instead of default TOKENS */
  tokens?: TokenMap;
}

/**
 * Expands a template string into a regex pattern using named capture groups.
 * Always creates three groups: full (entire match), marker (just the marker), content (clean text).
 *
 * The content group uses [\s\S]*? (non-greedy) to match across newlines but stop at next marker.
 *
 * @param template - Template string with {token} placeholders
 * @param options - Optional configuration
 * @returns Regex pattern string with named groups
 *
 * @example
 * expandTemplate('{num} {dash}')
 * // Returns: ^(?<full>(?<marker>[\\u0660-\\u0669]+\\s?[-–—ـ])(?<content>[\\s\\S]*?))
 */
declare function expandTemplate(template: string, options?: ExpandOptions): string;

/**
 * Create a custom token map by extending the base tokens.
 *
 * @param customTokens - Custom token definitions
 * @returns Combined token map
 *
 * @example
 * const myTokens = createTokenMap({
 *   verse: '\\[[\\u0660-\\u0669]+\\]',
 *   tafsir: 'تفسير'
 * });
 */
declare function createTokenMap(customTokens: Record<string, string>): TokenMap;

/**
 * Validates a template string.
 *
 * @param template - Template to validate
 * @param tokens - Token map to validate against
 * @returns Validation result with errors if invalid
 *
 * @example
 * validateTemplate('{num} {dash}')
 * // Returns: { valid: true }
 *
 * validateTemplate('{invalid}')
 * // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }
 */
declare function validateTemplate(template: string, tokens?: TokenMap): ValidationResult;
//#endregion

//#region src/markers/type-generators.d.ts
/**
 * Generates a regular expression for pattern-type markers.
 *
 * Supports two modes:
 * 1. Template-based: Uses the `template` field with token expansion
 * 2. Pattern-based: Uses the raw `pattern` field as-is
 *
 * @param config - Marker configuration with either `template` or `pattern` field
 * @returns A compiled RegExp object for matching the pattern
 * @throws {Error} When neither `template` nor `pattern` is provided
 *
 * @example
 * // Using template
 * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
 *
 * @example
 * // Using raw pattern
 * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
 *
 * @example
 * // Using custom tokens
 * const regex = generatePatternRegex({
 *   type: 'pattern',
 *   template: '{verse}',
 *   tokens: { verse: '\\[[0-9]+\\]' }
 * });
 */
declare function generatePatternRegex(config: MarkerConfig): RegExp;

/**
 * Generates a regular expression for 'bab' (chapter) markers.
 *
 * Matches Arabic chapter markers like باب, بَابُ, بَابٌ with optional diacritics.
 * The pattern is diacritic-insensitive using bitaboom's makeDiacriticInsensitive.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBabRegex();
 * const match = regex.exec('باب الصلاة');
 * // match.groups.marker -> 'باب'
 * // match.groups.content -> ' الصلاة'
 */
declare function generateBabRegex(): RegExp;

/**
 * Generates a regular expression for hadith chain (isnad) markers.
 *
 * Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.
 * Uses default phrases from presets or custom phrases from config.
 * All phrases are made diacritic-insensitive.
 *
 * @param config - Marker configuration with optional `phrases` array
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * // Using default phrases
 * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
 * const match = regex.exec('حَدَّثَنَا أبو بكر');
 *
 * @example
 * // Using custom phrases
 * const regex = generateHadithChainRegex({
 *   type: 'hadith-chain',
 *   phrases: ['قَالَ', 'رَوَى']
 * });
 */
declare function generateHadithChainRegex(config: MarkerConfig): RegExp;

/**
 * Generates a regular expression for basmala markers.
 *
 * Matches various forms of بِسْمِ اللَّهِ (In the name of Allah):
 * - بسم الله (without diacritics)
 * - بِسْمِ اللَّهِ (with diacritics)
 * - Special patterns like [بسم, [تم
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBasmalaRegex();
 * const match = regex.exec('بسم الله الرحمن الرحيم');
 * // match.groups.marker -> 'بسم الله'
 */
declare function generateBasmalaRegex(): RegExp;

/**
 * Generates a regular expression for custom phrase markers.
 *
 * Similar to hadith-chain markers but requires explicit phrase list.
 * All phrases are made diacritic-insensitive.
 *
 * @param config - Marker configuration with required `phrases` array
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `phrases` is undefined or empty
 *
 * @example
 * const regex = generatePhraseRegex({
 *   type: 'phrase',
 *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
 * });
 */
declare function generatePhraseRegex(config: MarkerConfig): RegExp;

/**
 * Generates a regular expression for square bracket markers.
 *
 * Matches verse or hadith reference numbers in square brackets:
 * - [٦٥] - Simple bracket
 * - • [٦٥] - With bullet prefix
 * - ° [٦٥] - With degree prefix
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateSquareBracketRegex();
 * const match = regex.exec('[٦٥] نص الحديث');
 * // match.groups.content -> ' نص الحديث'
 */
declare function generateSquareBracketRegex(): RegExp;

/**
 * Generates a regular expression for number-letter-separator markers.
 *
 * Matches patterns like:
 * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
 * - 5 ب. (Latin number, Arabic letter, dot)
 *
 * @param config - Configuration carrying `numbering` and `separator`. Both are
 *   typed as optional (inherited from `MarkerConfig`); `generateRegexFromMarker`
 *   is documented as applying defaults before delegating, so supply both
 *   explicitly when calling this generator directly.
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateNumLetterRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ أ - نص');
 */
declare function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;

/**
 * Generates a regular expression for number-parenthetical-separator markers.
 *
 * Matches patterns like:
 * - ٥ (أ) - (number, parenthetical content, separator)
 * - 5 (٦) - (number with parenthetical number)
 *
 * @param config - Configuration carrying `numbering` and `separator`. Both are
 *   typed as optional (inherited from `MarkerConfig`); `generateRegexFromMarker`
 *   is documented as applying defaults before delegating, so supply both
 *   explicitly when calling this generator directly.
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateNumParenRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ (أ) - نص');
 */
declare function generateNumParenRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;

/**
 * Generates a regular expression for number-slash-number markers.
 *
 * Matches patterns like:
 * - ٥/٦ - (number slash number, separator)
 * - ٥ - (single number, separator)
 *
 * The second number after the slash is optional.
 *
 * @param config - Configuration carrying `numbering` and `separator`. Both are
 *   typed as optional (inherited from `MarkerConfig`); `generateRegexFromMarker`
 *   is documented as applying defaults before delegating, so supply both
 *   explicitly when calling this generator directly.
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateNumSlashRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match1 = regex.exec('٥/٦ - نص');
 * const match2 = regex.exec('٥ - نص'); // Also matches
 */
declare function generateNumSlashRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;

/**
 * Generates a regular expression for numbered markers with optional format template.
 *
 * Supports two modes:
 * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')
 * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
 *
 * When using default pattern:
 * - Separator 'none' generates pattern without separator
 * - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
 *
 * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * // Using format template
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash',
 *   format: '{bullet}+ {num} {dash}'
 * });
 *
 * @example
 * // Using default pattern
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ - نص');
 *
 * @example
 * // With 'none' separator
 * const regex = generateNumberedRegex({
 *   numbering: 'latin',
 *   separator: 'none'
 * });
 * const match = regex.exec('5 text');
 */
declare function generateNumberedRegex(config: Pick<MarkerConfig, 'numbering' | 'separator' | 'format' | 'tokens'>): RegExp;

/**
 * Generates a regular expression for bullet-point markers.
 *
 * Matches common bullet characters:
 * - • (bullet)
 * - * (asterisk)
 * - ° (degree)
 * - - (dash)
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBulletRegex();
 * const match = regex.exec('• نقطة');
 * // match.groups.content -> 'نقطة'
 */
declare function generateBulletRegex(): RegExp;

/**
 * Generates a regular expression for Markdown-style heading markers.
 *
 * Matches heading levels using hash symbols:
 * - # Heading 1
 * - ## Heading 2
 * - ### Heading 3
 * - etc.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateHeadingRegex();
 * const match = regex.exec('## عنوان فرعي');
 * // match.groups.marker -> '## '
 * // match.groups.content -> 'عنوان فرعي'
 */
declare function generateHeadingRegex(): RegExp;
//#endregion

export {
  DEFAULT_BASMALA_PATTERNS,
  DEFAULT_HADITH_PHRASES,
  DEFAULT_NUMBERING,
  DEFAULT_SEPARATOR,
  DEFAULT_SEPARATOR_PATTERN,
  type MarkerConfig,
  type MarkerType,
  NUMBERING_PATTERNS,
  type NumberingStyle,
  SEPARATOR_PATTERNS,
  type SeparatorStyle,
  TOKENS,
  createTokenMap,
  expandTemplate,
  generateBabRegex,
  generateBasmalaRegex,
  generateBulletRegex,
  generateHadithChainRegex,
  generateHeadingRegex,
  generateNumLetterRegex,
  generateNumParenRegex,
  generateNumSlashRegex,
  generateNumberedRegex,
  generatePatternRegex,
  generatePhraseRegex,
  generateRegexFromMarker,
  generateSquareBracketRegex,
  validateTemplate,
};
//# sourceMappingURL=index.d.mts.map