UNPKG

flappa-doormal

Version:

Arabic text marker pattern library for generating regex from declarative configurations

517 lines (510 loc) 17.5 kB
import { makeDiacriticInsensitive } from "bitaboom";

//#region src/markers/defaults.ts
/**
 * Numbering style used when a marker configuration does not specify one.
 */
const DEFAULT_NUMBERING = "arabic-indic";

/**
 * Separator style used when a marker configuration does not specify one.
 */
const DEFAULT_SEPARATOR = "dash";

/**
 * Regex source of the default (dash-like) separator.
 * Exported for consumers; it is not referenced by the generators in this module.
 */
const DEFAULT_SEPARATOR_PATTERN = "[-–—ـ]";

/**
 * Regex sources for each supported numbering style, keyed by style name.
 */
const NUMBERING_PATTERNS = {
  "arabic-indic": "[\\u0660-\\u0669]+",
  "latin": "\\d+"
};

/**
 * Regex sources for each named separator style, keyed by style name.
 * The "none" style maps to an empty pattern.
 */
const SEPARATOR_PATTERNS = {
  "colon": ":",
  "dash": "[-–—ـ]",
  "dot": "\\.",
  "none": "",
  "paren": "\\)"
};
//#endregion

//#region src/markers/presets.ts
/**
 * Default phrase lists for preset marker types.
 * Exported so that users can extend them.
 */

/**
 * Common hadith narrator phrases. Each phrase is passed through
 * makeDiacriticInsensitive before being used in a regex.
 * Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
 */
const DEFAULT_HADITH_PHRASES = [
  "حَدَّثَنَا",
  "حدثنا",
  "أَخْبَرَنَا",
  "حدثني",
  "حدَّثني",
  "وحدثنا",
  "حُدِّثت عن",
  "وحَدَّثَنَا"
];

/**
 * Common basmala patterns (the bracketed entries are already regex-escaped).
 * Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
 */
const DEFAULT_BASMALA_PATTERNS = [
  "بسم الله",
  "\\[بسم",
  "\\[تم"
];
//#endregion

//#region src/markers/tokens.ts
/**
 * Token definitions for pattern templates.
 * Tokens provide a readable alternative to raw regex patterns.
 */

/**
 * Standard tokens for building marker patterns.
 * Use these in templates like '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'.
 */
const TOKENS = {
  bullet: "[•*°]",
  colon: ":",
  comma: "،",
  content: "(.*)",
  dash: "[-–—ـ]",
  dot: "\\.",
  latin: "\\d+",
  letter: "[أ-ي]",
  num: "[\\u0660-\\u0669]+",
  paren: "\\)",
  s: "\\s?",
  slash: "/",
  space: "\\s+"
};
//#endregion

//#region src/markers/template-parser.ts
/**
 * Reports whether `key` is an own property of `obj`.
 * Used instead of a truthiness lookup so that inherited members such as
 * `constructor` are never mistaken for entries of a lookup table.
 */
function hasOwnKey(obj, key) {
  return Object.prototype.hasOwnProperty.call(obj, key);
}

/**
 * Wraps a marker regex source in the three named groups shared by every generator:
 * `full` (whole match), `marker` (the marker only) and `content` (everything after it).
 * The result is anchored at the start of input and the content group is greedy ([\s\S]*),
 * so it also spans newlines.
 */
function wrapMarkerPattern(markerPattern) {
  return String.raw`^(?<full>(?<marker>${markerPattern})(?<content>[\s\S]*))`;
}

/**
 * Compiles a marker regex source into a unicode-mode RegExp with the standard named groups.
 */
function compileMarkerRegex(markerPattern) {
  return new RegExp(wrapMarkerPattern(markerPattern), "u");
}

/**
 * Expands a template string into a regex pattern using named capture groups.
 * Always creates three groups: full (entire match), marker (just the marker), content (the rest).
 *
 * Every `{token}` placeholder that is a key of the token map is replaced by that token's
 * regex source; all other text in the template (including spaces) is kept literally.
 * The content group is `[\s\S]*` (greedy), so it matches across newlines to the end of input.
 *
 * @param template - Template string with {token} placeholders
 * @param options - Optional configuration; `options.tokens` overrides the built-in TOKENS map
 * @returns Regex pattern string with named groups
 *
 * @example
 * expandTemplate('{num} {dash}')
 * // Returns: ^(?<full>(?<marker>[\u0660-\u0669]+ [-–—ـ])(?<content>[\s\S]*))
 */
function expandTemplate(template, options) {
  const tokenMap = options?.tokens || TOKENS;
  let expandedMarker = template;
  for (const [token, pattern] of Object.entries(tokenMap)) {
    // A replacer function is used on purpose: with a string replacement, sequences such as
    // `$$`, `$&` or `$'` inside the token's regex source would be interpreted as
    // substitution patterns and silently corrupt the generated regex.
    expandedMarker = expandedMarker.replaceAll(`{${token}}`, () => pattern);
  }
  return wrapMarkerPattern(expandedMarker);
}

/**
 * Create a custom token map by extending the base tokens.
 * Custom entries win over built-in tokens with the same name.
 *
 * @param customTokens - Custom token definitions
 * @returns Combined token map
 *
 * @example
 * const myTokens = createTokenMap({
 *   verse: '\\[[\\u0660-\\u0669]+\\]',
 *   tafsir: 'تفسير'
 * });
 */
function createTokenMap(customTokens) {
  return {
    ...TOKENS,
    ...customTokens
  };
}

/**
 * Validates a template string.
 *
 * Every `{word}` placeholder in the template must be an own key of the token map
 * with a non-nullish value. A token mapped to an empty pattern is valid.
 *
 * @param template - Template to validate
 * @param tokens - Token map to validate against (defaults to TOKENS)
 * @returns Validation result with errors if invalid
 *
 * @example
 * validateTemplate('{num} {dash}')
 * // Returns: { valid: true }
 *
 * validateTemplate('{invalid}')
 * // Returns: { valid: false, errors: ['Unknown tokens: {invalid}', 'Available tokens: {bullet}, ...'] }
 */
function validateTemplate(template, tokens = TOKENS) {
  const placeholders = template.match(/\{(\w+)\}/g) || [];
  const unknownTokens = placeholders
    .map((placeholder) => placeholder.slice(1, -1))
    // Own-property check: a plain truthiness lookup would accept inherited names such as
    // {constructor} and reject tokens whose pattern is the empty string.
    .filter((name) => !hasOwnKey(tokens, name) || tokens[name] == null);
  if (unknownTokens.length > 0) {
    return {
      valid: false,
      errors: [
        `Unknown tokens: ${unknownTokens.map((t) => `{${t}}`).join(", ")}`,
        `Available tokens: ${Object.keys(tokens).map((t) => `{${t}}`).join(", ")}`
      ]
    };
  }
  return { valid: true };
}
//#endregion

//#region src/markers/type-generators.ts
/**
 * Resolves a numbering style name to its regex source.
 * A missing style falls back to DEFAULT_NUMBERING so that generators called directly
 * (without going through generateRegexFromMarker) never embed the text "undefined".
 *
 * @throws {Error} When the style is not a key of NUMBERING_PATTERNS
 */
function resolveNumberingPattern(numbering) {
  const style = numbering ?? DEFAULT_NUMBERING;
  if (!hasOwnKey(NUMBERING_PATTERNS, style)) {
    throw new Error(`Unknown numbering style: ${style}`);
  }
  return NUMBERING_PATTERNS[style];
}

/**
 * Resolves a separator to its regex source.
 * Named styles are looked up in SEPARATOR_PATTERNS; any other string is used as-is
 * (it is inserted into the regex unescaped). A missing separator falls back to DEFAULT_SEPARATOR.
 */
function resolveSeparatorPattern(separator) {
  const style = separator ?? DEFAULT_SEPARATOR;
  return hasOwnKey(SEPARATOR_PATTERNS, style) ? SEPARATOR_PATTERNS[style] : style;
}

/**
 * Generates a regular expression for pattern-type markers.
 *
 * Supports two modes:
 * 1. Template-based: Uses the `template` field with token expansion (named groups are added)
 * 2. Pattern-based: Uses the raw `pattern` field as-is (no groups are added)
 *
 * @param config - Marker configuration with either `template` or `pattern` field
 * @returns A compiled RegExp object for matching the pattern
 * @throws {Error} When neither `template` nor `pattern` is provided
 *
 * @example
 * // Using template
 * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
 *
 * @example
 * // Using raw pattern
 * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
 *
 * @example
 * // Using custom tokens
 * const regex = generatePatternRegex({
 *   type: 'pattern',
 *   template: '{verse}',
 *   tokens: { verse: '\\[[0-9]+\\]' }
 * });
 */
function generatePatternRegex(config) {
  if (config.template) {
    const tokenMap = config.tokens ? createTokenMap(config.tokens) : TOKENS;
    return new RegExp(expandTemplate(config.template, { tokens: tokenMap }), "u");
  }
  if (!config.pattern) {
    throw new Error("pattern marker must provide either a template or pattern");
  }
  return new RegExp(config.pattern, "u");
}

/**
 * Generates a regular expression for 'bab' (chapter) markers.
 *
 * Matches Arabic chapter markers like باب, بَابُ, بَابٌ. The base word is passed through
 * bitaboom's makeDiacriticInsensitive and may be followed by one trailing vowel/tanwin mark.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBabRegex();
 * const match = regex.exec('باب الصلاة');
 * // match.groups.marker -> 'باب'
 * // match.groups.content -> ' الصلاة'
 */
function generateBabRegex() {
  const babPattern = makeDiacriticInsensitive("باب");
  return compileMarkerRegex(String.raw`${babPattern}[ًٌٍَُ]?`);
}

/**
 * Generates a regular expression for hadith chain (isnad) markers.
 *
 * Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.
 * Uses custom phrases from config when a non-empty list is given, otherwise the
 * default phrases from presets. All phrases are passed through makeDiacriticInsensitive.
 *
 * @param config - Marker configuration with optional `phrases` array
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * // Using default phrases
 * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
 * const match = regex.exec('حَدَّثَنَا أبو بكر');
 *
 * @example
 * // Using custom phrases
 * const regex = generateHadithChainRegex({
 *   type: 'hadith-chain',
 *   phrases: ['قَالَ', 'رَوَى']
 * });
 */
function generateHadithChainRegex(config) {
  // An empty list would produce an empty alternation that matches every input with an
  // empty marker, so it is treated the same as "no custom phrases".
  const phrases = config.phrases?.length ? config.phrases : DEFAULT_HADITH_PHRASES;
  const phrasesPattern = phrases.map((p) => makeDiacriticInsensitive(p)).join("|");
  return compileMarkerRegex(phrasesPattern);
}

/**
 * Generates a regular expression for basmala markers.
 *
 * Built from DEFAULT_BASMALA_PATTERNS, each passed through makeDiacriticInsensitive:
 * - بسم الله (with or without diacritics)
 * - Special patterns like [بسم, [تم
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBasmalaRegex();
 * const match = regex.exec('بسم الله الرحمن الرحيم');
 * // match.groups.marker -> 'بسم الله'
 */
function generateBasmalaRegex() {
  const combinedPattern = DEFAULT_BASMALA_PATTERNS.map((p) => makeDiacriticInsensitive(p)).join("|");
  return compileMarkerRegex(combinedPattern);
}

/**
 * Generates a regular expression for custom phrase markers.
 *
 * Similar to hadith-chain markers but requires an explicit phrase list.
 * All phrases are passed through makeDiacriticInsensitive.
 *
 * @param config - Marker configuration with required `phrases` array
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `phrases` is undefined or empty
 *
 * @example
 * const regex = generatePhraseRegex({
 *   type: 'phrase',
 *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
 * });
 */
function generatePhraseRegex(config) {
  if (!config.phrases || config.phrases.length === 0) {
    throw new Error("phrase marker requires phrases array");
  }
  const phrasesPattern = config.phrases.map((p) => makeDiacriticInsensitive(p)).join("|");
  return compileMarkerRegex(phrasesPattern);
}

/**
 * Generates a regular expression for square bracket markers.
 *
 * Matches Arabic-Indic reference numbers in square brackets:
 * - [٦٥] - Simple bracket
 * - • [٦٥] - With bullet prefix
 * - ° [٦٥] - With degree prefix
 *
 * One optional whitespace character after the closing bracket belongs to the marker.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateSquareBracketRegex();
 * const match = regex.exec('[٦٥] نص الحديث');
 * // match.groups.marker -> '[٦٥] '
 * // match.groups.content -> 'نص الحديث'
 */
function generateSquareBracketRegex() {
  return compileMarkerRegex(String.raw`[•°]?\s?\[[\u0660-\u0669]+\]\s?`);
}

/**
 * Generates a regular expression for number-letter-separator markers.
 *
 * Matches patterns like:
 * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
 * - 5 ب. (Latin number, Arabic letter, dot)
 *
 * Exactly one space is required between the number and the letter.
 *
 * @param config - Configuration with `numbering` and `separator` fields
 *   (missing values fall back to the module defaults)
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `numbering` is not a known numbering style
 *
 * @example
 * const regex = generateNumLetterRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ أ - نص');
 */
function generateNumLetterRegex(config) {
  const numPattern = resolveNumberingPattern(config.numbering);
  const sepPattern = resolveSeparatorPattern(config.separator);
  return compileMarkerRegex(String.raw`${numPattern} [أ-ي]\s?${sepPattern}`);
}

/**
 * Generates a regular expression for number-parenthetical-separator markers.
 *
 * Matches patterns like:
 * - ٥ (أ) - (number, parenthetical content, separator)
 * - ٥ (٦) - (number with parenthetical number)
 *
 * The parenthetical content may contain characters from U+0600-U+06FF and whitespace.
 *
 * @param config - Configuration with `numbering` and `separator` fields
 *   (missing values fall back to the module defaults)
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `numbering` is not a known numbering style
 *
 * @example
 * const regex = generateNumParenRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ (أ) - نص');
 */
function generateNumParenRegex(config) {
  const numPattern = resolveNumberingPattern(config.numbering);
  const sepPattern = resolveSeparatorPattern(config.separator);
  return compileMarkerRegex(
    String.raw`${numPattern}\s*\([\u0600-\u06FF\u0660-\u0669\s]+\)\s?${sepPattern}`
  );
}

/**
 * Generates a regular expression for number-slash-number markers.
 *
 * Matches patterns like:
 * - ٥/٦ - (number slash number, separator)
 * - ٥ - (single number, separator)
 *
 * The second number after the slash is optional.
 *
 * @param config - Configuration with `numbering` and `separator` fields
 *   (missing values fall back to the module defaults)
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When `numbering` is not a known numbering style
 *
 * @example
 * const regex = generateNumSlashRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match1 = regex.exec('٥/٦ - نص');
 * const match2 = regex.exec('٥ - نص'); // Also matches
 */
function generateNumSlashRegex(config) {
  const numPattern = resolveNumberingPattern(config.numbering);
  const sepPattern = resolveSeparatorPattern(config.separator);
  return compileMarkerRegex(
    String.raw`${numPattern}(?:\s?/\s?${numPattern})?\s?${sepPattern}`
  );
}

/**
 * Generates a regular expression for numbered markers with optional format template.
 *
 * Supports two modes:
 * 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')
 * 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
 *
 * When using default pattern:
 * - Separator 'none' (or an empty string) generates a pattern without separator
 * - Other separator strings are looked up in SEPARATOR_PATTERNS or used as-is
 *
 * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 * @throws {Error} When no `format` is given and `numbering` is not a known numbering style
 *
 * @example
 * // Using format template
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash',
 *   format: '{bullet}+ {num} {dash}'
 * });
 *
 * @example
 * // Using default pattern
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ - نص');
 *
 * @example
 * // With 'none' separator
 * const regex = generateNumberedRegex({
 *   numbering: 'latin',
 *   separator: 'none'
 * });
 * const match = regex.exec('5 text');
 */
function generateNumberedRegex(config) {
  if (config.format) {
    const tokenMap = config.tokens ? createTokenMap(config.tokens) : TOKENS;
    return new RegExp(expandTemplate(config.format, { tokens: tokenMap }), "u");
  }
  const numPattern = resolveNumberingPattern(config.numbering);
  const sepPattern = resolveSeparatorPattern(config.separator);
  // An empty separator pattern ('none') means the marker is the bare number.
  const markerPattern = sepPattern ? String.raw`${numPattern}\s?${sepPattern}` : numPattern;
  return compileMarkerRegex(markerPattern);
}

/**
 * Generates a regular expression for bullet-point markers.
 *
 * Matches one bullet character followed by an optional whitespace character:
 * - • (bullet)
 * - * (asterisk)
 * - ° (degree)
 * - - (dash)
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBulletRegex();
 * const match = regex.exec('• نقطة');
 * // match.groups.content -> 'نقطة'
 */
function generateBulletRegex() {
  return compileMarkerRegex(String.raw`[•*°\-]\s?`);
}

/**
 * Generates a regular expression for Markdown-style heading markers.
 *
 * Matches one or more hash symbols followed by an optional whitespace character:
 * - # Heading 1
 * - ## Heading 2
 * - ### Heading 3
 * - etc.
 *
 * @returns A compiled RegExp with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateHeadingRegex();
 * const match = regex.exec('## عنوان فرعي');
 * // match.groups.marker -> '## '
 * // match.groups.content -> 'عنوان فرعي'
 */
function generateHeadingRegex() {
  return compileMarkerRegex(String.raw`#+\s?`);
}
//#endregion

//#region src/markers/generator.ts
/**
 * Generates a regex pattern from a marker configuration.
 * Except for raw `pattern` markers, the regex has three named capture groups:
 * - full: Complete match including marker
 * - marker: Just the marker part (for metadata/indexing)
 * - content: Everything after the marker
 *
 * This function applies default `numbering` and `separator` values before delegating
 * to the type-specific generators.
 *
 * @param config - Marker configuration
 * @returns Regular expression for the marker
 * @throws {Error} When `config.type` is not a known marker type
 *
 * @example
 * const regex = generateRegexFromMarker({ type: 'numbered' });
 * const match = regex.exec('٥ - نص');
 * match.groups.full    // "٥ - نص"
 * match.groups.marker  // "٥ -"
 * match.groups.content // " نص"
 */
function generateRegexFromMarker(config) {
  // Defaults are applied AFTER the spread: if the spread came last, an explicit
  // `numbering: undefined` / `separator: null` in config would overwrite the default.
  const normalized = {
    ...config,
    numbering: config.numbering ?? DEFAULT_NUMBERING,
    separator: config.separator ?? DEFAULT_SEPARATOR
  };
  switch (normalized.type) {
    case "pattern":
      return generatePatternRegex(normalized);
    case "bab":
      return generateBabRegex();
    case "hadith-chain":
      return generateHadithChainRegex(normalized);
    case "basmala":
      return generateBasmalaRegex();
    case "phrase":
      return generatePhraseRegex(normalized);
    case "square-bracket":
      return generateSquareBracketRegex();
    case "num-letter":
      return generateNumLetterRegex(normalized);
    case "num-paren":
      return generateNumParenRegex(normalized);
    case "num-slash":
      return generateNumSlashRegex(normalized);
    case "numbered":
      return generateNumberedRegex(normalized);
    case "bullet":
      return generateBulletRegex();
    case "heading":
      return generateHeadingRegex();
    default:
      throw new Error(`Unknown marker type: ${normalized.type}`);
  }
}
//#endregion

export {
  DEFAULT_BASMALA_PATTERNS,
  DEFAULT_HADITH_PHRASES,
  DEFAULT_NUMBERING,
  DEFAULT_SEPARATOR,
  DEFAULT_SEPARATOR_PATTERN,
  NUMBERING_PATTERNS,
  SEPARATOR_PATTERNS,
  TOKENS,
  createTokenMap,
  expandTemplate,
  generateBabRegex,
  generateBasmalaRegex,
  generateBulletRegex,
  generateHadithChainRegex,
  generateHeadingRegex,
  generateNumLetterRegex,
  generateNumParenRegex,
  generateNumSlashRegex,
  generateNumberedRegex,
  generatePatternRegex,
  generatePhraseRegex,
  generateRegexFromMarker,
  generateSquareBracketRegex,
  validateTemplate
};
//# sourceMappingURL=index.mjs.map