flappa-doormal
Version:
Arabic text marker pattern library for generating regex from declarative configurations
460 lines • 16 kB
text/typescript
//#region src/types.d.ts
/**
* Numbering styles for markers.
*
* This union is the key type of NUMBERING_PATTERNS and the type of
* MarkerConfig.numbering.
* NOTE(review): 'arabic-indic' presumably means the digits U+0660-U+0669 (compare
* TOKENS.num) and 'latin' ASCII digits (compare TOKENS.latin) - confirm against
* the values of NUMBERING_PATTERNS, which are not visible in these declarations.
*/
type NumberingStyle = 'arabic-indic' | 'latin';
/**
* Named separator styles for markers.
*
* This union is the key type of SEPARATOR_PATTERNS, which maps each name to a
* pattern string. MarkerConfig.separator accepts one of these names or an
* arbitrary string.
*/
type SeparatorStyle = 'dash' | 'dot' | 'paren' | 'colon' | 'none';
/**
* Marker types for text segmentation.
*
* This is the discriminating `type` field of MarkerConfig.
* NOTE(review): each value appears to pair with one of the generate*Regex
* functions declared in this file (e.g. 'bab' with generateBabRegex,
* 'num-slash' with generateNumSlashRegex); the pairing is inferred from the
* names only - confirm in the implementation of generateRegexFromMarker.
*/
type MarkerType = 'numbered' | 'bullet' | 'heading' | 'pattern' | 'bab' | 'hadith-chain' | 'basmala' | 'phrase' | 'square-bracket' | 'num-letter' | 'num-paren' | 'num-slash';
/**
* Configuration for a single marker pattern.
*
* Only `type` is required. Every other field is optional at the type level,
* including fields whose documentation says they apply to (or are required by)
* one specific marker type: this type does not enforce those per-type rules.
*/
type MarkerConfig = {
/** The type of marker to look for */
type: MarkerType;
/**
* For numbered markers, the digit style.
* NOTE(review): presumably falls back to DEFAULT_NUMBERING when omitted - confirm.
*/
numbering?: NumberingStyle;
/**
* The separator that follows the marker: a named SeparatorStyle or a custom string.
* NOTE(review): presumably falls back to DEFAULT_SEPARATOR when omitted, and how a
* custom string is interpreted (used as-is vs. replaced by
* DEFAULT_SEPARATOR_PATTERN) is described differently in different places in
* this file - confirm in the implementation.
*/
separator?: SeparatorStyle | string;
/**
* Template format for numbered markers using token syntax.
* Example: '{bullet}+ {num} {dash}'
* Only valid when type is 'numbered' (not enforced by this type).
*/
format?: string;
/**
* For 'pattern' type, provide a template using tokens like {num}, {dash}, {bullet}.
* For raw regex patterns that don't use templates, provide the raw pattern string here.
* Example: '{bullet}? {num}+ {s}{dash}' or '^[•*°]? ([\\u0660-\\u0669]+\\s?[-–—ـ].*)'
* NOTE(review): the `pattern` field is also documented as the place for raw regex
* strings. Which of the two fields should carry a raw regex cannot be determined
* from these declarations - confirm before relying on either.
*/
template?: string;
/**
* Alternative to template: raw regex pattern string (for 'pattern' type only).
* Use this for complex patterns that can't be expressed with templates.
* The pattern should have a capture group for the content.
* Example: '^CUSTOM: (.*)'
*/
pattern?: string;
/**
* Custom token map for advanced users.
* Extends the default TOKENS with additional definitions.
* Keys are token names, values are pattern strings.
*/
tokens?: Record<string, string>;
/**
* List of phrases for 'phrase' and 'hadith-chain' types.
* For 'hadith-chain', defaults to common narrator patterns if not provided.
* Typed as a mutable string[]; a readonly tuple such as DEFAULT_HADITH_PHRASES
* must be spread into a new array to be assignable here.
*/
phrases?: string[];
/**
* Optional: Only apply this marker after a specific page number.
* Useful for books with different formatting in front matter vs main content.
* NOTE(review): no function declared in this file accepts a page number, so this
* is presumably evaluated by the consumer of the config; whether the bound is
* inclusive is not stated - confirm.
*/
minPage?: number;
/**
* Optional: Arbitrary metadata to attach to entries matched by this marker.
* This allows for agnostic handling of entry properties.
* Example: { type: 0, category: 'hadith' }
* Values are typed `any`, so they are not type-checked.
*/
metadata?: Record<string, any>;
};
//#endregion
//#region src/markers/defaults.d.ts
/**
* Default numbering style for markers.
*
* Declared as the general NumberingStyle union, so the concrete default value is
* not visible from this declaration.
*/
declare const DEFAULT_NUMBERING: NumberingStyle;
/**
* Default separator style for markers.
*
* Declared as the general SeparatorStyle union, so the concrete default value is
* not visible from this declaration.
*/
declare const DEFAULT_SEPARATOR: SeparatorStyle;
/**
* Default separator pattern (used when separator is a custom string).
*
* A regex character class matching exactly one of: hyphen-minus (-),
* en dash (U+2013), em dash (U+2014) or Arabic tatweel (U+0640).
*/
declare const DEFAULT_SEPARATOR_PATTERN = "[-\u2013\u2014\u0640]";
/**
* Numbering patterns mapped by style.
*
* Has one string entry for every NumberingStyle ('arabic-indic' and 'latin').
* The pattern strings themselves are not visible in this declaration.
*/
declare const NUMBERING_PATTERNS: Record<NumberingStyle, string>;
/**
* Separator patterns mapped by style.
*
* Has one string entry for every SeparatorStyle, including 'none'.
* The pattern strings themselves are not visible in this declaration.
*/
declare const SEPARATOR_PATTERNS: Record<SeparatorStyle, string>;
//#endregion
//#region src/markers/generator.d.ts
/**
* Generates a regex pattern from a marker configuration.
* Always returns a regex with three named capture groups:
* - full: Complete match including marker
* - marker: Just the marker part (for metadata/indexing)
* - content: Clean content without marker (for LLM processing)
*
* This function applies all default values before delegating to type-specific generators.
*
* @param config - Marker configuration; only `type` is required by the type
* @returns Regular expression with named groups
*
* @example
* const regex = generateRegexFromMarker({ type: 'numbered' });
* const match = regex.exec('٥ - نص');
* // exec() returns null when there is no match, hence the optional chaining.
* match?.groups?.full // "٥ - نص"
* match?.groups?.marker // "٥ -"
* match?.groups?.content // "نص"
*/
declare function generateRegexFromMarker(config: MarkerConfig): RegExp;
//#endregion
//#region src/markers/presets.d.ts
/**
* Default phrase lists for preset marker types.
* Export these so users can extend them.
*/
/**
* Common hadith narrator phrases (diacritic-insensitive)
* Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
*
* Declared as a readonly tuple of eight literals. It is not directly assignable
* to MarkerConfig.phrases (a mutable string[]); spread it into a new array as
* shown above.
*/
declare const DEFAULT_HADITH_PHRASES: readonly ["حَدَّثَنَا", "حدثنا", "أَخْبَرَنَا", "حدثني", "حدَّثني", "وحدثنا", "حُدِّثت عن", "وحَدَّثَنَا"];
/**
* Common basmala patterns
* Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
*
* Declared as a readonly tuple of three literals. The second and third entries
* begin with a backslash-escaped '[', i.e. they are regex fragments rather than
* plain text, so custom additions presumably need regex escaping too - confirm
* how the generator consumes them.
*/
declare const DEFAULT_BASMALA_PATTERNS: readonly ["بسم الله", "\\[بسم", "\\[تم"];
//#endregion
//#region src/markers/tokens.d.ts
/**
* Token definitions for pattern templates.
* Tokens provide a readable alternative to raw regex patterns.
*/
/**
* Standard tokens for building marker patterns.
* Use these in templates like: '{num} {dash}' instead of '[\\u0660-\\u0669]+ [-–—ـ]'
*
* Each property name is the token name used inside braces in a template; each
* value is the regex source fragment for that token.
*/
declare const TOKENS: {
/** One bullet character: bullet, asterisk or degree sign. */
readonly bullet: "[•*°]";
/** A literal colon. */
readonly colon: ":";
/** The Arabic comma. */
readonly comma: "،";
/** A capturing group matching any run of characters except line terminators. */
readonly content: "(.*)";
/** One dash-like character: hyphen-minus, en dash, em dash or tatweel. */
readonly dash: "[-–—ـ]";
/** A literal dot (escaped). */
readonly dot: "\\.";
/** One or more digits as matched by the regex \d class. */
readonly latin: "\\d+";
/** One character in the code-point range from أ to ي. */
readonly letter: "[أ-ي]";
/** One or more Arabic-Indic digits (U+0660-U+0669). */
readonly num: "[\\u0660-\\u0669]+";
/** A literal closing parenthesis (escaped). */
readonly paren: "\\)";
/** An optional single whitespace character. */
readonly s: "\\s?";
/** A literal forward slash. */
readonly slash: "/";
/** One or more whitespace characters. */
readonly space: "\\s+";
};
/**
* A map from token name to pattern string.
* Used for ExpandOptions.tokens, the return value of createTokenMap and the
* `tokens` parameter of validateTemplate.
*/
type TokenMap = Record<string, string>;
//#endregion
//#region src/markers/template-parser.d.ts
/**
* Result of template validation, as returned by validateTemplate.
*/
interface ValidationResult {
/** Whether the template passed validation. */
valid: boolean;
/**
* Validation error messages. Optional: the validateTemplate examples show it
* omitted for a valid template and populated (e.g. 'Unknown token: {invalid}')
* for an invalid one.
*/
errors?: string[];
}
/**
* Options for template expansion, accepted as the optional second argument of
* expandTemplate.
*/
interface ExpandOptions {
/** Custom token map to use instead of default TOKENS */
tokens?: TokenMap;
}
/**
* Expands a template string into a regex pattern using named capture groups.
* Always creates three groups: full (entire match), marker (just the marker), content (clean text).
*
* The content group uses [\s\S]*? (non-greedy) to match across newlines but stop at next marker.
*
* Returns the pattern as a string, not a compiled RegExp; pass it to
* `new RegExp(...)` to match with it.
*
* @param template - Template string with {token} placeholders
* @param options - Optional configuration; `options.tokens` replaces the default TOKENS
* @returns Regex pattern string with named groups
*
* @example
* expandTemplate('{num} {dash}')
* // Returns: ^(?<full>(?<marker>[\\u0660-\\u0669]+\\s?[-–—ـ])(?<content>[\\s\\S]*?))
* // NOTE(review): in this example the literal space of the template appears as
* // \\s? in the result - confirm that spaces are rewritten this way.
*/
declare function expandTemplate(template: string, options?: ExpandOptions): string;
/**
* Create a custom token map by extending the base tokens.
*
* @param customTokens - Custom token definitions (token name to pattern string)
* @returns Combined token map
*
* NOTE(review): which definition wins when a custom token reuses a base token
* name is not visible from this declaration - presumably the custom one; confirm.
*
* @example
* const myTokens = createTokenMap({
* verse: '\\[[\\u0660-\\u0669]+\\]',
* tafsir: 'تفسير'
* });
*/
declare function createTokenMap(customTokens: Record<string, string>): TokenMap;
/**
* Validates a template string.
*
* @param template - Template to validate
* @param tokens - Token map to validate against. Optional; presumably defaults
* to TOKENS when omitted (the first example below validates {num} and {dash}
* without passing a map) - confirm.
* @returns Validation result with errors if invalid
*
* @example
* validateTemplate('{num} {dash}')
* // Returns: { valid: true }
*
* validateTemplate('{invalid}')
* // Returns: { valid: false, errors: ['Unknown token: {invalid}'] }
*/
declare function validateTemplate(template: string, tokens?: TokenMap): ValidationResult;
//#endregion
//#region src/markers/type-generators.d.ts
/**
* Generates a regular expression for pattern-type markers.
*
* Supports two modes:
* 1. Template-based: Uses the `template` field with token expansion
* 2. Pattern-based: Uses the raw `pattern` field as-is
*
* NOTE(review): which mode wins when both `template` and `pattern` are set is
* not stated here - confirm before supplying both.
*
* @param config - Marker configuration with either `template` or `pattern` field
* @returns A compiled RegExp object for matching the pattern
* @throws {Error} When neither `template` nor `pattern` is provided
*
* @example
* // Using template
* const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
*
* @example
* // Using raw pattern
* // NOTE(review): this pattern has no capture group, although the documentation
* // of MarkerConfig.pattern asks for one for the content - confirm whether a
* // group is actually required.
* const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
*
* @example
* // Using custom tokens
* const regex = generatePatternRegex({
* type: 'pattern',
* template: '{verse}',
* tokens: { verse: '\\[[0-9]+\\]' }
* });
*/
declare function generatePatternRegex(config: MarkerConfig): RegExp;
/**
* Generates a regular expression for 'bab' (chapter) markers.
*
* Matches Arabic chapter markers like باب, بَابُ, بَابٌ with optional diacritics.
* The pattern is diacritic-insensitive using bitaboom's makeDiacriticInsensitive.
*
* Takes no arguments, so the marker word cannot be configured through this function.
*
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateBabRegex();
* const match = regex.exec('باب الصلاة');
* // exec() returns null when there is no match, hence the optional chaining.
* // match?.groups?.marker -> 'باب'
* // match?.groups?.content -> ' الصلاة'
*/
declare function generateBabRegex(): RegExp;
/**
* Generates a regular expression for hadith chain (isnad) markers.
*
* Matches common hadith narrator phrases like حَدَّثَنَا, أَخْبَرَنَا, etc.
* Uses default phrases from presets or custom phrases from config.
* All phrases are made diacritic-insensitive.
*
* @param config - Marker configuration with optional `phrases` array. The
* parameter is a full MarkerConfig, so `type` must be supplied as well.
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* // Using default phrases
* const regex = generateHadithChainRegex({ type: 'hadith-chain' });
* const match = regex.exec('حَدَّثَنَا أبو بكر');
*
* @example
* // Using custom phrases
* const regex = generateHadithChainRegex({
* type: 'hadith-chain',
* phrases: ['قَالَ', 'رَوَى']
* });
*/
declare function generateHadithChainRegex(config: MarkerConfig): RegExp;
/**
* Generates a regular expression for basmala markers.
*
* Matches various forms of بِسْمِ اللَّهِ (In the name of Allah):
* - بسم الله (without diacritics)
* - بِسْمِ اللَّهِ (with diacritics)
* - Special patterns like [بسم, [تم
*
* Takes no arguments, so custom patterns cannot be passed to this function.
* NOTE(review): presumably built from DEFAULT_BASMALA_PATTERNS - confirm.
*
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateBasmalaRegex();
* const match = regex.exec('بسم الله الرحمن الرحيم');
* // exec() returns null when there is no match, hence the optional chaining.
* // match?.groups?.marker -> 'بسم الله'
*/
declare function generateBasmalaRegex(): RegExp;
/**
* Generates a regular expression for custom phrase markers.
*
* Similar to hadith-chain markers but requires explicit phrase list.
* All phrases are made diacritic-insensitive.
*
* @param config - Marker configuration with required `phrases` array. The type
* itself leaves `phrases` optional; the requirement is enforced at runtime by
* the documented throw below.
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
* @throws {Error} When `phrases` is undefined or empty
*
* @example
* const regex = generatePhraseRegex({
* type: 'phrase',
* phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
* });
*/
declare function generatePhraseRegex(config: MarkerConfig): RegExp;
/**
* Generates a regular expression for square bracket markers.
*
* Matches verse or hadith reference numbers in square brackets:
* - [٦٥] - Simple bracket
* - • [٦٥] - With bullet prefix
* - ° [٦٥] - With degree prefix
*
* Takes no arguments, so the digit style cannot be configured through this function.
*
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateSquareBracketRegex();
* const match = regex.exec('[٦٥] نص الحديث');
* // exec() returns null when there is no match, hence the optional chaining.
* // match?.groups?.content -> ' نص الحديث'
*/
declare function generateSquareBracketRegex(): RegExp;
/**
* Generates a regular expression for number-letter-separator markers.
*
* Matches patterns like:
* - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
* - 5 ب. (Latin number, Arabic letter, dot)
*
* @param config - Configuration with required `numbering` and `separator` fields.
* NOTE(review): the parameter type is a Pick of MarkerConfig, which keeps both
* fields optional, so the compiler accepts a config without them; the behavior
* in that case is not visible from this declaration - confirm.
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateNumLetterRegex({
* numbering: 'arabic-indic',
* separator: 'dash'
* });
* const match = regex.exec('٥ أ - نص');
*/
declare function generateNumLetterRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
/**
* Generates a regular expression for number-parenthetical-separator markers.
*
* Matches patterns like:
* - ٥ (أ) - (number, parenthetical content, separator)
* - 5 (٦) - (number with parenthetical number)
*
* @param config - Configuration with required `numbering` and `separator` fields.
* NOTE(review): the parameter type is a Pick of MarkerConfig, which keeps both
* fields optional, so the compiler accepts a config without them; the behavior
* in that case is not visible from this declaration - confirm.
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateNumParenRegex({
* numbering: 'arabic-indic',
* separator: 'dash'
* });
* const match = regex.exec('٥ (أ) - نص');
*/
declare function generateNumParenRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
/**
* Generates a regular expression for number-slash-number markers.
*
* Matches patterns like:
* - ٥/٦ - (number slash number, separator)
* - ٥ - (single number, separator)
*
* The second number after the slash is optional.
*
* @param config - Configuration with required `numbering` and `separator` fields.
* NOTE(review): the parameter type is a Pick of MarkerConfig, which keeps both
* fields optional, so the compiler accepts a config without them; the behavior
* in that case is not visible from this declaration - confirm.
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateNumSlashRegex({
* numbering: 'arabic-indic',
* separator: 'dash'
* });
* const match1 = regex.exec('٥/٦ - نص');
* const match2 = regex.exec('٥ - نص'); // Also matches
*/
declare function generateNumSlashRegex(config: Pick<MarkerConfig, 'numbering' | 'separator'>): RegExp;
/**
* Generates a regular expression for numbered markers with optional format template.
*
* Supports two modes:
* 1. Format template: Uses `format` field with token expansion (e.g., '{bullet}+ {num} {dash}')
* 2. Default pattern: Uses `numbering` and `separator` to build standard numbered markers
*
* When using default pattern:
* - Separator 'none' generates pattern without separator
* - Custom separator strings are used as-is or looked up in SEPARATOR_PATTERNS
*
* @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`.
* All four fields are optional in the parameter type (a Pick of MarkerConfig).
* NOTE(review): what happens when `numbering` or `separator` is omitted is not
* visible from this declaration - confirm.
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* // Using format template
* const regex = generateNumberedRegex({
* numbering: 'arabic-indic',
* separator: 'dash',
* format: '{bullet}+ {num} {dash}'
* });
*
* @example
* // Using default pattern
* const regex = generateNumberedRegex({
* numbering: 'arabic-indic',
* separator: 'dash'
* });
* const match = regex.exec('٥ - نص');
*
* @example
* // With 'none' separator
* const regex = generateNumberedRegex({
* numbering: 'latin',
* separator: 'none'
* });
* const match = regex.exec('5 text');
*/
declare function generateNumberedRegex(config: Pick<MarkerConfig, 'numbering' | 'separator' | 'format' | 'tokens'>): RegExp;
/**
* Generates a regular expression for bullet-point markers.
*
* Matches common bullet characters:
* - • (bullet)
* - * (asterisk)
* - ° (degree)
* - - (dash)
*
* NOTE(review): the `bullet` entry of TOKENS declared in this file contains only
* the first three characters and no dash, so this function either uses a wider
* character class than that token or the dash entry above is inaccurate - confirm.
*
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateBulletRegex();
* const match = regex.exec('• نقطة');
* // exec() returns null when there is no match, hence the optional chaining.
* // match?.groups?.content -> 'نقطة'
*/
declare function generateBulletRegex(): RegExp;
/**
* Generates a regular expression for Markdown-style heading markers.
*
* Matches heading levels using hash symbols:
* - # Heading 1
* - ## Heading 2
* - ### Heading 3
* - etc.
*
* @returns A compiled RegExp with named groups: `full`, `marker`, `content`
*
* @example
* const regex = generateHeadingRegex();
* const match = regex.exec('## عنوان فرعي');
* // exec() returns null when there is no match, hence the optional chaining.
* // In this example the marker group includes the space after the hashes.
* // match?.groups?.marker -> '## '
* // match?.groups?.content -> 'عنوان فرعي'
*/
declare function generateHeadingRegex(): RegExp;
//#endregion
// Public API of this entry point. MarkerConfig, MarkerType, NumberingStyle and
// SeparatorStyle are exported as types only. TokenMap, ValidationResult and
// ExpandOptions appear in exported function signatures but are not themselves
// exported, so consumers cannot import them by name.
export { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES, DEFAULT_NUMBERING, DEFAULT_SEPARATOR, DEFAULT_SEPARATOR_PATTERN, type MarkerConfig, type MarkerType, NUMBERING_PATTERNS, type NumberingStyle, SEPARATOR_PATTERNS, type SeparatorStyle, TOKENS, createTokenMap, expandTemplate, generateBabRegex, generateBasmalaRegex, generateBulletRegex, generateHadithChainRegex, generateHeadingRegex, generateNumLetterRegex, generateNumParenRegex, generateNumSlashRegex, generateNumberedRegex, generatePatternRegex, generatePhraseRegex, generateRegexFromMarker, generateSquareBracketRegex, validateTemplate };
//# sourceMappingURL=index.d.mts.map