/*
 * flappa-doormal
 * Version: (not specified in this listing)
 * Arabic text marker pattern library for generating regex from declarative configurations.
 * 517 lines (510 loc) • 17.5 kB
 * JavaScript
 */
import { makeDiacriticInsensitive } from "bitaboom";
//#region src/markers/defaults.ts
/**
 * Numbering style used when a marker configuration does not name one.
 * The value is a key of NUMBERING_PATTERNS.
 */
const DEFAULT_NUMBERING = 'arabic-indic';

/**
 * Separator style used when a marker configuration does not name one.
 * The value is a key of SEPARATOR_PATTERNS.
 */
const DEFAULT_SEPARATOR = 'dash';

/**
 * Default separator pattern: a character class of dash-like characters.
 * It is also the pattern registered for the "dash" separator style below.
 */
const DEFAULT_SEPARATOR_PATTERN = '[-–—ـ]';

/**
 * Regex source for the digits of a marker, keyed by numbering style.
 */
const NUMBERING_PATTERNS = {
  'arabic-indic': String.raw`[\u0660-\u0669]+`,
  latin: String.raw`\d+`,
};

/**
 * Regex source for the separator that follows a marker, keyed by separator style.
 * "none" maps to the empty string, i.e. no separator at all.
 */
const SEPARATOR_PATTERNS = {
  colon: ':',
  dash: DEFAULT_SEPARATOR_PATTERN,
  dot: String.raw`\.`,
  none: '',
  paren: String.raw`\)`,
};
//#endregion
//#region src/markers/presets.ts
/**
* Default phrase lists for preset marker types.
* They are exported so callers can extend them.
*/
/**
* Common hadith narrator phrases.
* generateHadithChainRegex uses this list when its config supplies no `phrases`,
* and passes every entry through makeDiacriticInsensitive before joining the
* results into a single alternation.
* Users can extend: [...DEFAULT_HADITH_PHRASES, 'أَخْبَرَنِي']
*/
const DEFAULT_HADITH_PHRASES = [
"حَدَّثَنَا",
"حدثنا",
"أَخْبَرَنَا",
"حدثني",
"حدَّثني",
"وحدثنا",
"حُدِّثت عن",
"وحَدَّثَنَا"
];
/**
* Common basmala patterns, read by generateBasmalaRegex.
* The last two entries begin with an escaped opening square bracket
* (written `\\[` in the string literal).
* Users can extend: [...DEFAULT_BASMALA_PATTERNS, 'customPattern']
* NOTE(review): generateBasmalaRegex in this file takes no config and always
* reads this list directly, so an extended copy has to be consumed elsewhere.
*/
const DEFAULT_BASMALA_PATTERNS = [
"بسم الله",
"\\[بسم",
"\\[تم"
];
//#endregion
//#region src/markers/tokens.ts
/**
 * Token definitions for pattern templates.
 * A token is a readable name for a fragment of regex source.
 */
/**
 * Standard tokens for building marker patterns.
 * Write '{num} {dash}' in a template instead of '[\\u0660-\\u0669]+ [-–—ـ]'.
 *
 * The key order is significant to callers that enumerate the map
 * (expansion order and the "Available tokens" listing), so keep it stable.
 */
const TOKENS = {
  bullet: '[•*°]',
  colon: ':',
  comma: '،',
  content: '(.*)',
  dash: '[-–—ـ]',
  dot: String.raw`\.`,
  latin: String.raw`\d+`,
  letter: '[أ-ي]',
  num: String.raw`[\u0660-\u0669]+`,
  paren: String.raw`\)`,
  s: String.raw`\s?`,
  slash: '/',
  space: String.raw`\s+`,
};
//#endregion
//#region src/markers/template-parser.ts
/**
 * Expands a template string into an anchored regex pattern with named capture groups.
 *
 * Every `{token}` placeholder whose name is a key of the token map is replaced by
 * that token's pattern. All other text is copied verbatim: literal spaces stay
 * literal spaces, and placeholders with no matching token are left in place.
 *
 * The result always has three named groups:
 * - full: the entire match
 * - marker: the expanded template
 * - content: everything after the marker, matched greedily with [\s\S]* so it
 *   also spans newlines
 *
 * @param template - Template string with {token} placeholders
 * @param options - Optional configuration; `options.tokens` replaces the default TOKENS map
 * @returns Regex pattern string with named groups
 *
 * @example
 * expandTemplate('{num} {dash}')
 * // Returns: ^(?<full>(?<marker>[\u0660-\u0669]+ [-–—ـ])(?<content>[\s\S]*))
 */
function expandTemplate(template, options) {
  const tokenMap = options?.tokens || TOKENS;
  let expandedMarker = template;
  for (const [token, pattern] of Object.entries(tokenMap)) {
    // Use a replacer function: with a plain string replacement, sequences such as
    // `$&`, `$$` or `$'` inside a token's regex would be interpreted as
    // substitution patterns and corrupt the expanded marker.
    expandedMarker = expandedMarker.replaceAll(`{${token}}`, () => pattern);
  }
  return String.raw`^(?<full>(?<marker>${expandedMarker})(?<content>[\s\S]*))`;
}
/**
 * Builds a token map made of the base TOKENS plus caller-supplied tokens.
 * A custom token with the same name as a base token replaces it; the base
 * TOKENS object itself is not modified.
 *
 * @param customTokens - Custom token definitions (name -> regex source)
 * @returns A new object containing the combined token map
 *
 * @example
 * const myTokens = createTokenMap({
 *   verse: '\\[[\\u0660-\\u0669]+\\]',
 *   tafsir: 'تفسير'
 * });
 */
function createTokenMap(customTokens) {
  // Later spreads win, so custom definitions override base ones.
  const combined = { ...TOKENS, ...customTokens };
  return combined;
}
/**
 * Validates a template string against a token map.
 *
 * Every `{name}` placeholder (name made of word characters) must be an own
 * property of `tokens`. Own-property lookup matches how expandTemplate
 * enumerates tokens: inherited names such as `{constructor}` are rejected,
 * and a token deliberately defined as an empty string is accepted.
 *
 * @param template - Template to validate
 * @param tokens - Token map to validate against (defaults to TOKENS)
 * @returns `{ valid: true }`, or `{ valid: false, errors }` where `errors`
 * holds two messages: the unknown placeholders and the available tokens
 *
 * @example
 * validateTemplate('{num} {dash}')
 * // Returns: { valid: true }
 *
 * validateTemplate('{invalid}', { num: '\\d+' })
 * // Returns: { valid: false, errors: ['Unknown tokens: {invalid}', 'Available tokens: {num}'] }
 */
function validateTemplate(template, tokens = TOKENS) {
  const placeholders = template.match(/\{(\w+)\}/g) ?? [];
  const unknownTokens = placeholders
    .map((placeholder) => placeholder.slice(1, -1))
    .filter((name) => !Object.prototype.hasOwnProperty.call(tokens, name));
  if (unknownTokens.length === 0) return { valid: true };
  const asPlaceholders = (names) => names.map((name) => `{${name}}`).join(", ");
  return {
    valid: false,
    errors: [
      `Unknown tokens: ${asPlaceholders(unknownTokens)}`,
      `Available tokens: ${asPlaceholders(Object.keys(tokens))}`
    ]
  };
}
//#endregion
//#region src/markers/type-generators.ts
/**
 * Builds the regular expression for a pattern-type marker.
 *
 * Two modes, checked in this order:
 * 1. Template-based: `config.template` is expanded with expandTemplate, using
 *    TOKENS or, when `config.tokens` is given, TOKENS extended by those tokens.
 * 2. Pattern-based: `config.pattern` is compiled exactly as written.
 *
 * Both modes compile with the `u` flag.
 *
 * @param config - Marker configuration with either a `template` or a `pattern` field
 * @returns A compiled RegExp object for matching the pattern
 * @throws {Error} When neither `template` nor `pattern` is provided
 *
 * @example
 * // Using template
 * const regex = generatePatternRegex({ type: 'pattern', template: '{num} {dash}' });
 *
 * @example
 * // Using raw pattern
 * const regex = generatePatternRegex({ type: 'pattern', pattern: '^\\d+' });
 *
 * @example
 * // Using custom tokens
 * const regex = generatePatternRegex({
 *   type: 'pattern',
 *   template: '{verse}',
 *   tokens: { verse: '\\[[0-9]+\\]' }
 * });
 */
function generatePatternRegex(config) {
  if (config.template) {
    const availableTokens = config.tokens ? createTokenMap(config.tokens) : TOKENS;
    const source = expandTemplate(config.template, { tokens: availableTokens });
    return new RegExp(source, "u");
  }
  if (config.pattern) {
    return new RegExp(config.pattern, "u");
  }
  throw new Error("pattern marker must provide either a template or pattern");
}
/**
 * Builds the regular expression for 'bab' (chapter) markers.
 *
 * The marker is the word باب run through bitaboom's makeDiacriticInsensitive,
 * optionally followed by one trailing vowel/tanwin mark from the character
 * class in the pattern. Everything after the marker is captured as content.
 *
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBabRegex();
 * const match = regex.exec('باب الصلاة');
 * // match.groups.marker -> 'باب'
 * // match.groups.content -> ' الصلاة'
 */
function generateBabRegex() {
  const stem = makeDiacriticInsensitive("باب");
  const source = [
    "^(?<full>",
    `(?<marker>${stem}[ًٌٍَُ]?)`,
    String.raw`(?<content>[\s\S]*)`,
    ")"
  ].join("");
  return new RegExp(source, "u");
}
/**
 * Generates a regular expression for hadith chain (isnad) markers.
 *
 * Matches narrator phrases such as حَدَّثَنَا or أَخْبَرَنَا at the start of the input.
 * Uses `config.phrases` when it is a non-empty array, otherwise the default
 * phrases from presets. An empty array falls back to the defaults because an
 * empty alternation would match every input with an empty marker.
 * Each phrase is passed through makeDiacriticInsensitive before the phrases
 * are joined into one alternation.
 *
 * @param config - Marker configuration with optional `phrases` array
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * // Using default phrases
 * const regex = generateHadithChainRegex({ type: 'hadith-chain' });
 * const match = regex.exec('حَدَّثَنَا أبو بكر');
 *
 * @example
 * // Using custom phrases
 * const regex = generateHadithChainRegex({
 *   type: 'hadith-chain',
 *   phrases: ['قَالَ', 'رَوَى']
 * });
 */
function generateHadithChainRegex(config) {
  const phrases = config.phrases?.length ? config.phrases : DEFAULT_HADITH_PHRASES;
  const phrasesPattern = phrases.map((p) => makeDiacriticInsensitive(p)).join("|");
  const pattern = String.raw`^(?<full>(?<marker>${phrasesPattern})(?<content>[\s\S]*))`;
  return new RegExp(pattern, "u");
}
/**
 * Builds the regular expression for basmala markers.
 *
 * Every entry of DEFAULT_BASMALA_PATTERNS is passed through
 * makeDiacriticInsensitive and the results are joined into one alternation
 * that forms the marker. The defaults cover بسم الله plus the bracketed
 * openings [بسم and [تم.
 *
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBasmalaRegex();
 * const match = regex.exec('بسم الله الرحمن الرحيم');
 * // match.groups.marker -> 'بسم الله'
 */
function generateBasmalaRegex() {
  const alternatives = DEFAULT_BASMALA_PATTERNS.map((entry) => makeDiacriticInsensitive(entry));
  const markerSource = alternatives.join("|");
  return new RegExp(`^(?<full>(?<marker>${markerSource})(?<content>[\\s\\S]*))`, "u");
}
/**
 * Builds the regular expression for custom phrase markers.
 *
 * Works like the hadith-chain generator but has no default list: the caller
 * must supply the phrases. Each phrase is passed through
 * makeDiacriticInsensitive and the results are joined into one alternation.
 *
 * @param config - Marker configuration with required `phrases` array
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 * @throws {Error} When `phrases` is missing or empty
 *
 * @example
 * const regex = generatePhraseRegex({
 *   type: 'phrase',
 *   phrases: ['فَائِدَةٌ', 'مَسْأَلَةٌ']
 * });
 */
function generatePhraseRegex(config) {
  const hasPhrases = Boolean(config.phrases) && config.phrases.length !== 0;
  if (!hasPhrases) {
    throw new Error("phrase marker requires phrases array");
  }
  const alternatives = config.phrases.map((phrase) => makeDiacriticInsensitive(phrase));
  const markerSource = alternatives.join("|");
  return new RegExp(`^(?<full>(?<marker>${markerSource})(?<content>[\\s\\S]*))`, "u");
}
/**
 * Builds the regular expression for square bracket markers.
 *
 * Matches an Arabic-Indic number in square brackets at the start of the input,
 * optionally preceded by a bullet (•) or degree sign (°):
 * - [٦٥] - Simple bracket
 * - • [٦٥] - With bullet prefix
 * - ° [٦٥] - With degree prefix
 *
 * One whitespace character after the closing bracket, if present, belongs to
 * the marker, so the content starts at the text itself.
 *
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateSquareBracketRegex();
 * const match = regex.exec('[٦٥] نص الحديث');
 * // match.groups.marker -> '[٦٥] '
 * // match.groups.content -> 'نص الحديث'
 */
function generateSquareBracketRegex() {
  const source =
    "^(?<full>" +
    "(?<marker>[•°]?\\s?\\[[\\u0660-\\u0669]+\\]\\s?)" +
    "(?<content>[\\s\\S]*)" +
    ")";
  return new RegExp(source, "u");
}
/**
 * Builds the regular expression for number-letter-separator markers.
 *
 * The marker is: a number in the configured numbering style, exactly one
 * space, one Arabic letter, an optional whitespace character, then the
 * separator. Examples:
 * - ٥ أ - (Arabic-Indic number, Arabic letter, dash)
 * - 5 ب. (Latin number, Arabic letter, dot)
 *
 * A `separator` that is not a key of SEPARATOR_PATTERNS is inserted into the
 * pattern as written.
 *
 * @param config - Configuration with required `numbering` and `separator` fields
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateNumLetterRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ أ - نص');
 */
function generateNumLetterRegex(config) {
  const digits = NUMBERING_PATTERNS[config.numbering];
  const separatorSource = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
  const markerSource = `${digits} [أ-ي]\\s?${separatorSource}`;
  return new RegExp(`^(?<full>(?<marker>${markerSource})(?<content>[\\s\\S]*))`, "u");
}
/**
 * Builds the regular expression for number-parenthetical-separator markers.
 *
 * The marker is: a number in the configured numbering style, optional
 * whitespace, a parenthesised run of Arabic-block characters and/or
 * whitespace, an optional whitespace character, then the separator. Examples:
 * - ٥ (أ) - (number, parenthetical content, separator)
 * - 5 (٦) - (number with parenthetical number)
 *
 * A `separator` that is not a key of SEPARATOR_PATTERNS is inserted into the
 * pattern as written.
 *
 * @param config - Configuration with required `numbering` and `separator` fields
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateNumParenRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ (أ) - نص');
 */
function generateNumParenRegex(config) {
  const digits = NUMBERING_PATTERNS[config.numbering];
  const separatorSource = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
  const parenthetical = "\\([\\u0600-\\u06FF\\u0660-\\u0669\\s]+\\)";
  const markerSource = `${digits}\\s*${parenthetical}\\s?${separatorSource}`;
  return new RegExp(`^(?<full>(?<marker>${markerSource})(?<content>[\\s\\S]*))`, "u");
}
/**
 * Builds the regular expression for number-slash-number markers.
 *
 * The marker is: a number, optionally followed by a slash and a second number
 * (whitespace allowed on either side of the slash), an optional whitespace
 * character, then the separator. Examples:
 * - ٥/٦ - (number slash number, separator)
 * - ٥ - (single number, separator)
 *
 * A `separator` that is not a key of SEPARATOR_PATTERNS is inserted into the
 * pattern as written.
 *
 * @param config - Configuration with required `numbering` and `separator` fields
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateNumSlashRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match1 = regex.exec('٥/٦ - نص');
 * const match2 = regex.exec('٥ - نص'); // Also matches
 */
function generateNumSlashRegex(config) {
  const digits = NUMBERING_PATTERNS[config.numbering];
  const separatorSource = SEPARATOR_PATTERNS[config.separator] ?? config.separator;
  const optionalSecondNumber = `(?:\\s?/\\s?${digits})?`;
  const markerSource = `${digits}${optionalSecondNumber}\\s?${separatorSource}`;
  return new RegExp(`^(?<full>(?<marker>${markerSource})(?<content>[\\s\\S]*))`, "u");
}
/**
 * Builds the regular expression for numbered markers.
 *
 * Two modes, checked in this order:
 * 1. Format template: `config.format` is expanded with expandTemplate, using
 *    TOKENS or, when `config.tokens` is given, TOKENS extended by those tokens
 *    (e.g. '{bullet}+ {num} {dash}').
 * 2. Default pattern: a number in the `numbering` style, an optional
 *    whitespace character, then the separator.
 *
 * In the default pattern:
 * - Separator 'none' (or any separator that resolves to an empty value)
 *   produces a marker made of the number alone.
 * - A separator that is not a key of SEPARATOR_PATTERNS is inserted into the
 *   pattern as written.
 *
 * @param config - Configuration with `numbering`, `separator`, and optional `format`/`tokens`
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * // Using format template
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash',
 *   format: '{bullet}+ {num} {dash}'
 * });
 *
 * @example
 * // Using default pattern
 * const regex = generateNumberedRegex({
 *   numbering: 'arabic-indic',
 *   separator: 'dash'
 * });
 * const match = regex.exec('٥ - نص');
 *
 * @example
 * // With 'none' separator
 * const regex = generateNumberedRegex({
 *   numbering: 'latin',
 *   separator: 'none'
 * });
 * const match = regex.exec('5 text');
 */
function generateNumberedRegex(config) {
  if (config.format) {
    const availableTokens = config.tokens ? createTokenMap(config.tokens) : TOKENS;
    return new RegExp(expandTemplate(config.format, { tokens: availableTokens }), "u");
  }

  const digits = NUMBERING_PATTERNS[config.numbering];
  const requested = config.separator;

  let separatorSource = "";
  if (requested !== "none") {
    separatorSource = SEPARATOR_PATTERNS[requested] ?? requested;
  }

  // Without a usable separator the marker is just the number.
  let markerSource = digits;
  if (separatorSource) {
    markerSource = `${digits}\\s?${separatorSource}`;
  }

  return new RegExp(`^(?<full>(?<marker>${markerSource})(?<content>[\\s\\S]*))`, "u");
}
/**
 * Builds the regular expression for bullet-point markers.
 *
 * The marker is one bullet character followed by an optional whitespace
 * character. Accepted bullet characters:
 * - • (bullet)
 * - * (asterisk)
 * - ° (degree)
 * - - (dash)
 *
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateBulletRegex();
 * const match = regex.exec('• نقطة');
 * // match.groups.marker -> '• '
 * // match.groups.content -> 'نقطة'
 */
function generateBulletRegex() {
  // A regex literal is evaluated afresh on every call, so each caller still
  // receives its own RegExp instance.
  return /^(?<full>(?<marker>[•*°\-]\s?)(?<content>[\s\S]*))/u;
}
/**
 * Builds the regular expression for Markdown-style heading markers.
 *
 * The marker is one or more hash symbols followed by an optional whitespace
 * character:
 * - # Heading 1
 * - ## Heading 2
 * - ### Heading 3
 * - etc.
 *
 * @returns A compiled RegExp (flag `u`) with named groups: `full`, `marker`, `content`
 *
 * @example
 * const regex = generateHeadingRegex();
 * const match = regex.exec('## عنوان فرعي');
 * // match.groups.marker -> '## '
 * // match.groups.content -> 'عنوان فرعي'
 */
function generateHeadingRegex() {
  // A regex literal is evaluated afresh on every call, so each caller still
  // receives its own RegExp instance.
  return /^(?<full>(?<marker>#+\s?)(?<content>[\s\S]*))/u;
}
//#endregion
//#region src/markers/generator.ts
/**
 * Generates a regex from a marker configuration.
 * Dispatches on `config.type` to the matching type-specific generator; the
 * generators for named marker types return a regex with three named groups:
 * - full: Complete match including marker
 * - marker: Just the marker part (for metadata/indexing)
 * - content: The text that follows the marker
 *
 * Before dispatching, `numbering` and `separator` are filled in with
 * DEFAULT_NUMBERING / DEFAULT_SEPARATOR whenever they are missing, `undefined`
 * or `null` in the config.
 *
 * @param config - Marker configuration
 * @returns Regular expression produced by the generator for `config.type`
 * @throws {Error} When `config.type` is not a known marker type
 *
 * @example
 * const regex = generateRegexFromMarker({ type: 'numbered' });
 * const match = regex.exec('٥ - نص');
 * match.groups.full // "٥ - نص"
 * match.groups.marker // "٥ -"
 * match.groups.content // " نص"
 */
function generateRegexFromMarker(config) {
  // Spread the config first and apply the defaults afterwards. With the spread
  // last, an explicitly present `numbering: undefined` (or `separator: null`)
  // would overwrite the default and end up in the pattern as "undefined".
  const normalized = {
    ...config,
    numbering: config.numbering ?? DEFAULT_NUMBERING,
    separator: config.separator ?? DEFAULT_SEPARATOR
  };
  switch (normalized.type) {
    case "pattern": return generatePatternRegex(normalized);
    case "bab": return generateBabRegex();
    case "hadith-chain": return generateHadithChainRegex(normalized);
    case "basmala": return generateBasmalaRegex();
    case "phrase": return generatePhraseRegex(normalized);
    case "square-bracket": return generateSquareBracketRegex();
    case "num-letter": return generateNumLetterRegex(normalized);
    case "num-paren": return generateNumParenRegex(normalized);
    case "num-slash": return generateNumSlashRegex(normalized);
    case "numbered": return generateNumberedRegex(normalized);
    case "bullet": return generateBulletRegex();
    case "heading": return generateHeadingRegex();
    default:
      throw new Error(`Unknown marker type: ${normalized.type}`);
  }
}
//#endregion
export { DEFAULT_BASMALA_PATTERNS, DEFAULT_HADITH_PHRASES, DEFAULT_NUMBERING, DEFAULT_SEPARATOR, DEFAULT_SEPARATOR_PATTERN, NUMBERING_PATTERNS, SEPARATOR_PATTERNS, TOKENS, createTokenMap, expandTemplate, generateBabRegex, generateBasmalaRegex, generateBulletRegex, generateHadithChainRegex, generateHeadingRegex, generateNumLetterRegex, generateNumParenRegex, generateNumSlashRegex, generateNumberedRegex, generatePatternRegex, generatePhraseRegex, generateRegexFromMarker, generateSquareBracketRegex, validateTemplate };
//# sourceMappingURL=index.mjs.map