UNPKG

@ahmed5938/sorani-helper

Version:

TypeScript library for processing Central Kurdish (Sorani) text: Arabic-to-Kurdish conversion, keyboard layout mapping, validation, and input handling.

156 lines (155 loc) 6.18 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.KurdishProcessor = void 0;
const constants_1 = require("./constants");

/** Western and Eastern Arabic digits accepted when `allowDigits` is enabled. */
const DIGITS = '0123456789٠١٢٣٤٥٦٧٨٩';
/**
 * Punctuation accepted when `allowPunctuation` is enabled.
 * One shared list keeps filtering and validation in agreement: the hyphen is
 * a plain character here (no regex-style `\-` escape), so a backslash is NOT
 * part of the allowed punctuation.
 */
const PUNCTUATION = '.,;:!?()-"\'،؛؟«»[]{}';
/** Whitespace accepted when `allowSpaces` is enabled. */
const WHITESPACE = ' \t\n';
/** Basic Latin letters accepted when other scripts are not blocked. */
const LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

/**
 * Builds the set of individually allowed characters for the given options.
 *
 * @param {object} options - Processor options (`allowDigits`, `allowPunctuation`, `allowSpaces`).
 * @param {boolean} includeLatin - Whether basic Latin letters are allowed as well.
 * @returns {Set<string>} Allowed characters.
 */
function buildAllowedSet(options, includeLatin) {
    // NOTE(review): KURDISH_SORANI_33_LETTERS is assumed to be a string (or other
    // iterable) of single characters - confirm against ./constants.
    const allowed = new Set(constants_1.KURDISH_SORANI_33_LETTERS);
    const extend = (chars) => {
        for (const char of chars) {
            allowed.add(char);
        }
    };
    if (options.allowDigits) {
        extend(DIGITS);
    }
    if (options.allowPunctuation) {
        extend(PUNCTUATION);
    }
    if (options.allowSpaces) {
        extend(WHITESPACE);
    }
    if (includeLatin) {
        extend(LATIN_LETTERS);
    }
    return allowed;
}

/**
 * Removes every character that is not in `allowedSet`. Substrings matched by
 * EMOJI_REGEX are handled as whole units: kept in place when `keepEmojis` is
 * true, dropped otherwise.
 *
 * The text is walked segment by segment instead of swapping emojis for a
 * placeholder character, so a placeholder-like character in the input can
 * never shift the position at which emojis are re-inserted.
 *
 * @param {string} text - Text to filter.
 * @param {Set<string>} allowedSet - Characters to keep.
 * @param {boolean} keepEmojis - Whether emoji matches survive filtering.
 * @returns {string} Filtered text.
 */
function filterCharacters(text, allowedSet, keepEmojis) {
    const keepAllowed = (segment) => segment
        .split('')
        .filter((char) => allowedSet.has(char))
        .join('');
    let filtered = '';
    let cursor = 0;
    // String#replace is used purely as a match iterator (its result is
    // discarded); it behaves sensibly for global and non-global patterns alike.
    text.replace(constants_1.EMOJI_REGEX, (emoji, ...rest) => {
        // Callback arguments are (match, ...captures, offset, input[, groups]);
        // captures are strings or undefined, so the first number is the offset.
        const offset = rest.find((arg) => typeof arg === 'number');
        filtered += keepAllowed(text.slice(cursor, offset));
        if (keepEmojis) {
            filtered += emoji;
        }
        cursor = offset + emoji.length;
        return emoji;
    });
    return filtered + keepAllowed(text.slice(cursor));
}

/**
 * Applies the enabled automatic conversions in one fixed order (keyboard
 * layout first, then Arabic normalisation) so that `process` and `validate`
 * always see the same converted text.
 *
 * @param {KurdishProcessor} processor - Instance whose options and converters are used.
 * @param {string} text - Raw input.
 * @returns {string} Converted text.
 */
function applyAutoConversions(processor, text) {
    let converted = text;
    if (processor.options.autoConvertEnglishLayout) {
        converted = processor.convertEnglishLayoutToKurdish(converted);
    }
    if (processor.options.autoConvertArabic) {
        converted = processor.convertArabicToKurdish(converted);
    }
    return converted;
}

/**
 * Reports whether `pattern` matches anywhere in `text`.
 * `String#search` always scans from the start and leaves `lastIndex`
 * untouched, so the answer does not depend on earlier calls even if the
 * shared pattern carries the `g` or `y` flag (unlike `RegExp#test`).
 *
 * @param {RegExp} pattern - Pattern to look for.
 * @param {string} text - Text to scan.
 * @returns {boolean} True when at least one match exists.
 */
function patternMatches(pattern, text) {
    return text.search(pattern) !== -1;
}

/**
 * Cleans and validates Central Kurdish (Sorani) text according to a set of
 * options: optional conversion of English-layout keystrokes and Arabic
 * letters, followed by filtering (`process`) or checking (`validate`)
 * against the allowed character set.
 */
class KurdishProcessor {
    /**
     * @param {object} [options] - Overrides for the defaults below.
     * @param {boolean} [options.allowDigits=false] - Accept Western and Eastern Arabic digits.
     * @param {boolean} [options.allowPunctuation=false] - Accept the supported punctuation marks.
     * @param {boolean} [options.allowSpaces=true] - Accept space, tab and newline.
     * @param {boolean} [options.allowEmojis=false] - Accept emoji matches.
     * @param {boolean} [options.autoConvertArabic=true] - Run the Arabic-to-Kurdish replacements.
     * @param {boolean} [options.autoConvertEnglishLayout=false] - Map English-layout keystrokes to Kurdish.
     * @param {boolean} [options.blockOtherScripts=true] - Reject or strip characters from other scripts.
     * @param {boolean} [options.strict=true] - In `validate`, reject the forbidden Arabic variants.
     */
    constructor(options = {}) {
        this.options = {
            allowDigits: false,
            allowPunctuation: false,
            allowSpaces: true,
            allowEmojis: false,
            autoConvertArabic: true,
            autoConvertEnglishLayout: false,
            blockOtherScripts: true,
            strict: true,
            ...options,
        };
    }

    /**
     * Converts the input (when enabled) and strips every character that is not allowed.
     *
     * @param {string} input - Raw text.
     * @returns {string} Converted and filtered text.
     */
    process(input) {
        const converted = applyAutoConversions(this, input);
        // With blockOtherScripts disabled the lenient filter is used, which
        // additionally lets basic Latin letters through.
        return this.options.blockOtherScripts
            ? this.filterText(converted)
            : this.filterByAllowedOptions(converted);
    }

    /**
     * Filters `text` by the allowed options; basic Latin letters are also kept
     * when `blockOtherScripts` is disabled.
     *
     * @param {string} text - Text to filter.
     * @returns {string} Filtered text.
     */
    filterByAllowedOptions(text) {
        const allowedSet = buildAllowedSet(this.options, !this.options.blockOtherScripts);
        return filterCharacters(text, allowedSet, this.options.allowEmojis);
    }

    /**
     * Converts the input (when enabled) and checks it against the same
     * character rules that `process` enforces.
     *
     * @param {string} input - Raw text.
     * @returns {{isValid: boolean, errors: string[], converted: (string|undefined)}}
     *   `converted` is only set when no errors were found.
     */
    validate(input) {
        const errors = [];
        const converted = applyAutoConversions(this, input);
        if (this.options.strict
            && patternMatches(constants_1.ARABIC_VARIANTS_TO_REJECT, converted)) {
            errors.push('Contains forbidden Arabic variants');
        }
        if (this.options.blockOtherScripts
            && patternMatches(constants_1.NON_KURDISH_SCRIPTS, converted)) {
            errors.push('Contains non-Kurdish script characters');
        }
        // Same allowed set as the filters, so text returned by process() is
        // never reported as containing invalid characters.
        const allowedChars = buildAllowedSet(this.options, !this.options.blockOtherScripts);
        const emojiMatches = converted.match(constants_1.EMOJI_REGEX) || [];
        const textWithoutEmojis = converted.replace(constants_1.EMOJI_REGEX, '');
        // Report the generic error once, however many offending characters exist.
        for (const char of textWithoutEmojis) {
            if (!allowedChars.has(char)) {
                errors.push('Contains invalid characters');
                break;
            }
        }
        if (emojiMatches.length > 0 && !this.options.allowEmojis) {
            errors.push('Emojis are not allowed');
        }
        const isValid = errors.length === 0;
        return {
            isValid,
            errors,
            converted: isValid ? converted : undefined,
        };
    }

    /**
     * Applies every [pattern, replacement] pair from
     * ARABIC_TO_KURDISH_REPLACEMENTS, in the order the constant lists them.
     *
     * @param {string} text - Text that may contain Arabic letters.
     * @returns {string} Text after all replacements.
     */
    convertArabicToKurdish(text) {
        let result = text;
        for (const [pattern, replacement] of constants_1.ARABIC_TO_KURDISH_REPLACEMENTS) {
            result = result.replace(pattern, replacement);
        }
        return result;
    }

    /**
     * Maps each character through ENGLISH_TO_KURDISH_LAYOUT; characters
     * without a (truthy) mapping are left unchanged.
     *
     * @param {string} text - Text typed with an English keyboard layout.
     * @returns {string} Text with mapped characters substituted.
     */
    convertEnglishLayoutToKurdish(text) {
        return text
            .split('')
            .map((char) => constants_1.ENGLISH_TO_KURDISH_LAYOUT[char] || char)
            .join('');
    }

    /**
     * Strict filter: keeps only Kurdish letters plus whatever the `allow*`
     * options enable. Latin letters are never kept here.
     *
     * @param {string} text - Text to filter.
     * @returns {string} Filtered text.
     */
    filterText(text) {
        const allowedSet = buildAllowedSet(this.options, false);
        return filterCharacters(text, allowedSet, this.options.allowEmojis);
    }

    /**
     * Escapes regular-expression metacharacters so `string` can be embedded
     * literally in a pattern.
     *
     * @param {string} string - Raw text.
     * @returns {string} Escaped text.
     */
    escapeRegExp(string) {
        return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }

    /**
     * Shallow-merges `newOptions` over the current options.
     *
     * @param {object} newOptions - Option overrides.
     */
    updateOptions(newOptions) {
        this.options = { ...this.options, ...newOptions };
    }
}
exports.KurdishProcessor = KurdishProcessor;