// @ahmed5938/sorani-helper
// Version:
// TypeScript library for processing Central Kurdish (Sorani) text: Arabic-to-Kurdish conversion, keyboard layout mapping, validation, and input handling.
// 156 lines (155 loc) • 6.18 kB
// JavaScript
"use strict";
// Set the `__esModule` flag on the exports object (conventionally read by
// CommonJS/ES-module interop helpers).
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder; the real class is assigned to this export at the end of the file.
exports.KurdishProcessor = void 0;
// Character tables and regexes used by KurdishProcessor below:
// KURDISH_SORANI_33_LETTERS, EMOJI_REGEX, ARABIC_VARIANTS_TO_REJECT,
// NON_KURDISH_SCRIPTS, ARABIC_TO_KURDISH_REPLACEMENTS, ENGLISH_TO_KURDISH_LAYOUT.
const constants_1 = require("./constants");
// Character groups that the option flags add on top of the Sorani alphabet.
// They are shared by process() and validate() so that text emitted by
// process() is always accepted by validate() under the same options.
const EXTRA_DIGITS = '0123456789٠١٢٣٤٥٦٧٨٩';
const EXTRA_PUNCTUATION = '.,;:!?()-"\'،؛؟«»[]{}';
const EXTRA_WHITESPACE = ' \t\n';
const EXTRA_LATIN_LETTERS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ';

/**
 * Builds the set of individually allowed (non-emoji) characters.
 *
 * @param {object} options - Processor options (allowDigits, allowPunctuation, allowSpaces).
 * @param {boolean} includeLatin - Whether ASCII Latin letters are allowed as well.
 * @returns {Set<string>} Allowed characters.
 */
function buildAllowedSet(options, includeLatin) {
    const allowed = new Set(constants_1.KURDISH_SORANI_33_LETTERS);
    const addAll = (chars) => {
        for (const char of chars) {
            allowed.add(char);
        }
    };
    if (options.allowDigits) {
        addAll(EXTRA_DIGITS);
    }
    if (options.allowPunctuation) {
        addAll(EXTRA_PUNCTUATION);
    }
    if (options.allowSpaces) {
        addAll(EXTRA_WHITESPACE);
    }
    if (includeLatin) {
        addAll(EXTRA_LATIN_LETTERS);
    }
    return allowed;
}

/**
 * Splits text into consecutive emoji / non-emoji segments, in order.
 *
 * A private global copy of the regex is used so the shared constant's
 * `lastIndex` is never touched and every emoji is found.
 *
 * @param {string} text - Text to segment.
 * @param {RegExp} emojiRegex - Regex matching a single emoji.
 * @returns {Array<{isEmoji: boolean, value: string}>} Ordered segments.
 */
function segmentByEmoji(text, emojiRegex) {
    const flags = emojiRegex.flags.includes('g') ? emojiRegex.flags : `${emojiRegex.flags}g`;
    const matcher = new RegExp(emojiRegex.source, flags);
    const segments = [];
    let cursor = 0;
    let match = matcher.exec(text);
    while (match !== null) {
        if (match[0] === '') {
            // A zero-length match would never advance; step over it.
            matcher.lastIndex += 1;
        }
        else {
            if (match.index > cursor) {
                segments.push({ isEmoji: false, value: text.slice(cursor, match.index) });
            }
            segments.push({ isEmoji: true, value: match[0] });
            cursor = match.index + match[0].length;
        }
        match = matcher.exec(text);
    }
    if (cursor < text.length) {
        segments.push({ isEmoji: false, value: text.slice(cursor) });
    }
    return segments;
}

/**
 * Keeps only allowed characters (and, optionally, emojis) from text.
 *
 * Emojis are handled as whole segments rather than via a placeholder
 * character, so no input character can be mistaken for an emoji slot.
 *
 * @param {string} text - Text to filter.
 * @param {Set<string>} allowedSet - Allowed non-emoji characters.
 * @param {boolean} keepEmojis - Whether emoji segments are kept.
 * @returns {string} Filtered text.
 */
function filterAllowed(text, allowedSet, keepEmojis) {
    let result = '';
    for (const segment of segmentByEmoji(text, constants_1.EMOJI_REGEX)) {
        if (segment.isEmoji) {
            if (keepEmojis) {
                result += segment.value;
            }
            continue;
        }
        for (const char of segment.value) {
            if (allowedSet.has(char)) {
                result += char;
            }
        }
    }
    return result;
}

/**
 * Runs `regex.test(text)` from the start of the string.
 *
 * A regex with the `g` or `y` flag remembers `lastIndex` between calls;
 * resetting it keeps repeated validations independent of each other.
 *
 * @param {RegExp} regex - Shared regex constant.
 * @param {string} text - Text to test.
 * @returns {boolean} Whether the regex matches anywhere in text.
 */
function testFromStart(regex, text) {
    regex.lastIndex = 0;
    const found = regex.test(text);
    regex.lastIndex = 0;
    return found;
}

/**
 * Applies the enabled automatic conversions in one fixed order:
 * English keyboard layout first, then Arabic-to-Kurdish, so letters produced
 * by the layout mapping also pass through the Arabic-to-Kurdish replacements.
 *
 * @param {KurdishProcessor} processor - Processor whose options and converters are used.
 * @param {string} text - Raw input.
 * @returns {string} Converted text.
 */
function applyConversions(processor, text) {
    let converted = text;
    if (processor.options.autoConvertEnglishLayout) {
        converted = processor.convertEnglishLayoutToKurdish(converted);
    }
    if (processor.options.autoConvertArabic) {
        converted = processor.convertArabicToKurdish(converted);
    }
    return converted;
}

/**
 * Converts, filters and validates Central Kurdish (Sorani) text according to
 * a set of boolean options.
 */
class KurdishProcessor {
    /**
     * @param {object} [options] - Overrides for the defaults listed below.
     */
    constructor(options = {}) {
        this.options = {
            allowDigits: false,
            allowPunctuation: false,
            allowSpaces: true,
            allowEmojis: false,
            autoConvertArabic: true,
            autoConvertEnglishLayout: false,
            blockOtherScripts: true,
            strict: true,
            ...options,
        };
    }

    /**
     * Converts the input (per the auto-convert options) and strips every
     * character that the current options do not allow.
     *
     * @param {string} input - Raw text.
     * @returns {string} Converted and filtered text.
     */
    process(input) {
        const converted = applyConversions(this, input);
        // filterText never admits Latin letters; filterByAllowedOptions admits
        // them when other scripts are not blocked.
        return this.options.blockOtherScripts
            ? this.filterText(converted)
            : this.filterByAllowedOptions(converted);
    }

    /**
     * Filters text down to the allowed characters; ASCII Latin letters are
     * additionally kept when `blockOtherScripts` is false.
     *
     * @param {string} text - Text to filter.
     * @returns {string} Filtered text.
     */
    filterByAllowedOptions(text) {
        const allowedSet = buildAllowedSet(this.options, !this.options.blockOtherScripts);
        return filterAllowed(text, allowedSet, this.options.allowEmojis);
    }

    /**
     * Checks the input against the current options without filtering it.
     *
     * The same conversions and the same allowed-character set as process()
     * are used, so output of process() validates under identical options.
     *
     * @param {string} input - Raw text.
     * @returns {{isValid: boolean, errors: string[], converted: (string|undefined)}}
     *   `converted` is the converted text when valid, otherwise undefined.
     */
    validate(input) {
        const errors = [];
        const converted = applyConversions(this, input);
        if (this.options.strict && testFromStart(constants_1.ARABIC_VARIANTS_TO_REJECT, converted)) {
            errors.push('Contains forbidden Arabic variants');
        }
        if (this.options.blockOtherScripts && testFromStart(constants_1.NON_KURDISH_SCRIPTS, converted)) {
            errors.push('Contains non-Kurdish script characters');
        }
        const allowedChars = buildAllowedSet(this.options, !this.options.blockOtherScripts);
        let hasEmoji = false;
        let hasInvalidChar = false;
        for (const segment of segmentByEmoji(converted, constants_1.EMOJI_REGEX)) {
            if (segment.isEmoji) {
                hasEmoji = true;
            }
            else if (!hasInvalidChar) {
                hasInvalidChar = [...segment.value].some((char) => !allowedChars.has(char));
            }
        }
        if (hasInvalidChar) {
            errors.push('Contains invalid characters');
        }
        if (hasEmoji && !this.options.allowEmojis) {
            errors.push('Emojis are not allowed');
        }
        const isValid = errors.length === 0;
        return {
            isValid,
            errors,
            converted: isValid ? converted : undefined,
        };
    }

    /**
     * Applies every [pattern, replacement] pair from
     * ARABIC_TO_KURDISH_REPLACEMENTS, in iteration order.
     *
     * NOTE(review): each pair is passed straight to String.prototype.replace,
     * so a string or non-global regex pattern would only replace the first
     * occurrence - confirm the patterns in ./constants are global regexes.
     *
     * @param {string} text - Text to convert.
     * @returns {string} Text after all replacements.
     */
    convertArabicToKurdish(text) {
        let result = text;
        for (const [pattern, replacement] of constants_1.ARABIC_TO_KURDISH_REPLACEMENTS) {
            result = result.replace(pattern, replacement);
        }
        return result;
    }

    /**
     * Maps each character through ENGLISH_TO_KURDISH_LAYOUT, leaving
     * characters without a (non-empty) mapping unchanged.
     *
     * @param {string} text - Text typed on an English layout.
     * @returns {string} Text with mapped characters substituted.
     */
    convertEnglishLayoutToKurdish(text) {
        return text
            .split('')
            .map((char) => constants_1.ENGLISH_TO_KURDISH_LAYOUT[char] || char)
            .join('');
    }

    /**
     * Filters text down to the Sorani letters plus whatever the allow*
     * options add; Latin letters are never kept by this method.
     *
     * @param {string} text - Text to filter.
     * @returns {string} Filtered text.
     */
    filterText(text) {
        const allowedSet = buildAllowedSet(this.options, false);
        return filterAllowed(text, allowedSet, this.options.allowEmojis);
    }

    /**
     * Escapes regex metacharacters so the string can be embedded in a RegExp
     * source and match literally.
     *
     * @param {string} string - Text to escape.
     * @returns {string} Escaped text.
     */
    escapeRegExp(string) {
        return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }

    /**
     * Merges new option values over the current ones.
     *
     * @param {object} newOptions - Options to override.
     */
    updateOptions(newOptions) {
        this.options = { ...this.options, ...newOptions };
    }
}
// Publish the class as the module's named CommonJS export `KurdishProcessor`.
exports.KurdishProcessor = KurdishProcessor;