/*
 * search-text-normalizer
 * Version: (not specified)
 * Multilingual text normalization utilities for search accuracy.
 * 216 lines (179 loc) • 8.68 kB
 * JavaScript
 */
const { normalizeText } = require('./index');
describe('Text Normalizer', () => {
// Suite: behaviour of normalizeText when it is called without a language code.
describe('Generic Normalization', () => {
  // Calls normalizeText(input) for each [input, expected] pair, in order,
  // and fails on the first mismatch.
  const expectNormalized = (cases) => {
    for (const [input, expected] of cases) {
      expect(normalizeText(input)).toBe(expected);
    }
  };

  test('should convert text to lowercase', () => {
    expectNormalized([
      ['HELLO WORLD', 'hello world'],
      ['Hello World', 'hello world'],
    ]);
  });

  test('should normalize punctuation marks', () => {
    // Each punctuation mark (or run of marks) is expected to end up as a
    // single separating space, or to vanish at the edges of the string.
    expectNormalized([
      ['Hello... World!!!', 'hello world'],
      ['Hello«World»', 'hello world'],
      ['Hello(World)', 'hello world'],
      ['Hello[World]', 'hello world'],
      ['Hello-World', 'hello world'],
      ['Hello:World?', 'hello world'],
      ['¡Hello¿World', 'hello world'],
    ]);
  });

  test('should normalize multiple spaces', () => {
    // The inputs must contain real runs of spaces; single-spaced input would
    // pass even if whitespace collapsing were broken.
    expectNormalized([
      ['Hello     World', 'hello world'],
      ['   Hello   World   ', 'hello world'],
    ]);
  });

  test('should handle empty and null inputs', () => {
    // Empty strings and non-string values are expected to come back unchanged.
    for (const passthrough of ['', null, undefined, 123]) {
      expect(normalizeText(passthrough)).toBe(passthrough);
    }
  });
});
// Suite: expectations for normalizeText when called with the 'ar' language code.
describe('Arabic Normalization', () => {
  const LANG = 'ar';

  // Checks every [raw, wanted] pair against normalizeText(raw, 'ar'), in order.
  const assertArabic = (pairs) => {
    pairs.forEach(([raw, wanted]) => {
      expect(normalizeText(raw, LANG)).toBe(wanted);
    });
  };

  test('should remove Arabic diacritics (Tashkeel)', () => {
    assertArabic([
      ['مَرْحَباً', 'مرحبا'],
      ['الْحَمْدُ لِلَّهِ', 'الحمد لله'],
    ]);
  });

  test('should normalize different forms of Alef', () => {
    // Every Alef variant is expected to fold to the bare Alef; a bare Alef is unchanged.
    assertArabic([
      ['آمن', 'امن'],
      ['أحمد', 'احمد'],
      ['إبراهيم', 'ابراهيم'],
      ['اسم', 'اسم'],
      ['ٱلله', 'الله'],
    ]);
  });

  test('should normalize different forms of Yeh', () => {
    assertArabic([
      ['يوم', 'يوم'],
      ['على', 'علي'],
      ['شيء', 'شيء'],
    ]);
  });

  test('should normalize Waw with Hamza', () => {
    assertArabic([['مؤمن', 'مومن']]);
  });

  test('should remove Tatweel (elongation)', () => {
    assertArabic([['الرحمـــــان', 'الرحمان']]);
  });

  test('should normalize Arabic punctuation', () => {
    assertArabic([['السلام، عليكم؛ ورحمة الله؟', 'السلام عليكم ورحمة الله']]);
  });
});
// Suite: expectations for normalizeText when called with the 'he' language code.
//
// NOTE(review): the expected values below fold a word-final mem to the regular
// form in 'שלומ' and 'בנאדמ', yet keep the final form in 'עולם' and 'עליכם'.
// Confirm against the implementation that this asymmetry is intended.
describe('Hebrew Normalization', () => {
  const LANG = 'he';

  // Checks every [raw, wanted] pair against normalizeText(raw, 'he'), in order.
  const assertHebrew = (pairs) => {
    for (const [raw, wanted] of pairs) {
      expect(normalizeText(raw, LANG)).toBe(wanted);
    }
  };

  test('should remove Hebrew vowel points (Nikud)', () => {
    assertHebrew([
      ['שָׁלוֹם', 'שלומ'],
      ['עוֹלָם', 'עולם'],
    ]);
  });

  test('should handle בן־אדם specifically', () => {
    // Unlike the generic Maqqef case further down, no space is expected here.
    assertHebrew([['בן־אדם', 'בנאדמ']]);
  });

  test('should remove cantillation marks', () => {
    assertHebrew([['בְּרֵאשִׁ֖ית', 'בראשית']]);
  });

  test('should normalize final letters', () => {
    assertHebrew([
      ['מלך', 'מלכ'],
      ['שלום', 'שלומ'],
      ['רוח', 'רוח'],
      ['יוסף', 'יוספ'],
      ['ארץ', 'ארצ'],
    ]);
  });

  test('should handle Maqqef (Hebrew hyphen)', () => {
    assertHebrew([['אל־מול', 'אל מול']]);
  });

  test('should normalize Hebrew punctuation', () => {
    assertHebrew([['שלום׃ עליכם׀', 'שלומ עליכם']]);
  });
});
// Suite: expectations for normalizeText when called with the 'el' language code.
describe('Greek Normalization', () => {
  const LANG = 'el';

  // Checks every [raw, wanted] pair against normalizeText(raw, 'el'), in order.
  const assertGreek = (pairs) => {
    pairs.forEach(([raw, wanted]) => {
      expect(normalizeText(raw, LANG)).toBe(wanted);
    });
  };

  test('should convert to lowercase', () => {
    // The expected value ends in a regular sigma, not a final sigma.
    assertGreek([['ΚΟΣΜΟΣ', 'κοσμοσ']]);
  });

  test('should remove accents and breathing marks', () => {
    assertGreek([
      ['Γεῖα σου κόσμε', 'γεια σου κοσμε'],
      ['ἀγάπη', 'αγαπη'],
      ['εἰρήνη', 'ειρηνη'],
    ]);
  });

  test('should normalize final sigma', () => {
    assertGreek([
      ['κόσμος', 'κοσμοσ'],
      ['λόγος', 'λογοσ'],
    ]);
  });

  test('should normalize Greek punctuation', () => {
    assertGreek([['Γεια σου· κόσμε;', 'γεια σου κοσμε']]);
  });
});
// Suite: expectations for normalizeText when called with the 'la' language code.
describe('Latin Normalization', () => {
  const LANG = 'la';

  // Checks every [raw, wanted] pair against normalizeText(raw, 'la'), in order.
  const assertLatin = (pairs) => {
    for (const [raw, wanted] of pairs) {
      expect(normalizeText(raw, LANG)).toBe(wanted);
    }
  };

  test('should normalize accented characters', () => {
    assertLatin([
      ['café', 'cafe'],
      ['résumé', 'resume'],
      ['naïve', 'naive'],
      ['piñata', 'pinata'],
    ]);
  });

  test('should handle uppercase accented characters', () => {
    assertLatin([
      ['CAFÉ', 'cafe'],
      ['RÉSUMÉ', 'resume'],
    ]);
  });

  test('should normalize various Latin diacritics', () => {
    // One row per base vowel; every accented variant should fold to the plain letter.
    assertLatin([
      ['àáâãäå', 'aaaaaa'],
      ['èéêë', 'eeee'],
      ['ìíîï', 'iiii'],
      ['òóôõöø', 'oooooo'],
      ['ùúûü', 'uuuu'],
    ]);
  });

  test('should handle special characters', () => {
    assertLatin([
      ['ç', 'c'],
      ['Ç', 'c'],
      ['ñ', 'n'],
      ['Ñ', 'n'],
    ]);
  });
});
// Suite: expectations for normalizeText when called with the 'sy' language code.
describe('Syriac Normalization', () => {
  // Normalizes the given text as Syriac and compares it with the wanted result.
  const assertSyriac = (raw, wanted) => {
    const actual = normalizeText(raw, 'sy');
    expect(actual).toBe(wanted);
  };

  test('should remove Syriac vowel points', () => {
    assertSyriac('ܫܠܵܡܵܐ', 'ܫܠܡܐ');
  });

  test('should remove Syriac punctuation and marks', () => {
    assertSyriac('ܫܠܡܐ܀ ܥܠܡܐ܁', 'ܫܠܡܐ ܥܠܡܐ');
  });

  test('should normalize Syriac text completely', () => {
    // Vowel points and punctuation together in one input.
    assertSyriac('ܫܠܵܡܵܐ܀ ܥܵܠܡܵܐ܁', 'ܫܠܡܐ ܥܠܡܐ');
  });
});
// Suite: how normalizeText behaves for unrecognised language codes, and how the
// generic rules combine with language-specific ones.
describe('Language Detection and Fallback', () => {
  test('should use generic normalization for unknown languages', () => {
    const fallbackCases = [
      ['Hello World', 'unknown'],
      ['HELLO WORLD', 'fr'],
    ];
    fallbackCases.forEach(([raw, lang]) => {
      expect(normalizeText(raw, lang)).toBe('hello world');
    });
  });

  test('should apply generic normalization after language-specific normalization', () => {
    // NOTE(review): in the Hebrew row the final mem is folded in 'שלומ' but kept
    // in 'עולם' — confirm this is the intended output.
    const combinedCases = [
      ['مَرْحَباً... بِالعَالَم!!!', 'ar', 'مرحبا بالعالم'],
      ['שָׁלוֹם... עוֹלָם!!!', 'he', 'שלומ עולם'],
    ];
    for (const [raw, lang, wanted] of combinedCases) {
      expect(normalizeText(raw, lang)).toBe(wanted);
    }
  });

  test('should handle mixed scripts with generic normalization', () => {
    // No language code: only the Latin part is expected to change (lowercased).
    const actual = normalizeText('Hello مرحبا שלום');
    expect(actual).toBe('hello مرحبا שלום');
  });
});
// Suite: degenerate inputs (blank, punctuation-only, diacritics-only, very long).
describe('Edge Cases', () => {
  // Asserts that each [raw, lang] pair normalizes to the empty string.
  const assertAllEmpty = (pairs) => {
    for (const [raw, lang] of pairs) {
      expect(normalizeText(raw, lang)).toBe('');
    }
  };

  test('should handle whitespace-only strings', () => {
    assertAllEmpty([
      [' ', 'ar'],
      ['\t\n', 'he'],
    ]);
  });

  test('should handle strings with only punctuation', () => {
    assertAllEmpty([
      ['...!!!', 'ar'],
      ['،؛؟', 'ar'],
    ]);
  });

  test('should handle very long strings', () => {
    const repetitions = 1000;
    // The repeated input ends in a space, so the expected value is trimmed.
    const wanted = 'hello world '.repeat(repetitions).trim();
    const actual = normalizeText('Hello World '.repeat(repetitions));
    expect(actual).toBe(wanted);
  });

  test('should handle strings with only diacritics', () => {
    assertAllEmpty([
      ['َُِّْ', 'ar'],
      ['ֵֶַָֹֻ', 'he'],
    ]);
  });
});
// Suite: repeatability of normalizeText and tolerance of a missing language code.
describe('Performance and Consistency', () => {
  test('should be consistent across multiple calls', () => {
    const sample = 'مَرْحَباً بِالعَالَم';
    // Normalize the same input twice and compare the two results with each other.
    const [firstRun, secondRun] = Array.from({ length: 2 }, () => normalizeText(sample, 'ar'));
    expect(firstRun).toBe(secondRun);
  });

  test('should handle empty language parameter gracefully', () => {
    // An empty-string or null language is expected to behave like the generic path.
    for (const emptyLang of ['', null]) {
      expect(normalizeText('Hello World', emptyLang)).toBe('hello world');
    }
  });
});
});