deep-profanity-filter
Version:
A thorough profanity filter that considers most common circumventions. Works with your custom list of blocked and whitelisted words and phrases. Identifies and/or replaces bad words. Works with *wildcards* at *start and/or end* of words.
202 lines • 14.1 kB
JavaScript
// NOTE(review): a lone empty statement - presumably what is left of a stripped
// "use strict" directive; confirm against the real build output.
;
// Defines `__esModule: true` on the CommonJS `exports` object.
Object.defineProperty(exports, "__esModule", { value: true });
// Pre-declares the four exported names as `undefined`; each one is assigned its
// function further down, right after the corresponding definition.
exports.reduceRepeatCharacters = exports.textToLatin = exports.removeTextAccents = exports.unEmoji = void 0;
/**
 * Ordered `[pattern, replacement]` pairs applied by `unEmoji`.
 *
 * Every emoji is written as a code point escape (the rendered emoji is shown in
 * the trailing comment) so that the table keeps working even if the file is
 * re-encoded or passed through a tool that mangles non-ASCII characters.
 * Multi-letter emojis come first, single letters afterwards.
 */
const EMOJI_LETTER_REPLACEMENTS = [
    [/\u{1F194}/gu, 'id'], // 🆔
    [/\u{1F19A}/gu, 'vs'], // 🆚
    [/\u{1F524}/gu, 'abc'], // 🔤
    [/\u{1F18E}/gu, 'ab'], // 🆎
    [/\u{1F191}/gu, 'cl'], // 🆑
    [/\u{1F198}/gu, 'sos'], // 🆘
    [/\u{1F6BE}/gu, 'wc'], // 🚾
    [/\u{1F196}/gu, 'ng'], // 🆖
    [/\u{1F197}/gu, 'ok'], // 🆗
    [/\u{1F199}/gu, 'up'], // 🆙
    [/\u{1F192}/gu, 'cool'], // 🆒
    [/\u{1F195}/gu, 'new'], // 🆕
    [/\u{1F51F}/gu, '10'], // 🔟
    [/\u{1F193}/gu, 'free'], // 🆓
    [/\u{1F51A}/gu, 'end'], // 🔚
    [/\u{1F519}/gu, 'back'], // 🔙
    [/\u{1F51B}/gu, 'on'], // 🔛
    [/\u{1F51D}/gu, 'top'], // 🔝
    [/\u{1F51C}/gu, 'soon'], // 🔜
    [/\u{1F1E6}|\u{1F170}/gu, 'a'], // 🇦 🅰
    [/\u{1F1E7}|\u{1F171}/gu, 'b'], // 🇧 🅱
    // The copyright sign only counts in its emoji form, i.e. followed by U+FE0F.
    [/\u{1F1E8}|\u00A9\uFE0F/gu, 'c'], // 🇨 ©️
    [/\u{1F1E9}/gu, 'd'], // 🇩
    [/\u{1F1EA}/gu, 'e'], // 🇪
    [/\u{1F1EB}/gu, 'f'], // 🇫
    [/\u{1F1EC}/gu, 'g'], // 🇬
    [/\u{1F1ED}/gu, 'h'], // 🇭
    [/\u{1F1EE}/gu, 'i'], // 🇮
    [/\u{1F1EF}/gu, 'j'], // 🇯
    [/\u{1F1F0}/gu, 'k'], // 🇰
    [/\u{1F1F1}/gu, 'l'], // 🇱
    [/\u{1F1F2}/gu, 'm'], // 🇲
    [/\u{1F1F3}/gu, 'n'], // 🇳
    [/\u{1F1F4}|\u{1F17E}|\u2B55/gu, 'o'], // 🇴 🅾 ⭕
    [/\u{1F1F5}|\u{1F17F}/gu, 'p'], // 🇵 🅿
    [/\u{1F1F6}/gu, 'q'], // 🇶
    // The registered sign only counts in its emoji form, i.e. followed by U+FE0F.
    [/\u{1F1F7}|\u00AE\uFE0F/gu, 'r'], // 🇷 ®️
    [/\u{1F1F8}|\u{1F4B0}/gu, 's'], // 🇸 💰
    [/\u{1F1F9}/gu, 't'], // 🇹
    [/\u{1F1FA}/gu, 'u'], // 🇺
    [/\u{1F1FB}/gu, 'v'], // 🇻
    [/\u{1F1FC}/gu, 'w'], // 🇼
    [/\u{1F1FD}|\u274E|\u274C/gu, 'x'], // 🇽 ❎ ❌
    [/\u{1F1FE}/gu, 'y'], // 🇾
    [/\u{1F1FF}/gu, 'z'], // 🇿
];
/**
 * Replaces all emojis in a text that feature a letter (or a short word) with
 * normal latin characters.
 * Example: "🇬⭕ 🔛" turns into "go on" and "🅿🇺®️💰🇪" turns into "purse".
 * Note: This does NOT replace random emojis used to represent or mask letters,
 * such as '🌸' standing in for an 'O'.
 *
 * `textToLatin` already applies this function to its input, so call `unEmoji`
 * directly only when emoji replacement is all that is needed.
 * @param {string} inputText - The text from which you would like to remove all text based emojis.
 * @returns {string} the input text, with all letter based emojis transformed to become text.
 */
function unEmoji(inputText) {
    let latinText = inputText;
    // `String.prototype.replace` starts a global regex from index 0 on every
    // call, so the shared patterns can be reused safely.
    for (const [pattern, replacement] of EMOJI_LETTER_REPLACEMENTS) {
        latinText = latinText.replace(pattern, replacement);
    }
    return latinText;
}
exports.unEmoji = unEmoji;
/**
 * Strips the most common combining accent marks from a text.
 *
 * The text is first decomposed (Unicode NFD), which separates base characters
 * from their combining marks; the marks are then deleted.
 * Examples: "à-côtés" becomes "a-cotes", and heavily stacked ("Zalgo") text
 * such as "Z̵a̶l̶g̵o̶" becomes "Zalgo". Base characters are never transliterated,
 * so non-latin letters stay non-latin and only lose their marks.
 * @param {string} inputText - The text for which you wish to have all
 * accents removed.
 * @returns {string} the input text, stripped of all accents.
 */
function removeTextAccents(inputText) {
    // U+0300-U+036F is the Combining Diacritical Marks block; the remaining
    // alternatives are individual code points and ranges from the Hebrew
    // (U+0591-U+05C7) and Syriac (U+0711-U+074A) blocks.
    const accentMarks = /[\u0300-\u036f]|[\u0591-\u05bd]|\u05bf|\u05c1|\u05c2|\u05c4|\u05c5|\u05c7|\u0711|[\u0730-\u074a]/g;
    const decomposedText = inputText.normalize('NFD');
    return decomposedText.replace(accentMarks, '');
}
exports.removeTextAccents = removeTextAccents;
/**
* Converts a text of fancy unicode font to latin alphabet characters.
* This translation happens based on "visual appearance" of the letters,
* so if you do this to text that is written in a language of non-latin
* alphabet, you will get weird outputs.
*
* Disclaimer: This may at times mistranslate messages, and the list of
* characters that get converted is most likely not complete, although
* it is very thoroughly assembled. It will remove most common accents,
* and returns a latin string in lower case letters. Any characters that
* could not be mapped to latin characters will still appear in the string.
*
* Example:
* "áĩšâęŊą âŪáäđÕ" would turn into "this text" or "ZĖĩÍĖĖĄaĖķĖÍĖŽĖlĖķÍĖgĖĩÍÍĖĖēoĖķĖ
ĖĖ" turns into "zalgo", or
* "áâÖÎŋĖĪÍðŨ
Ü" turns into "cool".
*
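* Suggested usage: `textToLatin(inputText)` - this function already runs `removeTextAccents` and `unEmoji` on its input, so wrapping the argument in `unEmoji` first is not needed.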
* @param {string} inputText - The text which you would like to convert to latin
* @returns the input text, with foreign or special alphabet letters translated
* to latin lower case characters
*/
function textToLatin(inputText) {
inputText = removeTextAccents(inputText);
inputText = unEmoji(inputText);
// replace upper case letters that look different in lower case
inputText = inputText
.replace(/Ã/g, 'd')
.replace(/ÎĢ/g, 'e')
.replace(/áš/g, 'g')
.replace(/Î/g, 'h')
.replace(/á /g, 'j')
.replace(/Î/g, 'm')
.replace(/áĪ/g, 'q')
.replace(/Õ/g, 't')
.replace(/Ôą/g, 'u')
.replace(/Õ/g, 'z');
// convert to lower case and replace the rest
inputText = inputText.toLowerCase();
return inputText
.replace(/áī|â|áīŽ|áĩ|Îą|âąĨ|É|áķ|áš|ð|ð|ð|ðī|ðĻ|ð|ðš|ð |â|ð|ð|ð|ðŽ|ðŠ|ð|ðķ|ð|ð|ðļ|ï―|ð°|É|āļ|ðŪ|ð|ðĒ|ð|ð|ðž|ð|ð°|Îŧ|âģ|å|ïū|áĐ/g, 'a')
.replace(/áī|ÃĶ|áī/g, 'ae')
.replace(/ęĩ/g, 'ao')
.replace(/ę·/g, 'au')
.replace(/ęđ/g, 'av')
.replace(/ęŧ|ęģ/g, 'aa')
.replace(/ę―/g, 'ay')
.replace(/Ę|áī|Ðē|áĩĶ|áīŪ|áĩ|É|Æ|Æ|áĩŽ|áķ|ð|ð|ð|ðĐ|ðŧ|ðĄ|â|ð|ð
|ð|ð|ðŦ|ð|ð·|ð|ðĩ|ð|ðđ|ï―|ðą|âŽ|Ï|āđ|âī|ÉŪ|áļ|áŠ|āđ|ðŊ|ð|ðĢ|ð|ð|ð―|ð|ðą|Ã|āļŋ|äđ|Ō|Îē|á·/g, 'b')
.replace(/áī|áķ|ÂĒ|Æ|Čž|ę|ðļ|É|áķ|É|â|áī|ð|ð|ð|ðķ|ð|ðŠ|ðž|ðĒ|â|ð |â|ð|ðŪ|ðŽ|ð|ð|ð|â|ï―|ðē|áĨī|ð
ē|Ï|á|ð°|ð|ðĪ|ð|ð|ðū|ð|ðē|á|âĩ|å|âŧ|á|áĒ|â/g, 'c')
.replace(/áī
|áī°|áĩ|â|ęš|É|Ä|Æ|ðđ|ČĄ|áķ|áĩ|áķ|É|ð|ð|ð
|ðŦ|ð―|ðĢ|ð|ð·|â|ðĄ|ð|ð|ðŊ|ð|ð|ð|ð|ðŧ|ï―|ðģ|áŠ|áĶ|ÕŠ|ð
ģ|āđ|Ô|ęŪ|āŧ|ðą|ð|ðĨ|ð|ð|ðŋ|ð|ðģ|ã|ę°|áē/g, 'd')
.replace(/Įģ|Į/g, 'dz')
.replace(/áī|â|áīą|áĩ|âąŧ|É|Ņ|É|Éĩ|ęŋ|Į|âąļ|áķ|áķ|É|ð|ð|ð|ðŽ|ðū|ðĪ|â|ðĒ|ð|ð|ð°|ðŪ|ð|ð|ðļ|ð|ðž|ï―
|ðī|ð
ī|Ō―|ęŪ|ðē|ð|ðĶ|ð|ð|ð|ð|ðī|äđ|Îū|áŋ|âŽ|áī/g, 'e')
.replace(/ę°|áķ |Æ|Ę|ę|ðŧ|ęž|áĩŪ|áķ|áš|áš|Ō|áĩģ|ð|ð
|ð|ð|ðŋ|ðĨ|ęŧ|â|ðĢ|ð|ð|ðą|ðŊ|ð|ð|ðđ|ð|ð―|ï―|ðĩ|âą|ð|á ŧ|âĻ|ð
ĩ|Ï|ðģ|ð|ð§|ð|ð|ð|ð|ðĩ|ÂĢ|âĢ|ęķ|å|ï―·|áī/g, 'f')
.replace(/ïŽ/g, 'fi')
.replace(/ïŽ/g, 'fl')
.replace(/ïŽ/g, 'ff')
.replace(/ïŽ/g, 'ffi')
.replace(/ïŽ/g, 'ffl')
.replace(/ÉĒ|Ę|áīģ|áĩ|É |ĮĨ|áķ|ÉĄ|áĩ·|ð°|ð |ð|ðš|ð|ðŪ|ð|ðĶ|â|ðĪ|ð|ð|ðē|ð|ð|ðĒ|ð|ðū|ï―|ðķ|Ų|á§|ð
ķ|ïŧŪ|ęŪ|āš|ðī|ð|ðĻ|ð|ð|ð|ð|ðķ|âē|ã |Ö|á/g, 'g')
.replace(/Ę|Ð―|â|áīī|ʰ|âąĻ|ħ|ÉĶ|ÉĨ|ĘŪ|ĘŊ|ðĄ|ð|ð|ðŊ|ð|ð§|â|ðĨ|â|ð|ðģ|ðą|ð|ð―|ðŧ|ð|â|ï―|ð·|â|ð
·|Ņ|Ô|ęŪ|ɧ|ðĩ|ð|ðĐ|ð|ð|ð|ð|ð·|å|ã|Õ°|áž/g, 'h')
.replace(/Æ/g, 'hu')
.replace(/ÉŠ|áĩĒ|áīĩ|áķĶ|âą|Îđ|Äą|ÉĻ|áķ|áī|ðĒ|ð|ð|ð°|ð|ðĻ|ðĪ|â|ðĶ|â|ð|ðī|ðē|ð|ðū|ð|ðž|ð|ð|ï―|ðļ|ę ļ|ð
ļ|āđ|ęĩ|ðķ|ð|ðŠ|ð|ð|ð|ð|ðļ|äļĻ|ïū|á°|áķĪ/g, 'i')
.replace(/Äģ/g, 'ij')
.replace(/áī|âąž|áīķ|Ęē|Ę|É|Č·|É|ðĢ|ð|ð|ð―|ð|ðą|ð§|ð|ðĐ|ðĨ|â|ð§|ð|ð|ðĩ|ðģ|ð|ðŋ|ðĨ|ð|ð|ï―|ðđ|â|Ņ|ę đ|Õĩ|ð
đ|āļ§|ð·|ð|ðŦ|ð|ð|ð
|ð|ðđ|Ũ |ïū|Ų|á|á|Úķ|á/g, 'j')
.replace(/áī|К|â|áī·|áĩ|⹊|ę|Æ|ę|ę
|áķ|Ę|ðĪ|ð|ð|ðū|ð|ðē|ð|ðŠ|â|ðĻ|ð|ð|ðķ|ðī|ð|ð|ðĶ|ð|ð|ï―|ðš|Ō|á|ð
š|Ó|ęŪķ|ðļ|ð|ðŽ|ð|ð |ð|ð|ðš|â|ãđ|Ō|á―áļ|ŌĄ/g, 'k')
.replace(/Ę|áī|â|áīļ|ËĄ|â|⹥|ę|Å|Å|ę|ÉŽ|Čī|áķ
|É|Åŋ|Éŋ|Ũ|ðĨ|ð|ð|ðģ|ð
|ðŦ|â
|â|ðĐ|ð|ð|ð·|ðĩ|ð|ð|ð|ðŋ|ð|ð|ï―|ðŧ|â|áĨĢ|ð
ŧ|Ę
|ęŪ|ÉĐ|ðđ|ð|ð|ð|ðĄ|ð|ð|ðŧ|á|ãĨ|ïū|Ó|áŠ/g, 'l')
.replace(/Į/g, 'lj')
.replace(/áī|О|â|áīđ|áĩ|Éą|áĩŊ|áķ|ɰ|ðĶ|ð|ð|ðī|ð|ðŽ|â|ðŠ|ð|ð|ðļ|ðķ|ð|ð|ð|ð|ð|ð|ï―|ðž|âģ|á°|ð
ž|āđ|ęŪ|ðš|ð |ðŪ|ð|ðĒ|ð|ð|ðž|âĨ|įŠ|ïūķ|áŧ|Ïŧ/g, 'm')
.replace(/Éī|â|áīš|âŋ|áī|Ðļ|ę|Éē|Æ|Čĩ|áĩ°|áķ|Éģ|áī|ð§|ð|ð|ð|ð|ðĩ|ð|ð|â|ðŦ|ð|ð|ðđ|ð·|ð|ð|ðĐ|ð|â|ï―|ð―|ð
―|āļ |Õž|ęŪ|Å|āš|ðŧ|ðĄ|ðŊ|ð|ðĢ|ð|ð|ð―|Ðŋ|η|âĶ|å |å|Õē|á |á|á|áķ°/g, 'n')
.replace(/Į/g, 'nj')
.replace(/áī|â|áīž|áĩ|Ï|ę|ę|Ãļ|ČĢ|âąš|áī|áī|ðĻ|ð|ð|ð|ðķ|ð|ðŪ|â|ðŽ|ð|ð|ðš|ðļ|ð|ð|ðŠ|ð |ð|ï―|ęī|ðū|â|āđ|Ö
|ę·|āŧ|ðž|ðĒ|ð°|ð|ðĪ|ð|ð|ðū|ÓĐ|ð|ã|ãŪ|âĒ|āķ§|á|âĄ|ðĩ|á|Îŋ|â|âŊ/g, 'o')
.replace(/ę/g, 'oo')
.replace(/Éķ|Å|áī/g, 'oe')
.replace(/ÆĢ/g, 'on')
.replace(/áī|â|áīū|áĩ|Ï|ę|ÆĨ|ę|áĩ―|ę|áĩą|áķ|ðĐ|ð|ð|ð|ęž|ð|ð·|ð|ðŊ|â|ð|ð|ð|ðŧ|ðđ|ð|ð
|ðŦ|ðĄ|â|ï―|ðŋ|â|Ũ§|Ö|ęū|ð―|ðĢ|ðą|ð|ðĨ|ð|ð|ðŋ|Ãū|âą|åĐ|ï―ą|Ï|á|áĩ/g, 'p')
.replace(/ĮŦ|áĩĐ|áĩ |ę|ę|Ę |É|ðŠ|ð|ð|ðļ|ð|ð°|â |ðŪ|ð|ð|ðž|ðš|ð |ð|ðŽ|ðĒ|â|ï―|ð
|ð|ð|ð|Ï|ÕĶ|āđ|ðū|ðĪ|ðē|ð|ðĶ|ð|ð|ð|ã|ŌĐ|áŦ|áī/g, 'q')
.replace(/Ę|áĩĢ|áīŋ|Ęģ|áī|áī|Ę|áī|Ņ|ę
|É|É―|Éū|Éž|áĩē|áķ|Éđ|Éŧ|Éš|âąđ|ęŪ§|ðŦ|ð|ð|ðđ|ð|ðą|âĄ|ðŊ|â|ð|ð―|ðŧ|ðĄ|ð|ð|ð
|ðĢ|â|ï―|ð
|â|ę|ð|Ðģ|ęŪĒ|ā―|ðŋ|ðĨ|ðģ|ð|ð§|ð|ð|ð|å°š|ā― |á/g, 'r')
.replace(/ęą|â|ËĒ|Ę|áĩī|áķ|Čŋ|ęļ|ðŽ|ð|ð |ð|ð|ðš|ð|ðē|âĒ|ę
|ð°|ð|ð|ðū|ðž|ðĒ|ð|ðŪ|ðĪ|ð|ï―|ð
|áĶ|ęŊą|ð|āļĢ|Ö|ęŪĨ|ð|ðĶ|ðī|ð|ðĻ|ð|ð|ð|ÆĻ|Ņ|§|âī|äļ|Ï|á/g, 's')
.replace(/ïŽ/g, 'st')
.replace(/áī|Ņ|â|áĩ|áĩ|Æ|ÉŦ|âąĶ|Æ|Ę|ŧ|Čķ|áĩĩ|ÆŦ|Ę|ð|ð|ð|ðŧ|ð|ðģ|âĢ|ðą|ð|ð|ðŋ|ð―|ðĢ|ð|ðŊ|ðĨ|ð|ï―|ð
|ð|ð|ęŪĶ|ð|ð§|ðĩ|ðĄ|ð|ðĐ|ð|ð|ð|â |âŪ|ã|ï―ē|Õ§|Íģ|áķ|äļ
/g, 't')
.replace(/áĩš/g, 'th')
.replace(/ęĐ/g, 'tz')
.replace(/áī|áĩĪ|áĩ|áĩ|Ę|áķ|ðŪ|ð|ðĒ|ð|ð|ðž|ð|ðī|âĪ|ðē|ð|ð|ð|ðū|ðĪ|ð|ð°|ðĶ|ð|ï―|ð
|Ï
|ð|āļĒ|Ę|ęŪž|ęŪ°|āļ|ð|ðĻ|ðķ|ð|ðŠ|ð|ð|ð|Ņ|Âĩ|Ξ|Ę|ãĐ|ãē|Õī|á|á/g, 'u')
.replace(/áĩŦ/g, 'ue')
.replace(/áī |áĩĨ|âą―|áĩ|Î―|Ę|ę|âąī|áķ|âąą|ðŊ|ð|ðĢ|ð|ð|ð―|ð|ðĩ|âĨ|ðģ|ð|ð|ð|ðŋ|ðĨ|ð|ðą|ð§|ð|ï―|ð
|ð
|ŨĐ|ęŪ|Û·|āļ|ð|ðĐ|ð·|ð|ðŦ|ð|ð|ð
|áŊ|â|Ņĩ|áŧ―|áš/g, 'v')
.replace(/ęĄ/g, 'vy')
.replace(/áīĄ|áĩ|Ę·|Ï|âąģ|Ę|ÉŊ|ð°|ð|ðĪ|ð|ð|ðū|ð|ðķ|âĶ|ðī|ð|ð|ð|ð|ðĶ|ð|ðē|ðĻ|ð|ï―|ð
|āļŽ|ð|ÕĄ|ęŪ|āš|ð|ðŠ|ðļ|ð|ðŽ|ð|ð |ð|Ņ|âĐ|åąą|ā°|áŊ|áš/g, 'w')
.replace(/Ï|â|ËĢ|áķ|ðą|ð|ðĨ|ð|ð|ðŋ|ð|ð·|â§|ðĩ|ð|ð|ð|ð|ð§|ð|ðģ|ðĐ|ð|ï―|ð
|ð|Ũ|Ó―|Ōģ|ð
|ðŦ|ðđ|ð|ð|ð|ðĄ|ð|Ã|Óŋ|äđ|ïū|áŊ|á|Ðķ|á/g, 'x')
.replace(/Ę|áĩ§|Ęļ|Ņ|Æī|áŧŋ|É|Ę|ðē|ð|ðĶ|ð|ð|ð|ð|ðļ|âĻ|ðķ|ð|ð|ð|ð|ðĻ|ð|ðī|ðŠ|ð|ï―|ð
|âī|ŌŊ|ð|ŨĨ|á§|ęđ|āļŊ|ð|ðŽ|ðš|ð |ðŪ|ð|ðĒ|ð|Ņ|ÂĨ|ã|ïū|Õū|Ó|áŧ/g, 'y')
.replace(/áīĒ|áķŧ|ęŦ|áĩđ|⹎|ČĨ|Æķ|Ę|áĩķ|áķ|Ę|É|ðģ|ð|ð§|ð|ð|ð|ð|ðđ|âĐ|ð·|âĻ|ð|ð
|ð|ðĐ|ð|ðĩ|ðŦ|âĪ|ï―|ð
|Æš|ð|Õđ|á|āš|ð|ð|ðŧ|ðĄ|ðŊ|ð|ðĢ|ð|äđ|á|áą|Õ·/g, 'z');
}
exports.textToLatin = textToLatin;
/**
 * Caps every run of one repeated character at a maximum length.
 *
 * With a limit of 2, `"heeellllooooo"` becomes `"heelloo"`; with a limit of 3 it
 * becomes `"heeelllooo"`; with a limit of 1 it becomes `"helo"`.
 *
 * Only characters matched by `\w` (ASCII letters, digits and the underscore) are
 * affected, and a run consists of the exact same character, so `"aA"` is not a run.
 *
 * For English, values lower than 2 are not recommended. When this preprocessing
 * is used, bad words and allowed terms must not contain longer runs than the
 * limit either: with a limit of 2, a list entry like "helllo" can never match.
 * @param {string} inputText - The text from which to remove repeat characters.
 * @param {number} [maxAllowedCharsInSequence=2] - The maximum number of identical
 * characters in sequence (such as "aaa", "bbb", ...) that may remain in the text.
 * `Recommended: 2 or 3`, depending on the language of your input text.
 * @returns {string} The input text with every overlong run shortened to the limit.
 * @throws {Error} If `maxAllowedCharsInSequence` is not an integer (such as 1.5)
 * or if it is 0 or less.
 */
function reduceRepeatCharacters(inputText, maxAllowedCharsInSequence) {
    const runLimit = maxAllowedCharsInSequence === undefined ? 2 : maxAllowedCharsInSequence;
    const isPositiveInteger = Number.isInteger(runLimit) && runLimit >= 1;
    if (!isPositiveInteger) {
        throw new Error('reduceRepeatCharacters - maxAllowedCharsInSequence needs to be an integer that is larger than 0');
    }
    // `(\w)` captures one word character and `\1{n,}` requires at least n further
    // copies of it, so only runs longer than the limit match. For a limit of 2
    // this is the same as `text.replace(/(\w)\1{2,}/g, '$1$1')`.
    const overlongRun = new RegExp(`(\\w)\\1{${runLimit},}`, 'g');
    // Replacement pattern: the captured character written out `runLimit` times.
    const cappedRun = '$1'.repeat(runLimit);
    return inputText.replace(overlongRun, cappedRun);
}
exports.reduceRepeatCharacters = reduceRepeatCharacters;
//# sourceMappingURL=input_preprocessor.js.map