deep-profanity-filter
Version:
A thorough profanity filter that considers most common circumventions. Works with your custom list of blocked and whitelisted words and phrases. Identifies and/or replaces bad words. Works with *wildcards* at *start and/or end* of words.
202 lines • 13.4 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.reduceRepeatCharacters = exports.textToLatin = exports.removeTextAccents = exports.unEmoji = void 0;
/**
 * Replaces emojis that depict latin letters, digits or short words with the
 * plain text they show.
 *
 * Example: the sequence U+1F1EC U+2B55 U+0020 U+1F51B (regional indicator "G",
 * heavy large circle, space, "ON!" arrow) turns into "go on", and
 * U+1F17F U+1F1FA U+00AE U+FE0F U+1F4B0 U+1F1EA turns into "purse".
 *
 * Note: This does NOT replace arbitrary pictographic emojis that are merely
 * used to represent or mask letters (e.g. some round-looking emoji standing
 * in for an 'O'). Only the emojis listed below are converted.
 *
 * All patterns are written as ASCII-only `\u{...}` escapes (with the `u` flag)
 * so that the table survives any re-encoding of this source file.
 *
 * NOTE(review): this table was reconstructed from a mis-encoded copy of the
 * file. The code points marked "presumed" below could not be recovered
 * unambiguously - verify them against the upstream source.
 *
 * Suggested usage: `textToLatin(unEmoji(inputText))`
 * @param {string} inputText - The text from which you would like to remove all text based emojis.
 * @returns {string} the input text, with all letter based emojis transformed to become text.
 */
function unEmoji(inputText) {
    return inputText
        // --- emojis that spell out whole words / abbreviations ---
        .replace(/\u{1F194}/gu, 'id') // SQUARED ID
        .replace(/\u{1F19A}/gu, 'vs') // SQUARED VS
        .replace(/\u{1F524}/gu, 'abc') // INPUT SYMBOL FOR LATIN LETTERS
        .replace(/\u{1F18E}/gu, 'ab') // NEGATIVE SQUARED AB
        .replace(/\u{1F191}/gu, 'cl') // SQUARED CL
        .replace(/\u{1F198}/gu, 'sos') // SQUARED SOS
        .replace(/\u{1F6BE}/gu, 'wc') // WATER CLOSET
        .replace(/\u{1F196}/gu, 'ng') // SQUARED NG
        .replace(/\u{1F197}/gu, 'ok') // SQUARED OK
        .replace(/\u{1F199}/gu, 'up') // SQUARED UP WITH EXCLAMATION MARK
        .replace(/\u{1F192}/gu, 'cool') // SQUARED COOL
        .replace(/\u{1F195}/gu, 'new') // SQUARED NEW
        .replace(/\u{1F51F}/gu, '10') // KEYCAP TEN
        .replace(/\u{1F193}/gu, 'free') // SQUARED FREE
        .replace(/\u{1F51A}/gu, 'end') // END WITH LEFTWARDS ARROW ABOVE
        .replace(/\u{1F519}/gu, 'back') // BACK WITH LEFTWARDS ARROW ABOVE
        .replace(/\u{1F51B}/gu, 'on') // ON WITH EXCLAMATION MARK WITH LEFT RIGHT ARROW ABOVE
        .replace(/\u{1F51D}/gu, 'top') // TOP WITH UPWARDS ARROW ABOVE
        .replace(/\u{1F51C}/gu, 'soon') // SOON WITH RIGHTWARDS ARROW ABOVE
        // --- single letters: regional indicator symbols (U+1F1E6..U+1F1FF)
        //     plus a few other emojis that show one letter ---
        .replace(/\u{1F1E6}|\u{1F170}/gu, 'a') // + NEGATIVE SQUARED LATIN CAPITAL LETTER A
        .replace(/\u{1F1E7}|\u{1F171}/gu, 'b') // + NEGATIVE SQUARED LATIN CAPITAL LETTER B
        .replace(/\u{1F1E8}|\u00A9\uFE0F/gu, 'c') // + COPYRIGHT SIGN in emoji presentation (presumed)
        .replace(/\u{1F1E9}/gu, 'd')
        .replace(/\u{1F1EA}/gu, 'e')
        .replace(/\u{1F1EB}/gu, 'f')
        .replace(/\u{1F1EC}/gu, 'g')
        .replace(/\u{1F1ED}/gu, 'h')
        .replace(/\u{1F1EE}/gu, 'i')
        .replace(/\u{1F1EF}/gu, 'j')
        .replace(/\u{1F1F0}/gu, 'k')
        .replace(/\u{1F1F1}/gu, 'l')
        .replace(/\u{1F1F2}/gu, 'm')
        .replace(/\u{1F1F3}/gu, 'n')
        .replace(/\u{1F1F4}|\u{1F17E}|\u2B55/gu, 'o') // + NEGATIVE SQUARED O, HEAVY LARGE CIRCLE
        .replace(/\u{1F1F5}|\u{1F17F}/gu, 'p') // + NEGATIVE SQUARED LATIN CAPITAL LETTER P
        .replace(/\u{1F1F6}/gu, 'q')
        .replace(/\u{1F1F7}|\u00AE\uFE0F/gu, 'r') // + REGISTERED SIGN in emoji presentation
        .replace(/\u{1F1F8}|\u{1F4B0}/gu, 's') // + MONEY BAG (presumed)
        .replace(/\u{1F1F9}/gu, 't')
        .replace(/\u{1F1FA}/gu, 'u')
        .replace(/\u{1F1FB}/gu, 'v')
        .replace(/\u{1F1FC}/gu, 'w')
        .replace(/\u{1F1FD}|\u274E|\u274C/gu, 'x') // + NEGATIVE SQUARED CROSS MARK, CROSS MARK (both presumed)
        .replace(/\u{1F1FE}/gu, 'y')
        .replace(/\u{1F1FF}/gu, 'z');
}
exports.unEmoji = unEmoji;
/**
 * Strips the most common accents / combining marks from a text.
 *
 * The text is first decomposed (Unicode NFD) so that accented letters split
 * into a base letter followed by combining marks; the marks are then deleted.
 * Removed are the combining diacritical marks U+0300-U+036F, the Hebrew
 * accents and points U+0591-U+05BD, U+05BF, U+05C1, U+05C2, U+05C4, U+05C5,
 * U+05C7, and the Syriac marks U+0711 and U+0730-U+074A.
 *
 * Examples: "Zalgo" text stacked with combining marks becomes plain "Zalgo",
 * and "à-côtés" becomes "a-cotes". Base characters are never transliterated,
 * so non-latin letters stay non-latin; only the marks attached to them go.
 * The result is left in decomposed form (it is not re-composed afterwards).
 * @param {string} inputText - The text for which you wish to have all
 * accents removed.
 * @returns {string} the input text, stripped of all accents.
 */
function removeTextAccents(inputText) {
    var strippableMarks = /[\u0300-\u036f]|[\u0591-\u05bd]|\u05bf|\u05c1|\u05c2|\u05c4|\u05c5|\u05c7|\u0711|[\u0730-\u074a]/g;
    // decompose first, otherwise precomposed letters such as U+00E9 hide their accent
    var decomposed = inputText.normalize('NFD');
    return decomposed.replace(strippableMarks, '');
}
exports.removeTextAccents = removeTextAccents;
/**
* Converts a text of fancy unicode font to latin alphabet characters.
* This translation happens based on "visual appearance" of the letters,
* so if you do this to text that is written in a language of non-latin
* alphabet, you will get weird outputs.
*
* Disclaimer: This may at times mistranslate messages, and the list of
* characters that get converted is most likely not complete, although
* it is very thoroughly assembled. It will remove most common accents,
* and returns a latin string in lower case letters. Any characters that
* could not be mapped to latin characters will still appear in the string.
*
* Example:
* "áĩēâę¯ą âŽáäšÕ" would turn into "this text" or "ZĖĩÍĖĖĄaĖļĖÍĖŦĖlĖļÍĖgĖĩÍÍĖ˞oĖļĖ
ĖĖ" turns into "zalgo", or
* "áâÖÎŋˤÍđ×
Ü" turns into "cool".
*
* Suggested usage: `textToLatin(unEmoji(inputText))`
* @param {string} inputText - The text which you would like to convert to latin
* @returns the input text, with foreign or special alphabet letters translated
* to latin lower case characters
*/
function textToLatin(inputText) {
// Pipeline: strip accents -> convert letter emojis -> map upper case look-alikes ->
// lower-case everything -> map the remaining look-alike glyphs to latin letters.
// NOTE(review): many of the non-ASCII characters in the patterns below look mis-encoded
// in this copy of the file, and several regex literals are split across lines. Compare
// against the upstream UTF-8 source before relying on (or editing) this table.
// remove combining accents via removeTextAccents (defined earlier in this file)
inputText = removeTextAccents(inputText);
// letter emojis are converted here already, so callers do not have to call unEmoji first
inputText = unEmoji(inputText);
// replace upper case letters that look different in lower case
// (this runs before the toLowerCase() call below)
inputText = inputText
.replace(/Ã/g, 'd')
.replace(/ÎŖ/g, 'e')
.replace(/áē/g, 'g')
.replace(/Î/g, 'h')
.replace(/á /g, 'j')
.replace(/Î/g, 'm')
.replace(/á¤/g, 'q')
.replace(/Õ/g, 't')
.replace(/Ôą/g, 'u')
.replace(/Õ/g, 'z');
// convert to lower case and replace the rest
inputText = inputText.toLowerCase();
// each replace below maps one group of look-alike glyphs to a single latin letter,
// or to a letter group for ligature-like glyphs (e.g. 'ae', 'fi', 'ffl', 'th')
return inputText
.replace(/á´|â|á´Ŧ|áĩ|Îą|âąĨ|É|áļ|áē|đ|đ|â|đ|đ|đ|đŦ|đĒ|đ|đļ|đ|đ|đ¸|īŊ|đ°|É|ā¸|đŽ|đ|đĸ|đ|đ|đŧ|đ|đ°|Îģ|âŗ|å|īž|áŠ/g, 'a')
.replace(/á´|ÃĻ|á´/g, 'ae')
.replace(/ęĩ/g, 'ao')
.replace(/ęˇ/g, 'au')
.replace(/ęš/g, 'av')
.replace(/ęģ|ęŗ/g, 'aa')
.replace(/ęŊ/g, 'ay')
.replace(/Ę|á´|в|áĩĻ|á´Ž|áĩ|É|Æ|Æ|áĩŦ|áļ|đ|đ|â|đ|đ
|đ|đ|đĢ|đ|đˇ|đĩ|đ|đš|īŊ|đą|âŦ|Ī|āš|â´|ÉŽ|á¸|áĒ|āš|đ¯|đ|đŖ|đ|đ|đŊ|đ|đą|Ã|ā¸ŋ|äš|Ō|β|áˇ/g, 'b')
.replace(/á´|áļ|Âĸ|Æ|Čŧ|ę|đ¸|É|áļ|É|â|á´|đ|đ|â|đ |â|đ|đŽ|đŦ|đ|đ|đ|â|īŊ|đ˛|áĨ´|đ
˛|Ī|á|đ°|đ|đ¤|đ|đ|đž|đ|đ˛|á|âĩ|å|âģ|á|áĸ|â/g, 'c')
.replace(/á´
|á´°|áĩ|â|ęē|É|Ä|Æ|đš|ČĄ|áļ|áĩ|áļ|É|đ|đ|â|đĄ|đ|đ|đ¯|đ|đ|đ|đ|đģ|īŊ|đŗ|áĒ|áĻ|ÕĒ|đ
ŗ|āš|Ô|ęŽ|āģ|đą|đ|đĨ|đ|đ|đŋ|đ|đŗ|ã|ę°|á˛/g, 'd')
.replace(/Įŗ|Į/g, 'dz')
.replace(/á´|â|á´ą|áĩ|âąģ|É|Ņ|É|Éĩ|ęŋ|Į|⹸|áļ|áļ|É|đ|đ|â|đĸ|đ|đ|đ°|đŽ|đ|đ|đ¸|đ|đŧ|īŊ
|đ´|đ
´|ŌŊ|ęŽ|đ˛|đ|đĻ|đ|đ|đ|đ|đ´|äš|Ξ|áŋ|âŦ|đ|á´/g, 'e')
.replace(/ę°|áļ |Æ|Ę|ę|đģ|ęŧ|áĩŽ|áļ|áē|áē|Ō|áĩŗ|đ|đ
|ęģ|â|đŖ|đ|đ|đą|đ¯|đ|đš|đ|đŊ|īŊ|đĩ|âą|đ|á ģ|â¨|đ
ĩ|Ī|đŗ|đ|đ§|đ|đ|đ|đ|đĩ|ÂŖ|âŖ|ęļ|å|īŊˇ|á´/g, 'f')
.replace(/īŦ/g, 'fi')
.replace(/īŦ/g, 'fl')
.replace(/īŦ/g, 'ff')
.replace(/īŦ/g, 'ffi')
.replace(/īŦ/g, 'ffl')
.replace(/Éĸ|Ę|á´ŗ|áĩ|É |ĮĨ|áļ|ÉĄ|áĩˇ|đ°|đ |đ|â|đ¤|đ|đ|đ˛|đ|đ|đĸ|đ|đž|īŊ|đļ|Ų|á§|đ
ļ|īģŽ|ęŽ|āē|đ´|đ|đ¨|đ|đ|đ|đ|đļ|â˛|ã |Ö|á/g, 'g')
.replace(/Ę|ĐŊ|â|á´´|ʰ|⹨|ħ|ÉĻ|ÉĨ|ĘŽ|Ɲ|đĄ|đ|â|đĨ|â|đ|đŗ|đą|đ|đŊ|đģ|đ|â|īŊ|đˇ|â|đ
ˇ|Ņ|Ô|ęŽ|ɧ|đĩ|đ|đŠ|đ|đ|đ|đ|đˇ|å|ã|Õ°|áŧ/g, 'h')
.replace(/Æ/g, 'hu')
.replace(/ÉĒ|áĩĸ|á´ĩ|áļĻ|âą|Κ|Äą|ɨ|áļ|á´|đĸ|đ|â|đĻ|â|đ|đ´|đ˛|đ|đž|đŧ|đ|đ|īŊ|đ¸|ę ¸|đ
¸|āš|ęĩ|đļ|đ|đĒ|đ|đ|đ|đ|đ¸|丨|īž|á°|áļ¤/g, 'i')
.replace(/Äŗ/g, 'ij')
.replace(/á´|âąŧ|á´ļ|ʲ|Ę|É|ȡ|É|đŖ|đ|â|đ§|đ|đ|đĩ|đŗ|đ|đŋ|đĨ|đ|đ|īŊ|đš|â|Ņ|ę š|Õĩ|đ
š|⏧|đˇ|đ|đĢ|đ|đ|đ
|đ|đš|× |īž|Ų|á|á|Úļ|á/g, 'j')
.replace(/á´|Đē|â|á´ˇ|áĩ|âąĒ|ę|Æ|ę|ę
|áļ|Ę|đ¤|đ|â|đ¨|đ|đ|đļ|đ´|đ|đ|đĻ|đ|đ|īŊ|đē|Ō|á|đ
ē|Ķ|ęŽļ|đ¸|đ|đŦ|đ|đ |đ|đ|đē|â|ãš|Ō|áŊá¸|ŌĄ/g, 'k')
.replace(/Ę|á´|â|á´¸|ËĄ|â|⹥|ę|Å|Å|ę|ÉŦ|Č´|áļ
|É|Åŋ|Éŋ|×|đĨ|đ|â
|â|đŠ|đ|đ|đˇ|đĩ|đ|đ|đŋ|đ|đ|īŊ|đģ|â|áĨŖ|đ
ģ|Ę
|ęŽ|ÉŠ|đš|đ|đ|đ|đĄ|đ|đ|đģ|á|ãĨ|īž|Ķ|áĒ/g, 'l')
.replace(/Į/g, 'lj')
.replace(/á´|Đŧ|â|á´š|áĩ|Éą|áĩ¯|áļ|ɰ|đĻ|đ|â|đĒ|đ|đ|đ¸|đļ|đ|đ|đ|đ|đ|īŊ|đŧ|âŗ|á°|đ
ŧ|āš|ęŽ|đē|đ |đŽ|đ|đĸ|đ|đ|đŧ|âĨ|įĒ|īžļ|áģ|Īģ/g, 'm')
.replace(/É´|â|á´ē|âŋ|á´|и|ę|ɲ|Æ|Čĩ|áĩ°|áļ|Éŗ|á´|đ§|đ|â|đĢ|đ|đ|đš|đˇ|đ|đ|đŠ|đ|â|īŊ|đŊ|đ
Ŋ|⏠|Õŧ|ęŽ|Å|āē|đģ|đĄ|đ¯|đ|đŖ|đ|đ|đŊ|Đŋ|Ρ|âĻ|å |å|Õ˛|á |á|á|áļ°/g, 'n')
.replace(/Į/g, 'nj')
.replace(/á´|â|á´ŧ|áĩ|Ī|ę|ę|ø|ČŖ|âąē|á´|á´|đ¨|đ|â|đŦ|đ|đ|đē|đ¸|đ|đ|đĒ|đ |đ|īŊ|ę´|đž|â|āš|Ö
|ęˇ|āģ|đŧ|đĸ|đ°|đ|đ¤|đ|đ|đž|ĶŠ|ð|ã|ãŽ|âĸ|āļ§|á|âĄ|đĩ|á|Îŋ|â|â¯/g, 'o')
.replace(/ę/g, 'oo')
.replace(/Éļ|Å|á´/g, 'oe')
.replace(/ÆŖ/g, 'on')
.replace(/á´|â|á´ž|áĩ|Ī|ę|ÆĨ|ę|áĩŊ|ę|áĩą|áļ|đŠ|đ|ęŧ|â|đ|đ|đ|đģ|đš|đ|đ
|đĢ|đĄ|â|īŊ|đŋ|â|×§|Ö|ęž|đŊ|đŖ|đą|đ|đĨ|đ|đ|đŋ|Þ|âą|åŠ|īŊą|Ī|á|áĩ/g, 'p')
.replace(/ĮĢ|áĩŠ|áĩ |ę|ę|Ę |É|đĒ|đ|â |đŽ|đ|đ|đŧ|đē|đ |đ|đŦ|đĸ|â|īŊ|đ
|đ|đ|Ī|ÕĻ|āš|đž|đ¤|đ˛|đ|đĻ|đ|đ|đ|ã|ŌŠ|áĢ|á´/g, 'q')
.replace(/Ę|áĩŖ|á´ŋ|Ęŗ|á´|á´|Ę|á´|Ņ|ę
|É|ÉŊ|Éž|Éŧ|áĩ˛|áļ|Éš|Éģ|Éē|âąš|ꎧ|đĢ|đ|âĄ|đ¯|â|đ|đŊ|đģ|đĄ|đ|đ
|đŖ|â|īŊ|đ
|â|ę|đ|Đŗ|ęŽĸ|āŊ|đŋ|đĨ|đŗ|đ|đ§|đ|đ|đ|å°ē|āŊ |á/g, 'r')
.replace(/ęą|â|Ëĸ|Ę|áĩ´|áļ|Čŋ|ę¸|đŦ|đ|âĸ|ę
|đ°|đ|đ|đž|đŧ|đĸ|đ|đŽ|đ¤|đ|īŊ|đ
|áĻ|ę¯ą|đ|⏪|Ö|ęŽĨ|đ|đĻ|đ´|đ|đ¨|đ|đ|đ|ƨ|Ņ|§|â´|ä¸|Ī|á/g, 's')
.replace(/īŦ/g, 'st')
.replace(/á´|Ņ|â|áĩ|áĩ|Æ|ÉĢ|âąĻ|Æ|Ę|ŧ|Čļ|áĩĩ|ÆĢ|Ę|đ|đ|âŖ|đą|đ|đ|đŋ|đŊ|đŖ|đ|đ¯|đĨ|đ|īŊ|đ
|đ|đ|ęŽĻ|đ|đ§|đĩ|đ|đŠ|đ|đ|đ|â |âŽ|ã|īŊ˛|Õ§|Íŗ|áļ|ä¸
/g, 't')
.replace(/áĩē/g, 'th')
.replace(/ęŠ/g, 'tz')
.replace(/á´|áĩ¤|áĩ|áĩ|Ę|áļ|đŽ|đ|â¤|đ˛|đ|đ|đ|đž|đ¤|đ|đ°|đĻ|đ|īŊ|đ
|Ī
|đ|ā¸ĸ|Ę|ęŽŧ|ꎰ|ā¸|đ|đ¨|đļ|đ|đĒ|đ|đ|đ|Ņ|Âĩ|Îŧ|Ę|ãŠ|ã˛|Õ´|á|á/g, 'u')
.replace(/áĩĢ/g, 'ue')
.replace(/á´ |áĩĨ|âąŊ|áĩ|ÎŊ|Ę|ę|âą´|áļ|âąą|đ¯|đ|âĨ|đŗ|đ|đ|đ|đŋ|đĨ|đ|đą|đ§|đ|īŊ|đ
|đ
|׊|ęŽ|Ûˇ|ā¸|đ|đŠ|đˇ|đ|đĢ|đ|đ|đ
|á¯|â|Ņĩ|áģŊ|áē/g, 'v')
.replace(/ęĄ/g, 'vy')
.replace(/á´Ą|áĩ|ʡ|Ī|âąŗ|Ę|ɯ|đ°|đ|âĻ|đ´|đ|đ|đ|đ|đĻ|đ|đ˛|đ¨|đ|īŊ|đ
|ā¸Ŧ|đ|ÕĄ|ęŽ|āē|đ|đĒ|đ¸|đ|đŦ|đ|đ |đ|Ņ|âŠ|åąą|ā°|á¯|áē/g, 'w')
.replace(/Ī|â|ËŖ|áļ|đą|đ|â§|đĩ|đ|đ|đ|đ|đ§|đ|đŗ|đŠ|đ|īŊ|đ
|đ|×|ĶŊ|Ōŗ|đ
|đĢ|đš|đ|đ|đ|đĄ|đ|Ã|Ķŋ|äš|īž|á¯|á|Đļ|á/g, 'x')
.replace(/Ę|áĩ§|ʸ|Ņ|Æ´|áģŋ|É|Ę|đ˛|đ|â¨|đļ|đ|đ|đ|đ|đ¨|đ|đ´|đĒ|đ|īŊ|đ
|â´|Ō¯|đ|×Ĩ|á§|ęš|⏝|đ|đŦ|đē|đ |đŽ|đ|đĸ|đ|Ņ|ÂĨ|ã|īž|Õž|Ķ|áģ/g, 'y')
.replace(/á´ĸ|áļģ|ęĢ|áĩš|âąŦ|ČĨ|Æļ|Ę|áĩļ|áļ|Ę|É|đŗ|đ|âŠ|đˇ|â¨|đ|đ
|đ|đŠ|đ|đĩ|đĢ|â¤|īŊ|đ
|Æē|đ|Õš|á|āē|đ|đ|đģ|đĄ|đ¯|đ|đŖ|đ|äš|á|áą|Õˇ/g, 'z');
}
exports.textToLatin = textToLatin;
/**
 * Caps every run of one repeated character at a given maximum length.
 *
 * With a maximum of 2, `"heeellllooooo"` becomes `"heelloo"`; with 3 it becomes
 * `"heeelllooo"`; with 1 it becomes `"helo"`.
 *
 * Only runs of regexp word characters (`\w`: ASCII letters, digits and `_`) are
 * shortened, and a run must consist of the exact same character (the match is
 * case-sensitive). Other characters, such as repeated punctuation, are left alone.
 *
 * For English, values lower than 2 are not recommended. When this preprocessing is
 * used, the bad word and whitelist entries should not contain longer runs than the
 * chosen maximum either - with a maximum of 2, an entry like "helllo" can never match.
 * @param {string} inputText - The text from which to remove repeat characters.
 * @param {number} [maxAllowedCharsInSequence=2] - The longest run (such as "aaa", "bbb", ...)
 * that may remain in the text. `Recommended: 2 or 3`, depending on the language of the input.
 * @returns {string} The input text with every overlong run cut down to the maximum length.
 * @throws {Error} If `maxAllowedCharsInSequence` is not an integer (such as 1.5) or if it is 0 or less.
 */
function reduceRepeatCharacters(inputText, maxAllowedCharsInSequence) {
    var limit = maxAllowedCharsInSequence === undefined ? 2 : maxAllowedCharsInSequence;
    var isPositiveInteger = Number.isInteger(limit) && limit >= 1;
    if (!isPositiveInteger) {
        throw new Error('reduceRepeatCharacters - maxAllowedCharsInSequence needs to be an integer that is larger than 0');
    }
    // `(\w)` captures one word character and `\1{n,}` requires it to repeat at least
    // n more times, so only runs longer than the limit match; for n = 2 this is
    // equivalent to `text.replace(/(\w)\1{2,}/g, '$1$1')`
    var overlongRun = new RegExp(['(\\w)\\1{', limit, ',}'].join(''), 'g');
    // the captured character, written out exactly `limit` times
    var cappedRun = '$1'.repeat(limit);
    return inputText.replace(overlongRun, cappedRun);
}
exports.reduceRepeatCharacters = reduceRepeatCharacters;
//# sourceMappingURL=input_preprocessor.js.map