deep-profanity-filter
Version:
A thorough profanity filter that considers most common circumventions. Works with your custom list of blocked and whitelisted words and phrases. Identifies and/or replaces bad words. Works with *wildcards* at *start and/or end* of words.
176 lines • 9.08 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.getCircumventionWhitelistRegExp = exports.getCircumventionRegExp = exports.getNormalRegExp = exports.getRegExpComponents = exports.escapeStringForRegex = void 0;
// Pattern fragments, kept as plain strings so they can be concatenated into `new RegExp(...)` sources.
// One or more characters that are not ASCII letters or digits; the underscore counts as a non-word character here.
var oneOrMoreNonWordCharacters = '[\\W_]+';
// Exactly one ASCII letter or digit, i.e. a `\w` character that is not an underscore.
var aSingleWordCharacter = '[^\\W_]';
// One or more characters that are neither ASCII letters/digits nor one of the apostrophes ' ‘ ’.
var nonWordCharactersExceptApostrophe = "[^a-zA-Z0-9'‘’]+";
// A word boundary assertion.
var wordBoundary = '\\b';
// Regular expressions that tidy up pattern *source text* while it is being assembled.
// They match the literal text `[\W_]+` itself, not the characters that fragment would match.
var nonWordCharsAtEndsOfString = /^\[\\W\_\]\+|\[\\W\_\]\+$/g; // a literal `[\W_]+` fragment at the very start or the very end of a string
var nonWordCharactersWithSpaceBetween = /\[\\W\_\]\+\s+\[\\W\_\]\+/g; // two literal `[\W_]+` fragments with only whitespace between them
var nonWordCharsAfterEscapeSlash = /\\\[\\W\_\]\+{1}/g; // a backslash directly followed by a literal `[\W_]+` fragment, i.e. a fragment interjected between an escape backslash and the character it escapes
/**
 * Chooses the regular-expression fragment that guards one edge of a search term,
 * based on the character found at that edge:
 *
 * - a wildcard `*` yields an empty fragment, so anything may touch the term there;
 * - a non-word character (anything matching `[\W_]`) yields a lookaround that
 *   requires whitespace or the start/end of the string next to the term;
 * - every other character (a letter or digit) yields a word boundary.
 *
 * @param {string} inputCharacter - A string with a single character.
 * @param {boolean} atStart - Truthy when the character is the first one of the term,
 *   falsy when it is the last one.
 * @returns {string} A string used to build a regular expression.
 */
var getRegExpComponentCharacter = function (inputCharacter, atStart) {
    var isWildcard = inputCharacter === '*';
    if (isWildcard) {
        // nothing is required next to a wildcard
        return '';
    }
    var hasNoNonWordCharacter = inputCharacter.search(/[\W_]/) === -1;
    if (hasNoNonWordCharacter) {
        // a word boundary is enough next to a word character
        return wordBoundary;
    }
    if (atStart) {
        // non-word character at the start: positive lookbehind for whitespace or start of string
        return '(?<=^|\\s)';
    }
    // non-word character at the end: positive lookahead for whitespace or end of string
    return '(?=\\s|$)';
};
/**
 * Makes a string safe to embed in `new RegExp(...)` by putting a backslash in
 * front of every character that has a special meaning in a regular expression,
 * so that the resulting pattern matches those characters literally.
 *
 * The characters that get escaped are: `. * + ? ^ $ { } ( ) | [ ] \`
 *
 * Note carried over from the original documentation: if you wish to match a
 * single backslash `\` literally, make sure that in your badwordlist or
 * whitelist, as well as in the string you are testing against, the backslash
 * is escaped by replacing it with `\\`.
 *
 * @param {string} inputString - The string you wish to escape for creating a regular expression.
 * @returns {string} The escaped string that can be used in `new RegExp(...)`.
 */
var escapeStringForRegex = function (inputString) {
    var regexMetaCharacters = /[.*+?^${}()|[\]\\]/g;
    return inputString.replace(regexMetaCharacters, function (metaCharacter) {
        return '\\' + metaCharacter;
    });
};
exports.escapeStringForRegex = escapeStringForRegex;
/**
 * Breaks a bad-word entry into the three pieces its regular expressions are
 * assembled from.
 *
 * - `start` / `end`: the edge fragment chosen by `getRegExpComponentCharacter`
 *   for the first / last character of the entry (an empty string when that
 *   character is a wildcard `*`).
 * - `word`: the entry with one leading and/or one trailing `*` removed, escaped
 *   for use inside a regular expression.
 *
 * An empty entry yields three empty strings.
 *
 * @param {string} badword - The bad word to split into its components.
 * @returns {{start: string, word: string, end: string}} An object with the
 *   components accessible as obj.start, obj.word and obj.end.
 */
var getRegExpComponents = function (badword) {
    var components = { start: '', word: '', end: '' };
    if (badword === '') {
        return components;
    }
    // drop a wildcard at the front and/or back before escaping the remainder
    var coreFrom = 0;
    if (badword.startsWith('*')) {
        coreFrom = 1;
    }
    var coreTo;
    if (badword.endsWith('*')) {
        coreTo = -1;
    }
    var firstCharacter = badword[0];
    var lastCharacter = badword[badword.length - 1];
    components.start = getRegExpComponentCharacter(firstCharacter, true);
    components.word = (0, exports.escapeStringForRegex)(badword.slice(coreFrom, coreTo));
    components.end = getRegExpComponentCharacter(lastCharacter, false);
    return components;
};
exports.getRegExpComponents = getRegExpComponents;
/**
 * Builds the plain (non-circumvention) regular expression for a bad word: the
 * word itself, framed by its start and end components.
 *
 * The word "kitty" (word boundary on both sides) results in:
 *   /\bkitty\b/g
 * The word "hell*" (wildcard at the end, so no end component) results in:
 *   /\bhell/g
 * Every run of whitespace inside a phrase is replaced by a fragment matching
 * one or more non-word characters, so the phrase "ban ananas" turns into:
 *   /\bban[\W_]+ananas\b/g
 *
 * @param {WordRegexComponents} badwordComponents - The bad word, split into components by getRegExpComponents(...)
 * @returns {RegExp} The global regular expression that can be used to find this word in a string.
 */
var getNormalRegExp = function (badwordComponents) {
    var leadingEdge = badwordComponents.start;
    // any amount of whitespace in a phrase may be replaced by any non-word characters
    var phraseBody = badwordComponents.word.replace(/\s+/g, oneOrMoreNonWordCharacters);
    var trailingEdge = badwordComponents.end;
    return new RegExp(leadingEdge + phraseBody + trailingEdge, 'g');
};
exports.getNormalRegExp = getNormalRegExp;
// TODO: can we not just replace all whitespace with nothing first, then all nothing with non-word characters, then remove at start and end of string?
/**
 * Builds the "circumvention" regular expression for a bad word: it matches the
 * word when non-word characters have been interjected between all of its
 * characters, framed by the word's start and end components.
 *
 * The word "kitty" results in: /\bk[\W_]+i[\W_]+t[\W_]+t[\W_]+y\b/g
 * which finds variations such as "k i t t y", "k-i-t-t-y" or "k.i,t;t~y"
 * (with a word boundary at each side).
 *
 * The word "hell*" results in: /\bh[\W_]+e[\W_]+l[\W_]+l/g
 *
 * Phrases with whitespace: the whitespace ends up between two separator
 * fragments; the three pieces are collapsed into a single separator, since one
 * separator already allows one or more characters. So "ban ananas" turns into:
 *   /\bb[\W_]+a[\W_]+n[\W_]+a[\W_]+n[\W_]+a[\W_]+n[\W_]+a[\W_]+s\b/g
 *
 * @param {WordRegexComponents} badwordComponents - The bad word, split into components by getRegExpComponents(...)
 * @returns {RegExp} The global regular expression that can be used to find this word in a string.
 */
var getCircumventionRegExp = function (badwordComponents) {
    var leadingEdge = badwordComponents.start;
    // put a separator fragment between every pair of neighbouring characters
    var singleCharacters = badwordComponents.word.split('');
    var interleaved = singleCharacters.join(oneOrMoreNonWordCharacters);
    // drop a separator fragment sitting at either end of the pattern text
    var withoutOuterSeparators = interleaved.replace(nonWordCharsAtEndsOfString, '');
    // a separator that landed between an escape backslash and the escaped character
    // would break the escape, so put the backslash right back in front of its character
    var withEscapesRestored = withoutOuterSeparators.replace(nonWordCharsAfterEscapeSlash, '\\');
    // separator + whitespace + separator becomes just one separator
    var spacedOutBody = withEscapesRestored.replace(nonWordCharactersWithSpaceBetween, oneOrMoreNonWordCharacters);
    return new RegExp(leadingEdge + spacedOutBody + badwordComponents.end, 'g');
};
exports.getCircumventionRegExp = getCircumventionRegExp;
/**
 * Builds a whitelisting variant of the circumvention regular expression. It
 * matches the spaced-out bad word together with one extra single word character
 * directly in front of it or directly after it (separated by non-word
 * characters), so that such an extra character can be treated as "breaking the
 * pattern":
 *   "h e l l"    can still get blocked, but
 *   "s h e l l"  will make sure that the input doesn't trigger on the phrase "hell".
 *
 * When the side being extended has an empty component (which is what a wildcard
 * on that side produces), the whitelist expression is irrelevant and `undefined`
 * is returned. The given components object is not modified.
 *
 * With `matchApostrophes` set:
 * - at the word start, a negative lookbehind rejects positions where the extra
 *   character is preceded by a word character followed by an apostrophe (' ‘ ’);
 * - at the word end, the gap between the word and the extra character must
 *   consist of characters that are neither ASCII letters/digits nor apostrophes.
 * For the reasoning behind this, see the description of `preprocessWordLists(...)`.
 *
 * @param {WordRegexComponents} badwordComponents - The bad word, split into components by getRegExpComponents(...)
 * @param {boolean} atWordStart - Whether this regex whitelists the word with an additional letter at the start
 * or whether it covers the case of an additional letter at the end.
 * @param {boolean} matchApostrophes - Whether the regular expression treats apostrophes before and after the word differently.
 * @returns {RegExp|undefined} The regular expression that can be used to find this word in a string, or undefined
 * if the regular expression is irrelevant and should not be used.
 */
var getCircumventionWhitelistRegExp = function (badwordComponents, atWordStart, matchApostrophes) {
    var edgeToExtend = atWordStart ? badwordComponents.start : badwordComponents.end;
    if (!edgeToExtend) {
        // nothing guards this side of the word, so there is nothing to whitelist
        return undefined;
    }
    var whitelistComponents = {
        start: badwordComponents.start,
        end: badwordComponents.end,
        word: badwordComponents.word,
    };
    if (atWordStart) {
        // a word boundary, followed by a singular character, then an amount of non-word characters
        var apostropheGuard = '';
        if (matchApostrophes) {
            apostropheGuard = '(?<!' + aSingleWordCharacter + "['‘’])";
        }
        whitelistComponents.start = apostropheGuard + wordBoundary + aSingleWordCharacter + oneOrMoreNonWordCharacters;
    }
    else {
        // an amount of non-word characters, followed by a singular character, then a word boundary
        var gapBeforeExtraCharacter = oneOrMoreNonWordCharacters;
        if (matchApostrophes) {
            gapBeforeExtraCharacter = nonWordCharactersExceptApostrophe;
        }
        whitelistComponents.end = gapBeforeExtraCharacter + aSingleWordCharacter + wordBoundary;
    }
    return (0, exports.getCircumventionRegExp)(whitelistComponents);
};
exports.getCircumventionWhitelistRegExp = getCircumventionWhitelistRegExp;
//# sourceMappingURL=regex_handler.js.map