deep-profanity-filter
Version:
A thorough profanity filter that considers most common circumventions. Works with your custom list of blocked and whitelisted words and phrases. Identifies and/or replaces bad words. Works with *wildcards* at *start and/or end* of words.
237 lines (236 loc) • 16 kB
TypeScript
import { BadWordData, ProcessedWordLists, WordListOverrideData, WhitelistWordType } from './wordlist_preprocessor';
/**
* Selects how an input string is preprocessed before it is searched for bad words.
* Used as `inputPreprocessMethod` in `WordCensorOptions`.
*/
export declare enum InputPreprocessMethod {
/**
* Uses `textToLatin()`: removes accents and translates foreign characters, letter emojis
* and other fancy unicode fonts to latin letters before testing for bad words.
*/
Thorough = 0,
/** Converts the input to lower case before searching for bad words. */
CaseInsensitive = 1,
/** Matches the input string against the bad word list exactly. */
ExactMatch = 2
}
/**
* Selects how much of a bad word gets replaced: the whole word, or everything
* except its first (and last) character.
*/
export declare enum WordReplacementMethod {
/** Replace the whole bad word. */
ReplaceAll = 0,
/** Keep the first character of the bad word intact and replace the rest. */
KeepFirstCharacter = 1,
/** Keep the first and the last character of the bad word intact and replace the rest. */
KeepFirstAndLastCharacter = 2
}
/**
* Selects what a bad word is replaced with.
*/
export declare enum WordReplacementType {
/**
* Replace the word with a single repeated character
* (see `replacementRepeatCharacter` in the replacement options).
*/
RepeatCharacter = 0,
/** Replace the word with a jumbled mess of Grawlix (`$!#@&%`) characters. */
Grawlix = 1
}
/**
* Options used in `findBadWordLocations(...)` to determine how to search for bad words.
*/
export interface WordSearchOptions {
/**
* (Default: `false`) If true, returns only the first match.
* If false, returns all matched bad words.
*/
firstMatchOnly?: boolean;
/**
* (Default: `undefined`) Data used to modify a list by removing words or whitelisted
* terms or by adding new whitelisted terms. Created with `preprocessWordListOverrideData`
*/
overrideData?: WordListOverrideData;
}
/**
* Options used in `replaceBadWords(...)` to determine how to replace bad words in a given input string.
*/
export interface WordReplacementOptions {
/**
* (Default: `WordReplacementMethod.ReplaceAll`) Used to select whether to replace the
* whole word, or keep the first (and last) characters from the bad word intact.
*/
replacementMethod?: WordReplacementMethod;
/**
* (Default: `WordReplacementType.Grawlix`) Used to select whether to replace the
* word with a jumbled mess of Grawlix (`$!#@&%`) characters, or with a selected repeatable character
* defined in `replacementRepeatCharacter`.
*/
replacementType?: WordReplacementType;
/**
* (Default: `-`) The character to repeat in order to replace the bad word.
* (If several characters are entered, only the first one will be used.)
*/
replacementRepeatCharacter?: string;
}
/**
* Options used in `censorText(...)` to determine how to filter and replace bad words in a given input string.
*/
export interface WordCensorOptions {
/**
* (Default: `InputPreprocessMethod.CaseInsensitive`) Used to preprocess the input
* string before identifying bad words.
* `CaseInsensitive` transforms the input to lower case and then matches it against the bad word list.
* `Thorough` uses the `textToLatin()` function to remove text accents, translate letter emojis and
* any other fancy unicode fonts to latin before testing for bad words.
* `ExactMatch` matches the input string against the bad word list exactly.
*/
inputPreprocessMethod?: InputPreprocessMethod;
/**
* (Default: `undefined`, meaning repeat characters are not modified.)
* Otherwise required to be a number >= 1. `Will throw an error if this number is <= 0.`
* The amount of characters a repeating sequence of characters (e.g. "aaaabcc") is reduced to in _*every*_ input string.
* (e.g. "abc" if the number is 1, "aabcc" if the number is 2, "aaabcc" if the number is 3, etc.)
* This reduction will be applied after any other input preprocess method. (Uses function `reduceRepeatCharacters(...)`)
*
* `Important:` keep in mind that reducing to 1
* repeat character will likely result in mismatches/false positives ("loot" -> "lot"). Depending on the language of the
* input, this number should be around 2 or 3.
*
* `Note:` when setting this number to 1 or larger, keep in mind that all words in your filter need to adhere to this,
* so if you set the number to 2, putting "princessship" on the bad word list won't take effect, as any such input would
* get reduced to "princesship".
*/
reduceRepeatCharactersTo?: number;
/**
* (Default: `WordReplacementMethod.ReplaceAll`) Used to select whether to replace the
* whole word, or keep the first (and last) characters from the bad word intact.
*/
replacementMethod?: WordReplacementMethod;
/**
* (Default: `WordReplacementType.Grawlix`) Used to select whether to replace the
* word with a jumbled mess of Grawlix (`$!#@&%`) characters, or with a selected repeatable character
* defined in `replacementRepeatCharacter`.
*/
replacementType?: WordReplacementType;
/**
* (Default: `-`) The character to repeat in order to replace the bad word.
* (If several characters are entered, only the first one will be used.)
*/
replacementRepeatCharacter?: string;
}
/**
* Information on a bad word that has been found, containing the word itself, the index
* at which it starts in the input string, as well as its length.
* Produced by `findBadWordLocations(...)` and consumed by `getBadWords(...)` and `replaceBadWords(...)`.
*/
export type BadWordMatchData = {
/** The bad word that was found. */
word: string;
/** The index at which the bad word starts in the input string. */
startIndex: number;
/** The length of the bad word that was found. */
length: number;
};
/**
* Determines how a whitelist term relates to a bad word:
* whether the good word matches the bad word in its normal form,
* or represents a variant that reduces to the bad word (by removing special characters),
* or represents a circumvention that spaces out the bad word.
*
* @param goodword - the whitelist term
* @param badwordData - the bad word's regular expressions, created with `getBadWordData(...)`
* @returns `WhitelistWordType.None` if the whitelist term does not match this bad word,
* `WhitelistWordType.Normal` if the term matches the bad word in its normal form,
* `WhitelistWordType.Reduced` if the term without special characters (reduced string) matches the word,
* `WhitelistWordType.Circumvention` if the whitelisted word matches the bad word's circumvention regexp,
* `WhitelistWordType.ReducedAndCircumvention` if it matches both the reduced string and the circumvention.
*/
export declare const getWhitelistType: (goodword: string, badwordData: BadWordData) => WhitelistWordType;
/**
* Given your preprocessed bad word list and whitelist, checks if a given text contains any bad word
* that hasn't been allowed by the whitelist. Checks for the most common circumventions as well.
*
* IMPORTANT: Make sure that any backslash in the inputString is escaped correctly.
* If you are trying to see whether the string `¯\_(ツ)_/¯` is a bad word, you have to
* enter it as `¯\\_(ツ)_/¯` to match it correctly.
*
* @param inputString - The text you wish to check for bad words.
* @param processedWordLists - The preprocessed bad word list and whitelist, generated by `preprocessWordLists(...)`
* @param overrideData - (Default: `undefined`) Data used to modify a list by removing words or whitelisted
* terms or by adding new whitelisted terms. Created with `preprocessWordListOverrideData`
* @returns `true` if any bad word was found, `false` if no bad word was found or all bad words were whitelisted.
*/
export declare const doesContainBadWords: (inputString: string, processedWordLists: ProcessedWordLists, overrideData?: WordListOverrideData) => boolean;
/**
* Finds all bad words contained in a string, as well as their locations, indicated by start index and length.
*
* IMPORTANT: Make sure that any backslash in the inputString is escaped correctly.
* If you are trying to see whether the string `¯\_(ツ)_/¯` is a bad word, you have to
* enter it as `¯\\_(ツ)_/¯` to match it correctly.
*
* @param inputString - The text you wish to check for bad words.
* @param processedWordLists - The preprocessed bad word list and whitelist, generated by `preprocessWordLists(...)`
* @param options - Optional `WordSearchOptions` object (third argument) with the following fields:
* - `firstMatchOnly` - (Default: `false`) If true, returns only the first match. If false, returns all matched
* bad words.
* - `overrideData` - (Default: `undefined`) Data used to modify a list by removing words or whitelisted
* terms or by adding new whitelisted terms. Created with `preprocessWordListOverrideData`
* @returns an array of information about all found bad words and where they are located in the input string.
*/
export declare const findBadWordLocations: (inputString: string, processedWordLists: ProcessedWordLists, { firstMatchOnly, overrideData }?: WordSearchOptions) => BadWordMatchData[];
/**
* Given your preprocessed bad word list and whitelist, checks if a given text contains any bad word
* that hasn't been allowed by the whitelist. Checks for the most common circumventions as well.
* If any bad word was found, the first word that was found will be returned.
*
* IMPORTANT: Make sure that any backslash in the inputString is escaped correctly.
* If you are trying to see whether the string `¯\_(ツ)_/¯` is a bad word, you have to
* enter it as `¯\\_(ツ)_/¯` to match it correctly.
*
* @param inputString - The text you wish to check for bad words.
* @param processedWordLists - The preprocessed bad word list and whitelist, generated by `preprocessWordLists(...)`
* @param overrideData - (Default: `undefined`) Data used to modify a list by removing words or whitelisted
* terms or by adding new whitelisted terms. Created with `preprocessWordListOverrideData`
* @returns The first bad word that was found in the input, or `undefined` if no bad word was found.
*/
export declare const findAnyBadWord: (inputString: string, processedWordLists: ProcessedWordLists, overrideData?: WordListOverrideData) => string | undefined;
/**
* Given the bad word locations found by `findBadWordLocations(...)`,
* extract all the bad words from that data.
* This function is useful if you need both the bad words as well as the input string
* with all bad words replaced (check out `replaceBadWords(...)` for the latter.)
*
* If the bad words are all you need, consider using `findAllBadWords(...)` instead.
* If you only need one bad word, consider using `findAnyBadWord(...)`, and if you
* only need to know whether there is a bad word, consider using `doesContainBadWords(...)`.
*
* @param badWordLocations - The locations of bad words in your input string checked
* with `findBadWordLocations(...)`
* @returns An array of strings of all bad words found in the text. Only contains each
* bad word once, even if they repeat.
*/
export declare const getBadWords: (badWordLocations: BadWordMatchData[]) => string[];
/**
* Given your preprocessed bad word list and whitelist, checks for all bad words in a given input text
* that haven't been allowed by the whitelist. Checks for the most common circumventions as well.
* Returns an array of strings of all bad words.
*
* IMPORTANT: Make sure that any backslash in the inputString is escaped correctly.
* If you are trying to see whether the string `¯\_(ツ)_/¯` is a bad word, you have to
* enter it as `¯\\_(ツ)_/¯` to match it correctly.
*
* @param inputString - The text you wish to check for bad words.
* @param processedWordLists - The preprocessed bad word list and whitelist, generated by `preprocessWordLists(...)`
* @param overrideData - (Default: `undefined`) Data used to modify a list by removing words or whitelisted
* terms or by adding new whitelisted terms. Created with `preprocessWordListOverrideData`
* @returns An array of strings of all bad words that were found in the input.
*/
export declare const findAllBadWords: (inputString: string, processedWordLists: ProcessedWordLists, overrideData?: WordListOverrideData) => string[];
/**
* Sanitise any text by replacing bad words in it with Grawlix (`$!#@&%`) or a single repeated character.
* This function is useful if you need both the bad words as well as the input string
* with all bad words replaced (check out `getBadWords(...)` for the former.)
*
* If you only need a "censored" input string but are not interested in identifying which bad words were
* censored, consider using `censorText(...)` instead.
*
* @param inputString - The text that got checked for bad words in `findBadWordLocations(...)`
* @param badWordLocations - The information on all bad word matches found with `findBadWordLocations(...)`
* @param options - Optional `WordReplacementOptions` object (third argument) with the following fields:
* - `replacementMethod` - (Default: `WordReplacementMethod.ReplaceAll`) Used to select whether to replace the
* whole word, or keep the first (and last) characters from the bad word intact.
* - `replacementType` - (Default: `WordReplacementType.Grawlix`) Used to select whether to replace the
* word with a jumbled mess of Grawlix (`$!#@&%`) characters, or with a selected repeatable character defined
* in `replacementRepeatCharacter`.
* - `replacementRepeatCharacter` - (Default: `-`) The character to repeat in order to replace the bad word.
* (If several characters are entered, only the first one will be used.)
* @returns the input string, with all bad words replaced by either Grawlix or a repeated character.
*/
export declare const replaceBadWords: (inputString: string, badWordLocations: BadWordMatchData[], { replacementMethod, replacementType, replacementRepeatCharacter, }?: WordReplacementOptions) => string;
/**
* Sanitise any text by replacing bad words in it with Grawlix (`$!#@&%`) or a single repeated character.
*
* @param inputString - The text you wish to censor.
* @param processedWordLists - The preprocessed bad word list and whitelist, generated by `preprocessWordLists(...)`
* @param options - Optional `WordCensorOptions` object (third argument) with the following fields:
* - `inputPreprocessMethod` - (Default: `InputPreprocessMethod.CaseInsensitive`) Used to preprocess the input
* string before identifying bad words. `CaseInsensitive`: transforms the input to lower case and then matches it against
* the bad word list.
* `Thorough` uses the `textToLatin()` function to remove text accents, translate letter emojis and
* any other fancy unicode fonts to latin before testing for bad words. Note: If non-latin characters are found,
* the censored text will be returned all in lower case and in latin letters.
* `ExactMatch` matches the input string against the bad word list exactly.
* - `reduceRepeatCharactersTo` - (Default: `undefined` meaning repeat characters are not modified.)
* Otherwise required to be a number >= 1. `Will throw an error if this number is <= 0.`
* The amount of characters a repeating sequence of characters (e.g. "aaaabcc") is reduced to in _*every*_ input string.
* (e.g. "abc" if the number is 1, "aabcc" if the number is 2, "aaabcc" if the number is 3, etc.)
* This reduction will be applied after any other input preprocess method. (Uses function `reduceRepeatCharacters(...)`)
*
* `Important:` keep in mind that reducing to 1
* repeat character will likely result in mismatches/false positives ("loot" -> "lot"). Depending on the language of the
* input, this number should be around 2 or 3.
*
* `Note:` when setting this number to 1 or larger, keep in mind that all words in your filter need to adhere to this,
* so if you set the number to 2, putting "princessship" on the bad word list won't take effect, as any such input would
* get reduced to "princesship".
* - `replacementMethod` - (Default: `WordReplacementMethod.ReplaceAll`) Used to select whether to replace the
* whole word, or keep the first (and last) characters from the bad word intact.
* - `replacementType` - (Default: `WordReplacementType.Grawlix`) Used to select whether to replace the
* word with a jumbled mess of Grawlix (`$!#@&%`) characters, or with a selected repeatable character defined
* in `replacementRepeatCharacter`.
* - `replacementRepeatCharacter` - (Default: `-`) The character to repeat in order to replace the bad word.
* (If several characters are entered, only the first one will be used.)
* @param overrideData - (Default: `undefined`) Fourth argument, separate from the options object.
* Data used to modify a list by removing words or whitelisted
* terms or by adding new whitelisted terms. Created with `preprocessWordListOverrideData`
* @returns the input string, with all bad words replaced by either Grawlix or a repeated character.
* @throws an error if `reduceRepeatCharactersTo` is set to a number <= 0.
*/
export declare const censorText: (inputString: string, processedWordLists: ProcessedWordLists, { inputPreprocessMethod, reduceRepeatCharactersTo, replacementMethod, replacementType, replacementRepeatCharacter, }?: WordCensorOptions, overrideData?: WordListOverrideData) => string;