/**
 * allprofanity
 * A TypeScript package to filter Hindi and Hinglish bad words from text.
 */
// Language dictionaries imports
import englishBadWords from "./languages/english-words.js";
import hindiBadWords from "./languages/hindi-words.js";
import frenchBadWords from "./languages/french-words.js";
import germanBadWords from "./languages/german-words.js";
import spanishBadWords from "./languages/spanish-words.js";
import bengaliBadWords from "./languages/bengali-words.js";
import tamilBadWords from "./languages/tamil-words.js";
import teluguBadWords from "./languages/telugu-words.js";
// Export language dictionaries for direct access
export { default as englishBadWords } from "./languages/english-words.js";
export { default as hindiBadWords } from "./languages/hindi-words.js";
export { default as frenchBadWords } from "./languages/french-words.js";
export { default as germanBadWords } from "./languages/german-words.js";
export { default as spanishBadWords } from "./languages/spanish-words.js";
export { default as bengaliBadWords } from "./languages/bengali-words.js";
export { default as tamilBadWords } from "./languages/tamil-words.js";
export { default as teluguBadWords } from "./languages/telugu-words.js";
/**
 * Default console logger implementation.
 * Routes info/warn/error messages to the matching console method,
 * each prefixed with the library tag.
 */
class ConsoleLogger {
    /** Format a message with the library prefix. */
    format(message) {
        return `[AllProfanity] ${message}`;
    }
    /** Log an informational message. */
    info(message) {
        console.log(this.format(message));
    }
    /** Log a warning message. */
    warn(message) {
        console.warn(this.format(message));
    }
    /** Log an error message. */
    error(message) {
        console.error(this.format(message));
    }
}
/**
 * Severity levels for profanity detection.
 * Mirrors a TypeScript numeric enum: each name maps to its number and
 * each number maps back to its name (reverse mapping).
 */
export var ProfanitySeverity;
(function (ProfanitySeverity) {
    const levels = [
        ["MILD", 1],
        ["MODERATE", 2],
        ["SEVERE", 3],
        ["EXTREME", 4],
    ];
    for (const [name, value] of levels) {
        ProfanitySeverity[name] = value;
        ProfanitySeverity[value] = name; // reverse mapping, as TS enums do
    }
})(ProfanitySeverity = ProfanitySeverity || (ProfanitySeverity = {}));
/**
 * Validate a string parameter.
 * @param input - The input to validate.
 * @param paramName - The name of the parameter.
 * @returns The validated string.
 * @throws {TypeError} If input is not a string.
 */
function validateString(input, paramName) {
    if (typeof input === "string") {
        return input;
    }
    throw new TypeError(`${paramName} must be a string, got ${typeof input}`);
}
/**
 * Validate a string array parameter.
 * Non-string items are skipped with a console warning; strings that are
 * empty or whitespace-only are dropped silently.
 * @param input - The input to validate.
 * @param paramName - The name of the parameter.
 * @returns The validated string array (non-empty strings only).
 * @throws {TypeError} If input is not an array.
 */
function validateStringArray(input, paramName) {
    if (!Array.isArray(input)) {
        throw new TypeError(`${paramName} must be an array`);
    }
    const valid = [];
    for (const item of input) {
        if (typeof item !== "string") {
            console.warn(`Skipping non-string item in ${paramName}: ${item}`);
            continue;
        }
        if (item.trim().length > 0) {
            valid.push(item);
        }
    }
    return valid;
}
/**
 * Trie node for efficient multi-word string matching.
 * Each node holds a map of child characters; terminal nodes record the
 * complete dictionary word so matches can be reported directly.
 */
class TrieNode {
    constructor() {
        // Child nodes keyed by a single character.
        this.children = new Map();
        // True if a dictionary word terminates at this node.
        this.isEndOfWord = false;
        // The complete word stored at this terminal node ("" otherwise).
        this.word = "";
    }
    /**
     * Add a word to the trie.
     * @param word - The word to add.
     */
    addWord(word) {
        let current = this;
        for (const char of word) {
            let next = current.children.get(char);
            if (!next) {
                next = new TrieNode();
                current.children.set(char, next);
            }
            current = next;
        }
        current.isEndOfWord = true;
        current.word = word;
    }
    /**
     * Remove a word from the trie, pruning branches that become unused.
     * @param word - The word to remove.
     * @returns True if the word was present and removed, false otherwise.
     *
     * FIX: previously the return value was the recursive "should delete
     * this subtree" flag, so removing a word whose terminal node had
     * children (e.g. removing "bad" while "badge" is stored) returned
     * false even though the word WAS removed. Removal success is now
     * tracked separately from subtree pruning.
     */
    removeWord(word) {
        const state = { removed: false };
        this.removeHelper(word, 0, state);
        return state.removed;
    }
    /**
     * Recursive removal helper.
     * @param word - The word being removed.
     * @param index - Index of the character handled at this depth.
     * @param state - Shared record of whether the word was actually found.
     * @returns True if the caller should delete its child node for
     *          word[index] (i.e. the subtree is now empty).
     */
    removeHelper(word, index, state = { removed: false }) {
        if (index === word.length) {
            if (!this.isEndOfWord)
                return false;
            this.isEndOfWord = false;
            this.word = ""; // clear stale payload on the unmarked node
            state.removed = true;
            return this.children.size === 0;
        }
        const char = word[index];
        const node = this.children.get(char);
        if (!node)
            return false;
        if (node.removeHelper(word, index + 1, state)) {
            this.children.delete(char);
            // This node is prunable only if nothing else hangs off it and
            // it does not terminate another word itself.
            return this.children.size === 0 && !this.isEndOfWord;
        }
        return false;
    }
    /**
     * Find all dictionary words that begin at a given position in the text.
     * Offsets in the result are relative to startPos; the caller re-bases
     * them onto the full text.
     * @param text - The text to search.
     * @param startPos - The start position.
     * @param allowPartial - Kept for interface compatibility; the offsets
     *        are identical either way (the original's two branches computed
     *        the same values). Whole-word filtering is done by the caller.
     * @returns Array of matches ({ word, start, end }).
     */
    findMatches(text, startPos, allowPartial) {
        const matches = [];
        let current = this;
        let pos = startPos;
        while (pos < text.length) {
            const nextNode = current.children.get(text[pos]);
            if (!nextNode)
                break;
            current = nextNode;
            pos++;
            if (current.isEndOfWord) {
                matches.push({
                    word: current.word,
                    start: 0,
                    end: pos - startPos,
                });
            }
        }
        return matches;
    }
    /**
     * Clear all words from the trie.
     */
    clear() {
        this.children.clear();
        this.isEndOfWord = false;
        this.word = "";
    }
}
/**
 * Main class for profanity detection and filtering.
 * Builds a trie from language dictionaries plus user-added words, and
 * supports leet-speak normalization, whitelisting, strict word-boundary
 * matching and partial-word matching.
 */
export class AllProfanity {
    /**
     * Create an AllProfanity instance.
     * @param options - Profanity filter configuration options.
     */
    constructor(options) {
        const opts = options || {};
        this.profanityTrie = new TrieNode();
        this.whitelistSet = new Set();
        this.loadedLanguages = new Set();
        this.defaultPlaceholder = "*";
        this.enableLeetSpeak = true;
        this.caseSensitive = false;
        this.strictMode = false;
        this.detectPartialWords = false;
        this.availableLanguages = {
            english: englishBadWords || [],
            hindi: hindiBadWords || [],
            french: frenchBadWords || [],
            german: germanBadWords || [],
            spanish: spanishBadWords || [],
            bengali: bengaliBadWords || [],
            tamil: tamilBadWords || [],
            telugu: teluguBadWords || [],
        };
        // Leet-speak sequences mapped to the letters they stand for.
        // FIX: a former ["7", "l"] entry was dead code — Map keeps only the
        // last value for a duplicate key, so "7" has always resolved to "t"
        // (see the entry below); the shadowed entry has been removed.
        // NOTE(review): plain-letter keys like "z"->"s", "v"->"u", "j"->"y"
        // are aggressive substitutions and can cause false positives on
        // ordinary words — confirm they are intentional.
        this.leetMappings = new Map([
            ["@", "a"],
            ["^", "a"],
            ["4", "a"],
            ["8", "b"],
            ["6", "b"],
            ["|3", "b"],
            ["(", "c"],
            ["<", "c"],
            ["©", "c"],
            ["|)", "d"],
            ["0", "o"],
            ["3", "e"],
            ["€", "e"],
            ["|=", "f"],
            ["ph", "f"],
            ["9", "g"],
            ["#", "h"],
            ["|-|", "h"],
            ["1", "i"],
            ["!", "i"],
            ["|", "i"],
            ["_|", "j"],
            ["¿", "j"],
            ["|<", "k"],
            ["1<", "k"],
            ["|\\/|", "m"],
            ["/\\/\\", "m"],
            ["|\\|", "n"],
            ["//", "n"],
            ["()", "o"],
            ["|*", "p"],
            ["|o", "p"],
            ["(_,)", "q"],
            ["()_", "q"],
            ["|2", "r"],
            ["12", "r"],
            ["5", "s"],
            ["$", "s"],
            ["z", "s"],
            ["7", "t"],
            ["+", "t"],
            ["†", "t"],
            ["|_|", "u"],
            ["(_)", "u"],
            ["v", "u"],
            ["\\/", "v"],
            ["|/", "v"],
            ["\\/\\/", "w"],
            ["vv", "w"],
            ["><", "x"],
            ["}{", "x"],
            ["`/", "y"],
            ["j", "y"],
            ["2", "z"],
            ["7_", "z"],
        ]);
        this.dynamicWords = new Set();
        this.logger = opts.logger || new ConsoleLogger();
        if (opts.defaultPlaceholder !== undefined) {
            this.setPlaceholder(opts.defaultPlaceholder);
        }
        // `!= null` intentionally matches both null and undefined, mirroring
        // the nullish-coalescing defaults of the original TypeScript source.
        this.enableLeetSpeak = opts.enableLeetSpeak != null ? opts.enableLeetSpeak : true;
        this.caseSensitive = opts.caseSensitive != null ? opts.caseSensitive : false;
        this.strictMode = opts.strictMode != null ? opts.strictMode : false;
        this.detectPartialWords = opts.detectPartialWords != null ? opts.detectPartialWords : false;
        if (opts.whitelistWords) {
            this.addToWhitelist(opts.whitelistWords);
        }
        // English and Hindi are always loaded by default.
        this.loadLanguage("english");
        this.loadLanguage("hindi");
        if (opts.languages && opts.languages.length) {
            opts.languages.forEach((lang) => this.loadLanguage(lang));
        }
        if (opts.customDictionaries) {
            Object.entries(opts.customDictionaries).forEach(([name, words]) => {
                this.loadCustomDictionary(name, words);
            });
        }
    }
    /**
     * Normalize leet speak to regular characters.
     * Always lowercases the input first (even in case-sensitive mode, since
     * leet substitutions are defined in lowercase).
     * Longer leet sequences are replaced before shorter ones so e.g. "|3"
     * becomes "b" before "|" could become "i".
     * @param text - The input text.
     * @returns Normalized text.
     */
    normalizeLeetSpeak(text) {
        if (!this.enableLeetSpeak)
            return text;
        let normalized = text.toLowerCase();
        const sortedMappings = Array.from(this.leetMappings.entries()).sort(([leetA], [leetB]) => leetB.length - leetA.length);
        for (const [leet, normal] of sortedMappings) {
            const regex = new RegExp(this.escapeRegex(leet), "g");
            normalized = normalized.replace(regex, normal);
        }
        return normalized;
    }
    /**
     * Escape regex special characters in a string.
     * @param str - The string to escape.
     * @returns The escaped string.
     */
    escapeRegex(str) {
        return str.replace(/[\\^$.*+?()[\]{}|]/g, "\\$&");
    }
    /**
     * Check if a match is bounded by word boundaries (strict mode).
     * In non-strict mode this always passes; in strict mode the characters
     * adjacent to the match must be whitespace, punctuation or symbols.
     * @param text - The text.
     * @param start - Start index.
     * @param end - End index.
     * @returns True if match is at word boundaries, false otherwise.
     */
    hasWordBoundaries(text, start, end) {
        if (!this.strictMode)
            return true;
        const beforeChar = start > 0 ? text[start - 1] : " ";
        const afterChar = end < text.length ? text[end] : " ";
        const wordBoundaryRegex = /[\s\p{P}\p{S}]/u;
        return (wordBoundaryRegex.test(beforeChar) && wordBoundaryRegex.test(afterChar));
    }
    /**
     * Determine if a match is a whole word (not flanked by \w characters).
     * @param text - The text.
     * @param start - Start index.
     * @param end - End index.
     * @returns True if whole word, false otherwise.
     */
    isWholeWord(text, start, end) {
        if (start !== 0 && /\w/.test(text[start - 1]))
            return false;
        if (end !== text.length && /\w/.test(text[end]))
            return false;
        return true;
    }
    /**
     * Check if a match is whitelisted (either the dictionary word or the
     * exact matched substring).
     * @param word - Word from dictionary.
     * @param matchedText - Actual matched text.
     * @returns True if whitelisted, false otherwise.
     */
    isWhitelistedMatch(word, matchedText) {
        if (this.caseSensitive) {
            return this.whitelistSet.has(word) || this.whitelistSet.has(matchedText);
        }
        else {
            return (this.whitelistSet.has(word.toLowerCase()) ||
                this.whitelistSet.has(matchedText.toLowerCase()));
        }
    }
    /**
     * Remove overlapping matches, keeping only the longest at each start position.
     * Matches are sorted by start (ascending), then by length (descending),
     * and any match starting inside a previously kept match is dropped.
     * @param matches - Array of match results.
     * @returns Deduplicated matches.
     */
    deduplicateMatches(matches) {
        const sorted = [...matches].sort((a, b) => {
            if (a.start !== b.start)
                return a.start - b.start;
            return b.end - a.end;
        });
        const result = [];
        let lastEnd = -1;
        for (const match of sorted) {
            if (match.start >= lastEnd) {
                result.push(match);
                lastEnd = match.end;
            }
        }
        return result;
    }
    /**
     * Detect profanity in a given text.
     * @param text - The text to check.
     * @returns Profanity detection result with detected words, positions,
     *          severity and a character-masked cleaned text.
     *          Note: severity is MILD even when nothing was detected;
     *          check hasProfanity first.
     */
    detect(text) {
        const validatedText = validateString(text, "text");
        if (validatedText.length === 0) {
            return {
                hasProfanity: false,
                detectedWords: [],
                cleanedText: validatedText,
                severity: ProfanitySeverity.MILD,
                positions: [],
            };
        }
        const matches = [];
        const normalizedText = this.caseSensitive
            ? validatedText
            : validatedText.toLowerCase();
        this.findMatches(normalizedText, validatedText, matches);
        if (this.enableLeetSpeak) {
            // NOTE(review): multi-character leet mappings (e.g. "ph" -> "f",
            // "|3" -> "b") change the string length, so positions found in
            // the leet-normalized text may not line up with the original
            // text. Fixing this requires an offset map between the two
            // strings — TODO; single-character substitutions are unaffected.
            const leetNormalized = this.normalizeLeetSpeak(normalizedText);
            if (leetNormalized !== normalizedText) {
                this.findMatches(leetNormalized, validatedText, matches);
            }
        }
        const uniqueMatches = this.deduplicateMatches(matches);
        const detectedWords = uniqueMatches.map((m) => m.originalWord);
        const severity = this.calculateSeverity(uniqueMatches);
        const cleanedText = this.generateCleanedText(validatedText, uniqueMatches);
        return {
            hasProfanity: uniqueMatches.length > 0,
            detectedWords,
            cleanedText,
            severity,
            positions: uniqueMatches.map((m) => ({
                word: m.originalWord,
                start: m.start,
                end: m.end,
            })),
        };
    }
    /**
     * Main matching function, with whole-word logic.
     * Walks every start position in searchText, collects trie matches, then
     * applies whole-word, whitelist and strict-boundary filters against the
     * original text before recording a match.
     * @param searchText - The normalized text to search.
     * @param originalText - The original text.
     * @param matches - Array to collect matches (mutated in place).
     */
    findMatches(searchText, originalText, matches) {
        for (let i = 0; i < searchText.length; i++) {
            const matchResults = this.profanityTrie.findMatches(searchText, i, this.detectPartialWords);
            for (const match of matchResults) {
                // Trie offsets are relative to position i; re-base them.
                const start = i + match.start;
                const end = i + match.end;
                if (!this.detectPartialWords &&
                    !this.isWholeWord(originalText, start, end)) {
                    continue;
                }
                const matchedText = originalText.substring(start, end);
                if (this.isWhitelistedMatch(match.word, matchedText)) {
                    continue;
                }
                if (this.hasWordBoundaries(originalText, start, end)) {
                    matches.push({
                        word: match.word,
                        start,
                        end,
                        originalWord: matchedText,
                    });
                }
            }
        }
    }
    /**
     * Generate cleaned text by replacing each profane character with the
     * default placeholder character.
     * Replacements run right-to-left so earlier positions stay valid.
     * @param originalText - The original text.
     * @param matches - Array of matches.
     * @returns Cleaned text.
     */
    generateCleanedText(originalText, matches) {
        if (matches.length === 0)
            return originalText;
        let result = originalText;
        const sortedMatches = [...this.deduplicateMatches(matches)].sort((a, b) => b.start - a.start);
        for (const match of sortedMatches) {
            const replacement = this.defaultPlaceholder.repeat(match.originalWord.length);
            result =
                result.substring(0, match.start) +
                    replacement +
                    result.substring(match.end);
        }
        return result;
    }
    /**
     * Check if a string contains profanity.
     * @param text - The text to check.
     * @returns True if profanity is found, false otherwise.
     */
    check(text) {
        return this.detect(text).hasProfanity;
    }
    /**
     * Clean text with a custom placeholder (character-level: the placeholder
     * is repeated once per character of the matched word).
     * @param text - The text to clean.
     * @param placeholder - The placeholder to use.
     * @returns Cleaned text.
     */
    clean(text, placeholder) {
        const detection = this.detect(text);
        // Fast path: detect() already masked with the default placeholder.
        if (!placeholder || placeholder === this.defaultPlaceholder) {
            return detection.cleanedText;
        }
        let result = text;
        const sortedPositions = [
            ...this.deduplicateMatches(detection.positions.map((p) => ({
                word: p.word,
                start: p.start,
                end: p.end,
                originalWord: text.substring(p.start, p.end),
            }))),
        ].sort((a, b) => b.start - a.start);
        for (const pos of sortedPositions) {
            const originalWord = text.substring(pos.start, pos.end);
            const replacement = placeholder.repeat(originalWord.length);
            result =
                result.substring(0, pos.start) +
                    replacement +
                    result.substring(pos.end);
        }
        return result;
    }
    /**
     * Clean text by replacing each profane word with a single placeholder
     * (word-level replacement, so text length may change).
     * @param text - The text to clean.
     * @param placeholder - The placeholder to use.
     * @returns Word-level cleaned text.
     */
    cleanWithPlaceholder(text, placeholder = "***") {
        const detection = this.detect(text);
        if (detection.positions.length === 0)
            return text;
        let result = text;
        const sortedPositions = [
            ...this.deduplicateMatches(detection.positions.map((p) => ({
                word: p.word,
                start: p.start,
                end: p.end,
                originalWord: text.substring(p.start, p.end),
            }))),
        ].sort((a, b) => b.start - a.start);
        for (const pos of sortedPositions) {
            // Re-checked against the mutated result to skip positions that
            // no longer sit on word boundaries after earlier replacements.
            if (!this.isWholeWord(result, pos.start, pos.end))
                continue;
            result =
                result.substring(0, pos.start) +
                    placeholder +
                    result.substring(pos.end);
        }
        return result;
    }
    /**
     * Add word(s) to the profanity filter.
     * Added words survive rebuildTrie() via the dynamicWords set.
     * @param word - Word or array of words to add.
     */
    add(word) {
        const words = Array.isArray(word) ? word : [word];
        const validatedWords = validateStringArray(words, "words to add");
        for (const w of validatedWords) {
            this.dynamicWords.add(w);
            this.addWordToTrie(w);
        }
    }
    /**
     * Remove word(s) from the profanity filter.
     * Note: words belonging to a loaded language dictionary will reappear
     * after rebuildTrie(); only dynamic words are removed permanently.
     * @param word - Word or array of words to remove.
     */
    remove(word) {
        const words = Array.isArray(word) ? word : [word];
        const validatedWords = validateStringArray(words, "words to remove");
        for (const w of validatedWords) {
            const normalizedWord = this.caseSensitive ? w : w.toLowerCase();
            this.profanityTrie.removeWord(normalizedWord);
            this.dynamicWords.delete(w);
        }
    }
    /**
     * Add words to the whitelist.
     * Whitelisted words are skipped at match time, so no trie rebuild is
     * required for already-loaded dictionaries.
     * @param words - Words to whitelist.
     */
    addToWhitelist(words) {
        const validatedWords = validateStringArray(words, "whitelist words");
        for (const word of validatedWords) {
            const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
            this.whitelistSet.add(normalizedWord);
        }
    }
    /**
     * Remove words from the whitelist.
     * @param words - Words to remove from whitelist.
     */
    removeFromWhitelist(words) {
        const validatedWords = validateStringArray(words, "whitelist words");
        for (const word of validatedWords) {
            const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
            this.whitelistSet.delete(normalizedWord);
        }
    }
    /**
     * Check if a word is whitelisted.
     * @param word - The word to check.
     * @returns True if whitelisted, false otherwise.
     */
    isWhitelisted(word) {
        const normalizedWord = this.caseSensitive ? word : word.toLowerCase();
        return this.whitelistSet.has(normalizedWord);
    }
    /**
     * Load a built-in language dictionary.
     * Loading is idempotent: an already-loaded language returns true
     * without re-adding its words.
     * @param language - The language key.
     * @returns True if loaded, false otherwise.
     */
    loadLanguage(language) {
        if (!language || typeof language !== "string") {
            this.logger.warn(`Invalid language parameter: ${language}`);
            return false;
        }
        const langKey = language.toLowerCase().trim();
        if (this.loadedLanguages.has(langKey)) {
            return true;
        }
        const words = this.availableLanguages[langKey];
        if (!words || words.length === 0) {
            this.logger.warn(`Language '${language}' not found or empty`);
            return false;
        }
        try {
            let addedCount = 0;
            for (const word of words) {
                if (this.addWordToTrie(word)) {
                    addedCount++;
                }
            }
            this.loadedLanguages.add(langKey);
            this.logger.info(`Loaded ${addedCount} words from ${language} dictionary`);
            return true;
        }
        catch (error) {
            this.logger.error(`Failed to load language ${language}: ${error}`);
            return false;
        }
    }
    /**
     * Load multiple language dictionaries.
     * @param languages - Array of languages to load.
     * @returns Number of successfully loaded languages.
     */
    loadLanguages(languages) {
        const validatedLanguages = validateStringArray(languages, "languages");
        return validatedLanguages.reduce((count, lang) => {
            return this.loadLanguage(lang) ? count + 1 : count;
        }, 0);
    }
    /**
     * Load all supported Indian languages.
     * @returns Number of loaded Indian languages.
     */
    loadIndianLanguages() {
        const indianLanguages = ["hindi", "bengali", "tamil", "telugu"];
        return this.loadLanguages(indianLanguages);
    }
    /**
     * Load a custom dictionary. The dictionary is registered (lowercased)
     * under availableLanguages so rebuildTrie() can restore it.
     * @param name - Name of the dictionary.
     * @param words - Words to add.
     */
    loadCustomDictionary(name, words) {
        validateString(name, "dictionary name");
        const validatedWords = validateStringArray(words, "custom dictionary words");
        if (validatedWords.length === 0) {
            this.logger.warn(`Custom dictionary '${name}' contains no valid words`);
            return;
        }
        try {
            let addedCount = 0;
            for (const word of validatedWords) {
                if (this.addWordToTrie(word)) {
                    addedCount++;
                }
            }
            this.availableLanguages[name.toLowerCase()] = validatedWords;
            this.loadedLanguages.add(name.toLowerCase());
            this.logger.info(`Loaded ${addedCount} words from custom dictionary '${name}'`);
        }
        catch (error) {
            this.logger.error(`Failed to load custom dictionary ${name}: ${error}`);
        }
    }
    /**
     * Add a single word to the trie (trimmed, case-normalized).
     * Whitelisted words are never added.
     * @param word - The word to add.
     * @returns True if added, false otherwise.
     */
    addWordToTrie(word) {
        if (!word || typeof word !== "string" || word.trim().length === 0) {
            return false;
        }
        const normalizedWord = this.caseSensitive
            ? word.trim()
            : word.trim().toLowerCase();
        if (this.isWhitelisted(normalizedWord)) {
            return false;
        }
        this.profanityTrie.addWord(normalizedWord);
        return true;
    }
    /**
     * Calculate severity from matches, based on total match count and the
     * number of distinct dictionary words hit.
     * @param matches - Array of matches.
     * @returns Severity level (MILD when there are no matches).
     */
    calculateSeverity(matches) {
        if (matches.length === 0)
            return ProfanitySeverity.MILD;
        const uniqueWords = new Set(matches.map((m) => m.word)).size;
        const totalMatches = matches.length;
        if (totalMatches >= 5 || uniqueWords >= 4)
            return ProfanitySeverity.EXTREME;
        if (totalMatches >= 3 || uniqueWords >= 3)
            return ProfanitySeverity.SEVERE;
        if (totalMatches >= 2 || uniqueWords >= 2)
            return ProfanitySeverity.MODERATE;
        return ProfanitySeverity.MILD;
    }
    /**
     * Clear all loaded dictionaries and dynamic words.
     */
    clearList() {
        this.profanityTrie.clear();
        this.loadedLanguages.clear();
        this.dynamicWords.clear();
    }
    /**
     * Set the placeholder character for filtered words.
     * Only the first character of the argument is used.
     * @param placeholder - The placeholder character.
     * @throws {Error} If the placeholder is empty.
     * @throws {TypeError} If the placeholder is not a string.
     */
    setPlaceholder(placeholder) {
        validateString(placeholder, "placeholder");
        if (placeholder.length === 0) {
            throw new Error("Placeholder cannot be empty");
        }
        this.defaultPlaceholder = placeholder.charAt(0);
    }
    /**
     * Get the list of loaded languages.
     * @returns Array of loaded language keys.
     */
    getLoadedLanguages() {
        return Array.from(this.loadedLanguages);
    }
    /**
     * Get the list of available built-in languages.
     * @returns Array of available language keys.
     */
    getAvailableLanguages() {
        return Object.keys(this.availableLanguages);
    }
    /**
     * Get the current configuration of the profanity filter.
     * @returns Partial configuration object.
     */
    getConfig() {
        return {
            defaultPlaceholder: this.defaultPlaceholder,
            enableLeetSpeak: this.enableLeetSpeak,
            caseSensitive: this.caseSensitive,
            strictMode: this.strictMode,
            detectPartialWords: this.detectPartialWords,
            languages: this.getLoadedLanguages(),
            whitelistWords: Array.from(this.whitelistSet),
        };
    }
    /**
     * Rebuild the profanity trie from loaded dictionaries and dynamic words.
     */
    rebuildTrie() {
        this.profanityTrie.clear();
        for (const lang of this.loadedLanguages) {
            const words = this.availableLanguages[lang] || [];
            for (const word of words) {
                this.addWordToTrie(word);
            }
        }
        for (const word of this.dynamicWords) {
            this.addWordToTrie(word);
        }
    }
    /**
     * Update configuration options for the profanity filter.
     * Changing case sensitivity triggers a trie rebuild because all stored
     * words must be re-normalized.
     * @param options - Partial configuration object.
     */
    updateConfig(options) {
        let rebuildNeeded = false;
        if (options.defaultPlaceholder !== undefined) {
            this.setPlaceholder(options.defaultPlaceholder);
        }
        if (options.enableLeetSpeak !== undefined) {
            this.enableLeetSpeak = options.enableLeetSpeak;
        }
        if (options.caseSensitive !== undefined &&
            options.caseSensitive !== this.caseSensitive) {
            this.caseSensitive = options.caseSensitive;
            rebuildNeeded = true;
        }
        if (options.strictMode !== undefined) {
            this.strictMode = options.strictMode;
        }
        if (options.detectPartialWords !== undefined) {
            this.detectPartialWords = options.detectPartialWords;
        }
        if (options.whitelistWords) {
            this.addToWhitelist(options.whitelistWords);
        }
        if (rebuildNeeded) {
            this.rebuildTrie();
        }
    }
}
/**
 * Singleton instance of AllProfanity with default configuration.
 */
const defaultFilter = new AllProfanity();
export default defaultFilter;
//# sourceMappingURL=index.js.map