/*
 * swearing-filter
 * Version: (not shown in this listing)
 * Multilingual profanity filter supporting English, Russian, Finnish, and more.
 * File listing: 200 lines (199 loc), 7.24 kB, JavaScript.
 */
;
Object.defineProperty(exports, "__esModule", { value: true });
const patterns_1 = require("./patterns");
const replaces_1 = require("./replaces");
/**
 * Options accepted by the ProfanityFilter constructor and by `setOptions`.
 *
 * @typedef {Object} FilterOptions
 * @property {string} [placeholder] Text substituted for each profane word.
 * @property {string[]} [languages] Language codes whose pattern lists are consulted
 *     (`getPatterns` recognises 'ru', 'en', 'fi', 'sv' and 'zh').
 * @property {boolean} [debug] When truthy, every hit is logged with `console.debug`.
 */
/**
 * ProfanityFilter detects and replaces abusive words in a given text.
 * Pattern lists are selected per word from the configured languages and the
 * script (Cyrillic, Latin or Han) the word is written in. The placeholder used
 * for replacement is configurable, and a debug mode logs which pattern
 * triggered a detection.
 *
 * @param {FilterOptions} [opts]
 * @param {string} [opts.placeholder] Text used to replace profane words (default '***').
 * @param {string[]} [opts.languages] Enabled language codes (default ['ru', 'en']).
 * @param {boolean} [opts.debug] In debug mode the original word and the pattern that
 *     caused the trigger are logged.
 */
class ProfanityFilter {
    /**
     * Compiled regular expressions keyed by pattern source, so a pattern is
     * compiled once instead of once per examined word. The expressions carry
     * neither the `g` nor the `y` flag, so `test`/`exec` keep no state between
     * calls and sharing them is safe.
     * @type {Map<string, RegExp>}
     */
    static #compiled = new Map();
    /**
     * Matches maximal runs of Latin, Cyrillic or Han characters. Only used with
     * `String.prototype.match`, which resets `lastIndex` on every call.
     * @type {RegExp}
     */
    static #SCRIPT_RUNS = /\p{Script=Latin}+|\p{Script=Cyrillic}+|\p{Script=Han}+/gu;
    /**
     * Text used to replace profane words.
     * @type {string}
     */
    placeholder = '***';
    /**
     * Array of enabled language codes.
     * @type {string[]}
     */
    languages = ['ru', 'en'];
    /**
     * In debug mode the original word and the pattern that caused the trigger are logged.
     * @type {boolean}
     */
    debug = false;
    /**
     * @param {FilterOptions} [opts] Initial options; missing entries fall back to the defaults.
     */
    constructor(opts = {}) {
        // `opts ?? {}` additionally tolerates an explicit `null`.
        const { placeholder, languages, debug } = opts ?? {};
        // `??` rather than `||`: an empty-string placeholder is a legitimate
        // request to remove profane words without leaving a marker.
        this.placeholder = placeholder ?? '***';
        this.languages = languages || ['ru', 'en'];
        this.debug = debug || false;
    }
    /**
     * Searches if there are any abusive words in the text.
     * @param {string} string Original text.
     * @return {boolean} Returns true if there are any abusive words in the string, otherwise false.
     */
    isBad(string) {
        return string
            .split(' ')
            .some((token) => this.getCleanedWords(token).some((word) => this.search(word) !== null));
    }
    /**
     * Replaces abusive words in the string with the placeholder.
     * The text is split on single spaces and re-joined with single spaces, so
     * all other characters of non-profane tokens are preserved.
     * @param {string} string Original text.
     * @return {string} Cleaned text with abusive words replaced.
     */
    replace(string) {
        const tokens = string.split(' ').map((token) => {
            let cleaned = token;
            for (const word of this.getCleanedWords(token)) {
                const match = this.search(word);
                if (match) {
                    //TODO: replace only exact search
                    // A replacer function keeps the placeholder literal; a
                    // replacement *string* would interpret `$&`, `$$`, `$1`, ...
                    cleaned = cleaned.replaceAll(match.input, () => this.placeholder);
                }
            }
            return cleaned;
        });
        return tokens.join(' ');
    }
    /**
     * Splits the input into the words that should be checked.
     * Every maximal run of Latin, Cyrillic or Han characters becomes one word,
     * in order of appearance; all other characters act as separators. Runs of
     * every script are returned, so a token mixing scripts (for example a
     * Cyrillic and a Latin word separated only by a line break) has all of its
     * parts checked.
     * When the input contains none of these scripts it is trimmed and split on
     * whitespace instead.
     * @param {string} string Original text.
     * @return {string[]} Array of cleaned words.
     */
    getCleanedWords(string) {
        const runs = string.match(ProfanityFilter.#SCRIPT_RUNS);
        if (runs !== null) {
            return runs;
        }
        return string.trim().split(/\s+/g);
    }
    /**
     * Fixes abusive words inside the string using the `replaceRu` table, whose
     * keys are pattern sources and whose values are the replacements.
     * If the input starts with an uppercase character, the first character of
     * the result is uppercased as well.
     * @param {string} string Original text.
     * @return {string} Fixed text, or the input unchanged when no pattern matches.
     */
    fix(string) {
        // Start from the input so text without any match is returned as is
        // rather than as an empty string.
        let result = string;
        const patternKeys = Object.keys(replaces_1.replaceRu).reverse();
        for (const key of patternKeys) {
            const pattern = this.prepare(key);
            if (!pattern.test(string)) {
                continue;
            }
            // NOTE(review): every replacement is applied to the original input,
            // so when several patterns match only the last one takes effect.
            // Confirm whether cumulative replacement is intended.
            result = string.replace(pattern, replaces_1.replaceRu[key]);
            if (this.checkFirstChar(string)) {
                result = this.upFirstChar(result);
            }
        }
        return result;
    }
    /**
     * Changes the filter options.
     * Only `placeholder`, `languages` and `debug` are read; entries that are
     * `null` or `undefined` are skipped, so a partial options object cannot
     * erase an existing setting or overwrite a method.
     * @param {FilterOptions} opts Options to configure the filter.
     */
    setOptions(opts) {
        if (opts == null) {
            return;
        }
        for (const key of ['placeholder', 'languages', 'debug']) {
            if (opts[key] != null) {
                this[key] = opts[key];
            }
        }
    }
    /**
     * Searches for an abusive word in the given word.
     * @param {string} word Original word.
     * @return {RegExpExecArray | null} Returns the match if found, otherwise null.
     */
    search(word) {
        const firstLetter = word.charAt(0).toLowerCase();
        for (const p of this.getPatterns(word)) {
            // Cheap pre-filter: a pattern anchored with `^` is only tried when
            // the character after the anchor equals the word's first letter.
            // NOTE(review): an anchored pattern that continues with a regex
            // construct (e.g. `^(` or `^[`) never passes this check - confirm
            // the pattern lists contain no such entries.
            if (p.startsWith('^') && p.charAt(1).toLowerCase() !== firstLetter) {
                continue;
            }
            const match = this.prepare(p).exec(word);
            if (match) {
                if (this.debug) {
                    console.debug(`DEBUG: ${word} ${p}`);
                }
                return match;
            }
        }
        return null;
    }
    /**
     * Prepares a regular expression pattern: compiles it with the `u` and `i`
     * flags. Compiled expressions are cached per pattern, so repeated calls
     * with the same pattern return the same RegExp object.
     * @param {string} pattern Pattern to prepare.
     * @return {RegExp} Prepared regular expression.
     */
    prepare(pattern) {
        let regexp = ProfanityFilter.#compiled.get(pattern);
        if (regexp === undefined) {
            regexp = new RegExp(pattern, 'ui');
            ProfanityFilter.#compiled.set(pattern, regexp);
        }
        return regexp;
    }
    /**
     * Gets the patterns for the given string based on the enabled languages
     * and the script found in the string. Cyrillic is checked first (when 'ru'
     * is enabled), then Latin, then Han; only the first matching branch
     * contributes patterns.
     * @param {string} string Original text.
     * @return {string[]} New array of patterns; empty when nothing applies.
     */
    getPatterns(string) {
        const patterns = [];
        // https://en.wikipedia.org/wiki/ISO_15924
        if (this.languages.includes('ru') && /\p{Script=Cyrillic}/ui.test(string)) {
            // Copied like the other branches so callers never receive (and
            // cannot mutate) the shared module-level list.
            patterns.push(...patterns_1.ru);
        }
        else if (/\p{Script=Latin}/ui.test(string)) {
            if (this.languages.includes('en')) {
                patterns.push(...patterns_1.en);
            }
            if (this.languages.includes('fi')) {
                patterns.push(...patterns_1.fi);
            }
            if (this.languages.includes('sv')) {
                patterns.push(...patterns_1.sv);
            }
        }
        else if (/\p{Script=Han}/ui.test(string)) {
            if (this.languages.includes('zh')) {
                patterns.push(...patterns_1.zh);
            }
        }
        //TODO: sorted patterns with few langs (en and fi)
        // or maybe sort it in construct new ProfanityFilter()
        return patterns;
    }
    /**
     * Checks if the first character of the string is uppercase.
     * @param {string} string Original text.
     * @return {boolean} Returns true if lowercasing the first character changes it, otherwise false.
     */
    checkFirstChar(string) {
        const first = string.substring(0, 1);
        return first.toLowerCase() !== first;
    }
    /**
     * Capitalizes the first character of the string.
     * @param {string} string Original text.
     * @return {string} Text with the first character capitalized.
     */
    upFirstChar(string) {
        const words = string.split(' ');
        words[0] = words[0].slice(0, 1).toUpperCase() + words[0].slice(1);
        return words.join(' ');
    }
}
exports.default = ProfanityFilter;