UNPKG

swearing-filter

Version:

Multilingual profanity filter supporting English, Russian, Finnish, and more

200 lines (199 loc) 7.24 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
const patterns_1 = require("./patterns");
const replaces_1 = require("./replaces");

/**
 * Compiled regular expressions keyed by their pattern source.
 * Every expression is built with the flags `ui` (no `g`/`y`), so matching
 * never depends on `lastIndex` and the compiled objects can be shared safely.
 * @type {Map<string, RegExp>}
 */
const regexpCache = new Map();

/**
 * ProfanityFilter class provides methods to detect and replace abusive words in a given text.
 * It supports multiple languages and allows customization of the placeholder used for replacement.
 * The class also offers a debug mode to trace which patterns triggered the detection.
 *
 * @param opts
 * @param {string} opts.placeholder Text used to replace profane words.
 * @param {string[]} opts.languages Array of supported languages.
 * @param {boolean} opts.debug In debug mode you can see original word and find out which pattern caused the trigger.
 */
class ProfanityFilter {
    /**
     * Text used to replace profane words.
     * @type {string}
     */
    placeholder = '***';

    /**
     * Array of supported languages.
     * @type {string[]}
     */
    languages = ['ru', 'en'];

    /**
     * In debug mode you can see original word and find out which pattern caused the trigger.
     * @type {boolean}
     */
    debug = false;

    /**
     * Creates a filter, falling back to the defaults for every option that is not supplied.
     * @param {object} [opts] Filter options (`placeholder`, `languages`, `debug`).
     */
    constructor(opts = {}) {
        const options = opts ?? {};
        // `??` (not `||`) so that an explicit empty-string placeholder is honored,
        // matching what setOptions({ placeholder: '' }) already allows.
        this.placeholder = options.placeholder ?? '***';
        this.languages = options.languages || ['ru', 'en'];
        this.debug = options.debug || false;
    }

    /**
     * Searches if there are any abusive words in the text.
     * The text is split on single spaces; each chunk is then cleaned and every
     * resulting part is checked with `search`.
     * @param {string} string Original text.
     * @return {boolean} Returns true if there are any abusive words in the string, otherwise false.
     */
    isBad(string) {
        return string
            .split(' ')
            .some((word) => this.getCleanedWords(word).some((part) => this.search(part) !== null));
    }

    /**
     * Replaces abusive words in the string.
     * @param {string} string Original text.
     * @return {string} Cleaned text with abusive words replaced by the placeholder.
     */
    replace(string) {
        const words = string.split(' ');
        // A function replacer inserts the placeholder literally; a plain string
        // would have `$`-sequences (`$$`, `$&`, ...) interpreted as replacement patterns.
        const literalPlaceholder = () => this.placeholder;
        for (let i = 0; i < words.length; i++) {
            for (const part of this.getCleanedWords(words[i])) {
                const match = this.search(part);
                if (match) {
                    // TODO: replace only the exact match. `match.input` is the whole
                    // cleaned part, so every occurrence of that part inside the
                    // space-delimited word is replaced.
                    words[i] = words[i].replaceAll(match.input, literalPlaceholder);
                }
            }
        }
        return words.join(' ');
    }

    /**
     * Cleans the input string by blanking out characters outside of the detected
     * script and splitting the remainder into words.
     * The script is chosen by the first of Latin, Cyrillic, Han that occurs anywhere
     * in the string; if none occurs, the string is only trimmed and split.
     * @param {string} string Original text.
     * @return {string[]} Array of cleaned words (a single empty string for blank input).
     */
    getCleanedWords(string) {
        let result = string;
        if (/\p{Script=Latin}/ui.test(string)) {
            result = string.replace(/[^\p{Script=Latin}]/ug, ' ');
        }
        else if (/\p{Script=Cyrillic}/ui.test(string)) {
            result = string.replace(/[^\p{Script=Cyrillic}]/ug, ' ');
        }
        else if (/\p{Script=Han}/ui.test(string)) {
            result = string.replace(/[^\p{Script=Han}]/ug, ' ');
        }
        return result.trim().split(/\s+/g);
    }

    /**
     * Fixes abusive words inside the string using the `replaceRu` table.
     * Every matching pattern is applied to the original input (replacements are
     * not chained), so when several patterns match, the result comes from the
     * last one tried. Patterns are tried in reverse key order.
     * @param {string} string Original text.
     * @return {string} Fixed text, or the original text when no pattern matches.
     */
    fix(string) {
        // Start from the input so that text with nothing to fix is returned
        // unchanged instead of being turned into an empty string.
        let result = string;
        const patternKeys = Object.keys(replaces_1.replaceRu).reverse();
        for (const key of patternKeys) {
            const pattern = this.prepare(key);
            if (!pattern.test(string)) {
                continue;
            }
            result = string.replace(pattern, replaces_1.replaceRu[key]);
            // Keep the capitalization of the original first character.
            if (this.checkFirstChar(string)) {
                result = this.upFirstChar(result);
            }
        }
        return result;
    }

    /**
     * Changes the filter options. Only the properties present in `opts` are overwritten.
     * @param {FilterOptions} opts Options to configure the filter.
     */
    setOptions(opts) {
        Object.assign(this, opts);
    }

    /**
     * Searches for an abusive word in the given word.
     * @param {string} word Original word.
     * @return {RegExpExecArray | null} Returns the match if found, otherwise null.
     */
    search(word) {
        const firstLetter = word.charAt(0).toLowerCase();
        // Cheap prefilter: a pattern anchored with `^` is only tried when the
        // character right after the anchor equals the word's first letter.
        // NOTE(review): this assumes that character is a literal letter; an anchored
        // pattern starting with a group, class or escape would never be tried.
        // Confirm against the pattern tables.
        const candidates = this.getPatterns(word).filter((p) => {
            return !p.startsWith('^') || p.charAt(1).toLowerCase() === firstLetter;
        });
        for (const p of candidates) {
            const match = this.prepare(p).exec(word);
            if (match) {
                if (this.debug) {
                    console.debug(`DEBUG: ${word} ${p}`);
                }
                return match;
            }
        }
        return null;
    }

    /**
     * Prepares a regular expression pattern (Unicode-aware, case-insensitive).
     * Compiled expressions are cached per pattern, so repeated calls with the
     * same pattern return the same RegExp object.
     * @param {string} pattern Pattern to prepare.
     * @return {RegExp} Prepared regular expression.
     */
    prepare(pattern) {
        let regexp = regexpCache.get(pattern);
        if (regexp === undefined) {
            regexp = new RegExp(pattern, 'ui');
            regexpCache.set(pattern, regexp);
        }
        return regexp;
    }

    /**
     * Gets the patterns for the given string based on the supported languages.
     * Cyrillic text (when 'ru' is enabled) takes precedence, then Latin
     * ('en', 'fi', 'sv'), then Han ('zh').
     * @param {string} string Original text.
     * @return {string[]} Array of patterns; the Russian list is returned by reference, not copied.
     */
    getPatterns(string) {
        let patterns = [];
        // https://en.wikipedia.org/wiki/ISO_15924
        if (this.languages.includes('ru') && /\p{Script=Cyrillic}/ui.test(string)) {
            patterns = patterns_1.ru;
        }
        else if (/\p{Script=Latin}/ui.test(string)) {
            if (this.languages.includes('en')) {
                patterns.push(...patterns_1.en);
            }
            if (this.languages.includes('fi')) {
                patterns.push(...patterns_1.fi);
            }
            if (this.languages.includes('sv')) {
                patterns.push(...patterns_1.sv);
            }
        }
        else if (/\p{Script=Han}/ui.test(string)) {
            if (this.languages.includes('zh')) {
                patterns.push(...patterns_1.zh);
            }
        }
        // TODO: sort the patterns when several languages are combined (e.g. en and fi),
        // or maybe sort them once when the ProfanityFilter is constructed.
        return patterns;
    }

    /**
     * Checks if the first character of the string is uppercase.
     * @param {string} string Original text.
     * @return {boolean} Returns true if lowercasing the first character changes it, otherwise false.
     */
    checkFirstChar(string) {
        const first = string.substring(0, 1);
        return first.toLowerCase() !== first;
    }

    /**
     * Capitalizes the first character of the string.
     * @param {string} string Original text.
     * @return {string} Text with the first character capitalized.
     */
    upFirstChar(string) {
        const words = string.split(' ');
        words[0] = words[0].slice(0, 1).toUpperCase() + words[0].slice(1);
        return words.join(' ');
    }
}
exports.default = ProfanityFilter;