turkish-profanity-filter
Version:
A configurable Turkish profanity filter for text content
138 lines (115 loc) • 4.11 kB
JavaScript
// src/index.js
// Main library functionality
const { defaultWordList } = require('./wordList');
const { createRegExp } = require('./utils');
/**
 * Configurable profanity filter driven by one regular expression that
 * `createRegExp` compiles from the current word list.
 *
 * In whole-word mode this class relies on the compiled pattern exposing three
 * capture groups: (1) the delimiter consumed before the word, (2) the word
 * itself and (3) the delimiter consumed after it.
 * NOTE(review): that shape is inferred from how the groups are used in this
 * class; confirm against `createRegExp` in ./utils.
 */
class TurkishProfanityFilter {
  /**
   * @param {Object} [options] - Filter configuration.
   * @param {string[]} [options.wordList] - Words to flag; falls back to `defaultWordList`.
   * @param {boolean} [options.wholeWords=true] - Match whole words only.
   * @param {boolean} [options.caseSensitive=false] - Match case-sensitively.
   * @param {string} [options.replacement='***'] - Text substituted for each flagged word.
   */
  constructor(options = {}) {
    // Destructuring defaults apply when a key is absent OR explicitly
    // `undefined`, so `{ wordList: undefined }` can no longer clobber the
    // default. Every other supplied value (including '' and null) is kept
    // exactly as given.
    const {
      wordList = defaultWordList,
      wholeWords = true,
      caseSensitive = false,
      replacement = '***',
      ...extras
    } = options;

    // Unrecognised keys are carried over untouched.
    this.options = {
      wordList,
      wholeWords,
      caseSensitive,
      replacement,
      ...extras,
    };
    this._buildRegExp();
  }

  /**
   * Build the regular expression used for matching
   * @private
   */
  _buildRegExp() {
    const { wordList, wholeWords, caseSensitive } = this.options;
    this.regexp = createRegExp(wordList, { wholeWords, caseSensitive });
  }

  /**
   * Locate every flagged word in `text`.
   *
   * Works on a private copy of the compiled pattern so the shared
   * `this.regexp.lastIndex` is never disturbed.
   *
   * @private
   * @param {string} text - The text to scan
   * @returns {{word: string, start: number, end: number}[]} - Flagged words
   *   with the [start, end) offsets of the word itself (delimiters excluded)
   */
  _scan(text) {
    const { source, flags } = this.regexp;
    // An exec() loop only advances on a global pattern; force the flag so a
    // non-global pattern cannot spin forever.
    const scanner = new RegExp(source, flags.includes('g') ? flags : `${flags}g`);
    const hits = [];
    let match;

    while ((match = scanner.exec(text)) !== null) {
      let start = match.index;
      let end = start + match[0].length;
      let word = match[0];

      if (this.options.wholeWords) {
        // Trim the consumed delimiters off both ends to get the word's span.
        start += (match[1] || '').length;
        end -= (match[3] || '').length;
        word = match[2];
      }

      if (word) {
        hits.push({ word, start, end });
      }

      // Resume right after the word rather than after the trailing delimiter,
      // so that delimiter can act as the leading delimiter of the next word
      // ("w1 w2" would otherwise lose w2). Always move at least one position
      // forward to stay safe on zero-length matches.
      scanner.lastIndex = Math.max(end, match.index + 1);
    }

    return hits;
  }

  /**
   * Check if text contains profanity
   * @param {string} text - The text to check
   * @returns {boolean} - True if profanity found, false otherwise
   */
  check(text) {
    if (typeof text !== 'string') {
      return false;
    }
    // test() on a global pattern is stateful; rewind so repeated calls agree.
    this.regexp.lastIndex = 0;
    return this.regexp.test(text);
  }

  /**
   * Get all profanity words found in text
   * @param {string} text - The text to analyze
   * @returns {string[]} - Found words in order of appearance, as they are
   *   written in `text` (duplicates included)
   */
  getWords(text) {
    if (typeof text !== 'string') {
      return [];
    }
    return this._scan(text).map(({ word }) => word);
  }

  /**
   * Censor profanity in text
   *
   * The replacement is inserted literally in both matching modes; sequences
   * such as `$&` or `$1` are not expanded.
   *
   * @param {string} text - The text to censor
   * @returns {string} - Censored text; non-string input is returned unchanged
   */
  censor(text) {
    if (typeof text !== 'string') {
      return text;
    }

    const { replacement } = this.options;
    let output = '';
    let cursor = 0;

    for (const { start, end } of this._scan(text)) {
      // Copy the untouched stretch (delimiters included), then the mask.
      output += `${text.slice(cursor, start)}${replacement}`;
      cursor = end;
    }

    return output + text.slice(cursor);
  }

  /**
   * Add words to the filter
   *
   * Entries that are not non-empty strings are ignored and duplicates are
   * dropped. The word list is replaced with a new array, not mutated.
   *
   * @param {string|string[]} words - Word(s) to add
   */
  addWords(words) {
    const candidates = Array.isArray(words) ? words : [words];
    const usable = candidates.filter((word) => word && typeof word === 'string');
    this.options.wordList = [...new Set([...this.options.wordList, ...usable])];
    this._buildRegExp();
  }

  /**
   * Remove words from the filter
   *
   * Comparison is exact (strict equality on the stored entries). The word
   * list is replaced with a new array, not mutated.
   *
   * @param {string|string[]} words - Word(s) to remove
   */
  removeWords(words) {
    // Set lookup keeps removal linear in the size of the word list.
    const unwanted = new Set(Array.isArray(words) ? words : [words]);
    this.options.wordList = this.options.wordList.filter((word) => !unwanted.has(word));
    this._buildRegExp();
  }
}
// Expose the class itself as this module's CommonJS export.
module.exports = TurkishProfanityFilter;