paknevis
Version:
Paknevis.js is a Persian text formatting tool.
79 lines (78 loc) • 3.11 kB
JavaScript
import { defaultBadWords } from './data/badWords.js';
/**
 * Normalises every word in a list and collects the distinct, non-empty results.
 * Words that normalise to the empty string (pure punctuation, Latin-only text, ...)
 * are dropped: as search keys they would match every text.
 * @param {Iterable<string>} words raw words
 * @param {(word: string) => string} normalise normaliser applied to each word
 * @returns {Set<string>}
 */
function normaliseWordList(words, normalise) {
    const normalised = new Set();
    for (const word of words) {
        const key = normalise(word);
        if (key !== '') {
            normalised.add(key);
        }
    }
    return normalised;
}

/**
 * Base class of Paknevis.js.
 * Every member is static, so the class is used as a namespace:
 * `Paknevis.normalise(...)`, `Paknevis.hasBadWords(...)`, `Paknevis.censor(...)`.
 * @class
 */
export class Paknevis {
    /**
     * Reduces a string to a canonical form so that banned words can be detected
     * regardless of spacing, punctuation, diacritics or look-alike letters.
     *
     * The result contains only Persian letters and ASCII digits. Each replacement
     * works on a single character, so normalising a string gives the same result
     * as normalising its characters one by one and concatenating.
     * @param {string} text your text
     * @returns {string}
     */
    static normalise(text) {
        return text
            // Zero-width / directional control characters and whitespace.
            .replace(/[\u200c\u200d\u200e\u200f\u202a-\u202e\s ]/gm, '')
            // Punctuation commonly used to split a word.
            .replace(/[,،؛:!؟?"'«»‹›()\[\]{}\-_/\*~^@#$%=\|`']/gm, '')
            // Arabic kaf -> Persian kaf.
            .replace(/ك/gm, 'ک')
            // Persian digits U+06F0..U+06F9 -> ASCII: 0x06F0 - 0x30 = 1728.
            .replace(/[۰-۹]/gm, d => String.fromCharCode(d.charCodeAt(0) - 1728))
            // Arabic-Indic digits U+0660..U+0669 -> ASCII: 0x0660 - 0x30 = 1584.
            .replace(/[٠-٩]/gm, d => String.fromCharCode(d.charCodeAt(0) - 1584))
            // Diacritics (harakat) and superscript alef.
            .replace(/[\u064b-\u0652\u0670]/gm, '')
            // Tatweel (kashida).
            .replace(/ـ/gm, '')
            // Arabic yeh / alef maksura -> Persian yeh.
            .replace(/[يى]/gm, 'ی')
            // Fold letters that sound alike onto one representative letter.
            .replace(/[ثص]/gm, 'س')
            .replace(/[زذضظ]/gm, 'ز')
            .replace(/ح/gm, 'ه')
            .replace(/[تط]/gm, 'ت')
            .replace(/[عءئ]/gm, 'ا')
            .replace(/[ؤو]/gm, 'و')
            // Drop everything that is not a Persian letter or an ASCII digit.
            .replace(/[^آابپتثجچحخدذرزسشصضطظعغفقکگلمنوهی0-9]/gm, '')
            // Kept from the original pipeline; only caseless characters remain here.
            .toLocaleLowerCase('fa');
    }

    /**
     * Checks whether the text contains any filtered word. Use it when only a
     * yes/no answer is needed.
     *
     * Both the text and every word are normalised before comparison, so matching
     * ignores spacing, punctuation and letter variants. Words that normalise to
     * the empty string are ignored.
     * @param {string} text input text
     * @param {string[]} [extraWords] additional words to filter
     * @returns {boolean}
     */
    static hasBadWords(text, extraWords = []) {
        const words = normaliseWordList(
            [...defaultBadWords, ...extraWords],
            word => this.normalise(word),
        );
        if (words.size === 0) {
            return false;
        }
        // Normalise the text once instead of once per word.
        const haystack = this.normalise(text);
        for (const word of words) {
            if (haystack.includes(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Censors banned words in the text.
     *
     * A stretch of the original text is censored when it starts and ends on a
     * character that survives normalisation and its normalised form equals a
     * banned word. Separators inside the stretch (spaces, punctuation, ...) are
     * censored with it, but characters around it are left untouched. Only
     * stretches of at most 20 characters are examined, so longer obfuscations
     * are not found.
     * @param {string} text input text
     * @param {string} [censorChar] inserted once in place of each censored run;
     *   the default empty string removes the run.
     * @param {string[]} [extraWords] additional words to filter
     * @returns {string}
     */
    static censor(text, censorChar = "", extraWords = []) {
        // Longest stretch of original characters tried as one candidate.
        const maxSpan = 20;
        const words = normaliseWordList(
            [...defaultBadWords, ...extraWords],
            word => this.normalise(word),
        );
        // Split by code point so surrogate pairs are never cut in half.
        const origChars = [...text];
        const len = origChars.length;
        if (words.size === 0 || len === 0) {
            return text;
        }

        let longestWord = 0;
        for (const word of words) {
            longestWord = Math.max(longestWord, word.length);
        }

        // A character is significant when normalisation keeps it. Candidates must
        // begin and end on one, otherwise neighbouring text would be swallowed.
        const significant = origChars.map(ch => this.normalise(ch) !== '');
        const marks = Array(len).fill(false);

        for (let start = 0; start < len; start++) {
            if (!significant[start]) {
                continue;
            }
            const limit = Math.min(maxSpan, len - start);
            let raw = '';
            for (let span = 1; span <= limit; span++) {
                const end = start + span - 1;
                raw += origChars[end];
                if (!significant[end]) {
                    continue;
                }
                const candidate = this.normalise(raw);
                if (candidate.length > longestWord) {
                    // Extending the stretch can only lengthen it further.
                    break;
                }
                if (words.has(candidate)) {
                    marks.fill(true, start, end + 1);
                }
            }
        }

        let result = '';
        for (let i = 0; i < len; i++) {
            if (!marks[i]) {
                result += origChars[i];
            } else if (censorChar && (i === 0 || !marks[i - 1])) {
                // First character of a censored run: emit the mask once.
                result += censorChar;
            }
        }
        return result;
    }
}