@sit-sandbox/thai-bad-words
Version:
A package that detects bad words in the Thai language.
237 lines (232 loc) • 6.94 kB
JavaScript
/**
 * Runs a generator function as if it were an async function.
 *
 * Every value the generator yields is wrapped with `Promise.resolve`; when
 * that promise fulfils, the value is sent back in through `next`, and when it
 * rejects, the reason is raised inside the generator through `throw`.
 *
 * @param {*} __this - `this` value the generator function is invoked with.
 * @param {ArrayLike<*>|null} __arguments - Arguments for the generator function.
 * @param {GeneratorFunction} generator - Generator function to drive.
 * @returns {Promise<*>} Resolves with the generator's return value; rejects
 *   with anything the generator throws and does not catch.
 */
var __async = (__this, __arguments, generator) => {
  return new Promise((resolve, reject) => {
    const iterator = generator.apply(__this, __arguments);

    // Either finish with the returned value or wait for the yielded one.
    const advance = (result) => {
      if (result.done) {
        return resolve(result.value);
      }
      return Promise.resolve(result.value).then(resumeWithValue, resumeWithError);
    };

    const resumeWithValue = (value) => {
      try {
        advance(iterator.next(value));
      } catch (error) {
        reject(error);
      }
    };

    const resumeWithError = (reason) => {
      try {
        advance(iterator.throw(reason));
      } catch (error) {
        reject(error);
      }
    };

    advance(iterator.next());
  });
};
// src/trie/TrieNode.ts
/**
 * One node of the character trie.
 *
 * `children` maps a single character to the node that follows it, and
 * `isEndOfWord` starts out false on every freshly created node.
 */
var TrieNode = class TrieNode {
  constructor() {
    this.children = /* @__PURE__ */ new Map();
    this.isEndOfWord = false;
  }
};
// src/trie/Trie.ts
/**
 * Character trie used to find stored words anywhere inside a text.
 *
 * Both `insert` and `search` walk strings one Unicode code point at a time,
 * so a word containing characters outside the Basic Multilingual Plane is
 * stored and looked up with the same keys.
 */
var Trie = class {
  constructor() {
    this.root = new TrieNode();
  }

  /**
   * Stores `word` in the trie, creating nodes as needed.
   *
   * @param {string} word - Word to store; iterated by code point.
   * @returns {void}
   */
  insert(word) {
    let current = this.root;
    for (const char of word) {
      if (!current.children.has(char)) {
        current.children.set(char, new TrieNode());
      }
      current = current.children.get(char);
    }
    current.isEndOfWord = true;
  }

  /**
   * Looks for any stored word occurring as a substring of `text`.
   *
   * Start positions are tried from left to right, and for each start the
   * shortest stored word is reported first.
   *
   * @param {string} text - Text to scan.
   * @returns {string|false} The first stored word found, or `false` when no
   *   stored word occurs in `text`.
   */
  search(text) {
    // Split by code point so the keys match the ones `insert` created;
    // indexing the string directly would yield lone surrogate halves.
    const chars = Array.from(text);
    const n = chars.length;
    for (let i = 0; i < n; i++) {
      let current = this.root;
      for (let j = i; j < n; j++) {
        const next = current.children.get(chars[j]);
        if (next === undefined) {
          break;
        }
        current = next;
        if (current.isEndOfWord) {
          return chars.slice(i, j + 1).join("");
        }
      }
    }
    return false;
  }
};
// src/words/wordLists.ts
/**
 * Strings that generateBadWords places in front of every root word to build
 * the compound entries of the bad-word list. addPrefixes appends to this
 * array at runtime.
 */
var prefixes = [
  "\u0E01\u0E39", "\u0E21\u0E36\u0E07", "\u0E44\u0E2D\u0E49",
  "\u0E2D\u0E35", "\u0E44\u0E2D", "\u0E1C\u0E21",
  "\u0E04\u0E38\u0E13", "\u0E01\u0E23\u0E30\u0E1C\u0E21", "\u0E40\u0E18\u0E2D",
  "\u0E1E\u0E48\u0E2D", "\u0E41\u0E21\u0E48", "\u0E19\u0E32\u0E22"
];
/**
 * Allow-listed strings that checkBadWords consults before it queries the
 * trie. addIgnoreList appends to this array at runtime.
 */
var ignoreList = [
  "\u0E2B\u0E35\u0E1A", "\u0E2A\u0E31\u0E2A\u0E14\u0E35",
  "\u0E2B\u0E19\u0E49\u0E32\u0E2B\u0E35\u0E1A", "\u0E15\u0E14",
  "\u0E01\u0E30\u0E2B\u0E23\u0E35\u0E48\u0E1B\u0E31\u0E4A\u0E1A", "\u0E1A\u0E49\u0E32\u0E19"
];
/**
 * Base bad words. generateBadWords combines each entry with every prefix and
 * also lists the entry on its own; addBadWords and removeBadWords edit this
 * array in place at runtime.
 *
 * Each entry appears exactly once: a repeated entry would be repeated again
 * for every prefix in the generated list.
 */
var rootWords = [
  "\u0E04\u0E27\u0E22",
  "\u0E40\u0E2B\u0E35\u0E49\u0E22",
  "\u0E2B\u0E35",
  "\u0E2A\u0E31\u0E2A",
  "\u0E40\u0E0A\u0E35\u0E48\u0E22",
  "\u0E41\u0E23\u0E14",
  "\u0E01\u0E23\u0E30\u0E2B\u0E23\u0E35\u0E48",
  "\u0E0A\u0E34\u0E1A\u0E2B\u0E32\u0E22",
  "\u0E15\u0E2D\u0E41\u0E2B\u0E25",
  "\u0E1F\u0E32\u0E22",
  "\u0E41\u0E21\u0E48\u0E07",
  "\u0E41\u0E2A\u0E14",
  "\u0E16\u0E38\u0E22",
  "\u0E40\u0E14\u0E23\u0E31\u0E08\u0E09\u0E32\u0E19",
  "\u0E0A\u0E32\u0E15\u0E34\u0E0A\u0E31\u0E48\u0E27",
  "\u0E19\u0E23\u0E01",
  "\u0E44\u0E2D\u0E14\u0E2D\u0E01",
  "\u0E2B\u0E21\u0E2D\u0E22",
  "\u0E40\u0E2D\u0E4B\u0E2D",
  "\u0E2A\u0E31\u0E15\u0E27\u0E4C",
  "\u0E08\u0E31\u0E0D\u0E44\u0E23",
  "\u0E40\u0E25\u0E27",
  "\u0E17\u0E23\u0E32\u0E21",
  "\u0E2A\u0E16\u0E38\u0E19",
  "\u0E23\u0E30\u0E22\u0E33",
  "\u0E2D\u0E31\u0E1B\u0E23\u0E35\u0E22\u0E4C",
  "\u0E15\u0E48\u0E33\u0E15\u0E21",
  "\u0E01\u0E32\u0E01",
  "\u0E2A\u0E49\u0E19\u0E15\u0E35\u0E19",
  "\u0E2B\u0E34\u0E27\u0E15\u0E35\u0E19",
  "\u0E02\u0E22\u0E30",
  "\u0E02\u0E35\u0E49\u0E41\u0E1E\u0E49",
  "\u0E1A\u0E31\u0E14\u0E0B\u0E1A",
  "\u0E08\u0E31\u0E07\u0E44\u0E23",
  "\u0E42\u0E2A\u0E42\u0E04\u0E23\u0E01",
  "\u0E40\u0E2E\u0E07\u0E0B\u0E27\u0E22",
  "\u0E15\u0E25\u0E32\u0E14\u0E25\u0E48\u0E32\u0E07",
  "\u0E04\u0E27\u0E32\u0E22",
  "\u0E21\u0E36\u0E07\u0E15\u0E32\u0E22",
  "\u0E1B\u0E31\u0E0D\u0E0D\u0E32\u0E2D\u0E48\u0E2D\u0E19",
  "\u0E40\u0E2A\u0E47\u0E07\u0E40\u0E04\u0E23\u0E47\u0E07",
  "\u0E42\u0E07\u0E48",
  "\u0E42\u0E07\u0E48\u0E40\u0E07\u0E48\u0E32",
  "\u0E01\u0E30\u0E2B\u0E23\u0E35\u0E48",
  "\u0E14\u0E2D\u0E01\u0E17\u0E2D\u0E07",
  "\u0E14\u0E2D\u0E01\u0E01\u0E23\u0E30\u0E2B\u0E23\u0E35\u0E48",
  "\u0E1A\u0E49\u0E32",
  "\u0E04\u0E27\u0E45\u0E22",
  "\u0E21\u0E36\u0E07",
  "\u0E2D\u0E35\u0E14\u0E2D\u0E01",
  "\u0E2B\u0E19\u0E49\u0E32\u0E1B\u0E25\u0E27\u0E01",
  "\u0E1E\u0E48\u0E2D\u0E21\u0E36\u0E07",
  "\u0E41\u0E21\u0E48\u0E21\u0E36\u0E07",
  "\u0E40\u0E22\u0E47\u0E14",
  "\u0E40\u0E07\u0E35\u0E48\u0E22\u0E19",
  "\u0E2B\u0E19\u0E49\u0E32\u0E14\u0E49\u0E32\u0E19"
];
/** Current full bad-word list; replaced wholesale by generateBadWords. */
var badWordsList = [];

/**
 * Rebuilds `badWordsList` from `prefixes` and `rootWords`.
 *
 * The new list holds every prefix+root combination (all roots for the first
 * prefix, then all roots for the second, and so on) followed by the root
 * words themselves. A fresh array is assigned each time, so a previously
 * obtained array is never modified.
 *
 * @returns {void}
 */
function generateBadWords() {
  const compounds = [];
  prefixes.forEach((prefix) => {
    rootWords.forEach((rootWord) => {
      compounds.push(`${prefix}${rootWord}`);
    });
  });
  badWordsList = compounds.concat(rootWords);
}
// src/index.ts
// Module-wide trie instance shared by the functions below: checkBadWords
// queries it, and the word-management functions and start-up code fill it.
var trie = new Trie();
/**
 * Throws when `input` contains a word stored in the shared trie.
 *
 * The input is first reduced to ASCII letters, digits and characters in the
 * Thai block (U+0E00-U+0E7F). Each occurrence of an ignore-list entry is then
 * masked so the allow-listed word itself cannot produce a match, while the
 * remaining text is still scanned. The presence of an allow-listed word
 * therefore no longer exempts the whole input from checking.
 *
 * @param {string} input - Text to check.
 * @returns {void}
 * @throws {Error} With message "Bad words detected! Found: <word>" when the
 *   trie reports a match.
 */
function checkBadWords(input) {
  const cleanedInput = input.replace(/[^a-zA-Z0-9\u0E00-\u0E7F]/g, "");

  // Longest entries first, so an entry that contains a shorter entry is
  // masked as a whole. Empty entries are skipped: they would match everywhere.
  const ignoredWords = ignoreList
    .filter((word) => typeof word === "string" && word.length > 0)
    .sort((a, b) => b.length - a.length);

  // U+0000 cannot occur in the cleaned input (the regex above strips it), so
  // it acts as a barrier no trie path can cross.
  let maskedInput = cleanedInput;
  for (const ignore of ignoredWords) {
    maskedInput = maskedInput.split(ignore).join("\u0000");
  }

  const found = trie.search(maskedInput);
  if (found) {
    throw new Error(`Bad words detected! Found: ${found}`);
  }
}
/**
 * Checks a string, or every string reachable inside an object/array
 * structure, for bad words.
 *
 * Strings are lower-cased and passed to checkBadWords. Own enumerable
 * properties holding objects or arrays are scanned recursively; values of any
 * other type are ignored. Own-property tests go through
 * `Object.prototype.hasOwnProperty.call`, so objects without a prototype and
 * objects that carry their own `hasOwnProperty` key (for example parsed
 * request bodies) are handled instead of raising a TypeError.
 *
 * @param {*} input - String or object/array structure to scan.
 * @returns {Promise<void>} Resolves when nothing was flagged; rejects with
 *   the error thrown by checkBadWords on the first flagged string.
 */
function scanBadWords(input) {
  return __async(this, null, function* () {
    if (typeof input === "string") {
      checkBadWords(input.toLowerCase());
      return;
    }
    for (const key in input) {
      if (!Object.prototype.hasOwnProperty.call(input, key)) {
        continue;
      }
      const value = input[key];
      if (typeof value === "string") {
        checkBadWords(value.toLowerCase());
      } else if (Array.isArray(value)) {
        for (const item of value) {
          if (typeof item === "string") {
            checkBadWords(item.toLowerCase());
          } else if (typeof item === "object" && item !== null) {
            yield scanBadWords(item);
          }
        }
      } else if (typeof value === "object" && value !== null) {
        yield scanBadWords(value);
      }
    }
  });
}
/**
 * Registers additional root words.
 *
 * Each word is lower-cased. Empty strings and words already present in
 * `rootWords` are skipped, because an empty root would turn every prefix into
 * a bad word of its own and a repeated root would be repeated for every
 * prefix. The accepted words are appended to `rootWords`, the bad-word list
 * is regenerated, and the trie receives each new word together with its
 * prefix+word compounds, the same entries a full rebuild would insert.
 *
 * @param {string[]} newBadWords - Words to add.
 * @returns {void}
 */
function addBadWords(newBadWords) {
  const known = new Set(rootWords);
  const additions = [];
  for (const word of newBadWords.map((candidate) => candidate.toLowerCase())) {
    if (word === "" || known.has(word)) {
      continue;
    }
    known.add(word);
    additions.push(word);
  }

  rootWords.push(...additions);
  generateBadWords();

  for (const word of additions) {
    trie.insert(word);
    for (const prefix of prefixes) {
      trie.insert(`${prefix}${word}`);
    }
  }
}
/**
 * Removes root words and rebuilds the lookup structures.
 *
 * Comparison is done on lower-cased text. `rootWords` is edited in place (the
 * array object stays the same), the bad-word list is regenerated, and a fresh
 * trie built from that list replaces the contents of the shared trie by
 * swapping its root.
 *
 * @param {string[]} wordsToRemove - Words to drop from `rootWords`.
 * @returns {void}
 */
function removeBadWords(wordsToRemove) {
  const doomed = new Set(wordsToRemove.map((entry) => entry.toLowerCase()));
  const survivors = rootWords.filter((entry) => !doomed.has(entry.toLowerCase()));

  // Replace the contents without replacing the array itself.
  rootWords.splice(0, rootWords.length, ...survivors);
  generateBadWords();

  const rebuilt = new Trie();
  for (const entry of badWordsList) {
    rebuilt.insert(entry.toLowerCase());
  }
  trie.root = rebuilt.root;
}
/**
 * Registers additional prefixes.
 *
 * Each prefix is lower-cased and appended to `prefixes`, the bad-word list is
 * regenerated, and every new prefix+root compound is inserted into the shared
 * trie so the trie holds the same entries as the regenerated list.
 *
 * @param {string[]} newPrefixes - Prefixes to add.
 * @returns {void}
 */
function addPrefixes(newPrefixes) {
  const lowerCasePrefixes = newPrefixes.map((prefix) => prefix.toLowerCase());
  prefixes.push(...lowerCasePrefixes);
  generateBadWords();

  // Only compounds built from the new prefixes are missing from the trie.
  for (const prefix of lowerCasePrefixes) {
    for (const rootWord of rootWords) {
      trie.insert(`${prefix}${rootWord}`);
    }
  }
}
/**
 * Appends lower-cased copies of the given words to `ignoreList`.
 *
 * All words are converted before anything is appended, so a failing
 * conversion leaves `ignoreList` unchanged.
 *
 * @param {string[]} newIgnoreWords - Words to allow-list.
 * @returns {void}
 */
function addIgnoreList(newIgnoreWords) {
  const normalized = newIgnoreWords.map((entry) => entry.toLowerCase());
  for (const entry of normalized) {
    ignoreList.push(entry);
  }
}
/**
 * Returns the current bad-word list.
 *
 * A shallow copy is returned so callers cannot alter the module's own list
 * by mutating the result.
 *
 * @returns {string[]} Copy of the current list, in its stored order.
 */
function getBadWords() {
  return badWordsList.slice();
}
// Module start-up: build the initial bad-word list, then load every entry
// into the shared trie.
generateBadWords();
for (const word of badWordsList) {
  trie.insert(word);
}
export {
addBadWords,
addIgnoreList,
addPrefixes,
getBadWords,
removeBadWords,
scanBadWords
};