UNPKG

@sit-sandbox/thai-bad-words

Version:

A package to detect bad words in the Thai language.

268 lines (262 loc) 8.15 kB
"use strict";

// ---------------------------------------------------------------------------
// Bundler-emitted runtime helpers (CommonJS interop + async/await down-leveling).
// Kept functionally unchanged; only re-formatted onto separate lines.
// ---------------------------------------------------------------------------
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;

// Defines every entry of `all` as an enumerable getter on `target`.
var __export = (target, all) => {
  for (var name in all) {
    __defProp(target, name, { get: all[name], enumerable: true });
  }
};

// Copies the own properties of `from` onto `to` as live getters, skipping
// keys `to` already owns and the optional `except` key.
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from)) {
      if (!__hasOwnProp.call(to, key) && key !== except) {
        __defProp(to, key, {
          get: () => from[key],
          enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable
        });
      }
    }
  }
  return to;
};

// Wraps an export namespace in a fresh object flagged with `__esModule`.
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// Drives a generator as if it were an async function: every yielded value is
// awaited, and anything thrown rejects the returned promise.
var __async = (__this, __arguments, generator) => {
  return new Promise((resolve, reject) => {
    var fulfilled = (value) => {
      try {
        step(generator.next(value));
      } catch (e) {
        reject(e);
      }
    };
    var rejected = (value) => {
      try {
        step(generator.throw(value));
      } catch (e) {
        reject(e);
      }
    };
    var step = (x) => x.done ? resolve(x.value) : Promise.resolve(x.value).then(fulfilled, rejected);
    step((generator = generator.apply(__this, __arguments)).next());
  });
};

// src/index.ts
var index_exports = {};
__export(index_exports, {
  addBadWords: () => addBadWords,
  addIgnoreList: () => addIgnoreList,
  addPrefixes: () => addPrefixes,
  getBadWords: () => getBadWords,
  removeBadWords: () => removeBadWords,
  scanBadWords: () => scanBadWords
});
// Guarded so that evaluating this file where the CommonJS `module` binding
// is absent does not throw; under CommonJS the assignment runs as before.
if (typeof module !== "undefined") {
  module.exports = __toCommonJS(index_exports);
}

// src/trie/TrieNode.ts
/** One node of the trie: child nodes keyed by character plus an end-of-word flag. */
var TrieNode = class {
  constructor() {
    this.children = /* @__PURE__ */ new Map();
    this.isEndOfWord = false;
  }
};

// src/trie/Trie.ts
/** Prefix tree used to find any stored word inside a longer text. */
var Trie = class {
  constructor() {
    this.root = new TrieNode();
  }

  /**
   * Stores `word` in the trie, one code point per level.
   * @param {string} word - Word to store.
   */
  insert(word) {
    let current = this.root;
    for (const char of word) {
      if (!current.children.has(char)) {
        current.children.set(char, new TrieNode());
      }
      current = current.children.get(char);
    }
    current.isEndOfWord = true;
  }

  /**
   * Looks for a stored word starting at any position of `text`.
   * @param {string} text - Text to scan.
   * @returns {string|false} The first match (earliest start position,
   *   shortest word from that position), or `false` when nothing matches.
   */
  search(text) {
    // Split by code point so the walk uses the same units as insert();
    // indexing the string directly would split astral characters in two.
    const chars = Array.from(text);
    const n = chars.length;
    for (let i = 0; i < n; i++) {
      let current = this.root;
      let matchedWord = "";
      for (let j = i; j < n; j++) {
        const next = current.children.get(chars[j]);
        if (next === undefined) {
          break;
        }
        current = next;
        matchedWord += chars[j];
        if (current.isEndOfWord) {
          return matchedWord;
        }
      }
    }
    return false;
  }
};

// src/words/wordLists.ts
// Words that are prepended to every root word to build compound bad words.
var prefixes = [
  "\u0E01\u0E39",
  "\u0E21\u0E36\u0E07",
  "\u0E44\u0E2D\u0E49",
  "\u0E2D\u0E35",
  "\u0E44\u0E2D",
  "\u0E1C\u0E21",
  "\u0E04\u0E38\u0E13",
  "\u0E01\u0E23\u0E30\u0E1C\u0E21",
  "\u0E40\u0E18\u0E2D",
  "\u0E1E\u0E48\u0E2D",
  "\u0E41\u0E21\u0E48",
  "\u0E19\u0E32\u0E22"
];
// Words that are never reported by checkBadWords().
var ignoreList = [
  "\u0E2B\u0E35\u0E1A",
  "\u0E2A\u0E31\u0E2A\u0E14\u0E35",
  "\u0E2B\u0E19\u0E49\u0E32\u0E2B\u0E35\u0E1A",
  "\u0E15\u0E14",
  "\u0E01\u0E30\u0E2B\u0E23\u0E35\u0E48\u0E1B\u0E31\u0E4A\u0E1A",
  "\u0E1A\u0E49\u0E32\u0E19"
];
var rootWords = [
  "\u0E04\u0E27\u0E22",
  "\u0E40\u0E2B\u0E35\u0E49\u0E22",
  "\u0E2B\u0E35",
  "\u0E2A\u0E31\u0E2A",
  "\u0E40\u0E0A\u0E35\u0E48\u0E22",
  "\u0E41\u0E23\u0E14",
  "\u0E01\u0E23\u0E30\u0E2B\u0E23\u0E35\u0E48",
  "\u0E0A\u0E34\u0E1A\u0E2B\u0E32\u0E22",
  "\u0E15\u0E2D\u0E41\u0E2B\u0E25",
  "\u0E1F\u0E32\u0E22",
  "\u0E41\u0E21\u0E48\u0E07",
  "\u0E41\u0E2A\u0E14",
  "\u0E16\u0E38\u0E22",
  "\u0E40\u0E14\u0E23\u0E31\u0E08\u0E09\u0E32\u0E19",
  "\u0E0A\u0E32\u0E15\u0E34\u0E0A\u0E31\u0E48\u0E27",
  "\u0E19\u0E23\u0E01",
  "\u0E44\u0E2D\u0E14\u0E2D\u0E01",
  "\u0E2B\u0E21\u0E2D\u0E22",
  "\u0E40\u0E2D\u0E4B\u0E2D",
  "\u0E2A\u0E31\u0E15\u0E27\u0E4C",
  "\u0E08\u0E31\u0E0D\u0E44\u0E23",
  "\u0E40\u0E25\u0E27",
  "\u0E17\u0E23\u0E32\u0E21",
  "\u0E2A\u0E16\u0E38\u0E19",
  "\u0E23\u0E30\u0E22\u0E33",
  "\u0E2D\u0E31\u0E1B\u0E23\u0E35\u0E22\u0E4C",
  "\u0E15\u0E48\u0E33\u0E15\u0E21",
  "\u0E01\u0E32\u0E01",
  "\u0E2A\u0E49\u0E19\u0E15\u0E35\u0E19",
  "\u0E2B\u0E34\u0E27\u0E15\u0E35\u0E19",
  "\u0E2A\u0E49\u0E19\u0E15\u0E35\u0E19",
  "\u0E02\u0E22\u0E30",
  "\u0E02\u0E35\u0E49\u0E41\u0E1E\u0E49",
  "\u0E1A\u0E31\u0E14\u0E0B\u0E1A",
  "\u0E08\u0E31\u0E07\u0E44\u0E23",
  "\u0E42\u0E2A\u0E42\u0E04\u0E23\u0E01",
  "\u0E40\u0E2E\u0E07\u0E0B\u0E27\u0E22",
  "\u0E15\u0E25\u0E32\u0E14\u0E25\u0E48\u0E32\u0E07",
  "\u0E04\u0E27\u0E32\u0E22",
  "\u0E21\u0E36\u0E07\u0E15\u0E32\u0E22",
  "\u0E1B\u0E31\u0E0D\u0E0D\u0E32\u0E2D\u0E48\u0E2D\u0E19",
  "\u0E40\u0E2A\u0E47\u0E07\u0E40\u0E04\u0E23\u0E47\u0E07",
  "\u0E42\u0E07\u0E48",
  "\u0E42\u0E07\u0E48\u0E40\u0E07\u0E48\u0E32",
  "\u0E01\u0E30\u0E2B\u0E23\u0E35\u0E48",
  "\u0E14\u0E2D\u0E01\u0E17\u0E2D\u0E07",
  "\u0E14\u0E2D\u0E01\u0E01\u0E23\u0E30\u0E2B\u0E23\u0E35\u0E48",
  "\u0E1A\u0E49\u0E32",
  "\u0E04\u0E27\u0E45\u0E22",
  "\u0E21\u0E36\u0E07",
  "\u0E2D\u0E35\u0E14\u0E2D\u0E01",
  "\u0E2B\u0E19\u0E49\u0E32\u0E1B\u0E25\u0E27\u0E01",
  "\u0E1E\u0E48\u0E2D\u0E21\u0E36\u0E07",
  "\u0E41\u0E21\u0E48\u0E21\u0E36\u0E07",
  "\u0E40\u0E22\u0E47\u0E14",
  "\u0E40\u0E07\u0E35\u0E48\u0E22\u0E19",
  "\u0E2B\u0E19\u0E49\u0E32\u0E14\u0E49\u0E32\u0E19"
];
var badWordsList = [];

/**
 * Rebuilds `badWordsList` as every prefix+root combination followed by the
 * bare root words.
 */
function generateBadWords() {
  badWordsList = [];
  for (let prefix of prefixes) {
    for (let rootWord of rootWords) {
      badWordsList.push(`${prefix}${rootWord}`);
    }
  }
  badWordsList = [...badWordsList, ...rootWords];
}

// src/index.ts
var trie = new Trie();

// Replacement for ignored spans. checkBadWords() strips everything outside
// [a-zA-Z0-9\u0E00-\u0E7F] before masking, so this character cannot occur in
// the cleaned text itself.
var IGNORED_MASK = "\u0000";

/**
 * Lower-cases every entry and drops empty strings. An empty root word would
 * turn each bare prefix into a bad word, and an empty ignore word is
 * contained in every string.
 * @param {string[]} words
 * @returns {string[]}
 */
function normalizeWords(words) {
  return words.map((word) => word.toLowerCase()).filter((word) => word.length > 0);
}

/** Re-populates the shared trie from the current `badWordsList`. */
function rebuildTrie() {
  const freshTrie = new Trie();
  for (const word of badWordsList) {
    freshTrie.insert(word.toLowerCase());
  }
  // Swap the root so the module-level `trie` object keeps its identity.
  trie.root = freshTrie.root;
}

/**
 * Replaces every occurrence of an ignored word with a mask character so the
 * remaining text can still be scanned.
 * @param {string} text - Already cleaned text.
 * @returns {string}
 */
function maskIgnoredWords(text) {
  // Longest first: masking a shorter ignored word first could break up a
  // longer one that contains it and expose the rest of it to the scan.
  const ordered = ignoreList
    .filter((word) => word.length > 0)
    .sort((a, b) => b.length - a.length);
  let masked = text;
  for (const ignore of ordered) {
    masked = masked.split(ignore).join(IGNORED_MASK);
  }
  return masked;
}

/**
 * Checks one string for bad words.
 *
 * Characters other than ASCII letters/digits and the Thai block are removed
 * first, then ignored words are masked, then the trie is searched. An ignored
 * word only shields its own characters, not the rest of the text.
 *
 * @param {string} input - Text to check (callers pass it lower-cased).
 * @throws {Error} "Bad words detected! Found: <word>" on the first match.
 */
function checkBadWords(input) {
  const cleanedInput = input.replace(/[^a-zA-Z0-9\u0E00-\u0E7F]/g, "");
  const found = trie.search(maskIgnoredWords(cleanedInput));
  if (found) {
    throw new Error(`Bad words detected! Found: ${found}`);
  }
}

/**
 * Recursively checks every string reachable through the own enumerable
 * properties of `value`; other primitives are skipped.
 * @param {*} value - Value to scan.
 * @param {WeakSet<object>} seen - Objects already visited (guards cycles).
 */
function scanValue(value, seen) {
  if (typeof value === "string") {
    checkBadWords(value.toLowerCase());
    return;
  }
  if (typeof value !== "object" || value === null || seen.has(value)) {
    return;
  }
  seen.add(value);
  // Object.keys() also works for null-prototype objects and for objects that
  // shadow `hasOwnProperty`, unlike calling value.hasOwnProperty(key).
  for (const key of Object.keys(value)) {
    scanValue(value[key], seen);
  }
}

/**
 * Scans a string, or every string nested inside an object/array, for bad words.
 * @param {*} input - String, object or array to scan.
 * @returns {Promise<void>} Resolves when nothing is found; rejects with the
 *   Error thrown by checkBadWords() on the first bad word.
 */
function scanBadWords(input) {
  // The scan itself is synchronous; __async only turns a thrown Error into a
  // rejected promise, which is the contract callers rely on.
  return __async(this, null, function* () {
    scanValue(input, new WeakSet());
  });
}

/**
 * Adds root words, regenerates the prefix combinations and refreshes the trie.
 * @param {string[]} newBadWords
 */
function addBadWords(newBadWords) {
  rootWords.push(...normalizeWords(newBadWords));
  generateBadWords();
  rebuildTrie();
}

/**
 * Removes root words (case-insensitively), regenerates the prefix
 * combinations and refreshes the trie.
 * @param {string[]} wordsToRemove
 */
function removeBadWords(wordsToRemove) {
  const toRemove = new Set(wordsToRemove.map((word) => word.toLowerCase()));
  const keptRootWords = rootWords.filter((word) => !toRemove.has(word.toLowerCase()));
  rootWords.length = 0;
  rootWords.push(...keptRootWords);
  generateBadWords();
  rebuildTrie();
}

/**
 * Adds prefixes, regenerates the prefix combinations and refreshes the trie.
 * @param {string[]} newPrefixes
 */
function addPrefixes(newPrefixes) {
  prefixes.push(...normalizeWords(newPrefixes));
  generateBadWords();
  rebuildTrie();
}

/**
 * Adds words that must never be reported.
 * @param {string[]} newIgnoreWords
 */
function addIgnoreList(newIgnoreWords) {
  ignoreList.push(...normalizeWords(newIgnoreWords));
}

/**
 * @returns {string[]} The current generated list (prefix combinations
 *   followed by the root words).
 */
function getBadWords() {
  return badWordsList;
}

generateBadWords();
rebuildTrie();

// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  addBadWords,
  addIgnoreList,
  addPrefixes,
  getBadWords,
  removeBadWords,
  scanBadWords
});