allprofanity
Version:
A blazing-fast, multi-language profanity filter with advanced algorithms (Aho-Corasick, Bloom Filters) delivering 664% faster performance on large texts, intelligent leet-speak detection, and pattern-based context analysis
238 lines • 7.4 kB
JavaScript
/**
* Aho-Corasick algorithm implementation for efficient multi-pattern string matching
*/
export class AhoCorasick {
constructor(patterns = []) {
this.compiled = false;
this.patterns = [...patterns];
this.root = this.createNode();
if (patterns.length > 0) {
this.buildAutomaton();
}
}
/**
* Create a new trie node
*/
createNode() {
return {
children: new Map(),
output: [],
outputIndices: [],
failure: null,
isEndOfPattern: false,
};
}
/**
* Add patterns to the automaton
*/
addPatterns(patterns) {
this.patterns.push(...patterns);
this.compiled = false;
}
/**
* Add a single pattern to the automaton
*/
addPattern(pattern) {
if (pattern && pattern.length > 0) {
this.patterns.push(pattern);
this.compiled = false;
}
}
/**
* Build the Aho-Corasick automaton
*/
buildAutomaton() {
this.buildTrie();
this.buildFailureLinks();
this.buildOutputLinks();
this.compiled = true;
}
/**
* Build the trie structure
*/
buildTrie() {
this.root = this.createNode();
for (let i = 0; i < this.patterns.length; i++) {
const pattern = this.patterns[i];
let current = this.root;
for (const char of pattern) {
if (!current.children.has(char)) {
current.children.set(char, this.createNode());
}
current = current.children.get(char);
}
current.isEndOfPattern = true;
current.output.push(pattern);
current.outputIndices.push(i);
}
}
/**
* Build failure links using BFS
*/
buildFailureLinks() {
const queue = [];
// Initialize failure links for depth 1 nodes
for (const child of this.root.children.values()) {
child.failure = this.root;
queue.push(child);
}
// Build failure links for deeper nodes
while (queue.length > 0) {
const current = queue.shift();
for (const [char, child] of current.children) {
queue.push(child);
let failure = current.failure;
while (failure !== null && !failure.children.has(char)) {
failure = failure.failure;
}
child.failure = failure ? failure.children.get(char) : this.root;
}
}
}
/**
* Build output links for failure transitions
*/
buildOutputLinks() {
const queue = [];
for (const child of this.root.children.values()) {
queue.push(child);
}
while (queue.length > 0) {
const current = queue.shift();
// Add failure node outputs to current node
if (current.failure && current.failure.output.length > 0) {
current.output.push(...current.failure.output);
current.outputIndices.push(...current.failure.outputIndices);
}
for (const child of current.children.values()) {
queue.push(child);
}
}
}
/**
* Find all pattern matches in the given text
*/
findAll(text) {
if (!this.compiled) {
this.buildAutomaton();
}
const matches = [];
let current = this.root;
for (let i = 0; i < text.length; i++) {
const char = text[i];
// Follow failure links until we find a transition or reach root
while (current !== this.root && !current.children.has(char)) {
current = current.failure;
}
// Transition to next state if possible
if (current.children.has(char)) {
current = current.children.get(char);
}
// Report all patterns that end at this position
for (let j = 0; j < current.output.length; j++) {
const pattern = current.output[j];
const patternIndex = current.outputIndices[j];
const start = i - pattern.length + 1;
matches.push({
pattern,
start,
end: i + 1,
patternIndex,
});
}
}
return matches;
}
/**
* Check if text contains any patterns
*/
hasMatch(text) {
if (!this.compiled) {
this.buildAutomaton();
}
let current = this.root;
for (let i = 0; i < text.length; i++) {
const char = text[i];
while (current !== this.root && !current.children.has(char)) {
current = current.failure;
}
if (current.children.has(char)) {
current = current.children.get(char);
}
if (current.output.length > 0) {
return true;
}
}
return false;
}
/**
* Find first match in text
*/
findFirst(text) {
if (!this.compiled) {
this.buildAutomaton();
}
let current = this.root;
for (let i = 0; i < text.length; i++) {
const char = text[i];
while (current !== this.root && !current.children.has(char)) {
current = current.failure;
}
if (current.children.has(char)) {
current = current.children.get(char);
}
if (current.output.length > 0) {
const pattern = current.output[0];
const patternIndex = current.outputIndices[0];
const start = i - pattern.length + 1;
return {
pattern,
start,
end: i + 1,
patternIndex,
};
}
}
return null;
}
/**
* Get the patterns stored in this automaton
*/
getPatterns() {
return [...this.patterns];
}
/**
* Clear all patterns and reset the automaton
*/
clear() {
this.patterns = [];
this.root = this.createNode();
this.compiled = false;
}
/**
* Get statistics about the automaton
*/
getStats() {
const nodeCount = this.countNodes(this.root);
const averagePatternLength = this.patterns.length > 0
? this.patterns.reduce((sum, p) => sum + p.length, 0) /
this.patterns.length
: 0;
return {
patternCount: this.patterns.length,
nodeCount,
averagePatternLength,
};
}
/**
* Count total nodes in the trie
*/
countNodes(node) {
let count = 1;
for (const child of node.children.values()) {
count += this.countNodes(child);
}
return count;
}
}
//# sourceMappingURL=aho-corasick.js.map