obscenity
Version:
Robust, extensible profanity filter.
183 lines (182 loc) • 8.57 kB
JavaScript
;
Object.defineProperty(exports, "__esModule", { value: true });
exports.RegExpMatcher = void 0;
const Util_1 = require("../../pattern/Util");
const TransformerSet_1 = require("../../transformer/TransformerSet");
const Char_1 = require("../../util/Char");
const CharacterIterator_1 = require("../../util/CharacterIterator");
const IntervalCollection_1 = require("../IntervalCollection");
const MatchPayload_1 = require("../MatchPayload");
/**
 * An implementation of the [[Matcher]] interface using regular expressions and
 * string searching methods.
 *
 * Blacklisted terms are compiled to regular expressions and run against a
 * transformed copy of the input; whitelisted terms are located with plain
 * substring search. All indices reported to callers refer to the original
 * (untransformed) input, and end indices are inclusive.
 */
class RegExpMatcher {
    blacklistedTerms;
    whitelistedTerms;
    blacklistMatcherTransformers;
    whitelistMatcherTransformers;
    /**
     * Creates a new [[RegExpMatcher]] with the options given.
     *
     * @example
     * ```typescript
     * // Use the options provided by the English preset.
     * const matcher = new RegExpMatcher({
     * 	...englishDataset.build(),
     * 	...englishRecommendedTransformers,
     * });
     * ```
     * @example
     * ```typescript
     * // Simple matcher that only has blacklisted patterns.
     * const matcher = new RegExpMatcher({
     * 	blacklistedTerms: assignIncrementingIds([
     * 		pattern`fuck`,
     * 		pattern`f?uck`, // wildcards (?)
     * 		pattern`bitch`,
     * 		pattern`b[i]tch` // optionals ([i] matches either "i" or "")
     * 	]),
     * });
     *
     * // Check whether some string matches any of the patterns.
     * const doesMatch = matcher.hasMatch('fuck you bitch');
     * ```
     * @example
     * ```typescript
     * // A more advanced example, with transformers and whitelisted terms.
     * const matcher = new RegExpMatcher({
     * 	blacklistedTerms: [
     * 		{ id: 1, pattern: pattern`penis` },
     * 		{ id: 2, pattern: pattern`fuck` },
     * 	],
     * 	whitelistedTerms: ['pen is'],
     * 	blacklistMatcherTransformers: [
     * 		resolveConfusablesTransformer(), // '🅰' => 'a'
     * 		resolveLeetSpeakTransformer(), // '$' => 's'
     * 		foldAsciiCharCaseTransformer(), // case insensitive matching
     * 		skipNonAlphabeticTransformer(), // 'f.u...c.k' => 'fuck'
     * 		collapseDuplicatesTransformer(), // 'aaaa' => 'a'
     * 	],
     * });
     *
     * // Output all matches.
     * console.log(matcher.getAllMatches('fu.....uuuuCK the pen is mightier than the sword!'));
     * ```
     * @param options - Options to use.
     * @throws Error if two blacklisted terms share an ID, if a blacklisted
     * pattern potentially matches the empty string, or if a whitelisted term
     * is the empty string.
     */
    constructor({ blacklistedTerms, whitelistedTerms = [], blacklistMatcherTransformers = [], whitelistMatcherTransformers = [], }) {
        this.blacklistedTerms = this.compileTerms(blacklistedTerms);
        this.validateWhitelistedTerms(whitelistedTerms);
        // Store a copy: if the caller later mutated the array they passed in
        // (e.g. pushed an empty string), the validation above would be bypassed
        // and the search loop in getWhitelistedIntervals() would never advance.
        this.whitelistedTerms = [...whitelistedTerms];
        this.blacklistMatcherTransformers = new TransformerSet_1.TransformerSet(blacklistMatcherTransformers);
        this.whitelistMatcherTransformers = new TransformerSet_1.TransformerSet(whitelistMatcherTransformers);
    }
    /**
     * Returns every blacklisted-term match in the input that is not covered by
     * a whitelisted term.
     *
     * @param input - Text to search.
     * @param sorted - When true, the result is sorted in place with
     * `compareMatchByPositionAndId`; otherwise matches are grouped by term in
     * the order the terms were supplied.
     * @returns Array of `{ termId, startIndex, endIndex, matchLength }` objects.
     */
    getAllMatches(input, sorted = false) {
        const matches = [...this.findUnwhitelistedMatches(input)];
        if (sorted) {
            matches.sort(MatchPayload_1.compareMatchByPositionAndId);
        }
        return matches;
    }
    /**
     * Checks whether the input contains at least one blacklisted-term match
     * that is not covered by a whitelisted term.
     *
     * @param input - Text to search.
     * @returns `true` if such a match exists, `false` otherwise.
     */
    hasMatch(input) {
        // The scan is lazy, so pulling a single item stops at the first hit.
        return !this.findUnwhitelistedMatches(input).next().done;
    }
    /**
     * Lazily yields match payloads for every blacklisted-term match that is
     * not whitelisted. Shared by getAllMatches() and hasMatch() so the two
     * cannot drift apart.
     *
     * @param input - Text to search.
     */
    *findUnwhitelistedMatches(input) {
        const whitelistedIntervals = this.getWhitelistedIntervals(input);
        const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers);
        for (const blacklistedTerm of this.blacklistedTerms) {
            for (const match of transformed.matchAll(blacklistedTerm.regExp)) {
                const matchedText = match[0];
                const origStartIndex = transformedToOrigIndex[match.index];
                const origEndIndex = this.coverTrailingSurrogatePair(input, transformedToOrigIndex[match.index + matchedText.length - 1]);
                if (whitelistedIntervals.query(origStartIndex, origEndIndex)) {
                    continue;
                }
                yield {
                    termId: blacklistedTerm.id,
                    startIndex: origStartIndex,
                    endIndex: origEndIndex,
                    // Spreading a string iterates code points, so this counts
                    // code points of the transformed match, not UTF-16 units.
                    matchLength: [...matchedText].length,
                };
            }
        }
    }
    /**
     * Adjusts an inclusive end index so that it never stops in the middle of
     * a surrogate pair: if `origEndIndex` points at a high surrogate that is
     * immediately followed by a low surrogate, the index of the low surrogate
     * is returned instead.
     *
     * @param input - The original input string.
     * @param origEndIndex - Inclusive end index into `input`.
     * @returns The (possibly advanced) inclusive end index.
     */
    coverTrailingSurrogatePair(input, origEndIndex) {
        const hasFollowingUnit = origEndIndex < input.length - 1;
        if (hasFollowingUnit &&
            (0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) &&
            (0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1))) {
            return origEndIndex + 1;
        }
        return origEndIndex;
    }
    /**
     * Finds every non-overlapping occurrence of each whitelisted term in the
     * input (after applying the whitelist transformers) and records it as an
     * inclusive interval expressed in original-input indices.
     *
     * @param input - Text to search.
     * @returns An IntervalCollection holding one interval per occurrence.
     */
    getWhitelistedIntervals(input) {
        const intervals = new IntervalCollection_1.IntervalCollection();
        const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.whitelistMatcherTransformers);
        for (const whitelistedTerm of this.whitelistedTerms) {
            let searchFrom = 0;
            let startIndex = transformed.indexOf(whitelistedTerm, searchFrom);
            while (startIndex !== -1) {
                // Resume after this occurrence so occurrences of the same term
                // never overlap.
                searchFrom = startIndex + whitelistedTerm.length;
                const origEndIndex = this.coverTrailingSurrogatePair(input, transformedToOrigIndex[searchFrom - 1]);
                intervals.insert(transformedToOrigIndex[startIndex], origEndIndex);
                startIndex = transformed.indexOf(whitelistedTerm, searchFrom);
            }
        }
        return intervals;
    }
    /**
     * Runs every character of the input through the given transformer set and
     * builds the transformed string together with an index map back to the
     * original input.
     *
     * @param input - Original text.
     * @param transformers - Transformer set to apply; it is always reset
     * before this method returns or throws.
     * @returns A pair `[transformedToOrigIndex, transformed]`, where
     * `transformedToOrigIndex[i]` is the iterator position recorded for the
     * character that produced UTF-16 code unit `i` of `transformed`
     * (presumably that character's start index in `input`; see
     * CharacterIterator).
     */
    applyTransformers(input, transformers) {
        const transformedToOrigIndex = [];
        let transformed = '';
        const iter = new CharacterIterator_1.CharacterIterator(input);
        try {
            for (const char of iter) {
                const transformedChar = transformers.applyTo(char);
                // `undefined` means the transformers dropped this character.
                if (transformedChar === undefined) {
                    continue;
                }
                transformed += String.fromCodePoint(transformedChar);
                // A code point may occupy two UTF-16 units; map each of them.
                while (transformedToOrigIndex.length < transformed.length) {
                    transformedToOrigIndex.push(iter.position);
                }
            }
        }
        finally {
            // Reset even when a transformer throws, so state accumulated for
            // this input cannot leak into the next call.
            transformers.resetAll();
        }
        return [transformedToOrigIndex, transformed];
    }
    /**
     * Validates the blacklisted terms and compiles each pattern to a RegExp.
     *
     * @param terms - Iterable of `{ id, pattern }` objects.
     * @returns Array of `{ id, regExp }` objects in the same order.
     * @throws Error on a duplicate ID or on a pattern that potentially matches
     * the empty string.
     */
    compileTerms(terms) {
        const seenIds = new Set();
        const compiled = [];
        for (const { id, pattern } of terms) {
            if (seenIds.has(id)) {
                throw new Error(`Duplicate blacklisted term ID ${id}.`);
            }
            if ((0, Util_1.potentiallyMatchesEmptyString)(pattern)) {
                throw new Error(`Pattern with ID ${id} potentially matches empty string; this is unsupported.`);
            }
            seenIds.add(id);
            compiled.push({ id, regExp: (0, Util_1.compilePatternToRegExp)(pattern) });
        }
        return compiled;
    }
    /**
     * Rejects whitelists containing the empty string, which the substring
     * search in getWhitelistedIntervals() cannot handle.
     *
     * @param whitelist - Array of whitelisted terms.
     * @throws Error if any term has length 0.
     */
    validateWhitelistedTerms(whitelist) {
        if (whitelist.some((term) => term.length === 0)) {
            throw new Error('Whitelisted term set contains empty string; this is unsupported.');
        }
    }
}
exports.RegExpMatcher = RegExpMatcher;