UNPKG

obscenity

Version:

Robust, extensible profanity filter.

183 lines (182 loc) 8.57 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.RegExpMatcher = void 0; const Util_1 = require("../../pattern/Util"); const TransformerSet_1 = require("../../transformer/TransformerSet"); const Char_1 = require("../../util/Char"); const CharacterIterator_1 = require("../../util/CharacterIterator"); const IntervalCollection_1 = require("../IntervalCollection"); const MatchPayload_1 = require("../MatchPayload"); /** * An implementation of the [[Matcher]] interface using regular expressions and * string searching methods. */ class RegExpMatcher { blacklistedTerms; whitelistedTerms; blacklistMatcherTransformers; whitelistMatcherTransformers; /** * Creates a new [[RegExpMatcher]] with the options given. * * @example * ```typescript * // Use the options provided by the English preset. * const matcher = new RegExpMatcher({ * ...englishDataset.build(), * ...englishRecommendedTransformers, * }); * ``` * @example * ```typescript * // Simple matcher that only has blacklisted patterns. * const matcher = new RegExpMatcher({ * blacklistedTerms: assignIncrementingIds([ * pattern`fuck`, * pattern`f?uck`, // wildcards (?) * pattern`bitch`, * pattern`b[i]tch` // optionals ([i] matches either "i" or "") * ]), * }); * * // Check whether some string matches any of the patterns. * const doesMatch = matcher.hasMatch('fuck you bitch'); * ``` * @example * ```typescript * // A more advanced example, with transformers and whitelisted terms. * const matcher = new RegExpMatcher({ * blacklistedTerms: [ * { id: 1, pattern: pattern`penis` }, * { id: 2, pattern: pattern`fuck` }, * ], * whitelistedTerms: ['pen is'], * blacklistMatcherTransformers: [ * resolveConfusablesTransformer(), // '🅰' => 'a' * resolveLeetSpeakTransformer(), // '$' => 's' * foldAsciiCharCaseTransformer(), // case insensitive matching * skipNonAlphabeticTransformer(), // 'f.u...c.k' => 'fuck' * collapseDuplicatesTransformer(), // 'aaaa' => 'a' * ], * }); * * // Output all matches. * console.log(matcher.getAllMatches('fu.....uuuuCK the pen is mightier than the sword!')); * ``` * @param options - Options to use. */ constructor({ blacklistedTerms, whitelistedTerms = [], blacklistMatcherTransformers = [], whitelistMatcherTransformers = [], }) { this.blacklistedTerms = this.compileTerms(blacklistedTerms); this.validateWhitelistedTerms(whitelistedTerms); this.whitelistedTerms = whitelistedTerms; this.blacklistMatcherTransformers = new TransformerSet_1.TransformerSet(blacklistMatcherTransformers); this.whitelistMatcherTransformers = new TransformerSet_1.TransformerSet(whitelistMatcherTransformers); } getAllMatches(input, sorted = false) { const whitelistedIntervals = this.getWhitelistedIntervals(input); const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers); const matches = []; for (const blacklistedTerm of this.blacklistedTerms) { for (const match of transformed.matchAll(blacklistedTerm.regExp)) { const origStartIndex = transformedToOrigIndex[match.index]; let origEndIndex = transformedToOrigIndex[match.index + match[0].length - 1]; // End index is (unfortunately) inclusive, so adjust as necessary. if (origEndIndex < input.length - 1 && // not the last character (0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) && // character is a high surrogate (0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1)) // next character is a low surrogate ) { origEndIndex++; } if (!whitelistedIntervals.query(origStartIndex, origEndIndex)) { matches.push({ termId: blacklistedTerm.id, startIndex: origStartIndex, endIndex: origEndIndex, matchLength: [...match[0]].length, }); } } } if (sorted) matches.sort(MatchPayload_1.compareMatchByPositionAndId); return matches; } hasMatch(input) { const whitelistedIntervals = this.getWhitelistedIntervals(input); const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.blacklistMatcherTransformers); for (const blacklistedTerm of this.blacklistedTerms) { for (const match of transformed.matchAll(blacklistedTerm.regExp)) { const origStartIndex = transformedToOrigIndex[match.index]; let origEndIndex = transformedToOrigIndex[match.index + match[0].length - 1]; // End index is (unfortunately) inclusive, so adjust as necessary. if (origEndIndex < input.length - 1 && // not the last character (0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) && // character is a high surrogate (0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1)) // next character is a low surrogate ) { origEndIndex++; } if (!whitelistedIntervals.query(origStartIndex, origEndIndex)) return true; } } return false; } getWhitelistedIntervals(input) { const matches = new IntervalCollection_1.IntervalCollection(); const [transformedToOrigIndex, transformed] = this.applyTransformers(input, this.whitelistMatcherTransformers); for (const whitelistedTerm of this.whitelistedTerms) { let lastEnd = 0; for (let startIndex = transformed.indexOf(whitelistedTerm, lastEnd); startIndex !== -1; startIndex = transformed.indexOf(whitelistedTerm, lastEnd)) { let origEndIndex = transformedToOrigIndex[startIndex + whitelistedTerm.length - 1]; // End index is (unfortunately) inclusive, so adjust as necessary. if (origEndIndex < input.length - 1 && // not the last character (0, Char_1.isHighSurrogate)(input.charCodeAt(origEndIndex)) && // character is a high surrogate (0, Char_1.isLowSurrogate)(input.charCodeAt(origEndIndex + 1)) // next character is a low surrogate ) { origEndIndex++; } matches.insert(transformedToOrigIndex[startIndex], origEndIndex); lastEnd = startIndex + whitelistedTerm.length; } } return matches; } applyTransformers(input, transformers) { const transformedToOrigIndex = []; let transformed = ''; const iter = new CharacterIterator_1.CharacterIterator(input); for (const char of iter) { const transformedChar = transformers.applyTo(char); if (transformedChar !== undefined) { transformed += String.fromCodePoint(transformedChar); while (transformedToOrigIndex.length < transformed.length) transformedToOrigIndex.push(iter.position); } } transformers.resetAll(); return [transformedToOrigIndex, transformed]; } compileTerms(terms) { const compiled = []; const seenIds = new Set(); for (const term of terms) { if (seenIds.has(term.id)) throw new Error(`Duplicate blacklisted term ID ${term.id}.`); if ((0, Util_1.potentiallyMatchesEmptyString)(term.pattern)) { throw new Error(`Pattern with ID ${term.id} potentially matches empty string; this is unsupported.`); } compiled.push({ id: term.id, regExp: (0, Util_1.compilePatternToRegExp)(term.pattern), }); seenIds.add(term.id); } return compiled; } validateWhitelistedTerms(whitelist) { if (whitelist.some((term) => term.length === 0)) { throw new Error('Whitelisted term set contains empty string; this is unsupported.'); } } } exports.RegExpMatcher = RegExpMatcher;