// obscenity — Robust, extensible profanity filter.
// Listing metadata: JavaScript, 191 lines (190 loc), 7.11 kB; version not recorded.
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Parser = void 0;
const Char_1 = require("../util/Char");
const CharacterIterator_1 = require("../util/CharacterIterator");
const Nodes_1 = require("./Nodes");
const ParserError_1 = require("./ParserError");
// Code points that may legally follow a backslash in a pattern, in this order:
// backslash (92), left square bracket (91), right square bracket (93),
// question mark (63) and vertical bar (124).
const supportsEscaping = Array.from('\\[]?|', (char) => char.codePointAt(0));
// Human-readable, quoted and comma-separated form of the list above; used in error messages.
const supportsEscapingList = supportsEscaping
    .map((code) => String.fromCodePoint(code))
    .map((char) => `'${char}'`)
    .join(', ');
// Sentinel returned by the parser's reader when no input is left.
const eof = -1;
/**
 * Parser for the pattern syntax.
 *
 * Grammar handled here:
 *   - `[x]`  optional node wrapping a single wildcard or literal
 *   - `?`    wildcard
 *   - `|`    boundary assertion (only accepted as the very first or very last token)
 *   - anything else is literal text; `\` escapes one of `\`, `[`, `]`, `?`, `|`
 *
 * Positions reported in errors are 1-based line/column pairs, where a column
 * advances once per code point (a surrogate pair counts as one column).
 */
class Parser {
    // Pattern text currently being parsed.
    input = '';
    // 1-based line of the next unread character.
    line = 1;
    // 1-based column of the next unread character.
    column = 1;
    // Index (in UTF-16 code units) of the next unread character in `input`.
    position = 0;
    // Column that was current before the most recent next(); restored by backup().
    lastColumn = 1;
    // Number of UTF-16 code units consumed by the most recent next(); undone by backup().
    lastWidth = 0;

    /**
     * Parses a pattern.
     *
     * @param input Pattern source text.
     * @returns An object with `requireWordBoundaryAtStart`, `requireWordBoundaryAtEnd`
     * and `nodes` (the parsed nodes, excluding the boundary assertions themselves).
     * @throws ParserError if the pattern is malformed.
     */
    parse(input) {
        this.setInput(input);
        const nodes = [];
        const firstNode = this.nextNode();
        // A leading boundary assertion is recorded as a flag instead of a node.
        const requireWordBoundaryAtStart = firstNode?.kind === Nodes_1.SyntaxKind.BoundaryAssertion;
        if (firstNode && !requireWordBoundaryAtStart)
            nodes.push(firstNode);
        let requireWordBoundaryAtEnd = false;
        while (!this.done) {
            const pos = this.mark();
            const node = this.nextNode();
            if (node.kind !== Nodes_1.SyntaxKind.BoundaryAssertion) {
                nodes.push(node);
                continue;
            }
            // A boundary assertion past the first token must be the last token.
            if (!this.done) {
                this.reportError('Boundary assertions are not supported in this position; they are only allowed at the start / end of the pattern.', pos);
            }
            requireWordBoundaryAtEnd = true;
        }
        return { requireWordBoundaryAtStart, requireWordBoundaryAtEnd, nodes };
    }

    /**
     * Replaces the input and resets all position tracking.
     *
     * @param input Pattern source text.
     * @returns This parser, for chaining.
     */
    setInput(input) {
        this.input = input;
        this.line = 1;
        this.column = 1;
        this.position = 0;
        this.lastColumn = 1;
        this.lastWidth = 0;
        return this;
    }

    /**
     * Parses the node starting at the current position.
     *
     * @returns The parsed node, or `undefined` when no input is left.
     * @throws ParserError on a stray ']' or any error raised by the sub-parsers.
     */
    nextNode() {
        switch (this.peek()) {
            case eof:
                return undefined;
            case 91 /* CharacterCode.LeftSquareBracket */:
                return this.parseOptional();
            case 93 /* CharacterCode.RightSquareBracket */:
                // reportError always throws; the explicit return makes it clear that
                // control never falls through into the wildcard case below.
                return this.reportError(`Unexpected ']' with no corresponding '['.`);
            case 63 /* CharacterCode.QuestionMark */:
                return this.parseWildcard();
            case 124 /* CharacterCode.VerticalBar */:
                return this.parseBoundaryAssertion();
            default:
                return this.parseLiteral();
        }
    }

    /** True once every character of the input has been consumed. */
    get done() {
        return this.position >= this.input.length;
    }

    // Optional ::= '[' (Wildcard | Literal) ']'
    /**
     * Parses an optional node; the current character must be '['.
     *
     * @throws ParserError if the bracket is never closed, if another '[' directly
     * follows, or if the wrapped node is a boundary assertion.
     */
    parseOptional() {
        const preOpenBracketPos = this.mark();
        this.next(); // '['
        const postOpenBracketPos = this.mark();
        if (this.done)
            this.reportError("Unexpected unclosed '['.", preOpenBracketPos);
        if (this.accept('['))
            this.reportError('Unexpected nested optional node.', postOpenBracketPos);
        const childNode = this.nextNode();
        if (childNode.kind === Nodes_1.SyntaxKind.BoundaryAssertion) {
            this.reportError('Boundary assertions are not supported in this position; they are only allowed at the start / end of the pattern.', postOpenBracketPos);
        }
        if (!this.accept(']'))
            this.reportError("Unexpected unclosed '['.");
        return { kind: Nodes_1.SyntaxKind.Optional, childNode: childNode };
    }

    // Wildcard ::= '?'
    /** Consumes one '?' and returns a wildcard node. */
    parseWildcard() {
        this.next(); // '?'
        return { kind: Nodes_1.SyntaxKind.Wildcard };
    }

    // BoundaryAssertion ::= '|'
    /** Consumes one '|' and returns a boundary assertion node. */
    parseBoundaryAssertion() {
        this.next(); // '|'
        return { kind: Nodes_1.SyntaxKind.BoundaryAssertion };
    }

    // Literal           ::= (NON_SPECIAL | '\' SUPPORTS_ESCAPING)+
    // NON_SPECIAL       ::= _any character other than '\', '?', '[', ']', or '|'_
    // SUPPORTS_ESCAPING ::= '\' | '[' | ']' | '?' | '|'
    /**
     * Parses a run of literal text, resolving backslash escapes.
     *
     * @returns A literal node whose `chars` holds the code points of the text.
     * @throws ParserError on a trailing backslash or an unsupported escape.
     */
    parseLiteral() {
        const chars = [];
        while (!this.done) {
            // An unescaped special character ends the literal; un-read it so the
            // caller sees it.
            if (this.accept('[]?|')) {
                this.backup();
                break;
            }
            const next = this.next();
            if (next === 92 /* CharacterCode.Backslash */) {
                if (this.done) {
                    // Step back so the error points at the backslash itself.
                    this.backup();
                    this.reportError('Unexpected trailing backslash.');
                }
                // Can we escape the next character?
                const escaped = this.next();
                if (!supportsEscaping.includes(escaped)) {
                    const repr = String.fromCodePoint(escaped);
                    // Step back so the error points at the offending character.
                    this.backup();
                    this.reportError(`Cannot escape character '${repr}'; the only characters that can be escaped are the following: ${supportsEscapingList}.`);
                }
                chars.push(escaped);
            }
            else {
                chars.push(next);
            }
        }
        return { kind: Nodes_1.SyntaxKind.Literal, chars };
    }

    /**
     * Throws a ParserError.
     *
     * @param message Description of the problem.
     * @param position Optional `{ line, column }`; each defaults to the current position.
     * @throws ParserError always.
     */
    reportError(message, { line = this.line, column = this.column } = {}) {
        throw new ParserError_1.ParserError(message, line, column);
    }

    // Marks the current position.
    mark() {
        return { line: this.line, column: this.column };
    }

    // Accepts any code point in the charset provided. Iff accepted, the character is consumed.
    accept(charset) {
        const next = this.next();
        const iter = new CharacterIterator_1.CharacterIterator(charset);
        for (const char of iter) {
            if (char === next)
                return true;
        }
        this.backup();
        return false;
    }

    // Reads one code point from the input, without consuming it.
    peek() {
        const next = this.next();
        this.backup();
        return next;
    }

    // Consumes one code point from the input; returns `eof` when none is left.
    next() {
        if (this.done) {
            // Nothing is consumed at end of input. Record a zero-width read so that
            // the backup() issued by peek() / accept() is a no-op instead of
            // rewinding over the previously consumed character.
            this.lastWidth = 0;
            this.lastColumn = this.column;
            return eof;
        }
        const char = this.input.charCodeAt(this.position++);
        this.lastWidth = 1;
        if (char === 10 /* CharacterCode.Newline */) {
            this.lastColumn = this.column;
            this.column = 1;
            this.line++;
            return char;
        }
        this.lastColumn = this.column++;
        if (!(0, Char_1.isHighSurrogate)(char) || this.done)
            return char;
        // Do we have a surrogate pair?
        const next = this.input.charCodeAt(this.position);
        if ((0, Char_1.isLowSurrogate)(next)) {
            this.position++;
            this.lastWidth++;
            return (0, Char_1.convertSurrogatePairToCodePoint)(char, next);
        }
        return char;
    }

    // Steps back one character; can only be called once per call to next().
    backup() {
        this.position -= this.lastWidth;
        this.column = this.lastColumn;
        // Adjust line count if needed.
        if (this.lastWidth === 1 && this.input.charCodeAt(this.position) === 10 /* CharacterCode.Newline */) {
            this.line--;
        }
    }
}
exports.Parser = Parser;