UNPKG

obscenity

Version:

Robust, extensible profanity filter.

191 lines (190 loc) 7.11 kB
"use strict"; Object.defineProperty(exports, "__esModule", { value: true }); exports.Parser = void 0; const Char_1 = require("../util/Char"); const CharacterIterator_1 = require("../util/CharacterIterator"); const Nodes_1 = require("./Nodes"); const ParserError_1 = require("./ParserError"); const supportsEscaping = [ 92 /* CharacterCode.Backslash */, 91 /* CharacterCode.LeftSquareBracket */, 93 /* CharacterCode.RightSquareBracket */, 63 /* CharacterCode.QuestionMark */, 124 /* CharacterCode.VerticalBar */, ]; const supportsEscapingList = supportsEscaping.map((char) => `'${String.fromCodePoint(char)}'`).join(', '); const eof = -1; class Parser { input = ''; line = 1; column = 1; position = 0; lastColumn = 1; lastWidth = 0; parse(input) { this.setInput(input); const nodes = []; const firstNode = this.nextNode(); const requireWordBoundaryAtStart = firstNode?.kind === Nodes_1.SyntaxKind.BoundaryAssertion; if (firstNode && !requireWordBoundaryAtStart) nodes.push(firstNode); let requireWordBoundaryAtEnd = false; while (!this.done) { const pos = this.mark(); const node = this.nextNode(); if (node.kind !== Nodes_1.SyntaxKind.BoundaryAssertion) { nodes.push(node); continue; } if (!this.done) { this.reportError('Boundary assertions are not supported in this position; they are only allowed at the start / end of the pattern.', pos); } requireWordBoundaryAtEnd = true; } return { requireWordBoundaryAtStart, requireWordBoundaryAtEnd, nodes }; } setInput(input) { this.input = input; this.line = 1; this.column = 1; this.position = 0; this.lastColumn = 1; this.lastWidth = 0; return this; } nextNode() { switch (this.peek()) { case eof: return undefined; case 91 /* CharacterCode.LeftSquareBracket */: return this.parseOptional(); case 93 /* CharacterCode.RightSquareBracket */: this.reportError(`Unexpected ']' with no corresponding '['.`); case 63 /* CharacterCode.QuestionMark */: return this.parseWildcard(); case 124 /* CharacterCode.VerticalBar */: return this.parseBoundaryAssertion(); default: return this.parseLiteral(); } } get done() { return this.position >= this.input.length; } // Optional ::= '[' Wildcard | Text ']' parseOptional() { const preOpenBracketPos = this.mark(); this.next(); // '[' const postOpenBracketPos = this.mark(); if (this.done) this.reportError("Unexpected unclosed '['.", preOpenBracketPos); if (this.accept('[')) this.reportError('Unexpected nested optional node.', postOpenBracketPos); const childNode = this.nextNode(); if (childNode.kind === Nodes_1.SyntaxKind.BoundaryAssertion) { this.reportError('Boundary assertions are not supported in this position; they are only allowed at the start / end of the pattern.', postOpenBracketPos); } if (!this.accept(']')) this.reportError("Unexpected unclosed '['."); return { kind: Nodes_1.SyntaxKind.Optional, childNode: childNode }; } // Wildcard ::= '?' parseWildcard() { this.next(); // '?' return { kind: Nodes_1.SyntaxKind.Wildcard }; } // BoundaryAssertion ::= '|' parseBoundaryAssertion() { this.next(); // '|' return { kind: Nodes_1.SyntaxKind.BoundaryAssertion }; } // Literal ::= (NON_SPECIAL | '\' SUPPORTS_ESCAPING)+ // NON_SPECIAL ::= _any character other than '\', '?', '[', ']', or '|'_ // SUPPORTS_ESCAPING ::= '\' | '[' | ']' | '?' | '|' parseLiteral() { const chars = []; while (!this.done) { if (this.accept('[]?|')) { this.backup(); break; } const next = this.next(); if (next === 92 /* CharacterCode.Backslash */) { if (this.done) { this.backup(); this.reportError('Unexpected trailing backslash.'); } // Can we escape the next character? const escaped = this.next(); if (!supportsEscaping.includes(escaped)) { const repr = String.fromCodePoint(escaped); this.backup(); this.reportError(`Cannot escape character '${repr}'; the only characters that can be escaped are the following: ${supportsEscapingList}.`); } chars.push(escaped); } else { chars.push(next); } } return { kind: Nodes_1.SyntaxKind.Literal, chars }; } reportError(message, { line = this.line, column = this.column } = {}) { throw new ParserError_1.ParserError(message, line, column); } // Marks the current position. mark() { return { line: this.line, column: this.column }; } // Accepts any code point in the charset provided. Iff accepted, the character is consumed. accept(charset) { const next = this.next(); const iter = new CharacterIterator_1.CharacterIterator(charset); for (const char of iter) { if (char === next) return true; } this.backup(); return false; } // Reads one code point from the input, without consuming it. peek() { const next = this.next(); this.backup(); return next; } // Consumes one code point from the input. next() { if (this.done) return eof; const char = this.input.charCodeAt(this.position++); this.lastWidth = 1; if (char === 10 /* CharacterCode.Newline */) { this.lastColumn = this.column; this.column = 1; this.line++; return char; } this.lastColumn = this.column++; if (!(0, Char_1.isHighSurrogate)(char) || this.done) return char; // Do we have a surrogate pair? const next = this.input.charCodeAt(this.position); if ((0, Char_1.isLowSurrogate)(next)) { this.position++; this.lastWidth++; return (0, Char_1.convertSurrogatePairToCodePoint)(char, next); } return char; } // Steps back one character; can only be called once per call to next(). backup() { this.position -= this.lastWidth; this.column = this.lastColumn; // Adjust line count if needed. if (this.lastWidth === 1 && this.input.charCodeAt(this.position) === 10 /* CharacterCode.Newline */) { this.line--; } } } exports.Parser = Parser;