UNPKG

@adguard/agtree

Version:
280 lines (277 loc) 10.6 kB
/*
 * AGTree v3.2.2 (build date: Tue, 08 Jul 2025 13:39:47 GMT)
 * (c) 2025 Adguard Software Ltd.
 * Released under the MIT license
 * https://github.com/AdguardTeam/tsurlfilter/tree/master/packages/agtree#readme
 */
import { StringUtils } from '../../utils/string.js';
import { OperatorValue } from '../../nodes/index.js';
import { UNDERSCORE, OPEN_PARENTHESIS, CLOSE_PARENTHESIS, AMPERSAND, PIPE, EXCLAMATION_MARK } from '../../utils/constants.js';
import { AdblockSyntaxError } from '../../errors/adblock-syntax-error.js';
import { defaultParserOptions } from '../options.js';
import { BaseParser } from '../base-parser.js';

/**
 * Possible token types in the logical expression.
 */
const TokenType = {
    Variable: 0,
    Operator: 1,
    Parenthesis: 2,
};

/**
 * Possible node types in the logical expression.
 */
const NodeType = {
    Variable: 'Variable',
    Operator: 'Operator',
    Parenthesis: 'Parenthesis',
};

/**
 * Precedence of the operators, larger number means higher precedence.
 */
const OPERATOR_PRECEDENCE = {
    [OperatorValue.Not]: 3,
    [OperatorValue.And]: 2,
    [OperatorValue.Or]: 1,
};

/**
 * `LogicalExpressionParser` is responsible for parsing logical expressions.
 *
 * @example
 * From the following rule:
 * ```adblock
 * !#if (adguard_ext_android_cb || adguard_ext_safari)
 * ```
 * this parser will parse the expression `(adguard_ext_android_cb || adguard_ext_safari)`.
 */
// TODO: Refactor this class
class LogicalExpressionParser extends BaseParser {
    /**
     * Split the expression into tokens.
     *
     * Tokens only store their type and their `start` / `end` offsets inside `raw`
     * (`end` is exclusive); `baseOffset` is used solely for error locations.
     *
     * @param raw Source code of the expression
     * @param baseOffset Starting offset of the input. Node locations are calculated relative to this offset.
     * @returns Token list
     * @throws {AdblockSyntaxError} If the expression is invalid
     */
    static tokenize(raw, baseOffset = 0) {
        const tokens = [];
        let offset = 0;

        while (offset < raw.length) {
            const char = raw[offset];

            if (StringUtils.isWhitespace(char)) {
                // Ignore whitespace
                offset += 1;
            } else if (StringUtils.isLetter(char)) {
                // Save the start offset of the variable name
                const nameStart = offset;

                // Variable name shouldn't start with a number or underscore,
                // but can contain them
                while (
                    offset + 1 < raw.length
                    && (StringUtils.isAlphaNumeric(raw[offset + 1]) || raw[offset + 1] === UNDERSCORE)
                ) {
                    offset += 1;
                }

                tokens.push({
                    type: TokenType.Variable,
                    start: nameStart,
                    end: offset + 1,
                });

                offset += 1;
            } else if (char === OPEN_PARENTHESIS || char === CLOSE_PARENTHESIS) {
                // Parenthesis
                tokens.push({
                    type: TokenType.Parenthesis,
                    start: offset,
                    end: offset + 1,
                });

                offset += 1;
            } else if (char === AMPERSAND || char === PIPE) {
                // Binary operators are written as a doubled character (`&&`, `||`);
                // a single `&` or `|` is rejected
                if (offset + 1 < raw.length && raw[offset + 1] === char) {
                    tokens.push({
                        type: TokenType.Operator,
                        start: offset,
                        end: offset + 2,
                    });

                    offset += 2;
                } else {
                    throw new AdblockSyntaxError(`Unexpected character "${char}"`, baseOffset + offset, baseOffset + offset + 1);
                }
            } else if (char === EXCLAMATION_MARK) {
                // Single-character operator
                tokens.push({
                    type: TokenType.Operator,
                    start: offset,
                    end: offset + 1,
                });

                offset += 1;
            } else {
                throw new AdblockSyntaxError(`Unexpected character "${char}"`, baseOffset + offset, baseOffset + offset + 1);
            }
        }

        return tokens;
    }

    /**
     * Parses a logical expression.
     *
     * The input is tokenized first, then parsed recursively, with binary operators
     * grouped according to `OPERATOR_PRECEDENCE`. `start` / `end` locations are only
     * attached to the nodes when `options.isLocIncluded` is truthy.
     *
     * @param raw Raw input to parse.
     * @param options Global parser options.
     * @param baseOffset Starting offset of the input. Node locations are calculated relative to this offset.
     * @returns Parsed expression
     * @throws {AdblockSyntaxError} If the expression is invalid
     */
    // TODO: Create a separate TokenStream class
    static parse(raw, options = defaultParserOptions, baseOffset = 0) {
        // Tokenize the source (produces an array of tokens)
        const tokens = LogicalExpressionParser.tokenize(raw, baseOffset);

        // Current token index
        let tokenIndex = 0;

        /**
         * Consumes a token of the expected type.
         *
         * @param type Expected token type
         * @returns The consumed token
         * @throws {AdblockSyntaxError} If there are no more tokens or the next token has a different type
         */
        function consume(type) {
            const token = tokens[tokenIndex];

            if (!token) {
                throw new AdblockSyntaxError(`Expected token of type "${type}", but reached end of input`, baseOffset, baseOffset + raw.length);
            }

            // We only use this function internally, so we can safely ignore this
            // from the coverage report
            // istanbul ignore next
            if (token.type !== type) {
                throw new AdblockSyntaxError(`Expected token of type "${type}", but got "${token.type}"`, baseOffset + token.start, baseOffset + token.end);
            }

            tokenIndex += 1;

            return token;
        }

        /**
         * Parses a variable.
         *
         * @returns Variable node
         */
        function parseVariable() {
            const token = consume(TokenType.Variable);

            const result = {
                type: NodeType.Variable,
                name: raw.slice(token.start, token.end),
            };

            if (options.isLocIncluded) {
                result.start = baseOffset + token.start;
                result.end = baseOffset + token.end;
            }

            return result;
        }

        /**
         * Parses a binary expression.
         *
         * Keeps folding `<operator> <right operand>` pairs into `left` while the next
         * token is an operator whose precedence is not lower than `minPrecedence`.
         *
         * @param left Left-hand side of the expression
         * @param minPrecedence Minimum precedence of the operator
         * @returns Binary expression node
         * @throws {AdblockSyntaxError} If the unary `!` operator is found in binary position
         */
        function parseBinaryExpression(left, minPrecedence = 0) {
            let node = left;
            let operatorToken = tokens[tokenIndex];

            while (operatorToken && operatorToken.type === TokenType.Operator) {
                // Operator tokens are only produced for `&&`, `||` and `!`
                const operator = raw.slice(operatorToken.start, operatorToken.end);

                // `!` is a prefix operator: it cannot join two operands (e.g. `a ! b`)
                if (operator === OperatorValue.Not) {
                    throw new AdblockSyntaxError(`Unexpected token "${operator}"`, baseOffset + operatorToken.start, baseOffset + operatorToken.end);
                }

                const precedence = OPERATOR_PRECEDENCE[operator];

                // Operators with lower precedence are handled by an outer call
                if (precedence < minPrecedence) {
                    break;
                }

                tokenIndex += 1;

                // `precedence + 1` makes operators of the same precedence left-associative
                // eslint-disable-next-line @typescript-eslint/no-use-before-define
                const right = parseExpression(precedence + 1);

                const newNode = {
                    type: NodeType.Operator,
                    operator,
                    left: node,
                    right,
                };

                if (options.isLocIncluded) {
                    // Operand locations are already shifted by `baseOffset`
                    newNode.start = node.start ?? baseOffset + operatorToken.start;
                    newNode.end = right.end ?? baseOffset + operatorToken.end;
                }

                node = newNode;
                operatorToken = tokens[tokenIndex];
            }

            return node;
        }

        /**
         * Parses a parenthesized expression.
         *
         * The caller guarantees that the current token is an opening parenthesis.
         *
         * @returns Parenthesized expression node
         * @throws {AdblockSyntaxError} If the expression is not terminated by a closing parenthesis
         */
        function parseParenthesizedExpression() {
            consume(TokenType.Parenthesis);

            // eslint-disable-next-line @typescript-eslint/no-use-before-define
            const expression = parseExpression();

            const closingToken = consume(TokenType.Parenthesis);
            const closingValue = raw.slice(closingToken.start, closingToken.end);

            // The tokenizer uses a single type for both `(` and `)`, so make sure
            // the group is really closed (rejects inputs like `(a(`)
            if (closingValue !== CLOSE_PARENTHESIS) {
                throw new AdblockSyntaxError(`Unexpected token "${closingValue}"`, baseOffset + closingToken.start, baseOffset + closingToken.end);
            }

            const result = {
                type: NodeType.Parenthesis,
                expression,
            };

            if (options.isLocIncluded) {
                // The node location mirrors the inner expression, so it does not
                // cover the parentheses themselves
                result.start = expression.start;
                result.end = expression.end;
            }

            return result;
        }

        /**
         * Parses an expression.
         *
         * @param minPrecedence Minimum precedence of the operator
         * @returns Expression node
         * @throws {AdblockSyntaxError} If the input ends where an operand is expected,
         * or if the current token cannot start an expression
         */
        function parseExpression(minPrecedence = 0) {
            let node;

            const token = tokens[tokenIndex];

            // Empty input, or input that ends where an operand is required
            // (e.g. `a &&`, `!`, `(`)
            if (!token) {
                throw new AdblockSyntaxError('Unexpected end of expression', baseOffset, baseOffset + raw.length);
            }

            const value = raw.slice(token.start, token.end);

            if (token.type === TokenType.Variable) {
                node = parseVariable();
            } else if (token.type === TokenType.Operator && value === OperatorValue.Not) {
                tokenIndex += 1;

                // Parse the operand with the precedence of `!`, so lower-precedence
                // binary operators are not absorbed into the negated operand
                const expression = parseExpression(OPERATOR_PRECEDENCE[OperatorValue.Not]);

                node = {
                    type: NodeType.Operator,
                    operator: OperatorValue.Not,
                    left: expression,
                };

                if (options.isLocIncluded) {
                    if (expression.end) {
                        node.start = baseOffset + token.start;
                        // no need to shift the node location, because it's already shifted
                        node.end = expression.end;
                    } else {
                        node.start = baseOffset + token.start;
                        node.end = baseOffset + token.end;
                    }
                }
            } else if (token.type === TokenType.Parenthesis && value === OPEN_PARENTHESIS) {
                node = parseParenthesizedExpression();
            } else {
                throw new AdblockSyntaxError(`Unexpected token "${value}"`, baseOffset + token.start, baseOffset + token.end);
            }

            return parseBinaryExpression(node, minPrecedence);
        }

        const expression = parseExpression();

        // Every token must be consumed, otherwise there is trailing garbage
        if (tokenIndex !== tokens.length) {
            throw new AdblockSyntaxError(`Unexpected token "${tokens[tokenIndex].type}"`, baseOffset + tokens[tokenIndex].start, baseOffset + tokens[tokenIndex].end);
        }

        return expression;
    }
}

export { LogicalExpressionParser, NodeType };