UNPKG

@adguard/agtree

Version:
370 lines (367 loc) 18.6 kB
/* * AGTree v3.2.2 (build date: Tue, 08 Jul 2025 13:39:47 GMT) * (c) 2025 Adguard Software Ltd. * Released under the MIT license * https://github.com/AdguardTeam/tsurlfilter/tree/master/packages/agtree#readme */ import { TokenType, getFormattedTokenName } from '@adguard/css-tokenizer'; import { sprintf } from 'sprintf-js'; import { AdblockSyntaxError } from '../../errors/adblock-syntax-error.js'; import { EMPTY, COLON, OPEN_PARENTHESIS, CSS_NOT_PSEUDO, CLOSE_PARENTHESIS } from '../../utils/constants.js'; import { tokenizeFnBalanced } from './balancing.js'; import { defaultParserOptions } from '../options.js'; import { BaseParser } from '../base-parser.js'; import { UboPseudoName } from '../../common/ubo-selector-common.js'; /** * @file Parser for special uBO selectors. */ /** * Possible error messages for uBO selectors. Formatted with {@link sprintf}. */ const ERROR_MESSAGES = { DUPLICATED_UBO_MODIFIER: "uBO modifier '%s' cannot be used more than once", EXPECTED_BUT_GOT_BEFORE: "Expected '%s' but got '%s' before '%s'", // eslint-disable-next-line max-len NEGATED_UBO_MODIFIER_CANNOT_BE_FOLLOWED_BY: "Negated uBO modifier '%s' cannot be followed by anything else than a closing parenthesis or a whitespace", NEGATED_UBO_MODIFIER_CANNOT_BE_PRECEDED_BY: "Negated uBO modifier '%s' cannot be preceded by '%s'", PSEUDO_CANNOT_BE_NESTED: "uBO modifier '%s' cannot be nested inside '%s', only '%s' is allowed as a wrapper", UBO_MODIFIER_CANNOT_BE_NESTED: "uBO modifier '%s' cannot be nested", UBO_STYLE_CANNOT_BE_FOLLOWED: 'uBO style injection cannot be followed by anything else than a whitespace', }; /** * Dummy parameter for uBO modifiers in error messages. */ const DUMMY_PARAM = '...'; /** * Set of known uBO modifiers. * * @note We use `string` instead of `UboPseudoName` because we use this set for checking if a modifier is a known uBO, * and an unknown sequence is just a string. */ const KNOWN_UBO_MODIFIERS = new Set([ UboPseudoName.MatchesMedia, UboPseudoName.MatchesPath, UboPseudoName.Remove, UboPseudoName.Style, ]); /** * Helper function to check if the given selector has any uBO modifier. This function should be fast, because it's used * in the hot path of the parser. * * @param raw Raw selector string. * @returns `true` if the selector has any uBO modifier, `false` otherwise. */ const hasAnyUboModifier = (raw) => { // Find the first colon let colonIndex = raw.indexOf(COLON); while (colonIndex !== -1) { // Find next opening parenthesis const openingParenthesisIndex = raw.indexOf(OPEN_PARENTHESIS, colonIndex + 1); // If there is no opening parenthesis, then the selector doesn't contain any uBO modifier if (openingParenthesisIndex === -1) { return false; } // Check if the modifier is a known uBO modifier if (KNOWN_UBO_MODIFIERS.has(raw.slice(colonIndex + 1, openingParenthesisIndex))) { return true; } // Find next colon colonIndex = raw.indexOf(COLON, colonIndex + 1); } return false; }; /** * A simple helper function to format a pseudo name for error messages. * * @param name Pseudo name. * @param wrapper Wrapper pseudo name (eg. `not`) (optional, defaults to `undefined`). * @returns Formatted pseudo name. * @example * ```ts * formatPseudoName('matches-path', 'not'); // => ':not(:matches-path(...))' * formatPseudoName('matches-media'); // => ':matches-media(...)' * ``` */ const formatPseudoName = (name, wrapper) => { const result = []; if (wrapper) { result.push(COLON, wrapper, OPEN_PARENTHESIS); } result.push(COLON, name, OPEN_PARENTHESIS, DUMMY_PARAM, CLOSE_PARENTHESIS); if (wrapper) { result.push(CLOSE_PARENTHESIS); } return result.join(EMPTY); }; /** * Parser for uBO selectors. */ class UboSelectorParser extends BaseParser { /** * Parses a uBO selector list, eg. `div:matches-path(/path)`. * * @param raw Raw input to parse. * @param options Global parser options. * @param baseOffset Starting offset of the input. Node locations are calculated relative to this offset. * * @returns Parsed uBO selector {@link UboSelectorParser}. * @throws An {@link AdblockSyntaxError} if the selector list is syntactically invalid. */ static parse(raw, options = defaultParserOptions, baseOffset = 0) { // Prepare helper variables const modifiers = { type: 'ModifierList', children: [], }; if (options.isLocIncluded) { modifiers.start = baseOffset; modifiers.end = baseOffset + raw.length; } // Do not perform any parsing if the selector doesn't contain any uBO modifier // Parsing is a relatively expensive operation, but this check is cheap, so we can avoid unnecessary work // TODO: Move this check to the cosmetic parser (adjustable syntaxes - if uBO syntax is disabled, then we don't // need to check for uBO modifiers) if (!hasAnyUboModifier(raw)) { const selector = { type: 'Value', value: raw, }; if (options.isLocIncluded) { selector.start = baseOffset; selector.end = baseOffset + raw.length; } const result = { type: 'UboSelector', selector, modifiers, }; if (options.isLocIncluded) { result.start = baseOffset; result.end = baseOffset + raw.length; } return result; } // Simple way to check if a modifier is already processed to avoid duplicate modifiers const processedModifiers = new Set(); // We need to keep track of the tokens for handling negations properly const tokens = []; // This array is used to mark the character slots in the selector string that are occupied by uBO modifiers const uboIndexes = new Array(raw.length); const uboModifierStack = []; let i = 0; // Helper function to stack a uBO modifier const stackModifier = (modifier) => { if (processedModifiers.has(modifier.name)) { throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.DUPLICATED_UBO_MODIFIER, formatPseudoName(modifier.name)), baseOffset + modifier.modifierStart, baseOffset + raw.length); } uboModifierStack.push(modifier); }; // Tokenize the selector, calculate the balance tokenizeFnBalanced(raw, (type, start, end, _, balance) => { // Special case: style injection (`:style(...)` and `:remove()`) can only be used at the end of the // selector, like // - `div:style(...)`, // - `div:matches-media(...):style(...)`, // - `div:remove()`, // etc. // // But not like // - `:style(...) div`, // - `:matches-media(...):style(...) div`, // - `:remove() div`, // etc. // // The one exception is whitespace, which is allowed after style injection, like // - `div:style(...) `, // - `div:matches-media(...):style(...) `, // - `div:remove() `, // etc. if ((processedModifiers.has(UboPseudoName.Style) || processedModifiers.has(UboPseudoName.Remove)) && type !== TokenType.Whitespace) { throw new AdblockSyntaxError(ERROR_MESSAGES.UBO_STYLE_CANNOT_BE_FOLLOWED, baseOffset + start, baseOffset + raw.length); } // Check for pseudo classes (colon followed by a function) if (tokens[i - 1]?.type === TokenType.Colon && type === TokenType.Function) { // Since closing parenthesis is always included in the function token, but we only need the function // name, we need to cut off the last character, this is why we use `end - 1` here const fn = raw.slice(start, end - 1); // Check if the pseudo class is a known uBO modifier if (KNOWN_UBO_MODIFIERS.has(fn)) { // Generally, uBO modifiers cannot be nested, like // - `:any(:matches-media(...))`, // - `:matches-media(:matches-media(...))`, // - `:not(style(...))`, // etc. if (balance > 1) { // However, we have one exception: `:matches-path()` can be nested inside `:not()`s, like: // - `:not(:matches-path(...))`, // - `:not(:not(:matches-path(...)))`, // etc. // // But it can't be nested inside any other pseudo class, like: // - `:anything(:matches-path(...))`, // etc. // // Moreover, :not() can't contain any other data, like // - `:not(div:matches-path(...))`, // - `:not(:matches-path(...):matches-path(...))`, // - `:not(:matches-path(...) div)`, // etc. if (fn === UboPseudoName.MatchesPath) { if (uboModifierStack.length > 0) { throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.PSEUDO_CANNOT_BE_NESTED, formatPseudoName(UboPseudoName.MatchesPath), formatPseudoName(uboModifierStack[uboModifierStack.length - 1].name), formatPseudoName(CSS_NOT_PSEUDO)), baseOffset + start - 1, baseOffset + raw.length); } let isException = false; let modifierBalance = balance; let modifierStart = start; for (let j = i - 1; j >= 0; j -= 1) { // If we have reached the root level, then we should check if the `not` function is // preceded by a colon (which means that it's a pseudo class) if (tokens[j].balance === 0) { modifierStart = tokens[j].start; modifierBalance = tokens[j].balance; break; } else if (tokens[j].type === TokenType.Colon || tokens[j].type === TokenType.Whitespace) { continue; } else if (tokens[j].type === TokenType.Function) { const wrapperFnName = raw.slice(tokens[j].start, tokens[j].end - 1); if (wrapperFnName !== CSS_NOT_PSEUDO) { throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.PSEUDO_CANNOT_BE_NESTED, formatPseudoName(UboPseudoName.MatchesPath), formatPseudoName(wrapperFnName), formatPseudoName(CSS_NOT_PSEUDO)), baseOffset + tokens[j].start - 1, baseOffset + raw.length); } if (tokens[j - 1]?.type !== TokenType.Colon) { const got = tokens[j - 1]?.type ? getFormattedTokenName(tokens[j - 1]?.type) : 'nothing'; throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.EXPECTED_BUT_GOT_BEFORE, getFormattedTokenName(TokenType.Colon), got, formatPseudoName(UboPseudoName.MatchesPath, CSS_NOT_PSEUDO)), // eslint-disable-next-line no-unsafe-optional-chaining baseOffset + tokens[j - 1]?.start || 0, baseOffset + raw.length); } isException = !isException; continue; } else { throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.NEGATED_UBO_MODIFIER_CANNOT_BE_PRECEDED_BY, formatPseudoName(UboPseudoName.MatchesPath), getFormattedTokenName(tokens[j].type)), baseOffset + tokens[j].start, baseOffset + raw.length); } } stackModifier({ name: fn, modifierStart, modifierBalance, nameStart: start, nameEnd: end - 1, // ignore opening parenthesis valueStart: end, valueBalance: balance, isException, }); } else { throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.UBO_MODIFIER_CANNOT_BE_NESTED, formatPseudoName(fn)), baseOffset + start - 1, baseOffset + raw.length); } } else { stackModifier({ name: fn, modifierStart: start - 1, // Include the colon modifierBalance: balance, nameStart: start, nameEnd: end - 1, // ignore opening parenthesis valueStart: end, valueBalance: balance, isException: false, }); } } } else { // Get the last stacked modifier const lastStackedModifier = uboModifierStack[uboModifierStack.length - 1]; // Do not allow any other token after `:matches-path(...)` inside `:not(...)` if (lastStackedModifier?.name === UboPseudoName.MatchesPath && lastStackedModifier?.isException) { if (!(type === TokenType.CloseParenthesis || type === TokenType.Whitespace) && balance < lastStackedModifier.valueBalance) { throw new AdblockSyntaxError(sprintf(ERROR_MESSAGES.NEGATED_UBO_MODIFIER_CANNOT_BE_FOLLOWED_BY, formatPseudoName(UboPseudoName.MatchesPath), getFormattedTokenName(type)), baseOffset + start, baseOffset + raw.length); } } // If we have reached a closing parenthesis, then we should check if it closes the last stacked modifier // and if so, pop it from the stack if (type === TokenType.CloseParenthesis && lastStackedModifier) { if (balance === Math.max(0, lastStackedModifier.valueBalance - 1)) { lastStackedModifier.valueEnd = start; } if (balance === Math.max(0, lastStackedModifier.modifierBalance - 1)) { const modifierName = { type: 'Value', value: lastStackedModifier.name, }; if (options.isLocIncluded) { // TODO: Refactor modifierName.start = baseOffset + lastStackedModifier.nameStart; modifierName.end = baseOffset + lastStackedModifier.nameEnd; } const value = { type: 'Value', value: raw.slice(lastStackedModifier.valueStart, lastStackedModifier.valueEnd), }; if (options.isLocIncluded) { value.start = baseOffset + lastStackedModifier.valueStart; // It's safe to use `!` here, because we determined the value end index in the // previous `if` statement // eslint-disable-next-line @typescript-eslint/no-non-null-assertion value.end = baseOffset + lastStackedModifier.valueEnd; } const modifier = { type: 'Modifier', name: modifierName, value, exception: lastStackedModifier.isException, }; if (options.isLocIncluded) { modifier.start = baseOffset + lastStackedModifier.modifierStart; modifier.end = baseOffset + end; } modifiers.children.push(modifier); processedModifiers.add(lastStackedModifier.name); uboModifierStack.pop(); // Mark the character slots in the selector string that are occupied by uBO modifiers uboIndexes.fill(true, lastStackedModifier.modifierStart, end); } } } // Save the token to the history and increase the index tokens.push({ type, start, end, balance, }); i += 1; }); const selector = { type: 'Value', value: raw .split(EMPTY) .map((char, p) => (uboIndexes[p] ? EMPTY : char)) .join(EMPTY) .trim(), }; if (options.isLocIncluded) { selector.start = baseOffset; selector.end = baseOffset + raw.length; } const result = { type: 'UboSelector', selector, modifiers, }; if (options.isLocIncluded) { result.start = baseOffset; result.end = baseOffset + raw.length; } return result; } } export { ERROR_MESSAGES, UboSelectorParser, formatPseudoName };