bitmark-grammar
Version:
532 lines (481 loc) • 17.7 kB
text/typescript
/*!
* Copyright 2016 The ANTLR Project. All rights reserved.
* Licensed under the BSD-3-Clause license. See LICENSE file in the project root for license information.
*/
// CONVERSION complete, Burt Harris 10/14/2016
import { BailErrorStrategy } from "../../BailErrorStrategy";
import { CharStreams } from "../../CharStreams";
import { Chunk } from "./Chunk";
import { CommonTokenStream } from "../../CommonTokenStream";
import { Lexer } from "../../Lexer";
import { ListTokenSource } from "../../ListTokenSource";
import { MultiMap } from "../../misc/MultiMap";
import { NotNull } from "../../Decorators";
import { ParseCancellationException } from "../../misc/ParseCancellationException";
import { Parser } from "../../Parser";
import { ParserInterpreter } from "../../ParserInterpreter";
import { ParserRuleContext } from "../../ParserRuleContext";
import { ParseTree } from "../ParseTree";
import { ParseTreeMatch } from "./ParseTreeMatch";
import { ParseTreePattern } from "./ParseTreePattern";
import { RecognitionException } from "../../RecognitionException";
import { RuleNode } from "../RuleNode";
import { RuleTagToken } from "./RuleTagToken";
import { TagChunk } from "./TagChunk";
import { TerminalNode } from "../TerminalNode";
import { TextChunk } from "./TextChunk";
import { Token } from "../../Token";
import { TokenTagToken } from "./TokenTagToken";
/**
* A tree pattern matching mechanism for ANTLR {@link ParseTree}s.
*
* Patterns are strings of source input text with special tags representing
* token or rule references such as:
*
* ```
* <ID> = <expr>;
* ```
*
* Given a pattern start rule such as `statement`, this object constructs
* a {@link ParseTree} with placeholders for the `ID` and `expr`
* subtree. Then the {@link #match} routines can compare an actual
* {@link ParseTree} from a parse with this pattern. Tag `<ID>` matches
* any `ID` token and tag `<expr>` references the result of the
* `expr` rule (generally an instance of `ExprContext`).
*
* Pattern `x = 0;` is a similar pattern that matches the same pattern
* except that it requires the identifier to be `x` and the expression to
* be `0`.
*
* The {@link #matches} routines return `true` or `false` based
* upon a match for the tree rooted at the parameter sent in. The
* {@link #match} routines return a {@link ParseTreeMatch} object that
* contains the parse tree, the parse tree pattern, and a map from tag name to
* matched nodes (more below). A subtree that fails to match, returns with
* {@link ParseTreeMatch#mismatchedNode} set to the first tree node that did not
* match.
*
* For efficiency, you can compile a tree pattern in string form to a
* {@link ParseTreePattern} object.
*
* See `TestParseTreeMatcher` for lots of examples.
* {@link ParseTreePattern} has two static helper methods:
* {@link ParseTreePattern#findAll} and {@link ParseTreePattern#match} that
* are easy to use but not super efficient because they create new
* {@link ParseTreePatternMatcher} objects each time and have to compile the
* pattern in string form before using it.
*
* The lexer and parser that you pass into the {@link ParseTreePatternMatcher}
* constructor are used to parse the pattern in string form. The lexer converts
* the `<ID> = <expr>;` into a sequence of four tokens (assuming lexer
* throws out whitespace or puts it on a hidden channel). Be aware that the
* input stream is reset for the lexer (but not the parser; a
* {@link ParserInterpreter} is created to parse the input.). Any user-defined
* fields you have put into the lexer might get changed when this mechanism asks
* it to scan the pattern string.
*
* Normally a parser does not accept token `<expr>` as a valid
* `expr` but, from the parser passed in, we create a special version of
* the underlying grammar representation (an {@link ATN}) that allows imaginary
* tokens representing rules (`<expr>`) to match entire rules. We call
* these *bypass alternatives*.
*
* Delimiters are `<` and `>`, with `\` as the escape string
* by default, but you can set them to whatever you want using
* {@link #setDelimiters}. You must escape both start and stop strings
* `\<` and `\>`.
*/
export class ParseTreePatternMatcher {
	/**
	 * This is the backing field for `lexer`.
	 */
	private _lexer: Lexer;

	/**
	 * This is the backing field for `parser`.
	 */
	private _parser: Parser;

	/** Text that opens a tag inside a pattern string. */
	protected start = "<";

	/** Text that closes a tag inside a pattern string. */
	protected stop = ">";

	/** Text that makes a directly following `start` or `stop` literal; e.g., \< and \> must escape BOTH! */
	protected escape = "\\";

	/**
	 * Regular expression corresponding to `escape`, for global replace.
	 */
	protected escapeRE = /\\/g;

	/**
	 * Constructs a {@link ParseTreePatternMatcher} from a {@link Lexer} and
	 * {@link Parser} object. The lexer input stream is altered for tokenizing
	 * the tree patterns. The parser is used as a convenient mechanism to get
	 * the grammar name, plus token, rule names.
	 */
	constructor(lexer: Lexer, parser: Parser) {
		this._lexer = lexer;
		this._parser = parser;
	}

	/**
	 * Set the delimiters used for marking rule and token tags within concrete
	 * syntax used by the tree pattern parser.
	 *
	 * @param start The start delimiter.
	 * @param stop The stop delimiter.
	 * @param escapeLeft The escape sequence to use for escaping a start or stop delimiter.
	 *
	 * @throws {@link Error} if `start` is not defined or empty.
	 * @throws {@link Error} if `stop` is not defined or empty.
	 */
	public setDelimiters(start: string, stop: string, escapeLeft: string): void {
		if (!start) {
			throw new Error("start cannot be null or empty");
		}
		if (!stop) {
			throw new Error("stop cannot be null or empty");
		}

		this.start = start;
		this.stop = stop;
		this.escape = escapeLeft;
		// Quote regex metacharacters so the escape sequence is matched literally.
		this.escapeRE = new RegExp(escapeLeft.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"), "g");
	}

	/** Does `pattern` matched as rule `patternRuleIndex` match `tree`? */
	public matches(tree: ParseTree, pattern: string, patternRuleIndex: number): boolean;
	/** Does `pattern` matched as rule patternRuleIndex match tree? Pass in a
	 * compiled pattern instead of a string representation of a tree pattern.
	 */
	public matches(tree: ParseTree, pattern: ParseTreePattern): boolean;
	// Implementation of matches
	public matches(tree: ParseTree, pattern: string | ParseTreePattern, patternRuleIndex: number = 0): boolean {
		if (typeof pattern === "string") {
			// Compile first, then re-dispatch through the compiled-pattern overload.
			const compiled: ParseTreePattern = this.compile(pattern, patternRuleIndex);
			return this.matches(tree, compiled);
		}

		const labels = new MultiMap<string, ParseTree>();
		const mismatchedNode = this.matchImpl(tree, pattern.patternTree, labels);
		return !mismatchedNode;
	}

	/**
	 * Compare `pattern` matched as rule `patternRuleIndex` against
	 * `tree` and return a {@link ParseTreeMatch} object that contains the
	 * matched elements, or the node at which the match failed.
	 */
	public match(tree: ParseTree, pattern: string, patternRuleIndex: number): ParseTreeMatch;
	/**
	 * Compare `pattern` matched against `tree` and return a
	 * {@link ParseTreeMatch} object that contains the matched elements, or the
	 * node at which the match failed. Pass in a compiled pattern instead of a
	 * string representation of a tree pattern.
	 */
	public match(tree: ParseTree, pattern: ParseTreePattern): ParseTreeMatch;
	// Implementation of match
	public match(tree: ParseTree, pattern: string | ParseTreePattern, patternRuleIndex: number = 0): ParseTreeMatch {
		if (typeof pattern === "string") {
			// Compile first, then re-dispatch through the compiled-pattern overload.
			const compiled: ParseTreePattern = this.compile(pattern, patternRuleIndex);
			return this.match(tree, compiled);
		}

		const labels = new MultiMap<string, ParseTree>();
		const mismatchedNode = this.matchImpl(tree, pattern.patternTree, labels);
		return new ParseTreeMatch(tree, pattern, labels, mismatchedNode);
	}

	/**
	 * For repeated use of a tree pattern, compile it to a
	 * {@link ParseTreePattern} using this method.
	 *
	 * The pattern is tokenized with {@link #tokenize} and parsed, starting at
	 * rule `patternRuleIndex`, by a {@link ParserInterpreter} built from this
	 * matcher's parser and its ATN with bypass alternatives.
	 *
	 * @throws the cause of a {@link ParseCancellationException}, or the
	 * {@link RecognitionException} itself, when the pattern fails to parse.
	 * @throws {@link ParseTreePatternMatcher.CannotInvokeStartRule} wrapping any
	 * other {@link Error} raised while parsing.
	 * @throws {@link ParseTreePatternMatcher.StartRuleDoesNotConsumeFullPattern}
	 * if tokens remain after the start rule has been parsed.
	 */
	public compile(pattern: string, patternRuleIndex: number): ParseTreePattern {
		const tokenList = this.tokenize(pattern);
		const tokenSrc = new ListTokenSource(tokenList);
		const tokens = new CommonTokenStream(tokenSrc);
		const parser = this._parser;

		const parserInterp = new ParserInterpreter(
			parser.grammarFileName,
			parser.vocabulary,
			parser.ruleNames,
			parser.getATNWithBypassAlts(),
			tokens);

		let tree: ParseTree;
		try {
			parserInterp.errorHandler = new BailErrorStrategy();
			tree = parserInterp.parse(patternRuleIndex);
		} catch (e) {
			if (e instanceof ParseCancellationException) {
				// Surface the underlying error rather than the cancellation wrapper.
				throw e.getCause();
			} else if (e instanceof RecognitionException) {
				throw e;
			} else if (e instanceof Error) {
				throw new ParseTreePatternMatcher.CannotInvokeStartRule(e);
			} else {
				throw e;
			}
		}

		// Make sure tree pattern compilation checks for a complete parse
		if (tokens.LA(1) !== Token.EOF) {
			throw new ParseTreePatternMatcher.StartRuleDoesNotConsumeFullPattern();
		}

		return new ParseTreePattern(this, pattern, patternRuleIndex, tree);
	}

	/**
	 * Used to convert the tree pattern string into a series of tokens. The
	 * input stream is reset.
	 */
	get lexer(): Lexer {
		return this._lexer;
	}

	/**
	 * Used to collect the grammar file name, token names, and rule names
	 * needed to parse the pattern into a parse tree.
	 */
	get parser(): Parser {
		return this._parser;
	}

	// ---- SUPPORT CODE ----

	/**
	 * Recursively walk `tree` against `patternTree`, filling
	 * `match.`{@link ParseTreeMatch#labels labels}.
	 *
	 * @returns the first node encountered in `tree` which does not match
	 * a corresponding node in `patternTree`, or `undefined` if the match
	 * was successful. The specific node returned depends on the matching
	 * algorithm used by the implementation, and may be overridden.
	 *
	 * @throws {@link TypeError} if `tree` or `patternTree` is missing.
	 */
	protected matchImpl(
		tree: ParseTree,
		patternTree: ParseTree,
		labels: MultiMap<string, ParseTree>): ParseTree | undefined {
		if (!tree) {
			throw new TypeError("tree cannot be null");
		}
		if (!patternTree) {
			throw new TypeError("patternTree cannot be null");
		}

		// x and <ID>, x and y, or x and x; or could be mismatched types
		if (tree instanceof TerminalNode && patternTree instanceof TerminalNode) {
			const patternSymbol = patternTree.symbol;
			// tokens of different types can never match
			if (tree.symbol.type !== patternSymbol.type) {
				return tree;
			}

			if (patternSymbol instanceof TokenTagToken) { // x and <ID>
				// track label->list-of-nodes for both token name and label (if any)
				labels.map(patternSymbol.tokenName, tree);
				const label = patternSymbol.label;
				if (label) {
					labels.map(label, tree);
				}
				return undefined;
			}

			// x and x match; x and y do not
			return tree.text === patternTree.text ? undefined : tree;
		}

		if (tree instanceof ParserRuleContext && patternTree instanceof ParserRuleContext) {
			// (expr ...) and <expr>
			const ruleTagToken = this.getRuleTagToken(patternTree);
			if (ruleTagToken) {
				if (tree.ruleContext.ruleIndex !== patternTree.ruleContext.ruleIndex) {
					return tree;
				}

				// track label->list-of-nodes for both rule name and label (if any)
				labels.map(ruleTagToken.ruleName, tree);
				const label = ruleTagToken.label;
				if (label) {
					labels.map(label, tree);
				}
				return undefined;
			}

			// (expr ...) and (expr ...)
			const n: number = tree.childCount;
			if (n !== patternTree.childCount) {
				return tree;
			}

			for (let i = 0; i < n; i++) {
				const childMatch = this.matchImpl(tree.getChild(i), patternTree.getChild(i), labels);
				if (childMatch) {
					return childMatch;
				}
			}

			return undefined;
		}

		// if nodes aren't both tokens or both rule nodes, can't match
		return tree;
	}

	/**
	 * Is `t` `(expr <expr>)` subtree?
	 *
	 * @returns the {@link RuleTagToken} held by the single terminal child of
	 * rule node `t`, or `undefined` when `t` does not have that shape.
	 */
	protected getRuleTagToken(t: ParseTree): RuleTagToken | undefined {
		if (t instanceof RuleNode && t.childCount === 1) {
			const child = t.getChild(0);
			if (child instanceof TerminalNode) {
				const symbol = child.symbol;
				if (symbol instanceof RuleTagToken) {
					return symbol;
				}
			}
		}

		return undefined;
	}

	/**
	 * Convert `pattern` into the token list that {@link #compile} parses.
	 *
	 * The pattern is first cut into chunks by {@link #split}. Each tag chunk
	 * becomes a single {@link TokenTagToken} or {@link RuleTagToken}; each text
	 * chunk is fed to the lexer (whose input stream is replaced) and contributes
	 * every token produced before EOF.
	 *
	 * A tag whose first character equals its own upper-case form is looked up as
	 * a token name (this includes characters that have no case, such as digits);
	 * otherwise, if it equals its lower-case form, it is looked up as a rule name.
	 *
	 * @throws {@link Error} if a tag names an unknown token or an unknown rule.
	 */
	public tokenize(pattern: string): Token[] {
		// split pattern into chunks: sea (raw input) and islands (<ID>, <expr>)
		const chunks = this.split(pattern);

		// create token stream from text and tags
		const tokens: Token[] = [];
		for (const chunk of chunks) {
			if (chunk instanceof TagChunk) {
				const tag = chunk.tag;
				const firstChar = tag.charAt(0);

				// add special rule token or conjure up new token from name
				if (firstChar === firstChar.toUpperCase()) {
					const ttype: number = this._parser.getTokenType(tag);
					if (ttype === Token.INVALID_TYPE) {
						throw new Error("Unknown token " + tag + " in pattern: " + pattern);
					}
					tokens.push(new TokenTagToken(tag, ttype, chunk.label));
				} else if (firstChar === firstChar.toLowerCase()) {
					const ruleIndex: number = this._parser.getRuleIndex(tag);
					if (ruleIndex === -1) {
						throw new Error("Unknown rule " + tag + " in pattern: " + pattern);
					}
					// imaginary token type that stands in for the whole rule
					const ruleImaginaryTokenType: number = this._parser.getATNWithBypassAlts().ruleToTokenType[ruleIndex];
					tokens.push(new RuleTagToken(tag, ruleImaginaryTokenType, chunk.label));
				} else {
					throw new Error("invalid tag: " + tag + " in pattern: " + pattern);
				}
			} else {
				const textChunk = chunk as TextChunk;
				this._lexer.inputStream = CharStreams.fromString(textChunk.text);
				let t: Token = this._lexer.nextToken();
				while (t.type !== Token.EOF) {
					tokens.push(t);
					t = this._lexer.nextToken();
				}
			}
		}

		return tokens;
	}

	/**
	 * Split `<ID> = <e:expr> ;` into 4 chunks for tokenizing by {@link #tokenize}.
	 *
	 * Unescaped start/stop delimiters mark tags; a tag of the form `label:name`
	 * yields a {@link TagChunk} carrying both parts. Text between, before and
	 * after tags becomes {@link TextChunk}s, with every occurrence of the escape
	 * sequence removed from text chunks (tags are left untouched).
	 *
	 * @throws {@link Error} if there are more start than stop delimiters
	 * ("unterminated tag"), fewer ("missing start tag"), or a stop delimiter
	 * does not come after its start delimiter ("out of order").
	 */
	public split(pattern: string): Chunk[] {
		const n: number = pattern.length;
		const escapedStart = this.escape + this.start;
		const escapedStop = this.escape + this.stop;

		// find all start and stop indexes first, then collect
		const starts: number[] = [];
		const stops: number[] = [];
		let p = 0;
		while (p < n) {
			// startsWith only inspects position p, so the scan stays linear
			if (pattern.startsWith(escapedStart, p)) {
				p += escapedStart.length;
			} else if (pattern.startsWith(escapedStop, p)) {
				p += escapedStop.length;
			} else if (pattern.startsWith(this.start, p)) {
				starts.push(p);
				p += this.start.length;
			} else if (pattern.startsWith(this.stop, p)) {
				stops.push(p);
				p += this.stop.length;
			} else {
				p++;
			}
		}

		if (starts.length > stops.length) {
			throw new Error("unterminated tag in pattern: " + pattern);
		}
		if (starts.length < stops.length) {
			throw new Error("missing start tag in pattern: " + pattern);
		}

		const ntags: number = starts.length;
		for (let i = 0; i < ntags; i++) {
			if (starts[i] >= stops[i]) {
				throw new Error("tag delimiters out of order in pattern: " + pattern);
			}
		}

		// collect into chunks now
		const chunks: Chunk[] = [];
		if (ntags === 0) {
			chunks.push(new TextChunk(pattern.substring(0, n)));
		}

		if (ntags > 0 && starts[0] > 0) { // copy text up to first tag into chunks
			chunks.push(new TextChunk(pattern.substring(0, starts[0])));
		}

		for (let i = 0; i < ntags; i++) {
			// copy inside of <tag>
			const tag: string = pattern.substring(starts[i] + this.start.length, stops[i]);
			let ruleOrToken: string = tag;
			let label: string | undefined;
			const colon: number = tag.indexOf(":");
			if (colon >= 0) {
				label = tag.substring(0, colon);
				ruleOrToken = tag.substring(colon + 1, tag.length);
			}
			chunks.push(new TagChunk(ruleOrToken, label));

			if (i + 1 < ntags) {
				// copy from end of <tag> to start of next
				chunks.push(new TextChunk(pattern.substring(stops[i] + this.stop.length, starts[i + 1])));
			}
		}

		if (ntags > 0) {
			const afterLastTag: number = stops[ntags - 1] + this.stop.length;
			if (afterLastTag < n) { // copy text from end of last tag to end
				chunks.push(new TextChunk(pattern.substring(afterLastTag, n)));
			}
		}

		// strip out the escape sequences from text chunks but not tags
		for (let i = 0; i < chunks.length; i++) {
			const c: Chunk = chunks[i];
			if (c instanceof TextChunk) {
				const unescaped: string = c.text.replace(this.escapeRE, "");
				if (unescaped.length < c.text.length) {
					chunks[i] = new TextChunk(unescaped);
				}
			}
		}

		return chunks;
	}
}
export namespace ParseTreePatternMatcher {
	/**
	 * Wraps an {@link Error} so callers can tell it apart from other failures.
	 * The wrapped error stays available through `error`, and its string form
	 * is appended to this error's message.
	 */
	export class CannotInvokeStartRule extends Error {
		/** The error this instance wraps. */
		public error: Error;

		public constructor(error: Error) {
			super("CannotInvokeStartRule: " + String(error));
			this.error = error;
		}
	}

	// Fixes https://github.com/antlr/antlr4/issues/413
	// "Tree pattern compilation doesn't check for a complete parse"
	/**
	 * Signals that tokens were left over after the pattern's start rule
	 * finished parsing.
	 */
	export class StartRuleDoesNotConsumeFullPattern extends Error {
		public constructor() {
			super("StartRuleDoesNotConsumeFullPattern");
		}
	}
}