UNPKG

antlr-ng

Version:

Next generation ANTLR Tool

512 lines (511 loc) 17.7 kB
var __defProp = Object.defineProperty; var __name = (target, value) => __defProp(target, "name", { value, configurable: true }); import { fileURLToPath } from "node:url"; import { RuntimeMetaData, Token } from "antlr4ng"; import { NumberRenderer, STGroup, STGroupFile, StringRenderer } from "stringtemplate4ts"; import { ANTLRv4Parser } from "../generated/ANTLRv4Parser.js"; import { CharSupport } from "../misc/CharSupport.js"; import { Utils } from "../misc/Utils.js"; import { Character } from "../support/Character.js"; import { antlrVersion } from "../tool-parameters.js"; import { IssueCode } from "../tool/Issues.js"; import { Grammar } from "../tool/Grammar.js"; import { Rule } from "../tool/Rule.js"; import { UnicodeEscapes } from "./UnicodeEscapes.js"; class Target { constructor(gen) { this.gen = gen; } static { __name(this, "Target"); } static defaultCharValueEscape = /* @__PURE__ */ new Map([ [" ".codePointAt(0), "\\t"], ["\b".codePointAt(0), "\\b"], ["\n".codePointAt(0), "\\n"], ["\r".codePointAt(0), "\\r"], ["\f".codePointAt(0), "\\f"], ["'".codePointAt(0), "\\'"], ['"'.codePointAt(0), '\\"'], ["\\".codePointAt(0), "\\\\"] ]); static languageTemplates = /* @__PURE__ */ new Map(); static addEscapedChar(map, key, representation) { representation = representation ?? key; map.set(key, "\\" + representation); } /** * For pure strings of Unicode char, how can we display it in the target language as a literal. Useful for dumping * predicates and such that may refer to chars that need to be escaped when represented as strings. Also, * templates need to be escaped so that the target language can hold them as a string. Each target can have * a different set in memory at same time. */ getTargetCharValueEscape() { return Target.defaultCharValueEscape; } getCodeGenerator() { return this.gen; } /** * ANTLR tool should check output templates / target are compatible with tool code generation. For now, a simple * string match used on x.y of x.y.z scheme. We use a method to avoid mismatches between a template called * VERSION. This value is checked against Tool.VERSION during load of templates. * * This additional method forces all targets 4.3 and beyond to add this method. */ getVersion() { return antlrVersion; } get templates() { const language = this.gen.language; let templates = Target.languageTemplates.get(language); if (!templates) { const version = this.getVersion(); const theirVersion = RuntimeMetaData.getMajorMinorVersion(version); const ourVersion = RuntimeMetaData.getMajorMinorVersion(antlrVersion); if (theirVersion !== ourVersion) { this.gen.g.tool.errorManager.toolError( IssueCode.IncompatibleToolAndTemplates, version, antlrVersion, language ); } templates = this.loadTemplates(); Target.languageTemplates.set(language, templates); } return templates; } escapeIfNeeded(identifier) { return this.reservedWords.has(identifier) ? this.escapeWord(identifier) : identifier; } /** * Get a meaningful name for a token type useful during code generation. Literals without associated names * are converted to the string equivalent of their integer values. Used to generate x==ID and x==34 type * comparisons etc... Essentially we are looking for the most obvious way to refer to a token type in the * generated code. */ getTokenTypeAsTargetLabel(g, ttype) { const name = this.escapeIfNeeded(g.getTokenName(ttype)); if (Grammar.INVALID_TOKEN_NAME === name) { return String(ttype); } return name; } getTokenTypesAsTargetLabels(g, tokenTypes) { const labels = new Array(tokenTypes.length); for (let i = 0; i < tokenTypes.length; i++) { labels[i] = this.getTokenTypeAsTargetLabel(g, tokenTypes[i]); } return labels; } /** * Given a random string of unicode chars, return a new string with optionally appropriate quote characters for * target language and possibly with some escaped characters. For example, if the incoming string has actual * newline characters, the output of this method would convert them to the two char sequence \n for Java, C, * C++, ... The new string has double-quotes around it as well. Example string in memory: *``` * a"[newlineChar]b'c[carriageReturnChar]d[tab]e\f *``` * would be converted to the valid s: *``` * "a\"\nb'c\rd\te\\f" *``` * or *``` * a\"\nb'c\rd\te\\f *``` * depending on the quoted arg. */ getTargetStringLiteralFromString(s, quoted) { quoted ??= true; let result = ""; if (quoted) { result += '"'; } for (let i = 0; i < s.length; ) { const c = s.codePointAt(i); const escaped = c <= Character.MAX_VALUE ? this.getTargetCharValueEscape()?.get(Number(c)) : void 0; if (c !== 39 && escaped) { result += escaped; } else if (this.shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(c)) { result += this.createUnicodeEscapedCodePoint(i); } else { result += String.fromCodePoint(c); } i += Character.charCount(c); } if (quoted) { result += '"'; } return result; } /** * Convert from an ANTLR string literal found in a grammar file to an equivalent string literal in the target * language. * * For Java, this is the translation `'a\n"'` -> `"a\n\""`. Expect single quotes around the incoming literal. * Just flip the quotes and replace double quotes with `\"`. * * Note that we have decided to allow people to use '\"' without penalty, so we must build the target string in * a loop as {@link String.replaceAll} cannot handle both `\"` and `"` without a lot of messing around. */ getTargetStringLiteralFromANTLRStringLiteral(generator, literal, addQuotes, escapeSpecial) { escapeSpecial ??= false; let result = ""; if (addQuotes) { result += '"'; } for (let i = 1; i < literal.length - 1; ) { const codePoint = literal.codePointAt(i); let toAdvance = Character.charCount(codePoint); if (codePoint === 92) { const escapedChar = literal.charAt(i + toAdvance); toAdvance++; switch (escapedChar) { // Pass through any escapes that Java also needs case "n": case "r": case "t": case "b": case "f": case "\\": { if (escapeSpecial && escapedChar !== "\\") { result += "\\"; } result += "\\" + escapedChar; break; } case "u": { if (literal.charAt(i + toAdvance) === "{") { while (literal.charAt(i + toAdvance) !== "}") { ++toAdvance; } ++toAdvance; } else { toAdvance += 4; } if (i + toAdvance <= literal.length) { const fullEscape = literal.substring(i, i + toAdvance); result += this.createUnicodeEscapedCodePoint( CharSupport.getCharValueFromCharInGrammarLiteral(fullEscape), escapeSpecial ); } break; } default: { const codePoint2 = literal.codePointAt(i + toAdvance); if (this.shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(codePoint2)) { result += this.createUnicodeEscapedCodePoint(codePoint2, escapeSpecial); } else { result += escapedChar; } break; } } } else { if (codePoint === 34) { result += '\\"'; } else if (this.shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(codePoint)) { result += this.createUnicodeEscapedCodePoint(codePoint, escapeSpecial); } else { result += String.fromCodePoint(codePoint); } } i += toAdvance; } if (addQuotes) { result += '"'; } return result; } /** Assume 16-bit char. */ encodeInt16AsCharEscape(v) { if (v < Character.MIN_VALUE || v > Character.MAX_VALUE) { throw new Error(`Cannot encode the specified value: ${v}`); } if (this.isATNSerializedAsInts()) { return String(v); } const escaped = this.getTargetCharValueEscape()?.get(v); if (escaped) { return escaped; } switch (Character.getType(v)) { case Character.CONTROL: case Character.LINE_SEPARATOR: case Character.PARAGRAPH_SEPARATOR: { return this.escapeChar(v); } default: { if (v <= 127) { return String.fromCodePoint(v); } return this.escapeChar(v); } } } getLoopLabel(ast) { return "loop" + ast.token.tokenIndex; } getLoopCounter(ast) { return "cnt" + ast.token.tokenIndex; } getListLabel(label) { const st = this.templates.getInstanceOf("ListLabelName"); st.add("label", label); return st.render(); } /** * If we know which actual function, we can provide the actual ctx type. This will contain implicit labels etc... * From outside, though, we see only ParserRuleContext unless there are externally visible stuff like args, locals, * explicit labels, etc... */ getRuleFunctionContextStructName(ruleOrFunction) { const rule = ruleOrFunction instanceof Rule ? ruleOrFunction : ruleOrFunction.rule; if (rule.g.isLexer()) { return this.templates.getInstanceOf("LexerRuleContext").render(); } return Utils.capitalize(rule.name) + this.templates.getInstanceOf("RuleContextNameSuffix").render(); } getAltLabelContextStructName(label) { return Utils.capitalize(label) + this.templates.getInstanceOf("RuleContextNameSuffix").render(); } /** * Should be same for all refs to same token like ctx.ID within single rule function for literals like 'while', * we gen _s<ttype> */ getImplicitTokenLabel(tokenName) { const st = this.templates.getInstanceOf("ImplicitTokenLabel"); const ttype = this.getCodeGenerator().g.getTokenType(tokenName); if (tokenName.startsWith("'")) { return "s" + ttype; } const text = this.getTokenTypeAsTargetLabel(this.getCodeGenerator().g, ttype); st.add("tokenName", text); return st.render(); } /** x=(A|B) */ getImplicitSetLabel(id) { const st = this.templates.getInstanceOf("ImplicitSetLabel"); st.add("id", id); return st.render(); } getImplicitRuleLabel(ruleName) { const st = this.templates.getInstanceOf("ImplicitRuleLabel"); st.add("ruleName", ruleName); return st.render(); } getElementListName(name) { const st = this.templates.getInstanceOf("ElementListName"); st.add("elemName", this.getElementName(name)); return st.render(); } getElementName(name) { if (name === ".") { return "_wild"; } if (this.getCodeGenerator().g.getRule(name) !== null) { return name; } const ttype = this.getCodeGenerator().g.getTokenType(name); if (ttype === Token.INVALID_TYPE) { return name; } return this.getTokenTypeAsTargetLabel(this.getCodeGenerator().g, ttype); } /** * Generate TParser.java and TLexer.java from T.g4 if combined, else just use T.java as output regardless of type. */ getRecognizerFileName(header) { const extST = this.templates.getInstanceOf("codeFileExtension"); const recognizerName = this.gen.g.getRecognizerName(); return recognizerName + extST.render(); } /** * A given grammar T, return the listener name such as TListener.java, if we're using the Java target. */ getListenerFileName(header) { const extST = this.templates.getInstanceOf("codeFileExtension"); const listenerName = this.gen.g.name + "Listener"; return listenerName + extST.render(); } /** * A given grammar T, return the visitor name such as TVisitor.java, if we're using the Java target. */ getVisitorFileName(header) { const extST = this.templates.getInstanceOf("codeFileExtension"); const listenerName = this.gen.g.name + "Visitor"; return listenerName + extST.render(); } /** * A given grammar T, return a blank listener implementation such as TBaseListener.java, if we're using the * Java target. */ getBaseListenerFileName(header) { const extST = this.templates.getInstanceOf("codeFileExtension"); const listenerName = this.gen.g.name + "BaseListener"; return listenerName + extST.render(); } /** * A given grammar T, return a blank listener implementation such as TBaseListener.java, if we're using the * Java target. */ getBaseVisitorFileName(header) { const extST = this.templates.getInstanceOf("codeFileExtension"); const listenerName = this.gen.g.name + "BaseVisitor"; return listenerName + extST.render(); } /** * Gets the maximum number of 16-bit unsigned integers that can be encoded in a single segment (a declaration in * target language) of the serialized ATN. E.g., in C++, a small segment length results in multiple decls like: * * static const int32_t serializedATNSegment1[] = { * 0x7, 0x12, 0x2, 0x13, 0x7, 0x13, 0x2, 0x14, 0x7, 0x14, 0x2, 0x15, 0x7, * 0x15, 0x2, 0x16, 0x7, 0x16, 0x2, 0x17, 0x7, 0x17, 0x2, 0x18, 0x7, * 0x18, 0x2, 0x19, 0x7, 0x19, 0x2, 0x1a, 0x7, 0x1a, 0x2, 0x1b, 0x7, * 0x1b, 0x2, 0x1c, 0x7, 0x1c, 0x2, 0x1d, 0x7, 0x1d, 0x2, 0x1e, 0x7, * 0x1e, 0x2, 0x1f, 0x7, 0x1f, 0x2, 0x20, 0x7, 0x20, 0x2, 0x21, 0x7, * 0x21, 0x2, 0x22, 0x7, 0x22, 0x2, 0x23, 0x7, 0x23, 0x2, 0x24, 0x7, * 0x24, 0x2, 0x25, 0x7, 0x25, 0x2, 0x26, * }; * * instead of one big one. Targets are free to ignore this like JavaScript does. * * This is primarily needed by Java target to limit size of any single ATN string to 65k length. * * {@link SerializedATN.getSegments} * * @returns the serialized ATN segment limit */ getSerializedATNSegmentLimit() { return Number.MAX_VALUE; } /** * How many bits should be used to do inline token type tests? Java assumes a 64-bit word for bitsets. Must be a * valid word size for your target like 8, 16, 32, 64, etc... */ getInlineTestSetWordSize() { return 64; } grammarSymbolCausesIssueInGeneratedCode(idNode) { switch (idNode.parent?.getType()) { case ANTLRv4Parser.ASSIGN: { switch (idNode.parent.parent?.getType()) { case ANTLRv4Parser.ELEMENT_OPTIONS: case ANTLRv4Parser.OPTIONS: { return false; } default: { break; } } break; } case ANTLRv4Parser.AT: case ANTLRv4Parser.ELEMENT_OPTIONS: { return false; } case ANTLRv4Parser.LEXER_ACTION_CALL: { if (idNode.childIndex === 0) { return false; } break; } default: { break; } } return this.reservedWords.has(idNode.getText()); } templatesExist() { return this.loadTemplatesHelper(false) !== void 0; } wantsBaseListener() { return true; } wantsBaseVisitor() { return true; } supportsOverloadedMethods() { return true; } isATNSerializedAsInts() { return true; } needsHeader() { return false; } genFile(g, outputFileST, fileName) { this.getCodeGenerator().write(outputFileST, fileName); } escapeWord(word) { return word + "_"; } /** * Escape the Unicode code point appropriately for this language and append the escaped value to {@code sb}. * It exists for flexibility and backward compatibility with external targets, The static method * {@link UnicodeEscapes.appendEscapedCodePoint(StringBuilder, int, String)} can be used as well * if default escaping method (Java) is used or language is officially supported */ createUnicodeEscapedCodePoint(codePoint, escape) { let result = UnicodeEscapes.escapeCodePoint(codePoint, this.gen.language); if (escape) { result = "\\" + result; } return result; } shouldUseUnicodeEscapeForCodePointInDoubleQuotedString(codePoint) { return codePoint < 32 || codePoint === 92 || codePoint >= 127; } escapeChar(v) { return `\\u${v.toString(16).padStart(4, "0")}`; } loadTemplates() { const result = this.loadTemplatesHelper(true); result.registerRenderer(Number, new NumberRenderer()); result.registerRenderer(String, new StringRenderer()); result.setListener(new class { constructor($outer) { this.$outer = $outer; } compileTimeError(msg) { this.reportError(msg); } runTimeError(msg) { this.reportError(msg); } internalError(msg) { this.reportError(msg); } iOError(msg) { this.reportError(msg); } reportError(msg) { this.$outer.gen.g.tool.errorManager.toolError(IssueCode.StringTemplateWarning, msg.toString()); } }(this)); return result; } loadTemplatesHelper(reportErrorIfFail) { const language = this.gen.language; const groupFileName = fileURLToPath(new URL("../../templates/codegen/" + language + "/" + language + STGroup.GROUP_FILE_EXTENSION, import.meta.url)); try { return new STGroupFile(groupFileName); } catch (e) { if (reportErrorIfFail) { this.gen.g.tool.errorManager.toolError(IssueCode.MissingCodeGenTemplates, e, this.gen.language); } return void 0; } } } export { Target };