UNPKG

langium-cli

Version:

CLI for Langium - the language engineering tool

336 lines (335 loc) 12.7 kB
/******************************************************************************
 * Copyright 2022 TypeFox GmbH
 * This program and the accompanying materials are made available under the
 * terms of the MIT License, which is available in the project root.
 ******************************************************************************/
import { GrammarAST, GrammarUtils, RegExpUtils } from 'langium';
import { expandToNode, joinToNode, toString } from 'langium/generate';
import { collectKeywords } from '../langium-util.js';

/**
 * Determines whether a given object is a Rule instance
 * @param obj Object to check
 * @returns Whether this object is a Rule
 */
function isRule(obj) {
    return obj.regex !== undefined && obj.action !== undefined;
}

/**
 * Generates a Monarch highlighting grammar file's contents, based on the passed Langium grammar
 * @param grammar Langium grammar to use in generating this Monarch syntax highlighting file content
 * @param config Langium Config to also use during generation
 * @returns Generated Monarch syntax highlighting file content
 */
export function generateMonarch(grammar, config) {
    const symbols = getSymbols(grammar);
    const regex = /[{}[\]()]/;
    // operators are all symbols except bracket/paren/brace characters
    const operators = symbols.filter(s => !regex.test(s));

    // build abstract monarch grammar representation
    const monarchGrammar = {
        languageDefinition: {
            name: config.id, // identifier for generating the grammar export
            keywords: getKeywords(grammar),
            operators,
            symbols,
            tokenPostfix: '.' + config.id, // category appended to all tokens
        },
        tokenizer: {
            states: getTokenizerStates(grammar)
        }
    };

    // return concrete monarch grammar representation
    return prettyPrint(monarchGrammar);
}

/**
 * Gets Monarch tokenizer states from a Langium grammar
 * @param grammar Langium grammar to source tokenizer states from
 * @returns Array of tokenizer states
 */
function getTokenizerStates(grammar) {
    // initial state, name is arbitrary, just needs to come first
    const initialState = {
        name: 'initial',
        rules: getTerminalRules(grammar)
    };
    const whitespaceState = {
        name: 'whitespace',
        rules: getWhitespaceRules(grammar)
    };
    const commentState = {
        name: 'comment',
        rules: getCommentRules(grammar)
    };

    // order the following additional rules, to prevent
    // comment sequences being classified as symbols

    // add include for the whitespace state
    initialState.rules.push(whitespaceState);

    // add operator & symbol case handling
    initialState.rules.push({
        regex: '@symbols',
        action: [
            { guard: '@operators', action: { token: 'operator' } },
            // by default, leave the symbol alone
            { guard: '@default', action: { token: '' } }
        ]
    });

    return [
        initialState,
        whitespaceState,
        commentState
    ];
}

/**
 * Pretty prints a monarch grammar into a concrete form, suitable for writing to a file
 * @param monarchGrammar Grammar to pretty print
 * @returns Monarch grammar in concrete form
 */
function prettyPrint(monarchGrammar) {
    const name = monarchGrammar.languageDefinition.name;
    const node = expandToNode`
        // Monarch syntax highlighting for the ${name} language.
        export default {
            ${prettyPrintLangDef(monarchGrammar.languageDefinition)}
            ${prettyPrintTokenizer(monarchGrammar.tokenizer)}
        };
    `.appendNewLine();
    return toString(node);
}

/**
 * Generates an entry for a language definition, given a name (token category) and values
 * @param name Category of language definition to add
 * @param values Values to add under the given category
 * @returns GeneratorNode containing this printed language definition entry
 */
function genLanguageDefEntry(name, values) {
    return expandToNode`
        ${name}: [
            ${values.map(v => `'${v}'`).join(',')}
        ],
    `;
}

/**
 * Pretty prints the language definition portion of a Monarch grammar
 * @param languageDef LanguageDefinition to pretty print
 * @returns GeneratorNode containing the printed language definition
 */
function prettyPrintLangDef(languageDef) {
    return expandToNode`
        ${genLanguageDefEntry('keywords', languageDef.keywords)}
        ${genLanguageDefEntry('operators', languageDef.operators)}
        ${ /* special case, identify symbols via singular regex*/undefined}
        symbols: ${new RegExp(languageDef.symbols.map(RegExpUtils.escapeRegExp).join('|')).toString()},
    `;
}

/**
 * Pretty prints the tokenizer portion of a Monarch grammar file
 * @param tokenizer Tokenizer portion to print out
 * @returns GeneratorNode containing the printed tokenizer
 */
function prettyPrintTokenizer(tokenizer) {
    return expandToNode`
        tokenizer: {
            ${joinToNode(tokenizer.states, prettyPrintState, { appendNewLineIfNotEmpty: true })}
        }
    `;
}

/**
 * Pretty prints a tokenizer state, composed of various rules
 * @param state Tokenizer state to pretty print
 * @returns GeneratorNode containing the printed state
 */
function prettyPrintState(state) {
    return expandToNode`
        ${state.name}: [
            ${joinToNode(state.rules, prettyPrintRule, { appendNewLineIfNotEmpty: true })}
        ],
    `;
}

/**
 * Pretty prints a Rule.
 * This can either be a literal rule to match w/ an action, or a reference to a state to include here
 * @param ruleOrState Rule to pretty print. If it's a state, we include that state's contents implicitly within this context.
 * @returns Generator node containing this printed rule
 */
function prettyPrintRule(ruleOrState) {
    if (isRule(ruleOrState)) {
        // extract rule pattern, either just a string or a regex w/ parts
        const rulePatt = ruleOrState.regex instanceof RegExp ? ruleOrState.regex : new RegExp(ruleOrState.regex);
        return expandToNode`{ regex: ${rulePatt.toString()}, action: ${prettyPrintAction(ruleOrState.action)} },`;
    } else {
        // include another state by name, implicitly includes all of its contents
        return expandToNode`{ include: '@${ruleOrState.name}' },`;
    }
}

/**
 * Pretty prints the action of a Rule
 * @param action Action to print. Can have several keywords to control what the state machine should do next.
 * @returns Action in concrete form
 */
function prettyPrintAction(action) {
    if (!Array.isArray(action)) {
        // plain action
        return JSON.stringify(action);
    } else {
        // array of cases, each with an action
        const prettyCases = action.map(c => `'${c.guard}': ` + prettyPrintAction(c.action)).join(', ');
        return '{ cases: { ' + prettyCases + ' }}';
    }
}

/**
 * Extracts Monarch token name from a Langium terminal rule, using either name or type.
 * @param rule Rule to convert to a Monarch token name
 * @returns Returns the equivalent monarch token name, or the original rule name
 */
function getMonarchTokenName(rule) {
    if (rule.name.toLowerCase() === 'string') {
        // string is clarified as a terminal by name, but not necessarily by type
        return 'string';
    } else if (rule.type) {
        // use rule type
        return rule.type.name;
    } else {
        // fallback to the original name
        return rule.name;
    }
}

/**
 * Gets whitespace rules from the langium grammar. Includes starting comment sequence.
 * @param grammar Langium grammar to extract whitespace rules from
 * @returns Array of Monarch whitespace rules
 */
function getWhitespaceRules(grammar) {
    const rules = [];
    for (const rule of grammar.rules) {
        if (GrammarAST.isTerminalRule(rule)) {
            const regex = GrammarUtils.terminalRegex(rule);
            if (!GrammarUtils.isCommentTerminal(rule) && !RegExpUtils.isWhitespace(regex)) {
                // skip rules that are not comments or whitespace
                continue;
            }
            // token name is either comment or whitespace
            const tokenName = GrammarUtils.isCommentTerminal(rule) ? 'comment' : 'white';
            const part = RegExpUtils.getTerminalParts(regex)[0];
            // check if this is a comment terminal w/ a start & end sequence (multi-line)
            if (part && part.start !== '' && part.end !== '' && GrammarUtils.isCommentTerminal(rule)) {
                // state-based comment rule, only add push to jump into it
                rules.push({
                    regex: part.start,
                    action: { token: tokenName, next: '@' + tokenName }
                });
            } else {
                // single regex rule, generally for whitespace
                rules.push({
                    regex: regex,
                    action: { token: tokenName }
                });
            }
        }
    }
    return rules;
}

/**
 * Gets comment state rules from the Langium grammar.
 * Accounts for multi-line comments, but without nesting.
 * @param grammar Langium grammar to extract comment rules from
 * @returns Array of Monarch comment rules
 */
function getCommentRules(grammar) {
    const rules = [];
    for (const rule of grammar.rules) {
        if (GrammarAST.isTerminalRule(rule) && GrammarUtils.isCommentTerminal(rule)) {
            const tokenName = 'comment';
            const part = RegExpUtils.getTerminalParts(GrammarUtils.terminalRegex(rule))[0];
            if (part && part.start !== '' && part.end !== '') {
                // rules to manage comment start/end
                // rule order matters
                const start = part.start;
                const end = part.end;
                // 1st, add anything that's not in the start sequence
                rules.push({
                    regex: `[^${start}]+`,
                    action: { token: tokenName }
                });
                // 2nd, end of sequence, pop this state, keeping others on the stack
                rules.push({
                    regex: end,
                    action: { token: tokenName, next: '@pop' }
                });
                // 3rd, otherwise, start sequence characters are OK in this state
                rules.push({
                    regex: `[${start}]`,
                    action: { token: tokenName }
                });
            }
        }
    }
    return rules;
}

/**
 * Retrieves non-comment terminal rules, creating associated actions for them
 * @param grammar Grammar to get non-comment terminals from
 * @returns Array of Rules to add to a Monarch tokenizer state
 */
function getTerminalRules(grammar) {
    const rules = [];
    // hoisted out of the loop: collecting keywords walks the whole grammar,
    // so re-computing it per terminal rule was accidentally quadratic
    const keywords = getKeywords(grammar);
    for (const rule of grammar.rules) {
        if (GrammarAST.isTerminalRule(rule) && !GrammarUtils.isCommentTerminal(rule)) {
            const regex = GrammarUtils.terminalRegex(rule);
            if (RegExpUtils.isWhitespace(regex)) {
                // disallow terminal rules that match whitespace
                continue;
            }
            const tokenName = getMonarchTokenName(rule);
            // default action...
            let action = { token: tokenName };
            if (keywords.some(keyword => regex.test(keyword))) {
                // this rule overlaps with at least one keyword
                // add case so keywords aren't tagged incorrectly as this token type
                action = [{
                        guard: '@keywords',
                        action: { token: 'keyword' }
                    }, {
                        guard: '@default',
                        action // include default action from above
                    }];
            }
            rules.push({
                regex,
                action
            });
        }
    }
    return rules;
}

/**
 * Keyword regex for matching keyword terminals, or for only collecting symbol terminals
 */
const KeywordRegex = /[A-Za-z]/;

/**
 * Retrieves keywords from the current grammar
 * @param grammar Grammar to get keywords from
 * @returns Array of keywords
 */
function getKeywords(grammar) {
    return collectKeywords(grammar).filter(kw => KeywordRegex.test(kw));
}

/**
 * Retrieve symbols from langium grammar
 * @param grammar Grammar to get symbols from
 * @returns Array of symbols, effective inverse of getKeywords
 */
function getSymbols(grammar) {
    return collectKeywords(grammar).filter(kw => !KeywordRegex.test(kw));
}
//# sourceMappingURL=monarch-generator.js.map