/******************************************************************************
* Copyright 2022 TypeFox GmbH
* This program and the accompanying materials are made available under the
* terms of the MIT License, which is available in the project root.
******************************************************************************/
import { GrammarAST, GrammarUtils, RegExpUtils } from 'langium';
import { expandToNode, joinToNode, toString } from 'langium/generate';
import { collectKeywords } from '../langium-util.js';
/**
* Determines whether a given object is a Rule instance
* @param obj Object to check
* @returns Whether this object is a Rule
*/
function isRule(obj) {
return obj.regex !== undefined && obj.action !== undefined;
}
/**
* Generates a Monarch highlighting grammar file's contents, based on the passed Langium grammar
* @param grammar Langium grammar to use in generating this Monarch syntax highlighting file content
* @param config Langium Config to also use during generation
* @returns Generated Monarch syntax highlighting file content
*/
export function generateMonarch(grammar, config) {
const symbols = getSymbols(grammar);
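// operators are the collected symbols minus any containing bracket characters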
const regex = /[{}[\]()]/;
const operators = symbols.filter(s => !regex.test(s));
// build abstract Monarch grammar representation
const monarchGrammar = {
languageDefinition: {
name: config.id, // identifier for generating the grammar export
keywords: getKeywords(grammar),
operators,
symbols,
tokenPostfix: '.' + config.id, // category appended to all tokens
},
tokenizer: {
states: getTokenizerStates(grammar)
}
};
// return concrete monarch grammar representation
return prettyPrint(monarchGrammar);
}
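// Illustrative example (not taken from a real grammar): for a hypothetical
// language `mylang` with keywords `if`/`else`, an `=` operator and `{`/`}`
// braces, the generated file contents look roughly like:
//
//   // Monarch syntax highlighting for the mylang language.
//   export default {
//       keywords: ['if','else'],
//       operators: ['='],
//       symbols: /\{|\}|=/,
//       tokenizer: {
//           initial: [ /* terminal rules, whitespace include, symbol handling */ ],
//           whitespace: [ /* whitespace and comment-start rules */ ],
//           comment: [ /* comment body and end rules */ ]
//       }
//   };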
/**
* Gets Monarch tokenizer states from a Langium grammar
* @param grammar Langium grammar to source tokenizer states from
* @returns Array of tokenizer states
*/
function getTokenizerStates(grammar) {
// initial state, name is arbitrary, just needs to come first
const initialState = {
name: 'initial',
rules: getTerminalRules(grammar)
};
const whitespaceState = {
name: 'whitespace',
rules: getWhitespaceRules(grammar)
};
const commentState = {
name: 'comment',
rules: getCommentRules(grammar)
};
// order the following additional rules so that comment
// start sequences are not misclassified as symbols
// add include for the whitespace state
initialState.rules.push(whitespaceState);
// add operator & symbol case handling
initialState.rules.push({
regex: '@symbols',
action: [
{
guard: '@operators',
action: { token: 'operator' }
},
// by default, leave the symbol alone
{
guard: '@default',
action: { token: '' }
}
]
});
return [
initialState,
whitespaceState,
commentState
];
}
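// Example: once pretty-printed, the symbol-handling rule above comes out roughly as
//   { regex: /@symbols/, action: { cases: { '@operators': {"token":"operator"}, '@default': {"token":""} }} },
// where '@symbols' and '@operators' are Monarch references to the generated
// language definition entries.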
/**
* Pretty prints a monarch grammar into a concrete form, suitable for writing to a file
* @param monarchGrammar Grammar to pretty print
* @returns Monarch grammar in concrete form
*/
function prettyPrint(monarchGrammar) {
const name = monarchGrammar.languageDefinition.name;
const node = expandToNode `
// Monarch syntax highlighting for the ${name} language.
export default {
${prettyPrintLangDef(monarchGrammar.languageDefinition)}
${prettyPrintTokenizer(monarchGrammar.tokenizer)}
};
`.appendNewLine();
return toString(node);
}
/**
* Generates an entry for a language definition, given a name (token category) and values
* @param name Category of language definition to add
* @param values Values to add under the given category
* @returns GeneratorNode containing this printed language definition entry
*/
function genLanguageDefEntry(name, values) {
return expandToNode `
${name}: [
${values.map(v => `'${v}'`).join(',')}
],
`;
}
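// Example: genLanguageDefEntry('keywords', ['if', 'else']) prints roughly as
//   keywords: [
//       'if','else'
//   ],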
/**
* Pretty prints the language definition portion of a Monarch grammar
* @param languageDef LanguageDefinition to pretty print
* @param node Existing generator node to append printed language definition to
*/
function prettyPrintLangDef(languageDef) {
return expandToNode `
${genLanguageDefEntry('keywords', languageDef.keywords)}
${genLanguageDefEntry('operators', languageDef.operators)}
${ /* special case, identify symbols via singular regex*/undefined}
symbols: ${new RegExp(languageDef.symbols.map(RegExpUtils.escapeRegExp).join('|')).toString()},
`;
}
/**
* Pretty prints the tokenizer portion of a Monarch grammar file
* @param tokenizer Tokenizer portion to print out
* @param node Existing generator node to append printed tokenizer to
*/
function prettyPrintTokenizer(tokenizer) {
return expandToNode `
tokenizer: {
${joinToNode(tokenizer.states, prettyPrintState, { appendNewLineIfNotEmpty: true })}
}
`;
}
/**
* Pretty prints a tokenizer state, composed of various rules
* @param state Tokenizer state to pretty print
* @param node Existing generator node to append printed state to
*/
function prettyPrintState(state) {
return expandToNode `
${state.name}: [
${joinToNode(state.rules, prettyPrintRule, { appendNewLineIfNotEmpty: true })}
],
`;
}
/**
* Pretty prints a Rule.
* This can either be a literal rule to match w/ an action, or a reference to a state to include here
* @param ruleOrState Rule to pretty print. If it's a state, we include that state's contents implicitly within this context.
* @returns Generator node containing this printed rule
*/
function prettyPrintRule(ruleOrState) {
if (isRule(ruleOrState)) {
// extract rule pattern, either just a string or a regex w/ parts
const rulePatt = ruleOrState.regex instanceof RegExp ? ruleOrState.regex : new RegExp(ruleOrState.regex);
return expandToNode `{ regex: ${rulePatt.toString()}, action: ${prettyPrintAction(ruleOrState.action)} },`;
}
else {
// include another state by name, implicitly includes all of its contents
return expandToNode `{ include: '@${ruleOrState.name}' },`;
}
}
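// Example: a literal rule prints roughly as
//   { regex: /\s+/, action: {"token":"white"} },
// while a state reference prints as
//   { include: '@whitespace' },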
/**
* Pretty prints the action of a Rule
* @param action Action to print. Can have several keywords to control what the state machine should do next.
* @returns Action in concrete form
*/
function prettyPrintAction(action) {
if (!Array.isArray(action)) {
// plain action
return JSON.stringify(action);
}
else {
// array of cases, each with an action
const prettyCases = action.map(c => `'${c.guard}': ` + prettyPrintAction(c.action)).join(', ');
return '{ cases: { ' + prettyCases + ' }}';
}
}
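// Example: a plain action prints as {"token":"comment"}, while an array of
// guarded cases prints roughly as
//   { cases: { '@keywords': {"token":"keyword"}, '@default': {"token":"ID"} }}
// (with 'ID' standing in for whatever default token name applies).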
/**
* Extracts Monarch token name from a Langium terminal rule, using either name or type.
* @param rule Rule to convert to a Monarch token name
* @returns The equivalent Monarch token name, or the original rule name
*/
function getMonarchTokenName(rule) {
if (rule.name.toLowerCase() === 'string') {
// string is identified as a terminal by name, but not necessarily by type
return 'string';
}
else if (rule.type) {
// use rule type
return rule.type.name;
}
else {
// fallback to the original name
return rule.name;
}
}
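// Example: a terminal named STRING maps to 'string', a terminal declared with a
// return type maps to that type's name, and any other terminal (e.g. ID) keeps
// its rule name.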
/**
* Gets whitespace rules from the Langium grammar. Includes comment start sequences.
* @param grammar Langium grammar to extract whitespace rules from
* @returns Array of Monarch whitespace rules
*/
function getWhitespaceRules(grammar) {
const rules = [];
for (const rule of grammar.rules) {
if (GrammarAST.isTerminalRule(rule)) {
const regex = GrammarUtils.terminalRegex(rule);
if (!GrammarUtils.isCommentTerminal(rule) && !RegExpUtils.isWhitespace(regex)) {
// skip rules that are not comments or whitespace
continue;
}
// token name is either comment or whitespace
const tokenName = GrammarUtils.isCommentTerminal(rule) ? 'comment' : 'white';
const part = RegExpUtils.getTerminalParts(regex)[0];
// check if this is a comment terminal w/ a start & end sequence (multi-line)
if (part && part.start !== '' && part.end !== '' && GrammarUtils.isCommentTerminal(rule)) {
// state-based comment rule, only add push to jump into it
rules.push({
regex: part.start,
action: { token: tokenName, next: '@' + tokenName }
});
}
else {
// single regex rule, generally for whitespace
rules.push({
regex: regex,
action: { token: tokenName }
});
}
}
}
return rules;
}
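// Illustrative example, assuming the common default terminals
//   hidden terminal WS: /\s+/;
//   hidden terminal ML_COMMENT: /\/\*[\s\S]*?\*\//;
//   hidden terminal SL_COMMENT: /\/\/[^\n\r]*/;
// this yields roughly
//   { regex: /\s+/, action: { token: 'white' } }
//   { regex: /\/\*/, action: { token: 'comment', next: '@comment' } }
//   { regex: /\/\/[^\n\r]*/, action: { token: 'comment' } }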
/**
* Gets comment state rules from the Langium grammar.
* Accounts for multi-line comments, but without nesting.
* @param grammar Langium grammar to extract comment rules from
* @returns Array of Monarch comment rules
*/
function getCommentRules(grammar) {
const rules = [];
for (const rule of grammar.rules) {
if (GrammarAST.isTerminalRule(rule) && GrammarUtils.isCommentTerminal(rule)) {
const tokenName = 'comment';
const part = RegExpUtils.getTerminalParts(GrammarUtils.terminalRegex(rule))[0];
if (part && part.start !== '' && part.end !== '') {
// rules to manage comment start/end
// rule order matters
const start = part.start;
const end = part.end;
// 1st, add anything that's not in the start sequence
rules.push({
regex: `[^${start}]+`,
action: { token: tokenName }
});
// 2nd, end of sequence, pop this state, keeping others on the stack
rules.push({
regex: end,
action: { token: tokenName, next: '@pop' }
});
// 3rd, otherwise, start sequence characters are OK in this state
rules.push({
regex: `[${start}]`,
action: { token: tokenName }
});
}
}
}
return rules;
}
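// Illustrative example: for a multi-line comment terminal delimited by /* and */,
// the comment state contains roughly
//   { regex: /[^\/\*]+/, action: { token: 'comment' } }            // comment body
//   { regex: /\*\//, action: { token: 'comment', next: '@pop' } }  // comment end
//   { regex: /[\/\*]/, action: { token: 'comment' } }              // stray '/' or '*'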
/**
* Retrieves non-comment terminal rules, creating associated actions for them
* @param grammar Grammar to get non-comment terminals from
* @returns Array of Rules to add to a Monarch tokenizer state
*/
function getTerminalRules(grammar) {
const rules = [];
for (const rule of grammar.rules) {
if (GrammarAST.isTerminalRule(rule) && !GrammarUtils.isCommentTerminal(rule)) {
const regex = GrammarUtils.terminalRegex(rule);
if (RegExpUtils.isWhitespace(regex)) {
// disallow terminal rules that match whitespace
continue;
}
const tokenName = getMonarchTokenName(rule);
// default action...
let action = { token: tokenName };
if (getKeywords(grammar).some(keyword => regex.test(keyword))) {
// this rule overlaps with at least one keyword
// add case so keywords aren't tagged incorrectly as this token type
action = [{
guard: '@keywords',
action: { token: 'keyword' }
},
{
guard: '@default',
action // include default action from above
}];
}
rules.push({
regex,
action
});
}
}
return rules;
}
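// Illustrative example: an ID terminal such as
//   terminal ID: /[_a-zA-Z][\w_]*/;
// usually overlaps with keywords, so its generated rule prints roughly as
//   { regex: /[_a-zA-Z][\w_]*/, action: { cases: { '@keywords': {"token":"keyword"}, '@default': {"token":"ID"} }} },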
/**
* Regex used to distinguish keyword-like terminals (containing letters) from symbol terminals
*/
const KeywordRegex = /[A-Za-z]/;
/**
* Retrieves keywords from the current grammar
* @param grammar Grammar to get keywords from
* @returns Array of keywords
*/
function getKeywords(grammar) {
return collectKeywords(grammar).filter(kw => KeywordRegex.test(kw));
}
/**
* Retrieves symbols from the Langium grammar
* @param grammar Grammar to get symbols from
* @returns Array of symbols, effectively the inverse of getKeywords
*/
function getSymbols(grammar) {
return collectKeywords(grammar).filter(kw => !KeywordRegex.test(kw));
}
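// Example: if collectKeywords(grammar) returns ['if', 'else', '{', '}', '='],
// getKeywords yields ['if', 'else'] and getSymbols yields ['{', '}', '='].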
//# sourceMappingURL=monarch-generator.js.map