langium-cli
Version:
CLI for Langium - the language engineering tool
457 lines (401 loc) • 15.5 kB
text/typescript
/******************************************************************************
* Copyright 2022 TypeFox GmbH
* This program and the accompanying materials are made available under the
* terms of the MIT License, which is available in the project root.
******************************************************************************/
import { type Grammar, GrammarAST, GrammarUtils, RegExpUtils } from 'langium';
import { type Generated, expandToNode, joinToNode, toString } from 'langium/generate';
import type { LangiumLanguageConfig } from '../../package-types.js';
import { collectKeywords } from '../langium-util.js';
/**
 * Monarch language definition: describes the name and the token categories
 * (keywords, operators, symbols) of the target language.
 */
interface LanguageDefinition {
// Language identifier; generateMonarch fills this from `config.id`.
readonly name: string;
// Grammar keywords that contain at least one ASCII letter (see getKeywords).
readonly keywords: string[];
// Those entries of `symbols` that contain none of the bracket characters {}[]().
readonly operators: string[];
// Grammar keywords that contain no ASCII letter (see getSymbols).
readonly symbols: string[];
// Category appended to all tokens; generateMonarch sets it to '.' + config.id.
readonly tokenPostfix: string;
}
/**
 * Monarch tokenizer: an object that holds the states of the tokenizer.
 */
interface Tokenizer {
// All states; each one is printed as a `name: [ ...rules ]` entry by prettyPrintState.
states: State[]
}
/**
 * Name of a tokenizer state, e.g. 'initial', 'whitespace' or 'comment'.
 */
type StateName = string;
/**
 * A tokenizer state: an ordered list of rules which are used to match the input.
 * An entry is either a regular Rule or another State; a State entry is printed as an
 * `include` of that state's name instead of being expanded inline.
 */
interface State {
// Name under which the state is printed and by which other states include it.
name: StateName
// Rules (or included states) in the order in which they are printed.
rules: Array<Rule | State>
}
/**
 * A rule that matches input. Can have either a single action, or an array of cases.
 */
interface Rule {
// Pattern to match; a string is wrapped in `new RegExp(...)` when the rule is printed.
regex: RegExp | string;
// One Action, or guarded Cases (printed as a `cases` object by prettyPrintAction).
action: Action | Case[];
}
/**
 * A case that selects a specific action by matching a guard pattern.
 */
interface Case {
// Guard such as '@keywords', '@operators' or '@default'; printed as a single-quoted key.
guard: string;
// Action that belongs to this guard.
action: Action;
}
/**
 * Type guard that tells a matching Rule apart from an included State.
 * @param obj Entry taken from a state's rule list
 * @returns `true` when both `regex` and `action` are defined on the entry, i.e. it is a Rule
 */
function isRule(obj: State | Rule): obj is Rule {
    // a State has neither of these two properties
    const { regex, action } = obj as Partial<Rule>;
    return regex !== undefined && action !== undefined;
}
/**
 * Name of a token type, such as 'string', 'keyword', 'operator' or 'comment'.
 */
type Token = string;
/**
 * Token class to be used for CSS rendering, such as 'keyword', 'component', or 'type.identifier'.
 */
type TokenClass = string;
/**
 * Next state that follows from an action. Either the name of a state, or a pop or a push
 * of the current state (like for nested block comments).
 * Since StateName is a plain string, this union is effectively `string`; the literals
 * only document the special values.
 */
type NextState = StateName | '@pop' | '@push';
/**
 * An action performed when a rule (or a case) matches a token.
 * It can determine the token class, as well as whether to push/pop a tokenizer state.
 * Plain actions are serialized with JSON.stringify when printed, so only the
 * properties that are actually set appear in the output.
 */
interface Action {
// Token name to assign to the matched text.
token?: Token
// NOTE(review): not set anywhere in the code shown here.
tokenClass?: TokenClass
// State transition to perform after the match, if any.
next?: NextState
// other more advanced states omitted...
}
/**
 * Abstract representation of a Monarch grammar file; prettyPrint turns it into text.
 */
interface MonarchGrammar {
// Name and token categories of the language.
readonly languageDefinition: LanguageDefinition;
// Tokenizer states and their rules.
readonly tokenizer: Tokenizer;
}
/**
 * Produces the contents of a Monarch highlighting grammar file for the given Langium grammar.
 * @param grammar Langium grammar the highlighting definition is derived from
 * @param config Langium language config; its `id` names the language and forms the token postfix
 * @returns Generated Monarch syntax highlighting file content
 */
export function generateMonarch(grammar: Grammar, config: LangiumLanguageConfig): string {
    const allSymbols = getSymbols(grammar);
    // every symbol that contains no bracket character counts as an operator
    const bracketPattern = /[{}[\]()]/;
    const nonBracketSymbols = allSymbols.filter(symbol => !bracketPattern.test(symbol));

    // abstract representation first ...
    const languageDefinition: LanguageDefinition = {
        name: config.id, // identifier for generating the grammar export
        keywords: getKeywords(grammar),
        operators: nonBracketSymbols,
        symbols: allSymbols,
        tokenPostfix: '.' + config.id, // category appended to all tokens
    };
    const tokenizer: Tokenizer = {
        states: getTokenizerStates(grammar)
    };

    // ... then its concrete textual form
    return prettyPrint({ languageDefinition, tokenizer });
}
/**
 * Builds the Monarch tokenizer states for a Langium grammar.
 * The result always consists of three states, in this order: 'initial', 'whitespace', 'comment'.
 * @param grammar Langium grammar to source tokenizer states from
 * @returns Array of tokenizer states
 */
function getTokenizerStates(grammar: Grammar): State[] {
    const terminalRules = getTerminalRules(grammar);
    const whitespace: State = { name: 'whitespace', rules: getWhitespaceRules(grammar) };
    const comment: State = { name: 'comment', rules: getCommentRules(grammar) };

    // operator & symbol handling: operators get their own token,
    // every other symbol is left alone
    const symbolRule: Rule = {
        regex: '@symbols',
        action: [
            { guard: '@operators', action: { token: 'operator' } },
            { guard: '@default', action: { token: '' } }
        ]
    };

    // The initial state's name is arbitrary, it just needs to come first.
    // The whitespace include is placed before the symbol rule to prevent
    // comment sequences being classified as symbols.
    const initial: State = {
        name: 'initial',
        rules: [...terminalRules, whitespace, symbolRule]
    };

    return [initial, whitespace, comment];
}
/**
 * Pretty prints a Monarch grammar into a concrete form, suitable for writing to a file.
 * The result is an `export default { ... }` module that contains the printed language
 * definition followed by the printed tokenizer, preceded by a header comment that
 * names the language, and terminated by a new line.
 * @param monarchGrammar Grammar to pretty print
 * @returns Monarch grammar in concrete form
 */
function prettyPrint(monarchGrammar: MonarchGrammar): string {
const name = monarchGrammar.languageDefinition.name;
// Everything between the backticks is generated output, not code of this file;
// in particular the `// Monarch syntax highlighting ...` line ends up in the result.
const node = expandToNode`
// Monarch syntax highlighting for the ${name} language.
export default {
${prettyPrintLangDef(monarchGrammar.languageDefinition)}
${prettyPrintTokenizer(monarchGrammar.tokenizer)}
};
`.appendNewLine();
// serialize the generator node into the final text
return toString(node);
}
/**
 * Generates an entry for a language definition, given a name (token category) and values.
 * The entry has the form `name: [ 'v1','v2',... ],`, every value being emitted as a
 * single-quoted JavaScript string literal.
 * @param name Category of language definition to add
 * @param values Values to add under the given category
 * @returns GeneratorNode containing this printed language definition entry
 */
function genLanguageDefEntry(name: string, values: string[]): Generated {
    // Backslashes and single quotes have to be escaped, otherwise a value such as `'` or `\`
    // would terminate the literal early (or change its content) in the generated file.
    const quotedValues = values.map(v => `'${v.replace(/[\\']/g, '\\$&')}'`).join(',');
    // The text between the backticks is generated output; its layout is intentionally left as is.
    return expandToNode`
${name}: [
${ quotedValues }
],
`;
}
/**
 * Pretty prints the language definition portion of a Monarch grammar:
 * the `keywords` and `operators` arrays followed by the `symbols` regex.
 * @param languageDef LanguageDefinition to pretty print
 * @returns Generator node containing the printed language definition entries
 */
function prettyPrintLangDef(languageDef: LanguageDefinition): Generated {
// `symbols` is emitted as one regex literal: each symbol is escaped and the results are
// joined with `|`, in the order given by `languageDef.symbols`.
// NOTE(review): for an empty `symbols` array the joined pattern is '' and the emitted
// literal is `/(?:)/`, which matches the empty string - confirm Monarch copes with that.
return expandToNode`
${genLanguageDefEntry('keywords', languageDef.keywords)}
${genLanguageDefEntry('operators', languageDef.operators)}
${/* special case, identify symbols via singular regex*/ undefined}
symbols: ${new RegExp(languageDef.symbols.map(RegExpUtils.escapeRegExp).join('|')).toString()},
`;
}
/**
 * Pretty prints the tokenizer portion of a Monarch grammar file as a
 * `tokenizer: { ... }` object that contains one printed entry per state.
 * @param tokenizer Tokenizer portion to print out
 * @returns Generator node containing the printed tokenizer
 */
function prettyPrintTokenizer(tokenizer: Tokenizer): Generated {
// states are printed in array order, each one via prettyPrintState
return expandToNode`
tokenizer: {
${joinToNode(tokenizer.states, prettyPrintState, { appendNewLineIfNotEmpty: true})}
}
`;
}
/**
 * Pretty prints a tokenizer state as a `name: [ ... ],` entry that is composed of its printed rules.
 * @param state Tokenizer state to pretty print
 * @returns Generator node containing the printed state
 */
function prettyPrintState(state: State): Generated {
// rules are printed in array order, each one via prettyPrintRule
return expandToNode`
${state.name}: [
${joinToNode(state.rules, prettyPrintRule, { appendNewLineIfNotEmpty: true })}
],
`;
}
/**
 * Pretty prints one entry of a state's rule list.
 * A Rule becomes `{ regex: ..., action: ... },`; a State becomes an `include` of that
 * state's name, which implicitly pulls all of its contents into the current context.
 * @param ruleOrState Rule to pretty print, or a State to include by name
 * @returns Generator node containing this printed rule
 */
function prettyPrintRule(ruleOrState: Rule | State): Generated {
    if (!isRule(ruleOrState)) {
        // reference to another state
        return expandToNode`{ include: '@${ruleOrState.name}' },`;
    }

    // the pattern may be given as a plain string, which is turned into a regex first
    const { regex, action } = ruleOrState;
    let pattern: RegExp;
    if (regex instanceof RegExp) {
        pattern = regex;
    } else {
        pattern = new RegExp(regex);
    }
    const printedPattern = pattern.toString();
    return expandToNode`{ regex: ${printedPattern}, action: ${prettyPrintAction(action)} },`;
}
/**
 * Pretty prints the action of a Rule.
 * A plain action is serialized as JSON; an array of cases becomes a `cases` object that
 * maps each single-quoted guard to its printed action.
 * @param action Action to print, or the cases to choose an action from
 * @returns Action in concrete form
 */
function prettyPrintAction(action: Action | Case[]): string {
    if (Array.isArray(action)) {
        // one `'guard': <action>` entry per case
        const entries: string[] = [];
        for (const { guard, action: guardedAction } of action) {
            entries.push(`'${guard}': ${prettyPrintAction(guardedAction)}`);
        }
        return `{ cases: { ${entries.join(', ')} }}`;
    }
    // plain action
    return JSON.stringify(action);
}
/**
 * Determines the Monarch token name for a Langium terminal rule.
 * @param rule Rule to convert to a Monarch token name
 * @returns 'string' if the rule is named "string" (in any casing), otherwise the name of
 * the rule's type if it has one, otherwise the rule's own name
 */
function getMonarchTokenName(rule: GrammarAST.TerminalRule): string {
    // a terminal called "string" is a string by name, whatever its type says
    if (rule.name.toLowerCase() === 'string') {
        return 'string';
    }
    // prefer the declared type, fall back to the original name
    return rule.type ? rule.type.name : rule.name;
}
/**
 * Collects the rules of the 'whitespace' state from the Langium grammar.
 * Only terminals that are comment terminals or whose regex is whitespace contribute.
 * A comment terminal with both a start and an end sequence only contributes its start
 * sequence, which switches to the state named after its token ('@comment'); every other
 * terminal contributes its full regex.
 * @param grammar Langium grammar to extract whitespace rules from
 * @returns Array of Monarch whitespace rules
 */
function getWhitespaceRules(grammar: Grammar): Rule[] {
    const collected: Rule[] = [];
    for (const candidate of grammar.rules) {
        if (!GrammarAST.isTerminalRule(candidate)) {
            continue;
        }
        const pattern = GrammarUtils.terminalRegex(candidate);
        const isComment = GrammarUtils.isCommentTerminal(candidate);
        if (!isComment && !RegExpUtils.isWhitespace(pattern)) {
            // neither a comment nor whitespace
            continue;
        }

        const token = isComment ? 'comment' : 'white';
        const firstPart = RegExpUtils.getTerminalParts(pattern)[0];
        // multi-line comments are recognised by having a start as well as an end sequence
        const isDelimitedComment = isComment && firstPart && firstPart.start !== '' && firstPart.end !== '';

        if (isDelimitedComment) {
            // state-based comment rule: only the jump into the state is added here
            collected.push({
                regex: firstPart.start,
                action: { token, next: '@' + token }
            });
        } else {
            // single regex rule, generally for whitespace
            collected.push({
                regex: pattern,
                action: { token }
            });
        }
    }
    return collected;
}
/**
 * Collects the rules of the 'comment' state from the Langium grammar.
 * Every comment terminal that has a start and an end sequence contributes three rules;
 * multi-line comments are supported, nesting is not.
 * @param grammar Langium grammar to extract comment rules from
 * @returns Array of Monarch comment rules
 */
function getCommentRules(grammar: Grammar): Rule[] {
    const token = 'comment';
    const collected: Rule[] = [];
    for (const candidate of grammar.rules) {
        if (!GrammarAST.isTerminalRule(candidate) || !GrammarUtils.isCommentTerminal(candidate)) {
            continue;
        }
        const delimiters = RegExpUtils.getTerminalParts(GrammarUtils.terminalRegex(candidate))[0];
        if (!delimiters || delimiters.start === '' || delimiters.end === '') {
            continue;
        }
        const { start, end } = delimiters;
        // the order of the three rules matters
        collected.push(
            // 1st: anything that is not made of start sequence characters
            { regex: `[^${start}]+`, action: { token } },
            // 2nd: the end sequence pops this state, keeping others on the stack
            { regex: end, action: { token, next: '@pop' } },
            // 3rd: otherwise, start sequence characters are fine within a comment
            { regex: `[${start}]`, action: { token } }
        );
    }
    return collected;
}
/**
 * Retrieves the non-comment, non-whitespace terminal rules and creates a Monarch rule
 * with an associated action for each of them.
 * If a terminal's regex matches at least one keyword, its action becomes a pair of cases,
 * so that keywords are tagged as 'keyword' and everything else gets the terminal's token.
 * @param grammar Grammar to get non-comment terminals from
 * @returns Array of Rules to add to a Monarch tokenizer state
 */
function getTerminalRules(grammar: Grammar): Rule[] {
    // The keywords depend on the grammar only, so they are collected once
    // instead of once per terminal rule.
    const keywords = getKeywords(grammar);
    const rules: Rule[] = [];
    for (const rule of grammar.rules) {
        if (!GrammarAST.isTerminalRule(rule) || GrammarUtils.isCommentTerminal(rule)) {
            continue;
        }
        const regex = GrammarUtils.terminalRegex(rule);
        if (RegExpUtils.isWhitespace(regex)) {
            // disallow terminal rules that match whitespace
            continue;
        }
        // default action: tag the match with the terminal's own token name
        const defaultAction: Action = { token: getMonarchTokenName(rule) };
        let action: Action | Case[] = defaultAction;
        if (keywords.some(keyword => regex.test(keyword))) {
            // this rule overlaps with at least one keyword,
            // add cases so keywords aren't tagged incorrectly as this token type
            action = [
                { guard: '@keywords', action: { token: 'keyword' } },
                { guard: '@default', action: defaultAction }
            ];
        }
        rules.push({ regex, action });
    }
    return rules;
}
/**
 * Matches any string that contains at least one ASCII letter.
 * getKeywords keeps the grammar keywords that match it, getSymbols keeps those that do not.
 */
const KeywordRegex = /[A-Za-z]/;
/**
 * Retrieves the keywords of the given grammar, i.e. those collected keywords
 * that are matched by KeywordRegex.
 * @param grammar Grammar to get keywords from
 * @returns Array of keywords
 */
function getKeywords(grammar: Grammar): string[] {
    const keywords: string[] = [];
    for (const candidate of collectKeywords(grammar)) {
        if (KeywordRegex.test(candidate)) {
            keywords.push(candidate);
        }
    }
    return keywords;
}
/**
 * Retrieves the symbols of the given grammar, i.e. those collected keywords
 * that are not matched by KeywordRegex (the effective inverse of getKeywords).
 * @param grammar Grammar to get symbols from
 * @returns Array of symbols
 */
function getSymbols(grammar: Grammar): string[] {
    const symbols: string[] = [];
    for (const candidate of collectKeywords(grammar)) {
        if (!KeywordRegex.test(candidate)) {
            symbols.push(candidate);
        }
    }
    return symbols;
}