UNPKG

langium

Version:

A language engineering tool for the Language Server Protocol

183 lines 8.14 kB
/****************************************************************************** * Copyright 2024 TypeFox GmbH * This program and the accompanying materials are made available under the * terms of the MIT License, which is available in the project root. ******************************************************************************/ import type { CustomPatternMatcherFunc, TokenType, IToken, TokenVocabulary } from 'chevrotain'; import type { Grammar, TerminalRule } from '../languages/generated/ast.js'; import type { LexingReport, TokenBuilderOptions } from './token-builder.js'; import type { LexerResult, TokenizeOptions } from './lexer.js'; import type { LangiumCoreServices } from '../services.js'; import { DefaultTokenBuilder } from './token-builder.js'; import { DefaultLexer } from './lexer.js'; type IndentationAwareDelimiter<TokenName extends string> = [begin: TokenName, end: TokenName]; export interface IndentationTokenBuilderOptions<TerminalName extends string = string, KeywordName extends string = string> { /** * The name of the token used to denote indentation in the grammar. * A possible definition in the grammar could look like this: * ```langium * terminal INDENT: ':synthetic-indent:'; * ``` * * @default 'INDENT' */ indentTokenName: TerminalName; /** * The name of the token used to denote deindentation in the grammar. * A possible definition in the grammar could look like this: * ```langium * terminal DEDENT: ':synthetic-dedent:'; * ``` * * @default 'DEDENT' */ dedentTokenName: TerminalName; /** * The name of the token used to denote whitespace other than indentation and newlines in the grammar. * A possible definition in the grammar could look like this: * ```langium * hidden terminal WS: /[ \t]+/; * ``` * * @default 'WS' */ whitespaceTokenName: TerminalName; /** * The delimiter tokens inside of which indentation should be ignored and treated as normal whitespace. * For example, Python doesn't treat any whitespace between `(` and `)` as significant. * * Can be either terminal tokens or keyword tokens. * * @default [] */ ignoreIndentationDelimiters: Array<IndentationAwareDelimiter<TerminalName | KeywordName>>; } export declare const indentationBuilderDefaultOptions: IndentationTokenBuilderOptions; export declare enum LexingMode { REGULAR = "indentation-sensitive", IGNORE_INDENTATION = "ignore-indentation" } export interface IndentationLexingReport extends LexingReport { /** Dedent tokens that are necessary to close the remaining indents. */ remainingDedents: IToken[]; } /** * A token builder that is sensitive to indentation in the input text. * It will generate tokens for indentation and dedentation based on the indentation level. * * The first generic parameter corresponds to the names of terminal tokens, * while the second one corresponds to the names of keyword tokens. * Both parameters are optional and can be imported from `./generated/ast.js`. * * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js */ export declare class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordName extends string = string> extends DefaultTokenBuilder { /** * The stack stores all the previously matched indentation levels to understand how deeply the next tokens are nested. * The stack is valid for lexing */ protected indentationStack: number[]; readonly options: IndentationTokenBuilderOptions<Terminals, KeywordName>; /** * The token type to be used for indentation tokens */ readonly indentTokenType: TokenType; /** * The token type to be used for dedentation tokens */ readonly dedentTokenType: TokenType; /** * A regular expression to match a series of tabs and/or spaces. * Override this to customize what the indentation is allowed to consist of. */ protected whitespaceRegExp: RegExp; constructor(options?: Partial<IndentationTokenBuilderOptions<NoInfer<Terminals>, NoInfer<KeywordName>>>); buildTokens(grammar: Grammar, options?: TokenBuilderOptions | undefined): TokenVocabulary; flushLexingReport(text: string): IndentationLexingReport; /** * Helper function to check if the current position is the start of a new line. * * @param text The full input string. * @param offset The current position at which to check * @returns Whether the current position is the start of a new line */ protected isStartOfLine(text: string, offset: number): boolean; /** * A helper function used in matching both indents and dedents. * * @param text The full input string. * @param offset The current position at which to attempt a match * @param tokens Previously scanned tokens * @param groups Token Groups * @returns The current and previous indentation levels and the matched whitespace */ protected matchWhitespace(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): { currIndentLevel: number; prevIndentLevel: number; match: RegExpExecArray | null; }; /** * Helper function to create an instance of an indentation token. * * @param tokenType Indent or dedent token type * @param text Full input string, used to calculate the line number * @param image The original image of the token (tabs or spaces) * @param offset Current position in the input string * @returns The indentation token instance */ protected createIndentationTokenInstance(tokenType: TokenType, text: string, image: string, offset: number): IToken; /** * Helper function to get the line number at a given offset. * * @param text Full input string, used to calculate the line number * @param offset Current position in the input string * @returns The line number at the given offset */ protected getLineNumber(text: string, offset: number): number; /** * A custom pattern for matching indents * * @param text The full input string. * @param offset The offset at which to attempt a match * @param tokens Previously scanned tokens * @param groups Token Groups */ protected indentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc>; /** * A custom pattern for matching dedents * * @param text The full input string. * @param offset The offset at which to attempt a match * @param tokens Previously scanned tokens * @param groups Token Groups */ protected dedentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc>; protected buildTerminalToken(terminal: TerminalRule): TokenType; /** * Resets the indentation stack between different runs of the lexer * * @param text Full text that was tokenized * @returns Remaining dedent tokens to match all previous indents at the end of the file */ flushRemainingDedents(text: string): IToken[]; } /** * A lexer that is aware of indentation in the input text. * The only purpose of this lexer is to reset the internal state of the {@link IndentationAwareTokenBuilder} * between the tokenization of different text inputs. * * In your module, you can override the default lexer with this one as such: * ```ts * parser: { * TokenBuilder: () => new IndentationAwareTokenBuilder(), * Lexer: (services) => new IndentationAwareLexer(services), * } * ``` */ export declare class IndentationAwareLexer extends DefaultLexer { protected readonly indentationTokenBuilder: IndentationAwareTokenBuilder; constructor(services: LangiumCoreServices); tokenize(text: string, options?: TokenizeOptions): LexerResult; } export {}; //# sourceMappingURL=indentation-aware.d.ts.map