UNPKG

chevrotain

Version:

Chevrotain is a high-performance, fault-tolerant JavaScript parsing DSL for building recursive descent parsers

640 lines (633 loc) 29.2 kB
/*! chevrotain - v0.4.5 - 2015-07-11 */
/**
 * Ambient type declarations for the chevrotain parsing DSL:
 * Tokens, the Lexer, the Parser base class, recognition exceptions and
 * the grammar AST (gast) node types.
 */
declare namespace chevrotain {
    namespace lang {
        class HashTable<V> {}
    }

    /**
     * NOTE(review): presumably returns the name of the given Token constructor — confirm against the implementation.
     */
    function tokenName(clazz: Function): string;

    /**
     * utility to help the poor souls who are still stuck writing pure javascript 5.1
     * extend and create Token subclasses in a less verbose manner
     *
     * @param {string} tokenName the name of the new TokenClass
     * @param {*} patternOrParent either the pattern of the new Token class or the parent Token class
     *                            (as the parameter name suggests — confirm against the implementation)
     * @param {Function} parentConstructor the Token class to be extended
     * @returns {Function} a constructor for the new extended Token subclass
     */
    function extendToken(tokenName: string, patternOrParent?: any, parentConstructor?: Function): any;

    class Token {
        image: string;
        offset: number;
        startLine: number;
        startColumn: number;
        endLine: number;
        endColumn: number;
        isInsertedInRecovery: boolean;

        /**
         * @param {string} image the textual representation of the Token as it appeared in the text
         * @param {number} offset offset of the first character of the Token
         * @param {number} startLine line of the first character of the Token
         * @param {number} startColumn column of the first character of the Token
         * @param {number} endLine line of the last character of the Token
         * @param {number} endColumn column of the last character of the Token
         *
         * Things to note:
         * * "do" {startColumn : 1, endColumn: 2} --> the range is inclusive to exclusive 1...2 (2 chars long).
         * * "\n" {startLine : 1, endLine: 1} --> a lineTerminator as the last character does not affect the Token's line numbering.
         * * "'hello\tworld\uBBBB'" {image: "'hello\tworld\uBBBB'"} --> a Token's image is the "literal" text
         *                                                              (unicode escaping is untouched).
         */
        constructor(image: string, offset: number, startLine: number, startColumn: number, endLine?: number, endColumn?: number);
    }

    /**
     * a special kind of Token which does not really exist in the input
     * (hence the 'Virtual' prefix). These types of Tokens can be used as special markers:
     * for example, EOF (end-of-file).
     */
    class VirtualToken extends Token {
        constructor();
    }

    class EOF extends VirtualToken {
    }

    /** A Token class, referenced by its constructor function. */
    type TokenConstructor = Function;

    /** The value returned by Lexer.tokenize. */
    interface ILexingResult {
        tokens: Token[];
        groups: {
            [groupName: string]: Token;
        };
        errors: ILexingError[];
    }

    interface ILexingError {
        line: number;
        column: number;
        message: string;
    }

    class Lexer {
        protected tokenClasses: TokenConstructor[];
        static SKIPPED: {
            description: string;
        };
        static NA: RegExp;
        protected allPatterns: RegExp[];
        protected patternIdxToClass: Function[];
        protected patternIdxToGroup: boolean[];
        protected patternIdxToLongerAltIdx: number[];
        protected patternIdxToCanLineTerminator: boolean[];
        protected emptyGroups: {
            [groupName: string]: Token;
        };

        /**
         * @param {Function[]} tokenClasses constructor functions for the Tokens types this scanner will support
         *                     These constructors must be in one of three forms:
         *
         *  1. With a PATTERN property that has a RegExp value for tokens to match:
         *     example: -->class Integer extends Token { static PATTERN = /[1-9]\d }<--
         *
         *  2. With a PATTERN property that has a RegExp value AND an IGNORE property with boolean value true.
         *     These tokens will be matched but not as part of the main token vector.
         *     this is usually used for ignoring whitespace/comments
         *     example: -->    class Whitespace extends Token { static PATTERN = /(\t| )/; static IGNORE = true}<--
         *
         *  3. With a PATTERN property that has the value of the var Lexer.NA defined above.
         *     This is a convenience form used to avoid matching Token classes that only act as categories.
         *     example: -->class Keyword extends Token { static PATTERN = NA }<--
         *
         *
         *   The following RegExp patterns are not supported:
         *   a. '$' for match at end of input
         *   b. /g global flag
         *   c. /m multi-line flag
         *
         *   The Lexer will identify the first pattern that matches, Therefore the order of Token Constructors passed
         *   to the Lexer's constructor is meaningful. If two patterns may match the same string, the longer one
         *   should be before the shorter one.
         *
         *   Note that there are situations in which we may wish to place the longer pattern after the shorter one.
         *   For example: keywords vs Identifiers.
         *   'do'(/do/) and 'done'(/\w+/)
         *
         *   * If the Identifier pattern appears before the 'do' pattern both 'do' and 'done'
         *     will be lexed as an Identifier.
         *
         *   * If the 'do' pattern appears before the Identifier pattern 'do' will be lexed correctly as a keyword.
         *     however 'done' will be lexed as TWO tokens keyword 'do' and identifier 'ne'.
         *
         *   To resolve this problem, add a static property on the keyword's Tokens constructor named: LONGER_ALT
         *   example:
         *
         *       export class Identifier extends Keyword { static PATTERN = /[_a-zA-Z][_a-zA-Z0-9]/ }
         *       export class Keyword extends Token {
         *          static PATTERN = lex.NA
         *          static LONGER_ALT = Identifier
         *       }
         *       export class Do extends Keyword { static PATTERN = /do/ }
         *       export class While extends Keyword { static PATTERN = /while/ }
         *       export class Return extends Keyword { static PATTERN = /return/ }
         *
         *   The lexer will then also attempt to match a (longer) Identifier each time a keyword is matched
         */
        constructor(tokenClasses: TokenConstructor[]);

        /**
         * Will lex(Tokenize) a string.
         * Note that this can be called repeatedly on different strings as this method
         * does not modify the state of the Lexer.
         *
         * @param {string} text the string to lex
         * @returns {ILexingResult} the matched tokens, the token groups and the lexing errors
         */
        tokenize(text: string): ILexingResult;
    }

    // local aliases for the sibling namespaces declared in this file
    import gast = chevrotain.gast;
    import lang = chevrotain.lang;

    interface IFollowKey {
        ruleName: string;
        idxInCallingRule: number;
        inRule: string;
    }

    /**
     * OR([
     *  { WHEN:LA1, THEN_DO:XXX },
     *  { WHEN:LA2, THEN_DO:YYY },
     *  { WHEN:LA3, THEN_DO:ZZZ },
     * ])
     */
    interface IOrAlt<T> {
        WHEN: () => boolean;
        THEN_DO: () => T;
    }

    /**
     * OR([
     *  {ALT:XXX },
     *  {ALT:YYY },
     *  {ALT:ZZZ }
     * ])
     */
    interface IOrAltImplicit<T> {
        ALT: () => T;
    }

    interface IParserState {
        errors: Error[];
        inputIdx: number;
        RULE_STACK: string[];
    }

    type LookAheadFunc = () => boolean;
    type GrammarAction = () => void;

    /**
     * A Recognizer capable of self analysis to determine its grammar structure
     * This is used for more advanced features requiring such information.
     * for example: Error Recovery, Automatic lookahead calculation
     */
    class Parser {
        static IGNORE_AMBIGUITIES: boolean;
        static NO_RESYNC: boolean;
        protected static performSelfAnalysis(classInstance: Parser): void;
        errors: Error[];
        protected _input: Token[];
        protected inputIdx: number;
        protected isBackTrackingStack: any[];
        protected className: string;
        protected RULE_STACK: string[];
        protected RULE_OCCURRENCE_STACK: number[];
        protected tokensMap: {
            [fqn: string]: Function;
        };
        private firstAfterRepMap;
        private classLAFuncs;
        private orLookaheadKeys;
        private manyLookaheadKeys;
        private atLeastOneLookaheadKeys;
        private optionLookaheadKeys;

        constructor(input: Token[], tokensMapOrArr: {
            [fqn: string]: Function;
        } | Function[]);

        input: Token[];
        reset(): void;
        isAtEndOfInput(): boolean;
        getGAstProductions(): lang.HashTable<gast.Rule>;
        protected isBackTracking(): boolean;
        protected SAVE_ERROR(error: Error): Error;
        protected NEXT_TOKEN(): Token;
        protected LA(howMuch: number): Token;
        protected isNextRule<T>(ruleName: string): boolean;

        /**
         * @param grammarRule the rule to try and parse in backtracking mode
         * @param isValid a predicate that given the result of the parse attempt will "decide" if the parse was successful or not
         * @return a lookahead function that will try to parse the given grammarRule and will return true if it succeeds
         */
        protected BACKTRACK<T>(grammarRule: (...args: any[]) => T, isValid: (result: T) => boolean): () => boolean;

        protected SKIP_TOKEN(): Token;

        /**
         * Convenience method equivalent to CONSUME1
         * @see CONSUME1
         */
        protected CONSUME(tokClass: Function): Token;

        /**
         * A Parsing DSL method used to consume a single terminal Token.
         * a Token will be consumed, IFF the next token in the token vector is an instanceof tokClass.
         * otherwise the parser will attempt to perform error recovery.
         *
         * The index in the method name indicates the unique occurrence of a terminal consumption
         * inside the top level rule. What this means is that if a terminal appears
         * more than once in a single rule, each appearance must have a different index.
         *
         * for example:
         *
         * function parseQualifiedName() {
         *    this.CONSUME1(Identifier);
         *    this.MANY(()=> {
         *       this.CONSUME1(Dot);
         *       this.CONSUME2(Identifier); // <-- here we use CONSUME2 because the terminal
         *    });                           //     'Identifier' has already appeared previously in
         *                                  //     the rule 'parseQualifiedName'
         * }
         *
         * @param {Function} tokClass A constructor function specifying the type of token
         *                            to be consumed.
         *
         * @returns {Token} The consumed token.
         */
        protected CONSUME1(tokClass: Function): Token;

        /**
         * @see CONSUME1
         */
        protected CONSUME2(tokClass: Function): Token;

        /**
         * @see CONSUME1
         */
        protected CONSUME3(tokClass: Function): Token;

        /**
         * @see CONSUME1
         */
        protected CONSUME4(tokClass: Function): Token;

        /**
         * @see CONSUME1
         */
        protected CONSUME5(tokClass: Function): Token;

        /**
         * Convenience method equivalent to SUBRULE1
         * @see SUBRULE1
         */
        protected SUBRULE<T>(ruleToCall: (idxInCallingRule: number) => T, args?: any[]): T;

        /**
         * The Parsing DSL Method is used by one rule to call another.
         *
         * This may seem redundant as it does not actually do much.
         * However using it is mandatory for all sub rule invocations.
         * calling another rule without wrapping in SUBRULE(...)
         * will cause errors/mistakes in the Recognizer's self analysis
         * which will lead to errors in error recovery/automatic lookahead calculation
         * and any other functionality relying on the Recognizer's self analysis
         * output.
         *
         * As in CONSUME the index in the method name indicates the occurrence
         * of the sub rule invocation in its rule.
         *
         * @param {Function} ruleToCall the rule to invoke
         * @param {*[]} args the arguments to pass to the invoked subrule
         * @returns {*} the result of invoking ruleToCall
         */
        protected SUBRULE1<T>(ruleToCall: (idxInCallingRule: number) => T, args?: any[]): T;

        /**
         * @see SUBRULE1
         */
        protected SUBRULE2<T>(ruleToCall: (idxInCallingRule: number) => T, args?: any[]): T;

        /**
         * @see SUBRULE1
         */
        protected SUBRULE3<T>(ruleToCall: (idxInCallingRule: number) => T, args?: any[]): T;

        /**
         * @see SUBRULE1
         */
        protected SUBRULE4<T>(ruleToCall: (idxInCallingRule: number) => T, args?: any[]): T;

        /**
         * @see SUBRULE1
         */
        protected SUBRULE5<T>(ruleToCall: (idxInCallingRule: number) => T, args?: any[]): T;

        /**
         * Convenience method equivalent to OPTION1
         * @see OPTION1
         */
        protected OPTION(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): boolean;

        /**
         * Parsing DSL Method that Indicates an Optional production
         * in EBNF notation: [...]
         *
         * note that the 'action' param is optional. so both of the following forms are valid:
         *
         * short: this.OPTION(()=>{ this.CONSUME(Digit) });
         * long: this.OPTION(isDigit, ()=>{ this.CONSUME(Digit) });
         *
         * using the short form is recommended as it will compute the lookahead function
         * automatically. however this currently has one limitation:
         * It only works if the lookahead for the grammar is one.
         *
         * As in CONSUME the index in the method name indicates the occurrence
         * of the optional production in its top rule.
         *
         * @param {Function} laFuncOrAction The lookahead function that 'decides'
         *                                  whether or not the OPTION's action will be
         *                                  invoked or the action to optionally invoke
         * @param {Function} [action] The action to optionally invoke.
         *
         * @returns {boolean} true iff the OPTION's action has been invoked
         */
        protected OPTION1(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): boolean;

        /**
         * @see OPTION1
         */
        protected OPTION2(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): boolean;

        /**
         * @see OPTION1
         */
        protected OPTION3(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): boolean;

        /**
         * @see OPTION1
         */
        protected OPTION4(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): boolean;

        /**
         * @see OPTION1
         */
        protected OPTION5(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): boolean;

        /**
         * Convenience method equivalent to OR1
         * @see OR1
         */
        protected OR<T>(alts: IOrAlt<T>[] | IOrAltImplicit<T>[], errMsgTypes: string, ignoreAmbiguities?: boolean): T;

        /**
         * Parsing DSL method that indicates a choice between a set of alternatives must be made.
         * This is equivalent to EBNF alternation (A | B | C | D ...)
         *
         * There are two forms:
         *
         * short: this.OR([
         *           {ALT:()=>{this.CONSUME(One)}},
         *           {ALT:()=>{this.CONSUME(Two)}},
         *           {ALT:()=>{this.CONSUME(Three)}},
         *        ], "a number")
         *
         * long: this.OR([
         *           {WHEN: isOne, THEN_DO:()=>{this.CONSUME(One)}},
         *           {WHEN: isTwo, THEN_DO:()=>{this.CONSUME(Two)}},
         *           {WHEN: isThree, THEN_DO:()=>{this.CONSUME(Three)}},
         *        ], "a number")
         *
         * using the short form is recommended as it will compute the lookahead function
         * automatically. however this currently has one limitation:
         * It only works if the lookahead for the grammar is one.
         *
         * As in CONSUME the index in the method name indicates the occurrence
         * of the alternation production in its top rule.
         *
         * @param {{ALT:Function}[] | {WHEN:Function, THEN_DO:Function}[]} alts An array of alternatives
         * @param {string} errMsgTypes A description for the alternatives used in error messages
         * @param {boolean} [ignoreAmbiguities] if true this will ignore ambiguities caused when two alternatives can not
         *        be distinguished by a lookahead of one. enabling this means the first alternative
         *        that matches will be taken. This is sometimes the grammar's intent.
         *        * only enable this if you know what you are doing!
         * @returns {*} The result of invoking the chosen alternative
         */
        protected OR1<T>(alts: IOrAlt<T>[] | IOrAltImplicit<T>[], errMsgTypes: string, ignoreAmbiguities?: boolean): T;

        /**
         * @see OR1
         */
        protected OR2<T>(alts: IOrAlt<T>[] | IOrAltImplicit<T>[], errMsgTypes: string, ignoreAmbiguities?: boolean): T;

        /**
         * @see OR1
         */
        protected OR3<T>(alts: IOrAlt<T>[] | IOrAltImplicit<T>[], errMsgTypes: string, ignoreAmbiguities?: boolean): T;

        /**
         * @see OR1
         */
        protected OR4<T>(alts: IOrAlt<T>[] | IOrAltImplicit<T>[], errMsgTypes: string, ignoreAmbiguities?: boolean): T;

        /**
         * @see OR1
         */
        protected OR5<T>(alts: IOrAlt<T>[] | IOrAltImplicit<T>[], errMsgTypes: string, ignoreAmbiguities?: boolean): T;

        /**
         * Convenience method equivalent to MANY1
         * @see MANY1
         */
        protected MANY(lookAheadFunc: LookAheadFunc | GrammarAction, action?: GrammarAction): void;

        /**
         * Parsing DSL method, that indicates a repetition of zero or more.
         * This is equivalent to EBNF repetition {...}
         *
         * note that the 'action' param is optional. so both of the following forms are valid:
         *
         * short: this.MANY(()=>{
         *                       this.CONSUME(Comma);
         *                       this.CONSUME(Digit) });
         * long: this.MANY(isComma, ()=>{
         *                       this.CONSUME(Comma);
         *                       this.CONSUME(Digit) });
         *
         * using the short form is recommended as it will compute the lookahead function
         * automatically. however this currently has one limitation:
         * It only works if the lookahead for the grammar is one.
         *
         * As in CONSUME the index in the method name indicates the occurrence
         * of the repetition production in its top rule.
         *
         * @param {Function} laFuncOrAction The lookahead function that 'decides'
         *                                  whether or not the MANY's action will be
         *                                  invoked or the action to optionally invoke
         * @param {Function} [action] The action to optionally invoke.
         */
        protected MANY1(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): void;

        /**
         * @see MANY1
         */
        protected MANY2(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): void;

        /**
         * @see MANY1
         */
        protected MANY3(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): void;

        /**
         * @see MANY1
         */
        protected MANY4(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): void;

        /**
         * @see MANY1
         */
        protected MANY5(laFuncOrAction: LookAheadFunc | GrammarAction, action?: GrammarAction): void;

        /**
         * Convenience method equivalent to AT_LEAST_ONE1
         * @see AT_LEAST_ONE1
         */
        protected AT_LEAST_ONE(laFuncOrAction: LookAheadFunc | GrammarAction, action: GrammarAction | string, errMsg?: string): void;

        /**
         * convenience method, same as MANY but the repetition is of one or more.
         * failing to match at least one repetition will result in a parsing error and
         * cause the parser to attempt error recovery.
         *
         * @see MANY1
         *
         * @param {Function} laFuncOrAction The lookahead function that 'decides'
         *                                  whether or not the AT_LEAST_ONE's action will be
         *                                  invoked or the action to optionally invoke
         * @param {Function|string} action The action to optionally invoke.
         *                                 NOTE(review): the type also admits a string — presumably the errMsg
         *                                 when the short (action only) form is used; confirm against the implementation.
         * @param {string} [errMsg] short title/classification to what is being matched
         */
        protected AT_LEAST_ONE1(laFuncOrAction: LookAheadFunc | GrammarAction, action: GrammarAction | string, errMsg?: string): void;

        /**
         * @see AT_LEAST_ONE1
         */
        protected AT_LEAST_ONE2(laFuncOrAction: LookAheadFunc | GrammarAction, action: GrammarAction | string, errMsg?: string): void;

        /**
         * @see AT_LEAST_ONE1
         */
        protected AT_LEAST_ONE3(laFuncOrAction: LookAheadFunc | GrammarAction, action: GrammarAction | string, errMsg?: string): void;

        /**
         * @see AT_LEAST_ONE1
         */
        protected AT_LEAST_ONE4(laFuncOrAction: LookAheadFunc | GrammarAction, action: GrammarAction | string, errMsg?: string): void;

        /**
         * @see AT_LEAST_ONE1
         */
        protected AT_LEAST_ONE5(laFuncOrAction: LookAheadFunc | GrammarAction, action: GrammarAction | string, errMsg?: string): void;

        /**
         * Convenience method, same as RULE with doReSync=false
         * @see RULE
         */
        protected RULE_NO_RESYNC<T>(ruleName: string, impl: () => T, invalidRet: () => T): (idxInCallingRule: number, isEntryPoint?: boolean) => T;

        /**
         * @param {string} ruleName The name of the Rule. must match the var it is assigned to.
         * @param {Function} impl The implementation of the Rule
         * @param {Function} [invalidRet] A function that will return the chosen invalid value for the rule in case of
         *                                re-sync recovery.
         * @param {boolean} [doReSync] enable or disable re-sync recovery for this rule. defaults to true
         * @returns {Function} The parsing rule which is the impl Function wrapped with the parsing logic that handles
         *                     Parser state / error recovery / ...
         */
        protected RULE<T>(ruleName: string, impl: (...implArgs: any[]) => T, invalidRet?: () => T, doReSync?: boolean): (idxInCallingRule?: number, ...args: any[]) => T;

        protected ruleInvocationStateUpdate(ruleName: string, idxInCallingRule: number): void;
        protected ruleFinallyStateUpdate(): void;
        protected getTokenToInsert(tokClass: Function): Token;
        protected canTokenTypeBeInsertedInRecovery(tokClass: Function): boolean;
        private defaultInvalidReturn();
        private ruleNamePattern;
        private definedRulesNames;

        /**
         * @param ruleFuncName name of the Grammar rule
         * @throws Grammar validation errors if the name is invalid
         */
        private validateRuleName(ruleFuncName);

        private tryInRepetitionRecovery(grammarRule, grammarRuleArgs, lookAheadFunc, expectedTokType);
        private shouldInRepetitionRecoveryBeTried(expectTokAfterLastMatch?, nextTokIdx?);
        private getFollowsForInRuleRecovery(tokClass, tokIdxInRule);
        private tryInRuleRecovery(expectedTokType, follows);
        private canPerformInRuleRecovery(expectedToken, follows);
        private canRecoverWithSingleTokenInsertion(expectedTokType, follows);
        private canRecoverWithSingleTokenDeletion(expectedTokType);
        private isInCurrentRuleReSyncSet(token);
        private findReSyncTokenType();
        private getCurrFollowKey();
        private buildFullFollowKeyStack();
        private flattenFollowSet();
        private getFollowSetFromFollowKey(followKey);
        private reSyncTo(tokClass);
        private attemptInRepetitionRecovery(prodFunc, args, lookaheadFunc, prodName, prodOccurrence, nextToksWalker, prodKeys);
        private optionInternal(condition, action);
        private atLeastOneInternal(prodFunc, prodName, prodOccurrence, lookAheadFunc, action, errMsg?);
        private manyInternal(prodFunc, prodName, prodOccurrence, lookAheadFunc, action?);
        private orInternal<T>(alts, errMsgTypes, occurrence, ignoreAmbiguities);

        /**
         * @param tokClass The Type of Token we wish to consume (Reference to its constructor function)
         * @param idx occurrence index of consumed token in the invoking parser rule text
         *            for example:
         *            IDENT (DOT IDENT)*
         *            the first ident will have idx 1 and the second one idx 2
         *            * note that for the second ident the idx is always 2 even if it is invoked 30 times in the same rule
         *              the idx is about the position in grammar (source code) and has nothing to do with a specific invocation
         *              details
         *
         * @returns the consumed Token
         */
        private consumeInternal(tokClass, idx);

        private consumeInternalOptimized(tokClass);
        private getKeyForAutomaticLookahead(prodName, prodKeys, occurrence);
        private getLookaheadFuncForOption(occurence);
        private getLookaheadFuncForOr(occurence, ignoreErrors);
        private getLookaheadFuncForMany(occurence);
        private getLookaheadFuncForAtLeastOne(occurence);
        private getLookaheadFuncFor<T>(key, occurrence, laFuncBuilder, extraArgs?);
        private saveRecogState();
        private reloadRecogState(newState);
        private raiseNoAltException(errMsgTypes);
    }

    namespace exceptions {
        function isRecognitionException(error: Error): boolean;
        function MismatchedTokenException(message: string, token: Token): void;
        function NoViableAltException(message: string, token: Token): void;
        function NotAllInputParsedException(message: string, token: Token): void;
        function EarlyExitException(message: string, token: Token): void;
    }

    namespace gast {
        interface IProduction {
            accept(visitor: GAstVisitor): void;
        }

        interface IProductionWithOccurrence extends IProduction {
            occurrenceInParent: number;
            implicitOccurrenceIndex: boolean;
        }

        class AbstractProduction implements IProduction {
            definition: IProduction[];
            implicitOccurrenceIndex: boolean;
            constructor(definition: IProduction[]);
            accept(visitor: GAstVisitor): void;
        }

        class NonTerminal extends AbstractProduction implements IProductionWithOccurrence {
            nonTerminalName: string;
            referencedRule: Rule;
            occurrenceInParent: number;
            constructor(nonTerminalName: string, referencedRule?: Rule, occurrenceInParent?: number);
            definition: IProduction[];
            accept(visitor: GAstVisitor): void;
        }

        class Rule extends AbstractProduction {
            name: string;
            constructor(name: string, definition: IProduction[]);
        }

        class Flat extends AbstractProduction {
            constructor(definition: IProduction[]);
        }

        class Option extends AbstractProduction implements IProductionWithOccurrence {
            occurrenceInParent: number;
            constructor(definition: IProduction[], occurrenceInParent?: number);
        }

        class RepetitionMandatory extends AbstractProduction implements IProductionWithOccurrence {
            occurrenceInParent: number;
            constructor(definition: IProduction[], occurrenceInParent?: number);
        }

        class Repetition extends AbstractProduction implements IProductionWithOccurrence {
            occurrenceInParent: number;
            constructor(definition: IProduction[], occurrenceInParent?: number);
        }

        class Alternation extends AbstractProduction implements IProductionWithOccurrence {
            occurrenceInParent: number;
            constructor(definition: IProduction[], occurrenceInParent?: number);
        }

        class Terminal implements IProductionWithOccurrence {
            terminalType: Function;
            occurrenceInParent: number;
            implicitOccurrenceIndex: boolean;
            constructor(terminalType: Function, occurrenceInParent?: number);
            accept(visitor: GAstVisitor): void;
        }

        class GAstVisitor {
            visit(node: IProduction): void;
            visitNonTerminal(node: NonTerminal): void;
            visitFlat(node: Flat): void;
            visitOption(node: Option): void;
            visitRepetitionMandatory(node: RepetitionMandatory): void;
            visitRepetition(node: Repetition): void;
            visitAlternation(node: Alternation): void;
            visitTerminal(node: Terminal): void;
        }
    }
}