UNPKG

@scinorandex/slex

Version:

No fuss lexer generator

431 lines (353 loc) 13 kB
import { initializeCharacter } from "./utils/Character";

const Character = initializeCharacter({});

/**
 * Plain record describing the outcome of a match attempt.
 *
 * NOTE(review): `from` is presumably the name of the rule that produced the
 * lexeme; nothing in this file constructs the class, so confirm with the caller.
 */
export class RegexEngineParsingResult {
  success: boolean;
  lexeme: string;
  from: string | null;

  public constructor(success: boolean, lexeme: string, from: string | null) {
    this.success = success;
    this.lexeme = lexeme;
    this.from = from;
  }

  /** Renders the origin and lexeme on success, or a fixed "No token found" message otherwise. */
  public toString(): string {
    return this.success ? "From: " + this.from + ". Lexeme: " + this.lexeme : "No token found";
  }
}

/** A string-to-string function that can be attached to a node via `setTransformer`. */
export type StringTransformer = (input: string) => string;

/**
 * Base class of every node in the parsed regular-expression tree.
 *
 * A node can carry an optional token type and an optional transformer; this
 * file only stores and returns them, it never applies them.
 */
export abstract class RegexNode<TokenType> {
  private emit: TokenType | null = null;
  private transformer: StringTransformer | null = null;

  public getTokenType(): TokenType | null {
    return this.emit;
  }

  public setTokenType(emit: TokenType) {
    this.emit = emit;
  }

  public getTransformer(): StringTransformer | null {
    return this.transformer;
  }

  public setTransformer(transformer: StringTransformer) {
    this.transformer = transformer;
  }

  // The boxed `String` return type is kept on the abstract signature so that
  // subclasses declared outside this file with either spelling keep compiling.
  public abstract toString(): String;

  /**
   * Lists every lexeme this node can match at the start of `restString`.
   *
   * @param restString  the not-yet-consumed input
   * @param environment named sub-expressions that variable nodes resolve against
   * @param negated     when true, literal nodes match any single character
   *                    other than their own
   * @returns all candidate matches; an empty array means "no match"
   */
  public abstract getMatches(
    restString: string,
    environment: Map<string, RegexNode<TokenType>>,
    negated: boolean
  ): string[];
}

/** Sequence of nodes that must match one after another. */
class RegexConcatenationNode<TokenType> extends RegexNode<TokenType> {
  public constructor(public readonly nodes: RegexNode<TokenType>[]) {
    super();
  }

  public toString(): string {
    return this.nodes.map((node) => node.toString()).join("");
  }

  /**
   * Threads every partial match through each child in turn: after a child
   * has run, `prefixes` holds every string matched by the children so far.
   */
  public getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>, negated: boolean): string[] {
    let prefixes: string[] = [""];

    for (const node of this.nodes) {
      const extended: string[] = [];
      for (const prefix of prefixes) {
        // Matches built from the nodes in this file are prefixes of restString,
        // so removing the first occurrence strips the consumed text off the front.
        const rest = restString.replace(prefix, "");
        for (const match of node.getMatches(rest, environment, negated)) {
          extended.push(prefix + match);
        }
      }
      prefixes = extended;
    }

    return prefixes;
  }
}

/** Alternation (`a|b`): the union of the matches of every alternative. */
class RegexEitherNode<TokenType> extends RegexNode<TokenType> {
  nodes: RegexNode<TokenType>[];

  public constructor(nodes: RegexNode<TokenType>[]) {
    super();
    this.nodes = nodes;
  }

  public toString(): string {
    return this.nodes.map((node) => node.toString()).join("|");
  }

  public getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>, negated: boolean): string[] {
    const matches: string[] = [];
    for (const node of this.nodes) {
      for (const match of node.getMatches(restString, environment, negated)) {
        matches.push(match);
      }
    }
    return matches;
  }
}

/** A single literal character. */
class RegexLiteralNode<TokenType> extends RegexNode<TokenType> {
  public constructor(public readonly ch: string) {
    super();
  }

  public toString(): string {
    return "" + this.ch;
  }

  /**
   * Matches the first character of `restString` when it equals `ch`
   * (or, when `negated`, when it differs from `ch`). Empty input never matches.
   */
  public getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>, negated: boolean): string[] {
    if (restString.length === 0) return [];

    const starting = restString.charAt(0);
    if (negated === false && starting === this.ch) return ["" + starting];
    if (negated === true && starting !== this.ch) return ["" + starting];
    return [];
  }
}

/**
 * Node whose matching is delegated to a caller-supplied `calculator`.
 * The `negated` flag of the base contract is not forwarded to the calculator.
 */
export class RegexIntrinsicNode<TokenType> extends RegexNode<TokenType> {
  public constructor(
    public readonly intrinsicName: string,
    public readonly calculator: (restString: string, environment: Map<string, RegexNode<TokenType>>) => string[]
  ) {
    super();
  }

  public toString(): string {
    return "<" + this.intrinsicName + ">";
  }

  public getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>): string[] {
    return this.calculator(restString, environment);
  }
}

/** Reference (`${name}`) to a named node looked up in the environment at match time. */
class RegexVariableNode<TokenType> extends RegexNode<TokenType> {
  variableName: string;

  public constructor(variableName: string) {
    super();
    this.variableName = variableName;
  }

  public toString(): string {
    return "<" + this.variableName + ">";
  }

  /** Delegates to the node bound to `variableName`; an unbound name matches nothing. */
  public getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>, negated: boolean): string[] {
    const rootNode = environment.get(this.variableName);
    if (rootNode === undefined) return [];
    return rootNode.getMatches(restString, environment, negated);
  }
}

/** Suffix that may follow a parenthesised group: none, `*`, `+` or `!`. */
enum RegexGroupingNodeModifiers {
  NONE,
  NONE_OR_MORE,
  ONE_OR_MORE,
  NEGATION,
}

/** Parenthesised sub-expression together with its optional modifier. */
class RegexGroupingNode<TokenType> extends RegexNode<TokenType> {
  internalNode: RegexNode<TokenType>;
  modifier: RegexGroupingNodeModifiers;

  public constructor(
    internalNode: RegexNode<TokenType>,
    modifier: RegexGroupingNodeModifiers = RegexGroupingNodeModifiers.NONE
  ) {
    super();
    this.internalNode = internalNode;
    this.modifier = modifier;
  }

  /** Renders `(inner)` followed by `+` or `*`; the NEGATION modifier is not rendered. */
  public toString(): string {
    let suffix = "";
    if (this.modifier === RegexGroupingNodeModifiers.ONE_OR_MORE) suffix = "+";
    else if (this.modifier === RegexGroupingNodeModifiers.NONE_OR_MORE) suffix = "*";
    return "(" + this.internalNode.toString() + ")" + suffix;
  }

  /**
   * Matching for the NONE, NONE_OR_MORE and ONE_OR_MORE modifiers.
   *
   * For the repeating modifiers the inner node is re-applied round after
   * round; the result is the set of matches from the last round in which at
   * least one match could still be extended.
   */
  public _getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>, negated: boolean): string[] {
    const initialMatches: string[] = this.internalNode.getMatches(restString, environment, negated);

    if (initialMatches.length === 0) {
      // `*` is satisfied by zero repetitions, i.e. the empty lexeme.
      if (this.modifier === RegexGroupingNodeModifiers.NONE_OR_MORE) initialMatches.push("");
      return initialMatches;
    }
    if (this.modifier === RegexGroupingNodeModifiers.NONE) return initialMatches;

    // handle matching for NONE_OR_MORE or ONE_OR_MORE
    let matches = initialMatches;
    while (true) {
      const nextMatches: string[] = [];

      for (const match of matches) {
        const rest: string = restString.replace(match, "");
        if (rest.length === 0) continue;

        for (const repetition of this.internalNode.getMatches(rest, environment, negated)) {
          // A zero-width repetition makes no progress: carrying it forward would
          // reproduce the same match every round and the loop would never end.
          if (repetition.length > 0) nextMatches.push(match + repetition);
        }
      }

      if (nextMatches.length === 0) break;
      matches = nextMatches;
    }

    return matches;
  }

  /** A NEGATION group evaluates its inner node once with `negated` forced to true. */
  public getMatches(restString: string, environment: Map<string, RegexNode<TokenType>>, negated: boolean): string[] {
    if (this.modifier === RegexGroupingNodeModifiers.NEGATION) {
      return this.internalNode.getMatches(restString, environment, true);
    }
    return this._getMatches(restString, environment, negated);
  }
}

/**
 * Recursive-descent parser turning a token list into a node tree.
 *
 * Shape implemented by the methods below:
 *   parse              := parseConcatenation ( PIPE parseConcatenation )*
 *   parseConcatenation := parseTerminal+            (stops at PIPE or RPAREN)
 *   parseTerminal      := LITERAL | VARIABLE | LPAREN parse RPAREN [ASTERISK | PLUS | EXCLAMATION]
 */
export class RegexParser<TokenType> {
  tokens: RegexToken[];
  currentTokenIndex = 0;

  public constructor(tokens: RegexToken[]) {
    this.tokens = tokens;
  }

  /** Type of the token at the cursor, or `undefined` once every token has been consumed. */
  private currentType(): RegexTokenType | undefined {
    const token: RegexToken | undefined = this.tokens[this.currentTokenIndex];
    return token === undefined ? undefined : token.type;
  }

  /**
   * Parses an alternation starting at the cursor.
   *
   * @throws Error when a terminal is malformed or the tokens end too early
   */
  parse(): RegexNode<TokenType> {
    // parse starting from the top
    const first: RegexNode<TokenType> = this.parseConcatenation();
    const possibles: RegexNode<TokenType>[] = [first];

    while (this.currentType() === RegexTokenType.PIPE) {
      this.currentTokenIndex++; // consume the PIPE token
      possibles.push(this.parseConcatenation());
    }

    return possibles.length > 1 ? new RegexEitherNode(possibles) : first;
  }

  /** Parses consecutive terminals until a PIPE, an RPAREN or the end of the tokens. */
  parseConcatenation(): RegexNode<TokenType> {
    const first = this.parseTerminal();
    const nodes: RegexNode<TokenType>[] = [first];

    while (
      this.currentTokenIndex < this.tokens.length &&
      this.currentType() !== RegexTokenType.PIPE &&
      this.currentType() !== RegexTokenType.RPAREN
    ) {
      nodes.push(this.parseTerminal());
    }

    return nodes.length > 1 ? new RegexConcatenationNode(nodes) : first;
  }

  /**
   * Parses one literal, one variable, or one parenthesised group with its
   * optional modifier.
   *
   * @throws Error on any other token, or when the tokens are exhausted
   */
  parseTerminal(): RegexNode<TokenType> {
    const currentToken: RegexToken | undefined = this.tokens[this.currentTokenIndex];
    if (currentToken === undefined) {
      throw new Error("Was not able to parse the regex. Unexpected end of expression");
    }

    switch (currentToken.type) {
      case RegexTokenType.LPAREN: {
        this.currentTokenIndex++; // CONSUME L_PAREN
        const internalNode: RegexNode<TokenType> = this.parse();
        this.expect(RegexTokenType.RPAREN); // next token should be R_PAREN

        // The group may be the last thing in the expression, so the modifier
        // lookahead must tolerate running off the end of the token list.
        let modifier = RegexGroupingNodeModifiers.NONE;
        const following = this.currentType();
        if (following === RegexTokenType.ASTERISK) modifier = RegexGroupingNodeModifiers.NONE_OR_MORE;
        else if (following === RegexTokenType.PLUS) modifier = RegexGroupingNodeModifiers.ONE_OR_MORE;
        else if (following === RegexTokenType.EXCLAMATION) modifier = RegexGroupingNodeModifiers.NEGATION;

        if (modifier !== RegexGroupingNodeModifiers.NONE) this.currentTokenIndex++; // consume the modifier

        return new RegexGroupingNode(internalNode, modifier);
      }

      case RegexTokenType.LITERAL: {
        this.currentTokenIndex++;
        return new RegexLiteralNode(currentToken.value.charAt(0));
      }

      case RegexTokenType.VARIABLE: {
        this.currentTokenIndex++;
        return new RegexVariableNode(currentToken.value);
      }

      default: {
        throw new Error("Was not able to parse the regex. Token: " + currentToken.toString());
      }
    }
  }

  /**
   * Consumes the current token when it has the given type.
   *
   * @throws Error when the token has another type or no token is left
   */
  expect(type: RegexTokenType) {
    const currentToken: RegexToken | undefined = this.tokens[this.currentTokenIndex];
    if (currentToken === undefined) {
      throw new Error("Expected: " + type.toString() + ". Received: end of expression");
    }
    if (currentToken.type !== type) {
      throw new Error("Expected: " + type.toString() + ". Received: " + currentToken.toString());
    }
    this.currentTokenIndex++;
  }
}

/** Kinds of token produced by `RegexLexer`. */
enum RegexTokenType {
  LITERAL,
  PIPE,
  ASTERISK,
  EXCLAMATION,
  PLUS,
  VARIABLE,
  LPAREN,
  RPAREN,
}

/** A token type paired with the source text (or variable name) it stands for. */
class RegexToken {
  type: RegexTokenType;
  value: string;

  constructor(type: RegexTokenType, value: string) {
    this.type = type;
    this.value = value;
  }

  public toString(): string {
    return "Type: " + this.type.toString() + ". Value: " + this.value;
  }
}

/** Operator characters that each lex to exactly one token. */
const SINGLE_CHARACTER_TOKENS: ReadonlyMap<string, RegexTokenType> = new Map<string, RegexTokenType>([
  ["|", RegexTokenType.PIPE],
  ["+", RegexTokenType.PLUS],
  ["*", RegexTokenType.ASTERISK],
  ["!", RegexTokenType.EXCLAMATION],
  ["(", RegexTokenType.LPAREN],
  [")", RegexTokenType.RPAREN],
]);

// This class takes a regular expression and breaks it up into tokens
export class RegexLexer {
  expression: string;
  tokens: RegexToken[] = [];
  index = 0;

  public constructor(expression: string) {
    this.expression = expression;
  }

  /**
   * Tokenises `expression` from the current `index` to its end.
   *
   * Alphabetic characters and digits become LITERAL tokens, the operator
   * characters `| + * ! ( )` become their own tokens, `$` introduces an
   * escape or a `${name}` variable, and whitespace is skipped.
   *
   * @returns the accumulated `tokens` array
   * @throws Error on a character that fits none of the categories above
   */
  public lex(): RegexToken[] {
    while (this.index < this.expression.length) {
      const currentCharacter = this.expression.charAt(this.index);
      const operatorType = SINGLE_CHARACTER_TOKENS.get(currentCharacter);

      if (Character.isAlphabetic(currentCharacter) || Character.isDigit(currentCharacter)) {
        this.tokens.push(new RegexToken(RegexTokenType.LITERAL, currentCharacter));
        this.index++;
      } else if (operatorType !== undefined) {
        this.tokens.push(new RegexToken(operatorType, currentCharacter));
        this.index++;
      } else if (currentCharacter === "$") {
        this.lexDollar();
      } else if (Character.isWhitespace(currentCharacter)) {
        this.index++;
      } else {
        // Fail loudly: leaving the cursor where it is would spin forever on
        // the same character.
        throw new Error(
          "Was not able to handle ch: " +
            currentCharacter +
            " at index: " +
            this.index +
            " in expression: " +
            this.expression
        );
      }
    }

    return this.tokens;
  }

  /**
   * Handles the `$` at the cursor.
   *
   * - `$x` where `x` is not `{`, or `${` as the very last two characters:
   *   the following character is emitted verbatim as a LITERAL.
   * - `${name}`: emits a VARIABLE token named `name`. An unterminated
   *   `${name` is accepted and runs to the end of the expression.
   * - A lone trailing `$` yields a VARIABLE token with an empty name.
   */
  private lexDollar(): void {
    const next = this.expression.charAt(this.index + 1);
    const opensVariable = next === "{" && this.expression.length > this.index + 2;

    if (next !== "" && !opensVariable) {
      // capture whatever the next character is as is
      this.tokens.push(new RegexToken(RegexTokenType.LITERAL, next));
      this.index += 2;
      return;
    }

    this.index += 2; // move index to the start of the variable

    let variableName = "";
    while (this.expression.length > this.index && this.expression.charAt(this.index) != "}") {
      variableName += this.expression.charAt(this.index);
      this.index++;
    }

    this.tokens.push(new RegexToken(RegexTokenType.VARIABLE, variableName));
    this.index++; // consume the ending }
  }
}