UNPKG

pip-services4-expressions-node

Version:

Tokenizers, parsers and expression calculators in Node.js / ES2017

89 lines (81 loc) 3.81 kB
/** @module tokenizers */ import { IWordState } from '../IWordState'; import { Token } from '../Token'; import { TokenType } from '../TokenType'; import { ITokenizer } from '../ITokenizer'; import { IScanner } from '../../io/IScanner'; import { CharValidator } from '../utilities/CharValidator'; import { CharReferenceMap } from '../utilities/CharReferenceMap'; /** * A wordState returns a word from a scanner. Like other states, a tokenizer transfers the job * of reading to this state, depending on an initial character. Thus, the tokenizer decides * which characters may begin a word, and this state determines which characters may appear * as a second or later character in a word. These are typically different sets of characters; * in particular, it is typical for digits to appear as parts of a word, but not * as the initial character of a word. * <p/> * By default, the following characters may appear in a word. * The method <code>setWordChars()</code> allows customizing this. * <blockquote><pre> * From To * 'a', 'z' * 'A', 'Z' * '0', '9' * * as well as: minus sign, underscore, and apostrophe. * </pre></blockquote> */ export class GenericWordState implements IWordState { private _map: CharReferenceMap<boolean> = new CharReferenceMap<boolean>(); /** * Constructs a word state with a default idea of what characters * are admissible inside a word (as described in the class comment). */ public constructor() { this.setWordChars('a'.charCodeAt(0), 'z'.charCodeAt(0), true); this.setWordChars('A'.charCodeAt(0), 'Z'.charCodeAt(0), true); this.setWordChars('0'.charCodeAt(0), '9'.charCodeAt(0), true); this.setWordChars('-'.charCodeAt(0), '-'.charCodeAt(0), true); this.setWordChars('_'.charCodeAt(0), '_'.charCodeAt(0), true); //this.setWordChars(39, 39, true); this.setWordChars(0x00c0, 0x00ff, true); this.setWordChars(0x0100, 0xfffe, true); } /** * Ignore word (such as blanks and tabs), and return the tokenizer's next token. * @param scanner A textual string to be tokenized. * @param tokenizer A tokenizer class that controls the process. * @returns The next token from the top of the stream. */ // eslint-disable-next-line @typescript-eslint/no-unused-vars public nextToken(scanner: IScanner, tokenizer: ITokenizer): Token { const line = scanner.peekLine(); const column = scanner.peekColumn(); let nextSymbol: number; let tokenValue = ""; for (nextSymbol = scanner.read(); this._map.lookup(nextSymbol); nextSymbol = scanner.read()) { tokenValue = tokenValue + String.fromCharCode(nextSymbol); } if (!CharValidator.isEof(nextSymbol)) { scanner.unread(); } return new Token(TokenType.Word, tokenValue, line, column); } /** * Establish characters in the given range as valid characters for part of a word after * the first character. Note that the tokenizer must determine which characters are valid * as the beginning character of a word. * @param fromSymbol First character index of the interval. * @param toSymbol Last character index of the interval. * @param enable <code>true</code> if this state should use characters in the given range. */ public setWordChars(fromSymbol: number, toSymbol: number, enable: boolean): void { this._map.addInterval(fromSymbol, toSymbol, enable); } /** * Clears definitions of word chars. */ public clearWordChars(): void { this._map.clear(); } }