pip-services4-expressions-node
Version:
Tokenizers, parsers and expression calculators in Node.js / ES2017
89 lines (81 loc) • 3.81 kB
text/typescript
/** @module tokenizers */
import { IWordState } from '../IWordState';
import { Token } from '../Token';
import { TokenType } from '../TokenType';
import { ITokenizer } from '../ITokenizer';
import { IScanner } from '../../io/IScanner';
import { CharValidator } from '../utilities/CharValidator';
import { CharReferenceMap } from '../utilities/CharReferenceMap';
/**
* A wordState returns a word from a scanner. Like other states, a tokenizer transfers the job
* of reading to this state, depending on an initial character. Thus, the tokenizer decides
* which characters may begin a word, and this state determines which characters may appear
* as a second or later character in a word. These are typically different sets of characters;
* in particular, it is typical for digits to appear as parts of a word, but not
* as the initial character of a word.
* <p/>
* By default, the following characters may appear in a word.
* The method <code>setWordChars()</code> allows customizing this.
* <blockquote><pre>
* From To
* 'a', 'z'
* 'A', 'Z'
* '0', '9'
*
* as well as: minus sign, underscore, and apostrophe.
* </pre></blockquote>
*/
export class GenericWordState implements IWordState {
private _map: CharReferenceMap<boolean> = new CharReferenceMap<boolean>();
/**
* Constructs a word state with a default idea of what characters
* are admissible inside a word (as described in the class comment).
*/
public constructor() {
this.setWordChars('a'.charCodeAt(0), 'z'.charCodeAt(0), true);
this.setWordChars('A'.charCodeAt(0), 'Z'.charCodeAt(0), true);
this.setWordChars('0'.charCodeAt(0), '9'.charCodeAt(0), true);
this.setWordChars('-'.charCodeAt(0), '-'.charCodeAt(0), true);
this.setWordChars('_'.charCodeAt(0), '_'.charCodeAt(0), true);
//this.setWordChars(39, 39, true);
this.setWordChars(0x00c0, 0x00ff, true);
this.setWordChars(0x0100, 0xfffe, true);
}
/**
* Ignore word (such as blanks and tabs), and return the tokenizer's next token.
* @param scanner A textual string to be tokenized.
* @param tokenizer A tokenizer class that controls the process.
* @returns The next token from the top of the stream.
*/
// eslint-disable-next-line @typescript-eslint/no-unused-vars
public nextToken(scanner: IScanner, tokenizer: ITokenizer): Token {
const line = scanner.peekLine();
const column = scanner.peekColumn();
let nextSymbol: number;
let tokenValue = "";
for (nextSymbol = scanner.read(); this._map.lookup(nextSymbol); nextSymbol = scanner.read()) {
tokenValue = tokenValue + String.fromCharCode(nextSymbol);
}
if (!CharValidator.isEof(nextSymbol)) {
scanner.unread();
}
return new Token(TokenType.Word, tokenValue, line, column);
}
/**
* Establish characters in the given range as valid characters for part of a word after
* the first character. Note that the tokenizer must determine which characters are valid
* as the beginning character of a word.
* @param fromSymbol First character index of the interval.
* @param toSymbol Last character index of the interval.
* @param enable <code>true</code> if this state should use characters in the given range.
*/
public setWordChars(fromSymbol: number, toSymbol: number, enable: boolean): void {
this._map.addInterval(fromSymbol, toSymbol, enable);
}
/**
* Clears definitions of word chars.
*/
public clearWordChars(): void {
this._map.clear();
}
}