pip-services4-expressions-node
Version:
Tokenizers, parsers and expression calculators in Node.js / ES2017
136 lines (135 loc) • 4.64 kB
TypeScript
/** @module tokenizers */
import { IScanner } from "../io/IScanner";
import { Token } from './Token';
import { ICommentState } from './ICommentState';
import { INumberState } from './INumberState';
import { IQuoteState } from './IQuoteState';
import { ISymbolState } from './ISymbolState';
import { IWhitespaceState } from './IWhitespaceState';
import { IWordState } from './IWordState';
/**
* A tokenizer divides a string into tokens. This class is highly customizable with regard
* to exactly how this division occurs, but it also has defaults that are suitable for many
* languages. This class assumes that the character values read from the string lie in
* the range 0-255. For example, the Unicode value of a capital A is 65,
* so <code>String.fromCharCode(65)</code> yields a capital A.
* <p>
* The behavior of a tokenizer depends on its character state table. This table is an array
* of 256 <code>TokenizerState</code> states. The state table decides which state to enter
* upon reading a character from the input string.
* <p>
* For example, by default, upon reading an 'A', a tokenizer will enter a "word" state.
* This means the tokenizer will ask a <code>WordState</code> object to consume the 'A',
* along with the characters after the 'A' that form a word. The state's responsibility
* is to consume characters and return a complete token.
* <p>
* The default table sets a SymbolState for every character from 0 to 255,
* and then overrides this with:<blockquote><pre>
* From To State
* 0 ' ' whitespaceState
* 'a' 'z' wordState
* 'A' 'Z' wordState
* 160 255 wordState
* '0' '9' numberState
* '-' '-' numberState
* '.' '.' numberState
* '"' '"' quoteState
* '\'' '\'' quoteState
* '/' '/' commentState
* </pre></blockquote>
* In addition to allowing modification of the state table, this class makes each of the states
* above available. Some of these states are customizable. For example, wordState allows customization
* of what characters can be part of a word, after the first character.
*/
export interface ITokenizer {
/**
* Skips unknown characters.
*/
skipUnknown: boolean;
/**
* Skips whitespaces.
*/
skipWhitespaces: boolean;
/**
* Skips comments.
*/
skipComments: boolean;
/**
* Skips the End-Of-File token at the end of the stream.
*/
skipEof: boolean;
/**
* Merges whitespaces.
*/
mergeWhitespaces: boolean;
/**
* Unifies numbers: "Integers" and "Floats" both become just "Numbers".
*/
unifyNumbers: boolean;
/**
* Decodes quoted strings.
*/
decodeStrings: boolean;
/**
* A token state to process comments.
*/
commentState: ICommentState;
/**
* A token state to process numbers.
*/
numberState: INumberState;
/**
* A token state to process quoted strings.
*/
quoteState: IQuoteState;
/**
* A token state to process symbols (single-character like "=" or multi-character like "<>").
*/
symbolState: ISymbolState;
/**
* A token state to process whitespace delimiters.
*/
whitespaceState: IWhitespaceState;
/**
* A token state to process words or identifiers.
*/
wordState: IWordState;
/**
* The stream scanner to tokenize.
*/
scanner: IScanner;
/**
* Checks whether a next token exists.
* @returns <code>true</code> if the scanner has a next token.
*/
hasNextToken(): boolean;
/**
* Gets the next token from the scanner.
* NOTE(review): the declared return type is <code>Token</code> although <code>null</code> is
* documented as a possible result; under <code>strictNullChecks</code> this would have to be
* <code>Token | null</code> - confirm against the implementations before changing it.
* @returns The next token, or <code>null</code> if there are no more tokens left.
*/
nextToken(): Token;
/**
* Tokenizes a textual stream into a list of token structures.
* @param scanner A textual stream to be tokenized.
* @returns A list of token structures.
*/
tokenizeStream(scanner: IScanner): Token[];
/**
* Tokenizes a string buffer into a list of token structures.
* @param buffer A string buffer to be tokenized.
* @returns A list of token structures.
*/
tokenizeBuffer(buffer: string): Token[];
/**
* Tokenizes a textual stream into a list of strings.
* @param scanner A textual stream to be tokenized.
* @returns A list of token strings.
*/
tokenizeStreamToStrings(scanner: IScanner): string[];
/**
* Tokenizes a string buffer into a list of strings.
* @param buffer A string buffer to be tokenized.
* @returns A list of token strings.
*/
tokenizeBufferToStrings(buffer: string): string[];
}