UNPKG

mathlive

Version:

Render and edit beautifully typeset math

github.com/arnog/mathlive

288 lines (267 loc) • 8.72 kB

JavaScript

/**
 * @module core/lexer
 * @private
 */

/**
 * ## Reference
 * TeX source code:
 * {@link http://tug.org/texlive/devsrc/Build/source/texk/web2c/tex.web|Tex.web}
 *
 * For a list of standard TeX macros, see:
 * {@link ftp://tug.ctan.org/pub/tex-archive/systems/knuth/dist/lib/plain.tex|plain.tex}
 */

import GraphemeSplitter from './grapheme-splitter.js';

/*
 * Characters treated as white space by the lexer.
 *
 * Browsers are inconsistent in their definitions of the `\s`
 * metacharacter, so an explicit pattern is used instead.
 *   - IE:      `[ \f\n\r\t\v]`
 *   - Chrome:  `[ \f\n\r\t\v\u00A0]`
 *   - Firefox: `[ \f\n\r\t\v\u00A0\u2028\u2029]`
 *
 *   - \f \u000C: form feed (FORM FEED)
 *   - \n \u000A: linefeed (LINE FEED)
 *   - \r \u000D: carriage return
 *   - \t \u0009: tab (CHARACTER TABULATION)
 *   - \v \u000B: vertical tab (LINE TABULATION)
 *   - \u00A0: NON-BREAKING SPACE
 *   - \u2028: LINE SEPARATOR
 *   - \u2029: PARAGRAPH SEPARATOR
 *
 * Could be considered:
 *   - \u2000-\u200a spacing
 *   - \u202f NARROW NO-BREAK SPACE
 *   - \u205F MEDIUM MATHEMATICAL SPACE
 *   - \u3000 IDEOGRAPHIC SPACE
 *   - \uFEFF ZERO WITH NON-BREAKING SPACE
 *
 * See [Stackoverflow](http://stackoverflow.com/questions/6073637/)
 */
const WHITE_SPACE = /[ \f\n\r\t\v\xA0\u2028\u2029]/;

/**
 *
 * A token can be of type:
 *  - `literal`: the value is the character this token represents. This can be
 * a combination of Unicode codepoints, for example for emojis.
 *  - `^` and `_`: superscript and subscript commands.
 *  - `command`: a command such as `\sin` or `\text` or `\alpha`
 *  - `{` and `}`: begin and end group (use for arguments of commands and for grouping)
 *  - `#`: parameter
 *
 *  - `placeholder`: a placeholder value meant to be replaced by some actual value
 *  - `space`: one or more space characters (including tab, etc...)
 *
 * See: [TeX:289](http://tug.org/texlive/devsrc/Build/source/texk/web2c/tex.web)
 * @property {string} value
 * @property {string} type
 * @class module:core/lexer#Token
 * @private
 */
class Token {
    /**
     * @param {string} type One of the token types listed above.
     * @param {string|number} [value] The payload of the token, if any.
     */
    constructor(type, value) {
        this.type = type;
        this.value = value;
        // A closing brace must be reported as a `}` token, never as a literal.
        console.assert(!(type === 'literal' && value === '}'));
    }
}

/**
 * Given a LaTeX expression represented as a character string,
 * the Lexer class will scan and return Tokens for the lexical
 * units in the string.
 *
 * @param {string} s A string of LaTeX
 * @class Lexer
 * @class module:core/lexer#Lexer
 * @private
 */
class Lexer {
    constructor(s) {
        // `this.s` is either a plain string or an array of graphemes
        // (see the note in `scan()`); both are indexable and have a
        // `length`, and `this.pos` is an index into it.
        this.s = GraphemeSplitter.splitGraphemes(s);
        this.pos = 0;
    }

    /**
     * @return {boolean} True if we reached the end of the stream
     * @method module:core/lexer#Lexer#end
     * @private
     */
    end() {
        return this.pos >= this.s.length;
    }

    /**
     * Return the next char and advance
     * @return {?string} The next char, or `null` at the end of the stream
     * @method module:core/lexer#Lexer#get
     * @private
     */
    get() {
        return this.pos < this.s.length ? this.s[this.pos++] : null;
    }

    /**
     * Return the next char, but do not advance
     * @return {string} The next char, or `undefined` at the end of the stream
     * @method module:core/lexer#Lexer#peek
     * @private
     */
    peek() {
        return this.s[this.pos];
    }

    /**
     * Return the next substring matching regEx and advance.
     *
     * The pattern is run against the remainder of the stream and is
     * expected to be anchored with `^`.
     *
     * @param {RegEx} regEx
     * @return {?string} The matched text, or `null` if there is no match
     * @method module:core/lexer#Lexer#scan
     * @private
     */
    scan(regEx) {
        // this.s can either be a string, if it's made up only of ASCII chars
        // or an array of graphemes, if it's more complicated.
        const isString = typeof this.s === 'string';
        const rest = isString
            ? this.s.slice(this.pos)
            : this.s.slice(this.pos).join('');

        const match = regEx.exec(rest);
        if (!match) return null;

        if (isString) {
            this.pos += match[0].length;
        } else {
            // `pos` indexes graphemes, while the match length is counted in
            // UTF-16 code units: consume as many graphemes as are needed to
            // cover the matched text.
            let remaining = match[0].length;
            while (remaining > 0 && this.pos < this.s.length) {
                remaining -= this.s[this.pos].length;
                this.pos += 1;
            }
        }
        return match[0];
    }

    /**
     * Return true if next char is white space. Does not advance.
     *
     * @return {boolean}
     * @method module:core/lexer#Lexer#isWhiteSpace
     * @private
     */
    isWhiteSpace() {
        return WHITE_SPACE.test(this.s[this.pos]);
    }

    /**
     * Return a single token, or null, created from the lexer.
     *
     * `null` is returned at the end of the stream, and also when the stream
     * ends right after a lone `\`.
     *
     * @returns {?Token}
     * @method module:core/lexer#Lexer#makeToken
     * @private
     */
    makeToken() {
        // If we've reached the end, exit
        if (this.end()) return null;

        // Handle white space
        // Note that in text mode, spaces are significant and can't be coalesced.
        if (this.isWhiteSpace()) {
            this.get();
            return new Token('space');
        }

        let result = null;

        if (this.peek() === '\\') {
            // Is it a command?
            this.get(); // Skip the initial \
            if (!this.end()) {
                // A command is either a string of letters and asterisks...
                let command = this.scan(/^[a-zA-Z*]+/);
                // There are a few special commands that are handled here...
                if (command === 'bgroup') {
                    // Begin group, synonym for opening brace
                    result = new Token('{');
                } else if (command === 'egroup') {
                    // End group, synonym for closing brace
                    result = new Token('}');
                } else {
                    if (!command) {
                        // ... or a single non-letter character
                        command = this.get();
                    }
                    result = new Token('command', command);
                }
            }
        } else if (this.peek() === '{' || this.peek() === '}') {
            // Is it a group start/end?
            result = new Token(this.get());
        } else if (this.peek() === '#') {
            // This could be either a param token, or a literal # (used for
            // colorspecs, for example). A param token is a '#' followed by
            // - a digit 0-9 followed by a non-alpha, non-digit
            // - or '?'.
            // Otherwise, it's a literal '#'.
            this.get();

            let isParam = false;
            if (!this.end() && /[0-9?]/.test(this.peek())) {
                // Could be a param
                isParam = true;
                // Need to look ahead to the following char
                if (this.pos + 1 < this.s.length) {
                    isParam = /[^0-9A-Za-z]/.test(this.s[this.pos + 1]);
                }
            }

            if (isParam) {
                const next = this.get();
                const isDigit = next >= '0' && next <= '9';
                result = new Token('#', isDigit ? parseInt(next, 10) : '?');
            } else {
                // This also covers a '#' that is the very last character of
                // the stream: it is a literal, it must not be dropped.
                result = new Token('literal', '#');
            }
        } else if (this.peek() === '~') {
            // Spacing
            this.get();
            result = new Token('command', 'space');
        } else if (this.peek() === '$') {
            // Mode switch
            this.get();
            if (this.peek() === '$') {
                // $$
                this.get();
                result = new Token('$$');
            } else {
                // $
                result = new Token('$');
            }
        } else {
            result = new Token('literal', this.get());
        }

        return result;
    }
}

/**
 * Create Tokens from a stream of LaTeX
 *
 * @param {string} s - A string of LaTeX. It can include comments (with the `%`
 * marker) and multiple lines.
 * @return {Token[]}
 * @memberof module:core/lexer
 * @private
 */
function tokenize(s) {
    // Strip the comments from each line, then join the lines with a single
    // space so that a line break still separates tokens.
    const stream = s
        .toString()
        .split(/\r?\n/)
        .map((line) => {
            // Remove everything after a % (comment marker)
            // (but \% should be preserved...)
            const m = line.match(/((?:\\%)|[^%])*/);
            return m ? m[0] : '';
        })
        .join(' ');

    const result = [];
    const lex = new Lexer(stream);
    while (!lex.end()) {
        // makeToken() can return null (e.g. for a trailing `\`): skip those.
        const token = lex.makeToken();
        if (token) result.push(token);
    }
    return result;
}

export default {
    tokenize
};