latex-parser
Version:
468 lines (424 loc) • 22.6 kB
text/typescript
import {Context} from "./Context";
import {LatexStyle} from "../../LatexStyle/index";
import {Token} from "../../SyntaxTree/Token/index";
import {Parameter} from "../../LatexStyle/Item/Parameter";
import {EnvironmentToken} from "../../SyntaxTree/Token/EnvironmentToken";
import {EnvironmentBodyToken} from "../../SyntaxTree/Token/EnvironmentBodyToken";
import {EnvironmentAndPackage} from "../../LatexStyle/Item/Environment";
import {Command} from "../../LatexStyle/Item/Symbol/Command";
import {SpaceToken} from "../../SyntaxTree/Token/SpaceToken";
import {SymbolToken} from "../../SyntaxTree/Token/SymbolToken";
import {ParameterToken} from "../../SyntaxTree/Token/ParameterToken";
import {CommandToken} from "../../SyntaxTree/Token/CommandToken";
import {Lexeme} from "../Lexeme";
import {Symbol as SymbolItem} from "../../LatexStyle/Item/Symbol/index";
import {isNumber, isString, mustNotBeUndefined} from "../../../../src/Utils";
/**
* Parse the next token
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle the latex style
* @return {?Token} the parsed token or undefined if the token cannot be parsed
* @private
*/
function parseToken_(context: Context, latexStyle: LatexStyle): Token | undefined {
let token: Token | undefined = parseSpaceToken_(context); // collect comments and try to parse a space token
if (!token) { // if cannot parse a space token
if (context.position >= context.source.length) return undefined;
const contextBackup = context.copy(); // backup the current context
if (!(token = parseEnvironmentToken_(context, latexStyle))) { // if cannot parse an environment token
contextBackup.copy(context); // restore the context
if (!(token = parseCommandToken_(context, latexStyle))) { // if cannot parse a command token
contextBackup.copy(context); // restore the context
if (!(token = parseSymbolsToken_(context, latexStyle))) { // if cannot parse a symbol token
return undefined; // no token can be parsed
}
}
}
}
//noinspection JSCheckFunctionSignatures
processParsedToken_(context, token);
//noinspection JSValidateTypes
return token; // return the parsed token
}
/**
* Parse a parameter token
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle the latex style
* @param {!Parameter} parameter the symbol or command parameter description
* @param {string=} opt_endLabel
* the parameter end label or undefined if there should be a single token
* @return {?ParameterToken} the parsed parameter token or undefined if cannot parse
* @private
*/
function parseParameterToken_(context: Context, latexStyle: LatexStyle, parameter: Parameter, opt_endLabel?: string) {
const currentTokenBackup = context.currentToken; // store the current token
//noinspection JSUnresolvedFunction,JSUnresolvedVariable
context.updateState(parameter.operations); // update the LaTeX state
if (opt_endLabel === undefined) { // if the parameter must be parsed as a single token
// has the param space prefix or not
const spacePrefixState = parseSpaceToken_(context) !== undefined;
if (context.source[context.position] === "{") { // if the parameter is bounded by brackets
// create the parameter token
context.currentToken =
new ParameterToken({hasBrackets: true, hasSpacePrefix: spacePrefixState});
++context.position; // go to the sources next char
++context.charNumber; // go to the current line next char
// exit if cannot parse until the closing bracket
if (!parseUntilLabel_(context, latexStyle, "}", parameter.lexeme)) return undefined;
++context.position; // skip the bracket in the sources
++context.charNumber; // skip the bracket in the current line
} else { // if the parameter is't bounded by brackets
// create the parameter token
context.currentToken =
new ParameterToken({hasBrackets: false, hasSpacePrefix: spacePrefixState});
// exit if cannot parse a parameter token
if (parseToken_(context, latexStyle) === undefined) return undefined;
}
} else { // if the parameter must be parsed until the end label
// create the parameter token
context.currentToken =
new ParameterToken({hasBrackets: false, hasSpacePrefix: false});
// return if cannot parse
if (!parseUntilLabel_(context, latexStyle, opt_endLabel, parameter.lexeme)) return undefined;
}
const parameterToken = context.currentToken; // the parsed parameter token
context.currentToken = currentTokenBackup; // restore the current token
//noinspection JSCheckFunctionSignatures
processParsedToken_(context, parameterToken);
//noinspection JSValidateTypes
return parameterToken;
}
/**
* Parse an environment token
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle latex style
* @return {?EnvironmentToken} the parsed token or undefined if cannot parse
* @private
*/
function parseEnvironmentToken_(context: Context, latexStyle: LatexStyle): EnvironmentToken | undefined {
if (context.source.substring(context.position).indexOf("\\begin") !== 0) return undefined;
context.position += 6; // just after "\begin"
parseSpaceToken_(context); // skip spaces
// try to obtain the environment name
const nameMatch = context.source.substring(context.position).match(/^{([\w@]+\*?)}/);
if (!nameMatch) return undefined; // exit if cannot bet the environment name
const name = nameMatch[1]; // the environment name
context.position += nameMatch[0].length; // skip the environment name in the sources
context.charNumber += nameMatch[0].length; // skip the environment name in the current line
const currentTokenBackup = context.currentToken; // store the current token
// try to get the corresponding environment
const environment: EnvironmentAndPackage = latexStyle.environments(context.currentState, name)[0];
const environmentToken = context.currentToken = environment ? // the environment token
new EnvironmentToken({environment: environment.environment}) :
new EnvironmentToken({name: name});
// TODO unknown environment notification
// try to parse the environment begin command
const symbols: Command[] = latexStyle.commands(context.currentState, name);
let beginCommandToken: Token | undefined = parsePatterns_(context, latexStyle, symbols);
if (beginCommandToken === undefined) { // if cannot parse a command
// TODO notification about the unrecognized command
// generate unrecognized command token
beginCommandToken = new CommandToken({name: name});
}
//noinspection JSCheckFunctionSignatures
processParsedToken_(context, beginCommandToken);
const environmentBodyToken = context.currentToken = new EnvironmentBodyToken();
const endFound = parseUntilLabel_(context, latexStyle, `\\end\{${name}}`); // try to get to the end
context.currentToken = environmentToken;
processParsedToken_(context, environmentBodyToken); // process the body token
let endCommandToken: Token | undefined = undefined; // the environment end command token
if (endFound) { // if the environment end was reached
context.position += name.length + 6; // skip the environment name in the sources
context.charNumber += name.length + 6; // skip the environment name in the current line
endCommandToken = parsePatterns_(context, latexStyle, latexStyle.commands(context.currentState, `end${name}`));
} else { // if cannot find the end of the environment
// TODO no environment end notification
}
if (endCommandToken === undefined) { // if cannot parse a command
// TODO notification about the unrecognized command
// generate unrecognized command token
endCommandToken = new CommandToken({name: "end" + name});
}
processParsedToken_(context, endCommandToken); // process the end command token
context.currentToken = currentTokenBackup; // restore the current token
return environmentToken;
}
/**
* Parse a command token.
*
* By the rules of TeX syntax, the "name" of a macro that starts with a \ (backslash) character must either
* - consist of a single non-alphabetical character.
* Some examples:
* \, (insert thin space),
* \% ( the % character),
* \\ (insert line break),
* \[ (open display math), and
* \) (close inline math).
* or
* - contain only uppercase and lowercase alphabetical characters: a-z and A-Z. No numerals, and no other characters
* belonging to non-letter categories either. (Well, there are certain ways of assigning "letter-category" status to
* non-letter characters, but that's a topic for a different discussion.)
* (https://tex.stackexchange.com/questions/66666/command-macro-name-cannot-include-numbers-and-symbols/66695#66695)
*
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle latex style
* @return {?CommandToken} the parsed token or undefined if cannot parse
* @private
*
*/
function parseCommandToken_(context: Context, latexStyle: LatexStyle): Token | undefined {
// try to find a command name
// TODO [a-zA-Z] instead of [\w]?
const cmdMatch = context.source.substring(context.position).match(/^\\((?:[\w@]+\*?)|(?:[^\w]))/);
if (!cmdMatch)
return undefined; // exit if cannot find a command name
context.position += cmdMatch[0].length; // set position just after the command name
context.charNumber += cmdMatch[0].length; // skip all the command name chars
// try to parse a command token
let token: Token | undefined = parsePatterns_(context, latexStyle, latexStyle.commands(context.currentState, cmdMatch[1]));
if (token === undefined) { // if cannot parse a command token
// TODO notification about the unrecognized command
// generate unrecognized command token
token = new CommandToken({name: cmdMatch[1]});
}
//noinspection JSValidateTypes
return token;
}
/**
* Parse symbols for a token
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle latex style
* @return {?Token} the parsed token or undefined if cannot parse
* @private
*/
function parseSymbolsToken_(context: Context, latexStyle: LatexStyle): Token | undefined {
// get the available symbols
const sourceCharacter = context.source[context.position]; // the current sources character
// get the symbols started with the current sources character
let token = parsePatterns_(context, latexStyle, latexStyle.symbols(context.currentState, sourceCharacter));
if (token === undefined) { // if cannot parse a symbol token
// TODO notification about the unrecognized symbol
++context.position; // go to the next sources character
// go to the next line character (the line is the same, \n was parsed for a space token)
++context.charNumber;
// generate unrecognized symbol token
token = new SymbolToken({pattern: sourceCharacter});
} else { // if the token was parsed
// TODO parse words and numbers
}
return token;
}
/**
* Try to parse a symbol pattern
* @param {!Context} context the parsing context// generate unrecognized symbol token
* @param {!LatexStyle} latexStyle the latex style
* @param {!Array.<!Symbol>} symbols the symbol or command descriptions in the priority descending order
* @return {?Token} the parsed symbol or command token or undefined if cannot parse
* @private
*/
function parsePatterns_(context: Context, latexStyle: LatexStyle, symbols: SymbolItem[]): Token | undefined {
const contextBackup = context.copy(); // backup the current context
let token: Token | undefined = undefined; // the parsed token
// TODO not how some() is meant to be used...?
symbols.some(symbol => { // for all the symbols until the parsing success
// stop if the token was parsed
if (token = parsePattern_(context, latexStyle, symbol)) {
return true;
} else {
contextBackup.copy(context); // restore the context
return false; // go to the next symbol
}
});
return token;
}
/**
* Try to parse a symbol pattern
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle latex style
* @param {!Array.<!Symbol>} symbol the symbol or command description
* @return {?Token} the parsed symbol or command token or undefined if cannot parse
* @private
*/
function parsePattern_(context: Context, latexStyle: LatexStyle, symbol: SymbolItem): Token | undefined {
const currentTokenBackup = context.currentToken; // store the current token
// if a command description is given
context.currentToken = symbol instanceof Command ?
new CommandToken({command: symbol}) : // generate a command token
new SymbolToken({symbol: symbol}); // generate a symbol token
const patternComponents = symbol.patternComponents; // the symbol pattern components
const nPatternComponents = patternComponents.length; // the pattern componen number
let iPatternComponent = 0; // the pattern component iterator
// for all the pattern components
for (; iPatternComponent < nPatternComponents; ++iPatternComponent) {
const patternComponent = patternComponents[iPatternComponent]; // the pattern component
if (isNumber(patternComponent)) { // if a parameter is expected
const parameter: Parameter | undefined = symbol.parameter(patternComponent); // the parameter description
// try to get the end label for the parameter
const parameterEndLabel = patternComponents[iPatternComponent + 1];
if (typeof parameterEndLabel === "string") { // if there is a end label
// if can parse the parameter token
if (parseParameterToken_(context, latexStyle, mustNotBeUndefined(parameter), parameterEndLabel)) {
// exit if there is no the end label at the positions
if (context.source.substring(context.position).indexOf(parameterEndLabel) !== 0) return undefined;
context.position += parameterEndLabel.length; // skip the end label in the sources
context.charNumber += parameterEndLabel.length; // skip the end label in the line
++iPatternComponent; // skip the end label in the pattern
continue; // go to the next pattern component
}
} else { // if there is no a end label
// go to the next pattern char if can parse the parameter token
if (parseParameterToken_(context, latexStyle, mustNotBeUndefined(parameter))) continue;
}
}
else if (isString(patternComponent)) {
while (parseCommentLine_(context)) {
} // skip all the comments
// if the sources fragment is equal the pattern component
if (context.source.substring(context.position).indexOf(patternComponent) === 0) {
context.position += patternComponent.length; // skip the pattern component in the sources
context.charNumber += patternComponent.length; // skip the pattern component in the line
continue; // go to the next pattern component
}
} else if (parseSpaceToken_(context))
continue;
break; // stop parsing if there was no continue call
}
// return if the pattern parsing was broken
if (iPatternComponent < nPatternComponents) return undefined;
const parsedToken = context.currentToken; // the parsed token to return
context.currentToken = currentTokenBackup; // restore the current token
//noinspection JSUnresolvedFunction,JSUnresolvedVariable
context.updateState(symbol.operations); // update the LaTeX state
return parsedToken;
}
/**
* Parse tokens until the label
* @param {!Context} context the parsing context
* @param {!LatexStyle} latexStyle latex style
* @param {string} endLabel the label to parse until
* @param {Lexeme=} opt_lexeme the lexeme of the single token to parse
* @return {boolean} true if the parsing was successful, false otherwise
* @private
*/
function parseUntilLabel_(context: Context, latexStyle: LatexStyle, endLabel: string, opt_lexeme?: Lexeme) {
switch (opt_lexeme) {
// TODO parse special lexemes
default: {
// while not reached the label
while (context.source.substring(context.position).indexOf(endLabel) !== 0) {
if (context.position >= context.source.length) { // if there is no more sources
// TODO notification about unexpected sources end
return false;
}
parseToken_(context, latexStyle);
}
return true;
}
}
}
/**
* Parse a comment line
* @param {!Context} context the parsing context
* @return {boolean} true if there was a comment line, false otherwise
* @private
*/
function parseCommentLine_(context: Context): boolean {
// try to find a comment int the sources tail
const commentMatch = context.source.substring(context.position).match(/^%([^\n]*)(\n[ \t]*)?/);
if (!commentMatch) return false; // return if there is no comment at this position
context.comments.push(commentMatch[1]); // store the comment string
context.position += commentMatch[0].length; // position just after the comment
if (!commentMatch[2]) { // if there were no line breaks
context.charNumber += commentMatch[0].length; // go to the last char
} else { // if there was a line break
++context.lineNumber; // one more line
context.charNumber = commentMatch[2].length - 1; // skip all the space chars in the new line
}
return true;
}
/**
* Fill the parsed token position, comments and parent
* @param {!Context} context the parsing context
* @param {!Token} token the token to process
* @private
*/
function processParsedToken_(context: Context, token: Token) {
// TODO process comments and position
if (context.currentToken) { // if there is a current token
// console.log(context.currentToken.toString())
//noinspection JSUnresolvedFunction
context.currentToken.insertChildSubtree(token); // store this token as a child of the current
}
}
/**
* Parse space for a token (space or paragraph separator)
* @param {!Context} context the parsing context
* @return {?SpaceToken} the parsed token or undefined if cannot parse a space token
* @private
*/
function parseSpaceToken_(context: Context): SpaceToken | undefined {
let isSpace = false; // true is the sources fragment is a space token, false otherwise
let nLineBreaks = 0; // number of parsed line breaks
while (context.position < context.source.length) { // while there is something to parse
// go to the next iteration if there was a comment
if (parseCommentLine_(context)) continue;
switch (context.source[context.position]) { // depend on the sources current character
case " ":
case "\t": // if a space or a tabular
isSpace = true; // and one more parsed char
++context.position; // go to the next sources char
++context.charNumber; // the next char of the sources line
continue;
case "\n": // if a line break
isSpace = true; // and one more parsed char
++nLineBreaks; // one more parsed line
++context.position; // go to the next sources char
++context.lineNumber; // the next sources line
context.charNumber = 0; // the first char of the line
continue; // go to the next iteration
}
break; // stop if not a space char
}
// create a space token if needed
return isSpace ? new SpaceToken({lineBreakCount: nLineBreaks}) : undefined;
}
/**
* LaTeX parser structure
* @class
* @property {!LatexStyle} latexStyle latexStyle - The LaTeX style description to be used for parsing
*/
export class LatexParser {
latexStyle: LatexStyle;
/**
* Constructor
* @param {!LatexStyle} latexStyle latexStyle LaTeX style description to be used for parsing
*/
constructor(latexStyle: LatexStyle) {
if (!(latexStyle instanceof LatexStyle)) throw new TypeError(`"latexStyle" isn't a LatexStyle instance`);
this.latexStyle = latexStyle;
}
/**
* Parse LaTeX sources
* @param {string} source the sources to parse
* @param {(!Context|undefined)} opt_context the parsing context
* @return {!Array.<!Token>} the list of the parsed tokens
*/
parse(source: string, opt_context?: Context): Token[] {
if (typeof source !== "string") throw new TypeError(`"sources" isn't a string`);
let context;
if (opt_context === undefined) { // if the parsing context isn't defined
context = new Context(source); // create the context
} else if (opt_context instanceof Context) { // if the parsing context is defined
context = opt_context;
context.source += source; // update the sources
} else { // if unexpected context type
throw new TypeError(`"context" isn't a LatexParser.Context instance`);
}
const parsedTokens: Token[] = []; // the list of the parsed tokens
// store the parsed token, stop when cannot parse a token
let parsedToken: Token | undefined;
while (parsedToken = parseToken_(context, this.latexStyle))
parsedTokens.push(parsedToken);
return parsedTokens;
}
}