// tex2typst: JavaScript library for converting TeX code to Typst
import { symbolMap } from "./map";
import { TexNode, TexSupsubData, TexToken, TexTokenType } from "./types";
import { assert } from "./util";
import { JSLex, Scanner } from "./jslex";
import { array_find } from "./generic";
const UNARY_COMMANDS = [
'sqrt',
'text',
'bar',
'bold',
'boldsymbol',
'ddot',
'dot',
'hat',
'mathbb',
'mathbf',
'mathcal',
'mathfrak',
'mathit',
'mathrm',
'mathscr',
'mathsf',
'mathtt',
'operatorname',
'overbrace',
'overline',
'pmb',
'rm',
'tilde',
'underbrace',
'underline',
'vec',
'widehat',
'widetilde',
'overleftarrow',
'overrightarrow',
'hspace',
];
const BINARY_COMMANDS = [
'frac',
'tfrac',
'binom',
'dbinom',
'dfrac',
'tbinom',
'overset',
];
const IGNORED_COMMANDS = [
'bigl', 'bigr',
'biggl', 'biggr',
'Bigl', 'Bigr',
'Biggl', 'Biggr',
];
const EMPTY_NODE: TexNode = new TexNode('empty', '');
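// Return how many braced arguments a TeX command takes: 1 for unary, 2 for binary, 0 otherwise.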
function get_command_param_num(command: string): number {
if (UNARY_COMMANDS.includes(command)) {
return 1;
} else if (BINARY_COMMANDS.includes(command)) {
return 2;
} else {
return 0;
}
}
const LEFT_CURLY_BRACKET: TexToken = new TexToken(TexTokenType.CONTROL, '{');
const RIGHT_CURLY_BRACKET: TexToken = new TexToken(TexTokenType.CONTROL, '}');
const LEFT_SQUARE_BRACKET: TexToken = new TexToken(TexTokenType.ELEMENT, '[');
const RIGHT_SQUARE_BRACKET: TexToken = new TexToken(TexTokenType.ELEMENT, ']');
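// Collect the run of whitespace/newline tokens starting at `start`.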
function eat_whitespaces(tokens: TexToken[], start: number): TexToken[] {
let pos = start;
while (pos < tokens.length && [TexTokenType.SPACE, TexTokenType.NEWLINE].includes(tokens[pos].type)) {
pos++;
}
return tokens.slice(start, pos);
}
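// If the token at `start` is a recognized delimiter (parenthesis, bracket, bar, \{, \}, '.' or a floor/ceil/angle command), return it; otherwise return null.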
function eat_parenthesis(tokens: TexToken[], start: number): TexToken | null {
const firstToken = tokens[start];
if (firstToken.type === TexTokenType.ELEMENT && ['(', ')', '[', ']', '|', '\\{', '\\}', '.'].includes(firstToken.value)) {
return firstToken;
} else if (firstToken.type === TexTokenType.COMMAND && ['lfloor', 'rfloor', 'lceil', 'rceil', 'langle', 'rangle'].includes(firstToken.value.slice(1))) {
return firstToken;
} else {
return null;
}
}
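// Count the consecutive prime (') tokens starting at `start`.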
function eat_primes(tokens: TexToken[], start: number): number {
let pos = start;
while (pos < tokens.length && tokens[pos].eq(new TexToken(TexTokenType.ELEMENT, "'"))) {
pos += 1;
}
return pos - start;
}
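// Return the index of the token that closes `leftToken` at `start`, tracking nesting depth; -1 if unmatched.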
function find_closing_match(tokens: TexToken[], start: number, leftToken: TexToken, rightToken: TexToken): number {
assert(tokens[start].eq(leftToken));
let count = 1;
let pos = start + 1;
while (count > 0) {
if (pos >= tokens.length) {
return -1;
}
if (tokens[pos].eq(leftToken)) {
count += 1;
} else if (tokens[pos].eq(rightToken)) {
count -= 1;
}
pos += 1;
}
return pos - 1;
}
const LEFT_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\left');
const RIGHT_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\right');
function find_closing_right_command(tokens: TexToken[], start: number): number {
return find_closing_match(tokens, start, LEFT_COMMAND, RIGHT_COMMAND);
}
const BEGIN_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\begin');
const END_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\end');
function find_closing_end_command(tokens: TexToken[], start: number): number {
return find_closing_match(tokens, start, BEGIN_COMMAND, END_COMMAND);
}
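// Remove the escaping backslash from characters like \{ \} \$ \& \# \_ \% inside text content.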
function unescape(str: string): string {
const chars = ['{', '}', '\\', '$', '&', '#', '_', '%'];
for (const char of chars) {
str = str.replaceAll('\\' + char, char);
}
return str;
}
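// Lexer rules consumed by JSLex: each regex pattern maps to a callback that turns the matched text into one or more TexToken values.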
const rules_map = new Map<string, (a: Scanner<TexToken>) => TexToken | TexToken[]>([
[
String.raw`\\(text|operatorname|begin|end|hspace){.+?}`, (s) => {
const text = s.text()!;
const command = text.substring(0, text.indexOf('{'));
const text_inside = text.substring(text.indexOf('{') + 1, text.lastIndexOf('}'));
return [
new TexToken(TexTokenType.COMMAND, command),
new TexToken(TexTokenType.CONTROL, '{'),
new TexToken(TexTokenType.TEXT, unescape(text_inside)),
new TexToken(TexTokenType.CONTROL, '}')
]
}
],
[String.raw`%[^\n]*`, (s) => new TexToken(TexTokenType.COMMENT, s.text()!.substring(1))],
[String.raw`[{}_^&]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
[String.raw`\\[\\,:; ]`, (s) => new TexToken(TexTokenType.CONTROL, s.text()!)],
[String.raw`\r?\n`, (_s) => new TexToken(TexTokenType.NEWLINE, "\n")],
[String.raw`\s+`, (s) => new TexToken(TexTokenType.SPACE, s.text()!)],
[String.raw`\\[{}%$&#_|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
[String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])\s*([0-9a-zA-Z])`, (s) => {
const text = s.text()!;
const regex = RegExp(String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])\s*([0-9a-zA-Z])`);
const match = text.match(regex);
assert(match !== null);
const command = match![1];
if (BINARY_COMMANDS.includes(command.substring(1))) {
const arg1 = match![2].trimStart();
const arg2 = match![3];
return [
new TexToken(TexTokenType.COMMAND, command),
new TexToken(TexTokenType.ELEMENT, arg1),
new TexToken(TexTokenType.ELEMENT, arg2),
];
} else {
s.reject();
return [];
}
}],
[String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])`, (s) => {
const text = s.text()!;
const regex = RegExp(String.raw`(\\[a-zA-Z]+)(\s*\d|\s+[a-zA-Z])`);
const match = text.match(regex);
assert(match !== null);
const command = match![1];
if (UNARY_COMMANDS.includes(command.substring(1))) {
const arg1 = match![2].trimStart();
return [
new TexToken(TexTokenType.COMMAND, command),
new TexToken(TexTokenType.ELEMENT, arg1),
];
} else {
s.reject();
return [];
}
}],
[String.raw`\\[a-zA-Z]+`, (s) => {
const command = s.text()!;
return [ new TexToken(TexTokenType.COMMAND, command), ];
}],
// Numbers like "123", "3.14"
[String.raw`[0-9]+(\.[0-9]+)?`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
[String.raw`[a-zA-Z]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
[String.raw`[+\-*/='<>!.,;:?()\[\]|]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
// non-ASCII characters
[String.raw`[^\x00-\x7F]`, (s) => new TexToken(TexTokenType.ELEMENT, s.text()!)],
[String.raw`.`, (s) => new TexToken(TexTokenType.UNKNOWN, s.text()!)],
]);
const spec = {
"start": rules_map
};
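// Run the lexer over a TeX source string and return its token stream.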
export function tokenize_tex(input: string): TexToken[] {
const lexer = new JSLex<TexToken>(spec);
return lexer.collect(input);
}
export class LatexParserError extends Error {
constructor(message: string) {
super(message);
this.name = 'LatexParserError';
}
}
type ParseResult = [TexNode, number];
const SUB_SYMBOL:TexToken = new TexToken(TexTokenType.CONTROL, '_');
const SUP_SYMBOL:TexToken = new TexToken(TexTokenType.CONTROL, '^');
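// Recursive-descent parser that turns a TexToken stream into a TexNode tree.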
export class LatexParser {
space_sensitive: boolean;
newline_sensitive: boolean;
constructor(space_sensitive: boolean = false, newline_sensitive: boolean = true) {
this.space_sensitive = space_sensitive;
this.newline_sensitive = newline_sensitive;
}
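// Parse a whole token stream, giving a leading or embedded \displaystyle special treatment.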
parse(tokens: TexToken[]): TexNode {
const idx = array_find(tokens, new TexToken(TexTokenType.COMMAND, '\\displaystyle'));
if (idx === -1) {
// no \displaystyle, normal execution path
const [tree, _] = this.parseGroup(tokens, 0, tokens.length);
return tree;
} else if (idx === 0) {
// \displaystyle at the beginning. Wrap the whole thing in \displaystyle
const [tree, _] = this.parseGroup(tokens, 1, tokens.length);
return new TexNode('unaryFunc', '\\displaystyle', [tree]);
} else {
// \displaystyle somewhere in the middle. Split the expression to two parts
const [tree1, _1] = this.parseGroup(tokens, 0, idx);
const [tree2, _2] = this.parseGroup(tokens, idx + 1, tokens.length);
const display = new TexNode('unaryFunc', '\\displaystyle', [tree2]);
return new TexNode('ordgroup', '', [tree1, display]);
}
}
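// Parse tokens[start, end) into a single node (an 'ordgroup' if there is more than one expression), skipping insignificant whitespace.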
parseGroup(tokens: TexToken[], start: number, end: number): ParseResult {
const results: TexNode[] = [];
let pos = start;
while (pos < end) {
const [res, newPos] = this.parseNextExpr(tokens, pos);
pos = newPos;
if(res.type === 'whitespace') {
if (!this.space_sensitive && res.content.replace(/ /g, '').length === 0) {
continue;
}
if (!this.newline_sensitive && res.content === '\n') {
continue;
}
}
if (res.type === 'control' && res.content === '&') {
throw new LatexParserError('Unexpected & outside of an alignment');
}
results.push(res);
}
let node: TexNode;
if (results.length === 1) {
node = results[0];
} else {
node = new TexNode('ordgroup', '', results);
}
return [node, end + 1];
}
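// Parse the next expression together with any subscript, superscript, and primes attached to it.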
parseNextExpr(tokens: TexToken[], start: number): ParseResult {
let [base, pos] = this.parseNextExprWithoutSupSub(tokens, start);
let sub: TexNode | null = null;
let sup: TexNode | null = null;
let num_prime = 0;
num_prime += eat_primes(tokens, pos);
pos += num_prime;
if (pos < tokens.length && tokens[pos].eq(SUB_SYMBOL)) {
[sub, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
// advance only past the primes just consumed (num_prime also counts earlier primes)
const primes_after_sub = eat_primes(tokens, pos);
num_prime += primes_after_sub;
pos += primes_after_sub;
if (pos < tokens.length && tokens[pos].eq(SUP_SYMBOL)) {
[sup, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
if (eat_primes(tokens, pos) > 0) {
throw new LatexParserError('Double superscript');
}
}
} else if (pos < tokens.length && tokens[pos].eq(SUP_SYMBOL)) {
[sup, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
if (eat_primes(tokens, pos) > 0) {
throw new LatexParserError('Double superscript');
}
if (pos < tokens.length && tokens[pos].eq(SUB_SYMBOL)) {
[sub, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
if (eat_primes(tokens, pos) > 0) {
throw new LatexParserError('Double superscript');
}
}
}
if (sub !== null || sup !== null || num_prime > 0) {
const res: TexSupsubData = { base };
if (sub) {
res.sub = sub;
}
if (num_prime > 0) {
res.sup = new TexNode('ordgroup', '', []);
for (let i = 0; i < num_prime; i++) {
res.sup.args!.push(new TexNode('element', "'"));
}
if (sup) {
res.sup.args!.push(sup);
}
if (res.sup.args!.length === 1) {
res.sup = res.sup.args![0];
}
} else if (sup) {
res.sup = sup;
}
return [new TexNode('supsub', '', [], res), pos];
} else {
return [base, pos];
}
}
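// Parse a single expression (element, text, command, braced group, ...) without consuming a following _ or ^.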
parseNextExprWithoutSupSub(tokens: TexToken[], start: number): ParseResult {
if (start >= tokens.length) {
return [EMPTY_NODE, start];
}
const firstToken = tokens[start];
switch (firstToken.type) {
case TexTokenType.ELEMENT:
return [new TexNode('element', firstToken.value), start + 1];
case TexTokenType.TEXT:
return [new TexNode('text', firstToken.value), start + 1];
case TexTokenType.COMMENT:
return [new TexNode('comment', firstToken.value), start + 1];
case TexTokenType.SPACE:
case TexTokenType.NEWLINE:
return [new TexNode('whitespace', firstToken.value), start + 1];
case TexTokenType.COMMAND:
const commandName = firstToken.value.slice(1);
if (IGNORED_COMMANDS.includes(commandName)) {
return this.parseNextExprWithoutSupSub(tokens, start + 1);
}
if (firstToken.eq(BEGIN_COMMAND)) {
return this.parseBeginEndExpr(tokens, start);
} else if (firstToken.eq(LEFT_COMMAND)) {
return this.parseLeftRightExpr(tokens, start);
} else {
return this.parseCommandExpr(tokens, start);
}
case TexTokenType.CONTROL:
const controlChar = firstToken.value;
switch (controlChar) {
case '{':
const posClosingBracket = find_closing_match(tokens, start, LEFT_CURLY_BRACKET, RIGHT_CURLY_BRACKET);
if(posClosingBracket === -1) {
throw new LatexParserError("Unmatched '{'");
}
return this.parseGroup(tokens, start + 1, posClosingBracket);
case '}':
throw new LatexParserError("Unmatched '}'");
case '\\\\':
case '\\,':
case '\\:':
case '\\;':
return [new TexNode('control', controlChar), start + 1];
case '\\ ':
return [new TexNode('control', '\\:'), start + 1];
case '_':
case '^':
return [EMPTY_NODE, start];
case '&':
return [new TexNode('control', '&'), start + 1];
default:
throw new LatexParserError('Unknown control sequence');
}
default:
throw new LatexParserError('Unknown token type');
}
}
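// Parse a command and its arguments, e.g. \alpha, \sqrt[3]{x}, or \frac{a}{b}.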
parseCommandExpr(tokens: TexToken[], start: number): ParseResult {
assert(tokens[start].type === TexTokenType.COMMAND);
const command = tokens[start].value; // command name starts with a \
let pos = start + 1;
if (['left', 'right', 'begin', 'end'].includes(command.slice(1))) {
throw new LatexParserError('Unexpected command: ' + command);
}
const paramNum = get_command_param_num(command.slice(1));
switch (paramNum) {
case 0:
if (!symbolMap.has(command.slice(1))) {
return [new TexNode('unknownMacro', command), pos];
}
return [new TexNode('symbol', command), pos];
case 1: {
// TODO: JavaScript returns undefined instead of throwing when an index is out of bounds,
// so bounds checks like this one should exist everywhere. This is rough.
if(pos >= tokens.length) {
throw new LatexParserError('Expecting argument for ' + command);
}
if (command === '\\sqrt' && pos < tokens.length && tokens[pos].eq(LEFT_SQUARE_BRACKET)) {
const posLeftSquareBracket = pos;
const posRightSquareBracket = find_closing_match(tokens, pos, LEFT_SQUARE_BRACKET, RIGHT_SQUARE_BRACKET);
if (posRightSquareBracket === -1) {
throw new LatexParserError('No matching right square bracket for [');
}
const [exponent, _] = this.parseGroup(tokens, posLeftSquareBracket + 1, posRightSquareBracket);
const [arg1, newPos] = this.parseNextArg(tokens, posRightSquareBracket + 1);
return [new TexNode('unaryFunc', command, [arg1], exponent), newPos];
} else if (command === '\\text') {
if (pos + 2 >= tokens.length) {
throw new LatexParserError('Expecting content for \\text command');
}
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === TexTokenType.TEXT);
assert(tokens[pos + 2].eq(RIGHT_CURLY_BRACKET));
const text = tokens[pos + 1].value;
return [new TexNode('text', text), pos + 3];
}
let [arg1, newPos] = this.parseNextArg(tokens, pos);
return [new TexNode('unaryFunc', command, [arg1]), newPos];
}
case 2: {
const [arg1, pos1] = this.parseNextArg(tokens, pos);
const [arg2, pos2] = this.parseNextArg(tokens, pos1);
return [new TexNode('binaryFunc', command, [arg1, arg2]), pos2];
}
default:
throw new Error('Invalid number of parameters');
}
}
/*
Extract the next non-whitespace argument from the token stream,
so that `\frac{12} 3` is parsed as
TexCommand{ content: '\frac', args: ['12', '3'] }
rather than
TexCommand{ content: '\frac', args: ['12', ' '] }, TexElement{ content: '3' }
*/
parseNextArg(tokens: TexToken[], start: number): ParseResult {
let pos = start;
let arg: TexNode | null = null;
while (pos < tokens.length) {
let node: TexNode;
[node, pos] = this.parseNextExprWithoutSupSub(tokens, pos);
if (node.type !== 'whitespace') {
arg = node;
break;
}
}
if (arg === null) {
throw new LatexParserError('Expecting argument but token stream ended');
}
return [arg, pos];
}
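// Parse a \left ... \right pair and its body into a 'leftright' node.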
parseLeftRightExpr(tokens: TexToken[], start: number): ParseResult {
assert(tokens[start].eq(LEFT_COMMAND));
let pos = start + 1;
pos += eat_whitespaces(tokens, pos).length;
if (pos >= tokens.length) {
throw new LatexParserError('Expecting delimiter after \\left');
}
const leftDelimiter = eat_parenthesis(tokens, pos);
if (leftDelimiter === null) {
throw new LatexParserError('Invalid delimiter after \\left');
}
pos++;
const exprInsideStart = pos;
const idx = find_closing_right_command(tokens, start);
if (idx === -1) {
throw new LatexParserError('No matching \\right');
}
const exprInsideEnd = idx;
pos = idx + 1;
pos += eat_whitespaces(tokens, pos).length;
if (pos >= tokens.length) {
throw new LatexParserError('Expecting delimiter after \\right');
}
const rightDelimiter = eat_parenthesis(tokens, pos);
if (rightDelimiter === null) {
throw new LatexParserError('Invalid delimiter after \\right');
}
pos++;
const [body, _] = this.parseGroup(tokens, exprInsideStart, exprInsideEnd);
const args: TexNode[] = [
new TexNode('element', leftDelimiter.value),
body,
new TexNode('element', rightDelimiter.value)
];
const res = new TexNode('leftright', '', args);
return [res, pos];
}
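// Parse a \begin{env} ... \end{env} block into a 'beginend' node whose data holds the parsed rows.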
parseBeginEndExpr(tokens: TexToken[], start: number): ParseResult {
assert(tokens[start].eq(BEGIN_COMMAND));
let pos = start + 1;
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === TexTokenType.TEXT);
assert(tokens[pos + 2].eq(RIGHT_CURLY_BRACKET));
const envName = tokens[pos + 1].value;
pos += 3;
pos += eat_whitespaces(tokens, pos).length; // ignore whitespaces and '\n' after \begin{envName}
const exprInsideStart = pos;
const endIdx = find_closing_end_command(tokens, start);
if (endIdx === -1) {
throw new LatexParserError('No matching \\end');
}
const exprInsideEnd = endIdx;
pos = endIdx + 1;
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === TexTokenType.TEXT);
assert(tokens[pos + 2].eq(RIGHT_CURLY_BRACKET));
if (tokens[pos + 1].value !== envName) {
throw new LatexParserError('Mismatched \\begin and \\end environments');
}
pos += 3;
const exprInside = tokens.slice(exprInsideStart, exprInsideEnd);
// ignore spaces and '\n' before \end{envName}
while(exprInside.length > 0 && [TexTokenType.SPACE, TexTokenType.NEWLINE].includes(exprInside[exprInside.length - 1].type)) {
exprInside.pop();
}
const body = this.parseAligned(exprInside);
const res = new TexNode('beginend', envName, [], body);
return [res, pos];
}
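// Split the body of an alignment environment into rows (separated by \\) and cells (separated by &).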
parseAligned(tokens: TexToken[]): TexNode[][] {
let pos = 0;
const allRows: TexNode[][] = [];
let row: TexNode[] = [];
allRows.push(row);
let group = new TexNode('ordgroup', '', []);
row.push(group);
while (pos < tokens.length) {
const [res, newPos] = this.parseNextExpr(tokens, pos);
pos = newPos;
if (res.type === 'whitespace') {
if (!this.space_sensitive && res.content.replace(/ /g, '').length === 0) {
continue;
}
if (!this.newline_sensitive && res.content === '\n') {
continue;
}
}
if (res.type === 'control' && res.content === '\\\\') {
row = [];
group = new TexNode('ordgroup', '', []);
row.push(group);
allRows.push(row);
} else if (res.type === 'control' && res.content === '&') {
group = new TexNode('ordgroup', '', []);
row.push(group);
} else {
group.args!.push(res);
}
}
return allRows;
}
}
// Remove all whitespace before or after _ or ^
function passIgnoreWhitespaceBeforeScriptMark(tokens: TexToken[]): TexToken[] {
const is_script_mark = (token: TexToken) => token.eq(SUB_SYMBOL) || token.eq(SUP_SYMBOL);
let out_tokens: TexToken[] = [];
for (let i = 0; i < tokens.length; i++) {
if (tokens[i].type === TexTokenType.SPACE && i + 1 < tokens.length && is_script_mark(tokens[i + 1])) {
continue;
}
if (tokens[i].type === TexTokenType.SPACE && i - 1 >= 0 && is_script_mark(tokens[i - 1])) {
continue;
}
out_tokens.push(tokens[i]);
}
return out_tokens;
}
// expand custom tex macros
function passExpandCustomTexMacros(tokens: TexToken[], customTexMacros: {[key: string]: string}): TexToken[] {
let out_tokens: TexToken[] = [];
for (const token of tokens) {
if (token.type === TexTokenType.COMMAND && customTexMacros[token.value]) {
const expanded_tokens = tokenize_tex(customTexMacros[token.value]);
out_tokens = out_tokens.concat(expanded_tokens);
} else {
out_tokens.push(token);
}
}
return out_tokens;
}
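// Tokenize `tex`, run the whitespace and custom-macro passes, and parse the result.
// For example, parseTex("\\frac{a}{b}", {}) parses a simple fraction with no custom macros.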
export function parseTex(tex: string, customTexMacros: {[key: string]: string}): TexNode {
const parser = new LatexParser();
let tokens = tokenize_tex(tex);
tokens = passIgnoreWhitespaceBeforeScriptMark(tokens);
tokens = passExpandCustomTexMacros(tokens, customTexMacros);
return parser.parse(tokens);
}