tex2typst
Version:
JavaScript library for converting TeX code to Typst
527 lines (462 loc) • 21 kB
text/typescript
import { TexBeginEnd, TexFuncCall, TexLeftRight, TexNode, TexGroup, TexSupSub, TexSupsubData, TexText, TexToken, TexTokenType } from "./tex-types";
import { assert } from "./utils";
import { array_join, array_split } from "./generic";
import { TEX_BINARY_COMMANDS, TEX_UNARY_COMMANDS, tokenize_tex } from "./tex-tokenizer";
const IGNORED_COMMANDS = [
'bigl', 'bigr', 'bigm',
'biggl', 'biggr', 'biggm',
'Bigl', 'Bigr', 'Bigm',
'Biggl', 'Biggr', 'Biggm'
];
const EMPTY_NODE: TexNode = TexToken.EMPTY.toNode();
function get_command_param_num(command: string): number {
if (TEX_UNARY_COMMANDS.includes(command)) {
return 1;
} else if (TEX_BINARY_COMMANDS.includes(command)) {
return 2;
} else {
return 0;
}
}
const LEFT_CURLY_BRACKET: TexToken = new TexToken(TexTokenType.CONTROL, '{');
const RIGHT_CURLY_BRACKET: TexToken = new TexToken(TexTokenType.CONTROL, '}');
const LEFT_SQUARE_BRACKET: TexToken = new TexToken(TexTokenType.ELEMENT, '[');
const RIGHT_SQUARE_BRACKET: TexToken = new TexToken(TexTokenType.ELEMENT, ']');
function eat_whitespaces(tokens: TexToken[], start: number): TexToken[] {
let pos = start;
while (pos < tokens.length && [TexTokenType.SPACE, TexTokenType.NEWLINE].includes(tokens[pos].type)) {
pos++;
}
return tokens.slice(start, pos);
}
function eat_parenthesis(tokens: TexToken[], start: number): TexToken | null {
const firstToken = tokens[start];
if (firstToken.type === TexTokenType.ELEMENT && ['(', ')', '[', ']', '|', '\\{', '\\}', '.', '\\|', '<', '>'].includes(firstToken.value)) {
return firstToken;
} else if (firstToken.type === TexTokenType.COMMAND && ['lfloor', 'rfloor', 'lceil', 'rceil', 'langle', 'rangle', 'lparen', 'rparen', 'lbrace', 'rbrace'].includes(firstToken.value.slice(1))) {
return firstToken;
} else {
return null;
}
}
function eat_primes(tokens: TexToken[], start: number): number {
let pos = start;
while (pos < tokens.length && tokens[pos].eq(new TexToken(TexTokenType.ELEMENT, "'"))) {
pos += 1;
}
return pos - start;
}
function process_styled_parts(nodes: TexNode[]): TexNode[] {
let style_token: TexToken | null = null;
let bucket: TexNode[] = [];
let res: TexNode[] = [];
let i = 0;
while(true) {
if (i === nodes.length || nodes[i].head.eq(TexToken.COMMAND_DISPLAYSTYLE) || nodes[i].head.eq(TexToken.COMMAND_TEXTSTYLE)) {
if(bucket.length > 0) {
const g = (bucket.length === 1)? bucket[0]: new TexGroup(bucket);
res.push(style_token? new TexFuncCall(style_token, [g]): g);
}
if (i === nodes.length) {
break;
}
bucket = [];
style_token = nodes[i].head;
} else {
bucket.push(nodes[i]);
}
i++;
}
return res;
}
const LEFT_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\left');
const RIGHT_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\right');
const BEGIN_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\begin');
const END_COMMAND: TexToken = new TexToken(TexTokenType.COMMAND, '\\end');
const CONTROL_LINEBREAK = new TexToken(TexTokenType.CONTROL, '\\\\');
export class LatexParserError extends Error {
constructor(message: string) {
super(message);
this.name = 'LatexParserError';
}
static readonly UNMATCHED_LEFT_BRACE = new LatexParserError("Unmatched '\\{'");
static readonly UNMATCHED_RIGHT_BRACE = new LatexParserError("Unmatched '\\}'");
static readonly UNMATCHED_LEFT_BRACKET = new LatexParserError("Unmatched '\\['");
static readonly UNMATCHED_RIGHT_BRACKET = new LatexParserError("Unmatched '\\]'");
static readonly UNMATCHED_COMMAND_BEGIN = new LatexParserError("Unmatched '\\begin'");
static readonly UNMATCHED_COMMAND_END = new LatexParserError("Unmatched '\\end'");
static readonly UNMATCHED_COMMAND_LEFT = new LatexParserError("Unmatched '\\left'");
static readonly UNMATCHED_COMMAND_RIGHT = new LatexParserError("Unmatched '\\right'");
}
type ParseResult = [TexNode, number];
const SUB_SYMBOL:TexToken = new TexToken(TexTokenType.CONTROL, '_');
const SUP_SYMBOL:TexToken = new TexToken(TexTokenType.CONTROL, '^');
export class LatexParser {
public space_sensitive: boolean;
public newline_sensitive: boolean;
// how many levels of \begin{...} \end{...} are we currently in
public alignmentDepth: number = 0;
constructor(space_sensitive: boolean = false, newline_sensitive: boolean = true) {
this.space_sensitive = space_sensitive;
this.newline_sensitive = newline_sensitive;
}
parse(tokens: TexToken[]): TexNode {
return this.parseGroup(tokens.slice(0));
}
parseGroup(tokens: TexToken[]): TexNode {
const [tree, _] = this.parseClosure(tokens, 0, null);
return tree;
}
// return pos: (position of closingToken) + 1
// pos will be -1 if closingToken is not found
parseClosure(tokens: TexToken[], start: number, closingToken: TexToken | null): ParseResult {
const results: TexNode[] = [];
let pos = start;
while (pos < tokens.length) {
if (closingToken !== null && tokens[pos].eq(closingToken)) {
break;
}
const [res, newPos] = this.parseNextExpr(tokens, pos);
pos = newPos;
if(res.head.type === TexTokenType.SPACE || res.head.type === TexTokenType.NEWLINE) {
if (!this.space_sensitive && res.head.value.replace(/ /g, '').length === 0) {
continue;
}
if (!this.newline_sensitive && res.head.value === '\n') {
continue;
}
}
results.push(res);
}
if (pos >= tokens.length && closingToken !== null) {
return [EMPTY_NODE, -1];
}
const styledResults = process_styled_parts(results);
let node: TexNode;
if (styledResults.length === 1) {
node = styledResults[0];
} else {
node = new TexGroup(styledResults);
}
return [node, pos + 1];
}
parseNextExpr(tokens: TexToken[], start: number): ParseResult {
let [base, pos] = this.parseNextExprWithoutSupSub(tokens, start);
let sub: TexNode | null = null;
let sup: TexNode | null = null;
let num_prime = 0;
num_prime += eat_primes(tokens, pos);
pos += num_prime;
if (pos < tokens.length) {
const next_token = tokens[pos];
if (next_token.eq(SUB_SYMBOL)) {
[sub, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
const new_primes = eat_primes(tokens, pos);
num_prime += new_primes;
pos += new_primes;
if (pos < tokens.length && tokens[pos].eq(SUP_SYMBOL)) {
[sup, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
if (eat_primes(tokens, pos) > 0) {
throw new LatexParserError('Double superscript');
}
}
} else if (next_token.eq(SUP_SYMBOL)) {
[sup, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
if (eat_primes(tokens, pos) > 0) {
throw new LatexParserError('Double superscript');
}
if (pos < tokens.length && tokens[pos].eq(SUB_SYMBOL)) {
[sub, pos] = this.parseNextExprWithoutSupSub(tokens, pos + 1);
if (eat_primes(tokens, pos) > 0) {
throw new LatexParserError('Double superscript');
}
}
}
}
if (sub !== null || sup !== null || num_prime > 0) {
const res: TexSupsubData = { base, sup: null, sub: null };
if (sub) {
res.sub = sub;
}
if (num_prime > 0) {
const items: TexNode[] = [];
for (let i = 0; i < num_prime; i++) {
items.push(new TexToken(TexTokenType.ELEMENT, "'").toNode());
}
if (sup) {
items.push(sup);
}
res.sup = items.length === 1 ? items[0] : new TexGroup(items);
} else if (sup) {
res.sup = sup;
}
return [new TexSupSub(res), pos];
} else {
return [base, pos];
}
}
parseNextExprWithoutSupSub(tokens: TexToken[], start: number): ParseResult {
if (start >= tokens.length) {
throw new LatexParserError("Unexpected end of input");
}
const firstToken = tokens[start];
switch (firstToken.type) {
case TexTokenType.ELEMENT:
case TexTokenType.LITERAL:
case TexTokenType.COMMENT:
case TexTokenType.SPACE:
case TexTokenType.NEWLINE:
return [firstToken.toNode(), start + 1];
case TexTokenType.COMMAND:
const commandName = firstToken.value.slice(1);
if (IGNORED_COMMANDS.includes(commandName)) {
return this.parseNextExprWithoutSupSub(tokens, start + 1);
}
if (firstToken.eq(BEGIN_COMMAND)) {
return this.parseBeginEndExpr(tokens, start);
} else if(firstToken.eq(END_COMMAND)) {
throw LatexParserError.UNMATCHED_COMMAND_END;
} else if (firstToken.eq(LEFT_COMMAND)) {
return this.parseLeftRightExpr(tokens, start);
} else if (firstToken.eq(RIGHT_COMMAND)) {
throw LatexParserError.UNMATCHED_COMMAND_RIGHT;
} else {
return this.parseCommandExpr(tokens, start);
}
case TexTokenType.CONTROL:
const controlChar = firstToken.value;
switch (controlChar) {
case '{':
const [group, newPos] = this.parseClosure(tokens, start + 1, RIGHT_CURLY_BRACKET);
if (newPos === -1) {
throw LatexParserError.UNMATCHED_LEFT_BRACE;
}
return [group, newPos];
case '}':
throw LatexParserError.UNMATCHED_RIGHT_BRACE;
case '\\\\':
case '\\!':
case '\\,':
case '\\:':
case '\\;':
case '\\>':
return [firstToken.toNode(), start + 1];
case '\\ ':
case '~':
return [firstToken.toNode(), start + 1];
case '_':
case '^':
// e.g. "_1" or "^2" are valid LaTeX math expressions
return [ EMPTY_NODE, start];
case '&':
if (this.alignmentDepth <= 0) {
throw new LatexParserError('Unexpected & outside of an alignment');
}
return [firstToken.toNode(), start + 1];
default:
throw new LatexParserError('Unknown control sequence');
}
default:
throw new LatexParserError('Unknown token type');
}
}
parseCommandExpr(tokens: TexToken[], start: number): ParseResult {
assert(tokens[start].type === TexTokenType.COMMAND);
const command_token = tokens[start];
const command = command_token.value; // command name starts with a \
let pos = start + 1;
const paramNum = get_command_param_num(command.slice(1));
switch (paramNum) {
case 0:
return [command_token.toNode(), pos];
case 1: {
// TODO: JavaScript gives undefined instead of throwing an error when accessing an index out of bounds,
// so index checking like this should be everywhere. This is rough.
if(pos >= tokens.length) {
throw new LatexParserError('Expecting argument for ' + command);
}
if (command === '\\sqrt' && pos < tokens.length && tokens[pos].eq(LEFT_SQUARE_BRACKET)) {
const [exponent, newPos1] = this.parseClosure(tokens, pos + 1, RIGHT_SQUARE_BRACKET);
if (newPos1 === -1) {
throw LatexParserError.UNMATCHED_LEFT_BRACKET;
}
const [arg1, newPos2] = this.parseNextArg(tokens, newPos1);
return [new TexFuncCall(command_token, [arg1], exponent), newPos2];
} else if (command === '\\text') {
if (pos + 2 >= tokens.length) {
throw new LatexParserError('Expecting content for \\text command');
}
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === TexTokenType.LITERAL);
assert(tokens[pos + 2].eq(RIGHT_CURLY_BRACKET));
const literal = tokens[pos + 1];
return [new TexText(literal), pos + 3];
} else if (command === '\\displaylines') {
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
const [matrix, newPos] = this.parseAligned(tokens, pos + 1, RIGHT_CURLY_BRACKET);
if (newPos === -1) {
throw LatexParserError.UNMATCHED_LEFT_BRACE;
}
const group = new TexGroup(array_join(matrix, CONTROL_LINEBREAK.toNode()));
return [new TexFuncCall(command_token, [group]), newPos];
}
let [arg1, newPos] = this.parseNextArg(tokens, pos);
return [new TexFuncCall(command_token, [arg1]), newPos];
}
case 2: {
const [arg1, pos1] = this.parseNextArg(tokens, pos);
const [arg2, pos2] = this.parseNextArg(tokens, pos1);
return [new TexFuncCall(command_token, [arg1, arg2]), pos2];
}
default:
throw new Error('Invalid number of parameters');
}
}
/*
Extract a non-space argument from the token stream.
So that `\frac{12} 3` is parsed as
TypstFuncCall{ head: '\frac', args: [ELEMENT_12, ELEMENT_3] }
rather than
TypstFuncCall{ head: '\frac', args: [ELEMENT_12, SPACE] }, ELEMENT_3
*/
parseNextArg(tokens: TexToken[], start: number): ParseResult {
let pos = start;
let arg: TexNode | null = null;
while (pos < tokens.length) {
let node: TexNode;
[node, pos] = this.parseNextExprWithoutSupSub(tokens, pos);
if (!(node.head.type === TexTokenType.SPACE || node.head.type === TexTokenType.NEWLINE)) {
arg = node;
break;
}
}
if (arg === null) {
throw new LatexParserError('Expecting argument but token stream ended');
}
return [arg, pos];
}
parseLeftRightExpr(tokens: TexToken[], start: number): ParseResult {
assert(tokens[start].eq(LEFT_COMMAND));
let pos = start + 1;
pos += eat_whitespaces(tokens, pos).length;
if (pos >= tokens.length) {
throw new LatexParserError('Expecting a delimiter after \\left');
}
const leftDelimiter = eat_parenthesis(tokens, pos);
if (leftDelimiter === null) {
throw new LatexParserError('Invalid delimiter after \\left');
}
pos++;
const [body, idx] = this.parseClosure(tokens, pos, RIGHT_COMMAND);
if (idx === -1) {
throw LatexParserError.UNMATCHED_COMMAND_LEFT;
}
pos = idx;
pos += eat_whitespaces(tokens, pos).length;
if (pos >= tokens.length) {
throw new LatexParserError('Expecting a delimiter after \\right');
}
const rightDelimiter = eat_parenthesis(tokens, pos);
if (rightDelimiter === null) {
throw new LatexParserError('Invalid delimiter after \\right');
}
pos++;
const left = leftDelimiter.value === '.'? null: leftDelimiter;
const right = rightDelimiter.value === '.'? null: rightDelimiter;
const res = new TexLeftRight({body: body, left: left, right: right});
return [res, pos];
}
parseBeginEndExpr(tokens: TexToken[], start: number): ParseResult {
assert(tokens[start].eq(BEGIN_COMMAND));
let pos = start + 1;
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === TexTokenType.LITERAL);
assert(tokens[pos + 2].eq(RIGHT_CURLY_BRACKET));
const envName = tokens[pos + 1].value;
pos += 3;
let data: TexNode | null = null;
if(['array', 'subarray'].includes(envName)) {
pos += eat_whitespaces(tokens, pos).length;
[data, pos] = this.parseNextArg(tokens, pos);
}
const [body, endIdx] = this.parseAligned(tokens, pos, END_COMMAND);
if (endIdx === -1) {
throw LatexParserError.UNMATCHED_COMMAND_BEGIN;
}
pos = endIdx;
assert(tokens[pos].eq(LEFT_CURLY_BRACKET));
assert(tokens[pos + 1].type === TexTokenType.LITERAL);
assert(tokens[pos + 2].eq(RIGHT_CURLY_BRACKET));
if (tokens[pos + 1].value !== envName) {
throw new LatexParserError('\\begin and \\end environments mismatch');
}
pos += 3;
const res = new TexBeginEnd(new TexToken(TexTokenType.LITERAL, envName), body, data);
return [res, pos];
}
// return pos: (position of closingToken) + 1
// pos will be -1 if closingToken is not found
parseAligned(tokens: TexToken[], start: number, closingToken: TexToken): [TexNode[][], number] {
this.alignmentDepth++;
let pos = start;
// ignore whitespaces and '\n' after \begin{envName}
pos += eat_whitespaces(tokens, pos).length;
let closure: TexNode;
[closure, pos] = this.parseClosure(tokens, pos, closingToken);
if (pos === -1) {
return [[], -1];
}
let allRows: TexNode[][];
if (closure.type === 'ordgroup') {
const elements = (closure as TexGroup).items;
// ignore spaces and '\n' before \end{envName}
while(elements.length > 0 && [TexTokenType.SPACE, TexTokenType.NEWLINE].includes(elements[elements.length - 1].head.type)) {
elements.pop();
}
allRows = array_split(elements, new TexToken(TexTokenType.CONTROL, '\\\\').toNode())
.map(row => {
return array_split(row, new TexToken(TexTokenType.CONTROL, '&').toNode())
.map(arr => new TexGroup(arr));
});
} else {
allRows = [[closure]];
}
this.alignmentDepth--;
return [allRows, pos];
}
}
// Remove all whitespace before or after _ or ^
function passIgnoreWhitespaceBeforeScriptMark(tokens: TexToken[]): TexToken[] {
const is_script_mark = (token: TexToken) => token.eq(SUB_SYMBOL) || token.eq(SUP_SYMBOL);
let out_tokens: TexToken[] = [];
for (let i = 0; i < tokens.length; i++) {
if (tokens[i].type === TexTokenType.SPACE && i + 1 < tokens.length && is_script_mark(tokens[i + 1])) {
continue;
}
if (tokens[i].type === TexTokenType.SPACE && i - 1 >= 0 && is_script_mark(tokens[i - 1])) {
continue;
}
out_tokens.push(tokens[i]);
}
return out_tokens;
}
// expand custom tex macros
function passExpandCustomTexMacros(tokens: TexToken[], customTexMacros: {[key: string]: string}): TexToken[] {
let out_tokens: TexToken[] = [];
for (const token of tokens) {
if (token.type === TexTokenType.COMMAND && customTexMacros[token.value]) {
const expanded_tokens = tokenize_tex(customTexMacros[token.value]);
out_tokens = out_tokens.concat(expanded_tokens);
} else {
out_tokens.push(token);
}
}
return out_tokens;
}
export function parseTex(tex: string, customTexMacros: {[key: string]: string} = {}): TexNode {
const parser = new LatexParser();
let tokens = tokenize_tex(tex);
tokens = passIgnoreWhitespaceBeforeScriptMark(tokens);
tokens = passExpandCustomTexMacros(tokens, customTexMacros);
return parser.parse(tokens);
}