UNPKG

tex2typst

Version:

JavaScript library for converting TeX code to Typst

356 lines (307 loc) 11.6 kB
import { assert } from "./utils";

/**
 * Kinds of lexical tokens used to represent TeX source.
 *
 * ELEMENT: 0-9, a-z, A-Z, punctuations such as +-/*,:; etc.
 * COMMAND: LaTeX macro with no parameter. e.g. \sin \cos \int \sum
 * EMPTY: special type when something is empty. e.g. the base of _{a} or ^{a}
 * LITERAL: raw text; used in this file as the head of TexText and TexBeginEnd.
 * COMMENT: comment text; TexToken.toString() prefixes it with "%".
 * SPACE / NEWLINE: whitespace.
 * CONTROL: used in this file for `_`, `^`, `&` and the row separator `\\`.
 * UNKNOWN: not serializable; TexTerminal.serialize() throws for it.
 */
export enum TexTokenType {
    EMPTY,
    ELEMENT,
    COMMAND,
    LITERAL,
    COMMENT,
    SPACE,
    NEWLINE,
    CONTROL,
    UNKNOWN,
}

/** A single TeX token: a type tag plus its textual value. */
export class TexToken {
    readonly type: TexTokenType;
    value: string;

    constructor(type: TexTokenType, value: string) {
        this.type = type;
        this.value = value;
    }

    /** Returns true when both the type and the value of the two tokens match. */
    public eq(token: TexToken): boolean {
        return this.type === token.type && this.value === token.value;
    }

    /** Returns the token text; comments get their leading "%" restored. */
    public toString(): string {
        switch (this.type) {
            case TexTokenType.COMMENT:
                return "%" + this.value;
            default:
                return this.value;
        }
    }

    /** Wraps this token in a terminal node. */
    public toNode(): TexNode {
        return new TexTerminal(this);
    }

    public static readonly EMPTY = new TexToken(TexTokenType.EMPTY, '');
    public static readonly COMMAND_DISPLAYSTYLE = new TexToken(TexTokenType.COMMAND, '\\displaystyle');
    public static readonly COMMAND_TEXTSTYLE = new TexToken(TexTokenType.COMMAND, '\\textstyle');
}

/** Constructor payload of TexSupSub. */
export interface TexSupsubData {
    base: TexNode;
    sup: TexNode | null;
    sub: TexNode | null;
}

/**
 * Constructor payload of TexLeftRight.
 * \left. or \right. will be represented as null.
 */
export interface TexLeftRightData {
    body: TexNode;
    left: TexToken | null;
    right: TexToken | null;
}

/**
 * funcCall: LaTeX macro with 1 or more parameters. e.g. \sqrt{3} \log{x} \exp{x} \frac{1}{2}
 * text: text enclosed by braces. e.g. \text{hello world}
 */
type TexNodeType = 'terminal' | 'text' | 'ordgroup' | 'supsub' | 'funcCall' | 'leftright' | 'beginend';

/** Base class of every node in the TeX tree. */
export abstract class TexNode {
    readonly type: TexNodeType;
    head: TexToken;

    /**
     * @param type node kind tag
     * @param head token identifying the node; null is replaced by TexToken.EMPTY
     */
    constructor(type: TexNodeType, head: TexToken | null) {
        this.type = type;
        this.head = head ?? TexToken.EMPTY;
    }

    /**
     * Compares node type and head token only.
     * Note that this is only shallow equality: children are not compared.
     */
    public eq(other: TexNode): boolean {
        return this.type === other.type && this.head.eq(other.head);
    }

    /** Flattens this node (and its children) into a token sequence. */
    abstract serialize(): TexToken[];

    /**
     * Renders the node as TeX source by folding its serialized tokens through
     * writeTexTokenBuffer.
     *
     * Note: toString() is expensive. Do not use it on performance-critical code path.
     */
    public toString(): string {
        return this.serialize().reduce(writeTexTokenBuffer, '');
    }
}

/** Leaf node holding exactly one token. */
export class TexTerminal extends TexNode {
    constructor(head: TexToken) {
        super('terminal', head);
    }

    /**
     * @returns no tokens for EMPTY; the head itself for ELEMENT, COMMAND, LITERAL,
     *          COMMENT and CONTROL; one token per character for SPACE and NEWLINE.
     * @throws Error for any other token type (e.g. UNKNOWN).
     */
    public serialize(): TexToken[] {
        switch (this.head.type) {
            case TexTokenType.EMPTY:
                return [];
            case TexTokenType.ELEMENT:
            case TexTokenType.COMMAND:
            case TexTokenType.LITERAL:
            case TexTokenType.COMMENT:
            case TexTokenType.CONTROL:
                return [this.head];
            case TexTokenType.SPACE:
            case TexTokenType.NEWLINE:
                // A whitespace run is split per character: ' ' becomes a SPACE token,
                // every other character becomes a NEWLINE token.
                return Array.from(
                    this.head.value,
                    (c) => new TexToken(c === ' ' ? TexTokenType.SPACE : TexTokenType.NEWLINE, c),
                );
            default:
                throw new Error(`Unknown terminal token type: ${this.head.type}`);
        }
    }
}

/** Text node, serialized as \text{...}. The head must be a LITERAL token. */
export class TexText extends TexNode {
    constructor(head: TexToken) {
        assert(head.type === TexTokenType.LITERAL);
        super('text', head);
    }

    public serialize(): TexToken[] {
        return [
            new TexToken(TexTokenType.COMMAND, '\\text'),
            new TexToken(TexTokenType.ELEMENT, '{'),
            this.head,
            new TexToken(TexTokenType.ELEMENT, '}'),
        ];
    }
}

/** Ordered sequence of nodes. Its own serialization emits no surrounding braces. */
export class TexGroup extends TexNode {
    public items: TexNode[];

    constructor(items: TexNode[]) {
        super('ordgroup', TexToken.EMPTY);
        this.items = items;
    }

    /** Concatenates the serialization of every item, in order. */
    public serialize(): TexToken[] {
        return this.items.flatMap((item) => item.serialize());
    }
}

/** A base with an optional subscript and/or superscript. */
export class TexSupSub extends TexNode {
    public base: TexNode;
    public sup: TexNode | null;
    public sub: TexNode | null;

    constructor(data: TexSupsubData) {
        super('supsub', TexToken.EMPTY);
        this.base = data.base;
        this.sup = data.sup;
        this.sub = data.sub;
    }

    /** Emits the base, then `_sub` (if any), then `^sup` (if any). */
    public serialize(): TexToken[] {
        // TODO: should return true for more cases e.g. a_{\theta} instead of a_\theta
        const shouldWrapInBraces = (node: TexNode): boolean => {
            if (node.type === 'ordgroup' || node.type === 'supsub' || node.head.type === TexTokenType.EMPTY) {
                return true;
            }
            // a number with more than 1 digit as a subscript/superscript should be wrapped in braces
            return node.head.type === TexTokenType.ELEMENT
                && /\d+(\.\d+)?/.test(node.head.value)
                && node.head.value.length > 1;
        };

        // Emits the `_` / `^` marker followed by the script, braced when required.
        const serializeScript = (marker: string, script: TexNode): TexToken[] => {
            const out: TexToken[] = [new TexToken(TexTokenType.CONTROL, marker)];
            if (!shouldWrapInBraces(script)) {
                return out.concat(script.serialize());
            }
            out.push(new TexToken(TexTokenType.ELEMENT, '{'));
            return out.concat(script.serialize(), [new TexToken(TexTokenType.ELEMENT, '}')]);
        };

        let tokens: TexToken[] = [];
        tokens = tokens.concat(this.base.serialize());
        if (this.sub) {
            tokens = tokens.concat(serializeScript('_', this.sub));
        }
        if (this.sup) {
            tokens = tokens.concat(serializeScript('^', this.sup));
        }
        return tokens;
    }
}

/** A macro applied to one or more brace-delimited arguments. */
export class TexFuncCall extends TexNode {
    public args: TexNode[];

    // For \sqrt, it's the additional argument wrapped in square brackets. e.g. 3 in \sqrt[3]{x}
    public data: TexNode | null;

    constructor(head: TexToken, args: TexNode[], data: TexNode | null = null) {
        super('funcCall', head);
        this.args = args;
        this.data = data;
    }

    /** Emits the head, an optional `[data]` (only for \sqrt), then every argument in braces. */
    public serialize(): TexToken[] {
        let tokens: TexToken[] = [this.head];

        // special hook for \sqrt
        if (this.head.value === '\\sqrt' && this.data) {
            tokens.push(new TexToken(TexTokenType.ELEMENT, '['));
            tokens = tokens.concat(this.data.serialize());
            tokens.push(new TexToken(TexTokenType.ELEMENT, ']'));
        }

        for (const arg of this.args) {
            tokens.push(new TexToken(TexTokenType.ELEMENT, '{'));
            tokens = tokens.concat(arg.serialize());
            tokens.push(new TexToken(TexTokenType.ELEMENT, '}'));
        }

        return tokens;
    }
}

/** A \left ... \right pair around a body. A null delimiter is written as `.`. */
export class TexLeftRight extends TexNode {
    public body: TexNode;
    public left: TexToken | null;
    public right: TexToken | null;

    constructor(data: TexLeftRightData) {
        super('leftright', TexToken.EMPTY);
        this.body = data.body;
        this.left = data.left;
        this.right = data.right;
    }

    public serialize(): TexToken[] {
        let tokens: TexToken[] = [];
        tokens.push(new TexToken(TexTokenType.COMMAND, '\\left'));
        tokens.push(new TexToken(TexTokenType.ELEMENT, this.left ? this.left.value : '.'));
        tokens = tokens.concat(this.body.serialize());
        tokens.push(new TexToken(TexTokenType.COMMAND, '\\right'));
        tokens.push(new TexToken(TexTokenType.ELEMENT, this.right ? this.right.value : '.'));
        return tokens;
    }
}

/** A \begin{env} ... \end{env} environment whose body is a matrix of cells. */
export class TexBeginEnd extends TexNode {
    public matrix: TexNode[][];

    // for environment="array" or "subarray", there's additional data like {c|c} right after \begin{env}
    public data: TexNode | null;

    /**
     * @param head LITERAL token carrying the environment name
     * @param matrix rows of cells
     * @param data optional extra argument that follows \begin{env}
     */
    constructor(head: TexToken, matrix: TexNode[][], data: TexNode | null = null) {
        assert(head.type === TexTokenType.LITERAL);
        super('beginend', head);
        this.matrix = matrix;
        this.data = data;
    }

    /**
     * Emits `\begin{env}`, the optional `{data}` argument, a newline, the cells
     * separated by `&` with rows separated by `\\`, a newline and `\end{env}`.
     */
    public serialize(): TexToken[] {
        let tokens: TexToken[] = [];

        tokens.push(new TexToken(TexTokenType.COMMAND, '\\begin'));
        tokens.push(new TexToken(TexTokenType.ELEMENT, '{'));
        tokens.push(this.head);
        tokens.push(new TexToken(TexTokenType.ELEMENT, '}'));

        // The extra argument (e.g. the column specification of an array) has to be
        // written back, otherwise it is lost on serialization. Braces are added here
        // because serializing a group or a terminal does not emit any.
        if (this.data) {
            tokens.push(new TexToken(TexTokenType.ELEMENT, '{'));
            tokens = tokens.concat(this.data.serialize());
            tokens.push(new TexToken(TexTokenType.ELEMENT, '}'));
        }

        tokens.push(new TexToken(TexTokenType.NEWLINE, '\n'));

        const lastRow = this.matrix.length - 1;
        this.matrix.forEach((row, i) => {
            const lastCell = row.length - 1;
            row.forEach((cell, j) => {
                tokens = tokens.concat(cell.serialize());
                if (j !== lastCell) {
                    tokens.push(new TexToken(TexTokenType.CONTROL, '&'));
                }
            });
            if (i !== lastRow) {
                tokens.push(new TexToken(TexTokenType.CONTROL, '\\\\'));
            }
        });

        tokens.push(new TexToken(TexTokenType.NEWLINE, '\n'));
        tokens.push(new TexToken(TexTokenType.COMMAND, '\\end'));
        tokens.push(new TexToken(TexTokenType.ELEMENT, '{'));
        tokens.push(this.head);
        tokens.push(new TexToken(TexTokenType.ELEMENT, '}'));
        return tokens;
    }
}

/**
 * Appends one token to a TeX output buffer, inserting a single separating space
 * unless one of the rules below says the token can be attached directly.
 *
 * @param buffer text produced so far
 * @param token token to append
 * @returns the buffer with the token (and possibly a separating space) appended
 */
export function writeTexTokenBuffer(buffer: string, token: TexToken): string {
    const str = token.toString();

    let no_need_space = false;
    if (token.type === TexTokenType.SPACE) {
        no_need_space = true;
    } else {
        // putting the first token in clause
        no_need_space ||= /[{\(\[\|]$/.test(buffer);
        // opening a optional [] parameter for a command
        no_need_space ||= /\\\w+$/.test(buffer) && str === '[';
        // putting a punctuation
        no_need_space ||= /^[\.,;:!\?\(\)\]{}_^]$/.test(str);
        no_need_space ||= ['\\{', '\\}'].includes(str);
        // putting a prime
        no_need_space ||= str === "'";
        // putting a subscript or superscript
        no_need_space ||= buffer.endsWith('_') || buffer.endsWith('^');
        // buffer ends with a whitespace
        no_need_space ||= /\s$/.test(buffer);
        // token starts with a space
        no_need_space ||= /^\s/.test(str);
        // buffer is empty
        no_need_space ||= buffer === '';
        // leading sign. e.g. produce "+1" instead of " +1"
        no_need_space ||= /[\(\[{]\s*(-|\+)$/.test(buffer) || buffer === '-' || buffer === '+';
        // "&=" instead of "& ="
        no_need_space ||= buffer.endsWith('&') && str === '=';
        // "2y" instead of "2 y"
        no_need_space ||= /\d$/.test(buffer) && /^[a-zA-Z]$/.test(str);
    }

    if (!no_need_space) {
        buffer += ' ';
    }

    return buffer + str;
}