tipa2unicode
Version:
Convert LaTeX/TIPA to Unicode IPA
638 lines (603 loc) • 18.4 kB
JavaScript
const { latexParser } = require('latex-utensils');
// Two-level lookup for the one-character modifier commands.
// The outer key is the command name; the inner key is the character that
// follows the command, and the value is the replacement string.
const modifierLUT = {
  // ɖ ɭ ɳ ɽ ɻ ʂ ʈ ʐ
  ':': { d: '\u0256', l: '\u026d', n: '\u0273', r: '\u027d', R: '\u027b', s: '\u0282', t: '\u0288', z: '\u0290' },
  // ᴀ ʙ ɢ ʜ ʟ ɴ ʀ
  ';': { A: '\u1d00', B: '\u0299', G: '\u0262', H: '\u029c', L: '\u029f', N: '\u0274', R: '\u0280' },
  // ɓ ɗ ɠ ʛ ʄ ʘ
  '!': { b: '\u0253', d: '\u0257', g: '\u0260', G: '\u029b', j: '\u0284', o: '\u0298' },
  // ʞ ɹ ʇ ʍ ɟ ɲ ħ ɬ ɮ
  '*': {
    k: '\u029e', r: '\u0279', t: '\u0287', w: '\u028d', j: '\u025f',
    n: '\u0272', h: '\u0127', l: '\u026c', z: '\u026e',
  },
};
// Maps a single tone digit ('1'..'5') to its replacement character
// (U+A716 down to U+A712; a larger digit maps to a smaller code point).
const toneLUT = {
  '1': '\ua716',
  '2': '\ua715',
  '3': '\ua714',
  '4': '\ua713',
  '5': '\ua712',
};
// Maps a base character to its superscript (modifier letter) replacement.
const superLUT = {
  'h': '\u02b0', // ʰ
  'w': '\u02b7', // ʷ
  'j': '\u02b2', // ʲ
  'G': '\u02e0', // ˠ
  'Q': '\u02e4', // ˤ
  'n': '\u207f', // ⁿ
  'l': '\u02e1', // ˡ
  '4': '\u1da3', // ᶣ
  'J': '\u1da8', // ᶨ
  'V': '\u1db9', // ᶹ
};
// Maps a single input character to its output string.
// The double quote is intentionally absent: it is handled statefully by
// TipaTranspiler.processChar (stress marks).
//
// NOTE(review): this literal used to contain duplicate keys. Entries mapping
// '!' -> '\u01c3', '\'' -> '\u02bc' and '.' -> '.' appeared before the
// punctuation rows and were silently overridden by them (the last duplicate
// key wins in an object literal). The dead entries were removed so the table
// shows the values that are actually in effect; confirm that plain '!' and
// '\'' are the intended outputs.
const basicCharLUT = {
  // Table 2.2: TIPA shortcut characters
  ':': '\u02d0', ';': '\u02d1', /* '"': '\u02c8', */
  '0': '\u0289', '1': '\u0268', '2': '\u028c', '3': '\u025c', '4': '\u0265', '5': '\u0250', '6': '\u0252', '7': '\u0264', '8': '\u0275', '9': '\u0258',
  '@': '\u0259', 'A': '\u0251', 'B': '\u03b2', 'C': '\u0255', 'D': '\u00f0', 'E': '\u025b', 'F': '\u0278', 'G': '\u0263', 'H': '\u0266', 'I': '\u026a',
  'J': '\u029d', 'K': '\u0281', 'L': '\u028e', 'M': '\u0271', 'N': '\u014b', 'O': '\u0254', 'P': '\u0294', 'Q': '\u0295', 'R': '\u027e', 'S': '\u0283',
  'T': '\u03b8', 'U': '\u028a', 'V': '\u028b', 'W': '\u026f', 'X': '\u03c7', 'Y': '\u028f', 'Z': '\u0292', '|': '\u01c0',
  // Vowels and Consonants
  'a': 'a', 'b': 'b', 'c': 'c', 'd': 'd', 'e': 'e', 'f': 'f', 'g': '\u0261', 'h': 'h', 'i': 'i',
  'j': 'j', 'k': 'k', 'l': 'l', 'm': 'm', 'n': 'n', 'o': 'o', 'p': 'p', 'q': 'q', 'r': 'r',
  's': 's', 't': 't', 'u': 'u', 'v': 'v', 'w': 'w', 'x': 'x', 'y': 'y', 'z': 'z',
  // Punctuation marks
  '!': '!', '\'': '\'', '(': '(', ')': ')', '*': '*', '+': '+', ',': ',', '-': '-', '.': '.',
  '/': '/', '=': '=', '?': '?', '[': '[', ']': ']', '`': '`',
};
// Maps an argument-less command name (without the leading backslash) to its
// output string. Rows are grouped by base letter.
const basicCommandLUT = {
  // Vowels and Consonants
  // a
  textturna: '\u0250', textscripta: '\u0251', textturnscripta: '\u0252',
  ae: '\u00e6', textsca: '\u1d00', textturnv: '\u028c',
  // b
  texthtb: '\u0253', textscb: '\u0299', textbeta: '\u03b2',
  // c
  textctc: '\u0255',
  // d
  texthtd: '\u0257', textrtaild: '\u0256', texthtrtaild: '\u1d91',
  textctd: '\u0221', dh: '\u00f0',
  // e
  textschwa: '\u0259', textrhookschwa: '\u025a', textreve: '\u0258',
  textepsilon: '\u025b', textrevepsilon: '\u025c',
  textrhookrevepsilon: '\u025d', textcloserevepsilon: '\u025e',
  // g
  textscriptg: '\u0261', texthtg: '\u0260', textscg: '\u0262',
  texthtscg: '\u029b', textgamma: '\u0263', textramshorns: '\u0264',
  // h
  textcrh: '\u0127', texthth: '\u0266', texththeng: '\u0267',
  textturnh: '\u0265', textsch: '\u029c',
  // i
  textbari: '\u0268', textsci: '\u026a',
  // j
  textctj: '\u029d', textbardotlessj: '\u025f', textObardotlessj: '\u025f',
  texthtbardotlessj: '\u0284', texthtbardotlessjvar: '\u0284',
  // k
  textturnk: '\u029e',
  // l
  textltilde: '\u026b', textbeltl: '\u026c', textrtaill: '\u026d',
  textlyoghlig: '\u026e', textOlyoghlig: '\u026e', textscl: '\u029f',
  // m
  textltailm: '\u0271', textturnm: '\u026f', textturnmrleg: '\u0270',
  // n
  textltailn: '\u0272', ng: '\u014b', textrtailn: '\u0273',
  textctn: '\u0235', textscn: '\u0274',
  // o
  textbullseye: '\u0298', textObullseye: '\u0298', textbaro: '\u0275',
  o: '\u00f8', oe: '\u0153', textscoelig: '\u0276',
  OE: '\u0276', textopeno: '\u0254',
  // p
  textphi: '\u0278',
  // r
  textfishhookr: '\u027e', textrtailr: '\u027d', textturnr: '\u0279',
  textturnrrtail: '\u027b', textturnlonglegr: '\u027a',
  textscr: '\u0280', textinvscr: '\u0281',
  // s
  textrtails: '\u0282', textesh: '\u0283',
  // t
  textrtailt: '\u0288', textturnt: '\u0287', textctt: '\u0236', texttheta: '\u03b8',
  // u
  textbaru: '\u0289', textupsilon: '\u028a',
  // v
  textscriptv: '\u028b',
  // w
  textturnw: '\u028d',
  // x
  textchi: '\u03c7',
  // y
  textturny: '\u028e', textscy: '\u028f',
  // z
  textctz: '\u0291', textrtailz: '\u0290', textyogh: '\u0292',
  // glottal stops
  textglotstop: '\u0294', textbarglotstop: '\u02a1',
  textrevglotstop: '\u0295', textbarrevglotstop: '\u02a2',
  // pipes
  textpipe: '\u01c0', textpipevar: '\u01c0',
  textdoublebarpipe: '\u01c2', textdoublebarpipevar: '\u01c2',
  textdoublepipe: '\u01c1', textdoublepipevar: '\u01c1',
  // Suprasegmentals
  textprimstress: '\u02c8', textsecstress: '\u02cc',
  textlengthmark: '\u02d0', texthalflength: '\u02d1',
  't*': '\u203f', textbottomtiebar: '\u203f',
  // Accents and Diacritics
  textcorner: '\u02fa', // \u031a
  textopencorner: '\u02f9',
  textrhoticity: '\u02de',
};
/**
 * Assert that command `nm` was invoked without any real argument.
 *
 * An empty `args` list passes, and so does a list holding exactly one
 * `arg.group` node whose `content` is empty.
 *
 * @param {string} nm - command name, used only in the error message.
 * @param {Array} args - argument nodes attached to the command.
 * @throws {Error} when any other argument shape is supplied.
 */
const checkArg0 = (nm, args) => {
  if (args.length === 0) {
    return;
  }
  const isLoneEmptyGroup =
    args.length === 1 &&
    args[0].kind === 'arg.group' &&
    args[0].content.length === 0;
  if (isLoneEmptyGroup) {
    return;
  }
  throw new Error(`Command ${nm} should not have any argument, but got ${args.length}`);
};
/**
 * Assert that command `nm` was invoked with exactly one `arg.group`
 * argument and hand back that group's `content`.
 *
 * @param {string} nm - command name, used only in the error message.
 * @param {Array} args - argument nodes attached to the command.
 * @returns {*} the `content` of the single group argument.
 * @throws {Error} when the argument list has any other shape.
 */
const checkArg1 = (nm, args) => {
  const isSingleGroup = args.length === 1 && args[0].kind === 'arg.group';
  if (!isSingleGroup) {
    throw new Error(`Unexpected calling format of command ${nm}`);
  }
  const [group] = [args[0]];
  return group.content;
};
/**
 * Like `checkArg1`, but additionally require the group to contain exactly
 * one `text.string` node, and return that node's `content`.
 *
 * @param {string} nm - command name, used only in the error message.
 * @param {Array} args - argument nodes attached to the command.
 * @returns {*} the `content` of the single `text.string` node.
 * @throws {Error} when the argument does not have that shape.
 */
const checkArg1s = (nm, args) => {
  const nodes = checkArg1(nm, args);
  const isSingleString = nodes.length === 1 && nodes[0].kind === 'text.string';
  if (!isSingleString) {
    throw new Error(`Unexpected calling format of command ${nm}`);
  }
  return nodes[0].content;
};
/**
 * Build a short label for an AST node, used in debug traces.
 *
 * @param {{kind: string, location?: {start: {line: number, column: number}}}} ast
 * @returns {string} `kind/L<line>C<column>` when the node has a location,
 *   otherwise just the node kind.
 */
const formatAST = /* istanbul ignore next */ (ast) => {
  const where = ast.location;
  if (!where) {
    return ast.kind;
  }
  const { line, column } = where.start;
  return `${ast.kind}/L${line}C${column}`;
};
/**
 * Flatten and join a (possibly nested) list of strings.
 *
 * - Nested arrays are joined recursively with the same delimiter.
 * - `undefined` items and empty pieces are skipped.
 * - A piece starting with U+200D is glued to the text so far without the
 *   delimiter, and the U+200D itself is dropped.
 *
 * @param {Array} arr - strings, nested arrays of the same, or `undefined`.
 * @param {string} [str=''] - delimiter placed between non-empty pieces.
 * @returns {string} the joined text.
 * @throws {Error} when an item is neither an array, a string nor `undefined`.
 */
const fancyjoin = (arr, str = '') => {
  let joined = '';
  for (const item of arr) {
    if (item === undefined) {
      continue;
    }
    let piece;
    if (Array.isArray(item)) {
      piece = fancyjoin(item, str);
    } else if (typeof item === 'string') {
      piece = item;
    } else /* istanbul ignore next */ {
      throw new Error(`Internal error in fancyjoin`);
    }
    if (!piece) {
      continue;
    }
    if (piece.startsWith('\u200d')) {
      // No-delimiter marker: attach directly to what has been built so far.
      joined += piece.slice(1);
    } else if (joined) {
      joined += str;
      joined += piece;
    } else {
      joined = piece;
    }
  }
  return joined;
};
/**
 * Stateful walker that turns a parsed LaTeX/TIPA AST into strings (or nested
 * arrays of strings, to be flattened by `fancyjoin`).
 *
 * Accent commands written without braces do not know their base character
 * yet, so they park a "pending" value in `this.lut` / `this.isSuffix`; the
 * next processed item (char, command, group, break or end of group) picks it
 * up and emits it before (prefix) or after (suffix) its own output.
 */
class TipaTranspiler {
  /**
   * @param {*} debug - when truthy, `./debug.js` is loaded and used to format
   *   values in trace output written to stderr.
   */
  constructor(debug) {
    // Pending state:
    //   undefined: nothing pending
    //   object:    table to be looked up by the following char
    //              (the '' key, when present, is the default entry)
    //   string:    the suffix to be added (when this.isSuffix === true)
    //   string:    the prefix to be added (when this.isSuffix === false)
    this.lut = undefined;
    this.isSuffix = false;
    if (debug) /* istanbul ignore next */ {
      this.debug = require('./debug.js');
    }
  }

  /**
   * Process every node of a group body and flush whatever is still pending
   * once the group ends.
   *
   * @param {Array} ar - child AST nodes.
   * @param {string} [d=''] - delimiter handed to `fancyjoin`.
   * @returns {string} the joined output of the group.
   * @throws {Error} when a pending table has no default ('') entry.
   */
  endOfGroup(ar, d = '') {
    const arr = ar.map((v) => this.take(v));
    switch (typeof this.lut) {
      case 'undefined':
        break;
      case 'object':
        if (this.lut['']) this.lut = this.lut[''];
        else throw new Error('EndOfGroup is not allowed here, as a char is required by LUT');
        // fallthrough
      case 'string':
        arr.push(this.lut);
        this.isSuffix = false;
        this.lut = undefined;
        break;
      /* istanbul ignore next */
      default:
        throw new Error('Internal error', this.lut);
    }
    return fancyjoin(arr, d);
  }

  /**
   * Translate one command node.
   *
   * @param {string} nm - command name without the backslash.
   * @param {Array} args - argument nodes of the command.
   * @returns {string|Array} output text; `relax` and `tone` return arrays.
   * @throws {Error} for unknown commands, malformed arguments, or when a
   *   pending table requires a character instead of a command.
   */
  processCommand(nm, args) {
    if (nm === 'relax') {
      return args.map((v) => this.take(v));
    }
    if (this.debug) /* istanbul ignore next */ {
      console.error(`processCommand(${nm}, args[${args.length}]) isSuffix = ${this.isSuffix} lut =`, this.debug(this.lut));
    }
    // Resolve the pending state into a prefix or suffix for this command.
    let prefix = '', suffix = '';
    switch (typeof this.lut) {
      case 'undefined':
        break;
      case 'object':
        if (this.lut['']) this.lut = this.lut[''];
        else throw new Error('Command is not allowed here, as a char is required by LUT');
        // fallthrough
      case 'string':
        if (!this.isSuffix) {
          prefix = this.lut;
        } else {
          suffix = this.lut;
        }
        this.isSuffix = false;
        this.lut = undefined;
        break;
      /* istanbul ignore next */
      default:
        throw new Error('Internal error', this.lut);
    }
    if (basicCommandLUT[nm]) {
      checkArg0(nm, args);
      if (this.debug) /* istanbul ignore next */ {
        console.error(`processCommand(${nm}, args[${args.length}]) basic:`, this.debug(prefix), this.debug(basicCommandLUT[nm]), this.debug(suffix));
      }
      return prefix + basicCommandLUT[nm] + suffix;
    }
    // s(lut): park an accent table as a pending suffix; an already pending
    // suffix is appended to every entry so it is not lost.
    const s = (lut) => {
      this.isSuffix = true;
      if (suffix)
        this.lut = Object.fromEntries(Object.entries(lut).map(([k, v]) => [k, v + suffix]));
      else
        this.lut = lut;
      return prefix;
    };
    if (modifierLUT[nm]) {
      checkArg0(nm, args);
      // Modifier tables replace the following char, hence isSuffix = false.
      this.isSuffix = false;
      this.lut = Object.fromEntries(Object.entries(modifierLUT[nm]).map(([k, v]) => [k, v + suffix]));
      return prefix;
    }
    // f(sf, d): process the single braced argument, then append the
    // command's own accent `sf` followed by any pending suffix.
    const f = (sf = '', d = '') => {
      const rst = this.endOfGroup(checkArg1(nm, args), d);
      if (this.debug) /* istanbul ignore next */ {
        console.error(`processCommand(${nm}, args[${args.length}]) f:`, this.debug(prefix), this.debug(rst), this.debug(sf), this.debug(suffix));
      }
      return prefix + rst + sf + suffix;
    };
    switch (nm) {
      case 'tone':
        // Pending prefix/suffix travel along with the tone letters instead of
        // being dropped. Empty strings and unknown digits (undefined) are
        // skipped later by fancyjoin.
        return [prefix, ...[...checkArg1s(nm, args)].map((v) => toneLUT[v]), suffix];
      case 'super':
      case 'textsuperscript':
        // The leading U+200D tells fancyjoin to attach this piece without a
        // delimiter. join('') is required: concatenating the array directly
        // would insert commas between the superscript letters.
        return '\u200d' + prefix + [...checkArg1s(nm, args)].map((v) => superLUT[v]).join('') + suffix;
      case 't':
      case 'texttoptiebar':
        return f('', '\u0361');
      // Table 3.3: Examples of the accent prefix \|
      case '|':
        checkArg0(nm, args);
        return s({
          '[': '\u032a',
          ']': '\u033a',
          '(': '\u031c',
          ')': '\u0339',
          'c': '\u0311',
          '+': '\u031f',
          '\'': '\u02d4',
          '`': '\u031e',
          '<': '\u0318',
          '>': '\u0319',
          'x': '\u033d',
          'w': '\u032b',
          'm': '\u033c',
          '~': '\u0334',
        });
      case 'textsubbridge':
        return f('\u032a');
      case 'textinvsubbridge':
        return f('\u033a');
      case 'textsubrhalfring':
        return f('\u0339');
      case 'textsublhalfring':
        return f('\u031c');
      case 'textroundcap':
        return f('\u0311');
      case 'textsubplus':
        return f('\u031f');
      case 'textraising':
        return f('\u02d4');
      case 'textlowering':
        return f('\u031e');
      case 'textadvancing':
        return f('\u0318');
      case 'textretracting':
        return f('\u0319');
      case 'textovercross':
        return f('\u033d');
      case 'textsubw':
        return f('\u032b');
      case 'textseagull':
        return f('\u033c');
      case 'textsuperimposetilde':
        return f('\u0334');
      case '~':
        if (args.length) {
          return f('\u0303');
        }
        return s({
          '': '\u0303',
          '.': '\u0307\u0303',
          '*': '\u0330',
        });
      case 'texttildedot':
        return f('\u0307\u0303');
      case 'textsubtilde':
        return f('\u0330');
      case '"':
        if (args.length) {
          return f('\u0308');
        }
        return s({
          '': '\u0308',
          '*': '\u0324',
        });
      case 'textsubumlaut':
        return f('\u0324');
      case '^':
        if (args.length) {
          return f('\u0302');
        }
        return s({
          '': '\u0302',
          '*': '\u032d',
          '.': '\u0307\u0302',
        });
      case 'textsubcircum':
        return f('\u032d');
      case 'textcircumdot':
        // Accent goes before any pending suffix, like every other f() case.
        return f('\u0307\u0302');
      case '\'':
        if (args.length) {
          return f('\u0301');
        }
        return s({
          '': '\u0301',
          '*': '\u02cf', // \u0317
          '=': '\u0304\u0301',
          '.': '\u0307\u0301',
        });
      case 'textsubacute':
        return f('\u02cf'); // \u0317
      case 'textacutemacron':
        return f('\u0304\u0301');
      case 'textdotacute':
        return f('\u0307\u0301');
      case '`':
        if (args.length) {
          return f('\u0300');
        }
        return s({
          '': '\u0300',
          '*': '\u02ce', // \u0316
          '.': '\u0300\u0307',
        });
      case 'textsubgrave':
        return f('\u02ce'); // \u0316
      case 'textgravemacron':
        return f('\u0304\u0300');
      case 'textgravedot':
        return f('\u0300\u0307');
      case 'H':
        return f('\u02dd');
      case 'H*':
        if (args.length) {
          return f('\u030f');
        }
        return s({
          '': '\u030f',
        });
      case 'textdoublegrave':
        return f('\u030f');
      case 'r':
        if (args.length) {
          return f('\u030a');
        }
        return s({
          '': '\u030a',
          '=': '\u0304\u030a',
        });
      case 'textringmacron':
        return f('\u0304\u030a');
      case 'r*':
        if (args.length) {
          return f('\u0325');
        }
        return s({
          '': '\u0325',
        });
      case 'textsubring':
        return f('\u0325');
      case 'v':
        if (args.length) {
          return f('\u030c');
        }
        return s({
          '': '\u030c',
          '\'': '\u030c\u0301',
        });
      case 'textacutewedge':
        return f('\u030c\u0301');
      case 'v*':
        if (args.length) {
          return f('\u032c');
        }
        return s({
          '': '\u032c',
        });
      case 'textsubwedge':
        return f('\u032c');
      case 'u':
        if (args.length) {
          return f('\u0306');
        }
        return s({
          '': '\u0306',
          '=': '\u0304\u0306',
        });
      case 'textbrevemacron':
        return f('\u0304\u0306');
      case '=':
        if (args.length) {
          return f('\u02c9');
        }
        return s({
          '': '\u02c9',
          '*': '\u0320',
        });
      case 'textsubbar':
        return f('\u0320');
      case '.':
        if (args.length) {
          return f('\u02d9');
        }
        return s({
          '': '\u02d9',
          '*': '\u0323',
          '\'': '\u0307\u0301',
        });
      case 'textsubdot':
        return f('\u0323');
      case 'c':
        return f('\u0327');
      case 'k':
      case 'textpolhook':
        return f('\u02db');
      case 'textvbaraccent':
        return f('\u030d');
      case 'textdoublevbaraccent':
        return f('\u030e');
      case 'textsubsquare':
        return f('\u033b');
      case 'textsubarch':
        return f('\u032f');
      case 's':
      case 'textsyllabic':
        return f('\u0329');
      default:
        throw new Error(`Unknown LaTeX command \\${nm}`);
    }
  }

  /**
   * Translate one input character.
   *
   * @param {string} ch - a single character.
   * @returns {string|undefined} output text, or `undefined` when the
   *   character was consumed to select a pending suffix.
   * @throws {Error} when the character is unknown, or is not accepted by the
   *   pending table and that table has no default entry.
   */
  processChar(ch) {
    if (this.debug) /* istanbul ignore next */ {
      console.error(`processChar(${ch}) isSuffix = ${this.isSuffix} lut =`, this.debug(this.lut));
    }
    let prefix = '', suffix = '';
    switch (typeof this.lut) {
      case 'undefined':
        break;
      case 'object':
        if (this.lut[ch]) {
          const s = this.lut[ch];
          if (!this.isSuffix) {
            // Replacement table: the entry is the output for this char.
            this.lut = undefined;
            return s;
          } else {
            // Accent table: the char only selects the accent, which stays
            // pending as a suffix for the next item.
            this.lut = s;
            return undefined;
          }
        }
        if (this.lut['']) this.lut = this.lut[''];
        else throw new Error(`Cannot find LUT entry ${ch}`);
        // fallthrough
      case 'string':
        if (!this.isSuffix) {
          prefix = this.lut;
        } else {
          suffix = this.lut;
        }
        this.isSuffix = false;
        this.lut = undefined;
        break;
    }
    if (ch === '"') {
      // A double quote opens a stress mark: one quote selects the '' entry,
      // a second quote selects the '"' entry.
      this.isSuffix = false;
      this.lut = {
        '': '\u02c8',
        '"': '\u02cc',
      };
      if (suffix)
        throw new Error('Stress cannot occur after suffix');
      return prefix;
    }
    const sym = basicCharLUT[ch];
    if (!sym)
      throw new Error(`Cannot find shortcut character ${ch}`);
    return prefix + sym + suffix;
  }

  /**
   * Emit a whitespace break, flushing a pending prefix in front of it.
   *
   * @param {string} ch - the break text to emit.
   * @returns {string} pending prefix (if any) followed by `ch`.
   * @throws {Error} when a suffix is pending, or a pending table has no
   *   default entry.
   */
  processBreak(ch) {
    switch (typeof this.lut) {
      case 'undefined':
        return ch;
      case 'object':
        if (this.lut['']) this.lut = this.lut[''];
        else throw new Error('Break is not allowed here');
        // fallthrough
      case 'string': {
        if (this.isSuffix)
          throw new Error('Break cannot occur after suffix');
        const s = this.lut;
        this.isSuffix = false;
        this.lut = undefined;
        return s + ch;
      }
      /* istanbul ignore next */
      default:
        throw new Error('Internal error', this.lut);
    }
  }

  /**
   * Translate a group node, wrapping its output in the pending prefix/suffix.
   *
   * @param {{content: Array}} ast - group node.
   * @returns {string} output text of the group.
   * @throws {Error} when a pending table has no default entry.
   */
  processGroup(ast) {
    let prefix = '', suffix = '';
    switch (typeof this.lut) {
      case 'undefined':
        break;
      case 'object':
        if (this.lut['']) this.lut = this.lut[''];
        else throw new Error('Group is not allowed here, as a char is required by LUT');
        // fallthrough
      case 'string':
        if (!this.isSuffix) {
          prefix = this.lut;
        } else {
          suffix = this.lut;
        }
        this.isSuffix = false;
        this.lut = undefined;
        break;
      /* istanbul ignore next */
      default:
        throw new Error('Internal error', this.lut);
    }
    return prefix + this.endOfGroup(ast.content) + suffix;
  }

  /**
   * Dispatch one AST node to the matching process* method.
   *
   * @param {{kind: string}} ast - AST node.
   * @returns {string|Array|undefined} output of the node.
   * @throws {Error} for an unknown node kind.
   */
  take(ast) {
    if (this.debug) /* istanbul ignore next */ {
      console.error(`take(ast: ${formatAST(ast)}) isSuffix = ${this.isSuffix} lut =`, this.debug(this.lut));
    }
    switch (ast.kind) {
      case 'ast.root':
      case 'arg.group':
        return this.processGroup(ast);
      case 'command':
        return this.processCommand(ast.name, ast.args);
      case 'text.string':
        return [...ast.content].map((ch) => this.processChar(ch));
      /* istanbul ignore next */
      case 'activeCharacter':
        return this.processChar('~');
      case 'space':
      case 'softbreak':
        return this.processBreak(' ');
      case 'parbreak':
        return this.processBreak('\n');
      default:
        throw new Error(`Unknown ast kind ${ast.kind}`);
    }
  }
}
module.exports = (latex, debug) => {
const ast = latexParser.parse(latex);
const trans = new TipaTranspiler(debug);
try {
return fancyjoin(trans.take(ast));
} catch (e) {
if (debug) /* istanbul ignore next */ {
console.error(`Erorr ${e.message} when parsing the following AST:`);
console.error(JSON.stringify(ast, null, 2));
console.error(e);
}
throw e;
}
};