biblatex-csl-converter
Version:
a set of converters: biblatex => json, CSL => json, json => biblatex, json => CSL
497 lines (481 loc) • 20.7 kB
text/typescript
import type { NodeObject, TextNodeObject, MarkObject } from "../const"
import type { ConfigObject } from "./biblatex"
const LATEX_COMMANDS = [
// commands that can can contain richtext.
["\\textbf{", "strong"],
["\\mkbibbold{", "strong"],
["\\mkbibitalic{", "em"],
["\\mkbibemph{", "em"],
["\\textit{", "em"],
["\\emph{", "em"],
["\\textsc{", "smallcaps"],
["\\enquote{", "enquote"],
["\\mkbibquote{", "enquote"],
["\\textsubscript{", "sub"],
["\\textsuperscript{", "sup"],
]
const LATEX_NOBRACE_COMMANDS = [
// Commands that do not have a brace themselves
["\\scshape", "smallcaps"],
["\\sc", "smallcaps"],
["\\bfseries", "strong"],
["\\bf", "strong"],
["\\itshape", "em"],
["\\it", "em"],
]
const LATEX_VERBATIM_COMMANDS = [
// commands that can only contain plaintext.
["\\url{", "url"],
]
const LATEX_SPECIAL_CHARS: Record<string, string> = {
"&": "&",
"%": "%",
$: "$",
"#": "#",
_: "_",
"{": "{",
"}": "}",
",": ",",
"~": "~",
"^": "^",
"'": "'",
";": "\u2004",
"\\": "\n",
}
export class BibLatexLiteralParser {
string: string
config: ConfigObject
cpMode: boolean
braceLevel: number
slen: number
si: number
json: NodeObject[]
braceClosings: boolean[]
currentMarks: MarkObject[]
inCasePreserve: number | null
textNode?: TextNodeObject
constructor(string: string, config: ConfigObject, cpMode = false) {
this.string = string
this.config = config
this.cpMode = cpMode // Whether to consider case preservation.
this.braceLevel = 0
this.slen = string.length
this.si = 0 // string index
this.json = []
this.braceClosings = []
this.currentMarks = []
this.inCasePreserve = null
this.addNewTextNode()
}
// If the last text node has no content, remove it.
removeIfEmptyTextNode(): void {
if (this.textNode) {
if (this.textNode.text.length === 0) {
this.json.pop()
} else if (!this.config.includeUnusedNocase) {
this.removeUnusedNocase()
}
}
}
checkAndAddNewTextNode(): void {
if (this.textNode && this.textNode.text.length > 0) {
// We have text in the last node already,
// so we need to start a new text node.
if (!this.config.includeUnusedNocase) {
this.removeUnusedNocase()
}
this.addNewTextNode()
}
}
removeUnusedNocase(): void {
if (
this.textNode?.marks &&
this.textNode.text === this.textNode.text.toLowerCase()
) {
const marks = this.textNode.marks.filter(
(mark) => mark.type !== "nocase"
)
if (marks.length) {
this.textNode.marks = marks
} else {
delete this.textNode.marks
}
}
}
addNewTextNode(): void {
const textNode: TextNodeObject = { type: "text", text: "" }
this.json.push(textNode)
this.textNode = textNode
}
stringParser(): NodeObject[] {
let variable, sj
parseString: while (this.si < this.slen) {
switch (this.string[this.si]) {
case "\\":
for (let command of LATEX_COMMANDS) {
if (
this.string.substring(
this.si,
this.si + command[0].length
) === command[0]
) {
this.braceLevel++
this.si += command[0].length
this.checkAndAddNewTextNode()
if (this.cpMode) {
// If immediately inside a brace that added case protection, remove case protection. See
// http://tex.stackexchange.com/questions/276943/biblatex-how-to-emphasize-but-not-caps-protect
if (
this.inCasePreserve ===
this.braceLevel - 1 &&
this.string[this.si - 1] === "{" &&
this.currentMarks[
this.currentMarks.length - 1
].type === "nocase"
) {
this.currentMarks.pop()
this.inCasePreserve = null
} else if (
!this.currentMarks.find(
(mark) => mark.type === "nocase"
)
) {
// If not immediately inside a brace, any styling also
// adds case protection.
this.currentMarks.push({
type: "nocase",
})
this.inCasePreserve = this.braceLevel
}
}
if (
this.currentMarks.find(
(mark) => mark.type === command[1]
)
) {
this.braceClosings.push(false)
} else {
this.currentMarks.push({ type: command[1] })
this.textNode!.marks = this.currentMarks.slice()
this.braceClosings.push(true)
}
continue parseString
}
}
for (let command of LATEX_NOBRACE_COMMANDS) {
if (
this.string.substring(
this.si,
this.si + command[0].length
) === command[0] &&
[" ", "\\"].includes(
this.string.substring(
this.si + command[0].length,
this.si + command[0].length + 1
)
)
) {
this.si += command[0].length
if (
this.string.substring(this.si, this.si + 1) ===
" "
) {
this.si++
}
if (
!this.currentMarks.find(
(mark) => mark.type === command[1]
)
) {
this.currentMarks.push({ type: command[1] })
}
this.textNode!.marks = this.currentMarks.slice()
continue parseString
}
}
for (let command of LATEX_VERBATIM_COMMANDS) {
if (
this.string.substring(
this.si,
this.si + command[0].length
) === command[0]
) {
this.checkAndAddNewTextNode()
this.textNode!.marks = this.currentMarks.slice()
this.textNode!.marks.push({ type: command[1] })
this.si += command[0].length
let sj = this.si
let internalBraceLevel = 0
while (
sj < this.slen &&
(this.string[sj] !== "}" ||
internalBraceLevel > 0)
) {
switch (this.string[sj]) {
case "{":
internalBraceLevel++
break
case "}":
internalBraceLevel--
break
}
sj++
}
this.textNode!.text = this.string.substring(
this.si,
sj
)
this.addNewTextNode()
this.si = sj + 1
continue parseString
}
}
if (LATEX_SPECIAL_CHARS[this.string[this.si + 1]]) {
this.textNode!.text +=
LATEX_SPECIAL_CHARS[this.string[this.si + 1]]
this.si += 2
} else {
// We don't know the command and skip it.
this.si++
while (
this.si < this.slen &&
this.string[this.si].match("[a-zA-Z0-9]")
) {
this.si++
}
// If there is a brace at the end of the command,
// increase brace level but ignore brace.
if (this.string[this.si] === "{") {
this.braceLevel++
this.braceClosings.push(false)
this.si++
}
}
break
case "_":
switch (this.string[this.si + 1]) {
case "{":
this.checkAndAddNewTextNode()
this.braceLevel++
this.si += 2
if (
this.currentMarks.find(
(mark) => mark.type === "sub"
)
) {
this.braceClosings.push(false)
} else {
this.currentMarks.push({ type: "sub" })
this.textNode!.marks = this.currentMarks.slice()
this.braceClosings.push(true)
}
break
case "\\":
// There is a command following directly. Ignore the sub symbol.
this.si++
break
default:
// We only add the next character to a sub node.
this.checkAndAddNewTextNode()
this.textNode!.marks = this.currentMarks.slice()
this.textNode!.marks.push({ type: "sub" })
this.textNode!.text = this.string[this.si + 1]
this.addNewTextNode()
if (this.currentMarks.length) {
this.textNode!.marks = this.currentMarks.slice()
}
this.si += 2
}
break
case "`":
if (this.string[this.si + 1] === "`") {
this.checkAndAddNewTextNode()
this.braceLevel++
this.si += 2
if (
this.currentMarks.find(
(mark) => mark.type === "enquote"
)
) {
this.braceClosings.push(false)
} else {
this.currentMarks.push({ type: "enquote" })
this.textNode!.marks = this.currentMarks.slice()
this.braceClosings.push(true)
}
} else {
this.textNode!.text += this.string[this.si]
this.si++
}
break
case "'":
if (this.string[this.si + 1] === "'") {
this.braceLevel--
if (this.braceLevel > -1) {
let closeBrace = this.braceClosings.pop()
if (closeBrace) {
this.checkAndAddNewTextNode()
this.currentMarks.pop()
if (this.currentMarks.length) {
this.textNode!.marks =
this.currentMarks.slice()
} else {
delete this.textNode!.marks
}
}
this.si += 2
//continue parseString
} else {
// A brace was closed before it was opened. Abort and return the original string.
return [{ type: "text", text: this.string }]
}
} else {
this.textNode!.text += this.string[this.si]
this.si++
}
break
case "^":
switch (this.string[this.si + 1]) {
case "{":
this.checkAndAddNewTextNode()
this.braceLevel++
this.si += 2
if (
this.currentMarks.find(
(mark) => mark.type === "sup"
)
) {
this.braceClosings.push(false)
} else {
this.currentMarks.push({ type: "sup" })
this.textNode!.marks = this.currentMarks.slice()
this.braceClosings.push(true)
}
break
case "\\":
// There is a command following directly. Ignore the sup symbol.
this.si++
break
default:
// We only add the next character to a sup node.
this.checkAndAddNewTextNode()
this.textNode!.marks = this.currentMarks.slice()
this.textNode!.marks.push({ type: "sup" })
this.textNode!.text = this.string[this.si + 1]
this.addNewTextNode()
if (this.currentMarks.length) {
this.textNode!.marks = this.currentMarks.slice()
}
this.si += 2
}
break
case "{":
if (this.string[this.si + 1] === "}") {
// bracket is closing immediately. Ignore it.
this.si += 2
continue
}
this.braceLevel++
if (this.inCasePreserve || !this.cpMode) {
// If already inside case preservation, do not add a second
this.braceClosings.push(false)
} else {
this.inCasePreserve = this.braceLevel
this.checkAndAddNewTextNode()
this.currentMarks.push({ type: "nocase" })
this.textNode!.marks = this.currentMarks.slice()
this.braceClosings.push(true)
}
this.si++
break
case "}":
this.braceLevel--
if (this.braceLevel > -1) {
let closeBrace = this.braceClosings.pop()
if (closeBrace) {
this.checkAndAddNewTextNode()
let lastMark = this.currentMarks.pop()!
if (this.inCasePreserve === this.braceLevel + 1) {
this.inCasePreserve = null
// The last tag may have added more tags. The
// lowest level will be the case preserving one.
while (
lastMark.type !== "nocase" &&
this.currentMarks.length
) {
lastMark = this.currentMarks.pop()!
}
}
if (this.currentMarks.length) {
this.textNode!.marks = this.currentMarks.slice()
} else {
delete this.textNode!.marks
}
}
this.si++
//continue parseString
} else {
// A brace was closed before it was opened. Abort and return the original string.
return [{ type: "text", text: this.string }]
}
break
case "$":
// math env, just remove
this.si++
break
case "~":
// a non-breakable space
this.textNode!.text += "\u00A0"
this.si++
break
case "\u0870":
// An undefined variable.
this.removeIfEmptyTextNode()
sj = this.si + 1
while (sj < this.slen && this.string[sj] !== "\u0870") {
sj++
}
variable = this.string.substring(this.si + 1, sj)
this.json.push({ type: "variable", attrs: { variable } })
this.addNewTextNode()
this.si = sj + 1
break
case "\u0871":
// A backslash
this.textNode!.text += "\\"
this.si++
break
case "\r":
this.si++
break
case "\n":
if (
["\r", "\n"].includes(this.string[this.si + 1]) &&
this.string[this.si - 1] !== "\n"
) {
this.textNode!.text += "\n\n"
} else if (
/\S/.test(this.string[this.si - 1]) &&
/\S/.test(this.string[this.si + 1])
) {
this.textNode!.text += " "
}
this.si++
break
default:
this.textNode!.text += this.string[this.si]
this.si++
}
}
if (this.braceLevel > 0) {
// Too many opening braces, we return the original string.
return [{ type: "text", text: this.string }]
}
this.removeIfEmptyTextNode()
// Braces were accurate.
return this.json
}
get output(): NodeObject[] {
return this.stringParser()
}
}