biblatex-csl-converter
Version:
Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity
320 lines (306 loc) • 10 kB
text/typescript
import { langidOptions } from "../const"
/**
 * Lookup table from a lower-cased language designation to the internal langid
 * key (the property names of `langidOptions`, e.g. "german", "usenglish").
 *
 * Keys registered, in order of precedence:
 * 1. The hand-curated `preferred` entries below (bare language subtags,
 *    three-letter ISO 639-2 codes, common English language names).
 * 2. For every `langidOptions` entry, unless the key is already taken:
 *    - its full CSL locale code, lower-cased ("de-de")
 *    - the language subtag of that code ("de")
 *    - its BibLaTeX name, lower-cased ("ngerman")
 *    - the internal key itself, lower-cased ("german")
 *
 * Because existing keys are never overwritten, a `preferred` entry always wins
 * over a derived one, and earlier `langidOptions` entries win over later ones.
 */
const LANGID_BY_CODE: Map<string, string> = (() => {
    // Explicit choices for codes that would otherwise resolve to whichever
    // regional variant happens to come first in `langidOptions`.
    const preferred: Record<string, string> = {
        // Bare BCP-47 language subtags
        en: "usenglish",
        de: "german",
        fr: "french",
        pt: "portuguese",
        zh: "chinese",
        sr: "serbian",
        no: "norwegian",
        // ISO 639-2 three-letter codes (both variants where two exist,
        // e.g. "deu"/"ger")
        eng: "usenglish",
        ger: "german",
        deu: "german",
        fra: "french",
        fre: "french",
        spa: "spanish",
        ita: "italian",
        por: "portuguese",
        por_br: "brportuguese",
        zho: "chinese",
        chi: "chinese",
        jpn: "japanese",
        kor: "korean",
        ara: "arabic",
        rus: "russian",
        pol: "polish",
        nld: "dutch",
        dut: "dutch",
        swe: "swedish",
        nor: "norwegian",
        dan: "danish",
        fin: "finnish",
        ces: "czech",
        cze: "czech",
        slk: "slovak",
        slo: "slovak",
        hrv: "croatian",
        slv: "slovene",
        bul: "bulgarian",
        ron: "romanian",
        rum: "romanian",
        hun: "hungarian",
        tur: "turkish",
        heb: "hebrew",
        ell: "greek",
        gre: "greek",
        lat: "latin",
        cat: "catalan",
        eus: "basque",
        baq: "basque",
        afr: "afrikaans",
        ukr: "ukrainian",
        vie: "vietnamese",
        tha: "thai",
        lit: "lithuanian",
        lav: "latvian",
        est: "estonian",
        isl: "icelandic",
        ice: "icelandic",
        mon: "mongolian",
        fas: "farsi",
        per: "farsi",
        srp: "serbian",
        // Common English language names (stored lower-case)
        english: "usenglish",
        german: "german",
        french: "french",
        spanish: "spanish",
        italian: "italian",
        portuguese: "portuguese",
        chinese: "chinese",
        japanese: "japanese",
        arabic: "arabic",
        russian: "russian",
        polish: "polish",
        dutch: "dutch",
        swedish: "swedish",
        norwegian: "norwegian",
        danish: "danish",
        finnish: "finnish",
        czech: "czech",
        slovak: "slovak",
        croatian: "croatian",
        slovene: "slovene",
        slovenian: "slovene",
        bulgarian: "bulgarian",
        romanian: "romanian",
        hungarian: "hungarian",
        turkish: "turkish",
        hebrew: "hebrew",
        greek: "greek",
        latin: "latin",
        catalan: "catalan",
        basque: "basque",
        afrikaans: "afrikaans",
        ukrainian: "ukrainian",
        vietnamese: "vietnamese",
        thai: "thai",
        lithuanian: "lithuanian",
        latvian: "latvian",
        estonian: "estonian",
        icelandic: "icelandic",
        mongolian: "mongolian",
        farsi: "farsi",
        persian: "farsi",
        serbian: "serbian",
    }

    // Seed the table with the curated entries so they take precedence.
    const lookup = new Map<string, string>(Object.entries(preferred))

    // Register `code` only when nothing has claimed it yet.
    const addIfAbsent = (code: string, langid: string): void => {
        if (!lookup.has(code)) lookup.set(code, langid)
    }

    for (const [langid, option] of Object.entries(langidOptions)) {
        // Full CSL locale code, e.g. "de-de"
        const cslCode = option.csl.toLowerCase()
        addIfAbsent(cslCode, langid)
        // Language subtag only, e.g. "de"
        const lang = cslCode.split("-")[0]
        addIfAbsent(lang, langid)
        // BibLaTeX name, e.g. "ngerman"
        const bbl = option.biblatex.toLowerCase()
        addIfAbsent(bbl, langid)
        // The internal key maps to itself, e.g. "german"
        addIfAbsent(langid.toLowerCase(), langid)
    }

    return lookup
})()
/**
 * Resolve a raw language designation to an internal langid key.
 *
 * The input is lower-cased and stripped of surrounding whitespace, then looked
 * up in `LANGID_BY_CODE`, which knows CSL locale codes, bare language
 * subtags, three-letter ISO 639-2 codes, BibLaTeX names and common English
 * language names.
 *
 * @param text - Language string as found in the source data.
 * @returns The matching langid key (e.g. "german", "usenglish"), or
 *   `undefined` when the string is not recognised, so that callers can leave
 *   the field out instead of storing an unknown value.
 */
export function lookupLangid(text: string): string | undefined {
    const normalized = text.toLowerCase().trim()
    return LANGID_BY_CODE.get(normalized)
}
/**
 * Produce a citation key that starts with an ASCII letter and is not yet
 * present in `usedKeys`.
 *
 * Base key selection, first match wins:
 * 1. `candidate` itself, when it already starts with an ASCII letter.
 * 2. `lastName` reduced to ASCII letters/digits, when that reduced name starts
 *    with a letter, followed by `year` if one was given.
 * 3. `ref{year}`, when a year was given.
 * 4. `ref{candidate}`, which keeps the original (e.g. numeric) identifier.
 *
 * Uniqueness: if the base key is taken, the suffixes `a` … `z` are tried in
 * order; if all of those are taken too, numeric suffixes `2`, `3`, … are
 * tried until a free key is found. The returned key is added to `usedKeys`.
 *
 * @param candidate - Raw key produced by the parser (may be numeric, a UUID,
 *   etc.).
 * @param usedKeys - Keys already handed out; mutated to include the result.
 * @param lastName - Optional family name used to synthesise a key.
 * @param year - Optional year string appended to the synthesised key.
 * @returns The key that was reserved in `usedKeys`.
 */
export function makeEntryKey(
    candidate: string,
    usedKeys: Set<string>,
    lastName?: string,
    year?: string,
): string {
    const startsWithLetter = (value: string): boolean =>
        /^[A-Za-z]/.test(value)

    // Pick the base key according to the precedence list in the doc comment.
    const chooseBase = (): string => {
        if (startsWithLetter(candidate)) {
            return candidate
        }
        // Strip everything but ASCII letters and digits from the surname.
        const surname = (lastName ?? "").replace(/[^A-Za-z0-9]/g, "")
        if (startsWithLetter(surname)) {
            return year ? `${surname}${year}` : surname
        }
        return year ? `ref${year}` : `ref${candidate}`
    }

    // Reserve `key` if it is free; report whether the reservation succeeded.
    const claim = (key: string): boolean => {
        if (usedKeys.has(key)) {
            return false
        }
        usedKeys.add(key)
        return true
    }

    const base = chooseBase()
    if (claim(base)) {
        return base
    }

    // First fallback: a single lower-case letter suffix.
    for (const letter of "abcdefghijklmnopqrstuvwxyz") {
        const attempt = `${base}${letter}`
        if (claim(attempt)) {
            return attempt
        }
    }

    // Second fallback (27+ collisions): numeric suffix starting at 2.
    for (let n = 2; ; n++) {
        const attempt = `${base}${n}`
        if (claim(attempt)) {
            return attempt
        }
    }
}
/**
 * Extract the brace-balanced object that begins at `startIndex`.
 *
 * The scan walks forward from `startIndex`, tracking `{`/`}` nesting depth
 * while skipping over double-quoted strings (honouring backslash escapes), so
 * braces inside JSON string values do not affect the depth.
 *
 * This is meant for pulling a JSON object out of a larger string that carries
 * trailing text after the object, e.g. `{"a":1} RNDjURflxg9F1`.
 *
 * @param str - Text containing the object; expected to have `{` at
 *   `startIndex`.
 * @param startIndex - Position at which scanning (and the returned slice)
 *   starts. Defaults to 0.
 * @returns The substring from `startIndex` up to and including the closing
 *   brace that returns the depth to zero, or `null` when the braces are not
 *   balanced: either the input ends first, or a `}` appears with no open `{`.
 */
export function extractJsonObject(str: string, startIndex = 0): string | null {
    let depth = 0
    let inString = false
    let escaped = false
    for (let i = startIndex; i < str.length; i++) {
        const ch = str[i]
        if (inString) {
            if (escaped) {
                // The character after a backslash is consumed verbatim.
                escaped = false
            } else if (ch === "\\") {
                escaped = true
            } else if (ch === '"') {
                inString = false
            }
            continue
        }
        if (ch === '"') {
            inString = true
        } else if (ch === "{") {
            depth++
        } else if (ch === "}") {
            if (depth === 0) {
                // Closing brace with nothing open: the text is unbalanced.
                // Without this guard the depth would go negative and a later
                // brace could end the scan on an unbalanced fragment.
                return null
            }
            depth--
            if (depth === 0) {
                return str.slice(startIndex, i + 1)
            }
        }
    }
    return null
}
/**
 * Split a TeX string at each occurrence of `splitToken`, but only where no
 * braces are currently open.
 *
 * The input is tokenised into runs of non-whitespace/non-brace characters,
 * single whitespace characters, and single braces. A token equal to
 * `splitToken` at brace depth zero ends the current piece and is dropped;
 * every other token is appended to the current piece unchanged. Whitespace
 * around the split token is therefore kept ("A and B" gives ["A ", " B"]).
 *
 * @param texString - The TeX source to split.
 * @param splitToken - Separator word; must appear as a whole token to match.
 *   Defaults to "and".
 * @returns The pieces in order. An empty input yields an empty array, and a
 *   trailing separator does not produce a trailing empty piece.
 */
export function splitTeXString(
    texString: string,
    splitToken = "and",
): string[] {
    const pieces: string[] = []
    const tokens = texString.match(/([^\s{}]+|\s|{|})/g) ?? []
    let braceDepth = 0
    let current = 0
    for (const token of tokens) {
        // Open a new piece lazily, once the first token after a split shows up.
        if (current === pieces.length) {
            pieces.push("")
        }
        // Braces are checked first so they win even if splitToken is a brace.
        if (token === "{") {
            braceDepth += 1
            pieces[current] += token
        } else if (token === "}") {
            braceDepth -= 1
            pieces[current] += token
        } else if (token === splitToken && braceDepth === 0) {
            current++
        } else {
            pieces[current] += token
        }
    }
    return pieces
}