biblatex-csl-converter

Version:

Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity

github.com/fiduswriter/biblatex-csl-converter

fiduswriter/biblatex-csl-converter

320 lines (306 loc) • 10 kB

text/typescript

import { langidOptions } from "../const" /** * Reverse-lookup map: various representations of a language → internal langid * key used by BibFieldTypes.langid (e.g. "german", "usenglish"). * * Covers: * - The internal key itself ("german" → "german") * - Full BCP-47 CSL code ("de-de" → "german") * - Bare BCP-47 language subtag ("de" → "german") * - BibLaTeX alias ("ngerman"→ "german") * - Common full English name ("german" → "german", already identity) * - ISO 639-2/T three-letter codes ("ger" → "german", "eng" → "usenglish") */ const LANGID_BY_CODE: Map<string, string> = (() => { const m = new Map<string, string>() // Preferred mappings for bare BCP-47 language subtags and common full names // that would otherwise resolve to an unexpected regional variant. const preferred: Record<string, string> = { // BCP-47 bare subtags en: "usenglish", de: "german", fr: "french", pt: "portuguese", zh: "chinese", sr: "serbian", no: "norwegian", // ISO 639-2/T codes (used by PubMed NBIB, some RIS exporters, etc.) eng: "usenglish", ger: "german", deu: "german", fra: "french", fre: "french", spa: "spanish", ita: "italian", por: "portuguese", por_br: "brportuguese", zho: "chinese", chi: "chinese", jpn: "japanese", kor: "korean", ara: "arabic", rus: "russian", pol: "polish", nld: "dutch", dut: "dutch", swe: "swedish", nor: "norwegian", dan: "danish", fin: "finnish", ces: "czech", cze: "czech", slk: "slovak", slo: "slovak", hrv: "croatian", slv: "slovene", bul: "bulgarian", ron: "romanian", rum: "romanian", hun: "hungarian", tur: "turkish", heb: "hebrew", ell: "greek", gre: "greek", lat: "latin", cat: "catalan", eus: "basque", baq: "basque", afr: "afrikaans", ukr: "ukrainian", vie: "vietnamese", tha: "thai", lit: "lithuanian", lav: "latvian", est: "estonian", isl: "icelandic", ice: "icelandic", mon: "mongolian", fas: "farsi", per: "farsi", srp: "serbian", // Common full English language names (case-insensitively applied below) english: "usenglish", german: "german", french: "french", spanish: "spanish", italian: "italian", portuguese: "portuguese", chinese: "chinese", japanese: "japanese", arabic: "arabic", russian: "russian", polish: "polish", dutch: "dutch", swedish: "swedish", norwegian: "norwegian", danish: "danish", finnish: "finnish", czech: "czech", slovak: "slovak", croatian: "croatian", slovene: "slovene", slovenian: "slovene", bulgarian: "bulgarian", romanian: "romanian", hungarian: "hungarian", turkish: "turkish", hebrew: "hebrew", greek: "greek", latin: "latin", catalan: "catalan", basque: "basque", afrikaans: "afrikaans", ukrainian: "ukrainian", vietnamese: "vietnamese", thai: "thai", lithuanian: "lithuanian", latvian: "latvian", estonian: "estonian", icelandic: "icelandic", mongolian: "mongolian", farsi: "farsi", persian: "farsi", serbian: "serbian", } for (const [code, key] of Object.entries(preferred)) { m.set(code, key) } for (const [key, val] of Object.entries(langidOptions)) { // Full BCP-47 CSL code (lower-cased), e.g. "de-de" const cslCode = val.csl.toLowerCase() if (!m.has(cslCode)) m.set(cslCode, key) // Bare language subtag, e.g. "de" const lang = cslCode.split("-")[0] if (!m.has(lang)) m.set(lang, key) // BibLaTeX alias, e.g. "ngerman" const bbl = val.biblatex.toLowerCase() if (!m.has(bbl)) m.set(bbl, key) // Internal key itself (identity), e.g. "german" if (!m.has(key.toLowerCase())) m.set(key.toLowerCase(), key) } return m })() /** * Map a raw language string (BCP-47 code, ISO 639-2 code, BibLaTeX alias, or * full English language name) to the internal langid option key recognised by * BibFieldTypes.langid (e.g. "german", "usenglish"). * * Returns `undefined` when no match is found so the caller can omit the field * rather than storing an invalid value that would crash the exporters. */ export function lookupLangid(text: string): string | undefined { return LANGID_BY_CODE.get(text.toLowerCase().trim()) } /** * Normalise a candidate BibLaTeX citation key so that it always starts with a * letter. If `candidate` already starts with a letter it is used as-is; * otherwise a synthetic key is derived from the optional `lastName` and `year` * hints (producing `{lastName}{year}`). When neither hint is available the * fallback `ref{candidate}` is used so the numeric part is still preserved. * * The returned key is also guaranteed to be unique within `usedKeys`. If the * base key is already taken, single lower-case letters (`a`, `b`, …) are * appended until a free slot is found. The chosen key is automatically added * to `usedKeys`. * * @param candidate - The raw key string produced by the parser (may be purely * numeric, a UUID, etc.). * @param usedKeys - Mutable set of keys that have already been assigned in * this parse session. Will be updated with the returned key. * @param lastName - Optional family-name fragment to use when synthesising a * key from scratch. * @param year - Optional four-digit year string to append. */ export function makeEntryKey( candidate: string, usedKeys: Set<string>, lastName?: string, year?: string, ): string { // Determine a base key that starts with a letter. let base: string if (/^[A-Za-z]/.test(candidate)) { base = candidate } else if (lastName) { // Build a clean surname: keep ASCII letters/digits only. const cleanName = lastName.replace(/[^A-Za-z0-9]/g, "") if (cleanName && /^[A-Za-z]/.test(cleanName)) { base = year ? `${cleanName}${year}` : cleanName } else if (year) { base = `ref${year}` } else { base = `ref${candidate}` } } else if (year) { base = `ref${year}` } else { base = `ref${candidate}` } // Ensure uniqueness. if (!usedKeys.has(base)) { usedKeys.add(base) return base } // Try appending a, b, c, … z, then aa, ab, … (simple single-suffix loop // covers all realistic bibliography sizes). const letters = "abcdefghijklmnopqrstuvwxyz" for (let i = 0; i < letters.length; i++) { const attempt = `${base}${letters[i]}` if (!usedKeys.has(attempt)) { usedKeys.add(attempt) return attempt } } // Ultra-rare: more than 26 duplicates — append numeric suffix. let n = 2 while (true) { const attempt = `${base}${n}` if (!usedKeys.has(attempt)) { usedKeys.add(attempt) return attempt } n++ } } /** * Given a string that starts with `{` at position `startIndex`, walks forward * tracking brace depth and JSON string escapes to find the matching closing * `}`. Returns the substring from `startIndex` up to and including that * closing brace, or `null` if no balanced closing brace is found. * * This is used to extract a JSON object from a larger string that may contain * trailing text after the object (e.g. Zotero ODT mark names append a * random ID such as " RNDjURflxg9F1" after the JSON payload). The random ID * format is: space + "RND" + exactly 10 alphanumeric characters. */ export function extractJsonObject(str: string, startIndex = 0): string | null { let depth = 0 let inString = false let escape = false for (let i = startIndex; i < str.length; i++) { const ch = str[i] if (escape) { escape = false continue } if (inString) { if (ch === "\\") { escape = true } else if (ch === '"') { inString = false } continue } if (ch === '"') { inString = true } else if (ch === "{") { depth++ } else if (ch === "}") { depth-- if (depth === 0) { return str.slice(startIndex, i + 1) } } } return null } // split at each occurence of splitToken, but only if no braces are currently open. export function splitTeXString( texString: string, splitToken = "and", ): string[] { const output = [] const tokenRe = /([^\s{}]+|\s|{|})/g let j = 0 let k = 0 let item while ((item = tokenRe.exec(texString)) !== null) { const token = item?.length ? item[0] : false if (token === false) { break } if (k === output.length) { output.push("") } switch (token) { case "{": j += 1 output[k] += token break case "}": j -= 1 output[k] += token break case splitToken: if (0 === j) { k++ } else { output[k] += token } break default: output[k] += token } } return output }