biblatex-csl-converter

Version:

Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity

github.com/fiduswriter/biblatex-csl-converter

fiduswriter/biblatex-csl-converter

456 lines (398 loc) • 16.6 kB

text/typescript

/**
 * Bibliography format sniffer
 *
 * Inspects a raw string and returns the most likely import format identifier,
 * or `null` if the format cannot be determined.
 *
 * Recognised format identifiers
 * ──────────────────────────────
 *   "biblatex"       – BibTeX / BibLaTeX  (@article{…}, @book{…}, …)
 *   "ris"            – RIS                (TY  - … ER  - )
 *   "enw"            – EndNote tagged     (%0 …)
 *   "nbib"           – PubMed / MEDLINE NBIB  (PMID- …)
 *   "endnote_xml"    – EndNote XML export (<xml><records>…)
 *   "citavi_xml"     – Citavi native XML project (<CitaviExchangeData …)
 *   "csl_json"       – CSL-JSON (array or object whose values carry "type")
 *   "citavi_json"    – Citavi JSON / WordPlaceholder export
 *   "odt_citations"  – ODT content.xml with embedded citation marks
 *   "docx_citations" – DOCX word/document.xml with embedded citation fields
 *
 * Design principles
 * ──────────────────
 *   • Work on a short prefix of the string wherever possible (4 KB for most
 *     checks, 16 KB for a few XML markers). The one exception is JSON whose
 *     root is an object: it is handed to `JSON.parse` in full.
 *   • Prefer cheap string operations over full parses.
 *   • Use a clear priority order so that formats that are strict supersets of
 *     others (e.g. Citavi XML vs. generic XML) are tested first.
 *   • Never throw — return `null` on any unexpected input.
 */

export type ImportFormat =
    | "biblatex"
    | "ris"
    | "enw"
    | "nbib"
    | "endnote_xml"
    | "citavi_xml"
    | "csl_json"
    | "citavi_json"
    | "odt_citations"
    | "docx_citations"

/**
 * The maximum number of characters from the beginning of the input that the
 * sniffer will examine. Large enough to skip an XML declaration and BOM but
 * still tiny compared to any real bibliography file.
 */
const PEEK = 4096

/** Wider prefix of the untrimmed input used for a few XML root markers. */
const WIDE_PEEK = PEEK * 4

/** Leading `<` further in than this is not treated as "garbage, then XML". */
const MAX_XML_PREAMBLE = 16

/** Number of leading lines inspected when counting MEDLINE tag lines. */
const NBIB_LINE_LIMIT = 20

/** A MEDLINE-style tag line: 2-4 uppercase letters, 0-2 spaces, "- ", value. */
const NBIB_TAG_LINE = /^[A-Z]{2,4}[ ]{0,2}- \S/

/** Same pattern as `NBIB_TAG_LINE`, matching at the start of any line. */
const NBIB_TAG_LINE_ANYWHERE = /^[A-Z]{2,4}[ ]{0,2}- \S/m

/** Single-character whitespace test, hoisted out of the scanning loop. */
const JSON_WHITESPACE = /\s/

/** Substrings in the head that identify WordprocessingML (DOCX) XML. */
const DOCX_MARKERS: readonly string[] = [
    "schemas.openxmlformats.org/wordprocessingml",
    "<w:document",
    "<w:body",
]

/**
 * Substrings in the head that identify ODT content with citation marks.
 * "CSL_CITATION" also covers the longer "ZOTERO_ITEM CSL_CITATION" marker.
 */
const ODT_MARKERS: readonly string[] = [
    "xmlns:text=",
    "<text:bibliography-mark",
    "CSL_CITATION",
    "JABREF_",
]

/** Substrings in the wide prefix that identify an EndNote XML export. */
const ENDNOTE_MARKERS: readonly string[] = [
    "<xml>",
    "<records>",
    "<ref-type",
    '<source-app name="EndNote"',
    "<record>",
]

/**
 * Sniff a raw bibliography string and return its most likely format, or
 * `null` when the format cannot be identified with reasonable confidence.
 *
 * Checks run in a fixed priority order: XML, JSON, RIS, ENW, NBIB, BibLaTeX.
 *
 * @param input - The raw string content of a bibliography file.
 * @returns An `ImportFormat` identifier, or `null`.
 */
export function sniffFormat(input: string): ImportFormat | null {
    // Runtime guard: callers from plain JavaScript may pass a non-string.
    if (typeof input !== "string" || input.length === 0) return null

    // `trimStart` strips leading white space, which per ECMAScript includes
    // U+FEFF, so a byte-order mark never survives into `head`.
    const head = input.trimStart().slice(0, PEEK)

    // ── 1. XML-based formats ────────────────────────────────────────────────
    if (head.startsWith("<")) {
        return sniffXml(head, input)
    }
    // Tolerate a few characters of garbage before the first tag (e.g. a stray
    // backtick fence). Unlike the branch above, an unrecognised result here
    // falls through to the remaining checks.
    const firstAngle = head.indexOf("<")
    if (firstAngle !== -1 && firstAngle < MAX_XML_PREAMBLE) {
        const xmlResult = sniffXml(head.slice(firstAngle), input)
        if (xmlResult !== null) return xmlResult
    }

    // ── 2. JSON-based formats ───────────────────────────────────────────────
    if (head.startsWith("{") || head.startsWith("[")) {
        return sniffJson(head, input)
    }

    // ── 3. Line-oriented tagged formats ─────────────────────────────────────
    // RIS: the first non-blank text must be "TY", two spaces, "- ", a value.
    if (/^TY {2}- \S/.test(head)) return "ris"

    // ENW (EndNote tagged): records open with "%0 <type>".
    if (/^%0 \S/.test(head)) return "enw"

    // NBIB / PubMed-MEDLINE: tag lines such as "PMID- …" near the top.
    if (isNBIB(head)) return "nbib"

    // ── 4. BibTeX / BibLaTeX ────────────────────────────────────────────────
    // Scans the whole head, so leading comment lines are tolerated.
    if (hasBiblatexEntry(head)) return "biblatex"

    return null
}

// ─── XML discriminator ───────────────────────────────────────────────────────

/**
 * Decide which XML-based format the input is.
 *
 * @param head - Trimmed prefix of the input, starting at the first `<`.
 * @param full - The complete, untrimmed input; only its first 16 KB is read.
 * @returns The detected XML format, or `null` for unrecognised XML.
 */
function sniffXml(head: string, full: string): ImportFormat | null {
    const wide = full.slice(0, WIDE_PEEK)

    // Citavi native project XML: root element is <CitaviExchangeData>.
    // Tested before the generic EndNote check.
    if (
        head.includes("<CitaviExchangeData") ||
        wide.includes("<CitaviExchangeData")
    ) {
        return "citavi_xml"
    }

    // DOCX word/document.xml: WordprocessingML namespace URI or core elements.
    if (DOCX_MARKERS.some((marker) => head.includes(marker))) {
        return "docx_citations"
    }

    // ODT content.xml: OpenDocument "text:" namespace or citation marks.
    // DOCX was ruled out above, so these markers are taken to mean ODT.
    if (ODT_MARKERS.some((marker) => head.includes(marker))) {
        return "odt_citations"
    }

    // EndNote XML: searched in the wider prefix because the record markers
    // may sit behind a longer preamble.
    if (ENDNOTE_MARKERS.some((marker) => wide.includes(marker))) {
        return "endnote_xml"
    }

    // Unknown XML — we cannot identify it.
    return null
}

// ─── JSON discriminator ──────────────────────────────────────────────────────

/**
 * Decide which JSON-based format the input is.
 *
 * A parsed sample is classified when one can be obtained; its verdict is
 * final, even when that verdict is `null`. The substring heuristics below run
 * only when no sample could be parsed (e.g. truncated or invalid JSON).
 *
 * @param head - Trimmed prefix of the input, starting with `{` or `[`.
 * @param full - The complete, untrimmed input.
 * @returns The detected JSON format, or `null`.
 */
function sniffJson(head: string, full: string): ImportFormat | null {
    try {
        const sample = tryParseJsonHead(head, full)
        if (sample !== null) {
            return classifyJsonValue(sample)
        }
    } catch {
        // Best effort: fall through to the substring heuristics.
    }

    // Citavi JSON: SwissAcademic type annotations and Citavi-specific keys.
    if (
        head.includes("SwissAcademic.Citavi") ||
        head.includes('"$type"') ||
        head.includes('"ReferenceType"') ||
        head.includes('"BibTeXKey"')
    ) {
        return "citavi_json"
    }

    // CSL JSON: the "id" / "type" key pair.
    if (head.includes('"id"') && head.includes('"type"')) {
        return "csl_json"
    }

    // Citavi plain array: objects with "Title" / "Authors" keys.
    if (head.includes('"Title"') && head.includes('"Authors"')) {
        return "citavi_json"
    }

    return null
}

/**
 * Try to extract a representative parsed value from the raw JSON text.
 *
 * Strategy:
 *   - If the outer container is an array, parse just its first element and
 *     return it wrapped in a one-element array.
 *   - Otherwise parse the whole input. This costs time proportional to the
 *     input size; there is no partial-parse fallback for objects.
 *
 * @param head - Trimmed prefix of the input; only its first character is used.
 * @param full - The complete, untrimmed input.
 * @returns The parsed value on success, or `null` on any failure.
 */
function tryParseJsonHead(
    head: string,
    full: string,
): Record<string, unknown> | Record<string, unknown>[] | null {
    if (head.startsWith("[")) {
        const firstElement = findFirstArrayElement(full)
        if (firstElement === null) return null
        try {
            return [JSON.parse(firstElement)] as Record<string, unknown>[]
        } catch {
            return null
        }
    }

    try {
        return JSON.parse(full) as Record<string, unknown>
    } catch {
        return null
    }
}

/**
 * Extract the text of the first element of a JSON array from the full input.
 *
 * Object and array elements are delimited by brace/bracket matching that
 * respects JSON strings. A scalar element is read up to the next `,` or `]`
 * after the element start; this is not string-aware, so a string element
 * containing either character is cut short (and then fails to parse).
 *
 * @param full - Text containing a JSON array; anything before the first `[`
 *   is ignored.
 * @returns The element text, or `null` when no `[` or no element end is found.
 */
function findFirstArrayElement(full: string): string | null {
    const start = full.indexOf("[")
    if (start === -1) return null

    // Skip whitespace between '[' and the first element.
    let i = start + 1
    while (i < full.length && JSON_WHITESPACE.test(full.charAt(i))) i++
    if (i >= full.length) return null

    const ch = full.charAt(i)
    if (ch === "{" || ch === "[") {
        const end = findMatchingBrace(full, i, ch, ch === "{" ? "}" : "]")
        return end === -1 ? null : full.slice(i, end + 1)
    }

    // Scalar first element. The search must begin at the element itself:
    // starting from index 0 would stop at any ',' or ']' that precedes the
    // opening '[' and produce an empty slice.
    const terminator = /[,\]]/g
    terminator.lastIndex = i
    const hit = terminator.exec(full)
    if (hit === null) return null
    return full.slice(i, hit.index).trim()
}

/**
 * Scan forward from `pos` (which must point at `open`) and return the index
 * of the matching `close` character, respecting nesting and JSON strings.
 *
 * @param s - The text to scan.
 * @param pos - Index of the opening delimiter.
 * @param open - Opening delimiter, e.g. `{`.
 * @param close - Closing delimiter, e.g. `}`.
 * @returns Index of the matching closer, or `-1` if the text ends first.
 */
function findMatchingBrace(
    s: string,
    pos: number,
    open: string,
    close: string,
): number {
    let depth = 0
    let inString = false
    let escaped = false

    for (let i = pos; i < s.length; i++) {
        const c = s.charAt(i)

        // The character after a backslash inside a string is always literal.
        if (escaped) {
            escaped = false
            continue
        }
        if (inString) {
            if (c === "\\") escaped = true
            else if (c === '"') inString = false
            continue
        }

        if (c === '"') {
            inString = true
        } else if (c === open) {
            depth++
        } else if (c === close) {
            depth--
            if (depth === 0) return i
        }
    }
    return -1
}

/** Narrow an unknown value to a non-null object (arrays included). */
function isObjectLike(value: unknown): value is Record<string, unknown> {
    return typeof value === "object" && value !== null
}

/**
 * Classify a parsed JSON value as a specific import format.
 *
 * @param value - A parsed object, or an array whose first element is the
 *   sample to classify.
 * @returns The detected format, or `null`.
 */
function classifyJsonValue(
    value: Record<string, unknown> | Record<string, unknown>[],
): ImportFormat | null {
    // Array: could be a CSL-JSON array or a Citavi reference array.
    if (Array.isArray(value)) {
        const first: unknown = value[0]
        return isObjectLike(first) ? classifyJsonObject(first) : null
    }

    // Object: a CSL-JSON map keyed by citation ID, or a Citavi export object.
    return isObjectLike(value) ? classifyJsonObject(value) : null
}

/**
 * Inspect the keys and values of a single JSON object to determine its format.
 * Citavi shapes are tested before CSL-JSON shapes.
 *
 * @param obj - The parsed object to inspect.
 * @returns `"citavi_json"`, `"csl_json"`, or `null` when nothing matches.
 */
function classifyJsonObject(obj: Record<string, unknown>): ImportFormat | null {
    // Citavi WordPlaceholder / inline JSON: "$type" naming a SwissAcademic type.
    const typeTag = obj.$type
    if (typeof typeTag === "string" && typeTag.includes("SwissAcademic")) {
        return "citavi_json"
    }

    // Citavi project JSON export: top-level "References" array of objects.
    const references = obj.References
    if (Array.isArray(references) && isObjectLike(references[0])) {
        return "citavi_json"
    }

    // Citavi WordPlaceholder: top-level "Entries" array whose first entry
    // carries a string "$type" or "ReferenceId".
    const entries = obj.Entries
    if (Array.isArray(entries)) {
        const firstEntry: unknown = entries[0]
        if (
            isObjectLike(firstEntry) &&
            (typeof firstEntry.$type === "string" ||
                typeof firstEntry.ReferenceId === "string")
        ) {
            return "citavi_json"
        }
    }

    // Citavi reference object: "ReferenceType" plus "BibTeXKey" or "Title".
    if (
        typeof obj.ReferenceType === "string" &&
        (obj.BibTeXKey !== undefined || obj.Title !== undefined)
    ) {
        return "citavi_json"
    }

    // CSL-JSON entry object: must have both "id" and a string "type".
    if (typeof obj.id !== "undefined" && typeof obj.type === "string") {
        return "csl_json"
    }

    // CSL-JSON keyed map: the object's values are CSL entries. Only the first
    // value is inspected.
    const firstValue: unknown = Object.values(obj)[0]
    if (
        isObjectLike(firstValue) &&
        !Array.isArray(firstValue) &&
        typeof firstValue.type === "string" &&
        typeof firstValue.id !== "undefined"
    ) {
        return "csl_json"
    }

    return null
}

// ─── BibLaTeX heuristic ──────────────────────────────────────────────────────

/**
 * Returns `true` if the text contains a BibTeX/BibLaTeX entry opener.
 *
 * Matches `@<word>{` or `@<word>(` anywhere in the text, with optional
 * whitespace before the delimiter. `@comment`, `@preamble` and `@string`
 * match the same pattern, so files made only of those are recognised too.
 *
 * @param head - Trimmed prefix of the input.
 */
function hasBiblatexEntry(head: string): boolean {
    return /@[A-Za-z][A-Za-z0-9_]*\s*[({]/.test(head)
}

// ─── NBIB heuristic ─────────────────────────────────────────────────────────

/**
 * Returns `true` if the text looks like a MEDLINE/NBIB export.
 *
 * Tag lines start with two to four uppercase letters, up to two padding
 * spaces, then "- " and a value. Examples:
 *
 *   PMID- 39730211
 *   OWN - NLM
 *   TI  - Fish Gastroenterology.
 *   FAU - Smith, John
 *
 * Accepted when either
 *   • at least two of the first 20 lines are tag lines, or
 *   • any line in the text is a tag line and the text contains "PMID-".
 *
 * RIS lines match the same pattern; the caller tests for RIS first.
 *
 * @param head - Trimmed prefix of the input.
 */
function isNBIB(head: string): boolean {
    let tagLines = 0
    for (const line of head.split("\n").slice(0, NBIB_LINE_LIMIT)) {
        if (NBIB_TAG_LINE.test(line)) {
            tagLines++
            if (tagLines >= 2) return true
        }
    }

    // A "PMID-" marker is distinctive enough to accept with one tag line.
    return NBIB_TAG_LINE_ANYWHERE.test(head) && head.includes("PMID-")
}