biblatex-csl-converter

Version:

Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity

github.com/fiduswriter/biblatex-csl-converter

fiduswriter/biblatex-csl-converter

771 lines (683 loc) • 27.5 kB

text/typescript

/**
 * ODT Citations importer
 *
 * Extracts bibliographic citations from the XML of an ODT file's
 * content.xml (or any XML fragment containing the same markup).
 *
 * Supported citation manager formats and how each is handled:
 *
 *  - LibreOffice native  `<text:bibliography-mark>` elements — all
 *                        bibliographic data is stored as XML attributes.
 *                        Delegated to OdtNativeParser in odt-native.ts.
 *
 *  - Zotero              Reference mark name: `ZOTERO_ITEM CSL_CITATION
 *                        {json}`. The JSON payload's `citationItems`
 *                        array is reshaped into a Record<string, CSLEntry>
 *                        and fed to CSLParser.
 *
 *  - Mendeley Desktop    Reference mark name: `CSL_CITATION {json}`. Same
 *    (legacy)            CSL-JSON shape as Zotero; handled identically.
 *
 *  - JabRef              Reference mark name: `JABREF_{key} CID_{n} {id}`.
 *                        JabRef embeds fully rendered citation text inside
 *                        the mark, not raw bibliographic data. The citation
 *                        key is extracted from the name and a stub `misc`
 *                        entry is emitted so callers know which keys are
 *                        cited.
 *
 *  - EndNote             ODT only: plain-text placeholder `{Author, Year
 *                        #RecNum}`. No live reference marks are created.
 *                        The document body is scanned for these patterns and
 *                        a stub entry is created for each unique record
 *                        number.
 *
 * Usage:
 *   const parser = new OdtCitationsParser(contentXml)
 *   const result = parser.parse()
 *   // result.entries  → BibDB (Record<number, EntryObject>)
 *   // result.errors   → ErrorObject[]
 *   // result.warnings → ErrorObject[]
 */

import type { EntryObject, NodeArray } from "../const"
import { type CSLEntry, CSLParser } from "./csl"
import { OdtNativeParser } from "./odt-native"
import { extractJsonObject } from "./tools"

// ---------------------------------------------------------------------------
// Public result types
// ---------------------------------------------------------------------------

export interface OdtCitationsParseResult {
    entries: Record<number, EntryObject>
    errors: ErrorObject[]
    warnings: ErrorObject[]
}

// ---------------------------------------------------------------------------
// Citation accumulator — shared mutable state for multi-element processing
// ---------------------------------------------------------------------------

/**
 * Mutable accumulator passed to static extraction methods when processing
 * multiple document elements in a single pass. All fields are mutated in
 * place as entries are discovered and keys are deduplicated.
 */
export interface CitationAccumulator {
    entries: EntryObject[]
    errors: ErrorObject[]
    warnings: ErrorObject[]
    seenKeys: Set<string>
    /**
     * Persistent map from raw CSL `id` strings to the normalised `entry_key`
     * values assigned by `CSLParser`. Accumulated across all citation elements
     * processed with the same accumulator so that duplicate items can be
     * recognised and their metadata resolved to the correct key.
     */
    cslRawIdToEntryKey: Map<string, string>
}

interface ErrorObject {
    type: string
    field?: string
    value?: unknown
    entry?: string
}

// ---------------------------------------------------------------------------
// Citation item metadata
// ---------------------------------------------------------------------------

/**
 * Per-entry citation metadata, keyed by `entry_key`.
 *
 * This captures the cite-specific decorations that surround a bibliographic
 * reference inside a single citation: page locators, textual prefixes /
 * suffixes, and author-rendering flags. It is returned alongside the
 * `entries` BibDB when `retrieveMetadata` is `true` on a static method call.
 *
 * Field availability by format:
 *
 * | Field          | Zotero | Mendeley | EndNote (ODT placeholder) |
 * |----------------|--------|----------|---------------------------|
 * | locator        | ✅     | ✅       | –                         |
 * | label          | ✅     | ✅       | –                         |
 * | prefix         | ✅     | ✅       | –                         |
 * | suffix         | ✅     | ✅       | –                         |
 * | suppressAuthor | ✅     | ✅       | –                         |
 * | authorOnly     | ✅     | ✅       | –                         |
 * | authorYear     | –      | –        | –                         |
 */
export interface CitationItemMetadata {
    /** The `entry_key` of the corresponding entry in the returned `entries` BibDB. */
    entry_key: string
    /**
     * Pinpoint location within the cited work (page number, chapter, etc.).
     * For CSL formats this is the raw `locator` string.
     */
    locator?: string
    /**
     * CSL locator type label (e.g. `"page"`, `"chapter"`, `"section"`).
     * Only populated for CSL-based formats (Zotero, Mendeley).
     */
    label?: string
    /** Text to prepend to the formatted citation (e.g. `"see "`, `"cf. "`). */
    prefix?: string
    /** Text to append to the formatted citation. */
    suffix?: string
    /**
     * When `true`, author names are suppressed in the formatted output,
     * leaving only the year (and locator) in parentheses: `(2020, p. 45)`.
     * Only populated for CSL-based formats (Zotero, Mendeley).
     */
    suppressAuthor?: boolean
    /**
     * When `true`, only the author name is rendered with nothing else:
     * `William T. Williams`.
     * Only populated for CSL-based formats (Zotero, Mendeley).
     */
    authorOnly?: boolean
    /**
     * When `true`, the author name is rendered outside the parentheses while
     * the year (and locator) remain inside: `William T. Williams (2020, p. 45)`.
     * Not used by ODT formats; included for interface parity with the DOCX parser.
     */
    authorYear?: boolean
}

// ---------------------------------------------------------------------------
// Static utility result types
// ---------------------------------------------------------------------------

export interface CitationResult {
    isCitation: boolean
    format?: string // e.g., "zotero", "mendeley_legacy", "jabref", "libreoffice_native", "endnote"
    entries?: Record<number, EntryObject>
    errors?: ErrorObject[]
    warnings?: ErrorObject[]
    /**
     * Per-entry citation metadata (locators, prefixes, suffixes, flags).
     * Only populated when `retrieveMetadata` is `true` on the static method call.
     */
    metadata?: CitationItemMetadata[]
}

export interface BibliographyResult {
    isBibliography: boolean
    format?: string
}

// ---------------------------------------------------------------------------
// Internal types and constants
// ---------------------------------------------------------------------------

/** Formats that are recognised from the name of a reference mark. */
type ReferenceMarkFormat = "zotero" | "mendeley_legacy" | "jabref"

/** One element of the `citationItems` array of a CSL citation payload. */
interface CslCitationItem {
    itemData?: CSLEntry
    id?: unknown
    locator?: unknown
    label?: unknown
    prefix?: unknown
    suffix?: unknown
    "suppress-author"?: unknown
    "author-only"?: unknown
}

/** The five entities predefined by XML, keyed by entity name. */
const XML_NAMED_ENTITIES: Record<string, string> = {
    lt: "<",
    gt: ">",
    amp: "&",
    quot: '"',
    apos: "'",
}

/** Non-global detector for an EndNote `{… #RecNum …}` placeholder. */
const ENDNOTE_PLACEHOLDER_TEST = /\{[^{}]+#\d+[^{}]*\}/

// ---------------------------------------------------------------------------
// Parser class
// ---------------------------------------------------------------------------

export class OdtCitationsParser {
    private contentXml: string
    entries: EntryObject[]
    errors: ErrorObject[]
    warnings: ErrorObject[]
    /** Prevents inserting the same source twice across different mark types. */
    private seenKeys: Set<string>
    /** Persistent raw CSL id → normalised entry_key map for the instance parse. */
    private cslRawIdToEntryKey: Map<string, string>

    constructor(contentXml: string) {
        this.contentXml = contentXml
        this.entries = []
        this.errors = []
        this.warnings = []
        this.seenKeys = new Set()
        this.cslRawIdToEntryKey = new Map()
    }

    // -----------------------------------------------------------------------
    // Static utility methods for reusable citation detection and extraction
    // -----------------------------------------------------------------------

    /**
     * Check or extract citation data from a reference mark name.
     *
     * @param markName - The text:name attribute value from a reference-mark-start
     * @param retrieve - If true, extract and return full citation data; if false, only check presence
     * @param retrieveMetadata - If true, also return per-item citation metadata
     *   (only CSL-based marks produce any)
     * @param acc - Accumulator to extend. Pass the same object for every mark
     *   of a document to deduplicate across marks; when omitted a fresh one
     *   is used. The returned `entries` contain everything accumulated in
     *   `acc` so far, not only the entries of this mark.
     * @returns CitationResult with format and optionally entries/errors/warnings
     */
    static referenceMarkCitation(
        markName: string,
        retrieve = true,
        retrieveMetadata = false,
        acc: CitationAccumulator = {
            entries: [],
            errors: [],
            warnings: [],
            seenKeys: new Set<string>(),
            cslRawIdToEntryKey: new Map<string, string>(),
        },
    ): CitationResult {
        const format = OdtCitationsParser.detectReferenceMarkFormat(markName)
        if (!format) {
            return { isCitation: false }
        }
        if (!retrieve) {
            return { isCitation: true, format }
        }

        const metadata: CitationItemMetadata[] = []
        OdtCitationsParser.extractReferenceMarkData(
            markName,
            format,
            acc,
            retrieveMetadata ? metadata : undefined,
        )

        const result: CitationResult = {
            isCitation: true,
            format,
            entries: OdtCitationsParser.toBibDB(acc.entries),
            errors: acc.errors,
            warnings: acc.warnings,
        }
        if (retrieveMetadata) result.metadata = metadata
        return result
    }

    /**
     * Check whether a reference mark name denotes a bibliography rendering
     * region. Bibliography marks carry no importable source data, so only
     * the detection result and format are returned.
     *
     * @param markName - The text:name attribute value from a reference-mark-start
     * @returns BibliographyResult indicating whether it's a bibliography
     */
    static referenceMarkBibliography(markName: string): BibliographyResult {
        if (markName.startsWith("CSL_BIBLIOGRAPHY")) {
            return { isBibliography: true, format: "mendeley_legacy" }
        }
        return { isBibliography: false }
    }

    /**
     * Check whether a text:section element is a bibliography rendering
     * region. Section bibliographies carry no importable source data.
     *
     * @param sectionName - The text:name attribute value from a text:section element
     * @returns BibliographyResult indicating whether it's a bibliography and the format
     */
    static sectionBibliography(sectionName: string): BibliographyResult {
        // Zotero creates bibliography sections with text:name starting with "ZOTERO_BIBL"
        if (sectionName.startsWith("ZOTERO_BIBL")) {
            return { isBibliography: true, format: "zotero" }
        }
        // JabRef creates bibliography sections with text:name="JR_bib" or "JR_BIB"
        if (sectionName.toUpperCase() === "JR_BIB") {
            return { isBibliography: true, format: "jabref" }
        }
        return { isBibliography: false }
    }

    /**
     * Check or extract citation data from a LibreOffice native bibliography-mark element.
     *
     * @param bibMarkXml - XML string of a <text:bibliography-mark> element
     * @param retrieve - If true, extract and return full citation data
     * @returns CitationResult with format and optionally entries/errors/warnings.
     *   An exception thrown by the native parser is reported as a
     *   `libreoffice_parse_error` entry in `errors` rather than rethrown.
     */
    static bibliographyMarkCitation(
        bibMarkXml: string,
        retrieve = true,
    ): CitationResult {
        if (!bibMarkXml.includes("<text:bibliography-mark")) {
            return { isCitation: false }
        }

        const format = "libreoffice_native"
        if (!retrieve) {
            return { isCitation: true, format }
        }

        // Extract citation data by delegating to OdtNativeParser
        const errors: ErrorObject[] = []
        const warnings: ErrorObject[] = []
        let entries: Record<number, EntryObject> = {}
        try {
            const parsed = new OdtNativeParser(bibMarkXml).parse()
            warnings.push(...parsed.warnings)
            entries = OdtCitationsParser.toBibDB(parsed.entries)
        } catch (error) {
            errors.push({
                type: "libreoffice_parse_error",
                value: String(error),
            })
        }
        return { isCitation: true, format, entries, errors, warnings }
    }

    /**
     * Check or extract citation data from EndNote placeholder text.
     *
     * @param text - Text containing EndNote placeholders like {Author, Year #RecNum}
     * @param retrieve - If true, extract and return full citation data
     * @returns CitationResult with format and optionally entries/errors/warnings
     */
    static endNotePlaceholder(text: string, retrieve = true): CitationResult {
        // EndNote placeholders look like {Author, Year #RecNum}
        if (!ENDNOTE_PLACEHOLDER_TEST.test(text)) {
            return { isCitation: false }
        }

        const format = "endnote"
        if (!retrieve) {
            return { isCitation: true, format }
        }

        const acc: CitationAccumulator = {
            entries: [],
            errors: [],
            warnings: [],
            seenKeys: new Set<string>(),
            cslRawIdToEntryKey: new Map<string, string>(),
        }
        OdtCitationsParser.collectEndNotePlaceholders(text, acc)

        return {
            isCitation: true,
            format,
            entries: OdtCitationsParser.toBibDB(acc.entries),
            errors: acc.errors,
            warnings: acc.warnings,
        }
    }

    // -----------------------------------------------------------------------
    // Static helper methods for extraction logic
    // -----------------------------------------------------------------------

    /** Convert an entry list into a BibDB keyed by 1-based position. */
    private static toBibDB(
        entries: readonly EntryObject[],
    ): Record<number, EntryObject> {
        const bibDB: Record<number, EntryObject> = {}
        entries.forEach((entry, i) => {
            bibDB[i + 1] = entry
        })
        return bibDB
    }

    /**
     * Classify a reference mark name by its prefix.
     *
     * The Zotero prefix is tested before the bare `CSL_CITATION` prefix so
     * that the more specific match wins.
     */
    private static detectReferenceMarkFormat(
        markName: string,
    ): ReferenceMarkFormat | undefined {
        if (markName.startsWith("ZOTERO_ITEM CSL_CITATION")) return "zotero"
        if (markName.startsWith("CSL_CITATION")) return "mendeley_legacy"
        if (markName.startsWith("JABREF_")) return "jabref"
        return undefined
    }

    /** Dispatch a recognised reference mark to its format-specific extractor. */
    private static extractReferenceMarkData(
        markName: string,
        format: ReferenceMarkFormat,
        acc: CitationAccumulator,
        metadata?: CitationItemMetadata[],
    ): void {
        if (format === "jabref") {
            OdtCitationsParser.extractJabRefMarkData(markName, acc)
        } else {
            OdtCitationsParser.extractCslMarkData(
                markName,
                format,
                acc,
                metadata,
            )
        }
    }

    /**
     * Extract CSL citation data from Zotero or Mendeley legacy marks.
     *
     * Locates the JSON object that starts at the first `{` of the mark name
     * and hands it to `processCslJson`. A `${source}_missing_json` warning
     * is recorded when no JSON object can be extracted.
     */
    private static extractCslMarkData(
        markName: string,
        source: string,
        acc: CitationAccumulator,
        metadata?: CitationItemMetadata[],
    ): void {
        const jsonStart = markName.indexOf("{")
        const jsonStr =
            jsonStart === -1 ? null : extractJsonObject(markName, jsonStart)
        if (jsonStr === null) {
            acc.warnings.push({ type: `${source}_missing_json` })
            return
        }
        OdtCitationsParser.processCslJson(jsonStr, source, acc, metadata)
    }

    /**
     * Extract JabRef citation key from mark name.
     *
     * The key is the text between the `JABREF_` prefix and ` CID_` (or the
     * first space when ` CID_` is absent). A stub `misc` entry is emitted for
     * each key not seen before; an empty key yields a `jabref_missing_key`
     * warning.
     */
    private static extractJabRefMarkData(
        markName: string,
        acc: CitationAccumulator,
    ): void {
        const { entries, warnings, seenKeys } = acc
        const withoutPrefix = markName.slice("JABREF_".length)
        const cidIndex = withoutPrefix.indexOf(" CID_")
        const rawKey =
            cidIndex === -1
                ? withoutPrefix.split(" ")[0]
                : withoutPrefix.slice(0, cidIndex)

        // Trim before the emptiness check so a whitespace-only key is
        // reported as missing instead of producing an entry keyed "".
        const citationKey = rawKey.trim()
        if (!citationKey) {
            warnings.push({ type: "jabref_missing_key", value: markName })
            return
        }

        if (seenKeys.has(citationKey)) return
        seenKeys.add(citationKey)

        entries.push({
            entry_key: citationKey,
            bib_type: "misc",
            fields: {},
        })
    }

    /**
     * Process CSL-JSON citation payload.
     *
     * Items carrying `itemData` that have not been imported yet are parsed
     * with `CSLParser` and appended to the accumulator. When `metadata` is
     * given, one `CitationItemMetadata` is appended per item that carries
     * `itemData`, including items that were skipped as duplicates.
     *
     * NOTE(review): items without an `itemData.id` are keyed
     * `${source}_${index}`, so two such items at the same index in different
     * citations share a key — confirm whether that is intended.
     */
    private static processCslJson(
        jsonStr: string,
        source: string,
        acc: CitationAccumulator,
        metadata?: CitationItemMetadata[],
    ): void {
        const { entries, errors, warnings, seenKeys, cslRawIdToEntryKey } = acc

        let parsed: unknown
        try {
            parsed = JSON.parse(jsonStr)
        } catch {
            warnings.push({
                type: `${source}_invalid_json`,
                value: jsonStr.slice(0, 80),
            })
            return
        }

        const rawItems =
            (parsed as { citationItems?: unknown } | null)?.citationItems ?? []
        if (!Array.isArray(rawItems)) {
            // Valid JSON but not the expected shape; iterating would throw.
            warnings.push({ type: `${source}_invalid_citation_items` })
            return
        }
        const items = rawItems as Array<CslCitationItem | null | undefined>
        if (items.length === 0) return

        const cslRecord: Record<string, CSLEntry> = {}
        // Raw CSL id per item index (undefined when the item has no data),
        // kept so metadata can be attached to the right entry afterwards.
        const itemKeys = items.map((item, i): string | undefined => {
            const itemData = item?.itemData
            if (!itemData) return undefined
            const key =
                itemData.id === undefined
                    ? `${source}_${i}`
                    : String(itemData.id)
            // `seenKeys` holds normalised entry keys, which may differ from
            // the raw id, so the raw-id map is consulted as well; otherwise
            // a repeated citation would be parsed and inserted again.
            if (!seenKeys.has(key) && !cslRawIdToEntryKey.has(key)) {
                cslRecord[key] = itemData
            }
            return key
        })

        if (Object.keys(cslRecord).length > 0) {
            const parser = new CSLParser(cslRecord)
            const bibDB = parser.parse()
            errors.push(...parser.errors)
            warnings.push(...parser.warnings)

            for (const entry of Object.values(bibDB)) {
                seenKeys.add(entry.entry_key)
                entries.push(entry)
            }
            // Merge the raw-id → entry_key map from this parse into the
            // accumulator so future citations can resolve duplicates.
            for (const [rawId, entryKey] of parser.rawIdToEntryKey) {
                cslRawIdToEntryKey.set(rawId, entryKey)
            }
        }

        if (!metadata) return

        items.forEach((item, i) => {
            const rawKey = itemKeys[i]
            if (!rawKey || !item) return

            // Resolve the normalised entry_key via the accumulator map; fall
            // back to the raw key when no mapping was recorded for it.
            const meta: CitationItemMetadata = {
                entry_key: cslRawIdToEntryKey.get(rawKey) ?? rawKey,
            }

            const locator = OdtCitationsParser.nonEmptyText(item.locator)
            if (locator !== undefined) meta.locator = locator
            const label = OdtCitationsParser.nonEmptyText(item.label)
            if (label !== undefined) meta.label = label
            const prefix = OdtCitationsParser.nonEmptyText(item.prefix)
            if (prefix !== undefined) meta.prefix = prefix
            const suffix = OdtCitationsParser.nonEmptyText(item.suffix)
            if (suffix !== undefined) meta.suffix = suffix

            if (item["suppress-author"]) meta.suppressAuthor = true
            if (item["author-only"]) meta.authorOnly = true

            metadata.push(meta)
        })
    }

    /**
     * Stringify a value, or return `undefined` when it is `undefined`,
     * `null` or the empty string.
     */
    private static nonEmptyText(value: unknown): string | undefined {
        if (value === undefined || value === null || value === "") {
            return undefined
        }
        return String(value)
    }

    /**
     * Find every `{… #RecNum …}` placeholder in `text` and feed each
     * `;`-separated segment to `extractEndNotePlaceholderData`.
     */
    private static collectEndNotePlaceholders(
        text: string,
        acc: CitationAccumulator,
    ): void {
        // The `#\d+` guard prevents false positives from other brace-delimited
        // constructs that do not resemble EndNote placeholders.
        const placeholderRe = /\{([^{}]+#\d+[^{}]*)\}/g
        let m: RegExpExecArray | null
        while ((m = placeholderRe.exec(text)) !== null) {
            // Multiple simultaneous citations are separated by ";"
            for (const part of m[1].split(";")) {
                OdtCitationsParser.extractEndNotePlaceholderData(
                    part.trim(),
                    acc,
                )
            }
        }
    }

    /**
     * Parse a single EndNote placeholder segment.
     *
     * A segment of the form `Author, Year #RecNum` yields a stub `misc` entry
     * keyed `EN{RecNum}` with `author` and `date` fields. Segments that do
     * not match, and record numbers already seen, are skipped.
     */
    private static extractEndNotePlaceholderData(
        segment: string,
        acc: CitationAccumulator,
    ): void {
        const { entries, seenKeys } = acc
        const m = /^(.*?)[,\s]+(\d{4})\s+#(\d+)/.exec(segment.trim())
        if (!m) return

        const authorPart = m[1].trim()
        const year = m[2]
        const key = `EN${m[3]}`
        if (seenKeys.has(key)) return
        seenKeys.add(key)

        const fields: Record<string, unknown> = {}
        if (authorPart) {
            const nameObj: {
                family?: NodeArray
                given?: NodeArray
                literal?: NodeArray
            } = {}
            if (authorPart.includes(",")) {
                // "Family, Given" — anything after a second comma is ignored.
                const parts = authorPart.split(",").map((p) => p.trim())
                nameObj.family = [{ type: "text", text: parts[0] }]
                if (parts[1]) nameObj.given = [{ type: "text", text: parts[1] }]
            } else {
                nameObj.literal = [{ type: "text", text: authorPart }]
            }
            fields.author = [nameObj]
        }
        if (year) fields.date = year

        entries.push({
            entry_key: key,
            bib_type: "misc",
            fields,
        })
    }

    /**
     * Unescape XML entities.
     *
     * Decodes the five predefined entities (`&lt;`, `&gt;`, `&amp;`,
     * `&quot;`, `&apos;`) and numeric character references (`&#34;`,
     * `&#x22;`). The replacement is a single pass, so text produced by one
     * substitution is never decoded a second time (`&amp;quot;` becomes
     * `&quot;`, not `"`). Unknown or out-of-range references are left as is.
     */
    private static unescapeXmlEntitiesStatic(text: string): string {
        return text.replace(
            /&(lt|gt|amp|quot|apos|#\d+|#x[0-9a-fA-F]+);/g,
            (match: string, body: string): string => {
                if (!body.startsWith("#")) {
                    return XML_NAMED_ENTITIES[body] ?? match
                }
                const codePoint =
                    body[1] === "x"
                        ? Number.parseInt(body.slice(2), 16)
                        : Number.parseInt(body.slice(1), 10)
                // String.fromCodePoint throws outside the Unicode range.
                return codePoint > 0 && codePoint <= 0x10ffff
                    ? String.fromCodePoint(codePoint)
                    : match
            },
        )
    }

    // -----------------------------------------------------------------------
    // Instance API
    // -----------------------------------------------------------------------

    /**
     * Run all extraction steps over the XML given to the constructor and
     * return the accumulated entries, errors and warnings.
     */
    parse(): OdtCitationsParseResult {
        // 1. LibreOffice-native <text:bibliography-mark> elements
        this.parseLibreOfficeBibMarks()

        // 2. Named reference marks (Zotero, Mendeley legacy, JabRef)
        this.parseReferenceMarks()

        // 3. EndNote plain-text placeholders {Author, Year #RecNum}
        this.parseEndNotePlaceholders()

        return {
            entries: OdtCitationsParser.toBibDB(this.entries),
            errors: this.errors,
            warnings: this.warnings,
        }
    }

    /** View of this instance's state as an accumulator (shares the same objects). */
    private accumulator(): CitationAccumulator {
        return {
            entries: this.entries,
            errors: this.errors,
            warnings: this.warnings,
            seenKeys: this.seenKeys,
            cslRawIdToEntryKey: this.cslRawIdToEntryKey,
        }
    }

    // -----------------------------------------------------------------------
    // Step 1 — LibreOffice native bibliography marks
    // -----------------------------------------------------------------------

    /**
     * Delegates to OdtNativeParser, passing `seenKeys` so that the native
     * parser can skip identifiers already seen by the other steps (and vice
     * versa — the set is mutated in place).
     */
    private parseLibreOfficeBibMarks(): void {
        const nativeParser = new OdtNativeParser(this.contentXml)
        const { entries, warnings } = nativeParser.parse(this.seenKeys)
        this.entries.push(...entries)
        this.warnings.push(...warnings)
    }

    // -----------------------------------------------------------------------
    // Step 2 — reference marks (Zotero, Mendeley legacy, JabRef)
    // -----------------------------------------------------------------------

    /**
     * Scan for `text:reference-mark-start` elements and extract citation data
     * from every mark whose name matches a known format. The extraction
     * helpers are called directly so that no intermediate BibDB is built per
     * mark.
     */
    private parseReferenceMarks(): void {
        // Only the start tags are inspected; the matching end tags carry no
        // data needed for extraction.
        const markRe = /<text:reference-mark-start[^>]+text:name="([^"]+)"/g
        const acc = this.accumulator()
        let m: RegExpExecArray | null
        while ((m = markRe.exec(this.contentXml)) !== null) {
            const name = OdtCitationsParser.unescapeXmlEntitiesStatic(m[1])
            const format = OdtCitationsParser.detectReferenceMarkFormat(name)
            if (format) {
                OdtCitationsParser.extractReferenceMarkData(name, format, acc)
            }
        }
    }

    // -----------------------------------------------------------------------
    // Step 3 — EndNote plain-text placeholders
    // -----------------------------------------------------------------------

    /**
     * EndNote does not use live reference marks in ODT files. Instead it
     * leaves temporary citation placeholders directly in the document body:
     *
     *   {Smith, 2023 #291}
     *   {Smith, 2023 #291; Jones, 2019 #47}
     *
     * This method scans the raw XML text for these patterns. Because the
     * placeholder contains no full bibliographic record — only author name,
     * year, and EndNote record number — the emitted entry is a stub `misc`
     * entry keyed by `EN{RecNum}`.
     */
    private parseEndNotePlaceholders(): void {
        OdtCitationsParser.collectEndNotePlaceholders(
            this.contentXml,
            this.accumulator(),
        )
    }
}

// ---------------------------------------------------------------------------
// Convenience function
// ---------------------------------------------------------------------------

/** Parse the given content.xml text with a fresh `OdtCitationsParser`. */
export function parseOdtCitations(contentXml: string): OdtCitationsParseResult {
    return new OdtCitationsParser(contentXml).parse()
}