biblatex-csl-converter
Version:
Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity
1,329 lines (1,212 loc) • 57 kB
text/typescript
/**
* DOCX Citations importer
*
* Extracts bibliographic citations from the XML of a DOCX file's
* word/document.xml (or any XML fragment containing the same markup).
*
* Supported citation manager formats and how each is handled:
*
* - Word native `CITATION key \l locale` inline field +
* `customXml/item1.xml` sources (passed as
* `sourcesXml` option). Delegated to
* DocxNativeParser in docx-native.ts.
*
* - Zotero `ADDIN ZOTERO_ITEM CSL_CITATION {json}` inline
* field. The embedded CSL-JSON `citationItems`
* array is reshaped into a Record<string, CSLEntry>
* and fed to CSLParser.
*
* - Mendeley Cite v3 (current) — Current add-in: `<w:sdt>` with the citation
* JSON base64-encoded in `w:tag w:val=
* "MENDELEY_CITATION_v3_{base64}"`. Decoded payload
* is CSL-JSON; handled identically to Zotero.
*
* - Mendeley Desktop (legacy) — Legacy add-in: `ADDIN CSL_CITATION {json}`
* inline field. Same CSL-JSON shape; handled identically
* to Zotero.
*
* - EndNote `ADDIN EN.CITE <EndNote>…</EndNote>` — XML is
* either entity-escaped inline or base64-encoded in
* `<w:fldData>`. The `<record>` subtree is
* converted to an EndNoteRecord object and passed to
* EndNoteParser.
*
* - Citavi `<w:sdt>` wrapping `ADDIN CitaviPlaceholder
* {base64}`. The base64-decoded JSON contains a
* WordPlaceholder with `Entries[].ReferenceId`
* UUIDs. Two sub-cases:
*
* A. If the payload already embeds `Reference`
* objects (some Citavi export modes), they are
* passed directly to CitaviParser.
*
* B. In older or incomplete formats, only UUIDs may
* be present without embedded references. Such
* citations cannot be fully resolved and will
* generate warnings.
*
* Usage:
* const parser = new DocxCitationsParser(documentXml, {
* sourcesXml, // contents of customXml/item1.xml (Word-native)
* })
* const result = parser.parse()
* // result.entries → BibDB (Record<number, EntryObject>)
* // result.errors → ErrorObject[]
* // result.warnings → ErrorObject[]
*
* The `sourcesXml` option must be the contents of `customXml/item1.xml` from
* the DOCX ZIP when Word-native citations are present.
*
* Citavi citations embed complete bibliographic data directly in each citation
* field, so no external Citavi project file is required.
*/
import type { EntryObject } from "../const"
import { type CitaviInput, CitaviParser } from "./citavi"
import { type CSLEntry, CSLParser } from "./csl"
import { DocxNativeParser } from "./docx-native"
import { EndNoteParser, type EndNoteRecord } from "./endnote"
import { extractJsonObject } from "./tools"
// ---------------------------------------------------------------------------
// Public result types
// ---------------------------------------------------------------------------
/**
 * Result shape described in the file header's usage example for
 * `new DocxCitationsParser(...).parse()`.
 */
export interface DocxCitationsParseResult {
/** Imported entries as a BibDB, keyed by number. */
entries: Record<number, EntryObject>
/** Errors collected while parsing. */
errors: ErrorObject[]
/** Non-fatal problems collected while parsing (e.g. undecodable payloads). */
warnings: ErrorObject[]
}
// ---------------------------------------------------------------------------
// Citation accumulator — shared mutable state for multi-element processing
// ---------------------------------------------------------------------------
/**
* Mutable accumulator passed to static extraction methods when processing
* multiple document elements in a single pass. All five fields are mutated
* in place as entries are discovered and keys are deduplicated.
*/
export interface CitationAccumulator {
/** Entries imported so far, in discovery order. */
entries: EntryObject[]
/** Errors reported by the format-specific parsers. */
errors: ErrorObject[]
/** Warnings reported by the extraction methods and the format-specific parsers. */
warnings: ErrorObject[]
/**
 * Keys already handled. The extraction code in this file adds CSL and Citavi
 * `entry_key` values, raw EndNote rec-numbers and Word-native `CITATION` keys
 * to this one set.
 */
seenKeys: Set<string>
/**
* Persistent map from raw CSL `id` strings to the normalised `entry_key`
* values assigned by `CSLParser`. Accumulated across all citation elements
* processed with the same accumulator so that duplicate items (already in
* `seenKeys`) can still have their metadata resolved to the correct key.
*/
cslRawIdToEntryKey: Map<string, string>
}
/** A single error or warning record collected during import. */
interface ErrorObject {
/** Machine-readable identifier, e.g. `"citavi_missing_payload"` or `"endnote_no_xml"`. */
type: string
/** NOTE(review): not set by the code visible here — presumably the affected field name; confirm against the format parsers. */
field?: string
/** Offending value; the extraction code here stores a tag value or a truncated payload. */
value?: unknown
/** NOTE(review): not set by the code visible here — presumably the affected entry key; confirm against the format parsers. */
entry?: string
}
// ---------------------------------------------------------------------------
// Citation item metadata
// ---------------------------------------------------------------------------
/**
* Per-entry citation metadata, keyed by `entry_key`.
*
* This captures the cite-specific decorations that surround a bibliographic
* reference inside a single citation: page locators, textual prefixes /
* suffixes, and author-rendering flags. It is returned alongside the
* `entries` BibDB when `retrieveMetadata` is `true` on a static method call.
*
* Field availability by format:
*
* | Field | Zotero | Mendeley | EndNote | Citavi |
* |------------------|--------|----------|----------------------|-------------------------------------|
* | locator | ✅ | ✅ | ✅ (Pages) | ✅ (PageRange.OriginalString) |
* | label | ✅ | ✅ | – | ✅ (derived from PageRange.NumberingType; mapping inferred from manual) |
* | prefix | ✅ | ✅ | ✅ | ✅ |
* | suffix | ✅ | ✅ | ✅ | ✅ (confirmed by manual; not yet seen in real files) |
* | suppressAuthor | ✅ | ✅ | – | – |
* | authorOnly | ✅ | ✅ | – | – |
* | authorYear | – | – | ✅ (AuthorYear attr) | – |
* | bibliographyEntry| – | – | – | ✅ (confirmed by manual; not yet seen in real files) |
* | ruleSet | – | – | – | ✅ (confirmed by manual; serialised form unknown) |
* | formatOption | – | – | – | ✅ (confirmed by manual; serialised form unknown) |
* | insertAs | – | – | – | ✅ (confirmed by manual; serialised form unknown) |
*/
export interface CitationItemMetadata {
/** The `entry_key` of the corresponding entry in the returned `entries` BibDB. */
entry_key: string
/**
* Pinpoint location within the cited work (page number, chapter, etc.).
* For CSL formats this is the raw `locator` string; for EndNote it is the
* `<Pages>` element text; for Citavi it is `PageRange.OriginalString`.
*/
locator?: string
/**
* CSL locator type label (e.g. `"page"`, `"chapter"`, `"section"`).
* For CSL-based formats (Zotero, Mendeley) this is the raw `label` string
* from the citation item. For Citavi it is derived from `PageRange.NumberingType`:
* `1` (Columns) → `"column"`, `2` (Section numbers) → `"section"`,
* `3` (Margin numbers) → `"note"`, `4` (Other / free-form) → `"custom"`.
* NOTE(review): the Citavi extraction code in this file leaves `label` unset
* for `0` (Pages), for any unmapped value, and when
* `UseNumberingTypeOfParentDocument` is truthy — it does not emit `"page"`.
* The integer-to-label mapping for Citavi is inferred from the Citavi manual
* and has not been confirmed against observed data beyond value `0`.
*/
label?: string
/** Text to prepend to the formatted citation (e.g. `"see "`, `"cf. "`). */
prefix?: string
/** Text to append to the formatted citation (e.g. `", etc."`). */
suffix?: string
/**
* When `true`, author names are suppressed in the formatted output,
* leaving only the year (and locator) in parentheses: `(2020, p. 45)`.
* Only populated for CSL-based formats (Zotero, Mendeley).
*/
suppressAuthor?: boolean
/**
* When `true`, only the author name is rendered with nothing else:
* `William T. Williams`.
* Only populated for CSL-based formats (Zotero, Mendeley).
*/
authorOnly?: boolean
/**
* When `true`, the author name is rendered outside the parentheses while
* the year (and locator) remain inside: `William T. Williams (2020, p. 45)`.
* This reflects the `AuthorYear="1"` attribute on EndNote's `<Cite>` element.
* Only populated for EndNote citations.
*/
authorYear?: boolean
/**
* Controls whether and where this reference appears in the bibliography.
* Only populated for Citavi citations (from `Entries[].BibliographyEntry`).
*
* Known values:
* `"/bibonly"` – reference appears only in the bibliography, not in-text
* `"/nobib"` – reference appears only in-text, not in the bibliography
*
* When absent the reference appears in both (default behaviour).
* Confirmed by the Citavi manual; not yet observed in real files.
*/
bibliographyEntry?: string
/**
* Overrides which citation-style rule set (formatting variant) is used for
* this entry. Only populated for Citavi citations (from `Entries[].RuleSet`).
* Serialised form not yet observed in real files.
*/
ruleSet?: unknown
/**
* Selects among the citation style's optional formatting variants for this
* entry (values 1, 2, or 3). Only populated for Citavi citations (from
* `Entries[].FormatOption`). Serialised form not yet observed in real files.
*/
formatOption?: unknown
/**
* Overrides where the citation is physically inserted (in-text vs. footnote).
* Only populated for Citavi citations (from `Entries[].InsertAs`).
* Serialised form not yet observed in real files.
*/
insertAs?: unknown
}
// ---------------------------------------------------------------------------
// Static utility result types
// ---------------------------------------------------------------------------
/**
 * Result of the static `sdtCitation` / `fieldCitation` helpers.
 *
 * When `isCitation` is `false` no other field is set. When the helper is
 * called with `retrieve = false`, only `isCitation` and `format` are set.
 */
export interface CitationResult {
/** Whether the inspected element was recognised as a citation. */
isCitation: boolean
format?: string // e.g., "zotero", "mendeley_v3", "endnote", "citavi", "word_native"
/** Entries held by the accumulator after the call, keyed from 1. */
entries?: Record<number, EntryObject>
/** The accumulator's error list. */
errors?: ErrorObject[]
/** The accumulator's warning list. */
warnings?: ErrorObject[]
/**
* Per-entry citation metadata (locators, prefixes, suffixes, flags).
* Only populated when `retrieveMetadata` is `true` on the static method call.
*/
metadata?: CitationItemMetadata[]
}
/** Result of the static `sdtBibliography` / `fieldBibliography` helpers. */
export interface BibliographyResult {
/** Whether the inspected element was recognised as a bibliography region. */
isBibliography: boolean
/**
 * Detected citation manager; only set when `isBibliography` is `true`.
 * The helpers in this file emit `"mendeley_v3"`, `"zotero"`, `"endnote"`
 * or `"word_native"`.
 */
format?: string
}
// ---------------------------------------------------------------------------
// Options
// ---------------------------------------------------------------------------
/** Options accepted by `DocxCitationsParser` and its static `fieldCitation` helper. */
export interface DocxCitationsParserOptions {
/**
* Contents of `customXml/item1.xml` from the DOCX ZIP, using the MS
* Office Bibliography XML namespace. Required to resolve Word-native
* `CITATION` keys into full bibliographic data.
*/
sourcesXml?: string
}
// ---------------------------------------------------------------------------
// Parser class
// ---------------------------------------------------------------------------
export class DocxCitationsParser {
/** XML handed to the constructor (per the file header: `word/document.xml` or a fragment with the same markup). */
private documentXml: string
/** Parser options as passed to the constructor (defaults to `{}`). */
private options: DocxCitationsParserOptions
/** Entries collected by this instance; starts empty. */
entries: EntryObject[]
/** Errors collected by this instance; starts empty. */
errors: ErrorObject[]
/** Warnings collected by this instance; starts empty. */
warnings: ErrorObject[]
/** Entry keys already added — prevents duplicates across multiple fields. */
private seenKeys: Set<string>
/** Persistent raw CSL id → normalised entry_key map for the instance parse. */
private cslRawIdToEntryKey: Map<string, string>
/**
 * Store the inputs and initialise empty result and deduplication state.
 * No parsing happens here.
 *
 * @param documentXml - XML to scan for citations
 * @param options - Optional settings such as `sourcesXml`
 */
constructor(documentXml: string, options: DocxCitationsParserOptions = {}) {
this.documentXml = documentXml
this.options = options
this.entries = []
this.errors = []
this.warnings = []
this.seenKeys = new Set()
this.cslRawIdToEntryKey = new Map()
}
// -----------------------------------------------------------------------
// Static utility methods for reusable citation detection and extraction
// -----------------------------------------------------------------------
/**
 * Detect whether an SDT block is a citation and, optionally, extract its
 * data (without full document parsing).
 *
 * The format is derived from the `w:val` of the first `<w:tag>` in the block:
 * a `MENDELEY_CITATION_v3_` prefix means `"mendeley_v3"`, a
 * `CitaviPlaceholder#` prefix means `"citavi"`. Anything else is reported as
 * not being a citation.
 *
 * @param sdtXml - XML string of a single <w:sdt>...</w:sdt> block
 * @param retrieve - When `false`, only `isCitation` and `format` are
 * returned and nothing is extracted.
 * @param retrieveMetadata - When `true`, per-item citation metadata found in
 * this block is returned in `metadata`.
 * @param acc - Accumulator that is mutated in place. Pass the same object for
 * several blocks to deduplicate across them; by default a fresh one is
 * created for the call.
 * @returns CitationResult. `entries`, `errors` and `warnings` reflect the
 * whole accumulator (not only this block); `entries` is keyed from 1.
 */
static sdtCitation(
    sdtXml: string,
    retrieve = true,
    retrieveMetadata = false,
    acc: CitationAccumulator = {
        entries: [],
        errors: [],
        warnings: [],
        seenKeys: new Set<string>(),
        cslRawIdToEntryKey: new Map<string, string>(),
    },
): CitationResult {
    const { entries, errors, warnings } = acc
    const tagMatch = sdtXml.match(/<w:tag\s+w:val="([^"]*)"/)
    if (!tagMatch) return { isCitation: false }
    const tagVal = tagMatch[1]
    let format: string | undefined
    if (tagVal.startsWith("MENDELEY_CITATION_v3_")) {
        format = "mendeley_v3"
    } else if (tagVal.startsWith("CitaviPlaceholder#")) {
        format = "citavi"
    }
    if (!format) {
        return { isCitation: false }
    }
    if (!retrieve) {
        return { isCitation: true, format }
    }
    // Extract citation data
    const metadata: CitationItemMetadata[] = []
    if (format === "mendeley_v3") {
        const b64 = tagVal.slice("MENDELEY_CITATION_v3_".length)
        // The other callers of decodeBase64Static guard it with try/catch;
        // do the same here so a corrupt tag yields a warning, not an exception.
        let decoded: string | null = null
        try {
            decoded = DocxCitationsParser.decodeBase64Static(b64)
        } catch {
            warnings.push({
                type: "mendeley_v3_invalid_payload",
                value: b64.slice(0, 40),
            })
        }
        if (decoded !== null) {
            DocxCitationsParser.extractCslJsonData(
                decoded,
                "mendeley_v3",
                acc,
                retrieveMetadata ? metadata : undefined,
            )
        }
    } else if (format === "citavi") {
        // Word may split long field codes across multiple <w:instrText> runs.
        // Always concatenate every instrText fragment inside <w:sdtContent>
        // before searching for the "ADDIN CitaviPlaceholder" prefix so that
        // we assemble the complete Base64 payload regardless of how many runs
        // were used.
        let b64: string | null = null
        const sdtContentMatch = sdtXml.match(
            /<w:sdtContent\b[^>]*>([\s\S]*?)<\/w:sdtContent>/,
        )
        if (sdtContentMatch) {
            const sdtContent = sdtContentMatch[1]
            const allInstrTexts: string[] = []
            const fragRe = /<w:instrText[^>]*>([\s\S]*?)<\/w:instrText>/g
            let frag: RegExpExecArray | null
            while ((frag = fragRe.exec(sdtContent)) !== null) {
                allInstrTexts.push(frag[1])
            }
            const combined = allInstrTexts.join("")
            const prefixUpper = "ADDIN CITAVIPLACEHOLDER"
            const prefixIdx = combined.toUpperCase().indexOf(prefixUpper)
            if (prefixIdx !== -1) {
                // Drop all whitespace first, then the optional surrounding braces.
                b64 = combined
                    .slice(prefixIdx + prefixUpper.length)
                    .replace(/\s/g, "")
                    .replace(/^\{/, "")
                    .replace(/\}$/, "")
            }
        }
        if (b64 !== null && b64.length > 0) {
            DocxCitationsParser.extractCitaviData(
                b64,
                acc,
                retrieveMetadata ? metadata : undefined,
            )
        } else {
            warnings.push({
                type: "citavi_missing_payload",
                value: tagVal,
            })
        }
    }
    // Re-key the accumulated entries as a 1-based BibDB.
    const bibDB: Record<number, EntryObject> = {}
    entries.forEach((entry, i) => {
        bibDB[i + 1] = entry
    })
    const result: CitationResult = {
        isCitation: true,
        format,
        entries: bibDB,
        errors,
        warnings,
    }
    if (retrieveMetadata) result.metadata = metadata
    return result
}
/**
 * Report whether an SDT block is a bibliography rendering region.
 *
 * Only the `w:val` of the first `<w:tag>` is inspected; a value starting with
 * `MENDELEY_BIBLIOGRAPHY_v3_` marks a Mendeley Cite bibliography. Nothing is
 * extracted from the block.
 *
 * @param sdtXml - XML string of a single <w:sdt>...</w:sdt> block
 * @returns BibliographyResult; `format` is set only when a bibliography was
 * recognised
 */
static sdtBibliography(sdtXml: string): BibliographyResult {
    const tag = /<w:tag\s+w:val="([^"]*)"/.exec(sdtXml)
    if (tag === null || !tag[1].startsWith("MENDELEY_BIBLIOGRAPHY_v3_")) {
        return { isBibliography: false }
    }
    return { isBibliography: true, format: "mendeley_v3" }
}
/**
 * Check or extract citation data from a field instruction.
 *
 * The format is chosen from the (trimmed, case-insensitive) start of the
 * instruction: `ADDIN ZOTERO_ITEM` → `"zotero"`, `ADDIN CSL_CITATION` /
 * `CSL_CITATION` → `"mendeley_legacy"`, `ADDIN EN.CITE` → `"endnote"`,
 * `ADDIN CITAVIPLACEHOLDER` → `"citavi"`, `CITATION ` → `"word_native"`.
 *
 * @param instrText - The concatenated instruction text from w:instrText elements
 * @param retrieve - If true, extract and return full citation data; if false,
 * only `isCitation` and `format` are returned
 * @param retrieveMetadata - If true, per-item citation metadata found in this
 * field is returned in `metadata`
 * @param extractWordNative - If false, a Word-native `CITATION` field only has
 * its key recorded in `acc.seenKeys`; no entry is resolved from `sourcesXml`
 * @param fldData - Optional field data (for EndNote base64 payloads)
 * @param options - Optional parser options (e.g., sourcesXml for Word native)
 * @param acc - Accumulator that is mutated in place; a fresh one is created
 * when omitted
 * @returns CitationResult with format and optionally entries/errors/warnings.
 * `entries`, `errors` and `warnings` reflect the whole accumulator.
 */
static fieldCitation(
    instrText: string,
    retrieve = true,
    retrieveMetadata = false,
    extractWordNative = true,
    fldData?: string,
    options: DocxCitationsParserOptions = {},
    acc: CitationAccumulator = {
        entries: [],
        errors: [],
        warnings: [],
        seenKeys: new Set<string>(),
        cslRawIdToEntryKey: new Map<string, string>(),
    },
): CitationResult {
    const { entries, errors, warnings, seenKeys } = acc
    const upper = instrText.trim().toUpperCase()
    let format: string | undefined
    if (upper.startsWith("ADDIN ZOTERO_ITEM")) {
        format = "zotero"
    } else if (
        upper.startsWith("ADDIN CSL_CITATION") ||
        upper.startsWith("CSL_CITATION")
    ) {
        format = "mendeley_legacy"
    } else if (upper.startsWith("ADDIN EN.CITE")) {
        format = "endnote"
    } else if (upper.startsWith("ADDIN CITAVIPLACEHOLDER")) {
        format = "citavi"
    } else if (upper.startsWith("CITATION ")) {
        format = "word_native"
    }
    if (!format) {
        return { isCitation: false }
    }
    if (!retrieve) {
        return { isCitation: true, format }
    }
    // Extract citation data
    const metadata: CitationItemMetadata[] = []
    if (format === "zotero" || format === "mendeley_legacy") {
        const jsonStart = instrText.indexOf("{")
        const jsonStr =
            jsonStart === -1 ? null : extractJsonObject(instrText, jsonStart)
        if (jsonStr === null) {
            warnings.push({ type: `${format}_missing_json` })
        } else {
            DocxCitationsParser.extractCslJsonData(
                jsonStr,
                format,
                acc,
                retrieveMetadata ? metadata : undefined,
            )
        }
    } else if (format === "endnote") {
        DocxCitationsParser.extractEndNoteData(
            instrText,
            fldData,
            acc,
            retrieveMetadata ? metadata : undefined,
        )
    } else if (format === "citavi") {
        // `\s*` before the optional brace: without it, "… CitaviPlaceholder {b64}"
        // captured only the whitespace and the payload was lost.
        const b64Match = instrText.match(
            /ADDIN CitaviPlaceholder\s*\{?([A-Za-z0-9+/=\s]+)\}?/i,
        )
        const b64 = b64Match ? b64Match[1].replace(/\s/g, "") : ""
        if (b64.length > 0) {
            DocxCitationsParser.extractCitaviData(
                b64,
                acc,
                retrieveMetadata ? metadata : undefined,
            )
        } else {
            // Same warning type as the SDT code path uses for this situation.
            warnings.push({
                type: "citavi_missing_payload",
                value: instrText.trim().slice(0, 80),
            })
        }
    } else if (format === "word_native") {
        // Record the key for later resolution
        const m = /^CITATION\s+(\S+)/i.exec(instrText.trim())
        if (m) {
            seenKeys.add(m[1])
        }
        // Only extract data if sourcesXml is provided AND extractWordNative is true.
        // When called from instance methods, extractWordNative is false and we
        // should only record keys - extraction happens later in parseSourcesXml.
        if (options.sourcesXml && extractWordNative) {
            DocxCitationsParser.extractWordNativeData(
                instrText,
                options.sourcesXml,
                acc,
            )
        }
    }
    // Re-key the accumulated entries as a 1-based BibDB.
    const bibDB: Record<number, EntryObject> = {}
    entries.forEach((entry, i) => {
        bibDB[i + 1] = entry
    })
    const result: CitationResult = {
        isCitation: true,
        format,
        entries: bibDB,
        errors,
        warnings,
    }
    if (retrieveMetadata) result.metadata = metadata
    return result
}
/**
 * Report whether a field instruction marks a bibliography rendering region.
 *
 * The instruction is trimmed and upper-cased, then matched by prefix:
 * `ADDIN ZOTERO_BIBL` → `"zotero"`, `ADDIN EN.REFLIST` → `"endnote"`,
 * `BIBLIOGRAPHY` → `"word_native"`. Nothing is extracted.
 *
 * @param instrText - The concatenated instruction text
 * @returns BibliographyResult; `format` is set only when a bibliography was
 * recognised
 */
static fieldBibliography(instrText: string): BibliographyResult {
    const normalised = instrText.trim().toUpperCase()
    // Checked in order; the first matching prefix wins.
    const knownPrefixes: Array<[string, string]> = [
        ["ADDIN ZOTERO_BIBL", "zotero"],
        ["ADDIN EN.REFLIST", "endnote"],
        ["BIBLIOGRAPHY", "word_native"],
    ]
    for (const [prefix, format] of knownPrefixes) {
        if (normalised.startsWith(prefix)) {
            return { isBibliography: true, format }
        }
    }
    return { isBibliography: false }
}
// -----------------------------------------------------------------------
// Static helper methods for extraction logic
// -----------------------------------------------------------------------
/**
 * Extract CSL citation JSON data.
 *
 * Parses `jsonStr`, feeds every not-yet-seen `citationItems[].itemData` to
 * `CSLParser`, appends the resulting entries to the accumulator and, when
 * `metadata` is supplied, pushes one metadata record per item that carries
 * `itemData`.
 *
 * @param jsonStr - JSON text of a CSL citation object
 * @param source - Format name; used as prefix for warning types and for
 * synthetic keys of items that have no `id`
 * @param acc - Accumulator mutated in place
 * @param metadata - Optional output array for per-item citation metadata
 */
private static extractCslJsonData(
    jsonStr: string,
    source: string,
    acc: CitationAccumulator,
    metadata?: CitationItemMetadata[],
): void {
    const { entries, errors, warnings, seenKeys } = acc
    type CitationItem = {
        itemData?: CSLEntry
        id?: unknown
        locator?: unknown
        label?: unknown
        prefix?: unknown
        suffix?: unknown
        "suppress-author"?: unknown
        "author-only"?: unknown
    }
    let parsed: unknown
    try {
        parsed = JSON.parse(jsonStr)
    } catch {
        warnings.push({
            type: `${source}_invalid_json`,
            value: jsonStr.slice(0, 80),
        })
        return
    }
    // JSON.parse happily returns null or a primitive; property access on
    // those would throw, so report them like any other unusable JSON.
    if (typeof parsed !== "object" || parsed === null) {
        warnings.push({
            type: `${source}_invalid_json`,
            value: jsonStr.slice(0, 80),
        })
        return
    }
    const rawItems = (parsed as { citationItems?: unknown }).citationItems
    if (!Array.isArray(rawItems) || rawItems.length === 0) return
    const items = rawItems as Array<CitationItem | null | undefined>
    const cslRecord: Record<string, CSLEntry> = {}
    // Track the raw CSL id for each item index so we can attach metadata later
    const itemKeys: Array<string | undefined> = []
    items.forEach((item, i) => {
        const itemData = item?.itemData
        if (typeof itemData !== "object" || itemData === null) {
            itemKeys.push(undefined)
            return
        }
        const hasOwnId = itemData.id !== undefined
        const key = hasOwnId ? String(itemData.id) : `${source}_${i}`
        // Always record the raw key; skip adding to cslRecord if already seen
        itemKeys.push(key)
        if (seenKeys.has(key)) return
        // seenKeys stores normalised entry keys, which may differ from the raw
        // id. The persistent raw-id map also identifies an item imported by an
        // earlier citation. Synthetic `${source}_${i}` keys are excluded: they
        // are positional and do not identify the same item across citations.
        if (hasOwnId && acc.cslRawIdToEntryKey.has(key)) return
        cslRecord[key] = itemData
    })
    if (Object.keys(cslRecord).length > 0) {
        const parser = new CSLParser(cslRecord)
        const bibDB = parser.parse()
        errors.push(...parser.errors)
        warnings.push(...parser.warnings)
        for (const entry of Object.values(bibDB)) {
            seenKeys.add(entry.entry_key)
            entries.push(entry)
        }
        // Merge the authoritative raw-id → entry_key map from this parse
        // into the accumulator so future citations can resolve duplicates.
        for (const [rawId, entryKey] of parser.rawIdToEntryKey) {
            acc.cslRawIdToEntryKey.set(rawId, entryKey)
        }
    }
    if (!metadata) return
    // A decoration counts as present unless it is undefined, null or "".
    const isPresent = (value: unknown): boolean =>
        value !== undefined && value !== null && value !== ""
    items.forEach((item, i) => {
        const rawKey = itemKeys[i]
        if (!item || !rawKey) return
        // Resolve normalised entry_key via the persistent accumulator map;
        // fall back to rawKey only if the entry was never successfully parsed.
        const entry_key = acc.cslRawIdToEntryKey.get(rawKey) ?? rawKey
        const meta: CitationItemMetadata = { entry_key }
        if (isPresent(item.locator)) meta.locator = String(item.locator)
        if (isPresent(item.label)) meta.label = String(item.label)
        if (isPresent(item.prefix)) meta.prefix = String(item.prefix)
        if (isPresent(item.suffix)) meta.suffix = String(item.suffix)
        if (item["suppress-author"]) meta.suppressAuthor = true
        if (item["author-only"]) meta.authorOnly = true
        metadata.push(meta)
    })
}
/**
 * Locate the EndNote XML payload of a citation field and hand it to
 * `parseEndNoteXml`.
 *
 * A non-empty `fldData` takes precedence and is base64-decoded; otherwise the
 * text following `ADDIN EN.CITE` in the instruction is entity-unescaped.
 * Payloads that contain neither `<EndNote` nor `<record` only produce an
 * `endnote_no_xml` warning.
 *
 * @param instrText - Field instruction text
 * @param fldData - Optional base64 field data
 * @param acc - Accumulator mutated in place
 * @param metadata - Optional output array for per-cite metadata
 */
private static extractEndNoteData(
    instrText: string,
    fldData: string | undefined,
    acc: CitationAccumulator,
    metadata?: CitationItemMetadata[],
): void {
    const marker = "ADDIN EN.CITE"
    let payload: string
    if (fldData) {
        try {
            payload = DocxCitationsParser.decodeBase64Static(fldData)
        } catch {
            acc.warnings.push({
                type: "endnote_invalid_flddata",
                value: fldData.slice(0, 40),
            })
            return
        }
    } else {
        const markerAt = instrText.toUpperCase().indexOf(marker)
        if (markerAt === -1) return
        const inlineXml = instrText.slice(markerAt + marker.length).trim()
        payload = DocxCitationsParser.unescapeXmlEntitiesStatic(inlineXml)
    }
    const looksLikeEndNote =
        payload.includes("<EndNote") || payload.includes("<record")
    if (!looksLikeEndNote) {
        acc.warnings.push({
            type: "endnote_no_xml",
            value: payload.slice(0, 80),
        })
        return
    }
    DocxCitationsParser.parseEndNoteXml(payload, acc, metadata)
}
/**
 * Extract Citavi citation data from base64-encoded WordPlaceholder JSON.
 *
 * Citavi embeds complete bibliographic data directly in each citation via
 * `Reference` objects within the `Entries` array. This method checks for
 * embedded references and converts them via CitaviParser. If no embedded
 * references are found (only UUIDs), a warning is generated.
 *
 * @param b64 - Base64 text of the placeholder JSON (whitespace already removed
 * by the callers in this file)
 * @param acc - Accumulator mutated in place
 * @param metadata - Optional output array; receives one record per entry that
 * has a `ReferenceId`
 */
private static extractCitaviData(
    b64: string,
    acc: CitationAccumulator,
    metadata?: CitationItemMetadata[],
): void {
    const { entries, errors, warnings, seenKeys } = acc
    let decodedPayload: unknown
    try {
        decodedPayload = JSON.parse(
            DocxCitationsParser.decodeBase64Static(b64),
        )
    } catch {
        warnings.push({
            type: "citavi_invalid_payload",
            value: b64.slice(0, 40),
        })
        return
    }
    // Valid JSON that is null or a primitive cannot be a placeholder object;
    // reading `.Entries` from null would throw.
    if (typeof decodedPayload !== "object" || decodedPayload === null) {
        warnings.push({
            type: "citavi_invalid_payload",
            value: b64.slice(0, 40),
        })
        return
    }
    const payload = decodedPayload as CitaviInput
    // Check if the payload has embedded references
    const rawEntries = Array.isArray(payload)
        ? undefined
        : (payload as { Entries?: unknown }).Entries
    const placeholderEntries: Array<
        import("./citavi").CitaviEntry | null | undefined
    > = Array.isArray(rawEntries) ? rawEntries : []
    const hasEmbeddedReferences = placeholderEntries.some(
        (e) =>
            e !== null &&
            e !== undefined &&
            e.Reference !== null &&
            e.Reference !== undefined,
    )
    if (!hasEmbeddedReferences) {
        warnings.push({
            type: "citavi_missing_embedded_references",
            value: b64.slice(0, 40),
        })
        return
    }
    const parser = new CitaviParser(payload)
    const bibDB = parser.parse()
    errors.push(...parser.errors)
    warnings.push(...parser.warnings)
    for (const entry of Object.values(bibDB)) {
        if (!seenKeys.has(entry.entry_key)) {
            seenKeys.add(entry.entry_key)
            entries.push(entry)
        }
    }
    if (!metadata) return
    // CSL-style locator labels by NumberingType.
    // The integer-to-type mapping is inferred from the Citavi manual's
    // prose (types listed in order) and has NOT been confirmed against
    // observed data beyond value 0 (Pages). Value 0 deliberately has no
    // entry: `label` stays unset for plain page locators.
    const numberingTypeLabels: Record<number, string> = {
        1: "column", // Columns (Col.) — inferred
        2: "section", // Section numbers (Nr./§) — inferred
        3: "note", // Margin numbers — inferred
        4: "custom", // Other / free-form — inferred
    }
    // Use the authoritative ReferenceId → entry_key map that CitaviParser
    // builds during parse(). This is reliable even when the parser
    // synthesises a lastname+year key that bears no resemblance to the
    // original UUID.
    for (const citaviEntry of placeholderEntries) {
        if (citaviEntry === null || citaviEntry === undefined) continue
        const refId = citaviEntry.ReferenceId
        if (!refId) continue
        const entry_key = parser.referenceIdToEntryKey.get(refId) ?? refId
        const meta: CitationItemMetadata = { entry_key }
        if (citaviEntry.Prefix) meta.prefix = citaviEntry.Prefix
        if (citaviEntry.Suffix) meta.suffix = citaviEntry.Suffix
        const pageRange = citaviEntry.PageRange
        const pageStr = pageRange?.OriginalString
        if (pageStr !== undefined && pageStr !== null && pageStr !== "")
            meta.locator = pageStr
        if (pageRange && !citaviEntry.UseNumberingTypeOfParentDocument) {
            const nt = pageRange.NumberingType
            if (nt !== undefined && nt !== null && nt !== 0) {
                const label = numberingTypeLabels[nt]
                if (label !== undefined) meta.label = label
            }
        }
        if (citaviEntry.BibliographyEntry)
            meta.bibliographyEntry = citaviEntry.BibliographyEntry
        if (citaviEntry.RuleSet !== undefined && citaviEntry.RuleSet !== null)
            meta.ruleSet = citaviEntry.RuleSet
        if (
            citaviEntry.FormatOption !== undefined &&
            citaviEntry.FormatOption !== null
        )
            meta.formatOption = citaviEntry.FormatOption
        if (
            citaviEntry.InsertAs !== undefined &&
            citaviEntry.InsertAs !== null
        )
            meta.insertAs = citaviEntry.InsertAs
        metadata.push(meta)
    }
}
/**
 * Resolve one Word-native `CITATION <key>` field against the sources XML.
 *
 * The key is recorded in `acc.seenKeys`, then `DocxNativeParser` is asked for
 * exactly that key while being told which entry keys the accumulator already
 * holds, so repeated CITATION fields do not duplicate entries. Instructions
 * that do not match `CITATION <key>` are ignored.
 *
 * @param instrText - Field instruction text
 * @param sourcesXml - Contents of `customXml/item1.xml`
 * @param acc - Accumulator mutated in place
 */
private static extractWordNativeData(
    instrText: string,
    sourcesXml: string,
    acc: CitationAccumulator,
): void {
    const keyMatch = instrText.trim().match(/^CITATION\s+(\S+)/i)
    if (keyMatch === null) return
    const wanted = keyMatch[1]
    acc.seenKeys.add(wanted)
    // Allowlist for this call: only the key we just read.
    const allowlist = new Set<string>([wanted])
    // Keys already present in the accumulator's entries.
    const alreadyImported = new Set<string>()
    for (const existing of acc.entries) {
        alreadyImported.add(existing.entry_key)
    }
    const parsed = new DocxNativeParser(sourcesXml).parse(
        allowlist,
        alreadyImported,
    )
    acc.errors.push(...parsed.errors)
    acc.warnings.push(...parsed.warnings)
    acc.entries.push(...parsed.entries)
}
/**
 * Parse EndNote XML payload.
 *
 * Every `<Cite>` that contains a `<record>` is inspected. Records whose
 * rec-number is not yet in `acc.seenKeys` are converted via `EndNoteParser`
 * and appended to the accumulator. When `metadata` is supplied, one record
 * is pushed per cite — including cites whose record was already imported by
 * an earlier call, which previously lost their metadata whenever the payload
 * contained no new record.
 *
 * @param xml - Decoded / unescaped EndNote XML
 * @param acc - Accumulator mutated in place
 * @param metadata - Optional output array for per-cite metadata
 */
private static parseEndNoteXml(
    xml: string,
    acc: CitationAccumulator,
    metadata?: CitationItemMetadata[],
): void {
    const { entries, errors, warnings, seenKeys } = acc
    const records: EndNoteRecord[] = []
    // When collecting metadata, capture per-Cite fields before deduplication
    interface CiteFields {
        recNum: string
        prefix?: string
        suffix?: string
        pages?: string
        authorYear?: boolean
    }
    const citeFieldsList: CiteFields[] = []
    // Text of an optional cite-level element with style tags stripped.
    const cleaned = (m: RegExpExecArray | null): string | undefined =>
        m ? DocxCitationsParser.stripStyleTagsStatic(m[1]) : undefined
    // Match the full opening tag (which may carry AuthorYear="1") plus body
    const citeRe = /<Cite(\s[^>]*)?>([\s\S]*?)<\/Cite>/g
    let citeMatch: RegExpExecArray | null
    while ((citeMatch = citeRe.exec(xml)) !== null) {
        const citeAttrs = citeMatch[1] ?? ""
        const citeXml = citeMatch[2]
        const recordMatch = /<record>([\s\S]*?)<\/record>/.exec(citeXml)
        if (!recordMatch) continue
        const record = DocxCitationsParser.parseEndNoteRecordXml(
            recordMatch[0],
        )
        const key = String(record["rec-number"] ?? "")
        if (key && !seenKeys.has(key)) {
            records.push(record)
            seenKeys.add(key)
        }
        if (!metadata || !key) continue
        // AuthorYear="1" on the opening <Cite> tag means show author outside parens
        const authorYear = /AuthorYear\s*=\s*["']?1["']?/i.test(citeAttrs)
        // Only search for citation-level fields (Prefix, Suffix,
        // Pages) in the portion of <Cite> that comes *before* the
        // embedded <record> element. The <record> block contains
        // the reference's own <pages> field, and matching against
        // the full citeXml would confuse the reference page range
        // with a per-citation locator.
        const recordStart = citeXml.indexOf("<record>")
        const citeHeader =
            recordStart === -1 ? citeXml : citeXml.slice(0, recordStart)
        citeFieldsList.push({
            recNum: key,
            prefix: cleaned(
                /<Prefix[^>]*>([\s\S]*?)<\/Prefix>/i.exec(citeHeader),
            ),
            suffix: cleaned(
                /<Suffix[^>]*>([\s\S]*?)<\/Suffix>/i.exec(citeHeader),
            ),
            pages: cleaned(/<Pages[^>]*>([\s\S]*?)<\/Pages>/i.exec(citeHeader)),
            authorYear,
        })
    }
    // Only run the EndNote parser when there is something new to import;
    // metadata below is emitted either way.
    let parser: EndNoteParser | undefined
    if (records.length > 0) {
        parser = new EndNoteParser(records)
        const result = parser.parse()
        errors.push(...result.errors)
        warnings.push(...result.warnings)
        entries.push(...Object.values(result.entries))
    }
    if (!metadata) return
    // Use the authoritative rec-number → entry_key map that EndNoteParser
    // builds during parse(). This is reliable even when the parser
    // synthesises a lastname+year key that bears no resemblance to the
    // original rec-number. Cites whose record was not parsed in this call
    // fall back to the raw rec-number.
    for (const cite of citeFieldsList) {
        const entry_key =
            parser?.recNumberToEntryKey.get(cite.recNum) ?? cite.recNum
        const meta: CitationItemMetadata = { entry_key }
        if (cite.prefix) meta.prefix = cite.prefix
        if (cite.suffix) meta.suffix = cite.suffix
        if (cite.pages) meta.locator = cite.pages
        if (cite.authorYear) meta.authorYear = true
        metadata.push(meta)
    }
}
/**
 * Build an EndNoteRecord from the XML text of one `<record>` element.
 *
 * Fields are picked out with regular expressions rather than an XML parser.
 * Text values are passed through `stripStyleTagsStatic` and wrapped as
 * `{ "#text": … }`; `rec-number` is stored as a trimmed string. Elements that
 * are absent from the XML are simply left off the record.
 *
 * @param recordXml - XML of a single `<record>…</record>` element
 * @returns The populated record object
 */
private static parseEndNoteRecordXml(recordXml: string): EndNoteRecord {
    const strip = (raw: string) =>
        DocxCitationsParser.stripStyleTagsStatic(raw)
    // Body of the first `<tag …>…</tag>` in `xml` (attributes allowed).
    const bodyOf = (xml: string, tag: string): string | undefined => {
        const found = new RegExp(
            `<${tag}[^>]*>([\\s\\S]*?)<\\/${tag}>`,
        ).exec(xml)
        return found === null ? undefined : found[1]
    }
    // Body of the first attribute-less `<tag>…</tag>` in `xml`.
    const bodyOfBare = (xml: string, tag: string): string | undefined => {
        const found = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`).exec(xml)
        return found === null ? undefined : found[1]
    }
    // `{ tag: { "#text": … } }` for each listed child tag present in `xml`.
    const textChildren = (
        xml: string,
        tags: readonly string[],
    ): Record<string, { "#text": string }> => {
        const collected: Record<string, { "#text": string }> = {}
        for (const tag of tags) {
            const body = bodyOf(xml, tag)
            if (body !== undefined) collected[tag] = { "#text": strip(body) }
        }
        return collected
    }

    const record: EndNoteRecord = {}

    const refType =
        /<ref-type(?:\s+name="([^"]*)")?[^>]*>(\d+)<\/ref-type>/.exec(
            recordXml,
        )
    if (refType !== null) {
        record["ref-type"] = {
            name: refType[1] ?? "",
            "#text": refType[2],
        }
    }

    const recNumber = bodyOf(recordXml, "rec-number")
    if (recNumber !== undefined) {
        record["rec-number"] = recNumber.trim()
    }

    const titlesXml = bodyOfBare(recordXml, "titles")
    if (titlesXml !== undefined) {
        const titles = textChildren(titlesXml, [
            "title",
            "secondary-title",
            "tertiary-title",
            "short-title",
            "alt-title",
            "translated-title",
        ])
        if (Object.keys(titles).length > 0) record.titles = titles
    }

    const contributorsXml = bodyOfBare(recordXml, "contributors")
    if (contributorsXml !== undefined) {
        record.contributors =
            DocxCitationsParser.parseContributorsXml(contributorsXml)
    }

    const periodicalXml = bodyOfBare(recordXml, "periodical")
    if (periodicalXml !== undefined) {
        const periodical = textChildren(periodicalXml, [
            "full-title",
            "abbr-1",
            "abbr-2",
            "abbr-3",
        ])
        if (Object.keys(periodical).length > 0) record.periodical = periodical
    }

    // Simple elements whose record key equals their XML tag name.
    const scalarTags: Array<keyof EndNoteRecord> = [
        "pages",
        "volume",
        "number",
        "issue",
        "edition",
        "section",
        "publisher",
        "isbn",
        "issn",
        "abstract",
        "notes",
        "language",
        "label",
        "doi",
        "electronic-resource-num",
    ]
    const looseRecord = record as Record<string, unknown>
    for (const tag of scalarTags) {
        const body = bodyOf(recordXml, tag as string)
        if (body === undefined) continue
        looseRecord[tag as string] = { "#text": strip(body) }
    }

    const pubLocation = bodyOf(recordXml, "pub-location")
    if (pubLocation !== undefined) {
        record["pub-location"] = { "#text": strip(pubLocation) }
    }

    const datesXml = bodyOfBare(recordXml, "dates")
    if (datesXml !== undefined) {
        record.dates = DocxCitationsParser.parseDatesXml(datesXml)
    }

    const keywordsXml = bodyOfBare(recordXml, "keywords")
    if (keywordsXml !== undefined) {
        const keyword = Array.from(
            keywordsXml.matchAll(/<keyword[^>]*>([\s\S]*?)<\/keyword>/g),
            (kw) => ({ "#text": strip(kw[1]) }),
        )
        if (keyword.length > 0) {
            record.keywords = { keyword }
        }
    }

    const urlsXml = bodyOfBare(recordXml, "urls")
    if (urlsXml !== undefined) {
        record.urls = DocxCitationsParser.parseUrlsXml(urlsXml)
    }

    return record
}
/**
 * Collect the author lists found inside a `<contributors>` fragment.
 *
 * Only the first occurrence of each of the four contributor groups
 * (`authors`, `secondary-authors`, `tertiary-authors`,
 * `subsidiary-authors`) is examined. A group that is missing, or that
 * contains no `<author>` element, does not appear in the result.
 *
 * @param xml - XML text to search (the caller passes the inner content of
 *   `<contributors>`).
 * @returns Object keyed by group name; each value holds an `author` array
 *   of `{ "#text": … }` items whose text has been run through
 *   `stripStyleTagsStatic`.
 */
private static parseContributorsXml(
    xml: string,
): Record<string, { author: Array<{ "#text": string }> }> {
    const groupNames = [
        "authors",
        "secondary-authors",
        "tertiary-authors",
        "subsidiary-authors",
    ] as const
    const collected: Record<
        string,
        { author: Array<{ "#text": string }> }
    > = {}
    for (const groupName of groupNames) {
        const groupMatch = xml.match(
            new RegExp(`<${groupName}[^>]*>([\\s\\S]*?)<\\/${groupName}>`),
        )
        // Group element not present: nothing to record for it.
        if (!groupMatch) continue
        const people: Array<{ "#text": string }> = []
        for (const hit of groupMatch[1].matchAll(
            /<author[^>]*>([\s\S]*?)<\/author>/g,
        )) {
            people.push({
                "#text": DocxCitationsParser.stripStyleTagsStatic(hit[1]),
            })
        }
        // Empty groups are omitted rather than stored as empty arrays.
        if (people.length > 0) {
            collected[groupName] = { author: people }
        }
    }
    return collected
}
/**
 * Extract the year and publication dates from a `<dates>` fragment.
 *
 * @param xml - XML text to search (the caller passes the inner content of
 *   `<dates>`).
 * @returns Object that may contain `year` (`{ "#text": … }`, from the first
 *   `<year>` element) and `"pub-dates"` (`{ date: [{ "#text": … }] }`, from
 *   every `<date>` inside the first `<pub-dates>` element). A key is
 *   omitted when its element is missing or, for `"pub-dates"`, when no
 *   `<date>` is found. All text goes through `stripStyleTagsStatic`.
 */
private static parseDatesXml(xml: string): Record<string, unknown> {
    const parsed: Record<string, unknown> = {}

    const yearHit = xml.match(/<year[^>]*>([\s\S]*?)<\/year>/)
    if (yearHit) {
        const yearText = DocxCitationsParser.stripStyleTagsStatic(yearHit[1])
        parsed.year = { "#text": yearText }
    }

    const pubDatesHit = xml.match(/<pub-dates>([\s\S]*?)<\/pub-dates>/)
    if (!pubDatesHit) return parsed

    const dateItems: Array<{ "#text": string }> = []
    for (const hit of pubDatesHit[1].matchAll(
        /<date[^>]*>([\s\S]*?)<\/date>/g,
    )) {
        dateItems.push({
            "#text": DocxCitationsParser.stripStyleTagsStatic(hit[1]),
        })
    }
    // A <pub-dates> wrapper without any <date> child is dropped.
    if (dateItems.length > 0) {
        parsed["pub-dates"] = { date: dateItems }
    }
    return parsed
}
/**
 * Collect the URL lists found inside a `<urls>` fragment.
 *
 * The first occurrence of each of `web-urls`, `pdf-urls`, `related-urls`,
 * `text-urls` and `image-urls` is examined; the group's opening tag must
 * be written without attributes to be recognised.
 *
 * @param xml - XML text to search (the caller passes the inner content of
 *   `<urls>`).
 * @returns Object keyed by group name; each value is
 *   `{ url: [{ "#text": … }] }` with text run through
 *   `stripStyleTagsStatic`. Groups that are missing or hold no `<url>`
 *   element are omitted.
 */
private static parseUrlsXml(xml: string): Record<string, unknown> {
    const groupNames = [
        "web-urls",
        "pdf-urls",
        "related-urls",
        "text-urls",
        "image-urls",
    ] as const
    const collected: Record<string, unknown> = {}
    for (const groupName of groupNames) {
        const groupMatch = xml.match(
            new RegExp(`<${groupName}>([\\s\\S]*?)<\\/${groupName}>`),
        )
        if (groupMatch) {
            const links: Array<{ "#text": string }> = []
            for (const hit of groupMatch[1].matchAll(
                /<url[^>]*>([\s\S]*?)<\/url>/g,
            )) {
                links.push({
                    "#text": DocxCitationsParser.stripStyleTagsStatic(hit[1]),
                })
            }
            // Only record groups that actually contain at least one URL.
            if (links.length > 0) {
                collected[groupName] = { url: links }
            }
        }
    }
    return collected
}
/**
 * Reduce a fragment of element content to plain text.
 *
 * Steps, in order:
 *  1. `<style …>…</style>` wrappers are replaced by their inner content.
 *  2. Every remaining tag is removed.
 *  3. The five predefined XML entities (`&lt;`, `&gt;`, `&amp;`, `&quot;`,
 *     `&apos;`) are decoded.
 *  4. Leading and trailing whitespace is trimmed.
 *
 * Entities are decoded in a single pass so that text produced by one
 * replacement is never decoded a second time (e.g. `&amp;lt;` yields the
 * literal text `&lt;`, not `<`). Decoding happens after tag removal so that
 * escaped angle brackets in the text are not mistaken for markup.
 * Numeric character references (`&#…;`) are left untouched.
 *
 * @param text - Raw inner XML of an element.
 * @returns The plain-text content.
 */
private static stripStyleTagsStatic(text: string): string {
    const entityChars: Record<string, string> = {
        lt: "<",
        gt: ">",
        amp: "&",
        quot: '"',
        apos: "'",
    }
    return text
        .replace(/<style[^>]*>([\s\S]*?)<\/style>/g, "$1")
        .replace(/<[^>]+>/g, "")
        .replace(
            /&(lt|gt|amp|quot|apos);/g,
            (entity: string, name: string) => entityChars[name] ?? entity,
        )
        .trim()
}
/**
 * Decode the five predefined XML entities (`&lt;`, `&gt;`, `&amp;`,
 * `&quot;`, `&apos;`) in a string.
 *
 * All entities are handled in one pass, so exactly one level of escaping
 * is removed: `&amp;lt;` becomes `&lt;` rather than `<`. This matters when
 * the input is itself escaped XML whose text nodes carry their own
 * entities. Numeric character references (`&#…;`) and any other `&name;`
 * sequences are returned unchanged. No trimming is performed.
 *
 * @param text - Text containing XML entity references.
 * @returns The text with the predefined entities replaced by their characters.
 */
private static unescapeXmlEntitiesStatic(text: string): string {
    const entityChars: Record<string, string> = {
        lt: "<",
        gt: ">",
        amp: "&",
        quot: '"',
        apos: "'",
    }
    return text.replace(
        /&(lt|gt|amp|quot|apos);/g,
        (entity: string, name: string) => entityChars[name] ?? entity,
    )
}
/**
 * Decode a base64 string into text.
 *
 * The payload is decoded with `atob`, converted to bytes, stripped of any
 * trailing zero bytes, and finally interpreted as UTF-8.
 *
 * @param b64 - Base64-encoded data.
 * @returns The decoded UTF-8 text.
 * @throws Whatever `atob` throws when `b64` is not valid base64.
 */
private static decodeBase64Static(b64: string): string {
    // atob yields one character per byte, so each char code is a byte value.
    const raw = Uint8Array.from(atob(b64), (ch) => ch.charCodeAt(0))
    // Walk back over trailing NUL bytes (presumably padding) so they are
    // not part of the decoded text.
    let length = raw.length
    for (; length > 0; length--) {
        if (raw[length - 1] !== 0) break
    }
    const decoder = new TextDecoder("utf-8")
    return decoder.decode(raw.subarray(0, length))
}
// -----------------------------------------------------------------------
// Public API
// -----------------------------------------------------------------------
/**
 * Run all extraction passes and return the collected bibliography.
 *
 * Passes run in a fixed order: structured document tags first, then field
 * codes, then the optional sources XML (only when `options.sourcesXml` is
 * truthy). The accumulated entries are then numbered starting at 1.
 *
 * @returns The numbered entries together with the errors and warnings
 *   gathered while parsing.
 */
parse(): DocxCitationsParseResult {
    // Pass 1: <w:sdt> blocks (Mendeley v3, Citavi with embedded references).
    this.parseSdtBlocks()
    // Pass 2: field codes (Zotero, legacy Mendeley, Word native, etc.).
    this.parseFieldCodes()
    // Pass 3: Word-native sources XML, when the caller supplied it.
    const { sourcesXml } = this.options
    if (sourcesXml) {
        this.parseSourcesXml(sourcesXml)
    }

    // Entry keys are 1-based positions in the order entries were collected.
    const numbered: Record<number, EntryObject> = {}
    this.entries.forEach((item, position) => {
        numbered[position + 1] = item
    })

    const { errors, warnings } = this
    return { entries: numbered, errors, warnings }
}
// -----------------------------------------------------------------------
// Step 1 — <w:sdt> structured document tags
// -----------------------------------------------------------------------
private parseSdtBlocks(): void {
const sdtRe = /<w:sdt\b[^>]*>([\s\S]*?)<\/w:sdt>/g
let m: RegExpExecArray | null
while ((m = sdtRe.exec(this.documentXml)) !== null) {
DocxCitationsParser.sdtCitation(m[1], true, false, {
entries: this.entries,
errors: this.errors,
warnings: this.warnings,
seenKeys: this.seenKeys,