biblatex-csl-converter
Version:
Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity
289 lines (242 loc) • 9.15 kB
text/typescript
/**
* LibreOffice native bibliography-mark importer
*
* Handles `<text:bibliography-mark>` elements that LibreOffice writes into
* ODT content.xml. All bibliographic data is stored as XML attributes
* directly on the element, so no external parser is needed.
*
* This module is consumed by OdtCitationsParser in odt-citations.ts.
*/
import type { EntryObject, NodeArray, RangeArray } from "../const"
// ---------------------------------------------------------------------------
// Type map
// ---------------------------------------------------------------------------
/**
 * Lookup table from the value of a mark's `text:bibliography-type`
 * attribute to the type string this importer stores in an entry's
 * `bib_type`.
 *
 * Values that are not listed here are treated as `misc` by the importer.
 */
export const ODTBibTypeMap: Record<string, string> = {
    // Publication types
    article: "article-journal",
    book: "book",
    booklet: "booklet",
    conference: "inproceedings",
    inbook: "inbook",
    incollection: "inbook",
    inproceedings: "inproceedings",
    journal: "article-journal",
    manual: "manual",
    mastersthesis: "thesis",
    misc: "misc",
    phdthesis: "thesis",
    proceedings: "proceedings",
    techreport: "report",
    unpublished: "unpublished",
    www: "online",
    // `email` and the user-defined `custom1`..`custom5` types have no
    // dedicated target and are all stored as `misc`.
    email: "misc",
    custom1: "misc",
    custom2: "misc",
    custom3: "misc",
    custom4: "misc",
    custom5: "misc",
}
// ---------------------------------------------------------------------------
// Result type
// ---------------------------------------------------------------------------
/**
 * A single diagnostic produced while importing bibliography marks.
 * `type` is a machine-readable code; the remaining properties are optional
 * context.
 */
interface OdtNativeWarning {
    type: string
    field?: string
    value?: unknown
    entry?: string
}

/**
 * Outcome of one `OdtNativeParser.parse()` run: the entries that were
 * built and the warnings raised along the way.
 */
export interface OdtNativeParseResult {
    entries: EntryObject[]
    warnings: OdtNativeWarning[]
}
// ---------------------------------------------------------------------------
// Parser class
// ---------------------------------------------------------------------------
/**
 * Extracts bibliography entries from the `<text:bibliography-mark>`
 * elements found in the text of an ODT `content.xml`.
 */
export class OdtNativeParser {
    private contentXml: string

    /**
     * @param contentXml - The XML text to scan for bibliography marks.
     */
    constructor(contentXml: string) {
        this.contentXml = contentXml
    }

    /**
     * Finds every `<text:bibliography-mark>` element (self-closing or with
     * a separate end tag) in `contentXml` and converts each one into an
     * entry.
     *
     * @param seenKeys - Optional de-duplication set. When given, a mark
     *   whose identifier is already in the set produces no entry, and the
     *   identifier of every mark that is processed is added to the set, so
     *   repeated marks inside the same document are emitted only once.
     *   When omitted, every mark that has an identifier produces an entry,
     *   repeats included.
     * @returns The entries in document order plus any warnings raised
     *   while processing the marks.
     */
    parse(seenKeys?: Set<string>): OdtNativeParseResult {
        const collected: OdtNativeParseResult = { entries: [], warnings: [] }
        // Group 1 captures the raw attribute text of the start tag.
        const markPattern =
            /<text:bibliography-mark\b([\s\S]*?)(?:\/>|>[\s\S]*?<\/text:bibliography-mark>)/g
        for (
            let hit = markPattern.exec(this.contentXml);
            hit !== null;
            hit = markPattern.exec(this.contentXml)
        ) {
            const { entry, warning } = processLibreOfficeMarkAttrs(
                hit[1],
                seenKeys,
            )
            if (warning) collected.warnings.push(warning)
            if (entry) collected.entries.push(entry)
        }
        return collected
    }
}
// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------
/**
 * Builds one bibliography entry from the raw attribute text of a single
 * `<text:bibliography-mark>` start tag.
 *
 * @param attrString - The text between the element name and the end of the
 *   start tag, i.e. the whitespace-separated `name="value"` pairs.
 * @param seenKeys - Optional set of identifiers that were already handled.
 *   If the mark's identifier is in the set, an empty object is returned;
 *   otherwise the identifier is added to the set.
 * @returns An object holding the new `entry` and/or a `warning`:
 *   - no `text:identifier`: only a warning of type
 *     `odt_bibmark_missing_identifier`;
 *   - identifier already in `seenKeys`: neither;
 *   - `text:bibliography-type` not a key of `ODTBibTypeMap`: the entry
 *     (typed `misc`) plus a warning of type `odt_bibmark_unknown_type`.
 *     A missing type attribute is treated as `misc` without a warning.
 */
function processLibreOfficeMarkAttrs(
    attrString: string,
    seenKeys?: Set<string>,
): {
    entry?: EntryObject
    warning?: { type: string; field?: string; value?: unknown; entry?: string }
} {
    // Tokenize the attribute list once, left to right. Walking the pairs in
    // order means text inside one attribute's value can never be mistaken
    // for another attribute, and whitespace around `=` is accepted.
    const attrs = new Map<string, string>()
    const attrRe = /([^\s=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g
    let am: RegExpExecArray | null
    while ((am = attrRe.exec(attrString)) !== null) {
        // Duplicate attributes are not well-formed XML; keep the first.
        if (!attrs.has(am[1])) {
            attrs.set(am[1], unescapeXmlEntities(am[2] ?? am[3] ?? ""))
        }
    }
    const getAttr = (name: string): string => attrs.get(name) ?? ""

    const identifier = getAttr("text:identifier")
    if (!identifier) {
        return { warning: { type: "odt_bibmark_missing_identifier" } }
    }
    if (seenKeys?.has(identifier)) return {}
    seenKeys?.add(identifier)

    const bibTypeName = getAttr("text:bibliography-type") || "misc"
    // Own-property check: a plain index lookup would also find inherited
    // members such as `constructor` or `toString` and yield a non-string.
    const isKnownType = Object.prototype.hasOwnProperty.call(
        ODTBibTypeMap,
        bibTypeName,
    )
    const mappedType = isKnownType ? ODTBibTypeMap[bibTypeName] : undefined
    const bibType = mappedType ?? "misc"
    const warning = isKnownType
        ? undefined
        : {
              type: "odt_bibmark_unknown_type",
              value: bibTypeName,
              entry: identifier,
          }

    const fields: Record<string, unknown> = {}
    // Small setters; each one writes the field only when the attribute is
    // present and non-empty.
    const setRich = (attr: string, field: string): void => {
        const value = getAttr(attr)
        if (value) fields[field] = makeRichText(value)
    }
    const setRichList = (attr: string, field: string): void => {
        const value = getAttr(attr)
        if (value) fields[field] = [makeRichText(value)]
    }
    const setVerbatim = (attr: string, field: string): void => {
        const value = getAttr(attr)
        if (value) fields[field] = value
    }

    // The order of the calls below determines the key order of `fields`.
    setRich("text:title", "title")
    // text:author is a plain string (may contain multiple authors
    // separated by semicolons, e.g. "Jones, Alice; Smith, Bob")
    const author = getAttr("text:author")
    if (author) fields.author = parseODTNameString(author)
    setVerbatim("text:year", "date")
    setRich("text:journal", "journaltitle")
    setRich("text:booktitle", "booktitle")
    setRich("text:volume", "volume")
    setRich("text:number", "number")
    const pages = getAttr("text:pages")
    if (pages) fields.pages = convertRange(pages)
    setRichList("text:publisher", "publisher")
    // text:address is the ODF attribute for publisher address / place
    setRichList("text:address", "location")
    setRich("text:edition", "edition")
    setRich("text:isbn", "isbn")
    setRich("text:issn", "issn")
    setVerbatim("text:doi", "doi")
    setVerbatim("text:url", "url")
    // text:note takes precedence; text:annote is used only as a fallback.
    const note = getAttr("text:note") || getAttr("text:annote")
    if (note) fields.note = makeRichText(note)
    // text:institution takes precedence; text:school is the fallback.
    const institution = getAttr("text:institution") || getAttr("text:school")
    if (institution) fields.institution = [makeRichText(institution)]
    setVerbatim("text:report-type", "type")
    setRich("text:chapter", "chapter")
    setRich("text:series", "series")
    const editor = getAttr("text:editor")
    if (editor) fields.editor = parseODTNameString(editor)

    return {
        entry: { entry_key: identifier, bib_type: bibType, fields },
        warning,
    }
}
/**
 * Turns the plain-text name list of a bibliography mark into name objects.
 *
 * The input is split on semicolons and blank pieces are ignored. Each
 * remaining name is interpreted as follows:
 * - contains a comma ("Jones, Alice"): the text before the first comma is
 *   the family name and the text between the first and second comma, if
 *   non-empty, is the given name; anything after a second comma is ignored;
 * - a single word: stored as `literal`;
 * - several words ("Alice Jones"): the last word is the family name and
 *   the preceding words, joined by single spaces, are the given name.
 *
 * @param nameStr - For example "Smith, John; Doe, Jane".
 * @returns One object per name, in input order.
 */
function parseODTNameString(nameStr: string): Array<{
    family?: NodeArray
    given?: NodeArray
    literal?: NodeArray
}> {
    type ParsedName = {
        family?: NodeArray
        given?: NodeArray
        literal?: NodeArray
    }
    const people: ParsedName[] = []
    for (const piece of nameStr.split(";")) {
        const fullName = piece.trim()
        if (!fullName) continue

        const person: ParsedName = {}
        if (fullName.indexOf(",") !== -1) {
            // "Family, Given" form
            const [surname, forename] = fullName
                .split(",")
                .map((part) => part.trim())
            person.family = makeRichText(surname)
            if (forename) person.given = makeRichText(forename)
        } else {
            const tokens = fullName.split(/\s+/)
            if (tokens.length === 1) {
                person.literal = makeRichText(tokens[0])
            } else {
                // "Given ... Family" form
                const leading = tokens.slice(0, -1)
                person.family = makeRichText(tokens[tokens.length - 1])
                person.given = makeRichText(leading.join(" "))
            }
        }
        people.push(person)
    }
    return people
}
/**
 * Wraps a plain string in a one-element node array.
 *
 * @param text - The string to wrap; surrounding whitespace is removed.
 * @returns An array holding a single `text` node with the trimmed string.
 */
function makeRichText(text: string): NodeArray {
    const trimmed = text.trim()
    const node = { type: "text", text: trimmed }
    return [node]
}
/**
 * Converts a page specification such as "10-20, 35" into a list of ranges.
 *
 * The text is split on commas; empty segments are skipped. Within a
 * segment, a hyphen, en dash or em dash separates the start from the end.
 * A run of consecutive dashes (e.g. the BibTeX style "10--20") counts as a
 * single separator. If a segment contains more than one separator, the
 * text after the first one is kept as the end, re-joined with "-".
 *
 * @param rangeText - The raw page string.
 * @returns One item per non-empty segment: `[start, end]` for a range or
 *   `[page]` for a segment without a dash. Each part is a one-element
 *   array holding a `text` node.
 */
function convertRange(rangeText: string): RangeArray[] {
    return rangeText
        .split(",")
        .map((segment) => segment.trim())
        .filter((segment) => segment.length > 0)
        .map((segment): RangeArray => {
            // `+` collapses "--" / "---" so the end page does not start
            // with a stray hyphen.
            const parts = segment.split(/\s*[-–—]+\s*/)
            if (parts.length >= 2) {
                return [
                    [{ type: "text", text: parts[0] }],
                    [{ type: "text", text: parts.slice(1).join("-") }],
                ]
            }
            return [[{ type: "text", text: segment }]]
        })
}
/**
 * Decodes XML references in an attribute value.
 *
 * Handles the five predefined entities (lt, gt, amp, quot, apos) and
 * decimal / hexadecimal character references. Decoding happens in a single
 * pass, so text produced by one replacement is never decoded a second time
 * (an escaped ampersand followed by "lt;" stays a literal entity string).
 * Unknown entities, bare ampersands and character references outside the
 * Unicode range or inside the surrogate block are left unchanged.
 *
 * @param text - Raw attribute value as it appears in the XML source.
 * @returns The value with all recognized references replaced.
 */
function unescapeXmlEntities(text: string): string {
    return text.replace(
        /&(#x[0-9a-fA-F]+|#[0-9]+|lt|gt|amp|quot|apos);/g,
        (whole: string, ref: string): string => {
            switch (ref) {
                case "lt":
                    return "<"
                case "gt":
                    return ">"
                case "amp":
                    return "&"
                case "quot":
                    return '"'
                case "apos":
                    return "'"
            }
            // Numeric character reference: "#123" or "#x7B".
            const codePoint =
                ref.charAt(1) === "x"
                    ? parseInt(ref.slice(2), 16)
                    : parseInt(ref.slice(1), 10)
            const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff
            // String.fromCodePoint throws above U+10FFFF, so guard first.
            if (codePoint > 0x10ffff || isSurrogate) return whole
            return String.fromCodePoint(codePoint)
        },
    )
}