biblatex-csl-converter

Version:

Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity

github.com/fiduswriter/biblatex-csl-converter

fiduswriter/biblatex-csl-converter

471 lines (407 loc) • 14.5 kB

text/typescript

import {
    BibFieldTypes,
    BibTypes,
    type EntryObject,
    type NameDictObject,
    type NodeArray,
    type RangeArray,
} from "../const"
import { makeEntryKey } from "./tools"

/** A name as it appears inside a CSL-JSON name variable (e.g. `author`). */
interface CSLNameObject {
    literal?: string
    given?: string
    family?: string
    suffix?: string
    "non-dropping-particle"?: string
    "dropping-particle"?: string
}

/** A CSL-JSON date variable: one date, or a start/end pair, as `date-parts`. */
interface CSLDateObject {
    "date-parts"?: [number[]] | [number[], number[]]
    circa?: boolean
}

/** A single CSL-JSON item. All variables other than `id`/`type` are untyped. */
export interface CSLEntry {
    id?: string
    type?: string
    [key: string]: unknown
}

/** Shape of the records pushed onto `CSLParser.errors` / `CSLParser.warnings`. */
interface ErrorObject {
    type: string
    field?: string
    value?: unknown
    entry?: string
}

/**
 * Tag name → internal mark type. A `Map` is used instead of an object literal
 * so that tag names such as `constructor` or `toString` cannot resolve to
 * inherited `Object.prototype` members and be mistaken for marks.
 */
const TAG_MARKS: ReadonlyMap<string, string> = new Map([
    ["b", "strong"],
    ["i", "em"],
    ["sub", "sub"],
    ["sup", "sup"],
    ["smallcaps", "smallcaps"],
    ["nocase", "nocase"],
])

/** Opening `<span>` that switches on small caps (optional space / semicolon tolerated). */
const SMALLCAPS_SPAN = /^span\s+style="font-variant:\s*small-caps;?"/

/** Opening `<span>` that protects its content from case changes. */
const NOCASE_SPAN = /^span\s+class="nocase"/

/** A range segment with a dash-run separator and text on both sides. */
const RANGE_PATTERN = /^(.+?)\s*[-–—]+\s*(.+)$/

/**
 * Converts a dictionary of CSL-JSON items into the internal entry format
 * (`entry_key`, `bib_type`, `fields`).
 *
 * Problems are collected rather than thrown: entries whose CSL type has no
 * counterpart in `BibTypes` are reported in `errors` and dropped; fields that
 * cannot be mapped or converted are reported in `warnings` and omitted.
 */
export class CSLParser {
    input: Record<string, CSLEntry>
    entries: EntryObject[]
    errors: ErrorObject[]
    warnings: ErrorObject[]
    private usedKeys: Set<string> = new Set()
    /**
     * Maps each raw CSL `id` string to the final `entry_key` assigned after
     * normalisation. Populated during `parse()` so that callers (e.g.
     * `DocxCitationsParser`, `OdtCitationsParser`) can resolve a raw CSL id
     * back to the actual key used in the returned BibDB — even when the parser
     * synthesised a lastname+year key that bears no resemblance to the original.
     */
    rawIdToEntryKey: Map<string, string> = new Map()

    /**
     * @param input CSL items keyed by their raw id.
     */
    constructor(input: Record<string, CSLEntry>) {
        this.input = input
        this.entries = []
        this.errors = []
        this.warnings = []
    }

    /**
     * Converts every item in `input`.
     *
     * All per-run state is reset first, so calling `parse()` again yields the
     * same result instead of appending duplicates with re-suffixed keys.
     *
     * @returns The converted entries under 1-based numeric indices.
     */
    parse(): Record<number, EntryObject> {
        this.entries = []
        this.errors = []
        this.warnings = []
        this.usedKeys.clear()
        this.rawIdToEntryKey.clear()

        // Convert each CSL entry to internal format
        for (const [id, entry] of Object.entries(this.input)) {
            const convertedEntry = this.convertEntry(entry, id)
            if (convertedEntry) {
                this.entries.push(convertedEntry)
            }
        }

        // Create numbered index like BibLatexParser
        const bibDB: Record<number, EntryObject> = {}
        this.entries.forEach((entry, index) => {
            bibDB[index + 1] = entry
        })
        return bibDB
    }

    /**
     * Converts one CSL item.
     *
     * @param entry The CSL item.
     * @param id The raw id under which the item is stored in `input`.
     * @returns The converted entry, or `false` (with an `unknown_type` error
     * recorded) when the CSL type has no matching bib type.
     */
    private convertEntry(entry: CSLEntry, id: string): EntryObject | false {
        // Find matching BibTeX type
        const bibType = this.getBibType(entry.type || "")
        if (!bibType) {
            this.errors.push({
                type: "unknown_type",
                value: entry.type,
                entry: id,
            })
            return false
        }

        const fields: Record<string, unknown> = {}

        // Convert each field
        for (const [key, value] of Object.entries(entry)) {
            if (key === "type" || key === "id") continue
            const field = this.convertField(key, value, id)
            if (field) {
                fields[field[0]] = field[1]
            }
        }

        // Derive a name/year hint for key synthesis when the raw id is not
        // letter-prefixed (e.g. purely numeric ids from some CSL producers).
        let lastName: string | undefined
        let year: string | undefined
        if (!/^[A-Za-z]/.test(id)) {
            const authors = entry.author as CSLNameObject[] | undefined
            const first = Array.isArray(authors) ? authors[0] : undefined
            if (first?.family) {
                lastName = first.family.replace(/[^A-Za-z0-9]/g, "")
            } else if (first?.literal) {
                lastName = first.literal
                    .split(/\s+/)[0]
                    .replace(/[^A-Za-z0-9]/g, "")
            }
            const issued = entry.issued as CSLDateObject | undefined
            const parts = issued?.["date-parts"]?.[0]
            if (parts?.[0]) year = String(parts[0])
        }

        const entryKey = makeEntryKey(id, this.usedKeys, lastName, year)
        // Record the raw id → normalised entry_key for caller lookup.
        this.rawIdToEntryKey.set(id, entryKey)

        return {
            entry_key: entryKey,
            bib_type: bibType,
            fields,
        }
    }

    /**
     * @param cslType A CSL item type.
     * @returns The first key of `BibTypes` whose `csl` equals `cslType`, or
     * `false` when none does.
     */
    private getBibType(cslType: string): string | false {
        // Find BibTeX type that maps to this CSL type
        return (
            Object.keys(BibTypes).find(
                (type) => BibTypes[type].csl === cslType,
            ) || false
        )
    }

    /**
     * Maps one CSL variable onto a bib field and converts its value according
     * to the field's declared type.
     *
     * @param key CSL variable name.
     * @param value Raw CSL value.
     * @param entryId Raw id of the owning item (a key of `input`).
     * @returns `[bibFieldName, convertedValue]`, or `false` (with a warning
     * recorded) when the variable cannot be mapped or its value is unusable.
     */
    private convertField(
        key: string,
        value: unknown,
        entryId: string,
    ): [string, unknown] | false {
        // The owning entry's type is the same for every candidate field, so
        // resolve it once instead of once per candidate.
        const bibType = this.getBibType(this.input[entryId].type || "")
        const typeFields = bibType ? BibTypes[bibType] : undefined

        // Find matching BibTeX field that is also allowed for this bib type
        const bibField = typeFields
            ? Object.keys(BibFieldTypes).find((field) => {
                  const csl = BibFieldTypes[field].csl
                  const matches =
                      typeof csl === "string" ? csl === key : csl?.[key]
                  if (!matches) return false
                  return (
                      typeFields.required.includes(field) ||
                      typeFields.optional.includes(field) ||
                      typeFields.eitheror.includes(field)
                  )
              })
            : undefined

        if (!bibField) {
            this.warnings.push({
                type: "unknown_field",
                field: key,
                value,
                entry: entryId,
            })
            return false
        }

        // Convert the value based on field type
        const fieldType = BibFieldTypes[bibField].type
        let convertedValue: unknown

        switch (fieldType) {
            case "f_date": {
                const dateVal = this.convertDate(value as CSLDateObject)
                if (!dateVal) {
                    // No usable date-parts — skip the field rather than
                    // store an empty date string (same policy as f_key).
                    this.warnings.push({
                        type: "unknown_field_value",
                        field: bibField,
                        value,
                        entry: entryId,
                    })
                    return false
                }
                convertedValue = dateVal
                break
            }
            case "f_integer":
                convertedValue = this.convertInteger(value)
                break
            case "f_key": {
                const keyVal = this.convertKey(value, bibField)
                if (!keyVal) {
                    // Unrecognized option value — skip this field entirely to
                    // avoid storing an empty string that would crash the
                    // BibLaTeX exporter's options lookup.
                    this.warnings.push({
                        type: "unknown_field_value",
                        field: bibField,
                        value,
                        entry: entryId,
                    })
                    return false
                }
                convertedValue = keyVal
                break
            }
            case "f_literal":
            case "f_long_literal":
            case "f_title":
                convertedValue = this.convertRichText(value as string)
                break
            case "l_range":
                convertedValue = this.convertRange(value as string)
                break
            case "f_uri":
            case "f_verbatim":
                convertedValue = String(value)
                break
            case "l_key": {
                const keyList = this.convertKeyList(
                    value as string[],
                    bibField,
                )
                if (keyList.length === 0) {
                    // Nothing recognisable left — same treatment as f_key.
                    this.warnings.push({
                        type: "unknown_field_value",
                        field: bibField,
                        value,
                        entry: entryId,
                    })
                    return false
                }
                convertedValue = keyList
                break
            }
            case "l_literal":
                convertedValue = this.convertLiteralList(value as string[])
                break
            case "l_name":
                convertedValue = this.convertNames(value as CSLNameObject[])
                break
            case "l_tag":
                convertedValue = this.convertTags(value as string[])
                break
            default:
                convertedValue = value
        }

        return [bibField, convertedValue]
    }

    /**
     * Renders a CSL date as `YYYY[-MM[-DD]]`, with a trailing `~` when `circa`
     * is set. A two-element `date-parts` is rendered as `start/end`.
     *
     * NOTE(review): the `/` interval separator and the per-endpoint `~` follow
     * EDTF; confirm that the project's date consumers accept both.
     *
     * @returns The date string, or `""` when there is no usable year.
     */
    private convertDate(date: CSLDateObject): string {
        if (typeof date !== "object" || date === null) return ""
        const dateParts: unknown = date["date-parts"]
        if (!Array.isArray(dateParts)) return ""

        const approximate = date.circa ? "~" : ""
        const endpoints: string[] = []
        // At most two endpoints: [start] or [start, end].
        for (const parts of dateParts.slice(0, 2)) {
            const formatted = this.formatDateParts(parts)
            // An unusable start invalidates the date; an unusable end
            // degrades the range to a single date.
            if (!formatted) break
            endpoints.push(formatted + approximate)
        }
        return endpoints.join("/")
    }

    /**
     * Formats one `[year, month?, day?]` tuple. Parts may be numbers or
     * numeric strings. Years are zero-padded to four digits; month and day are
     * zero-padded to two and omitted when missing, zero or non-numeric.
     *
     * @returns The formatted date, or `""` when the year is missing or not numeric.
     */
    private formatDateParts(parts: unknown): string {
        if (!Array.isArray(parts)) return ""
        const [year, month, day] = parts.map((part) =>
            parseInt(String(part), 10),
        )
        if (year === undefined || Number.isNaN(year)) return ""

        let dateStr =
            year < 0
                ? `-${String(-year).padStart(4, "0")}`
                : String(year).padStart(4, "0")
        if (month) {
            dateStr += `-${String(month).padStart(2, "0")}`
            if (day) {
                dateStr += `-${String(day).padStart(2, "0")}`
            }
        }
        return dateStr
    }

    /**
     * Converts CSL names. A `literal` name takes precedence over the
     * structured parts. A non-dropping particle becomes `prefix` with
     * `useprefix: true`; otherwise a dropping particle becomes `prefix` with
     * `useprefix: false`.
     *
     * A single name object is accepted in place of an array, and members that
     * are not objects are ignored instead of throwing.
     */
    private convertNames(names: CSLNameObject[]): NameDictObject[] {
        const list: unknown[] = Array.isArray(names) ? names : [names]
        return list
            .filter(
                (name): name is CSLNameObject =>
                    typeof name === "object" && name !== null,
            )
            .map((name) => {
                const nameObj: NameDictObject = {}
                if (name.literal) {
                    nameObj.literal = this.convertRichText(name.literal)
                    return nameObj
                }
                if (name.family) {
                    nameObj.family = this.convertRichText(name.family)
                }
                if (name.given) {
                    nameObj.given = this.convertRichText(name.given)
                }
                if (name.suffix) {
                    nameObj.suffix = this.convertRichText(name.suffix)
                }
                if (name["non-dropping-particle"]) {
                    nameObj.prefix = this.convertRichText(
                        name["non-dropping-particle"],
                    )
                    nameObj.useprefix = true
                } else if (name["dropping-particle"]) {
                    nameObj.prefix = this.convertRichText(
                        name["dropping-particle"],
                    )
                    nameObj.useprefix = false
                }
                return nameObj
            })
    }

    /**
     * Wraps a value in a single text node. The base-10 integer parsed from the
     * value is used when there is one; otherwise the value's string form is
     * kept verbatim.
     */
    private convertInteger(value: unknown): NodeArray {
        const num = parseInt(String(value), 10)
        return [
            {
                type: "text",
                text: Number.isNaN(num) ? String(value) : String(num),
            },
        ]
    }

    /**
     * Resolves a CSL value to the internal key of a keyed field.
     *
     * - Field without `options`: the lower-cased value is returned as is.
     * - `options` is an array: the lower-cased value must be a member.
     * - `options` is a map: the key whose `csl` equals the value is returned.
     *   The comparison is case-insensitive on both sides — the input is
     *   lower-cased, so comparing against an un-normalised `csl` would never
     *   match an option whose `csl` contains upper-case letters.
     *
     * @returns The resolved key, or `""` when the value is not a known option.
     */
    private convertKey(value: unknown, fieldName: string): string {
        const stringValue = String(value).toLowerCase()
        const fieldType = BibFieldTypes[fieldName]

        if (!fieldType.options) {
            return stringValue
        }
        if (Array.isArray(fieldType.options)) {
            // Simple list of options
            return fieldType.options.includes(stringValue) ? stringValue : ""
        }
        // Map of options (like langid)
        const options = fieldType.options as Record<string, { csl: string }>
        const option = Object.keys(options).find(
            (key) => options[key].csl?.toLowerCase() === stringValue,
        )
        return option || ""
    }

    /**
     * Parses a comma-separated list of ranges such as `"10-20, 35"`.
     *
     * Each segment becomes `[start, end]` when it contains a run of hyphens or
     * en/em dashes with text on both sides (split at the first such run), and
     * `[whole]` otherwise. Every endpoint is its own single-text-node
     * NodeArray; putting both endpoints into one NodeArray would make them
     * read as one concatenated literal. Empty segments are dropped.
     *
     * NOTE(review): assumes `RangeArray` is `[NodeArray] | [NodeArray, NodeArray]`
     * — confirm against its declaration in `../const`.
     */
    private convertRange(value: string): RangeArray[] {
        const toNodes = (part: string): NodeArray => [
            { type: "text", text: part },
        ]
        return String(value)
            .split(",")
            .map((segment) => segment.trim())
            .filter((segment) => segment !== "")
            .map((segment): RangeArray => {
                const match = RANGE_PATTERN.exec(segment)
                if (!match) {
                    return [toNodes(segment)]
                }
                const [, start = "", end = ""] = match
                return [toNodes(start), toNodes(end)]
            })
    }

    /**
     * Resolves each value via `convertKey`. Values that do not resolve are
     * dropped: `convertKey` reports them as `""`, which the `f_key` branch of
     * `convertField` already refuses to store.
     */
    private convertKeyList(
        values: string[],
        fieldName: string,
    ): (string | NodeArray)[] {
        const list = Array.isArray(values) ? values : [String(values)]
        return list
            .map((value) => this.convertKey(value, fieldName))
            .filter((key) => key !== "")
    }

    /** Converts each value (or a single non-array value) to rich text. */
    private convertLiteralList(values: string[]): NodeArray[] {
        if (!Array.isArray(values)) {
            values = [String(values)]
        }
        return values.map((value) => this.convertRichText(value))
    }

    /**
     * Trims each tag. Members are stringified first so that non-string values
     * (e.g. numbers) do not throw on `.trim()`.
     */
    private convertTags(values: string[]): string[] {
        const list: unknown[] = Array.isArray(values)
            ? values
            : [String(values)]
        return list.map((value) => String(value).trim())
    }

    /**
     * Converts a string with inline markup into text nodes carrying marks.
     *
     * Recognised tags: `<b>`, `<i>`, `<sub>`, `<sup>`, `<smallcaps>`,
     * `<nocase>`, `<span style="font-variant:small-caps;">` and
     * `<span class="nocase">`. Bare `<span>` / `</span>` tags are dropped.
     * Every other tag is kept as literal text.
     *
     * Open spans are tracked on a stack so that `</span>` removes the mark its
     * opening tag introduced; otherwise a span mark would never be removed and
     * would leak onto all following text.
     */
    private convertRichText(text: string): NodeArray {
        if (typeof text !== "string") {
            return [{ type: "text", text: String(text) }]
        }

        // If no HTML tags present, return simple text node
        if (!text.includes("<")) {
            return [{ type: "text", text }]
        }

        const nodes: NodeArray = []
        let currentText = ""
        let currentMarks: { type: string }[] = []
        // One entry per open <span>: the mark it introduced, or null if none.
        const openSpans: (string | null)[] = []

        // Emit the accumulated text with a snapshot of the active marks
        const flushText = () => {
            if (currentText) {
                nodes.push({
                    type: "text",
                    text: currentText,
                    ...(currentMarks.length
                        ? { marks: [...currentMarks] }
                        : {}),
                })
                currentText = ""
            }
        }

        // Remove only the innermost mark of the given type
        const dropLastMark = (markType: string) => {
            const index = currentMarks
                .map((mark) => mark.type)
                .lastIndexOf(markType)
            if (index !== -1) {
                currentMarks = currentMarks.filter((_, j) => j !== index)
            }
        }

        let i = 0
        while (i < text.length) {
            const tagEnd = text[i] === "<" ? text.indexOf(">", i) : -1
            if (tagEnd === -1) {
                // Ordinary character, or a "<" that never closes
                currentText += text[i]
                i++
                continue
            }

            const closeTag = text[i + 1] === "/"
            const tag = text.substring(closeTag ? i + 2 : i + 1, tagEnd)

            if (!closeTag) {
                const spanMark = SMALLCAPS_SPAN.test(tag)
                    ? "smallcaps"
                    : NOCASE_SPAN.test(tag)
                      ? "nocase"
                      : undefined
                if (spanMark) {
                    flushText()
                    currentMarks.push({ type: spanMark })
                    openSpans.push(spanMark)
                    i = tagEnd + 1
                    continue
                }
            }

            if (tag === "span") {
                if (closeTag) {
                    const closing = openSpans.pop()
                    if (closing) {
                        flushText()
                        dropLastMark(closing)
                    }
                } else {
                    openSpans.push(null)
                }
                // Bare span tags never appear in the output
                i = tagEnd + 1
                continue
            }

            if (!closeTag && /^span\s/.test(tag)) {
                // Unrecognised attributed span: kept as literal text below,
                // but tracked so that its </span> does not close an
                // enclosing recognised span.
                openSpans.push(null)
            }

            // Map HTML tags to internal mark types
            const markType = TAG_MARKS.get(tag)
            if (markType) {
                flushText()
                if (closeTag) {
                    currentMarks = currentMarks.filter(
                        (mark) => mark.type !== markType,
                    )
                } else {
                    currentMarks.push({ type: markType })
                }
                i = tagEnd + 1
                continue
            }

            // Unknown tag: keep it verbatim, one character at a time
            currentText += text[i]
            i++
        }

        flushText()
        return nodes
    }
}

/**
 * Convenience wrapper: parses `input` with a fresh `CSLParser` and returns the
 * numbered entry dictionary. Errors and warnings are not exposed; construct a
 * `CSLParser` directly to read them.
 */
export function parseCSL(
    input: Record<string, CSLEntry>,
): Record<number, EntryObject> {
    return new CSLParser(input).parse()
}