biblatex-csl-converter
Version:
Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity
471 lines (407 loc) • 14.5 kB
text/typescript
import {
BibFieldTypes,
BibTypes,
type EntryObject,
type NameDictObject,
type NodeArray,
type RangeArray,
} from "../const"
import { makeEntryKey } from "./tools"
/**
 * A single CSL-JSON name, limited to the properties this importer reads.
 */
interface CSLNameObject {
/** Undivided name; when set, the structured parts below are not read. */
literal?: string
/** Given (first) name(s). */
given?: string
/** Family (last) name. */
family?: string
/** Name suffix. */
suffix?: string
/** Particle imported as a name prefix with `useprefix` set to true. */
"non-dropping-particle"?: string
/**
 * Particle imported as a name prefix with `useprefix` set to false; only
 * consulted when no non-dropping particle is present.
 */
"dropping-particle"?: string
}
/**
 * A CSL-JSON date variable, limited to the properties this importer reads.
 */
interface CSLDateObject {
/**
 * One date, or two dates (start and end). Each inner array is read
 * positionally: year, then an optional month, then an optional day.
 */
"date-parts"?: [number[]] | [number[], number[]]
/** When truthy, the converted date string is marked with a `~` suffix. */
circa?: boolean
}
/**
 * One CSL-JSON item as accepted by `CSLParser`.
 */
export interface CSLEntry {
/**
 * Item id. The parser skips this property during field conversion and keys
 * entries by their key in the input dictionary instead.
 */
id?: string
/** CSL item type; matched against the `csl` value of each known bib type. */
type?: string
/** Any other CSL variable; each one is converted field by field. */
[key: string]: unknown
}
/**
 * Record describing a problem found while importing; used for both the
 * parser's `errors` and `warnings` lists.
 */
interface ErrorObject {
/** Problem code, e.g. "unknown_type", "unknown_field", "unknown_field_value". */
type: string
/** Name of the affected field, when the problem concerns a single field. */
field?: string
/** The offending input value. */
value?: unknown
/** Key of the affected entry in the parser input. */
entry?: string
}
/**
 * Converts a dictionary of CSL-JSON items into the internal entry format
 * (`entry_key`, `bib_type`, `fields`).
 *
 * Problems that prevent an entry from being imported are collected in
 * `errors`; fields or values that had to be dropped are collected in
 * `warnings`. Neither causes `parse()` to throw.
 */
export class CSLParser {
    /** Tag names recognised in rich text, mapped to internal mark types. */
    private static readonly TAG_MARKS: ReadonlyMap<string, string> = new Map([
        ["b", "strong"],
        ["i", "em"],
        ["sub", "sub"],
        ["sup", "sup"],
        ["smallcaps", "smallcaps"],
        ["nocase", "nocase"],
    ])

    input: Record<string, CSLEntry>
    entries: EntryObject[]
    errors: ErrorObject[]
    warnings: ErrorObject[]
    private usedKeys: Set<string> = new Set()
    /**
     * Maps each raw CSL `id` string to the final `entry_key` assigned after
     * normalisation. Populated during `parse()` so that callers (e.g.
     * `DocxCitationsParser`, `OdtCitationsParser`) can resolve a raw CSL id
     * back to the actual key used in the returned BibDB — even when the parser
     * synthesised a lastname+year key that bears no resemblance to the original.
     */
    rawIdToEntryKey: Map<string, string> = new Map()

    /**
     * @param input CSL items keyed by id; the dictionary key (not the item's
     *   own `id` property) is used as the raw id of each entry.
     */
    constructor(input: Record<string, CSLEntry>) {
        this.input = input
        this.entries = []
        this.errors = []
        this.warnings = []
    }

    /**
     * Converts every input item and returns the successfully converted
     * entries under consecutive numeric keys starting at 1.
     */
    parse(): Record<number, EntryObject> {
        for (const [id, entry] of Object.entries(this.input)) {
            const convertedEntry = this.convertEntry(entry, id)
            if (convertedEntry) {
                this.entries.push(convertedEntry)
            }
        }
        // Create numbered index like BibLatexParser
        const bibDB: Record<number, EntryObject> = {}
        this.entries.forEach((entry, index) => {
            bibDB[index + 1] = entry
        })
        return bibDB
    }

    /**
     * Converts one CSL item.
     *
     * @returns the converted entry, or `false` (after recording an
     *   "unknown_type" error) when the CSL type maps to no known bib type.
     */
    private convertEntry(entry: CSLEntry, id: string): EntryObject | false {
        const bibType = this.getBibType(entry.type || "")
        if (!bibType) {
            this.errors.push({
                type: "unknown_type",
                value: entry.type,
                entry: id,
            })
            return false
        }
        const fields: Record<string, unknown> = {}
        for (const [key, value] of Object.entries(entry)) {
            if (key === "type" || key === "id") continue
            const field = this.convertField(key, value, id)
            if (field) {
                fields[field[0]] = field[1]
            }
        }
        // Derive a name/year hint for key synthesis when the raw id is not
        // letter-prefixed (e.g. purely numeric ids from some CSL producers).
        let lastName: string | undefined
        let year: string | undefined
        if (!/^[A-Za-z]/.test(id)) {
            const authors = entry.author as CSLNameObject[] | undefined
            const first = Array.isArray(authors) ? authors[0] : undefined
            if (first?.family) {
                lastName = first.family.replace(/[^A-Za-z0-9]/g, "")
            } else if (first?.literal) {
                lastName = first.literal
                    .split(/\s+/)[0]
                    .replace(/[^A-Za-z0-9]/g, "")
            }
            const issued = entry.issued as CSLDateObject | undefined
            const parts = issued?.["date-parts"]?.[0]
            if (parts?.[0]) year = String(parts[0])
        }
        const entryKey = makeEntryKey(id, this.usedKeys, lastName, year)
        // Record the raw id → normalised entry_key for caller lookup.
        this.rawIdToEntryKey.set(id, entryKey)
        return {
            entry_key: entryKey,
            bib_type: bibType,
            fields,
        }
    }

    /**
     * Finds the bib type whose `csl` mapping equals `cslType`.
     *
     * @returns the bib type name, or `false` when none matches.
     */
    private getBibType(cslType: string): string | false {
        return (
            Object.keys(BibTypes).find(
                (type) => BibTypes[type].csl === cslType,
            ) || false
        )
    }

    /**
     * Converts one CSL variable of an entry to its bib field.
     *
     * @param key CSL variable name.
     * @param value Raw CSL value.
     * @param entryId Key of the owning entry in `this.input`.
     * @returns `[bibFieldName, convertedValue]`, or `false` (after recording
     *   a warning) when the variable has no matching field for the entry's
     *   type or its value is not a recognised option.
     */
    private convertField(
        key: string,
        value: unknown,
        entryId: string,
    ): [string, unknown] | false {
        // The entry's type is the same for every candidate field, so resolve
        // it once instead of once per candidate.
        const bibType = this.getBibType(this.input[entryId]?.type || "")
        const typeFields = bibType ? BibTypes[bibType] : undefined
        const bibField = typeFields
            ? Object.keys(BibFieldTypes).find((field) => {
                  const csl = BibFieldTypes[field].csl
                  const matches =
                      typeof csl === "string" ? csl === key : csl?.[key]
                  if (!matches) return false
                  return (
                      typeFields.required.includes(field) ||
                      typeFields.optional.includes(field) ||
                      typeFields.eitheror.includes(field)
                  )
              })
            : undefined
        if (!bibField) {
            this.warnings.push({
                type: "unknown_field",
                field: key,
                value,
                entry: entryId,
            })
            return false
        }
        const warnUnknownValue = (): void => {
            this.warnings.push({
                type: "unknown_field_value",
                field: bibField,
                value,
                entry: entryId,
            })
        }
        const fieldType = BibFieldTypes[bibField].type
        let convertedValue: unknown
        switch (fieldType) {
            case "f_date":
                convertedValue = this.convertDate(value as CSLDateObject)
                break
            case "f_integer":
                convertedValue = this.convertInteger(value)
                break
            case "f_key": {
                const keyVal = this.convertKey(value, bibField)
                if (!keyVal) {
                    // Unrecognized option value — skip this field entirely to
                    // avoid storing an empty string that would crash the
                    // BibLaTeX exporter's options lookup.
                    warnUnknownValue()
                    return false
                }
                convertedValue = keyVal
                break
            }
            case "f_literal":
            case "f_long_literal":
            case "f_title":
                convertedValue = this.convertRichText(value as string)
                break
            case "l_range":
                convertedValue = this.convertRange(value as string)
                break
            case "f_uri":
            case "f_verbatim":
                convertedValue = String(value)
                break
            case "l_key": {
                const keys = this.convertKeyList(value as string[], bibField)
                // Same precaution as for f_key: never store the empty string
                // that stands for an unrecognised option.
                const known = keys.filter((item) => item !== "")
                if (known.length < keys.length) {
                    warnUnknownValue()
                    if (!known.length) return false
                }
                convertedValue = known
                break
            }
            case "l_literal":
                convertedValue = this.convertLiteralList(value as string[])
                break
            case "l_name":
                convertedValue = this.convertNames(value as CSLNameObject[])
                break
            case "l_tag":
                convertedValue = this.convertTags(value as string[])
                break
            default:
                convertedValue = value
        }
        return [bibField, convertedValue]
    }

    /**
     * Converts a CSL date to a date string: `YYYY`, `YYYY-MM` or
     * `YYYY-MM-DD`; a two-element `date-parts` becomes `start/end`. A truthy
     * `circa` appends `~` to each date.
     *
     * @returns the date string, or "" when no usable start date is present.
     */
    private convertDate(date: CSLDateObject): string {
        const dateParts: unknown = date?.["date-parts"]
        if (!Array.isArray(dateParts)) return ""
        const start = this.formatDateParts(dateParts[0])
        if (!start) return ""
        const approx = date.circa ? "~" : ""
        const end = this.formatDateParts(dateParts[1])
        return end ? `${start}${approx}/${end}${approx}` : `${start}${approx}`
    }

    /** Formats one `[year, month?, day?]` array; "" when the year is missing. */
    private formatDateParts(parts: unknown): string {
        if (!Array.isArray(parts)) return ""
        const [year, month, day] = parts
        if (year === undefined || year === null || year === "") return ""
        let dateStr = String(year)
        if (month) {
            dateStr += `-${String(month).padStart(2, "0")}`
            if (day) {
                dateStr += `-${String(day).padStart(2, "0")}`
            }
        }
        return dateStr
    }

    /**
     * Converts CSL names to name dictionaries.
     *
     * A `literal` name takes precedence over structured parts. A
     * non-dropping particle becomes `prefix` with `useprefix: true`, a
     * dropping particle `prefix` with `useprefix: false`. A single name given
     * without the surrounding array is accepted, a bare string is treated as
     * a literal name, and entries that are neither are skipped.
     */
    private convertNames(names: CSLNameObject[]): NameDictObject[] {
        const list: unknown[] = Array.isArray(names) ? names : [names]
        const result: NameDictObject[] = []
        for (const raw of list) {
            if (typeof raw === "string") {
                result.push({ literal: this.convertRichText(raw) })
                continue
            }
            if (raw === null || typeof raw !== "object") continue
            const name = raw as CSLNameObject
            const nameObj: NameDictObject = {}
            if (name.literal) {
                nameObj.literal = this.convertRichText(name.literal)
            } else {
                if (name.family) {
                    nameObj.family = this.convertRichText(name.family)
                }
                if (name.given) {
                    nameObj.given = this.convertRichText(name.given)
                }
                if (name.suffix) {
                    nameObj.suffix = this.convertRichText(name.suffix)
                }
                if (name["non-dropping-particle"]) {
                    nameObj.prefix = this.convertRichText(
                        name["non-dropping-particle"],
                    )
                    nameObj.useprefix = true
                } else if (name["dropping-particle"]) {
                    nameObj.prefix = this.convertRichText(
                        name["dropping-particle"],
                    )
                    nameObj.useprefix = false
                }
            }
            result.push(nameObj)
        }
        return result
    }

    /**
     * Wraps a value in a single text node, normalised through `parseInt`
     * when it starts with an integer and kept verbatim otherwise.
     */
    private convertInteger(value: unknown): NodeArray {
        const num = parseInt(String(value), 10)
        return [
            {
                type: "text",
                text: Number.isNaN(num) ? String(value) : String(num),
            },
        ]
    }

    /**
     * Resolves a CSL value to the option key of a key-typed field.
     *
     * Matching is case-insensitive on both sides, so option tables holding
     * mixed-case CSL codes still match the lower-cased input.
     *
     * @returns the option key, "" when the field has options and none
     *   matches, or the lower-cased value when the field has no options.
     */
    private convertKey(value: unknown, fieldName: string): string {
        const stringValue = String(value).toLowerCase()
        const fieldType = BibFieldTypes[fieldName]
        if (!fieldType.options) return stringValue
        if (Array.isArray(fieldType.options)) {
            // Simple list of options
            return fieldType.options.includes(stringValue) ? stringValue : ""
        }
        // Map of options (like langid)
        const options = fieldType.options as Record<string, { csl: string }>
        const option = Object.keys(options).find((key) => {
            const csl: unknown = options[key].csl
            return typeof csl === "string" && csl.toLowerCase() === stringValue
        })
        return option || ""
    }

    /**
     * Converts a comma-separated range string. Each segment becomes
     * `[from, to]` when it contains a hyphen or dash and `[single]`
     * otherwise; every part is a one-node NodeArray of trimmed text.
     */
    private convertRange(value: string): RangeArray[] {
        const toNodes = (part: string): NodeArray => [
            { type: "text", text: part.trim() },
        ]
        return String(value)
            .split(",")
            .map((range): RangeArray => {
                // A run of dash characters ("--") counts as one separator.
                const [from, ...rest] = range.split(/[-–—]+/)
                if (!rest.length) return [toNodes(from ?? "")]
                // Anything after the first separator belongs to the end part.
                return [toNodes(from ?? ""), toNodes(rest.join("-"))]
            })
    }

    /**
     * Applies `convertKey` to every item; a non-array value is treated as a
     * one-item list. Unrecognised options come back as "".
     */
    private convertKeyList(
        values: string[],
        fieldName: string,
    ): (string | NodeArray)[] {
        const list = Array.isArray(values) ? values : [String(values)]
        return list.map((value) => this.convertKey(value, fieldName))
    }

    /**
     * Applies `convertRichText` to every item; a non-array value is treated
     * as a one-item list.
     */
    private convertLiteralList(values: string[]): NodeArray[] {
        const list = Array.isArray(values) ? values : [String(values)]
        return list.map((value) => this.convertRichText(value))
    }

    /**
     * Trims every tag; a non-array value is treated as a one-item list and
     * non-string items are stringified first.
     */
    private convertTags(values: string[]): string[] {
        const list: unknown[] = Array.isArray(values)
            ? values
            : [String(values)]
        return list.map((value) => String(value).trim())
    }

    /**
     * Parses the HTML-like inline markup used in CSL-JSON strings into text
     * nodes carrying marks.
     *
     * Recognised: `<b>`, `<i>`, `<sub>`, `<sup>`, and `<span>` opened with
     * `style="font-variant:small-caps;"` or `class="nocase"`. A closing
     * `</span>` ends the mark opened by its matching `<span …>`; plain
     * `<span>` tags are dropped. Any other `<…>` sequence is kept as literal
     * text. Non-string input is stringified into a single text node.
     */
    private convertRichText(text: string): NodeArray {
        if (typeof text !== "string") {
            return [{ type: "text", text: String(text) }]
        }
        // If no HTML tags present, return simple text node
        if (!text.includes("<")) {
            return [{ type: "text", text }]
        }
        const nodes: NodeArray = []
        let currentText = ""
        // Mark types currently in effect, outermost first.
        const activeMarks: string[] = []
        // One slot per open <span>: the mark it introduced, or null for a
        // span that carries no mark. Lets </span> close the right mark.
        const openSpans: (string | null)[] = []
        const spanWithAttributes = /^span\s/

        const flush = (): void => {
            if (!currentText) return
            nodes.push({
                type: "text",
                text: currentText,
                ...(activeMarks.length
                    ? { marks: activeMarks.map((type) => ({ type })) }
                    : {}),
            })
            currentText = ""
        }
        // Removes only the innermost occurrence so nested tags unwind in order.
        const dropLast = (list: (string | null)[], item: string): void => {
            const index = list.lastIndexOf(item)
            if (index !== -1) list.splice(index, 1)
        }

        let i = 0
        while (i < text.length) {
            const tagEnd = text[i] === "<" ? text.indexOf(">", i) : -1
            if (tagEnd !== -1) {
                const closeTag = text[i + 1] === "/"
                const tag = text.substring(closeTag ? i + 2 : i + 1, tagEnd)
                if (tag === "span") {
                    if (closeTag) {
                        const spanMark = openSpans.pop()
                        if (spanMark) {
                            flush()
                            dropLast(activeMarks, spanMark)
                        }
                    } else {
                        openSpans.push(null)
                    }
                    i = tagEnd + 1
                    continue
                }
                let markName = tag
                let fromSpan = false
                if (tag.startsWith('span style="font-variant:small-caps;"')) {
                    markName = "smallcaps"
                    fromSpan = true
                } else if (tag.startsWith('span class="nocase"')) {
                    markName = "nocase"
                    fromSpan = true
                }
                // Map lookup: prototype names such as "constructor" never match.
                const markType = CSLParser.TAG_MARKS.get(markName)
                if (markType) {
                    flush()
                    if (closeTag) {
                        dropLast(activeMarks, markType)
                        if (fromSpan) dropLast(openSpans, markType)
                    } else {
                        activeMarks.push(markType)
                        if (fromSpan) openSpans.push(markType)
                    }
                    i = tagEnd + 1
                    continue
                }
                // An unrecognised <span …> stays literal text, but its
                // closing tag must not close an enclosing styled span.
                if (!closeTag && spanWithAttributes.test(tag)) {
                    openSpans.push(null)
                }
            }
            currentText += text[i]
            i++
        }
        flush()
        return nodes
    }
}
/**
 * Convenience wrapper: builds a `CSLParser` for `input` and returns the
 * result of its `parse()` call.
 *
 * @param input CSL items keyed by id.
 * @returns the value returned by `CSLParser.parse()`.
 */
export function parseCSL(
    input: Record<string, CSLEntry>,
): Record<number, EntryObject> {
    const parser = new CSLParser(input)
    return parser.parse()
}