biblatex-csl-converter
Version:
Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity
899 lines (814 loc) • 33.9 kB
text/typescript
/**
* PubMed NBIB format parser
* Handles the PubMed/MEDLINE tagged export format (e.g., PMID- 12345678)
*
* Format specification:
* https://www.nlm.nih.gov/bsd/mms/medlineelements.html
*/
import {
BibFieldTypes,
BibTypes,
type EntryObject,
type NameDictObject,
type NodeArray,
type RangeArray,
} from "../const"
import { lookupLangid, makeEntryKey } from "./tools"
/**
 * Lookup table translating PubMed publication types (values of the PT tag)
 * into internal bibliography type names.
 *
 * The table is written grouped by target type and flattened once at module
 * load into a plain `publication type → bib type` object. A record may carry
 * several PT values; the consumer picks the first one found in this table.
 * Publication type names: https://www.nlm.nih.gov/mesh/pubtypes.html
 */
const NBIBTypeMap: Record<string, string> = (() => {
    const pubTypesByBibType: Record<string, string[]> = {
        "article-journal": [
            "Journal Article",
            "Review",
            "Systematic Review",
            "Meta-Analysis",
            "Clinical Trial",
            "Clinical Trial, Phase I",
            "Clinical Trial, Phase II",
            "Clinical Trial, Phase III",
            "Clinical Trial, Phase IV",
            "Randomized Controlled Trial",
            "Controlled Clinical Trial",
            "Multicenter Study",
            "Observational Study",
            "Case Reports",
            "Comparative Study",
            "Editorial",
            "Letter",
            "Comment",
            "Biography",
            "Historical Article",
            "Interview",
            "Retracted Publication",
            "Retraction of Publication",
            "Published Erratum",
            "Address",
            "Consensus Development Conference",
            "Consensus Development Conference, NIH",
            "Evaluation Study",
            "Validation Study",
            "Twin Study",
            "Introductory Journal Article",
            "Scientific Integrity Review",
            "Expression of Concern",
            "Overall",
            "Classical Article",
            "English Abstract",
            "Duplicate Publication",
            "Corrected and Republished Article",
            "Retraction",
        ],
        "article-newspaper": ["News", "Newspaper Article"],
        "article-magazine": ["Magazine Article"],
        book: ["Book", "Autobiography"],
        inbook: ["Book Chapter"],
        collection: ["Collected Works", "Festschrift"],
        proceedings: ["Congress"],
        inproceedings: ["Conference Paper", "Clinical Conference"],
        dataset: ["Dataset"],
        unpublished: ["Preprint"],
        report: [
            "Technical Report",
            "Report",
            "Government Publication",
            "Guideline",
            "Practice Guideline",
            "Advisory",
        ],
        legal_case: ["Legal Case"],
        legislation: ["Legislation"],
        patent: ["Patent"],
        thesis: ["Thesis"],
        misc: [
            "Lecture",
            "Portrait",
            "Bibliography",
            "Directory",
            "Interactive Tutorial",
            "Electronic Supplementary Materials",
        ],
        video: ["Video-Audio Media", "Audiovisual Aids"],
        online: ["Webcasts", "Online Only"],
    }
    const flattened: Record<string, string> = {}
    for (const bibType of Object.keys(pubTypesByBibType)) {
        for (const pubType of pubTypesByBibType[bibType]) {
            flattened[pubType] = bibType
        }
    }
    return flattened
})()
/**
 * NBIB tags the parser recognises.
 *
 * A tag found in a record but missing from this set is reported through an
 * `unknown_tag` warning (see `checkUnknownTags`). Being listed here only
 * silences that warning: not every tag below is converted into a field.
 * Tag descriptions follow https://www.nlm.nih.gov/bsd/mms/medlineelements.html
 */
const KNOWN_NBIB_TAGS = new Set<string>([
    // ── Record identity and processing status ────────────────────────────
    "PMID", // PubMed unique identifier
    "PMC", // PubMed Central identifier
    "PMCR", // PubMed Central release
    "OWN", // owner
    "STAT", // status
    "SB", // subset
    "JID", // NLM unique ID of the journal
    "OID", // other ID
    "PST", // publication status

    // ── Dates ─────────────────────────────────────────────────────────────
    "DP", // date of publication
    "DEP", // date of electronic publication
    "DCOM", // date completed
    "LR", // date last revised
    "EDAT", // Entrez date
    "MHDA", // MeSH date
    "CRDT", // create date
    "PHST", // publication history status

    // ── Titles ────────────────────────────────────────────────────────────
    "TI", // title
    "TT", // transliterated title
    "BTI", // book title
    "CTI", // collection title
    "VTI", // volume title
    "JT", // full journal title
    "TA", // journal title abbreviation

    // ── Source details ────────────────────────────────────────────────────
    "SO", // source
    "VI", // volume
    "IP", // issue
    "PG", // pagination
    "EN", // edition
    "PB", // publisher
    "PL", // place of publication
    "IS", // ISSN
    "ISBN", // ISBN
    "LA", // language
    "PT", // publication type

    // ── Typed identifiers (value followed by "[doi]", "[pii]", …) ─────────
    "LID", // location identifier
    "AID", // article identifier

    // ── Contributors ──────────────────────────────────────────────────────
    "FAU", // full author name
    "AU", // author (abbreviated)
    "AD", // affiliation
    "FED", // full editor name
    "ED", // editor (abbreviated)
    "CN", // corporate author
    "IR", // investigator (abbreviated)
    "FIR", // full investigator name
    "IRAD", // investigator affiliation

    // ── Content, indexing and funding ─────────────────────────────────────
    "AB", // abstract
    "CI", // copyright information
    "COIS", // conflict of interest statement
    "MH", // MeSH terms
    "OT", // other term (keywords)
    "OTO", // other term owner
    "RN", // registry number / EC number
    "NM", // substance name
    "GR", // grant number
    "RF", // number of references
    "SFM", // space flight mission

    // ── Comments and corrections ──────────────────────────────────────────
    "CIN", // comment in
    "CON", // comment on
    "EIN", // erratum in
    "EON", // labelled "Erratum For" by the previous table — TODO confirm against the NLM list
    "EFR", // erratum for
    "RI", // labelled "Republished In" by the previous table — TODO confirm against the NLM list
    "RIN", // retraction in
    "ROF", // retraction of
    "UIN", // update in
    "UOF", // update of
    "SPIN", // summary for patients in
    "ORI", // original report in
    "CRF", // corrected and republished from
    "CRI", // corrected and republished in
    "ECI", // expression of concern in
    "ECF", // expression of concern for
])
/**
 * One diagnostic produced while converting NBIB records; used for both the
 * `errors` and the `warnings` lists of the parser.
 */
interface ErrorObject {
/** Diagnostic kind; this file emits "unknown_type", "missing_required_field" and "unknown_tag". */
type: string
/** Name of the affected field (set for "missing_required_field"). */
field?: string
/** Offending value: an unmapped publication type, or the first value of an unknown tag cut to 100 characters. */
value?: unknown
/** Entry the diagnostic belongs to: the generated entry key, or the 1-based record index as a string for "unknown_type". */
entry?: string
/** NBIB tag name (set for "unknown_tag"). */
tag?: string
}
/**
 * Result returned by `NBIBParser.parse()` and `parseNBIB()`.
 */
export interface NBIBParseResult {
/** Converted entries keyed by consecutive 1-based numbers; records that failed conversion are left out. */
entries: Record<number, EntryObject>
/** Problems that caused a record to be dropped. */
errors: ErrorObject[]
/** Non-fatal findings such as missing fields, unmapped publication types or unknown tags. */
warnings: ErrorObject[]
}
/**
 * One raw NBIB record: tag name → every value seen for that tag, in file
 * order. Continuation lines are already folded into the value they extend.
 */
interface NBIBRecord {
[key: string]: string[]
}
/**
 * Parser for the PubMed/MEDLINE tagged export format (NBIB).
 *
 * Construct it with the raw file contents and call {@link NBIBParser.parse}.
 * Problems found while converting records are collected in `errors` and
 * `warnings` instead of being thrown by this class.
 */
export class NBIBParser {
    /** Month abbreviations used in MEDLINE `DP` values (lower case) → two-digit month. */
    private static readonly MONTH_NUMBERS: Record<string, string> = {
        jan: "01",
        feb: "02",
        mar: "03",
        apr: "04",
        may: "05",
        jun: "06",
        jul: "07",
        aug: "08",
        sep: "09",
        oct: "10",
        nov: "11",
        dec: "12",
    }

    /** Named character entities decoded by {@link NBIBParser.convertRichText}. */
    private static readonly NAMED_ENTITIES: Record<string, string> = {
        amp: "&",
        lt: "<",
        gt: ">",
        quot: '"',
    }

    /**
     * Matches the entities decoded by `convertRichText`: the four named
     * entities above (capture group 1) and the decimal/hexadecimal character
     * references for line feed and carriage return (no capture group).
     */
    private static readonly ENTITY_PATTERN =
        /&(?:(amp|lt|gt|quot)|#(?:10|13|[xX]0*[aAdD]));/g

    /** Raw NBIB text handed to the constructor. */
    input: string
    /** Entries converted so far, in record order. */
    entries: EntryObject[]
    /** Problems that caused a record to be dropped. */
    errors: ErrorObject[]
    /** Non-fatal findings (missing fields, unknown tags, unmapped types). */
    warnings: ErrorObject[]
    /** Entry keys handed out so far; passed to `makeEntryKey` for every record. */
    private usedKeys: Set<string> = new Set()

    /**
     * @param input Complete contents of an NBIB file (one or more records).
     */
    constructor(input: string) {
        this.input = input
        this.entries = []
        this.errors = []
        this.warnings = []
    }

    /**
     * Parse the input and convert every record.
     *
     * @returns The converted entries keyed by consecutive 1-based numbers,
     *   together with the collected errors and warnings.
     */
    parse(): NBIBParseResult {
        const records = this.parseNBIBFormat()
        for (let i = 0; i < records.length; i++) {
            const convertedEntry = this.convertRecord(records[i], i + 1)
            if (convertedEntry) {
                this.entries.push(convertedEntry)
            }
        }
        const entries: Record<number, EntryObject> = {}
        this.entries.forEach((entry, index) => {
            entries[index + 1] = entry
        })
        return {
            entries,
            errors: this.errors,
            warnings: this.warnings,
        }
    }

    /**
     * Split raw NBIB text into an array of tag → values records.
     *
     * Tag lines have the form `TAG - value`: the upper-case tag is padded
     * with spaces to four columns and followed by "- " (for example
     * `PMID- 12345678` or `TI  - Some title`). Continuation lines start with
     * six spaces and are appended to the preceding value, separated by one
     * space. A record ends at a blank line or where the next `PMID` tag
     * starts. Lines that fit none of these shapes are ignored.
     */
    private parseNBIBFormat(): NBIBRecord[] {
        const records: NBIBRecord[] = []
        const normalizedInput = this.input
            // A byte-order mark would keep the first tag line from matching.
            .replace(/^\uFEFF/, "")
            .replace(/\r\r\n/g, "\n")
            .replace(/\r\n/g, "\n")
            // Lone CR line endings: "." in the tag pattern never matches CR,
            // so such input would otherwise yield no records at all.
            .replace(/\r/g, "\n")
        const lines = normalizedInput.split("\n")
        let currentRecord: NBIBRecord = {}
        let currentTag: string | null = null

        const saveRecord = () => {
            if (Object.keys(currentRecord).length > 0) {
                records.push(currentRecord)
                currentRecord = {}
                currentTag = null
            }
        }

        for (const line of lines) {
            // Upper-case tag, optional padding, a dash, exactly one
            // whitespace character, then the value (which may be empty).
            const tagMatch = line.match(/^([A-Z]+)\s*-\s(.*)$/)
            if (tagMatch) {
                const tag = tagMatch[1]
                const value = tagMatch[2]
                // A PMID tag inside a non-empty record starts the next record.
                if (tag === "PMID" && Object.keys(currentRecord).length > 0) {
                    saveRecord()
                }
                if (!currentRecord[tag]) {
                    currentRecord[tag] = []
                }
                currentRecord[tag].push(value)
                currentTag = tag
            } else if (line.match(/^\s{6}/) && currentTag) {
                // Continuation line: extend the most recent value of the
                // current tag.
                const lastIdx = currentRecord[currentTag].length - 1
                currentRecord[currentTag][lastIdx] += ` ${line.trim()}`
            } else if (line.trim() === "") {
                // Blank line ends a record.
                saveRecord()
            }
            // Anything else (file headers, malformed lines) is skipped.
        }
        // Flush the last record if the file does not end with a blank line.
        saveRecord()
        return records
    }

    /**
     * Convert one raw record into an entry.
     *
     * @param record Tag → values map produced by `parseNBIBFormat`.
     * @param index 1-based position of the record in the input; used in
     *   type diagnostics and as the entry-key fallback.
     * @returns The converted entry, or `false` when the mapped type is not
     *   present in `BibTypes` (an error is recorded in that case).
     */
    private convertRecord(
        record: NBIBRecord,
        index: number,
    ): EntryObject | false {
        // ── Entry type ────────────────────────────────────────────────────
        // A record may list several PT values; the first one present in
        // NBIBTypeMap decides the type, otherwise the entry becomes "misc".
        const pubTypes = record.PT || []
        let mappedBibType: string | undefined
        let matchedPT: string | undefined
        for (const pt of pubTypes) {
            const trimmed = pt.trim()
            // Own-property check: PT text comes from the input file and must
            // not resolve to inherited members such as "constructor".
            if (Object.prototype.hasOwnProperty.call(NBIBTypeMap, trimmed)) {
                mappedBibType = NBIBTypeMap[trimmed]
                matchedPT = trimmed
                break
            }
        }
        if (!mappedBibType && pubTypes.length > 0) {
            this.warnings.push({
                type: "unknown_type",
                value: pubTypes[0].trim(),
                entry: String(index),
            })
        }
        const bibType = mappedBibType || "misc"
        if (mappedBibType && !BibTypes[bibType]) {
            this.errors.push({
                type: "unknown_type",
                value: matchedPT,
                entry: String(index),
            })
            return false
        }

        const entryKey = this.generateEntryKey(record, index)
        const fields: Record<string, unknown> = {}

        // ── Title ─────────────────────────────────────────────────────────
        // TI = article/chapter title; BTI = book title (for book records)
        const title =
            this.getFirstValue(record.TI) || this.getFirstValue(record.BTI)
        if (title) {
            fields.title = this.setField("title", title)
        } else {
            this.warnings.push({
                type: "missing_required_field",
                field: "title",
                entry: entryKey,
            })
        }

        // ── Transliterated / alternate title ──────────────────────────────
        const transTitle = this.getFirstValue(record.TT)
        if (transTitle) {
            fields.origtitle = this.setField("origtitle", transTitle)
        }

        // ── Journal title ─────────────────────────────────────────────────
        // JT = full journal title; CTI = collection title; TA = abbreviation
        const journalTitle =
            this.getFirstValue(record.JT) || this.getFirstValue(record.CTI)
        if (journalTitle) {
            fields.journaltitle = this.setField("journaltitle", journalTitle)
        }
        const journalAbbrev = this.getFirstValue(record.TA)
        if (journalAbbrev && !journalTitle) {
            // The abbreviation doubles as the title only when no full title
            // is available.
            fields.journaltitle = this.setField("journaltitle", journalAbbrev)
        }
        if (journalAbbrev) {
            fields.shortjournal = this.setField("shortjournal", journalAbbrev)
        }

        // ── Volume title (VTI) ────────────────────────────────────────────
        const volumeTitle = this.getFirstValue(record.VTI)
        if (volumeTitle) {
            fields.booktitle = this.setField("booktitle", volumeTitle)
        }

        // ── Authors ───────────────────────────────────────────────────────
        // FAU (full names) is preferred over AU (abbreviated names);
        // CN (corporate authors) is always appended as literal names.
        const fullAuthors = record.FAU || []
        const shortAuthors = record.AU || []
        const corpAuthors = record.CN || []
        const authorNames: NameDictObject[] = []
        if (fullAuthors.length > 0) {
            authorNames.push(...this.parseNames(fullAuthors))
        } else if (shortAuthors.length > 0) {
            authorNames.push(...this.parseNames(shortAuthors))
        }
        for (const corp of corpAuthors) {
            const trimmed = corp.trim()
            if (trimmed) {
                authorNames.push({
                    literal: this.convertRichText(trimmed),
                })
            }
        }
        if (authorNames.length > 0) {
            fields.author = authorNames
        } else {
            this.warnings.push({
                type: "missing_required_field",
                field: "author",
                entry: entryKey,
            })
        }

        // ── Editors ───────────────────────────────────────────────────────
        // FED = full editor name; ED = abbreviated editor name
        const fullEditors = record.FED || []
        const shortEditors = record.ED || []
        const editorNames: NameDictObject[] =
            fullEditors.length > 0
                ? this.parseNames(fullEditors)
                : this.parseNames(shortEditors)
        if (editorNames.length > 0) {
            fields.editor = editorNames
        }

        // ── Abstract ──────────────────────────────────────────────────────
        const abstract = this.getFirstValue(record.AB)
        if (abstract) {
            fields.abstract = this.setField("abstract", abstract)
        }

        // ── Date of publication ───────────────────────────────────────────
        // DP holds a human-readable date ("2025 May 1", "2025 Spring", …).
        const dp = this.getFirstValue(record.DP)
        if (dp) {
            fields.date = this.parsePublicationDate(dp)
        } else {
            this.warnings.push({
                type: "missing_required_field",
                field: "date",
                entry: entryKey,
            })
        }

        // ── Volume ────────────────────────────────────────────────────────
        const volume = this.getFirstValue(record.VI)
        if (volume) {
            fields.volume = this.setField("volume", volume)
        }

        // ── Issue ─────────────────────────────────────────────────────────
        const issue = this.getFirstValue(record.IP)
        if (issue) {
            fields.issue = this.setField("issue", issue)
        }

        // ── Pages ─────────────────────────────────────────────────────────
        const pages = this.getFirstValue(record.PG)
        if (pages) {
            fields.pages = this.convertRange(pages)
        }

        // ── Publisher ─────────────────────────────────────────────────────
        const publisher = this.getFirstValue(record.PB)
        if (publisher) {
            fields.publisher = this.setField("publisher", publisher)
        }

        // ── Place of publication ──────────────────────────────────────────
        const place = this.getFirstValue(record.PL)
        if (place) {
            fields.location = this.setField("location", place)
        }

        // ── DOI and other location identifiers ────────────────────────────
        // LID/AID values carry a type label, e.g.
        //   "10.1016/j.xxx.2024.11.004 [doi]" or "S1094-9194(24)00064-1 [pii]"
        const doi = this.extractLID(record, "doi")
        if (doi) {
            fields.doi = this.setField("doi", doi)
        }
        const pii = this.extractLID(record, "pii")
        if (pii && !doi) {
            // Without a DOI the pii is stored as eprint. A PMID, handled
            // further down, replaces it again.
            fields.eprint = this.setField("eprint", pii)
            fields.eprinttype = "pii"
        }

        // ── ISSN ──────────────────────────────────────────────────────────
        // IS may be repeated (print/electronic ISSN); only the first value
        // is kept.
        if (record.IS && record.IS.length > 0) {
            const issn = record.IS[0].trim()
            if (issn) {
                fields.issn = this.setField("issn", issn)
            }
        }

        // ── ISBN ──────────────────────────────────────────────────────────
        const isbn = this.getFirstValue(record.ISBN)
        if (isbn) {
            fields.isbn = this.setField("isbn", isbn)
        }

        // ── PubMed ID ─────────────────────────────────────────────────────
        const pmid = this.getFirstValue(record.PMID)
        if (pmid) {
            fields.eprint = this.setField("eprint", pmid.trim())
            fields.eprinttype = "pubmed"
        }

        // ── PubMed Central ID ─────────────────────────────────────────────
        const pmc = this.getFirstValue(record.PMC)
        if (pmc) {
            // The PMC id becomes the eprint only when there is no PMID; it
            // is recorded in the note either way.
            if (!pmid) {
                fields.eprint = this.setField("eprint", pmc.trim())
                fields.eprinttype = "pmcid"
            }
            fields.note = this.setField("note", `PMC: ${pmc.trim()}`)
        }

        // ── Language ──────────────────────────────────────────────────────
        const language = this.getFirstValue(record.LA)
        if (language) {
            // setField yields undefined for languages it cannot match.
            const langid = this.setField("langid", language.trim())
            if (langid !== undefined) {
                fields.langid = langid
            }
        }

        // ── Keywords (author keywords + MeSH) ─────────────────────────────
        // OT = author-supplied keywords; MH = MeSH headings.
        const otKeywords = (record.OT || [])
            .map((k) => k.trim())
            .filter(Boolean)
        const meshKeywords = (record.MH || [])
            .map((k) => {
                // Drop the leading "*" of major headings and any
                // "/qualifier" suffix.
                return k.replace(/^\*/, "").split("/")[0].trim()
            })
            .filter(Boolean)
        const allKeywords = [
            ...otKeywords,
            // MeSH terms already present among the OT keywords (ignoring
            // case) are skipped.
            ...meshKeywords.filter(
                (m) =>
                    !otKeywords.some(
                        (o) => o.toLowerCase() === m.toLowerCase(),
                    ),
            ),
        ]
        if (allKeywords.length > 0) {
            fields.keywords = allKeywords
        }

        // ── Edition ───────────────────────────────────────────────────────
        const edition = this.getFirstValue(record.EN)
        if (edition) {
            fields.edition = this.setField("edition", edition)
        }

        // ── Grant numbers ─────────────────────────────────────────────────
        if (record.GR && record.GR.length > 0) {
            // Grants go into the note, but only when the note is still
            // unused; an existing PMC note is not overwritten.
            const grants = record.GR.map((g) => g.trim()).join("; ")
            if (grants && !fields.note) {
                fields.note = this.setField("note", `Grants: ${grants}`)
            }
        }

        this.checkUnknownTags(record, entryKey)
        return {
            entry_key: entryKey,
            bib_type: bibType,
            fields,
        }
    }

    /**
     * Find the first LID value (then AID value) labelled with `[type]`,
     * e.g. "[doi]" or "[pii]", and return it without the label.
     *
     * @param record Raw record to search.
     * @param type Label text without brackets; matched case-insensitively.
     * @returns The bare identifier, or "" when no value carries the label.
     */
    private extractLID(record: NBIBRecord, type: string): string {
        // Escape the label so it is always matched literally.
        const escapedType = type.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")
        const pattern = new RegExp(`\\[${escapedType}\\]`, "i")
        for (const tagKey of ["LID", "AID"]) {
            for (const value of record[tagKey] || []) {
                if (pattern.test(value)) {
                    return value.replace(pattern, "").trim()
                }
            }
        }
        return ""
    }

    /**
     * Convert a MEDLINE "Date of Publication" string into a best-effort
     * EDTF/ISO 8601 date string.
     *
     * Examples:
     *   "2025"            → "2025"
     *   "2025 May"        → "2025-05"
     *   "2025 May 1"      → "2025-05-01"
     *   "2025 May-Jun"    → "2025-05/2025-06"
     *   "2000 Dec 23-30"  → "2000-12-23/2000-12-30"
     *   "2024 Winter"     → "2024" (the season is dropped)
     *
     * @param dp Raw DP value.
     * @returns The converted date, or the trimmed input when no pattern fits.
     */
    private parsePublicationDate(dp: string): string {
        dp = dp.trim()

        // Year only
        const yearOnly = dp.match(/^(\d{4})$/)
        if (yearOnly) {
            return yearOnly[1]
        }

        // "YYYY Season": only the year is kept.
        const withSeason = dp.match(
            /^(\d{4})\s+(spring|summer|fall|autumn|winter)$/i,
        )
        if (withSeason) {
            return withSeason[1]
        }

        // "YYYY Mon", "YYYY Mon-Mon", "YYYY Mon Day" or "YYYY Mon Day-Day"
        const yearMonthMatch = dp.match(
            /^(\d{4})\s+([A-Za-z]{3})(?:-([A-Za-z]{3}))?(?:\s+(\d{1,2})(?:-(\d{1,2}))?)?$/,
        )
        if (yearMonthMatch) {
            const months = NBIBParser.MONTH_NUMBERS
            const year = yearMonthMatch[1]
            const mon1 = months[yearMonthMatch[2].toLowerCase()]
            const mon2 = yearMonthMatch[3]
                ? months[yearMonthMatch[3].toLowerCase()]
                : undefined
            const day1 = yearMonthMatch[4]
                ? yearMonthMatch[4].padStart(2, "0")
                : undefined
            const day2 = yearMonthMatch[5]
                ? yearMonthMatch[5].padStart(2, "0")
                : undefined
            if (!mon1) {
                // Three letters that are not a month abbreviation.
                return year
            }
            const start = day1 ? `${year}-${mon1}-${day1}` : `${year}-${mon1}`
            if (mon2) {
                const end = day2
                    ? `${year}-${mon2}-${day2}`
                    : `${year}-${mon2}`
                return `${start}/${end}`
            }
            if (day1 && day2) {
                // Day range inside a single month, e.g. "2000 Dec 23-30".
                return `${start}/${year}-${mon1}-${day2}`
            }
            return start
        }

        // Fallback: hand the raw value to the consumer unchanged.
        return dp
    }

    /**
     * Record an `unknown_tag` warning for every tag of the record that is
     * not listed in `KNOWN_NBIB_TAGS`.
     *
     * @param record Raw record to inspect.
     * @param entryKey Key of the entry, stored in each warning.
     */
    private checkUnknownTags(record: NBIBRecord, entryKey: string): void {
        for (const tag of Object.keys(record)) {
            if (!KNOWN_NBIB_TAGS.has(tag)) {
                const value = this.getFirstValue(record[tag])
                this.warnings.push({
                    type: "unknown_tag",
                    tag,
                    // Keep the warning small: at most 100 characters.
                    value: value ? value.substring(0, 100) : undefined,
                    entry: entryKey,
                })
            }
        }
    }

    /**
     * @returns The first value of a tag, trimmed, or "" when the tag is
     *   absent or has no values.
     */
    private getFirstValue(values: string[] | undefined): string {
        if (!values || values.length === 0) {
            return ""
        }
        return values[0].trim()
    }

    /** Parse a list of name strings, dropping the ones that are empty. */
    private parseNames(names: string[]): NameDictObject[] {
        return names
            .map((name) => this.parseName(name.trim()))
            .filter((n): n is NameDictObject => n !== null)
    }

    /**
     * Parse a single author/editor name.
     *
     * Names containing a comma are read as "Family, Given" (the FAU form):
     *   "Vergneau-Grosset, Claire", "van der Berg, Jan Willem"
     * Names without a comma are read as the abbreviated AU form, where a
     * final token of one to four capital letters holds the initials:
     *   "Raulic J", "Smith AB"
     * Anything else is stored as a literal name.
     *
     * @returns The parsed name, or `null` for an empty string.
     */
    private parseName(nameText: string): NameDictObject | null {
        nameText = nameText.trim()
        if (!nameText) {
            return null
        }
        const nameObj: NameDictObject = {}

        if (nameText.includes(",")) {
            // "Family, Given": split at the first comma only.
            const commaIdx = nameText.indexOf(",")
            const family = nameText.slice(0, commaIdx).trim()
            const given = nameText.slice(commaIdx + 1).trim()
            nameObj.family = this.convertRichText(family)
            if (given) {
                nameObj.given = this.convertRichText(given)
            }
            return nameObj
        }

        const words = nameText.split(/\s+/)
        const lastWord = words[words.length - 1]
        if (words.length > 1 && /^[A-Z]{1,4}$/.test(lastWord)) {
            // "Smith AB": the trailing capitals are initials, the rest is
            // the family name.
            nameObj.family = this.convertRichText(words.slice(0, -1).join(" "))
            nameObj.given = this.convertRichText(lastWord)
        } else {
            // Single word, or no recognisable initials: keep as literal.
            nameObj.literal = this.convertRichText(nameText)
        }
        return nameObj
    }

    /**
     * Build the entry key for a record by calling `makeEntryKey` with a
     * fallback candidate, the set of keys used so far, the first author's
     * family name and the publication year.
     *
     * @param record Raw record.
     * @param index 1-based record position; used as candidate when the
     *   record has no PMID.
     */
    private generateEntryKey(record: NBIBRecord, index: number): string {
        // Prefer the full author name, fall back to the abbreviated one.
        const firstAuthor =
            this.getFirstValue(record.FAU) || this.getFirstValue(record.AU)
        const dp = this.getFirstValue(record.DP)
        const year = dp ? (dp.match(/\d{4}/)?.[0] ?? "") : ""

        let lastName: string | undefined
        if (firstAuthor) {
            // With a comma: everything before it. Without: every token but
            // the last (the initials), or the only token there is.
            const family = firstAuthor.includes(",")
                ? firstAuthor.split(",")[0].trim()
                : firstAuthor.split(/\s+/).slice(0, -1).join("") ||
                  firstAuthor.split(/\s+/)[0]
            const cleanFamily = family.replace(/[^A-Za-z0-9]/g, "")
            if (cleanFamily) lastName = cleanFamily
        }

        // Candidate is "pmid{number}" when a PMID exists, else the index.
        // NOTE(review): presumably makeEntryKey falls back to the candidate
        // when no author/year key can be built — confirm in ./tools.
        const pmid = this.getFirstValue(record.PMID)
        const candidate = pmid ? `pmid${pmid.trim()}` : String(index)
        return makeEntryKey(
            candidate,
            this.usedKeys,
            lastName,
            year || undefined,
        )
    }

    /**
     * Convert a pagination string such as "315-330", "e123" or
     * "315-330, e1-e5" into a list of ranges.
     *
     * Segments are separated by "," or ";". A segment containing a hyphen,
     * en dash or em dash between two word characters becomes a two-element
     * range `[start, end]`; any other segment becomes a one-element range.
     * Empty segments (from stray separators) are skipped.
     *
     * @param value Raw PG value.
     * @returns One range per segment, or an empty list for empty input.
     */
    private convertRange(value: string): RangeArray[] {
        if (!value) {
            return []
        }
        return String(value)
            .split(/,|;/)
            .map((segment) => segment.trim())
            .filter(Boolean)
            .map((segment) => {
                const parts = segment.split(/(?<=\w)[-–—](?=\w)/)
                if (parts.length >= 2) {
                    // Start and end each get their own node list. Anything
                    // after the first dash belongs to the end of the range.
                    const [first, ...rest] = parts
                    return [
                        [{ type: "text" as const, text: first.trim() }],
                        [
                            {
                                type: "text" as const,
                                text: rest.join("-").trim(),
                            },
                        ],
                    ] as RangeArray
                }
                return [
                    [{ type: "text" as const, text: segment }],
                ] as RangeArray
            })
    }

    /**
     * Shape a plain text value according to the type that `BibFieldTypes`
     * declares for the field:
     *  - l_literal → NodeArray[] (one NodeArray in an array)
     *  - f_verbatim / f_uri / f_date → the plain string
     *  - f_key → the matching option key, or undefined if nothing matches
     *  - everything else → NodeArray
     *
     * @param fieldKey Internal field name, e.g. "title" or "langid".
     * @param text Plain text value taken from the record.
     */
    private setField(
        fieldKey: string,
        text: string,
    ): NodeArray | NodeArray[] | string | undefined {
        const fieldDef = BibFieldTypes[fieldKey]
        const fieldType = fieldDef?.type
        if (fieldType === "l_literal") {
            return [this.convertRichText(text)]
        } else if (
            fieldType === "f_verbatim" ||
            fieldType === "f_uri" ||
            fieldType === "f_date"
        ) {
            return text
        } else if (fieldType === "f_key") {
            const options = fieldDef?.options
            if (Array.isArray(options)) {
                // Array options: case-insensitive match against the list.
                const lower = text.toLowerCase().trim()
                const matched = options.find(
                    (k: string) => k.toLowerCase() === lower,
                )
                return matched // undefined if no match
            } else if (options) {
                // Object options (e.g. langid): resolved by lookupLangid.
                return lookupLangid(text) // undefined if no match
            }
            return text
        }
        return this.convertRichText(text)
    }

    /**
     * Wrap plain text in a single-node NodeArray, decoding the character
     * entities `amp`, `lt`, `gt` and `quot` and turning line-feed or
     * carriage-return character references (decimal 10/13, hex A/D) into
     * "\n".
     *
     * Decoding happens in one pass, so text produced by one replacement is
     * never decoded a second time.
     *
     * @param text Plain text; non-string values are stringified as they are.
     */
    private convertRichText(text: string): NodeArray {
        if (typeof text !== "string") {
            return [{ type: "text", text: String(text) }]
        }
        if (!text) {
            return [{ type: "text", text: "" }]
        }
        const decodedText = text.replace(
            NBIBParser.ENTITY_PATTERN,
            (_match: string, name?: string) =>
                name ? NBIBParser.NAMED_ENTITIES[name] : "\n",
        )
        return [{ type: "text", text: decodedText }]
    }
}
/**
 * Convenience entry point: parse NBIB text with a fresh `NBIBParser`.
 *
 * @param input Complete contents of an NBIB file.
 * @returns Converted entries plus the errors and warnings collected.
 */
export function parseNBIB(input: string): NBIBParseResult {
    const parser = new NBIBParser(input)
    return parser.parse()
}