biblatex-csl-converter

Version:

Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity

github.com/fiduswriter/biblatex-csl-converter

fiduswriter/biblatex-csl-converter

1,519 lines (1,397 loc) • 53.8 kB

text/typescript

/** * EndNote XML parser * Supports EndNote XML export format (both EndNote.dtd and RSXML.dtd variants) * as well as EndNote Cite While You Write format */ import { BibFieldTypes, BibTypes, type EntryObject, type NameDictObject, type NodeArray, type RangeArray, } from "../const" import { lookupLangid, makeEntryKey } from "./tools" // EndNote reference type name to BibType mapping // Direct mapping to internal BibType names // Source: Endnote 21 User Guide, p. 248-249 const EndNoteTypeMap: Record<string, string> = { "Aggregated Database": "misc", "Ancient Text": "classic", Artwork: "artwork", "Audiovisual Material": "video", Bill: "legislation", Blog: "online", Book: "book", "Book Section": "inbook", Case: "legal_case", Catalog: "book", "Chart or Table": "figure", "Classical Work": "classic", "Computer Program": "software", "Conference Paper": "inproceedings", "Conference Proceedings": "proceedings", Dataset: "dataset", Dictionary: "inreference", "Discussion Forum": "post", "Edited Book": "collection", "Electronic Article": "article", "Electronic Book": "book", "Electronic Book Section": "inbook", Encyclopedia: "inreference", Equation: "misc", Figure: "figure", "Film or Broadcast": "video", Generic: "misc", "Government Document": "report", Grant: "report", Hearing: "hearing", Interview: "interview", "Journal Article": "article-journal", "Legal Rule or Regulation": "legislation", "Magazine Article": "article-magazine", Manuscript: "unpublished", Map: "map", "Multimedia Application": "software", Music: "audio", "Newspaper Article": "article-newspaper", "Online Database": "misc", "Online Multimedia": "online", Pamphlet: "booklet", Patent: "patent", "Personal Communication": "personal_communication", Podcast: "audio", "Press Release": "misc", Report: "report", Serial: "book", "Social Media": "post", Standard: "standard", Statute: "legislation", "Television Episode": "video", Thesis: "thesis", "Unpublished Work": "unpublished", "Web Page": "online", } interface ErrorObject { type: string field?: string value?: unknown entry?: string } export interface EndNoteParseResult { entries: Record<number, EntryObject> errors: ErrorObject[] warnings: ErrorObject[] } // Style element for formatted text interface EndNoteStyle { "#text"?: string color?: string face?: string font?: string size?: string } // Field value that may contain styled text type EndNoteStyledValue = | string | { "#text"?: string; style?: EndNoteStyle | EndNoteStyle[] } | EndNoteStyle interface EndNoteAuthor { "#text"?: string style?: EndNoteStyle | EndNoteStyle[] "corp-name"?: string "first-name"?: string initials?: string "last-name"?: string "middle-initial"?: string role?: string salutation?: string suffix?: string title?: string } interface EndNoteDate { "#text"?: string style?: EndNoteStyle | EndNoteStyle[] day?: string julian?: string month?: string year?: string } interface EndNoteUrl { "#text"?: string style?: EndNoteStyle | EndNoteStyle[] "has-ut"?: "yes" | "no" "ppv-app"?: string "ppv-ref"?: "yes" | "no" "ppv-ut"?: string } interface EndNoteUrlGroup { url?: EndNoteUrl | EndNoteUrl[] } interface EndNoteUrls { "web-urls"?: EndNoteUrlGroup "pdf-urls"?: EndNoteUrlGroup "text-urls"?: EndNoteUrlGroup "related-urls"?: EndNoteUrlGroup "image-urls"?: EndNoteUrlGroup } interface EndNoteRecord { // Core elements database?: string | { "#text"?: string; name?: string; path?: string } "source-app"?: | string | { "#text"?: string; name?: string; version?: string } "rec-number"?: string | number "foreign-keys"?: { key?: | { "#text"?: string app?: string "db-id"?: string timestamp?: string } | Array<{ "#text"?: string app?: string "db-id"?: string timestamp?: string }> } "ref-type"?: string | { "#text"?: string; name?: string } // Contributors contributors?: { authors?: { author?: EndNoteAuthor | EndNoteAuthor[] } "secondary-authors"?: { author?: EndNoteAuthor | EndNoteAuthor[] } "tertiary-authors"?: { author?: EndNoteAuthor | EndNoteAuthor[] } "subsidiary-authors"?: { author?: EndNoteAuthor | EndNoteAuthor[] } "translated-authors"?: { author?: EndNoteAuthor | EndNoteAuthor[] } editors?: { editor?: EndNoteAuthor | EndNoteAuthor[] } translators?: { translator?: EndNoteAuthor | EndNoteAuthor[] } } "auth-address"?: EndNoteStyledValue "auth-affiliaton"?: EndNoteStyledValue // Titles titles?: { title?: EndNoteStyledValue "secondary-title"?: EndNoteStyledValue "tertiary-title"?: EndNoteStyledValue "alt-title"?: EndNoteStyledValue "short-title"?: EndNoteStyledValue "translated-title"?: EndNoteStyledValue } // Periodical periodical?: { "full-title"?: EndNoteStyledValue "abbr-1"?: EndNoteStyledValue "abbr-2"?: EndNoteStyledValue "abbr-3"?: EndNoteStyledValue } // Volume/issue pages?: | EndNoteStyledValue | { "#text"?: string; start?: string; end?: string } volume?: EndNoteStyledValue number?: EndNoteStyledValue issue?: EndNoteStyledValue "secondary-volume"?: EndNoteStyledValue "secondary-issue"?: EndNoteStyledValue "num-vols"?: EndNoteStyledValue edition?: EndNoteStyledValue section?: EndNoteStyledValue "reprint-edition"?: EndNoteStyledValue "reprint-status"?: { date?: string status: "in-file" | "no-file" | "on-request" } // Keywords keywords?: { keyword?: EndNoteStyledValue | EndNoteStyledValue[] } // Dates dates?: { year?: EndNoteDate | EndNoteDate[] "pub-dates"?: { date?: EndNoteDate | EndNoteDate[] } "copyright-dates"?: { date?: EndNoteDate | EndNoteDate[] } month?: EndNoteStyledValue day?: EndNoteStyledValue } // Publisher "pub-location"?: EndNoteStyledValue publisher?: EndNoteStyledValue "orig-pub"?: EndNoteStyledValue // Identifiers isbn?: EndNoteStyledValue issn?: EndNoteStyledValue "accession-num"?: EndNoteStyledValue "call-num"?: EndNoteStyledValue "report-id"?: EndNoteStyledValue coden?: EndNoteStyledValue "electronic-resource-num"?: EndNoteStyledValue // Abstract/Notes abstract?: EndNoteStyledValue label?: EndNoteStyledValue image?: string | { "#text"?: string; file?: string; name?: string } caption?: EndNoteStyledValue notes?: EndNoteStyledValue "research-notes"?: EndNoteStyledValue // Other "work-type"?: EndNoteStyledValue "reviewed-item"?: EndNoteStyledValue availability?: EndNoteStyledValue "remote-source"?: EndNoteStyledValue "meeting-place"?: EndNoteStyledValue "work-location"?: EndNoteStyledValue "work-extent"?: EndNoteStyledValue "pack-method"?: EndNoteStyledValue size?: EndNoteStyledValue "repro-ratio"?: EndNoteStyledValue "remote-database-name"?: EndNoteStyledValue "remote-database-provider"?: EndNoteStyledValue language?: EndNoteStyledValue // URLs urls?: EndNoteUrls // Record info "access-date"?: EndNoteStyledValue "modified-date"?: EndNoteStyledValue // Custom/Misc custom1?: EndNoteStyledValue custom2?: EndNoteStyledValue custom3?: EndNoteStyledValue custom4?: EndNoteStyledValue custom5?: EndNoteStyledValue custom6?: EndNoteStyledValue custom7?: EndNoteStyledValue misc1?: EndNoteStyledValue misc2?: EndNoteStyledValue misc3?: EndNoteStyledValue [key: string]: unknown } export class EndNoteParser { input: EndNoteRecord[] entries: EntryObject[] errors: ErrorObject[] warnings: ErrorObject[] private usedKeys: Set<string> = new Set() /** * Maps each record's `rec-number` string to the final `entry_key` assigned * after normalisation. Populated during `parse()` so that callers (e.g. * `DocxCitationsParser`) can resolve a rec-number back to the actual key * used in the returned entries without fragile suffix-matching heuristics. */ recNumberToEntryKey: Map<string, string> = new Map() constructor(input: EndNoteRecord[]) { this.input = Array.isArray(input) ? input : [input] this.entries = [] this.errors = [] this.warnings = [] } parse(): EndNoteParseResult { // Convert each EndNote entry to internal format for (let i = 0; i < this.input.length; i++) { const record = this.input[i] const convertedEntry = this.convertRecord(record, i + 1) if (convertedEntry) { this.entries.push(convertedEntry) } } // Create numbered index const entries: Record<number, EntryObject> = {} this.entries.forEach((entry, index) => { entries[index + 1] = entry }) return { entries, errors: this.errors, warnings: this.warnings, } } private convertRecord( record: EndNoteRecord, index: number, ): EntryObject | false { // Get the reference type and map directly to BibType const refType = this.getRefType(record) const mappedBibType = EndNoteTypeMap[refType] const bibType = mappedBibType || "misc" // Warn when the EndNote ref-type string is not recognised at all if (!mappedBibType) { this.warnings.push({ type: "unknown_type", value: refType, entry: String(index), }) } else if (!BibTypes[bibType]) { // The mapped type itself is not a known BibType — treat as error this.errors.push({ type: "unknown_type", value: refType, entry: String(index), }) return false } const entryKey = this.generateEntryKey(record, index) const fields: Record<string, unknown> = {} const processedFields: Set<string> = new Set() const unhandledData: string[] = [] // Mark core fields as processed processedFields.add("rec-number") processedFields.add("ref-type") processedFields.add("foreign-keys") processedFields.add("database") processedFields.add("source-app") // Extract and convert titles this.extractField( record.titles?.title, fields, "title", processedFields, "titles.title", ) this.extractField( record.titles?.["secondary-title"], fields, "booktitle", processedFields, "titles.secondary-title", ) this.extractField( record.titles?.["tertiary-title"], fields, "series", processedFields, "titles.tertiary-title", ) this.extractField( record.titles?.["short-title"], fields, "shorttitle", processedFields, "titles.short-title", ) // Handle journal title from periodical or secondary-title processedFields.add("periodical") const periodicalTitle = this.getTextContent( record.periodical?.["full-title"], ) const periodicalAbbr = this.getTextContent( record.periodical?.["abbr-1"], ) if (periodicalTitle) { fields.journaltitle = this.convertRichText(periodicalTitle) } else if (!fields.booktitle) { // If no booktitle was set from secondary-title, try it as journal const secondaryTitle = this.getTextContent( record.titles?.["secondary-title"], ) if (secondaryTitle) { fields.journaltitle = this.convertRichText(secondaryTitle) } } if (periodicalAbbr) { fields.shortjournal = this.convertRichText(periodicalAbbr) } // Volume/issue/number fields this.extractField( record.volume, fields, "volume", processedFields, "volume", ) this.extractField( record.number, fields, "number", processedFields, "number", ) this.extractField( record.issue, fields, "issue", processedFields, "issue", ) this.extractField( record["secondary-volume"], fields, "volume", processedFields, "secondary-volume", ) this.extractField( record["secondary-issue"], fields, "issue", processedFields, "secondary-issue", ) this.extractField( record["num-vols"], fields, "volumes", processedFields, "num-vols", ) this.extractField( record.edition, fields, "edition", processedFields, "edition", ) this.extractField( record.section, fields, "chapter", processedFields, "section", ) // Pages with special handling for start/end attributes processedFields.add("pages") if (record.pages) { const pagesText = this.getTextContent(record.pages) const startPage = typeof record.pages === "object" && "start" in record.pages ? record.pages.start : null const endPage = typeof record.pages === "object" && "end" in record.pages ? record.pages.end : null if (startPage && endPage && !pagesText) { fields.pages = [ [ [ { type: "text", text: String(startPage) }, { type: "text", text: String(endPage) }, ], ], ] } else if (pagesText) { fields.pages = this.convertRange(pagesText) } } // Publisher info this.extractField( record.publisher, fields, "publisher", processedFields, "publisher", ) this.extractField( record["pub-location"], fields, "location", processedFields, "pub-location", ) this.extractField( record["orig-pub"], fields, "origpublisher", processedFields, "orig-pub", ) // Identifiers this.extractField(record.isbn, fields, "isbn", processedFields, "isbn") this.extractField(record.issn, fields, "issn", processedFields, "issn") this.extractField( record["accession-num"], fields, "eprint", processedFields, "accession-num", ) this.extractField( record["call-num"], fields, "library", processedFields, "call-num", ) this.extractField( record["report-id"], fields, "number", processedFields, "report-id", ) this.extractField( record.coden, fields, "coden", processedFields, "coden", ) this.extractField( record["electronic-resource-num"], fields, "doi", processedFields, "electronic-resource-num", ) // Abstract and notes this.extractField( record.abstract, fields, "abstract", processedFields, "abstract", ) this.extractField( record.notes, fields, "note", processedFields, "notes", ) this.extractField( record["research-notes"], fields, "annotation", processedFields, "research-notes", ) this.extractField( record.caption, fields, "annotation", processedFields, "caption", ) this.extractField( record.label, fields, "label", processedFields, "label", ) // Language this.extractField( record.language, fields, "langid", processedFields, "language", ) // Work type processedFields.add("work-type") const workType = this.getTextContent(record["work-type"]) if (workType && bibType === "misc") { fields.type = { type: "text", text: workType } } // Reviewed item processedFields.add("reviewed-item") const reviewedItem = this.getTextContent(record["reviewed-item"]) if (reviewedItem) { fields.related = this.convertRichText(reviewedItem) } // Custom fields for (let i = 1; i <= 7; i++) { const fieldName = `custom${i}` processedFields.add(fieldName) const customField = record[fieldName as keyof EndNoteRecord] if (customField) { const customText = this.getTextContent(customField) if (customText) { fields[fieldName] = this.convertRichText(customText) } } } // Misc fields for (let i = 1; i <= 3; i++) { const fieldName = `misc${i}` processedFields.add(fieldName) const miscField = record[fieldName as keyof EndNoteRecord] if (miscField) { const miscText = this.getTextContent(miscField) if (miscText) { fields[fieldName] = this.convertRichText(miscText) } } } // URLs - combine all URL types processedFields.add("urls") const urls: string[] = [] const urlFields = [ record.urls?.["web-urls"], record.urls?.["related-urls"], record.urls?.["pdf-urls"], record.urls?.["text-urls"], ] for (const urlGroup of urlFields) { if (urlGroup?.url) { const urlArray = Array.isArray(urlGroup.url) ? urlGroup.url : [urlGroup.url] for (const url of urlArray) { const urlText = this.getTextContent(url) if (urlText) { urls.push(urlText) } } } } if (urls.length > 0) { // url is f_uri — store as a plain string, not a NodeArray fields.url = urls.length > 1 ? urls.join("; ") : urls[0] } // Access date this.extractField( record["access-date"], fields, "urldate", processedFields, "access-date", ) // Mark dates as processed (handled separately) processedFields.add("dates") // Mark contributors as processed if (record.contributors) { processedFields.add("contributors") } if (record.keywords) { processedFields.add("keywords") } if (record.image) { processedFields.add("image") } if (record.titles) { processedFields.add("titles") } if (record.periodical) { processedFields.add("periodical") } if (record["reprint-status"]) { processedFields.add("reprint-status") } if (record["auth-address"]) { processedFields.add("auth-address") } if (record["auth-affiliaton"]) { processedFields.add("auth-affiliaton") } if (record["modified-date"]) { processedFields.add("modified-date") } // Handle dates const dateValue = this.extractDate(record, processedFields) if (dateValue) { fields.date = dateValue } // Copyright date const copyrightDate = this.extractCopyrightDate(record, processedFields) if (copyrightDate?.length) { fields.origdate = copyrightDate } // Handle translated authors: in EndNote, "translated-authors" are the // original authors of the source work that was translated. The "authors" // field in that case holds the translators of the work. const translatedAuthors = this.extractAuthors( record.contributors?.["translated-authors"]?.author, ) // Handle primary authors const authors = this.extractAuthors( record.contributors?.authors?.author, ) if (translatedAuthors.length > 0) { // When original authors are recorded via translated-authors, the // regular authors are actually the translators. fields.author = translatedAuthors if (authors.length > 0) { fields.translator = authors } } else if (authors.length > 0) { fields.author = authors } // Handle secondary authors (editors) const secondaryAuthors = this.extractAuthors( record.contributors?.["secondary-authors"]?.author, ) if (secondaryAuthors.length > 0) { fields.editor = secondaryAuthors } // Handle tertiary authors (book authors for inbook/book types) const tertiaryAuthors = this.extractAuthors( record.contributors?.["tertiary-authors"]?.author, ) if (tertiaryAuthors.length > 0) { // For book sections, tertiary authors are the book authors if (bibType === "inbook" || bibType === "book") { fields.bookauthor = tertiaryAuthors } } // Handle subsidiary authors const subsidiaryAuthors = this.extractAuthors( record.contributors?.["subsidiary-authors"]?.author, ) if (subsidiaryAuthors.length > 0) { fields.editora = subsidiaryAuthors } // Handle keywords const keywords = this.extractKeywords( record.keywords?.keyword, processedFields, ) if (keywords.length > 0) { fields.keywords = keywords } // Handle image if (record.image) { const imageText = typeof record.image === "string" ? record.image : record.image.file || record.image.name if (imageText) { fields.file = { type: "text", text: imageText } } } // Check for unprocessed fields and add warnings // Warn about missing title if (!fields.title) { this.warnings.push({ type: "missing_required_field", field: "title", entry: entryKey, }) } // Warn about missing author/editor when neither is present if (!fields.author && !fields.editor) { this.warnings.push({ type: "missing_required_field", field: "author", entry: entryKey, }) } // Warn about missing date if (!fields.date) { this.warnings.push({ type: "missing_required_field", field: "date", entry: entryKey, }) } this.checkUnhandledFields( record, processedFields, index, fields, unhandledData, ) // Add unhandled data to note field if there's any if (unhandledData.length > 0 && !fields.note) { fields.note = this.convertRichText( `EndNote import: Unhandled fields - ${unhandledData.join("; ")}`, ) } return { entry_key: entryKey, bib_type: bibType, fields, } } private generateEntryKey(record: EndNoteRecord, index: number): string { // Try to get the first primary author's last name for the key. const authorsRaw = record.contributors?.authors?.author let authorsArr: EndNoteAuthor[] if (!authorsRaw) { authorsArr = [] } else if (Array.isArray(authorsRaw)) { authorsArr = authorsRaw } else { authorsArr = [authorsRaw] } const firstAuthor = authorsArr[0] let lastName: string | undefined if (firstAuthor) { const raw = firstAuthor["last-name"] || firstAuthor["#text"] || firstAuthor["corp-name"] || "" const clean = (typeof raw === "string" ? raw : "").replace( /[^A-Za-z0-9]/g, "", ) if (clean) lastName = clean } // Extract a four-digit year from the dates structure. const datesYear = record.dates?.year const yearNode = Array.isArray(datesYear) ? datesYear[0] : datesYear let year: string | undefined if (yearNode) { const rawYear = typeof yearNode === "string" ? yearNode : (yearNode["#text"] ?? "") const m = String(rawYear).match(/\d{4}/) if (m) year = m[0] } // Use rec-number as the candidate so the numeric identifier is // preserved in the key prefix ("ref{recNum}") when no author/year // are available. const candidate = String(record["rec-number"] || index) const key = makeEntryKey(candidate, this.usedKeys, lastName, year) // Record rec-number → entry_key so callers can do an O(1) lookup. const recNum = String(record["rec-number"] ?? "") if (recNum) { this.recNumberToEntryKey.set(recNum, key) } return key } private getRefType(record: EndNoteRecord): string { const refType = record["ref-type"] if (typeof refType === "object" && refType) { return refType.name || String(refType["#text"] || "Generic") } return String(refType || "Generic") } private getTextContent( value: EndNoteStyledValue | EndNoteAuthor | EndNoteDate | undefined, ): string { if (!value) { return "" } if (typeof value === "string") { return value } // Direct text content if ("#text" in value && value["#text"]) { return value["#text"] } // Handle author attributes directly if ("last-name" in value || "corp-name" in value) { const author = value as EndNoteAuthor if (author["corp-name"]) { return author["corp-name"] } if (author["last-name"]) { const parts: string[] = [author["last-name"]] if (author["first-name"]) { parts.unshift(author["first-name"]) } else if (author.initials) { parts.unshift(author.initials) } return parts.join(", ") } } // Handle date attributes if ("year" in value && (value as EndNoteDate).year) { return (value as EndNoteDate).year || "" } // Handle style element directly if ("style" in value && value.style) { const style = value.style if (typeof style === "string") { return style } if (Array.isArray(style)) { return style.map((s) => s["#text"] || "").join("") } return style["#text"] || "" } return "" } // eslint-disable-next-line max-params private extractField( value: EndNoteStyledValue | undefined, fields: Record<string, unknown>, targetField: string, processedFields?: Set<string>, sourceField?: string, ): void { if (!value) { return } const textContent = this.getTextContent(value) if (!textContent) { return } if (processedFields && sourceField) { processedFields.add(sourceField) } const fieldDef = BibFieldTypes[targetField] const fieldType = fieldDef?.type if (fieldType === "l_range") { fields[targetField] = this.convertRange(textContent) } else if (fieldType === "l_literal") { // l_literal expects NodeArray[] — an array of NodeArrays fields[targetField] = [this.convertRichText(textContent)] } else if ( fieldType === "f_verbatim" || fieldType === "f_uri" || fieldType === "f_date" ) { // verbatim, URI, and date fields are stored as plain strings fields[targetField] = textContent } else if (fieldType === "f_key") { // f_key fields (e.g. langid) must be stored as a key string that // matches one of the option keys in the field definition. const options = fieldDef?.options if (Array.isArray(options)) { // Array options (e.g. bookpagination, type): plain string match const lower = textContent.toLowerCase().trim() const matched = options.find( (k: string) => k.toLowerCase() === lower, ) if (matched) { fields[targetField] = matched } } else if (options) { // Object options (e.g. langid): use shared lookup that handles // BCP-47 codes, ISO 639-2 codes, full names, biblatex aliases const matched = lookupLangid(textContent) if (matched) { fields[targetField] = matched } // If no match is found, omit the field entirely rather than // storing an invalid key that would break the exporters. } else { fields[targetField] = textContent } } else { fields[targetField] = this.convertRichText(textContent) } } // eslint-disable-next-line max-params private extractDate( record: EndNoteRecord, processedFields?: Set<string>, ): string { // Always read the base year from the <year> element first so it can be // used as a fallback when pub-dates only carries month/day information // (e.g. Mendeley exports <pub-dates><date>4</date></pub-dates> for April). let baseYear = "" const yearEl = record.dates?.year if (yearEl) { if (processedFields) processedFields.add("dates.year") baseYear = Array.isArray(yearEl) ? this.getTextContent(yearEl[0]) : yearEl.year || this.getTextContent(yearEl) || yearEl["#text"] || "" } // Try pub-dates for month / day refinement const pubDates = record.dates?.["pub-dates"] if (pubDates?.date) { if (processedFields) processedFields.add("dates.pub-dates") const dates = Array.isArray(pubDates.date) ? pubDates.date : [pubDates.date] const dateTexts: string[] = [] for (const d of dates) { const dYear = d.year || "" const month = d.month const day = d.day if (dYear || month || day) { // Has structured attributes — use explicit year or fall back // to the base year extracted from <year>. const effectiveYear = dYear || baseYear let dateStr = effectiveYear if (month) { dateStr += `-${month.padStart(2, "0")}` if (day) { dateStr += `-${day.padStart(2, "0")}` } } dateTexts.push(dateStr) } else { const text = this.getTextContent(d) if (text) { const trimmed = text.trim() const parsed = this.parsePubDateText(trimmed, baseYear) if (parsed) { dateTexts.push(parsed) } // If parsePubDateText returns null the text could not // be interpreted as a real calendar date — skip it and // let the fallback below use the plain <year> element. } } } if (dateTexts.length > 0) { return dateTexts.join("/") } } // Fall back to the year element alone (with any inline month/day attrs) if (baseYear) { const month = (yearEl && !Array.isArray(yearEl) ? yearEl.month : null) || this.getTextContent(record.dates?.month) const day = (yearEl && !Array.isArray(yearEl) ? yearEl.day : null) || this.getTextContent(record.dates?.day) let dateStr = baseYear if (month) { dateStr += `-${month.padStart(2, "0")}` if (day) { dateStr += `-${day.padStart(2, "0")}` } } return dateStr } return "" } /** * Try to turn a free-text pub-date string from an EndNote `<date>` element * into a valid EDTF / ISO-8601 date string (YYYY, YYYY-MM, or YYYY-MM-DD), * or preserve it verbatim when it carries human-readable date information * that cannot be normalised. * * EndNote lets users type arbitrary text in the publication-date field, so * the content is unreliable. Known patterns encountered in real exports: * * "2009" → "2009" bare four-digit year * "4" → "YYYY-04" bare integer 1–12 (Mendeley month) * "April 2005" → "2005-04" month-name + year (any order) * "Apr 2005" → "2005-04" abbreviated month name * "2005 April" → "2005-04" year-first variant * "Apr. 2005" → "2005-04" abbreviated with period * "August 02" → "YYYY-08-02" month + day, no year → uses baseYear * "01 Jan. 2020" → "2020-01-01" DD Mon. YYYY * "2012/07/01/" → "2012-07-01" YYYY/MM/DD/ (trailing slash) * "2021/10/01/" → "2021-10-01" same * "2012/06/01" → "2012-06-01" YYYY/MM/DD (no trailing slash) * "2009/001/001" → null EndNote pseudo-date (invalid month 001) * "10/31/print" → null MM/DD/garbage — no usable year * "15-17 June 2021"→ (verbatim) complex range — kept as-is * "Mar" → (verbatim) bare month name, no baseYear available * * The only values actively discarded (returning `null`, causing the caller * to fall back to the plain `<year>` element) are those that contain no * recoverable date information at all — specifically the EndNote * `YYYY/NNN/NNN` pseudo-date format and similar non-date constructs. * Everything else is either normalised to ISO 8601 or returned verbatim. * * @param trimmed Already-trimmed text content of the `<date>` element. * @param baseYear Year string from the sibling `<year>` element, used when * the pub-date text supplies only a month (and/or day). * @returns A date string (normalised or verbatim), or `null` to signal * "no usable date information here". */ private parsePubDateText(trimmed: string, baseYear: string): string | null { // Shared month-name lookup (full names + standard abbreviations). const monthMap: Record<string, string> = { january: "01", february: "02", march: "03", april: "04", may: "05", june: "06", july: "07", august: "08", september: "09", october: "10", november: "11", december: "12", jan: "01", feb: "02", mar: "03", apr: "04", jun: "06", jul: "07", aug: "08", sep: "09", oct: "10", nov: "11", dec: "12", } // Helper: normalise a month token ("Jan", "Jan.", "january") → "01" | undefined const resolveMonth = (tok: string): string | undefined => monthMap[tok.replace(/\.$/, "").toLowerCase()] // ── 1. Bare four-digit year ─────────────────────────────────────────── if (/^\d{4}$/.test(trimmed)) { return trimmed } // ── 2. Bare integer 1–12 — Mendeley month number ───────────────────── // Must be an exact integer string with no leading zeros so that we // don't accidentally treat "01" (a day token elsewhere) as a month. const bareInt = parseInt(trimmed, 10) if ( !Number.isNaN(bareInt) && bareInt >= 1 && bareInt <= 12 && String(bareInt) === trimmed ) { if (!baseYear) return null return `${baseYear}-${trimmed.padStart(2, "0")}` } // ── 3. Slash-separated numeric dates: YYYY/MM/DD[/] ────────────────── // EndNote sometimes emits "2012/07/01/" (note trailing slash). // We accept both with and without the trailing slash and validate the // month/day ranges so that the "2009/001/001" pseudo-date (invalid // month 001) is correctly rejected here. // Tokens are limited to 1–2 digits so that zero-padded issue numbers // like "001" (3 digits) are always rejected rather than silently // parsed as integer 1 and accepted as a valid month. const slashDateRe = /^(\d{4})\/(\d{1,2})\/(\d{1,2})\/?$/ const slashMatch = slashDateRe.exec(trimmed) if (slashMatch) { const mm = parseInt(slashMatch[2], 10) const dd = parseInt(slashMatch[3], 10) if (mm >= 1 && mm <= 12 && dd >= 1 && dd <= 31) { // Valid calendar date — normalise to ISO 8601 with 2-digit padding return `${slashMatch[1]}-${String(mm).padStart( 2, "0", )}-${String(dd).padStart(2, "0")}` } // Out-of-range values → not a real calendar date return null } // Slash-date with 3+-digit tokens (e.g. "2009/001/001") — pseudo-date if (/^\d{4}\/\d{3,}\/\d+$|^\d{4}\/\d+\/\d{3,}/.test(trimmed)) { return null } // ── 4. "DD Mon. YYYY" — e.g. "01 Jan. 2020" ───────────────────────── const ddMonYearRe = /^(\d{1,2})\s+([a-z]+\.?)\s+(\d{4})$/i const ddMonYearMatch = ddMonYearRe.exec(trimmed) if (ddMonYearMatch) { const monthNum = resolveMonth(ddMonYearMatch[2]) if (monthNum) { const day = parseInt(ddMonYearMatch[1], 10) if (day >= 1 && day <= 31) { return `${ ddMonYearMatch[3] }-${monthNum}-${ddMonYearMatch[1].padStart(2, "0")}` } } } // ── 5. "Month YYYY" or "YYYY Month" — e.g. "April 2005", "2005 Apr." ─ const monthYearRe = /^([a-z]+\.?)\s+(\d{4})$|^(\d{4})\s+([a-z]+\.?)$/i const myMatch = monthYearRe.exec(trimmed) if (myMatch) { const monthStr = myMatch[1] ?? myMatch[4] const yearStr = myMatch[2] ?? myMatch[3] const monthNum = resolveMonth(monthStr) if (monthNum) { return `${yearStr}-${monthNum}` } // Recognised the year but not the month name — return just the year. return yearStr } // ── 6. "Month DD" — e.g. "August 02" — use baseYear ───────────────── const monthDayRe = /^([a-z]+\.?)\s+(\d{1,2})$/i const mdMatch = monthDayRe.exec(trimmed) if (mdMatch) { const monthNum = resolveMonth(mdMatch[1]) const day = parseInt(mdMatch[2], 10) if (monthNum && day >= 1 && day <= 31 && baseYear) { return `${baseYear}-${monthNum}-${mdMatch[2].padStart(2, "0")}` } // Can't build a full date — fall through to verbatim return } // ── 7. Bare month name — e.g. "Mar", "March" ───────────────────────── const monthOnlyRe = /^([a-z]+\.?)$/i const moMatch = monthOnlyRe.exec(trimmed) if (moMatch) { const monthNum = resolveMonth(moMatch[1]) if (monthNum && baseYear) { return `${baseYear}-${monthNum}` } // Known month but no year, or unknown word — keep verbatim so the // caller preserves whatever signal is present. return trimmed } // ── 8. Everything else — keep verbatim ─────────────────────────────── // Complex strings like "15-17 June 2021" that we cannot normalise are // passed through unchanged so that human-readable date information is // not silently discarded. return trimmed } private extractCopyrightDate( record: EndNoteRecord, processedFields?: Set<string>, ): string { const copyrightDates = record.dates?.["copyright-dates"] if (copyrightDates?.date) { if (processedFields) processedFields.add("dates.copyright-dates") const dates = Array.isArray(copyrightDates.date) ? copyrightDates.date : [copyrightDates.date] const dateTexts: string[] = [] for (const d of dates) { const year = d.year || this.getTextContent(d) if (year) dateTexts.push(year) } if (dateTexts.length > 0) { return dateTexts.join("/") } } return "" } private extractAuthors( authorsData: EndNoteAuthor | EndNoteAuthor[] | undefined, ): NameDictObject[] { if (!authorsData) { return [] } const authors = Array.isArray(authorsData) ? authorsData : [authorsData] const names: NameDictObject[] = [] for (const author of authors) { const nameObj = this.parseAuthor(author) if (nameObj) { names.push(nameObj) } } return names } private parseAuthor(author: EndNoteAuthor): NameDictObject | null { const nameObj: NameDictObject = {} // First check for structured name attributes const lastName = author["last-name"] const firstName = author["first-name"] const initials = author.initials const suffix = author.suffix const corpName = author["corp-name"] if (corpName) { // Corporate/institutional author nameObj.literal = this.convertRichText(corpName) return nameObj } if (lastName) { nameObj.family = this.convertRichText(lastName) if (firstName) { nameObj.given = this.convertRichText(firstName) } else if (initials) { nameObj.given = this.convertRichText(initials) } if (suffix) { nameObj.suffix = this.convertRichText(suffix) } return nameObj } // Fall back to parsing the text content const nameText = this.getTextContent(author) if (!nameText) { return null } return this.parseNameText(nameText) } private parseNameText(nameText: string): NameDictObject | null { nameText = nameText.trim() if (!nameText) { return null } const nameObj: NameDictObject = {} // Handle comma-separated names (Last, First) or (Last, Jr, First) if (nameText.includes(",")) { const parts = nameText.split(",").map((p) => p.trim()) if (parts.length >= 2) { nameObj.family = this.convertRichText(parts[0]) // Check if middle part looks like a suffix if ( parts.length >= 3 && /^(Jr|Sr|I{1,3}|IV|V|VI|VII|2nd|3rd|\d+th)\.?$/i.test( parts[1], ) ) { nameObj.suffix = this.convertRichText(parts[1]) nameObj.given = this.convertRichText(parts[2]) } else { nameObj.given = this.convertRichText(parts[1]) if (parts.length >= 3) { nameObj.suffix = this.convertRichText(parts[2]) } } return nameObj } } // Handle space-separated names (First von Last) const words = nameText.split(/\s+/) if (words.length === 1) { nameObj.literal = this.convertRichText(nameText) } else { // Last word is family name, rest is given nameObj.family = this.convertRichText(words[words.length - 1]) nameObj.given = this.convertRichText(words.slice(0, -1).join(" ")) } return nameObj } private extractKeywords( keywordsData: EndNoteStyledValue | EndNoteStyledValue[] | undefined, processedFields?: Set<string>, ): string[] { if (!keywordsData) { return [] } if (processedFields) processedFields.add("keywords") const keywords = Array.isArray(keywordsData) ? keywordsData : [keywordsData] const result: string[] = [] for (const kw of keywords) { const text = this.getTextContent(kw) if (text) { // Keywords may be separated by semicolons or commas const parts = text.split(/[;,]/).map((p) => p.trim()) result.push(...parts.filter((p) => p)) } } return result } private convertRange(value: string): RangeArray[] { if (!value) { return [] } return String(value) .split(/,|;/) .map((range) => { const parts = range.split(/[-–—]/) return [ parts.map((part) => ({ type: "text", text: part.trim(), })), ] }) } private convertRichText(text: string): NodeArray { if (typeof text !== "string") { return [{ type: "text", text: String(text) }] } if (!text) { return [{ type: "text", text: "" }] } // Decode common HTML entities const decodedText = text .replace(/&/g, "&") .replace(/</g, "<") .replace(/>/g, ">") .replace(/"/g, '"') .replace(//g, "\n") .replace(/
/g, "\n") .replace(//g, "\n") .replace(/
/g, "\n") return [{ type: "text", text: decodedText }] } // eslint-disable-next-line max-params private checkUnhandledFields( record: EndNoteRecord, processedFields: Set<string>, index: number, _fields: Record<string, unknown>, unhandledData: string[], ): void { // Define which top-level fields should be checked const knownFields = [ "database", "source-app", "rec-number", "foreign-keys", "ref-type", "contributors", "auth-address", "auth-affiliaton", "titles", "periodical", "pages", "volume", "number", "issue", "secondary-volume", "secondary-issue", "num-vols",