biblatex-csl-converter
Version:
Bibliography format converter: BibLaTeX, BibTeX, CSL-JSON, RIS, ENW, EndNote XML, Citavi, DOCX citations, ODT citations — parse, convert, and export with round-trip fidelity
629 lines (564 loc) • 19.9 kB
text/typescript
/**
* RIS (Research Information Systems) format parser
* Supports standard RIS format (two-character tags with " -" delimiter)
*/
import {
BibFieldTypes,
BibTypes,
type EntryObject,
type NameDictObject,
type NodeArray,
} from "../const"
import { lookupLangid, makeEntryKey } from "./tools"
// RIS type to BibType mapping
// Source of types: https://github.com/zotero/translators/blob/873602eb8b0961da0b306161dc386032631ffaeb/RIS.js
const RISTypeMap: Record<string, string> = {
ART: "artwork", // Artwork
ABST: "article-journal", // Abstract
ADVS: "video", // Audiovisual material
AGGR: "misc", // Aggregated database
ANCIENT: "classic", // Ancient text
BILL: "legislation", // Bill
BLOG: "post", // Blog post
BOOK: "book", // Book
CHAP: "inbook", // Book chapter
CASE: "legal_case", // Case
CHART: "figure", // Chart or table
CLSWK: "classic", // Classical work
COMP: "software", // Computer program
CONF: "inproceedings", // Conference paper
CPAPER: "inproceedings", // Conference paper
CTLG: "article-magazine", // Catalog
DATA: "dataset", // Dataset
DBASE: "dataset", // Database
DICT: "inreference", // Dictionary entry
EBOOK: "book", // Electronic book
ECHAP: "inbook", // Electronic book section
EJOUR: "article-journal", // Electronic journal
ENCYC: "inreference", // Encyclopedia entry
EQUA: "misc", // Equation
FIGURE: "figure", // Figure
FILM: "video", // Film
GEN: "misc", // Generic
ELEC: "online", // Electronic resource
HEAR: "hearing", // Hearing
GOVDOC: "report", // Government document
GRNT: "report", // Grant
ICOMM: "personal_communication", // Internet communication (email, etc.)
INPR: "article-journal", // In press (treat as journal article)
JFULL: "article-journal", // Full journal article
JOUR: "article-journal", // Journal article
LEGAL: "legislation", // Legal document
MGZN: "article-magazine", // Magazine article
MPCT: "video", // Motion picture
MANSCPT: "unpublished", // Manuscript
MAP: "map", // Map
MULTI: "misc", // Multimedia
MUSIC: "audio", // Music
NEWS: "article-newspaper", // News article
PAMP: "booklet", // Pamphlet
PAT: "patent", // Patent
PCOMM: "personal_communication", // Personal communication
RPRT: "report", // Report
SER: "book", // Serial
SLIDE: "misc", // Slide presentation
SOUND: "audio", // Sound recording
STAND: "standard", // Standard
STAT: "legislation", // Statute
THES: "thesis", // Thesis
UNBILL: "legislation", // Unpublished bill
UNPD: "unpublished", // Unpublished document
VIDEO: "video", // Video
WEB: "online", // Web page
}
/**
* RIS tags that are explicitly handled during conversion.
* Any tag present in a record but not in this set will trigger an
* `unknown_tag` warning so callers know data may have been dropped.
*/
const KNOWN_RIS_TAGS = new Set([
"TY", // Reference type (always handled)
"ER", // End of record (always handled)
"TI",
"T1", // Title
"T2",
"JF",
"JO",
"J2", // Secondary / journal title
"ST", // Short title
"AU",
"A1", // Primary authors
"A2", // Secondary authors (editors)
"A3", // Tertiary authors
"AB",
"N2", // Abstract
"N1", // Notes
"PY",
"Y1", // Publication year
"DA",
"Y2", // Full date / access date
"VL", // Volume
"IS",
"C7", // Issue / article number
"SP", // Start page
"EP", // End page
"PB", // Publisher
"CY",
"PP", // Place / city
"DO",
"M3", // DOI
"UR",
"L1",
"L2",
"L3", // URL / link
"SN",
"SE", // ISBN / ISSN / section (dual-use: handled with fallback)
"KW", // Keywords
"ET", // Edition
"CN", // Call number
"AN",
"M1", // Accession / article number
"LA", // Language
])
interface ErrorObject {
type: string
field?: string
value?: unknown
entry?: string
tag?: string
}
export interface RISParseResult {
entries: Record<number, EntryObject>
errors: ErrorObject[]
warnings: ErrorObject[]
}
interface RISRecord {
[key: string]: string[]
}
export class RISParser {
input: string
entries: EntryObject[]
errors: ErrorObject[]
warnings: ErrorObject[]
private usedKeys: Set<string> = new Set()
constructor(input: string) {
this.input = input
this.entries = []
this.errors = []
this.warnings = []
}
parse(): RISParseResult {
// Parse records
const records = this.parseRISFormat()
// Convert each record
for (let i = 0; i < records.length; i++) {
const convertedEntry = this.convertRecord(records[i], i + 1)
if (convertedEntry) {
this.entries.push(convertedEntry)
}
}
// Create numbered index
const entries: Record<number, EntryObject> = {}
this.entries.forEach((entry, index) => {
entries[index + 1] = entry
})
return {
entries,
errors: this.errors,
warnings: this.warnings,
}
}
private parseRISFormat(): RISRecord[] {
const records: RISRecord[] = []
// Normalize line endings
const normalizedInput = this.input
.replace(/\r\r\n/g, "\n")
.replace(/\r\n/g, "\n")
const lines = normalizedInput.split("\n")
let currentRecord: RISRecord = {}
let currentTag: string | null = null
let currentValue: string[] = []
for (let i = 0; i < lines.length; i++) {
const line = lines[i]
// Check for new tag (format: "XX - value" or "XX -")
const tagMatch = line.match(/^([A-Z][A-Z0-9])\s\s-\s?(.*)$/)
if (tagMatch) {
// Save previous tag value if exists
if (currentTag && currentValue.length > 0) {
if (!currentRecord[currentTag]) {
currentRecord[currentTag] = []
}
currentRecord[currentTag].push(currentValue.join("\n"))
}
currentTag = tagMatch[1]
currentValue = [tagMatch[2]]
// Check for end of record
if (currentTag === "ER") {
records.push(currentRecord)
currentRecord = {}
currentTag = null
currentValue = []
}
} else if (currentTag && line.startsWith(" ")) {
// Continuation line (6 spaces)
currentValue.push(line.trim())
} else if (currentTag && line.trim() === "") {
// Empty line - save current tag
if (currentValue.length > 0) {
if (!currentRecord[currentTag]) {
currentRecord[currentTag] = []
}
currentRecord[currentTag].push(currentValue.join("\n"))
currentValue = []
}
currentTag = null
}
}
// Handle last record if not ended with ER
if (Object.keys(currentRecord).length > 0) {
records.push(currentRecord)
}
return records
}
private convertRecord(
record: RISRecord,
index: number,
): EntryObject | false {
// Get the reference type
const risType = this.getFirstValue(record.TY) || "GEN"
const mappedBibType = RISTypeMap[risType]
const bibType = mappedBibType || "misc"
// Warn when the RIS type is not recognised at all
if (!mappedBibType) {
this.warnings.push({
type: "unknown_type",
value: risType,
entry: String(index),
})
} else if (!BibTypes[bibType]) {
// The mapped type itself is not a known BibType — treat as error
this.errors.push({
type: "unknown_type",
value: risType,
entry: String(index),
})
return false
}
const entryKey = this.generateEntryKey(record, index)
const fields: Record<string, unknown> = {}
// Title
const title =
this.getFirstValue(record.TI) || this.getFirstValue(record.T1)
if (title) {
fields.title = this.setField("title", title)
} else {
this.warnings.push({
type: "missing_required_field",
field: "title",
entry: entryKey,
})
}
// Secondary title (journal/book title)
const secondaryTitle =
this.getFirstValue(record.T2) ||
this.getFirstValue(record.JF) ||
this.getFirstValue(record.JO) ||
this.getFirstValue(record.J2)
if (secondaryTitle) {
fields.journaltitle = this.setField("journaltitle", secondaryTitle)
}
// Short title
const shortTitle = this.getFirstValue(record.ST)
if (shortTitle) {
fields.shorttitle = this.setField("shorttitle", shortTitle)
}
// Authors
const authors: string[] = [...(record.AU || []), ...(record.A1 || [])]
if (authors.length > 0) {
fields.author = this.parseNames(authors)
} else {
this.warnings.push({
type: "missing_required_field",
field: "author",
entry: entryKey,
})
}
// Secondary authors (editors)
const secondaryAuthors = record.A2 || []
if (secondaryAuthors.length > 0) {
fields.editor = this.parseNames(secondaryAuthors)
}
// Tertiary authors
const tertiaryAuthors = record.A3 || []
if (tertiaryAuthors.length > 0) {
fields.editora = this.parseNames(tertiaryAuthors)
}
// Abstract
const abstract =
this.getFirstValue(record.AB) || this.getFirstValue(record.N2)
if (abstract) {
fields.abstract = this.setField("abstract", abstract)
}
// Notes
const notes = this.getFirstValue(record.N1)
if (notes) {
fields.note = this.setField("note", notes)
}
// Date/Publication Year
const year =
this.getFirstValue(record.PY) || this.getFirstValue(record.Y1)
const date =
this.getFirstValue(record.DA) || this.getFirstValue(record.Y2)
if (year) {
fields.date = year
} else if (date) {
fields.date = date
} else {
this.warnings.push({
type: "missing_required_field",
field: "date",
entry: entryKey,
})
}
// Volume
const volume = this.getFirstValue(record.VL)
if (volume) {
fields.volume = this.setField("volume", volume)
}
// Issue/Number
const issue =
this.getFirstValue(record.IS) || this.getFirstValue(record.C7)
if (issue) {
fields.issue = this.setField("issue", issue)
}
// Pages
const startPage = this.getFirstValue(record.SP)
const endPage = this.getFirstValue(record.EP)
if (startPage && endPage) {
fields.pages = [
[
[
{ type: "text", text: startPage },
{ type: "text", text: endPage },
],
],
]
} else if (startPage) {
fields.pages = [[[{ type: "text", text: startPage }]]]
}
// Publisher
const publisher = this.getFirstValue(record.PB)
if (publisher) {
fields.publisher = this.setField("publisher", publisher)
}
// Place/City
const place =
this.getFirstValue(record.CY) || this.getFirstValue(record.PP)
if (place) {
fields.location = this.setField("location", place)
}
// DOI
const doi =
this.getFirstValue(record.DO) || this.getFirstValue(record.M3)
if (doi) {
fields.doi = this.setField("doi", doi)
}
// URL
const url =
this.getFirstValue(record.UR) ||
this.getFirstValue(record.L1) ||
this.getFirstValue(record.L2) ||
this.getFirstValue(record.L3)
if (url) {
fields.url = this.setField("url", url)
}
// ISBN/ISSN
const isbn =
this.getFirstValue(record.SN) || this.getFirstValue(record.SE)
if (isbn) {
// Could be ISBN or ISSN, try to determine
if (isbn.includes("-") && isbn.length <= 13) {
fields.isbn = this.setField("isbn", isbn)
} else {
fields.issn = this.setField("issn", isbn)
}
}
// Keywords — l_tag expects string[], split on comma/semicolon like
// the BibLaTeX importer does
if (record.KW && record.KW.length > 0) {
fields.keywords = record.KW.flatMap((kw) =>
kw
.split(/[,;]/)
.map((s) => s.trim())
.filter(Boolean),
)
}
// Edition
const edition = this.getFirstValue(record.ET)
if (edition) {
fields.edition = this.setField("edition", edition)
}
// Call Number
const callNum = this.getFirstValue(record.CN)
if (callNum) {
fields.library = this.setField("library", callNum)
}
// Accession Number
const accNum =
this.getFirstValue(record.AN) || this.getFirstValue(record.M1)
if (accNum) {
fields.eprint = this.setField("eprint", accNum)
}
// Language
const language = this.getFirstValue(record.LA)
if (language) {
const langid = this.setField("langid", language)
if (langid !== undefined) {
fields.langid = langid
}
}
// Section/Chapter
const section = this.getFirstValue(record.SE)
if (section) {
fields.chapter = this.setField("chapter", section)
}
// Warn about tags present in the record that are not handled
this.checkUnknownTags(record, entryKey)
return {
entry_key: entryKey,
bib_type: bibType,
fields,
}
}
/**
* Emit `unknown_tag` warnings for every tag in a parsed RIS record that
* is not present in the {@link KNOWN_RIS_TAGS} set. This lets callers
* detect data that the converter silently dropped.
*/
private checkUnknownTags(record: RISRecord, entryKey: string): void {
for (const tag of Object.keys(record)) {
if (!KNOWN_RIS_TAGS.has(tag)) {
const value = this.getFirstValue(record[tag])
this.warnings.push({
type: "unknown_tag",
tag,
value: value ? value.substring(0, 100) : undefined,
entry: entryKey,
})
}
}
}
private getFirstValue(values: string[] | undefined): string {
if (!values || values.length === 0) {
return ""
}
return values[0].trim()
}
private parseNames(names: string[]): NameDictObject[] {
return names
.map((name) => this.parseName(name.trim()))
.filter((n): n is NameDictObject => n !== null)
}
private parseName(nameText: string): NameDictObject | null {
nameText = nameText.trim()
if (!nameText) {
return null
}
const nameObj: NameDictObject = {}
// Handle "Last, First" format
if (nameText.includes(",")) {
const parts = nameText.split(",").map((p) => p.trim())
if (parts.length >= 2) {
nameObj.family = this.convertRichText(parts[0])
nameObj.given = this.convertRichText(parts[1])
return nameObj
}
}
// Handle "First Last" format
const words = nameText.split(/\s+/)
if (words.length === 1) {
nameObj.literal = this.convertRichText(nameText)
} else {
nameObj.family = this.convertRichText(words[words.length - 1])
nameObj.given = this.convertRichText(words.slice(0, -1).join(" "))
}
return nameObj
}
private generateEntryKey(record: RISRecord, index: number): string {
const firstAuthor =
this.getFirstValue(record.AU) || this.getFirstValue(record.A1)
const yearRaw =
this.getFirstValue(record.PY) || this.getFirstValue(record.Y1)
// Extract a clean four-digit year from whatever the field contains.
const year = yearRaw ? (yearRaw.match(/\d{4}/)?.[0] ?? "") : ""
const lastName = firstAuthor ? firstAuthor.split(",")[0].trim() : ""
return makeEntryKey(
String(index),
this.usedKeys,
lastName || undefined,
year || undefined,
)
}
/**
* Stores a plain text value into the correct internal shape for the given
* BibField key:
* - l_literal → NodeArray[] (array of NodeArrays)
* - f_verbatim / f_uri / f_date → plain string
* - f_key → matched option key string, or undefined if unrecognised
* - everything else → NodeArray
*/
private setField(
fieldKey: string,
text: string,
): NodeArray | NodeArray[] | string | undefined {
const fieldDef = BibFieldTypes[fieldKey]
const fieldType = fieldDef?.type
if (fieldType === "l_literal") {
return [this.convertRichText(text)]
} else if (
fieldType === "f_verbatim" ||
fieldType === "f_uri" ||
fieldType === "f_date"
) {
return text
} else if (fieldType === "f_key") {
const options = fieldDef?.options
if (Array.isArray(options)) {
// Array options (e.g. bookpagination, type): plain string match
const lower = text.toLowerCase().trim()
const matched = options.find(
(k: string) => k.toLowerCase() === lower,
)
return matched // undefined if no match
} else if (options) {
// Object options (e.g. langid): use shared lookup that handles
// BCP-47 codes, ISO 639-2 codes, full names, biblatex aliases
return lookupLangid(text) // undefined if no match
}
return text
}
return this.convertRichText(text)
}
private convertRichText(text: string): NodeArray {
if (typeof text !== "string") {
return [{ type: "text", text: String(text) }]
}
if (!text) {
return [{ type: "text", text: "" }]
}
// Decode common HTML entities
const decodedText = text
.replace(/&/g, "&")
.replace(/</g, "<")
.replace(/>/g, ">")
.replace(/"/g, '"')
.replace(/
/g, "\n")
.replace(/
/g, "\n")
.replace(/ /g, "\n")
.replace(/ /g, "\n")
return [{ type: "text", text: decodedText }]
}
}
export function parseRIS(input: string): RISParseResult {
return new RISParser(input).parse()
}