UNPKG

@iebh/reflib

Version:

Reference / Citation reference library utilities

402 lines (364 loc) 13.3 kB
import Emitter from '../shared/emitter.js';

/**
 * Lookup enum for the current parser mode we are in
 *
 * @type {Object<Number>}
 */
const MODES = {
	REF: 0, // Waiting for an entry header: `@type{id,`
	FIELDS: 1, // Inside an entry, waiting for `field =` or the closing `}`
	FIELD_START: 2, // Seen `field =`, waiting for the opening `"` or `{`
	FIELD_VALUE: 3, // Inside a field value, waiting for the closing wrapper
};

// Parser patterns - none carry the `g` / `y` flags so they hold no state and are safe to share between calls
const RE_REF = /^\s*@(?<type>\w+?)\s*\{(?<id>.*?),/s;
const RE_FIELD_NAME = /^\s*(?<field>\w+?)\s*=\s*/s;
const RE_REF_END = /^\s*\}\s*/s;
const RE_FIELD_OPEN = /^\s*(?<fieldWrapper>"|{)\s*/;
const RE_VALUE_BRACE = /^(?<value>.*?)(?<!\\%)\}\s*,?\s*$/sm;
const RE_VALUE_QUOTE = /^(?<value>.*?)"\s*,?\s*$/sm;


/**
 * Resolve the "preserve unknown keys" flag from a settings object
 * The documented option name is `preserveUnknownKeys` but historically the code only read the
 * misspelled `preserveUnkownKeys` - accept both, preferring the correctly spelled one
 *
 * @param {Object} settings Settings object to examine
 * @returns {Boolean|undefined} The resolved flag, `undefined` if neither spelling is present
 */
function shouldPreserveUnknownKeys(settings) {
	return settings.preserveUnknownKeys ?? settings.preserveUnkownKeys;
}


/**
 * Store a BibTeX entry ID (the text between `{` and the first `,`) against a reference
 * Mutates `ref`, setting either `recNumber` or `key` depending on the settings
 *
 * @param {Object} ref The reference under construction
 * @param {String} rawId The raw ID as captured from the entry header
 * @param {Object} settings Reader settings (`recNumberNumeric`, `recNumberRNPrefix`, `recNumberKey`)
 */
function applyEntryId(ref, rawId, settings) {
	const id = rawId.trim();
	if (id === '') return; // No ID - ignore (an empty string would otherwise coerce to the number 0)

	if (settings.recNumberNumeric && Number.isFinite(Number(id))) { // Accept numeric recNumber
		ref.recNumber = Number(id);
	} else if (settings.recNumberRNPrefix && /^RN\d+$/.test(id)) { // Accept `RN${NUMBER}`
		ref.recNumber = +id.slice(2);
	} else if (!settings.recNumberNumeric) { // Non numeric / finite ID - but we're allowed to accept it anyway
		// NOTE(review): numeric coercion yields NaN for a non-numeric ID here - confirm whether the raw string was intended
		ref.recNumber = +id;
	} else if (settings.recNumberKey) { // Non numeric, custom looking key, stash in 'key' instead
		ref.key = id;
	}
}


/**
 * Parse a BibTeX file from a readable stream
 *
 * @see modules/interface.js
 *
 * @param {Stream} stream The readable stream to accept data from
 * @param {Object} [options] Additional options to use when parsing
 * @param {Boolean} [options.recNumberNumeric=true] Only process the BibTeX ID into a recNumber if it is a finite numeric, otherwise discard
 * @param {Boolean} [options.recNumberRNPrefix=true] Accept `RN${NUMBER}` as recNumber if present
 * @param {Boolean} [options.recNumberKey=true] If the reference key cannot be otherwise parsed store it in `key<String>` instead
 * @param {String} [options.fallbackType='unknown'] Reflib fallback type if the incoming type is unrecognised or unsupported
 * @param {Set<String>} [options.fieldsOverwrite] Set of field names where the value is clobbered rather than appended if discovered more than once
 * @param {Boolean} [options.preserveUnknownKeys=true] Retain keys we do not have a direct lookup for in the output object (the legacy spelling `preserveUnkownKeys` is also accepted)
 *
 * @returns {Object} A readable stream analogue defined in `modules/interface.js`
 */
export function readStream(stream, options) {
	const settings = {
		recNumberNumeric: true,
		recNumberRNPrefix: true,
		recNumberKey: true,
		fallbackType: 'unknown',
		fieldsOverwrite: new Set(['type']),
		preserveUnkownKeys: true,
		...options,
	};

	const emitter = Emitter();
	let buffer = ''; // Unconsumed input carried between data chunks
	let mode = MODES.REF;
	let state = null; // Misc state storage when we're digesting ref data
	let ref = {}; // Reference item being constructed

	// Queue up the parser in the next tick (so we can return the emitter first)
	setTimeout(() => {
		stream
			.on('error', e => emitter.emit('error', e))
			.on('end', () => emitter.emit('end'))
			.on('data', chunkBuffer => {
				emitter.emit('progress', stream.bytesRead);
				buffer += chunkBuffer.toString(); // Append incoming data to the partial-buffer we're holding in memory

				while (true) {
					let match; // Regex storage for match groups

					if (mode === MODES.REF && (match = RE_REF.exec(buffer))) {
						applyEntryId(ref, match.groups.id, settings);
						ref.type = match.groups.type;
						mode = MODES.FIELDS;
						state = null;
					} else if (mode === MODES.FIELDS && (match = RE_FIELD_NAME.exec(buffer))) {
						mode = MODES.FIELD_START;
						state = {field: match.groups.field};
					} else if (mode === MODES.FIELDS && (match = RE_REF_END.exec(buffer))) { // End of ref
						emitter.emit('ref', tidyRef(ref, settings));
						mode = MODES.REF;
						ref = {};
						state = null;
					} else if (mode === MODES.FIELD_START && (match = RE_FIELD_OPEN.exec(buffer))) {
						mode = MODES.FIELD_VALUE;
						state.fieldWrapper = match.groups.fieldWrapper;
					} else if (
						// TODO: Note that we use `\r?\n` as delimiters for field values, this is a cheat to avoid having to implement a full AST parser
						//       This is a hack but since most BibTeX files use properly formatted BibTeX this should work in the majority of cases
						//       This WILL break if given one continuous line of BibTeX though
						//       - MC 2026-01-02
						// NOTE(review): with the `m` flag `$` also matches at the very end of the buffered data,
						//       so a chunk boundary directly after a closing wrapper can end a value early - confirm if this matters
						mode === MODES.FIELD_VALUE
						&& (
							(state.fieldWrapper === '{' && (match = RE_VALUE_BRACE.exec(buffer)))
							|| (state.fieldWrapper === '"' && (match = RE_VALUE_QUOTE.exec(buffer)))
						)
					) {
						mode = MODES.FIELDS;
						const value = unescape(match.groups.value);
						const existing = ref[state.field];

						if (existing === undefined || settings.fieldsOverwrite.has(state.field)) {
							// Populate initial value - or clobber because the field is marked as overwrite
							ref[state.field] = value;
						} else {
							// Already have content - append
							ref[state.field] = existing + '\n' + value;
						}
						state = null;
					} else { // Implied else - No match to buffer, let it fill and process next data block
						break;
					}

					// Crop start of buffer to last match
					buffer = buffer.slice(match[0].length);
				}
			});
	});

	return emitter;
}


/**
 * Tidy up a raw BibTeX reference before emitting
 * Translates the BibTeX type + field names into their Reflib equivalents using `translations`
 *
 * @param {Object} ref The input raw ref to tidy
 *
 * @param {Object} settings Optimized settings object for fast access
 * @param {String} [settings.fallbackType] Type to use when the BibTeX type has no translation
 * @param {Boolean} [settings.preserveUnknownKeys] Retain fields with no translation (legacy spelling `preserveUnkownKeys` also accepted)
 *
 * @returns {Object} The tidied ref
 */
export function tidyRef(ref, settings) {
	const preserveUnknown = shouldPreserveUnknownKeys(settings);

	return Object.fromEntries(
		Object.entries(ref)
			.map(([key, val]) => {
				const rlField = translations.fields.btMap.get(key.toLowerCase());

				if (key === 'type') { // Special conversion for type
					const rlType = ref.type && translations.types.btMap.get(val.toLowerCase());
					return rlType
						? [key, rlType.rl] // Can translate incoming type to Reflib type
						: [key, settings.fallbackType]; // Unknown Reflib type variant
				}

				if (!preserveUnknown && !rlField) return undefined; // Omit unknown fields
				if (rlField?.array) return [rlField.rl, val.split(/\n*\s+and\s+/)]; // Field needs array casting
				if (rlField?.rl) return [rlField.rl, val]; // Known BT field but different RL field
				if (preserveUnknown) return [key, val]; // Everything else - add field
				return undefined;
			})
			.filter(Boolean) // Remove duds
	);
}


/**
 * Translate a BibTeX encoded string into a regular JS String
 * Replaces `/*` with a newline and the `{\&}` / `{\%}` sequences with `&` / `%`
 *
 * @param {String} str Input BibTeX encoded string
 * @returns {String} Regular JS output string
 */
export function unescape(str) {
	return str
		.replace(/\/\*/g, '\n')
		.replace(/\{\\\&\}/g, '&')
		.replace(/\{\\\%\}/g, '%');
}


/**
 * Translate a JS string into a BibTeX encoded string
 * Non-string input is cast to a string first, `&` and `%` become `{\&}` and `{\%}`
 *
 * @param {String} str Input regular JS String
 * @returns {String} BibTeX encoded string
 */
export function escape(str) {
	return ('' + str)
		.replace(/\&/g, '{\\&}')
		.replace(/%/g, '{\\%}');
}


/**
 * Write a BibTeX file to a writable stream
 *
 * @see modules/interface.js
 *
 * @param {Stream} stream The writable stream to write to
 *
 * @param {Object} [options] Additional options to use when writing
 * @param {string} [options.defaultType='Misc'] Default citation type to assume when no other type is specified
 * @param {string} [options.delimeter='\n'] How to split multi-line items (accepted but not consulted by the write logic in this module)
 * @param {Set} [options.omitFields] Set of special fields to always omit, either because we are ignoring or because we have special treatment for them
 * @param {Boolean} [options.keyForce=true] Force a unique ID to exist if we don't already have one for each reference
 * @param {Boolean} [options.recNumberRNPrefix=true] Rewrite recNumber fields as `RN${NUMBER}`
 * @param {Boolean} [options.recNumberKey=true] If the reference `recNumber` is empty use `key<String>` instead (NOTE(review): `key` is currently used whenever present, regardless of this flag)
 * @param {Boolean} [options.preserveUnknownKeys=true] Output keys we do not have a direct lookup for in the output object (the legacy spelling `preserveUnkownKeys` is also accepted)
 *
 * @returns {Object} A writable stream analogue defined in `modules/interface.js`
 */
export function writeStream(stream, options) {
	const settings = {
		defaultType: 'Misc',
		delimeter: '\n',
		omitFields: new Set(['key', 'recNumber', 'type']),
		keyForce: true,
		recNumberRNPrefix: true,
		recNumberKey: true,
		preserveUnkownKeys: true,
		...options,
	};
	const preserveUnknown = shouldPreserveUnknownKeys(settings);

	return {
		start() {
			return Promise.resolve();
		},
		write: ref => {
			// Fetch Reflib type definition
			// Only consult the lookup when the ref actually has a type - the default type is already
			// a BibTeX type and must not be re-translated. The caller's ref is left unmodified.
			const rlType = ref.type
				? translations.types.rlMap.get(String(ref.type).toLowerCase())
				: undefined;
			const btType = rlType?.bt || settings.defaultType;

			// Work out the citation key (including its trailing comma) in priority order
			let citeKey = '';
			if (ref.recNumber && settings.recNumberRNPrefix) {
				citeKey = `RN${ref.recNumber},`;
			} else if (ref.recNumber) {
				citeKey = `${ref.recNumber},`;
			} else if (ref.key) {
				citeKey = `${ref.key},`;
			} else if (settings.keyForce) {
				citeKey = `${generateCitationKey(ref)},`;
			}

			const fieldLines = Object.entries(ref)
				.filter(([key, val]) =>
					val // We have a non-nullish val
					&& !settings.omitFields.has(key)
				)
				.map(([rawKey, rawVal]) => {
					// Fetch Reflib field definition
					const rlField = translations.fields.rlMap.get(rawKey);
					if (!rlField && !preserveUnknown) return null; // Unknown field mapping - skip if we're omitting unknown fields

					const key = rlField ? rlField.bt : rawKey; // Use Reflib->BibTeX field mapping if we have one, otherwise use raw key
					const val = escape( // Escape input value, either as an Array via join or as a flat string
						rawKey === 'authors' && Array.isArray(rawVal) ? rawVal.join(' and ') // Special joining conditions for author field
						: Array.isArray(rawVal) ? rawVal.join(', ') // Treat other arrays as a CSV
						: rawVal // Splat everything else as a string
					);

					return `${key}={${val}}`;
				})
				.filter(line => line !== null);

			stream.write(
				`@${btType}{${citeKey}\n`
				+ (fieldLines.length > 0 ? fieldLines.join(',\n') + '\n' : '') // Comma between fields, newline after each
				+ '}\n'
			);

			return Promise.resolve();
		},
		middle() {
			stream.write('\n');
		},
		end() {
			return new Promise((resolve, reject) =>
				stream.end(err => err ? reject(err) : resolve())
			);
		},
	};
}


/**
 * Generate a citation key from first author + year
 * Falls back to `Anon` when there is no usable author and `n.d.` when there is no year
 * Example: "Roomruangwong2020"
 *
 * @param {Object} ref The reference to generate a key for
 * @returns {String} The generated citation key
 */
function generateCitationKey(ref) {
	// Authors is usually an array but tolerate a flat string
	const firstAuthor = Array.isArray(ref.authors) ? ref.authors[0] : ref.authors;

	const surname = typeof firstAuthor === 'string'
		? firstAuthor.split(',')[0].replace(/\s+/g, '') // Take the part before the first comma, minus whitespace
		: '';

	const author = surname || 'Anon';
	const year = ref.year ? ref.year : 'n.d.';

	return `${author}${year}`;
}


/**
 * Lookup tables for this module
 * The `rlMap` / `btMap` members start empty and are populated by `setup()`
 *
 * @type {Object}
 * @property {Object} fields Field translations between Reflib (`rl`) and BibTeX format (`bt`)
 * @property {Object} types Reference type translations between Reflib (`rl`) and BibTeX format (`bt`)
 */
export const translations = {
	// Field translations {{{
	fields: {
		collection: [
			// Order by priority (highest at top)
			{rl: 'address', bt: 'address'},
			{rl: 'authors', bt: 'author', array: true},
			{rl: 'doi', bt: 'doi'},
			{rl: 'edition', bt: 'edition'},
			{rl: 'editor', bt: 'editor'},
			{rl: 'journal', bt: 'journal'},
			{rl: 'notes', bt: 'note'},
			{rl: 'number', bt: 'number'},
			{rl: 'pages', bt: 'pages'},
			{rl: 'title', bt: 'booktitle'},
			{rl: 'title', bt: 'title'},
			{rl: 'volume', bt: 'volume'},
			{rl: 'isbn', bt: 'issn'},

			// Misc
			{bt: 'month'}, // Combined into {rl:'date'}
			{bt: 'type'}, // Ignored
			{bt: 'year'}, // Combined into {rl:'date'}

			// Nonstandard but used anyway
			{rl: 'abstract', bt: 'abstract'},
			{rl: 'language', bt: 'language'},
			{rl: 'keywords', bt: 'keywords', array: true},
			{rl: 'urls', bt: 'url', array: true},

			// Unknown how to translate these
			// {bt: 'annote'},
			// {bt: 'email'},
			// {bt: 'chapter'},
			// {bt: 'crossref'},
			// {bt: 'howpublished'},
			// {bt: 'institution'},
			// {bt: 'key'},
			// {bt: 'organization'},
			// {bt: 'publisher'},
			// {bt: 'school'},
			// {bt: 'series'},
		],
		rlMap: new Map(),
		btMap: new Map(),
	},
	// }}}
	// Ref type translations {{{
	types: {
		collection: [
			// Order by priority (highest at top)
			{rl: 'journalArticle', bt: 'Article'},
			{rl: 'book', bt: 'Book'},
			{rl: 'bookSection', bt: 'InBook'},
			{rl: 'conferencePaper', bt: 'Conference'},
			{rl: 'conferenceProceedings', bt: 'InProceedings'},
			{rl: 'report', bt: 'TechReport'},
			{rl: 'thesis', bt: 'PHDThesis'},
			{rl: 'unknown', bt: 'Misc'},
			{rl: 'unpublished', bt: 'Unpublished'},

			// Type aliases
			{rl: 'journalArticle', bt: 'Journal Article'},

			// Unknown how to translate these
			{rl: 'Misc', bt: 'Booklet'},
			{rl: 'Misc', bt: 'InCollection'},
			{rl: 'Misc', bt: 'Manual'},
			{rl: 'Misc', bt: 'MastersThesis'},
			{rl: 'Misc', bt: 'Proceedings'},
		],
		rlMap: new Map(),
		btMap: new Map(),
	},
	// }}}
};


/**
 * Populate the `rlMap` / `btMap` lookup tables in `translations` from their `collection` arrays
 * All map keys are lower-cased
 *
 * @see modules/interface.js
 */
export function setup() {
	// Create lookup object of translations.fields with key as .rl / .bt and val as the full object
	// Later entries replace earlier ones which share a key
	translations.fields.collection.forEach(c => {
		if (c.rl) translations.fields.rlMap.set(c.rl.toLowerCase(), c);
		if (c.bt) translations.fields.btMap.set(c.bt.toLowerCase(), c);
	});

	// Create lookup object of ref.types with key as .rl / .bt and val as the full object
	translations.types.collection.forEach(c => {
		// Append each type to the set, accepting the first in each case as the priority
		const rlLc = c.rl?.toLowerCase();
		const btLc = c.bt?.toLowerCase();
		if (rlLc && !translations.types.rlMap.has(rlLc)) translations.types.rlMap.set(rlLc, c);
		if (btLc && !translations.types.btMap.has(btLc)) translations.types.btMap.set(btLc, c);
	});
}