UNPKG

dejats

Version:
419 lines (406 loc) 15.3 kB
/* eslint array-bracket-spacing: 0, space-in-parens: 0 */ let { parseString } = require('get-xml') , dom = require('get-dom') , xpath = require('xpath') , Marcheur = require('marcheur') , nodal = require('marcheur/nodal') , Matcher = require('marcheur/matcher') , { ELEMENT_NODE, TEXT_NODE, CDATA_SECTION_NODE, PROCESSING_INSTRUCTION_NODE, COMMENT_NODE, } = require('dom-node-types') , { ALI_NS, MATHML_NS, XLINK_NS } = require('./ns') ; const nsMap = { ali: ALI_NS, xlink: XLINK_NS, mml: MATHML_NS } , attrMap = { 'iso-8601-date': 'datetime', 'xml:base': 'base', 'xml:lang': 'lang', } , select = xpath.useNamespaces(nsMap) ; // map the ones that just stay the same // TODO: some of these are fishy in that they are XHTML/HTML4 archaic things, we should probably // have a module that just kills them. [ 'id', 'abbr', 'align', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff', 'colspan', 'frame', 'headers', 'rowspan', 'rules', 'scope', 'span', 'style', 'summary', 'valign', ].forEach(n => (attrMap[n] = n)); // map the data ones [ 'article-type', 'authenticated', 'abbrev-type', 'abstract-type', 'alt', 'alt-title-type', 'arrange', 'assigning-authority', 'award-type', 'baseline-shift', 'calendar', 'code-type', 'code-version', 'continued-from', 'count', 'count-type', 'country', 'currency', 'date-type', 'collab-type', 'content-type', 'contrib-id-type', 'contrib-type', 'corresp', 'deceased', 'dtd-version', 'description', 'designator', 'document-id', 'document-id-type', 'document-type', 'disp-level', 'elocation-id', 'end_date', 'executable', 'ext-link-type', 'equal-contrib', 'fig-type', 'fn-type', 'fontchar', 'fontname', 'format', 'glyph-data', 'initials', 'institution-id-type', 'issue', 'journal-id', 'journal-id-type', 'kwd-group-type', 'language', 'language-version', 'license-type', 'link-type', 'list-content', 'list-type', 'mime-subtype', 'mimetype', 'name', 'name-style', 'notation', 'notes-type', 'object-id', 'object-id-type', 'object-type', 'orientation', 'page', 'person-group-type', 'platforms', 'position', 'prefix-word', 'preformat-type', 'product-type', 'pub-id-type', 'publication-format', 'publication-type', 'publisher-type', 'pub-type', 'rationale', 'ref-type', 'related-article-type', 'resolution', 'response-type', 'rid', 'specific-use', 'sec-type', 'seq', 'source-id', 'source-id-type', 'source-type', 'start_date', 'style-type', 'subj-group-type', 'supplement-type', 'symbol', 'target-type', 'toggle', 'underline-style', 'units', 'version', 'vol', 'width', 'xlink:actuate', 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type', 'x-size', 'y-size' ].forEach(n => (attrMap[n] = `data-${n.replace(/:/g, '-')}`)); module.exports = function dejats (jats, cb) { parseString(jats, (err, doc) => { if (err) return cb(err); _dejats(doc, cb); }); }; function _dejats (jatsDoc, cb) { let m = new Matcher(nsMap) , walker = new Marcheur() , wrapper = (elName, atMaker = () => ({})) => (src, out, w) => { let output = el(elName, amap(src, atMaker(src)), out); w.walk(output); } , convertXLink = (el, attrName = 'href', target = el) => { if (el.hasAttribute('data-xlink-href')) { target.setAttribute(attrName, el.getAttribute('data-xlink-href')); el.removeAttribute('data-xlink-href'); } if (el.hasAttribute('data-xlink-title')) { target.setAttribute('title', el.getAttribute('data-xlink-title')); el.removeAttribute('data-xlink-title'); } } , classLocalName = (src) => ({ class: src.localName }) , elMatch = m.el.bind(m) , el , amap ; walker // #document .match(m.document(), (src, out, w) => { let doc = dom.implementation().createHTMLDocument('') , nod = nodal(doc, attrMap, nsMap) ; el = nod.el; amap = nod.amap; w.result(doc); w.walk(doc.body); } ) // GENERIC MAPPING TO ELEMENTS // article .match(m.el('article'), wrapper('article')) // div.localName .match( [ 'front', 'body', 'back', 'floats-group', 'sub-article', 'response', 'array', 'author-notes', 'award-group', 'chem-struct', 'copyright-holder', 'corresp', 'custom-meta', 'meta-value', 'custom-meta-group', 'def', 'def-head', 'def-item', 'def-list', 'element-citation', 'fn', 'front-stub', 'funding-group', 'private-char', 'license', 'license-p', 'list', 'mixed-citation', 'named-content', 'nlm-citation', 'note', 'open-access', 'permissions', 'product', 'ref', 'sig', 'speech', 'statement', 'table-wrap-foot', 'verse-group', 'verse-line', 'journal-meta', 'article-meta', 'unstructured-kwd-group', 'list-item' ].map(elMatch), wrapper('div', classLocalName) ) // span.localName .match( [ 'contrib-id', 'anonymous', 'addr-line', 'aff', 'aff-alternatives', 'alternatives', 'alt-text', 'annotation', 'award-id', 'city', 'comment', 'compound-kwd', 'compound-kwd-part', 'compound-subject', 'compound-subject-part', 'conf-name', 'conf-num', 'conf-theme', 'copyright-statement', 'copyright-year', 'country', 'meta-name', 'data-title', 'day', 'degrees', 'edition', 'elocation-id', 'era', 'etal', 'fixed-case', 'fpage', 'funding-source', 'funding-statement', 'given-names', 'glyph-ref', 'gov', 'history', 'inline-formula', 'inline-supplementary-material', 'institution', 'institution-id', 'institution-wrap', 'isbn', 'issn', 'issn-l', 'issue', 'issue-id', 'issue-part', 'issue-sponsor', 'issue-title', 'kwd', 'kwd-group', 'label', 'long-desc', 'lpage', 'milestone-end', 'milestone-start', 'month', 'name', 'name-alternatives', 'nested-kwd', 'object-id', 'on-behalf-of', 'overline', 'page-range', 'part-title', 'patent', 'ali:free_to_read', 'ali:license_ref', 'postal-code', 'prefix', 'price', 'principal-award-recipient', 'principal-investigator', 'pub-id', 'address', 'conf-loc', 'conf-sponsor', 'conference', 'publisher', 'publisher-name', 'publisher-loc', 'journal-title', 'trans-title', 'article-title', 'chapter-title', 'related-article', 'person-group', 'collab', 'collab-alternatives', 'citation-alternatives', 'trans-title-group', 'contrib', 'contrib-group', 'article-categories', 'author-comment', 'related-object', 'role', 'roman', 'sans-serif', 'sc', 'season', 'series', 'series-text', 'series-title', 'size', 'source', 'speaker', 'state', 'std', 'std-organization', 'string-name', 'styled-content', 'subj-group', 'subject', 'suffix', 'supplement', 'surname', 'target', 'term', 'term-head', 'textual-form', 'time-stamp', 'trans-source', 'version', 'volume', 'volume-id', 'volume-issue-group', 'volume-series', 'xref', 'year', 'count', 'fig-count', 'table-count', 'equation-count', 'ref-count', 'page-count', 'word-count', 'journal-id', 'article-id', 'counts', 'x', 'underline-start', 'underline-end', 'overline-start', 'overline-end', 'string-conf', 'supplementary-material' ].map(elMatch), wrapper('span', classLocalName) ) // section.localName .match( [ 'abstract', 'ack', 'app', 'app-group', 'bio', 'fn-group', 'glossary', 'notes', 'ref-list', 'sec', 'sig-block', 'trans-abstract' ].map(elMatch), wrapper('section', classLocalName) ) // figure.localName .match( [ 'chem-struct-wrap', 'disp-formula', 'disp-formula-group', 'fig', 'fig-group', 'table-wrap', 'table-wrap-group' ].map(elMatch), wrapper('figure', classLocalName) ) // header .match( ['journal-title-group', 'sec-meta', 'title-group'].map(elMatch), wrapper('header', classLocalName) ) // p .match( [ 'journal-subtitle', 'trans-subtitle', 'abbrev-journal-title', 'alt-title', 'subtitle' ].map(elMatch), wrapper('p', classLocalName) ) // time .match( [ 'access-date', 'conf-date', 'date', 'date-in-citation', 'pub-date', 'string-date', 'year' ].map(elMatch), wrapper('time', classLocalName) ) // abbr .match( [m.el('conf-acronym')], wrapper('abbr', classLocalName) ) // strong, em, tt .match( [m.el('bold')], wrapper('strong') ) .match( [m.el('italic')], wrapper('em') ) .match( [m.el('monospace')], wrapper('tt') ) .match( [m.el('strike')], wrapper('s') ) .match( [m.el('underline')], wrapper('u') ) // cite (arguably a stretch) .match( [m.el('attrib')], wrapper('cite', classLocalName) ) // aside .match( [m.el('boxed-text')], wrapper('aside', classLocalName) ) // blockquote .match( [m.el('disp-quote')], wrapper('blockquote', classLocalName) ) // pre .match( [m.el('preformat')], wrapper('pre', classLocalName) ) // br .match( [m.el('break')], (src, out) => el('br', amap(src), out) ) // COPIED ELEMENTS // (regenerated) .match( [ 'col', 'colgroup', 'hr', 'p', 'rb', 'rp', 'rt', 'rtc', 'ruby', 'sub', 'sup', 'table', 'td', 'tfoot', 'th', 'thead', 'tr' ].map(elMatch), (src, out, w) => { let copy = el(src.localName, amap(src), out); w.walk(copy); } ) // (imported) .match( ['mml:math'].map(elMatch), (src, out) => { out.appendChild(importAsLocalName(src, out.ownerDocument)); } ) // MORE INVOLVED ELEMENTS // graphics & media .match( [m.el('graphic'), m.el('media')], (src, out, w) => { let type = src.getAttribute('mimetype') , elName ; if (type === 'video') elName = 'video'; else if (type === 'audio') elName = 'audio'; else if (type === 'image') elName = 'img'; else if (!type) elName = (src.localName === 'graphic') ? 'img' : 'iframe'; else elName = 'iframe'; if (src.hasChildNodes()) { let div = el('div', { class: src.localName }, out) , img = el(elName, amap(src), div) ; convertXLink(img, 'src'); w.walk(div); } else { let img = el(elName, amap(src), out); convertXLink(img, 'src'); } } ) .match( [m.el('inline-graphic')], (src, out) => { let img = el('img', amap(src), out); convertXLink(img, 'src'); if (src.hasChildNodes()) img.setAttribute('alt', src.textContent); } ) // titling - all titles are h2, it's up to downstream processors to override that .match( [m.el('title')], (src, out, w) => { let h2 = el('h2', amap(src), out); if (src.parentNode.localName === 'sec' && src.parentNode.hasAttribute('disp-level')) { h2.setAttribute('aria-level', src.parentNode.getAttribute('disp-level')); } w.walk(h2); } ) // a .match( ['ext-link', 'self-uri'].map(elMatch), (src, out, w) => { let a = el('a', amap(src, { class: src.localName }), out); convertXLink(a); w.walk(a); } ) // captioning .match( [m.el('caption')], (src, out, w) => { let pn = src.parentNode.localName , outName = 'figcaption' ; if (pn === 'boxed-text' || pn === 'supplementary-material') outName = 'header'; else if (pn === 'media') outName = 'div'; let cap = el(outName, amap(src, { class: src.localName }), out); w.walk(cap); } ) // code .match( [m.el('code')], (src, out, w) => { let pre = el('pre', amap(src, { class: src.localName }), out) , code = el('code', {}, pre) ; w.walk(code); } ) // email, fax, phone, uri .match( [ 'email', 'fax', 'phone', 'uri' ].map(elMatch), (src, out, w) => { let a = el('a', amap(src, { class: src.localName }), out) , ln2scheme = { email: 'mailto:', fax: 'fax:', phone: 'tel:', uri: '' } ; a.setAttribute('href', `${ln2scheme[src.localName]}${src.textContent.replace(/\s+/g, '')}`); if (src.localName === 'uri') convertXLink(a); w.walk(a); } ) // abbr .match( [m.el('abbrev')], (src, out) => { let abbr = el('abbr', amap(src), out) , tit = '' ; select('./def', src).forEach(def => { tit += def.textContent; src.removeChild(def); }); if (!tit && src.hasAttribute('alt')) tit = src.getAttribute('alt'); if (tit) abbr.setAttribute('title', tit); if (src.hasAttribute('data-xlink-href')) { let a = el('a', {}, abbr); convertXLink(src, 'href', a); a.textContent = src.textContent; } else { abbr.textContent = src.textContent; } } ) // glyphs .match( [m.el('glyph-data')], (src, out, w) => { let script = el('script', amap(src, { class: src.localName }), out); script.setAttribute('type', 'text/x-jats-glyph-data'); w.walk(script); } ) // tex/latex .match( [m.el('tex-math')], (src, out, w) => { let script = el('script', amap(src, { class: src.localName }), out) , notation = src.getAttribute('notation').toLowerCase() , type = 'text/x-jats-tex-math' ; if (notation === 'latex') type = 'application/x-latex'; else if (notation === 'tex') type = 'application/x-tex'; script.setAttribute('type', type); w.walk(script); } ) // tbody .match( [m.el('tbody')], (src, out, w) => { if (src.parentNode.localName === 'table') { let tb = el('tbody', amap(src), out); w.walk(tb); } else { let table = el('table', {}, out) , tb = el('tbody', amap(src), table) ; w.walk(tb); } } ) .run( jatsDoc, cb ) ; } // This is a brutal XML -> HTML import function that simply uses the localName in the new document. function importAsLocalName (el, doc) { let imported = doc.createElement(el.localName); copyAttr(el, imported); Array.from(el.childNodes).forEach(child => { let nt = child.nodeType , kid ; if (nt === ELEMENT_NODE) kid = importAsLocalName(child, doc); else if (nt === TEXT_NODE || nt === CDATA_SECTION_NODE) kid = doc.createTextNode(child.data); else if (nt === PROCESSING_INSTRUCTION_NODE) { kid = doc.createProcessingInstruction(child.target, child.data); } else if (nt === COMMENT_NODE) kid = doc.createComment(child.data); else return; // weird XML stuff imported.appendChild(kid); }); return imported; } function copyAttr (from, to) { for (let i = 0; i < from.attributes.length; i++) { let n = from.attributes[i].name; to.setAttribute(n, from.getAttribute(n)); } }