// dejats
// Version:
// Extracts JATS to HTML
// 419 lines (406 loc) • 15.3 kB
// JavaScript
/* eslint array-bracket-spacing: 0, space-in-parens: 0 */
let { parseString } = require('get-xml')
, dom = require('get-dom')
, xpath = require('xpath')
, Marcheur = require('marcheur')
, nodal = require('marcheur/nodal')
, Matcher = require('marcheur/matcher')
, {
ELEMENT_NODE,
TEXT_NODE,
CDATA_SECTION_NODE,
PROCESSING_INSTRUCTION_NODE,
COMMENT_NODE,
} = require('dom-node-types')
, { ALI_NS, MATHML_NS, XLINK_NS } = require('./ns')
;
const nsMap = {
ali: ALI_NS,
xlink: XLINK_NS,
mml: MATHML_NS
}
, attrMap = {
'iso-8601-date': 'datetime',
'xml:base': 'base',
'xml:lang': 'lang',
}
, select = xpath.useNamespaces(nsMap)
;
// map the ones that just stay the same
// TODO: some of these are fishy in that they are XHTML/HTML4 archaic things, we should probably
// have a module that just kills them.
[
'id', 'abbr', 'align', 'axis', 'border', 'cellpadding', 'cellspacing', 'char', 'charoff',
'colspan', 'frame', 'headers', 'rowspan', 'rules', 'scope', 'span', 'style', 'summary', 'valign',
].forEach(n => (attrMap[n] = n));
// map the data ones
[
'article-type', 'authenticated', 'abbrev-type', 'abstract-type', 'alt', 'alt-title-type',
'arrange', 'assigning-authority', 'award-type',
'baseline-shift',
'calendar', 'code-type', 'code-version', 'continued-from', 'count', 'count-type', 'country',
'currency', 'date-type', 'collab-type', 'content-type', 'contrib-id-type', 'contrib-type',
'corresp',
'deceased', 'dtd-version', 'description', 'designator', 'document-id', 'document-id-type',
'document-type', 'disp-level',
'elocation-id', 'end_date', 'executable', 'ext-link-type', 'equal-contrib',
'fig-type', 'fn-type', 'fontchar', 'fontname', 'format',
'glyph-data',
'initials', 'institution-id-type', 'issue',
'journal-id', 'journal-id-type',
'kwd-group-type',
'language', 'language-version', 'license-type', 'link-type', 'list-content', 'list-type',
'mime-subtype', 'mimetype',
'name', 'name-style', 'notation', 'notes-type',
'object-id', 'object-id-type', 'object-type', 'orientation',
'page', 'person-group-type', 'platforms', 'position', 'prefix-word', 'preformat-type',
'product-type', 'pub-id-type', 'publication-format', 'publication-type', 'publisher-type', 'pub-type',
'rationale', 'ref-type', 'related-article-type', 'resolution', 'response-type', 'rid',
'specific-use', 'sec-type', 'seq', 'source-id', 'source-id-type', 'source-type', 'start_date',
'style-type', 'subj-group-type', 'supplement-type', 'symbol',
'target-type', 'toggle',
'underline-style', 'units',
'version', 'vol',
'width',
'xlink:actuate', 'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title', 'xlink:type',
'x-size', 'y-size'
].forEach(n => (attrMap[n] = `data-${n.replace(/:/g, '-')}`));
module.exports = function dejats (jats, cb) {
parseString(jats, (err, doc) => {
if (err) return cb(err);
_dejats(doc, cb);
});
};
function _dejats (jatsDoc, cb) {
let m = new Matcher(nsMap)
, walker = new Marcheur()
, wrapper = (elName, atMaker = () => ({})) => (src, out, w) => {
let output = el(elName, amap(src, atMaker(src)), out);
w.walk(output);
}
, convertXLink = (el, attrName = 'href', target = el) => {
if (el.hasAttribute('data-xlink-href')) {
target.setAttribute(attrName, el.getAttribute('data-xlink-href'));
el.removeAttribute('data-xlink-href');
}
if (el.hasAttribute('data-xlink-title')) {
target.setAttribute('title', el.getAttribute('data-xlink-title'));
el.removeAttribute('data-xlink-title');
}
}
, classLocalName = (src) => ({ class: src.localName })
, elMatch = m.el.bind(m)
, el
, amap
;
walker
// #document
.match(m.document(),
(src, out, w) => {
let doc = dom.implementation().createHTMLDocument('')
, nod = nodal(doc, attrMap, nsMap)
;
el = nod.el;
amap = nod.amap;
w.result(doc);
w.walk(doc.body);
}
)
// GENERIC MAPPING TO ELEMENTS
// article
.match(m.el('article'), wrapper('article'))
// div.localName
.match(
[ 'front', 'body', 'back', 'floats-group', 'sub-article',
'response', 'array',
'author-notes', 'award-group', 'chem-struct',
'copyright-holder', 'corresp', 'custom-meta', 'meta-value', 'custom-meta-group', 'def',
'def-head', 'def-item', 'def-list', 'element-citation', 'fn', 'front-stub', 'funding-group',
'private-char', 'license', 'license-p', 'list', 'mixed-citation', 'named-content',
'nlm-citation', 'note', 'open-access', 'permissions', 'product', 'ref', 'sig',
'speech', 'statement', 'table-wrap-foot', 'verse-group',
'verse-line', 'journal-meta', 'article-meta', 'unstructured-kwd-group', 'list-item'
].map(elMatch),
wrapper('div', classLocalName)
)
// span.localName
.match(
[ 'contrib-id', 'anonymous', 'addr-line', 'aff', 'aff-alternatives',
'alternatives', 'alt-text', 'annotation', 'award-id', 'city', 'comment', 'compound-kwd',
'compound-kwd-part', 'compound-subject', 'compound-subject-part', 'conf-name', 'conf-num',
'conf-theme', 'copyright-statement', 'copyright-year', 'country', 'meta-name', 'data-title',
'day', 'degrees', 'edition', 'elocation-id', 'era', 'etal', 'fixed-case', 'fpage',
'funding-source', 'funding-statement', 'given-names', 'glyph-ref', 'gov', 'history',
'inline-formula', 'inline-supplementary-material', 'institution', 'institution-id',
'institution-wrap', 'isbn', 'issn', 'issn-l', 'issue', 'issue-id', 'issue-part',
'issue-sponsor', 'issue-title', 'kwd', 'kwd-group', 'label', 'long-desc', 'lpage',
'milestone-end', 'milestone-start', 'month', 'name', 'name-alternatives', 'nested-kwd',
'object-id', 'on-behalf-of', 'overline', 'page-range', 'part-title', 'patent',
'ali:free_to_read', 'ali:license_ref', 'postal-code', 'prefix', 'price',
'principal-award-recipient', 'principal-investigator', 'pub-id', 'address', 'conf-loc',
'conf-sponsor', 'conference', 'publisher', 'publisher-name', 'publisher-loc',
'journal-title', 'trans-title', 'article-title', 'chapter-title', 'related-article',
'person-group', 'collab', 'collab-alternatives', 'citation-alternatives',
'trans-title-group', 'contrib', 'contrib-group', 'article-categories', 'author-comment',
'related-object', 'role', 'roman', 'sans-serif', 'sc', 'season', 'series', 'series-text',
'series-title', 'size', 'source', 'speaker', 'state', 'std', 'std-organization',
'string-name', 'styled-content', 'subj-group', 'subject', 'suffix', 'supplement', 'surname',
'target', 'term', 'term-head', 'textual-form', 'time-stamp', 'trans-source', 'version',
'volume', 'volume-id', 'volume-issue-group', 'volume-series', 'xref', 'year',
'count', 'fig-count', 'table-count', 'equation-count', 'ref-count', 'page-count',
'word-count', 'journal-id', 'article-id', 'counts', 'x', 'underline-start', 'underline-end',
'overline-start', 'overline-end', 'string-conf', 'supplementary-material'
].map(elMatch),
wrapper('span', classLocalName)
)
// section.localName
.match(
[ 'abstract', 'ack', 'app', 'app-group', 'bio', 'fn-group', 'glossary', 'notes',
'ref-list', 'sec', 'sig-block', 'trans-abstract'
].map(elMatch),
wrapper('section', classLocalName)
)
// figure.localName
.match(
[ 'chem-struct-wrap', 'disp-formula', 'disp-formula-group', 'fig', 'fig-group', 'table-wrap',
'table-wrap-group'
].map(elMatch),
wrapper('figure', classLocalName)
)
// header
.match(
['journal-title-group', 'sec-meta', 'title-group'].map(elMatch),
wrapper('header', classLocalName)
)
// p
.match(
[ 'journal-subtitle', 'trans-subtitle', 'abbrev-journal-title', 'alt-title', 'subtitle'
].map(elMatch),
wrapper('p', classLocalName)
)
// time
.match(
[ 'access-date', 'conf-date', 'date', 'date-in-citation', 'pub-date', 'string-date', 'year'
].map(elMatch),
wrapper('time', classLocalName)
)
// abbr
.match( [m.el('conf-acronym')], wrapper('abbr', classLocalName) )
// strong, em, tt
.match( [m.el('bold')], wrapper('strong') )
.match( [m.el('italic')], wrapper('em') )
.match( [m.el('monospace')], wrapper('tt') )
.match( [m.el('strike')], wrapper('s') )
.match( [m.el('underline')], wrapper('u') )
// cite (arguably a stretch)
.match( [m.el('attrib')], wrapper('cite', classLocalName) )
// aside
.match( [m.el('boxed-text')], wrapper('aside', classLocalName) )
// blockquote
.match( [m.el('disp-quote')], wrapper('blockquote', classLocalName) )
// pre
.match( [m.el('preformat')], wrapper('pre', classLocalName) )
// br
.match( [m.el('break')], (src, out) => el('br', amap(src), out) )
// COPIED ELEMENTS
// (regenerated)
.match(
[ 'col', 'colgroup', 'hr', 'p', 'rb', 'rp', 'rt', 'rtc', 'ruby', 'sub', 'sup', 'table', 'td',
'tfoot', 'th', 'thead', 'tr'
].map(elMatch),
(src, out, w) => {
let copy = el(src.localName, amap(src), out);
w.walk(copy);
}
)
// (imported)
.match(
['mml:math'].map(elMatch),
(src, out) => {
out.appendChild(importAsLocalName(src, out.ownerDocument));
}
)
// MORE INVOLVED ELEMENTS
// graphics & media
.match(
[m.el('graphic'), m.el('media')],
(src, out, w) => {
let type = src.getAttribute('mimetype')
, elName
;
if (type === 'video') elName = 'video';
else if (type === 'audio') elName = 'audio';
else if (type === 'image') elName = 'img';
else if (!type) elName = (src.localName === 'graphic') ? 'img' : 'iframe';
else elName = 'iframe';
if (src.hasChildNodes()) {
let div = el('div', { class: src.localName }, out)
, img = el(elName, amap(src), div)
;
convertXLink(img, 'src');
w.walk(div);
}
else {
let img = el(elName, amap(src), out);
convertXLink(img, 'src');
}
}
)
.match(
[m.el('inline-graphic')],
(src, out) => {
let img = el('img', amap(src), out);
convertXLink(img, 'src');
if (src.hasChildNodes()) img.setAttribute('alt', src.textContent);
}
)
// titling - all titles are h2, it's up to downstream processors to override that
.match(
[m.el('title')],
(src, out, w) => {
let h2 = el('h2', amap(src), out);
if (src.parentNode.localName === 'sec' && src.parentNode.hasAttribute('disp-level')) {
h2.setAttribute('aria-level', src.parentNode.getAttribute('disp-level'));
}
w.walk(h2);
}
)
// a
.match(
['ext-link', 'self-uri'].map(elMatch),
(src, out, w) => {
let a = el('a', amap(src, { class: src.localName }), out);
convertXLink(a);
w.walk(a);
}
)
// captioning
.match(
[m.el('caption')],
(src, out, w) => {
let pn = src.parentNode.localName
, outName = 'figcaption'
;
if (pn === 'boxed-text' || pn === 'supplementary-material') outName = 'header';
else if (pn === 'media') outName = 'div';
let cap = el(outName, amap(src, { class: src.localName }), out);
w.walk(cap);
}
)
// code
.match(
[m.el('code')],
(src, out, w) => {
let pre = el('pre', amap(src, { class: src.localName }), out)
, code = el('code', {}, pre)
;
w.walk(code);
}
)
// email, fax, phone, uri
.match(
[ 'email', 'fax', 'phone', 'uri' ].map(elMatch),
(src, out, w) => {
let a = el('a', amap(src, { class: src.localName }), out)
, ln2scheme = {
email: 'mailto:',
fax: 'fax:',
phone: 'tel:',
uri: ''
}
;
a.setAttribute('href', `${ln2scheme[src.localName]}${src.textContent.replace(/\s+/g, '')}`);
if (src.localName === 'uri') convertXLink(a);
w.walk(a);
}
)
// abbr
.match(
[m.el('abbrev')],
(src, out) => {
let abbr = el('abbr', amap(src), out)
, tit = ''
;
select('./def', src).forEach(def => {
tit += def.textContent;
src.removeChild(def);
});
if (!tit && src.hasAttribute('alt')) tit = src.getAttribute('alt');
if (tit) abbr.setAttribute('title', tit);
if (src.hasAttribute('data-xlink-href')) {
let a = el('a', {}, abbr);
convertXLink(src, 'href', a);
a.textContent = src.textContent;
}
else {
abbr.textContent = src.textContent;
}
}
)
// glyphs
.match(
[m.el('glyph-data')],
(src, out, w) => {
let script = el('script', amap(src, { class: src.localName }), out);
script.setAttribute('type', 'text/x-jats-glyph-data');
w.walk(script);
}
)
// tex/latex
.match(
[m.el('tex-math')],
(src, out, w) => {
let script = el('script', amap(src, { class: src.localName }), out)
, notation = src.getAttribute('notation').toLowerCase()
, type = 'text/x-jats-tex-math'
;
if (notation === 'latex') type = 'application/x-latex';
else if (notation === 'tex') type = 'application/x-tex';
script.setAttribute('type', type);
w.walk(script);
}
)
// tbody
.match(
[m.el('tbody')],
(src, out, w) => {
if (src.parentNode.localName === 'table') {
let tb = el('tbody', amap(src), out);
w.walk(tb);
}
else {
let table = el('table', {}, out)
, tb = el('tbody', amap(src), table)
;
w.walk(tb);
}
}
)
.run( jatsDoc, cb )
;
}
// This is a brutal XML -> HTML import function that simply uses the localName in the new document.
function importAsLocalName (el, doc) {
let imported = doc.createElement(el.localName);
copyAttr(el, imported);
Array.from(el.childNodes).forEach(child => {
let nt = child.nodeType
, kid
;
if (nt === ELEMENT_NODE) kid = importAsLocalName(child, doc);
else if (nt === TEXT_NODE || nt === CDATA_SECTION_NODE) kid = doc.createTextNode(child.data);
else if (nt === PROCESSING_INSTRUCTION_NODE) {
kid = doc.createProcessingInstruction(child.target, child.data);
}
else if (nt === COMMENT_NODE) kid = doc.createComment(child.data);
else return; // weird XML stuff
imported.appendChild(kid);
});
return imported;
}
function copyAttr (from, to) {
for (let i = 0; i < from.attributes.length; i++) {
let n = from.attributes[i].name;
to.setAttribute(n, from.getAttribute(n));
}
}