ingenta-lens
Version:
A novel way of seeing content.
1,595 lines (1,364 loc) • 85.5 kB
JavaScript
"use strict";
var _ = require("underscore");
var util = require("../substance/util");
var errors = util.errors;
var ImporterError = errors.define("ImporterError");
var Article = require("../article");
// Constructor of the NLM to Lens converter.
//
// `options` is stored as given; any falsy value selects
// NlmToLensConverter.DefaultOptions instead.
var NlmToLensConverter = function(options) {
  if (options) {
    this.options = options;
  } else {
    this.options = NlmToLensConverter.DefaultOptions;
  }
};
NlmToLensConverter.Prototype = function() {
// Maps inline element names to the annotation type used for them.
// Every key listed here is reported as an annotation by isAnnotation().
// NOTE(review): "xref" and "named-content" map to an empty string; presumably
// their concrete type is resolved elsewhere (e.g. via _refTypeMapping) -- confirm.
this._annotationTypes = {
"bold": "strong",
"italic": "emphasis",
"monospace": "code",
"sub": "subscript",
"sup": "superscript",
"sc": "custom_annotation",
"roman": "custom_annotation",
"sans-serif": "custom_annotation",
"styled-content": "custom_annotation",
"underline": "underline",
"ext-link": "link",
"xref": "",
"email": "link",
"named-content": "",
"inline-formula": "inline-formula",
"uri": "link"
};
// Element names that isInlineNode() reports as inline nodes.
this._inlineNodeTypes = {
"fn": true,
};
// Mapping from an xref's reference type to the node type of the reference
// that is created for it.
// NOTE(review): the lookup key is presumably the value of the xref's
// 'ref-type' attribute -- the code using this table is outside this section.
this._refTypeMapping = {
"bibr": "citation_reference",
"fig": "figure_reference",
"table": "figure_reference",
"supplementary-material": "figure_reference",
"other": "figure_reference",
"list": "definition_reference",
"fn": "footnote_reference",
};
// Mapping of the 'contrib-type' attribute value to a human readable name.
// Used by contributor() to fill 'contributor_type'; unknown types yield undefined.
// Can be overridden in a specialized converter.
// NOTE(review): the misspelled keys ("autahor", "auther") look like deliberate
// tolerance for faulty input XML -- confirm before removing them.
this._contribTypeMapping = {
"author": "Author",
"author non-byline": "Author",
"autahor": "Author",
"auther": "Author",
"editor": "Editor",
"guest-editor": "Guest Editor",
"group-author": "Group Author",
"collab": "Collaborator",
"reviewed-by": "Reviewer",
"nominated-by": "Nominator",
"corresp": "Corresponding Author",
"other": "Other",
"assoc-editor": "Associate Editor",
"associate editor": "Associate Editor",
"series-editor": "Series Editor",
"contributor": "Contributor",
"chairman": "Chairman",
"monographs-editor": "Monographs Editor",
"contrib-author": "Contributing Author",
"organizer": "Organizer",
"chair": "Chair",
"discussant": "Discussant",
"presenter": "Presenter",
"guest-issue-editor": "Guest Issue Editor",
"participant": "Participant",
"translator": "Translator"
};
// Tells whether `type` (an element name) has an entry in _annotationTypes.
// Entries with an empty-string value still count as annotations.
this.isAnnotation = function(type) {
  var mapped = this._annotationTypes[type];
  return typeof mapped !== "undefined";
};
// Tells whether `type` (an element name) has an entry in _inlineNodeTypes.
this.isInlineNode = function(type) {
  var entry = this._inlineNodeTypes[type];
  return typeof entry !== "undefined";
};
// Tells whether `node` contains only text and annotation elements, i.e.
// whether its content could be represented by a single paragraph.
//
// Returns false as soon as a child element is found that is not an annotation
// (see isAnnotation), true otherwise.
this.isParagraphish = function(node) {
  var children = node.childNodes;
  for (var i = 0; i < children.length; i++) {
    var child = children[i];
    // Only elements carry a tagName. Text nodes, comments, CDATA sections and
    // processing instructions cannot disqualify the node and must be skipped,
    // otherwise `tagName.toLowerCase()` throws on them.
    if (child.nodeType !== Node.ELEMENT_NODE) continue;
    if (!this.isAnnotation(child.tagName.toLowerCase())) return false;
  }
  return true;
};
// Default implementation: accepts every input and always returns true.
// NOTE(review): presumably specialized converters override this to decide
// whether they are responsible for a given XML document / URL -- confirm.
this.test = function(xml, documentUrl) {
/* jshint unused:false */
return true;
};
// Helpers
// --------
// Builds a display name from a <name> element.
//
// Result is "<given-names> <surname>", followed by ", <suffix>" when a
// non-blank <suffix> is present. Returns "N/A" when no element is given.
this.getName = function(nameEl) {
  if (!nameEl) {
    return "N/A";
  }
  var surname = nameEl.querySelector("surname");
  var givenNames = nameEl.querySelector("given-names");
  var suffixEl = nameEl.querySelector("suffix");

  var parts = [];
  if (givenNames) parts.push(givenNames.textContent);
  if (surname) parts.push(surname.textContent);
  var fullName = parts.join(" ");

  var hasSuffix = Boolean(suffixEl) && suffixEl.textContent.trim() !== "";
  // the suffix is appended untrimmed, only the emptiness check trims
  return hasSuffix ? fullName + ", " + suffixEl.textContent : fullName;
};
// Serializes `el` (including the element itself) to a markup string by
// cloning it into a temporary DIV and reading the DIV's innerHTML.
// Returns "" when no element is given.
this.toHtml = function(el) {
  if (!el) {
    return "";
  }
  var wrapper = document.createElement("DIV");
  var copy = el.cloneNode(true);
  wrapper.appendChild(copy);
  return wrapper.innerHTML;
};
// Serializes `el` via toHtml() and strips the 'mml:' prefix from all
// opening and closing tags.
this.mmlToHtmlString = function(el) {
  return this.toHtml(el).replace(/<(\/)?mml:([^>]+)>/g, "<$1$2>");
};
// Returns an array of those elements matching `selector` whose parent
// element is `scopeEl` itself.
//
// Note: the ':scope' pseudo class would be the proper selector based solution
// but lacks browser support, so all matches are queried and then filtered.
this.selectDirectChildren = function(scopeEl, selector) {
  var candidates = scopeEl.querySelectorAll(selector);
  return _.filter(candidates, function(candidate) {
    return candidate.parentElement === scopeEl;
  });
};
// ### The main entry point for starting an import
// Converts the given input into a document.
//
// `input` is either an XML string, which is parsed with DOMParser, or
// anything else, which is taken as an already parsed XML document
// (e.g. what jQuery's get("<file>.xml") delivers).
// Returns whatever this.document() returns for the created state.
this.import = function(input) {
  var xmlDoc;
  if (_.isString(input)) {
    var parser = new DOMParser();
    xmlDoc = parser.parseFromString(input, "text/xml");
  } else {
    xmlDoc = input;
  }
  this.sanitizeXML(xmlDoc);
  // Creating the output document via a factory, so that a specialized
  // importer can instantiate a specialized document type
  var doc = this.createDocument();
  // For debug purposes only. Guarded, as an unconditional assignment throws
  // a ReferenceError wherever no global `window` exists.
  if (typeof window !== "undefined") {
    window.doc = doc;
  }
  // A deliverable state which makes this importer stateless
  var state = this.createState(xmlDoc, doc);
  // Note: all other methods are called correspondingly
  return this.document(state, xmlDoc);
};
// Hook for dealing with inconsistent XML.
// The default implementation does nothing. It is called by import() with the
// parsed XML document before any conversion takes place.
// When overriding this function in your custom converter you can solve such
// issues in a preprocessing step instead of adding hacks to the main
// converter code.
this.sanitizeXML = function(xmlDoc) {
/* jshint unused:false */
};
// Factory for the conversion state: creates a NlmToLensConverter.State for
// this converter, the XML document and the output document.
this.createState = function(xmlDoc, doc) {
return new NlmToLensConverter.State(this, xmlDoc, doc);
};
// Overridden to create a Lens Article instance
// Factory for the output document; the default is a new Article.
this.createDocument = function() {
  return new Article();
};
// Calls this.showNode(state, node) for every entry of `nodes`, in order.
this.show = function(state, nodes) {
  var self = this;
  _.each(nodes, function(node) {
    self.showNode(state, node);
  });
};
// Builds a date string from a date element.
//
// The result consists of the text of <year>, <month> and <day>, joined by
// '-'; month and day are only included when present.
// Returns null when no element is given or when it has no <year> child.
this.extractDate = function(dateEl) {
  if (!dateEl) return null;
  var year = dateEl.querySelector("year");
  // A date without a year cannot be represented; bail out like for a missing
  // element instead of failing on `year.textContent`.
  if (!year) return null;
  var month = dateEl.querySelector("month");
  var day = dateEl.querySelector("day");
  var res = [year.textContent];
  if (month) res.push(month.textContent);
  if (day) res.push(day.textContent);
  return res.join("-");
};
// Creates the 'publication_info' node from the article's meta data, shows it
// at position 0 of the "info" view and finally hands it to
// enhancePublicationInfo() so that specialized converters can complement it.
//
// Expects <article-meta> to exist: it is dereferenced without a check.
// Within this.article() that is ensured by extractArticleMeta(), which runs
// before and throws when the element is missing.
this.extractPublicationInfo = function(state, article) {
var doc = state.doc;
var articleMeta = article.querySelector("article-meta");
var pubDate = articleMeta.querySelector("pub-date");
var history = articleMeta.querySelectorAll("history date");
// Journal title
//
var journalTitle = article.querySelector("journal-title");
// DOI
//
// <article-id pub-id-type="doi">10.7554/eLife.00003</article-id>
var articleDOI = article.querySelector("article-id[pub-id-type=doi]");
// Related article if exists
//
// TODO: can't there be more than one?
var relatedArticle = article.querySelector("related-article");
// Article information
// (creates the 'articleinfo' node as a side effect)
var articleInfo = this.extractArticleInfo(state, article);
// Funding information
var fundingInfo = this.extractFundingInfo(state, article);
// Create PublicationInfo node
// ---------------
var pubInfoNode = {
"id": "publication_info",
"type": "publication_info",
"published_on": this.extractDate(pubDate),
"journal": journalTitle ? journalTitle.textContent : "",
"related_article": relatedArticle ? relatedArticle.getAttribute("xlink:href") : "",
"doi": articleDOI ? articleDOI.textContent : "",
"article_info": articleInfo.id,
"funding_info": fundingInfo,
// TODO: 'article_type' should not be optional; we need to find a good default implementation
"article_type": "",
// Optional fields not covered by the default implementation
// Implement config.enhancePublication() to complement the data
// TODO: think about how we could provide good default implementations
"keywords": [],
"links": [],
"subjects": [],
"supplements": [],
"history": [],
// TODO: it seems messy to have this in the model
// Instead it would be cleaner to add 'custom': 'object' field
"research_organisms": [],
// TODO: this is in the schema, but seems to be unused
"provider": "",
};
// One history entry per <date> found within <history>
for (var i = 0; i < history.length; i++) {
var dateEl = history[i];
var historyEntry = {
type: dateEl.getAttribute('date-type'),
date: this.extractDate(dateEl)
};
pubInfoNode.history.push(historyEntry);
}
doc.create(pubInfoNode);
doc.show("info", pubInfoNode.id, 0);
// Note: the enhancer is called after the node has been created
this.enhancePublicationInfo(state, pubInfoNode);
};
// Creates the 'articleinfo' node, a paragraph whose children are collected
// from the individual extractors, and returns it.
//
// The extractors are invoked in this fixed order: reviewing editor, datasets,
// custom meta group (e.g. impact statement for eLife), acknowledgements,
// copyright and license, notes.
this.extractArticleInfo = function(state, article) {
  var doc = state.doc;
  // arguments are evaluated left to right, which keeps the extractor order
  var children = [].concat(
    this.extractEditor(state, article),
    this.extractDatasets(state, article),
    this.extractCustomMetaGroup(state, article),
    this.extractAcknowledgements(state, article),
    this.extractCopyrightAndLicense(state, article),
    this.extractNotes(state, article)
  );
  var articleInfo = {
    "id": "articleinfo",
    "type": "paragraph",
    "children": children
  };
  doc.create(articleInfo);
  return articleInfo;
};
// Converts every <funding-statement> of the article with annotatedText(),
// using the path ["publication_info", "funding_info", <index>], and returns
// the results as an array (empty when there are no funding statements).
this.extractFundingInfo = function(state, article) {
  var statementEls = article.querySelectorAll("funding-statement");
  return _.map(statementEls, function(statementEl, index) {
    return this.annotatedText(state, statementEl, ["publication_info", "funding_info", index]);
  }, this);
};
// Get reviewing editor
// --------------
// TODO: it is possible to have multiple editors. This does only show the first one
// However, this would be easy: just querySelectorAll and have 'Reviewing Editors' as heading when there are multiple nodes found
// Creates a heading ("Reviewing Editor") and a text node for the first
// contrib with contrib-type 'editor' and returns the ids of both nodes.
// Returns an empty array when there is no such contrib.
//
// The text consists of name, institution and country, joined by ", ".
this.extractEditor = function(state, article) {
  var doc = state.doc;
  var editorEl = article.querySelector("contrib[contrib-type=editor]");
  if (!editorEl) {
    return [];
  }

  var parts = [];
  var editorName = this.getName(editorEl.querySelector('name'));
  if (editorName) parts.push(editorName);
  _.each(["institution", "country"], function(tagName) {
    var el = editorEl.querySelector(tagName);
    if (el) parts.push(el.textContent);
  });

  var heading = {
    "type": "heading",
    "id": state.nextId("heading"),
    "level": 3,
    "content": "Reviewing Editor"
  };
  doc.create(heading);

  var text = {
    "type": "text",
    "id": state.nextId("text"),
    "content": parts.join(", ")
  };
  doc.create(text);

  return [heading.id, text.id];
};
//
// Extracts major datasets
// -----------------------
// For every <sec sec-type="datasets"> a "Major Datasets" heading is created,
// followed by the nodes this.datasets() delivers for the section's children.
// Returns the ids of all these nodes; falsy ids are dropped.
this.extractDatasets = function(state, article) {
  var doc = state.doc;
  var nodeIds = [];
  _.each(article.querySelectorAll('sec'), function(sec) {
    if (sec.getAttribute('sec-type') !== 'datasets') {
      return;
    }
    var heading = {
      "type" : "heading",
      "id" : state.nextId("heading"),
      "level" : 3,
      "content" : "Major Datasets"
    };
    doc.create(heading);
    nodeIds.push(heading.id);

    var datasetIds = this.datasets(state, util.dom.getChildren(sec));
    for (var k = 0; k < datasetIds.length; k++) {
      var datasetId = datasetIds[k];
      if (datasetId) nodeIds.push(datasetId);
    }
  }, this);
  return nodeIds;
};
// Upper-cases the first character of `str`.
// With a truthy `all`, the string is split at single spaces and the first
// character of every part is upper-cased instead.
var _capitalized = function(str, all) {
  var upperFirst = function(word) {
    return word.charAt(0).toUpperCase() + word.slice(1);
  };
  if (!all) {
    return upperFirst(str);
  }
  return str.split(' ').map(upperFirst).join(' ');
};
// Public accessor for _capitalized(): upper-cases the first character of
// `str`, or of every space separated part when `all` is truthy.
this.capitalized = function(str, all) {
return _capitalized(str, all);
};
//
// Extracts Acknowledgements
// -------------------------
// For every <ack> element a level 3 heading is created, followed by the
// nodes created from the element's children (its <title> is ignored there).
// Returns the ids of all these nodes.
//
// The heading shows the lower-cased and then capitalized <title>, or
// "Acknowledgements" when there is no title.
this.extractAcknowledgements = function(state, article) {
  var doc = state.doc;
  var nodeIds = [];
  var ackEls = article.querySelectorAll("ack");
  if (!ackEls || !(ackEls.length > 0)) {
    return nodeIds;
  }
  for (var i = 0; i < ackEls.length; i++) {
    var ackEl = ackEls[i];
    var titleEl = ackEl.querySelector('title');
    var headingText = "Acknowledgements";
    if (titleEl) {
      headingText = this.capitalized(titleEl.textContent.toLowerCase(), "all");
    }
    var heading = {
      "type" : "heading",
      "id" : state.nextId("heading"),
      "level" : 3,
      "content" : headingText
    };
    doc.create(heading);
    nodeIds.push(heading.id);
    // There may be multiple paragraphs per ack element
    var paragraphs = this.bodyNodes(state, util.dom.getChildren(ackEl), {
      ignore: ["title"]
    });
    nodeIds = nodeIds.concat(_.map(paragraphs, function(paragraph) {
      return paragraph.id;
    }));
  }
  return nodeIds;
};
//
// Extracts notes that should be shown in article info
// ------------------------------------------
//
// Default implementation: contributes no notes, i.e. returns an empty array.
// The result is appended to the children of the article info node
// (see extractArticleInfo), so overriding implementations should return node ids.
this.extractNotes = function(state, article) {
/* jshint unused:false */
return [];
};
// List of <meta-name> values; extractCustomMetaGroup() skips every
// <custom-meta> whose meta-name text is contained in this list.
// Can be overridden by a custom converter.
// TODO: Maybe switch to a whitelisting approach, so we don't show
// nonsense. See HighWire implementation
this.__ignoreCustomMetaNames = [];
// For every <custom-meta> within <article-meta> a level 3 heading showing the
// <meta-name> is created, followed by the nodes created from <meta-value>.
// Returns the ids of all these nodes.
//
// Entries whose meta-name is listed in __ignoreCustomMetaNames are skipped,
// and so are entries without a <meta-name>. An entry without <meta-value>
// contributes its heading only.
this.extractCustomMetaGroup = function(state, article) {
  var nodeIds = [];
  var doc = state.doc;
  var customMetaEls = article.querySelectorAll('article-meta custom-meta');
  for (var i = 0; i < customMetaEls.length; i++) {
    var customMetaEl = customMetaEls[i];
    var metaNameEl = customMetaEl.querySelector('meta-name');
    var metaValueEl = customMetaEl.querySelector('meta-value');
    // Without a name there is nothing to show as heading (and the ignore
    // check below would fail on `metaNameEl.textContent`).
    if (!metaNameEl) continue;
    if (_.include(this.__ignoreCustomMetaNames, metaNameEl.textContent)) continue;
    var header = {
      "type" : "heading",
      "id" : state.nextId("heading"),
      "level" : 3,
      "content" : ""
    };
    // the id must be known before converting, as it is part of the text's path
    header.content = this.annotatedText(state, metaNameEl, [header.id, 'content']);
    doc.create(header);
    // Do not pass a missing <meta-value> on to paragraphGroup()
    var bodyNodes = metaValueEl ? this.paragraphGroup(state, metaValueEl) : [];
    nodeIds.push(header.id);
    nodeIds = nodeIds.concat(_.pluck(bodyNodes, 'id'));
  }
  return nodeIds;
};
//
// Extracts Copyright and License Information
// ------------------------------------------
// Creates a "Copyright & License" heading followed by the nodes created from
// <copyright-statement> and from the <p> / <license-p> children of <license>.
// Returns the ids of all these nodes, or an empty array when the article has
// no <permissions> element.
this.extractCopyrightAndLicense = function(state, article) {
  var nodes = [];
  var doc = state.doc;
  var license = article.querySelector("permissions");
  if (!license) return nodes;

  var h1 = {
    "type" : "heading",
    "id" : state.nextId("heading"),
    "level" : 3,
    "content" : "Copyright & License"
  };
  doc.create(h1);
  nodes.push(h1.id);

  // TODO: this is quite messy. We should introduce a dedicated note for article info
  // and do that rendering related things there, e.g., '. ' separator
  var par;
  var copyright = license.querySelector("copyright-statement");
  if (copyright) {
    par = this.paragraphGroup(state, copyright);
    if (par && par.length) {
      nodes = nodes.concat(_.map(par, function(p) { return p.id; }));
      // append '.' only if there is none yet
      if (copyright.textContent.trim().slice(-1) !== '.') {
        // The separator goes to the last child of the last paragraph. That
        // child may not exist (paragraph without children) or may not carry
        // text; in these cases the separator is left out instead of failing.
        var lastPar = _.last(par);
        var textId = lastPar && lastPar.children ? _.last(lastPar.children) : undefined;
        var textNode = textId !== undefined ? doc.nodes[textId] : undefined;
        if (textNode && typeof textNode.content === "string") {
          textNode.content += ". ";
        }
      }
    }
  }

  var lic = license.querySelector("license");
  if (lic) {
    for (var child = lic.firstElementChild; child; child = child.nextElementSibling) {
      var type = util.dom.getNodeType(child);
      if (type === 'p' || type === 'license-p') {
        par = this.paragraphGroup(state, child);
        if (par && par.length) {
          nodes = nodes.concat(_.pluck(par, 'id'));
        }
      }
    }
  }
  return nodes;
};
// Creates the 'cover' node from the document node's title, abstract and
// authors, and shows it at position 0 of the "content" view.
//
// For every author a text node with the contributor's name is created,
// together with a 'contributor_reference' annotation spanning the whole name;
// this is what activates the author cards.
// Before the cover is created, enhanceCover() gets the chance to adjust it.
this.extractCover = function(state, article) {
  var doc = state.doc;
  var docNode = doc.get("document");
  var cover = {
    id: "cover",
    type: "cover",
    title: docNode.title,
    authors: [], // filled below with the ids of the created text nodes
    abstract: docNode.abstract
  };

  _.each(docNode.authors, function(contributorId) {
    var contributor = doc.get(contributorId);
    var textId = "text_" + contributorId + "_reference";
    doc.create({
      "id": textId,
      "type": "text",
      "content": contributor.name
    });
    cover.authors.push(textId);
    doc.create({
      id: state.nextId("contributor_reference"),
      type: "contributor_reference",
      path: [textId, "content"],
      range: [0, contributor.name.length],
      target: contributorId
    });
  }, this);

  // Note: publisher specific details, such as evaluating
  //   <article-categories>
  //     <subj-group subj-group-type="display-channel">
  //       <subject>Research article</subject>
  //     </subj-group>
  //     <subj-group subj-group-type="heading">
  //       <subject>Biophysics and structural biology</subject>
  //     </subj-group>
  //   </article-categories>
  // belong into the enhancer of the respective configuration.
  this.enhanceCover(state, cover, article);
  doc.create(cover);
  doc.show("content", cover.id, 0);
};
// Note: Substance.Article supports only one author.
// We use the first author found in the contribGroup for the 'creator' property.
// Converts every <contrib> found within the given <contrib-group> and
// stores the trimmed text of <on-behalf-of>, if present, as
// 'on_behalf_of' on the document.
this.contribGroup = function(state, contribGroup) {
  _.each(contribGroup.querySelectorAll("contrib"), function(contribEl) {
    this.contributor(state, contribEl);
  }, this);

  var onBehalfOfEl = contribGroup.querySelector("on-behalf-of");
  if (onBehalfOfEl) {
    state.doc.on_behalf_of = onBehalfOfEl.textContent.trim();
  }
};
// Creates an 'affiliation' node for the given <aff> element.
//
// The department is taken from <institution content-type="dept">, in which
// case the institution is the first <institution> of another content type.
// Otherwise the department is looked up in <addr-line> and the institution
// is the first <institution> found.
// Properties without a matching element are set to null.
this.affiliation = function(state, aff) {
  var doc = state.doc;
  var textOrNull = function(el) {
    return el ? el.textContent : null;
  };

  var institution;
  var department = aff.querySelector("institution[content-type=dept]");
  if (department) {
    institution = aff.querySelector("institution:not([content-type=dept])");
  } else {
    department = aff.querySelector("addr-line named-content[content-type=department]");
    institution = aff.querySelector("institution");
  }
  var country = aff.querySelector("country");
  var label = aff.querySelector("label");
  var city = aff.querySelector("addr-line named-content[content-type=city]");
  // TODO: there are a lot more elements which can have this.
  var specificUse = aff.getAttribute('specific-use');

  // TODO: this is a potential place for implementing a catch-bin
  // For that, iterate all children elements and fill into properties as needed or add content to the catch-bin
  doc.create({
    id: state.nextId("affiliation"),
    type: "affiliation",
    source_id: aff.getAttribute("id"),
    label: textOrNull(label),
    department: textOrNull(department),
    city: textOrNull(city),
    institution: textOrNull(institution),
    country: textOrNull(country),
    specific_use: specificUse || null
  });
};
// Converts a <contrib> element into a 'contributor' node, creates it and
// shows it in the "info" view.
//
// Contribs with contrib-type 'author' are additionally registered as authors
// of the document node. Properties stored as xrefs (affiliations, fundings,
// emails, ...) are filled in by extractContributorProperties().
this.contributor = function(state, contrib) {
  var doc = state.doc;
  var id = state.nextId("contributor");
  var contribNode = {
    id: id,
    source_id: contrib.getAttribute("id"),
    type: "contributor",
    name: "",
    affiliations: [],
    fundings: [],
    bio: [],
    // Not yet supported... need examples
    image: "",
    deceased: false,
    emails: [],
    contribution: "",
    members: [],
    // Initialized here, so that the check further down does not depend on
    // extractContributorProperties() (which may be overridden) setting it.
    competing_interests: []
  };
  // Extract contrib type
  var contribType = contrib.getAttribute("contrib-type");
  // Assign human readable version (undefined for unknown types)
  contribNode["contributor_type"] = this._contribTypeMapping[contribType];
  // Extract role
  var role = contrib.querySelector("role");
  if (role) {
    contribNode["role"] = role.textContent;
  }
  // Search for author bio and author image
  var bio = contrib.querySelector("bio");
  if (bio) {
    _.each(util.dom.getChildren(bio), function(par) {
      var graphic = par.querySelector("graphic");
      if (graphic) {
        var imageUrl = graphic.getAttribute("xlink:href");
        contribNode.image = imageUrl;
      } else {
        var pars = this.paragraphGroup(state, par);
        if (pars.length > 0) {
          // NOTE(review): this replaces the bio on every iteration, so only
          // the first node of the last child survives -- confirm intent.
          contribNode.bio = [ pars[0].id ];
        }
      }
    }, this);
  }
  // Deceased?
  if (contrib.getAttribute("deceased") === "yes") {
    contribNode.deceased = true;
  }
  // Extract ORCID
  // -----------------
  //
  // <uri content-type="orcid" xlink:href="http://orcid.org/0000-0002-7361-560X"/>
  var orcidURI = contrib.querySelector("uri[content-type=orcid]");
  if (orcidURI) {
    contribNode.orcid = orcidURI.getAttribute("xlink:href");
  }
  // Extract the display name: <name>, else <collab>, else "N/A"
  var nameEl = contrib.querySelector("name");
  if (nameEl) {
    contribNode.name = this.getName(nameEl);
  } else {
    var collab = contrib.querySelector("collab");
    // Assuming this is an author group
    if (collab) {
      contribNode.name = collab.textContent;
    } else {
      contribNode.name = "N/A";
    }
  }
  this.extractContributorProperties(state, contrib, contribNode);
  // HACK: for cases where no explicit xrefs are given per
  // contributor we assign all available affiliations
  if (contribNode.affiliations.length === 0) {
    contribNode.affiliations = state.affiliations;
  }
  // HACK: if author is assigned a conflict, remove the redundant
  // conflict entry "The authors have no competing interests to declare"
  // This is a data-modelling problem on the end of our input XML
  // so we need to be smart about it in the converter
  if (contribNode.competing_interests.length > 1) {
    contribNode.competing_interests = _.filter(contribNode.competing_interests, function(confl) {
      return confl.indexOf("no competing") < 0;
    });
  }
  if (contrib.getAttribute("contrib-type") === "author") {
    doc.nodes.document.authors.push(id);
  }
  doc.create(contribNode);
  doc.show("info", contribNode.id);
};
// Collects the names of all other contributors that reference the same
// element, i.e. the names belonging to every xref with rid === contribId
// whose parent node is not `contrib` itself.
//
// Names are built with getName() from the <name> within the xref's parent;
// parents without a <name> contribute "N/A".
this._getEqualContribs = function (state, contrib, contribId) {
  var result = [];
  // The value is quoted (and quotes / backslashes in it escaped), as an
  // unquoted attribute value must be a CSS identifier: ids containing e.g.
  // '.' or ':' or starting with a digit would make the selector invalid.
  var quotedId = '"' + String(contribId).replace(/(["\\])/g, "\\$1") + '"';
  var refs = state.xmlDoc.querySelectorAll("xref[rid=" + quotedId + "]");
  // Find xrefs within contrib elements
  _.each(refs, function(ref) {
    var c = ref.parentNode;
    if (c !== contrib) result.push(this.getName(c.querySelector("name")));
  }, this);
  return result;
};
// Fills those properties of `contribNode` which are stored as xrefs within
// the <contrib> element. Depending on the xref's 'ref-type':
//
// - 'aff':     affiliations (and marks the affiliation as used)
// - 'other':   fundings, taken from the referenced award group
// - 'corresp': emails, taken from the referenced corresp element
// - 'fn':      contribution, competing interests, present address and
//              equal contributors, depending on the footnote's 'fn-type'
//
// Additionally sets 'members' (for person groups), 'equal_contrib' and
// 'competing_interests' on the node.
this.extractContributorProperties = function(state, contrib, contribNode) {
  var doc = state.doc;
  var equalContribs = [];
  var compInterests = [];
  // extract affiliations etc. stored as xrefs
  var xrefs = contrib.querySelectorAll("xref");
  _.each(xrefs, function(xref) {
    var refType = xref.getAttribute("ref-type");
    if (refType === "aff") {
      var affId = xref.getAttribute("rid");
      var affNode = doc.getNodeBySourceId(affId);
      if (affNode) {
        contribNode.affiliations.push(affNode.id);
        state.used[affId] = true;
      }
    } else if (refType === "other") {
      // FIXME: it seems *very* custom to interpret every 'other' that way
      // TODO: try to find and document when this is applied
      console.log("FIXME: please add documentation about using 'other' as indicator for extracting an awardGroup.");
      var awardGroup = state.xmlDoc.getElementById(xref.getAttribute("rid"));
      if (!awardGroup) return;
      var fundingSource = awardGroup.querySelector("funding-source");
      if (!fundingSource) return;
      var awardId = awardGroup.querySelector("award-id");
      awardId = awardId ? ", "+awardId.textContent : "";
      // Funding source nodes are looking like this
      //
      // <funding-source>
      //   National Institutes of Health
      //   <named-content content-type="funder-id">http://dx.doi.org/10.13039/100000002</named-content>
      // </funding-source>
      //
      // and we only want to display the first text node, excluding the funder id.
      // They can also look like this
      //
      // <funding-source>
      //   <institution-wrap>
      //     <institution-id institution-id-type="FundRef">http://dx.doi.org/10.13039/100005156</institution-id>
      //     <institution>Alexander von Humboldt-Stiftung</institution>
      //   </institution-wrap>
      // </funding-source>
      //
      // Then we take the institution element
      var institution = fundingSource.querySelector('institution');
      // an empty <funding-source/> has no first child
      var firstChild = fundingSource.childNodes[0];
      var fundingSourceName = institution ? institution.textContent : (firstChild ? firstChild.textContent : "");
      contribNode.fundings.push([fundingSourceName, awardId].join(''));
    } else if (refType === "corresp") {
      var correspId = xref.getAttribute("rid");
      var corresp = state.xmlDoc.getElementById(correspId);
      if (!corresp) return;
      // TODO: a corresp element allows *much* more than just an email
      // Thus, we are leaving this like untouched, so that it may be grabbed by extractAuthorNotes()
      // state.used[correspId] = true;
      var email = corresp.querySelector("email");
      if (!email) return;
      contribNode.emails.push(email.textContent);
    } else if (refType === "fn") {
      var fnId = xref.getAttribute("rid");
      var fnElem = state.xmlDoc.getElementById(fnId);
      var used = true;
      if (fnElem) {
        var fnType = fnElem.getAttribute("fn-type");
        switch (fnType) {
          case "con":
            if (fnElem.getAttribute("id").indexOf("equal-contrib")>=0) {
              equalContribs = this._getEqualContribs(state, contrib, fnElem.getAttribute("id"));
            } else {
              contribNode.contribution = fnElem.textContent;
            }
            break;
          case "conflict":
            compInterests.push(fnElem.textContent.trim());
            break;
          case "present-address":
            // fall back to the footnote's own text if it is not wrapped into a <p>
            var addressPar = fnElem.querySelector("p");
            contribNode.present_address = (addressPar || fnElem).textContent;
            break;
          case "equal":
            console.log("FIXME: isn't fnElem.getAttribute(id) === fnId?");
            equalContribs = this._getEqualContribs(state, contrib, fnElem.getAttribute("id"));
            break;
          case "other":
            // HACK: sometimes equal contribs are encoded as 'other' plus special id
            console.log("FIXME: isn't fnElem.getAttribute(id) === fnId?");
            if (fnElem.getAttribute("id").indexOf("equal-contrib")>=0) {
              equalContribs = this._getEqualContribs(state, contrib, fnElem.getAttribute("id"));
            } else {
              used = false;
            }
            break;
          default:
            used = false;
        }
        if (used) state.used[fnId] = true;
      }
    } else {
      // TODO: this is a potential place for implementing a catch-bin
      // For that, we could push the content of the referenced element into the contrib's catch-bin
      console.log("Skipping contrib's xref", xref.textContent);
    }
  }, this);

  // Drop the generic "no competing interests" statement if there is more
  // than one entry
  if (compInterests.length > 1) {
    compInterests = _.filter(compInterests, function(confl) {
      return confl.indexOf("no competing") < 0;
    });
  }

  // Extract member list for person group
  // eLife specific?
  // ----------------
  var memberList = contrib.querySelector("xref[ref-type=other]");
  if (memberList) {
    var memberListId = memberList.getAttribute("rid");
    // An attribute selector with a quoted value is used instead of '#<id>',
    // as ids containing e.g. '.' or starting with a digit are no valid
    // id selectors and would make querySelectorAll() throw.
    var quotedId = '"' + String(memberListId).replace(/(["\\])/g, "\\$1") + '"';
    var members = state.xmlDoc.querySelectorAll("[id=" + quotedId + "] contrib");
    contribNode.members = _.map(members, function(m) {
      return this.getName(m.querySelector("name"));
    }, this);
  }
  contribNode.equal_contrib = equalContribs;
  contribNode.competing_interests = compInterests;
};
// Parser
// --------
// These methods are used to process XML elements in
// using a recursive-descent approach.
// ### Top-Level function that takes a full NLM tree
// Note: a specialized converter can derive this method and
// add additional pre- or post-processing.
// Converts the XML document and returns the resulting document (state.doc).
//
// Throws an ImporterError when there is no <article> element.
// After the conversion of the article and the post-processing all
// containers of the document are rebuilt to ensure consistent views.
this.document = function(state, xmlDoc) {
  var doc = state.doc;
  var articleEl = xmlDoc.querySelector("article");
  if (!articleEl) {
    throw new ImporterError("Expected to find an 'article' element.");
  }

  // recursive-descent for the main body of the article
  this.article(state, articleEl);
  this.postProcess(state);

  var rebuild = function(container) {
    container.rebuild();
  };
  _.each(doc.containers, rebuild);
  return doc;
};
// Runs after the whole article has been converted (see this.document()).
// The default implementation only creates the collected annotations.
this.postProcess = function(state) {
this.postProcessAnnotations(state);
};
// Creates all annotations collected in state.annotations.
//
// This is done afterwards to make sure that all referenced nodes are
// available: an annotation's target is a source id, which is replaced here
// by the id of the node created for it.
this.postProcessAnnotations = function(state) {
  for (var idx = 0; idx < state.annotations.length; idx++) {
    var annotation = state.annotations[idx];
    var targetNode = annotation.target ? state.doc.getNodeBySourceId(annotation.target) : null;
    // NOTE: a missing target node is silently accepted, as this occurs
    // frequently (e.g. for inline formulas); the target is left as it is then
    if (targetNode) {
      annotation.target = targetNode.id;
    }
    state.doc.create(annotation);
  }
};
// Article
// --------
// Does the actual conversion.
//
// Note: this is implemented as lazy as possible (ALAP) and will be extended as demands arise.
//
// If you need such an element supported:
// - add a stub to this class (empty body),
// - add code to call the method to the appropriate function,
// - and implement the handler here if it can be done in general way
// or in your specialized importer.
// Converts the <article> element by running the individual extraction steps.
//
// The order of the steps matters: e.g. affiliations are extracted before the
// contributors looking them up, contributors before the cover showing them,
// and extractArticleMeta() (which throws without <article-meta>) before
// extractPublicationInfo() relying on that element.
// Finally enhanceArticle() is called as a hook for specialized converters.
this.article = function(state, article) {
var doc = state.doc;
// Assign id
var articleId = article.querySelector("article-id");
// Note: Substance.Article does only support one id
if (articleId) {
doc.id = articleId.textContent;
} else {
// if no id was set we create a random one
doc.id = util.uuid();
}
// Extract glossary
this.extractDefinitions(state, article);
// Extract authors etc.
this.extractAffilitations(state, article);
this.extractContributors(state, article);
// Same for the citations, also globally
this.extractCitations(state, article);
// Make up a cover node
this.extractCover(state, article);
// Extract ArticleMeta
// (note: this assigns doc.id once more, see articleIds())
this.extractArticleMeta(state, article);
// Populate Publication Info node
this.extractPublicationInfo(state, article);
var body = article.querySelector("body");
if (body) {
this.body(state, body);
}
this.extractFigures(state, article);
// catch all unhandled foot-notes
this.extractFootNotes(state, article);
// Extract back element, if it exists
var back = article.querySelector("back");
if (back){
this.back(state,back);
}
this.enhanceArticle(state, article);
};
// Creates a 'definition' node for every <def-item> of the XML document and
// shows it in the "definitions" view.
//
// The node id is the id of the <def> element, falling back to its 'hwp:id'
// attribute (MCP articles don't have def.id set) and finally to a generated id.
// Items lacking a <term> or a <def> are skipped.
this.extractDefinitions = function(state /*, article*/) {
  var defItems = state.xmlDoc.querySelectorAll("def-item");
  _.each(defItems, function(defItem) {
    var term = defItem.querySelector("term");
    var def = defItem.querySelector("def");
    // An incomplete item cannot be turned into a definition; skipping it
    // keeps one malformed entry from aborting the whole import.
    if (!term || !def) return;
    var id = def.id || def.getAttribute("hwp:id") || state.nextId('definition');
    var definitionNode = {
      id: id,
      type: "definition",
      title: term.textContent,
      description: def.textContent
    };
    state.doc.create(definitionNode);
    state.doc.show("definitions", definitionNode.id);
  });
};
// #### Front.ArticleMeta
//
// Converts the supported parts of <article-meta>: article ids, title group,
// publication dates and abstracts (in this order).
// Throws an ImporterError when the article has no <article-meta>.
//
// Not supported yet:
//  <trans-abstract> Translated Abstract, zero or more
//  <kwd-group> Keyword Group, zero or more
//  <conference> Conference Information, zero or more
//  <counts> Counts, zero or one
//  <custom-meta-group> Custom Metadata Group, zero or one
this.extractArticleMeta = function(state, article) {
  var metaEl = article.querySelector("article-meta");
  if (!metaEl) {
    throw new ImporterError("Expected element: 'article-meta'");
  }

  // <article-id> Article Identifier, zero or more
  this.articleIds(state, metaEl.querySelectorAll("article-id"));

  // <title-group> Title Group, zero or one
  var titleGroupEl = metaEl.querySelector("title-group");
  if (titleGroupEl) this.titleGroup(state, titleGroupEl);

  // <pub-date> Publication Date, zero or more
  this.pubDates(state, metaEl.querySelectorAll("pub-date"));

  // <abstract> Abstract, zero or more
  this.abstracts(state, metaEl);
};
// Converts every <aff> element found within the article.
this.extractAffilitations = function(state, article) {
  _.each(article.querySelectorAll("aff"), function(affEl) {
    this.affiliation(state, affEl);
  }, this);
};
// Converts the first <contrib-group> found within <article-meta>, if any.
//
// TODO: the spec says, that there may be any combination of
// 'contrib-group', 'aff', 'aff-alternatives', and 'x'
// However, in the articles seen so far, these were sub-elements of
// 'contrib-group', which itself was single
this.extractContributors = function(state, article) {
  var groupEl = article.querySelector("article-meta contrib-group");
  if (!groupEl) {
    return;
  }
  this.contribGroup(state, groupEl);
};
// Catch-all implementation for figures et al.
// Converts all figure-ish content, i.e. <fig>, <table-wrap>,
// <supplementary-material> and <media mimetype="video">, which has not been
// converted yet (elements flagged with `_converted` are skipped), and shows
// the created nodes.
//
// NOTE: We previously only considered figures within <body> but since
// appendices can also have figures we now use a global selector.
this.extractFigures = function(state, xmlDoc) {
  var candidates = xmlDoc.querySelectorAll("fig, table-wrap, supplementary-material, media[mimetype=video]");
  var created = [];
  for (var idx = 0; idx < candidates.length; idx++) {
    var el = candidates[idx];
    if (el._converted) continue;
    var node = null;
    switch (util.dom.getNodeType(el)) {
      case "fig":
        node = this.figure(state, el);
        break;
      case "table-wrap":
        node = this.tableWrap(state, el);
        break;
      case "media":
        node = this.video(state, el);
        break;
      case "supplementary-material":
        node = this.supplement(state, el);
        break;
    }
    if (node) created.push(node);
  }
  this.show(state, created);
};
// Catch-all implementation for footnotes that have not been
// converted yet.
// Converts every <fn> element of the article which is not flagged with
// `__converted__`.
this.extractFootNotes = function(state, article) {
  _.each(article.querySelectorAll('fn'), function(fnEl) {
    if (!fnEl.__converted__) {
      this.footnote(state, fnEl);
    }
  }, this);
};
// Converts the first <ref-list> found, if any.
this.extractCitations = function(state, xmlDoc) {
  var refListEl = xmlDoc.querySelector("ref-list");
  if (!refListEl) {
    return;
  }
  this.refList(state, refListEl);
};
// articleIds: array of <article-id> elements
// Sets the document id: the text of the first <article-id>, or a random id
// when there is none.
// Note: Substance.Article does only support one id
this.articleIds = function(state, articleIds) {
  var doc = state.doc;
  doc.id = articleIds.length > 0 ? articleIds[0].textContent : util.uuid();
};
// Sets the document title from <article-title>, converted with
// annotatedText() using the path ['document', 'title']; xrefs are ignored.
// Without an <article-title> the title is left untouched.
//
// Not yet supported:
//  <subtitle> Document Subtitle, zero or one
this.titleGroup = function(state, titleGroup) {
  var titleEl = titleGroup.querySelector("article-title");
  if (!titleEl) {
    return;
  }
  state.doc.title = this.annotatedText(state, titleEl, ['document', 'title'], {
    ignore: ['xref']
  });
};
// Note: Substance.Article supports no publications directly.
// Only the first <pub-date> is converted; its date is stored as
// doc.created_at. Nothing happens when `pubDates` is empty.
this.pubDates = function(state, pubDates) {
  var doc = state.doc;
  if (!(pubDates.length > 0)) return;
  var firstPubDate = this.pubDate(state, pubDates[0]);
  doc.created_at = firstPubDate.date;
};
// Converts a <pub-date> element into an object of the form { date: Date }.
// Note: this does not follow the spec but only takes the parts as it was necessary until now
// TODO: implement it thoroughly
//
// Only the direct <day>, <month> and <year> children are read; each one is
// parsed as a base-10 integer.
//  - <month> values are 1-based (1 = January), whereas the Date constructor
//    takes a 0-based month index, so the month is shifted by one.
//  - A missing <month> or <day> falls back to January / the 1st. The former
//    -1 placeholders made the Date constructor roll back into the previous
//    month or year.
//  - A missing <year> keeps the historical placeholder -1.
//  - NOTE(review): a non-numeric value (e.g. a month name) parses to NaN and
//    yields an Invalid Date; this is unchanged.
this.pubDate = function(state, pubDate) {
  var day = 1;
  var month = 1;
  var year = -1;
  var parts = util.dom.getChildren(pubDate);
  for (var i = 0; i < parts.length; i++) {
    var el = parts[i];
    var value = parseInt(el.textContent, 10);
    switch (util.dom.getNodeType(el)) {
      case "day":
        day = value;
        break;
      case "month":
        month = value;
        break;
      case "year":
        year = value;
        break;
      default:
        // other children (e.g. <season>) are not supported yet
        break;
    }
  }
  return {
    // Date expects a 0-based month index
    date: new Date(year, month - 1, day)
  };
};
// <abstract> Abstract, zero or more
// Converts every <abstract> found below `articleMeta` via this.abstract.
this.abstracts = function(state, articleMeta) {
  var abstractEls = articleMeta.querySelectorAll("abstract");
  for (var i = 0; i < abstractEls.length; i++) {
    this.abstract(state, abstractEls[i]);
  }
};
// TODO: abstract should be a dedicated node
// as it can have some extra information in JATS, such as specific-use
//
// Shows an <abstract> as a level-1 heading followed by its converted content.
// The heading text is the abstract's <title>, or "Abstract" if it has none.
this.abstract = function(state, abs) {
  var titleEl = abs.querySelector("title");
  var headingText = titleEl ? titleEl.textContent : "Abstract";
  var headingNode = {
    id: state.nextId("heading"),
    type: "heading",
    level: 1,
    content: headingText
  };
  state.doc.create(headingNode);
  // with eLife there are abstracts having an object-id.
  // TODO: we should store that in the model instead of dropping it
  var contentNodes = this.bodyNodes(state, util.dom.getChildren(abs), {
    ignore: ["title", "object-id"]
  });
  // the list always contains at least the heading, so it is always shown
  this.show(state, [headingNode].concat(contentNodes));
};
// ### Article.Body
//
// Converts the direct children of <body> into nodes and shows them; an empty
// result is not shown.
this.body = function(state, body) {
  var bodyChildren = util.dom.getChildren(body);
  var converted = this.bodyNodes(state, bodyChildren);
  if (converted.length > 0) {
    this.show(state, converted);
  }
};
// Element names that this.bodyNodes does not convert itself: for these it
// calls this.ignoredNode instead of logging an "unsupported" error.
this._ignoredBodyNodes = {
// figures and table-wraps are treated globally
"fig": true,
"table-wrap": true
};
// Handler registry consulted by this.bodyNodes, keyed by element name.
// Top-level elements as they can be found in the body or
// in a section
// Note: this is also used for boxed-text elements
this._bodyNodes = {};
// Converts a list of block-level elements into an array of nodes.
//
// For every element, in order:
//  - if a handler is registered in this._bodyNodes for its type, the handler
//    is called with the converter as `this`; an array result is appended
//    element-wise, any other truthy result is appended as is, a falsy
//    result is skipped
//  - otherwise, if the type is listed in this._ignoredBodyNodes or in
//    options.ignore, this.ignoredNode is called and a truthy result is kept
//  - otherwise an error is logged and the element is dropped
//
// options.ignore (optional): array of element types to treat as ignored
this.bodyNodes = function(state, children, options) {
  var collected = [];
  for (var idx = 0; idx < children.length; idx++) {
    var el = children[idx];
    var tag = util.dom.getNodeType(el);
    var handler = this._bodyNodes[tag];
    if (handler) {
      var converted = handler.call(this, state, el);
      if (_.isArray(converted)) {
        collected = collected.concat(converted);
      } else if (converted) {
        collected.push(converted);
      }
      continue;
    }
    var isIgnored = this._ignoredBodyNodes[tag] ||
      (options && options.ignore && options.ignore.indexOf(tag) >= 0);
    if (!isIgnored) {
      console.error("Node not supported as block-level element: " + tag +"\n"+el.outerHTML);
      continue;
    }
    // Note: here are some node types ignored which are
    // processed in an extra pass (figures, tables, etc.)
    var substitute = this.ignoredNode(state, el, tag);
    if (substitute) collected.push(substitute);
  }
  return collected;
};
// Default block-level handlers, keyed by element name. this.bodyNodes calls
// them with the converter as `this`; each one only forwards (state, el) to
// the matching converter method.
this._bodyNodes.p = function(state, el) {
  return this.paragraphGroup(state, el);
};
this._bodyNodes.sec = function(state, el) {
  return this.section(state, el);
};
this._bodyNodes.list = function(state, el) {
  return this.list(state, el);
};
this._bodyNodes["disp-formula"] = function(state, el) {
  return this.formula(state, el);
};
this._bodyNodes.caption = function(state, el) {
  return this.caption(state, el);
};
this._bodyNodes["boxed-text"] = function(state, el) {
  return this.boxedText(state, el);
};
this._bodyNodes["disp-quote"] = function(state, el) {
  return this.quoteText(state, el);
};
// <attrib> is converted like a paragraph
this._bodyNodes.attrib = function(state, el) {
  return this.paragraphGroup(state, el);
};
this._bodyNodes.comment = function(state, el) {
  return this.comment(state, el);
};
// Disable fig as a body node, otherwise the order of nodes in the Figures tab can be incorrect
//this._bodyNodes["fig"] = function(state, child) {
// return this.figure(state, child);
//};
// Hook called by this.bodyNodes as ignoredNode(state, node, type) for
// elements it does not convert itself. The default does nothing and yields
// undefined, so nothing is added for the element.
// Overwrite in specific converter
this.ignoredNode = function(/*state, node, type*/) {
  return undefined;
};
// Handler for <comment> elements (registered in this._bodyNodes).
// Always returns null, so this.bodyNodes adds no node for the element.
this.comment = function(/*state, comment*/) {
// TODO: this is not yet represented in the article data model
return null;
};
// Converts a <boxed-text> element into a "box" node, creates it in the
// document and returns it.
//
// The node's `children` are the ids of the converted block-level content,
// `source_id` is the element's id attribute and `label` is the text of the
// optional direct <label> child ("" if there is none).
//
// The <label> is read separately, so it is excluded from the block-level
// conversion (as this.section does for its label); otherwise bodyNodes logs
// it as an unsupported block-level element for every labelled box.
this.boxedText = function(state, box) {
  var doc = state.doc;
  // Assuming that there are no nested <boxed-text> elements
  var childNodes = this.bodyNodes(state, util.dom.getChildren(box), {
    ignore: ["label"]
  });
  var boxId = state.nextId("box");
  // Optional heading label
  var label = this.selectDirectChildren(box, "label")[0];
  var boxNode = {
    "type": "box",
    "id": boxId,
    "source_id": box.getAttribute("id"),
    "label": label ? label.textContent : "",
    "children": _.pluck(childNodes, 'id')
  };
  doc.create(boxNode);
  return boxNode;
};
// Converts a <disp-quote> element into a "quote" node, creates it in the
// document and returns it. `children` holds the ids of the converted
// block-level content; `label` is always empty.
// Assuming that there are no nested <disp-quote> elements
this.quoteText = function (state, quote) {
  var contentNodes = this.bodyNodes(state, util.dom.getChildren(quote));
  var node = {
    "type": "quote",
    "id": state.nextId("quote"),
    "source_id": quote.getAttribute("id"),
    "label": "",
    "children": _.pluck(contentNodes, 'id')
  };
  state.doc.create(node);
  return node;
};
// Converts a list of dataset elements and returns the ids of the resulting
// nodes. Only <p> elements are considered:
//  - a <p> containing a <related-object> is converted via this.indivdata
//  - any other <p> is converted via this.paragraphGroup, of which only the
//    first resulting node is kept
this.datasets = function(state, datasets) {
  var ids = [];
  for (var idx = 0; idx < datasets.length; idx++) {
    var entry = datasets[idx];
    if (util.dom.getNodeType(entry) !== 'p') continue;
    var relatedObject = entry.querySelector('related-object');
    if (relatedObject) {
      ids = ids.concat(this.indivdata(state, relatedObject));
      continue;
    }
    var group = this.paragraphGroup(state, entry);
    if (group.length > 0) ids.push(group[0].id);
  }
  return ids;
};
// Converts a <related-object> (passed in by this.datasets) into a single
// paragraph node and returns the id of that paragraph.
//
// The paragraph starts with an empty text node. Every direct child of
// `indivdata` is converted via this.paragraphGroup, and the first child of
// the first resulting node is appended to the paragraph. The parts of a
// <name> child are appended one by one, separated by ", " text nodes.
//
// A part that yields no content (e.g. an empty element, for which
// paragraphGroup returns an empty array) is skipped. Previously this was
// only guarded outside of <name>, so an empty name part threw a TypeError.
this.indivdata = function(state,indivdata) {
  var doc = state.doc;
  var self = this;
  var p1 = {
    "type" : "paragraph",
    "id" : state.nextId("paragraph"),
    "children" : []
  };
  var text1 = {
    "type" : "text",
    "id" : state.nextId("text"),
    "content" : ""
  };
  p1.children.push(text1.id);

  // Converts `el` like a paragraph and appends the first piece of content
  // of the first resulting node to p1, if there is any.
  var appendConverted = function(el) {
    var par = self.paragraphGroup(state, el);
    var first = par && par[0];
    if (first && first.children && first.children.length > 0) {
      p1.children.push(first.children[0]);
    }
  };

  var input = util.dom.getChildren(indivdata);
  for (var i = 0; i < input.length; i++) {
    var info = input[i];
    if (util.dom.getNodeType(info) !== "name") {
      appendConverted(info);
      continue;
    }
    var nameParts = util.dom.getChildren(info);
    for (var j = 0; j < nameParts.length; j++) {
      if (j > 0) {
        // The separator is created before the part is converted, which
        // keeps the id allocation order of the generated nodes stable.
        var separator = {
          "type" : "text",
          "id" : state.nextId("text"),
          "content" : ", "
        };
        doc.create(separator);
        p1.children.push(separator.id);
      }
      appendConverted(nameParts[j]);
    }
  }
  doc.create(p1);
  doc.create(text1);
  return p1.id;
};
// Converts a <sec> element into a flat array of nodes: a heading (when the
// section has a non-empty title and content) followed by the converted
// content of the section. Nested sections are handled by recursion through
// this.bodyNodes; state.sectionLevel tracks the nesting depth and is used as
// the heading level.
this.section = function(state, section) {
  // entering the section: one level deeper
  state.sectionLevel += 1;
  var sectionChildren = util.dom.getChildren(section);
  // Optional heading label
  var labelEl = this.selectDirectChildren(section, "label")[0];
  var titleEl = this.selectDirectChildren(section, 'title')[0];
  if (!titleEl) {
    console.error("FIXME: every section should have a title", this.toHtml(section));
  }
  // Recursive Descent: get all section body nodes
  var nodes = [].concat(this.bodyNodes(state, sectionChildren, {
    ignore: ["title", "label"]
  }));
  if (nodes.length === 0) {
    console.info("NOTE: skipping section without content:", titleEl ? titleEl.innerHTML : "no title");
  } else if (titleEl) {
    var headingId = state.nextId("heading");
    var headingNode = {
      id: headingId,
      source_id: section.getAttribute("id"),
      type: "heading",
      level: state.sectionLevel,
      content: this.annotatedText(state, titleEl, [headingId, 'content'])
    };
    if (labelEl) {
      headingNode.label = labelEl.textContent;
    }
    // a heading without text is neither created nor added
    if (headingNode.content.length > 0) {
      state.doc.create(headingNode);
      nodes.unshift(headingNode);
    }
  }
  // leaving the section
  state.sectionLevel -= 1;
  return nodes;
};
// Child elements of a <p> that segmentParagraphElements drops entirely.
this.ignoredParagraphElements = {
"comment": true,
"supplementary-material": true,
"fig": true,
"fig-group": true,
"table-wrap": true,
"media": true
};
// Block-level elements that may occur inside a <p>. segmentParagraphElements
// emits each of them as a block of its own, and paragraphGroup converts the
// block by calling the converter method named by `handler`.
this.acceptedParagraphElements = {
"boxed-text": {handler: "boxedText"},
"disp-quote": {handler: "quoteText"},
"list": { handler: "list" },
"disp-formula": { handler: "formula" },
};
// Inline element types of a paragraph.
// NOTE(review): within this part of the file this table is only referenced
// by a commented-out condition in segmentParagraphElements; confirm whether
// it is used elsewhere.
this.inlineParagraphElements = {
"inline-graphic": true,
"inline-formula": true,
"fn": true
};
// Segments the child nodes of a NLM <p> element into blocks:
// - children listed in this.ignoredParagraphElements are dropped and do not
//   interrupt a run of inline content
// - children listed in this.acceptedParagraphElements become a block of
//   their own: { node: child, handler: <name of the converter method> }
// - all remaining children (text, annotations, inline elements, ...) are
//   collected into { handler: "paragraph", nodes: [...] } blocks, one per
//   uninterrupted run
this.segmentParagraphElements = function(paragraph) {
  var blocks = [];
  // the paragraph block currently being filled, or null when the next
  // inline child has to open a new one
  var openParagraph = null;
  var iterator = new util.dom.ChildNodeIterator(paragraph);
  while (iterator.hasNext()) {
    var child = iterator.next();
    var type = util.dom.getNodeType(child);
    if (this.ignoredParagraphElements[type]) continue;
    // block-level types such as disp-formula are allowed within a
    // paragraph, but get pulled out to the top level
    var accepted = this.acceptedParagraphElements[type];
    if (accepted) {
      blocks.push(_.extend({node: child}, accepted));
      openParagraph = null;
      continue;
    }
    if (!openParagraph) {
      openParagraph = { handler: "paragraph", nodes: [] };
      blocks.push(openParagraph);
    }
    openParagraph.nodes.push(child);
  }
  return blocks;
};
// Converts a NLM <p> element into an array of nodes.
// A <p> may contain block-level elements which are represented flattened in
// a Substance.Article, hence the array: the children are first segmented
// into blocks (see this.segmentParagraphElements) and each block is then
// converted by the method its `handler` names. Paragraph nodes receive the
// id attribute of the <p> as source_id. Blocks that convert to a falsy
// value are left out.
this.paragraphGroup = function(state, paragraph) {
  var blocks = this.segmentParagraphElements(paragraph);
  var result = [];
  for (var idx = 0; idx < blocks.length; idx++) {
    var block = blocks[idx];
    var converted;
    if (block.handler !== "paragraph") {
      converted = this[block.handler](state, block.node);
    } else {
      converted = this.paragraph(state, block.nodes);
      if (converted) converted.source_id = paragraph.getAttribute("id");
    }
    if (converted) result.push(converted);
  }
  return result;
};
// DEPRECATED: using this handler for <p> elements is
// deprecated, as in JATS <p> can contain certain block-level
// elements. Better use this.paragraphGroup in cases where you
// convert <p> elements.
// TODO: we should refactor this and make it a 'private' helper
this.paragraph = function(state, children) {
var doc = state.doc;
// Reset whitespace handling at the beginning of a paragraph.
// I.e., whitespaces at the beginning will be removed rigorously.
state.skipWS = true;
var node = {
id: state.nextId("paragraph"),
type: "paragraph",
children: null
};
var nodes = [];
var iterator = new util.dom.ChildNodeIterator(children);
while (iterator.hasNext()) {
var child = iterator.next();
var type = util.dom.getNode