// ingenta-lens
// Version:
// A novel way of seeing content.
// 848 lines (754 loc) • 27.6 kB
// JavaScript
var _ = require('underscore');
var util = require("../../substance/util");
var LensConverter = require('../../converter');
var LensArticle = require("../../article");
var MathNodeTypes = require("./nodes");
/**
 * Lens converter with math-specific extensions.
 *
 * @param {Object} [options] Passed through to LensConverter (see Lens.Converter options).
 *   Additionally understood:
 *   - 'equationLabelSide': 'left' | 'right' (default: 'left')
 */
var MathConverter = function(options) {
  LensConverter.call(this, options);
  // fall back to left-aligned equation labels when nothing (truthy) was configured
  var configuredSide = this.options.equationLabelSide;
  this.options.equationLabelSide = configuredSide ? configuredSide : 'left';
};
MathConverter.Prototype = function MathConverterPrototype() {
var __super__ = LensConverter.prototype;
// Map the xref ref-types used for math content to the reference annotation types.
// NOTE(review): `_refTypeMapping` is not reassigned here, so these entries are written into
// whatever object `this._refTypeMapping` resolves to (presumably the one shared with
// LensConverter) — confirm that sharing is intended.
this._refTypeMapping["disp-formula"] = "formula_reference";
this._refTypeMapping["statement"] = "math_environment_reference";
// Build the extended tables on a fresh object: `_.extend(target, ...)` writes into its first
// argument, so extending `__super__.<table>` directly would also modify the parent's table.
this.acceptedParagraphElements = _.extend({}, __super__.acceptedParagraphElements, {
  "def-list": { handler: 'defList' }
});
this._annotationTypes = _.extend({}, __super__._annotationTypes, {
  "roman": "custom_annotation"
});
/**
 * Decides whether this converter is responsible for the given document.
 *
 * @param {Document} xmlDoc the parsed article XML
 * @param {String} documentUrl not used by this implementation
 * @returns {Boolean} true if the first <publisher-name> element reads exactly
 *   "American Mathematical Society"; false otherwise (including when there is no such element)
 */
this.test = function(xmlDoc, documentUrl) {
  /* jshint unused:false */
  var publisherEl = xmlDoc.querySelector("publisher-name");
  // A document without <publisher-name> is simply not ours; do not throw while probing.
  if (!publisherEl) {
    return false;
  }
  return publisherEl.textContent === "American Mathematical Society";
};
/**
 * Creates the conversion state via the super implementation and adds the
 * bookkeeping fields used by the math-specific conversion steps.
 *
 * @param {Document} xmlDoc
 * @param {Object} doc
 * @returns {Object} the extended state
 */
this.createState = function(xmlDoc, doc) {
  var baseState = __super__.createState.call(this, xmlDoc, doc);
  return _.extend(baseState, {
    // Note: This overrides the eLife behavior introducing an extra level for 'Main Text' (et al)
    sectionLevel: 0,
    // nodes are not added to the content panel instantly; they are collected here
    // so that it can be decided later whether a node should be shown in flow or not
    shownNodes: [],
    // display related information (such as environment should go into math panel)
    nodeInfo: {},
    // Relationships between environments, formulas and labels.
    // Later these are used to create 'deep' linking formula references.
    envStack: [],
    environments: {},
    formulas: {},
    labels: {},
    formulaForLabel: {},
    labelsForFormula: {},
    envForFormula: {}
  });
};
/**
 * Creates the target document.
 *
 * For the time of development the math nodes are implemented within this project,
 * so a Lens.Article is created that supports these node types
 * (`options.nodeTypes` takes precedence over the bundled MathNodeTypes).
 *
 * @returns {Object} the article, already containing an empty 'math' view
 */
this.createDocument = function() {
  var nodeTypes = this.options.nodeTypes || MathNodeTypes;
  var article = new LensArticle({ nodeTypes: nodeTypes });
  // container for the math environments
  var mathView = { type: 'view', id: 'math', nodes: [] };
  article.create(mathView);
  return article;
};
// TODO: the default implementation should be plain, i.e. not adding an extra heading 'Main Text'
// Instead the LensConverter should override this...
// ...or we should consider adding an option (if the eLife way to do it is more often applicable...)
/**
 * Converts the children of the <body> element and hands the resulting
 * nodes to show() (no extra heading is added here).
 *
 * @param {Object} state conversion state
 * @param {Element} body the <body> element
 */
this.body = function(state, body) {
  var bodyChildren = util.dom.getChildren(body);
  var converted = this.bodyNodes(state, bodyChildren);
  var hasNodes = converted.length > 0;
  if (hasNodes) {
    this.show(state, converted);
  }
};
/**
 * Converts a <statement> element, dispatching on its content type.
 *
 * @param {Object} state conversion state
 * @param {Element} statement the <statement> element
 * @returns {Object|undefined} the created node, or undefined for unsupported content types
 */
this._bodyNodes["statement"] = function(state, statement) {
  var kind = statement.getAttribute('content-type');
  if (!kind) {
    // HACK: workaround as atm there are proofs that still use @disp-level
    kind = statement.getAttribute('disp-level');
    console.error('FIXME: statement element using @disp-level instead of @content-type');
  }
  switch (kind) {
    // Math environments: thmdefinition, thmplain, thmremark
    case "theorem":
      return this.mathEnvironment(state, statement);
    // Proofs
    case "proof":
      return this.proof(state, statement);
    default:
      console.log("Unsupported statement element", kind);
      return undefined;
  }
};
/**
 * Converts a <def-list> into an 'enumeration' node with one 'enumeration-item' per
 * direct <def-item> child. The <term> becomes the item label, each direct <p> child
 * of <def> becomes a 'paragraph' node holding the converted paragraph content.
 *
 * @param {Object} state conversion state
 * @param {Element} defList the <def-list> element
 * @returns {Object} the created enumeration node (plain data as passed to doc.create)
 */
this._bodyNodes['def-list'] = this.defList = function(state, defList) {
  var enumerationNode = {
    type: 'enumeration',
    id: state.nextId('enumeration'),
    items: []
  };
  var defItems = this.selectDirectChildren(defList, 'def-item');
  for (var i = 0; i < defItems.length; i++) {
    var defItem = defItems[i];
    var term = defItem.querySelector('term');
    var def = defItem.querySelector('def');
    var enumItemNode = {
      type: 'enumeration-item',
      // TODO: using the id of the <term> here makes warnings disappear
      // which are given when seeing references to this def.
      // However, to work properly, we would need nesting support
      // for definition references, so a generated id is used for now.
      id: state.nextId('enumeration-item'),
      children: []
    };
    // convert label
    if (term) {
      enumItemNode.label = this.annotatedText(state, term, [enumItemNode.id, 'label']);
    } else {
      // NOTE(review): an empty string is assumed to be an acceptable label value — confirm
      // against the 'enumeration-item' node schema.
      console.error('def-item without term element');
      enumItemNode.label = '';
    }
    // convert content
    // TODO: is the assumption correct that def-item content is always wrapped in a p element?
    var pEls = def ? this.selectDirectChildren(def, 'p') : [];
    for (var j = 0; j < pEls.length; j++) {
      var children = this.paragraphGroup(state, pEls[j]);
      var pgroup = {
        type: 'paragraph',
        id: state.nextId('pgroup'),
        children: _.pluck(children, 'id')
      };
      state.doc.create(pgroup);
      enumItemNode.children.push(pgroup.id);
    }
    state.doc.create(enumItemNode);
    enumerationNode.items.push(enumItemNode.id);
  }
  state.doc.create(enumerationNode);
  return enumerationNode;
};
// HACK: some content contains nested <app> elements, which is not allowed;
// they are simply converted with the handler registered for <sec>.
this._bodyNodes.app = this._bodyNodes.sec;
/**
 * Intentionally a no-op: no definitions (glossary) panel is wanted.
 * TODO: we should consider making this a static configuration for lens-converter
 *
 * @returns {undefined}
 */
this.extractDefinitions = function(/*state, article*/) {
  return undefined;
};
/**
 * Converts a proof element into a 'proof' node.
 *
 * The label is the text of the first <title> found inside the element,
 * or 'Proof' when there is none.
 *
 * @param {Object} state conversion state
 * @param {Element} proofEl the proof element
 * @returns {Object} the proof node data that was passed to doc.create
 */
this.proof = function(state, proofEl) {
  // Assuming that there are no nested <boxed-text> elements
  var proofChildren = util.dom.getChildren(proofEl);
  var contentNodes = this.bodyNodes(state, proofChildren, { ignore: ["title"] });
  var titleEl = proofEl.querySelector('title');
  var label;
  if (titleEl) {
    label = titleEl.textContent;
  } else {
    label = 'Proof';
  }
  var proofNode = {
    "type": "proof",
    "id": state.nextId("proof"),
    "source_id": proofEl.getAttribute("id"),
    "label": label,
    "children": _.pluck(contentNodes, 'id')
  };
  state.doc.create(proofNode);
  return proofNode;
};
/**
 * Converts an element into a 'math_environment' node.
 *
 * Read from the element:
 *  - @style        -> envType
 *  - @specific-use -> stored in state.nodeInfo and on the created node
 *  - @id           -> source_id (falls back to the generated node id)
 *  - <label>       -> label (annotated text)
 *  - <title>       -> comment (annotated text)
 *  - everything else -> body
 *
 * While the body is converted, the environment id is on top of state.envStack
 * so that formulas can be associated with their environment.
 *
 * @param {Object} state conversion state
 * @param {Element} secNode the environment element
 * @returns {Object} the node as returned by doc.create
 */
this.mathEnvironment = function(state, secNode) {
  var envType = secNode.getAttribute('style');
  var specificUse = secNode.getAttribute('specific-use');
  var envId = state.nextId('math_environment');
  var sourceId = secNode.getAttribute('id') || envId;

  // bookkeeping to be able to associate formulas to environments
  state.envStack.push(envId);

  // TODO: are there better semantic representations?
  // I have seen 'statement' as a dedicated element for environments...
  var labelEl = secNode.querySelector('label');
  var titleEl = secNode.querySelector('title');
  var contentNodes = this.bodyNodes(state, util.dom.getChildren(secNode), {
    ignore: ["label", "title"]
  });

  var envData = {
    id: envId,
    type: "math_environment",
    source_id: sourceId,
    envType: envType,
    body: _.pluck(contentNodes, 'id')
  };
  if (labelEl) {
    envData.label = this.annotatedText(state, labelEl, [envId, 'label']);
  }
  if (titleEl) {
    envData.comment = this.annotatedText(state, titleEl, [envId, 'comment']);
  }
  if (!(labelEl || titleEl)) {
    console.error('There are cases without label and without title!');
  }

  var created = state.doc.create(envData);
  created.specificUse = specificUse;

  // keep track of the math environment for formula references
  state.environments[envId] = created;
  state.nodeInfo[envId] = { specificUse: specificUse };
  state.envStack.pop();
  return created;
};
/**
 * Converts a <disp-formula> element. For formulas that are not part of an
 * environment and that carry extracted labels, the node label is set to
 * "Equation <title>" / "Equations <titles>" built from the label titles.
 *
 * @param {Object} state conversion state
 * @param {Element} child the <disp-formula> element
 * @returns {Object} the formula node data
 */
this._bodyNodes["disp-formula"] = function(state, child) {
  var formulaNode = this.formula(state, child);
  // formulas inside an environment keep the label they have
  if (state.envForFormula[formulaNode.id]) {
    return formulaNode;
  }
  var labels = state.labelsForFormula[formulaNode.id];
  var labelIds = labels ? Object.keys(labels) : [];
  if (labelIds.length === 0) {
    return formulaNode;
  }
  // collect the non-empty titles in label order
  var titles = [];
  for (var k = 0; k < labelIds.length; k++) {
    var title = labels[labelIds[k]].title;
    if (title) {
      titles.push(title);
    }
  }
  var prefix = titles.length > 1 ? "Equations" : "Equation";
  formulaNode.label = prefix + " " + titles.join('');
  return formulaNode;
};
/**
 * Fills in the annotation data for an <xref> via the super implementation and,
 * for formula references, wraps the target into an array.
 *
 * @param {Object} state conversion state
 * @param {Object} anno the annotation being built (modified in place)
 * @param {Element} el the <xref> element
 */
this.addAnnotationDataForXref = function(state, anno, el) {
  /* jshint unused:false */
  // `apply` takes exactly (thisArg, argsArray); `arguments` already carries `el`.
  __super__.addAnnotationDataForXref.apply(this, arguments);
  // for formula_references convert the annotation target into an array
  if (anno.type === 'formula_reference') {
    anno.target = [ anno.target ];
  }
};
// Formula Node Type
// --------
// Example of a labelled resource element:
// <mml:mrow xmlns:xlink="http://www.w3.org/1999/xlink" xlink:type="resource" xlink:label="derivata.elbert"/>
/**
 * Collects the labelled resource elements below `el`.
 *
 * Every descendant matching `[*|type="resource"]` that has an `xlink:label`
 * attribute yields one entry, keyed by that label.
 *
 * @param {Element} el element to search in
 * @returns {Object} map label -> { id, role, el, title } (title defaults to "")
 */
this._extractLabels = function(el) {
  var labelsById = {};
  var resources = el.querySelectorAll('[*|type="resource"]');
  Array.prototype.forEach.call(resources, function(resourceEl) {
    var labelId = resourceEl.getAttribute('xlink:label');
    if (!labelId) {
      return;
    }
    labelsById[labelId] = {
      id: labelId,
      role: resourceEl.getAttribute('xlink:role'),
      el: resourceEl,
      title: resourceEl.getAttribute('xlink:title') || ""
    };
  });
  return labelsById;
};
/**
 * Extracts labels from TeX source by looking for `cssId{...}` occurrences.
 *
 * @param {String} tex the TeX source
 * @returns {Object} map label -> { id, role: 'equation', title: "" }
 */
this._extractLabelsFromMathJaxTex = function(tex) {
  var labelsById = {};
  var cssIdPattern = /cssId\{([^}]+)\}/g;
  // the pattern is global, so every exec() continues after the previous match
  for (var found = cssIdPattern.exec(tex); found; found = cssIdPattern.exec(tex)) {
    var labelId = found[1];
    labelsById[labelId] = {
      id: labelId,
      role: 'equation',
      title: ""
    };
  }
  return labelsById;
};
/**
 * Extracts all representations of a formula and records its labels.
 *
 * The children of the <alternatives> descendant (or of the formula element itself
 * when there is none) are converted into { format, data } entries:
 * image, svg, html, mathml or latex.
 *
 * Side effects on `state`: the labels found in the tex, svg and math representations
 * are merged into state.labels, state.formulaForLabel and state.labelsForFormula
 * (for resolving formula references later on).
 * NOTE(review): labels found in <textual-form> are extracted but not part of that merge —
 * confirm whether this is intended.
 *
 * @param {Object} state conversion state
 * @param {Element} formulaElement the formula element
 * @param {String} formulaId id of the formula node being created
 * @param {Boolean} [inline] true for inline formulas
 * @returns {Array} list of { format, data } in document order
 */
this._getFormulaData = function(state, formulaElement, formulaId, inline) {
  var result = [];
  var labels = {'tex' : {}, 'svg': {}, 'html': {}, 'math': {}};
  var el = formulaElement.querySelector('alternatives') || formulaElement;
  for (var child = el.firstElementChild; child; child = child.nextElementSibling) {
    var type = util.dom.getNodeType(child);
    switch (type) {
      case "graphic":
      case "inline-graphic":
        result.push({
          format: 'image',
          data: child.getAttribute('xlink:href')
        });
        break;
      case "svg":
        labels.svg = this._extractLabels(child);
        result.push({
          format: "svg",
          data: this.toHtml(child)
        });
        break;
      case "textual-form":
        labels.html = this._extractLabels(child);
        result.push({
          format: "html",
          // plain DOM access; no dependency on a global jQuery `$`
          data: child.textContent
        });
        break;
      case "mml:math":
      case "math":
        // HACK: make sure that mml in display-formulas has set display="block"
        // Although this should be present in proper MathML it does not really hurt to enforce it here
        if (!inline) {
          child.setAttribute("display", "block");
        }
        // override label alignment (default: left)
        var mtable = child.querySelector('mtable');
        if (mtable) {
          mtable.setAttribute('side', this.options.equationLabelSide);
        }
        var mml = this.mmlToHtmlString(child);
        labels.math = this._extractLabels(child);
        result.push({
          format: "mathml",
          data: mml
        });
        break;
      case "tex-math":
        result.push({
          format: "latex",
          data: child.textContent
        });
        labels.tex = this._extractLabelsFromMathJaxTex(child.textContent);
        break;
      case "label":
        // Skipped - is handled in this.formula().
        // Nothing is stored here: `labels` is keyed by format, so writing an entry under
        // the label text could overwrite e.g. `labels.tex`.
        break;
      default:
        console.error('Unsupported formula element of type ' + type);
    }
  }
  // do some bookkeeping to be able to look up formulas via label (for formula_references)
  var formulaLabels = _.extend(labels.tex, labels.svg, labels.math);
  _.extend(state.labels, formulaLabels);
  _.each(formulaLabels, function(l) {
    state.formulaForLabel[l.id] = formulaId;
  });
  state.labelsForFormula[formulaId] = formulaLabels;
  return result;
};
/**
 * Converts a formula element into a 'formula' node.
 *
 * Besides creating the node, this registers it in state.formulas, associates it with
 * the innermost open environment (state.envForFormula) if there is one, and stores
 * its @specific-use in state.nodeInfo.
 *
 * @param {Object} state conversion state
 * @param {Element} formulaElement the formula element
 * @param {Boolean} [inline] true for inline formulas
 * @returns {Object} the formula node data that was passed to doc.create
 */
this.formula = function(state, formulaElement, inline) {
  var formulaId = state.nextId("formula");
  var node = {
    id: formulaId,
    source_id: formulaElement.getAttribute("id"),
    type: "formula",
    label: "",
    inline: Boolean(inline),
    data: [],
    format: []
  };
  var specificUse = formulaElement.getAttribute('specific-use');

  // TODO: there could be multiple labels
  var labelEl = formulaElement.querySelector("label");
  if (labelEl) {
    node.label = labelEl.textContent;
  }

  // split the extracted representations into the two parallel arrays of the node
  var representations = this._getFormulaData(state, formulaElement, formulaId, inline);
  for (var k = 0; k < representations.length; k++) {
    var representation = representations[k];
    node.format.push(representation.format);
    node.data.push(representation.data);
  }
  state.doc.create(node);

  // do some bookkeeping to be able to look up environments via formula (for formula_references)
  state.formulas[formulaId] = node;
  if (state.envStack.length > 0) {
    state.envForFormula[formulaId] = _.last(state.envStack);
  }
  state.nodeInfo[formulaId] = { specificUse: specificUse };
  return node;
};
// Escapes a value for use in HTML text or in a double-quoted attribute value.
// null/undefined become '' (the same result Array.prototype.join gives for them).
function _escapeHtml(value) {
  if (value === null || value === undefined) {
    return '';
  }
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;');
}
// Default element handlers for convertXmlToHtml(): tag name -> function(el) returning HTML.
// `textContent` and attribute values are decoded text, so they are escaped again before
// being placed into markup (otherwise e.g. "a < b" would produce broken HTML).
var _defaultXmlToHtmlMapping = {
  'ext-link': function(el) {
    return [
      '<a class="ext-link" href="', _escapeHtml(el.getAttribute('xlink:href')), '" target="_blank">',
      '<i class="fa fa-external-link"></i> ',
      _escapeHtml(el.textContent),
      "</a>"
    ].join('');
  },
  'inline-formula': function(el) {
    return [
      '<span class="inline-formula">',
      '<span class="MathJax_Preview">',
      _escapeHtml(el.textContent),
      '</span>',
      // the content of a <script> element is not parsed as markup, so the TeX source
      // is emitted verbatim here
      '<script type="math/tex">',
      el.textContent,
      '</script>',
      '</span>'
    ].join('');
  }
};
/**
 * Serializes an XML element into an HTML string.
 *
 * For each element the lower-cased tag name is looked up in `mapping`
 * (defaults merged with the given mapping, the given one wins):
 *  - a function: its return value is used as the HTML for that element
 *  - a string: used as the HTML tag name for the wrapper element
 *  - otherwise: the element is wrapped in a <span>
 * Wrapper elements get the original tag name as CSS class; text is HTML-escaped.
 *
 * @param {Element} element root element to convert
 * @param {Object} [mapping] tag name -> handler function or HTML tag name
 * @returns {String} the HTML
 */
this.convertXmlToHtml = function(element, mapping) {
  mapping = _.extend({}, _defaultXmlToHtmlMapping, mapping);
  var str = [];
  // text node content is decoded text and must be escaped to stay valid HTML
  function _escapeText(text) {
    return String(text)
      .replace(/&/g, '&amp;')
      .replace(/</g, '&lt;')
      .replace(/>/g, '&gt;');
  }
  function _convert(element) {
    var tagName = element.tagName.toLowerCase();
    var mapped = mapping[tagName];
    if (_.isFunction(mapped)) {
      str.push(mapped(element));
      return;
    }
    // was `'span' || mapping[tagName]`, which can never pick up the mapped tag name
    var elType = _.isString(mapped) && mapped ? mapped : 'span';
    str.push('<' + elType + ' class="' + tagName + '">');
    for (var childNode = element.firstChild; childNode; childNode = childNode.nextSibling) {
      if (childNode.nodeType === Document.TEXT_NODE) {
        str.push(_escapeText(childNode.textContent));
      } else if (childNode.nodeType === Document.ELEMENT_NODE) {
        _convert(childNode);
      } else {
        console.error("Unsupported node type.", childNode.nodeType);
      }
    }
    str.push('</'+ elType + '>');
  }
  _convert(element);
  return str.join('');
};
/**
 * Converts a <ref> element into a 'plain_citation' node and shows it in the
 * "citations" panel.
 *
 * The node label is "<label text> <comma separated authors>" (trimmed), where the
 * authors are the <name> and <collab> elements of all <person-group>s of the
 * <mixed-citation>. All <raw-citation> elements are kept as { type, content }.
 *
 * @param {Object} state conversion state
 * @param {Element} ref the <ref> element
 */
this.ref = function(state, ref) {
  var citation = ref.querySelector("mixed-citation");
  var rawCitationEls = ref.querySelectorAll("raw-citation");
  var rawFormats = [];
  for (var r = 0; r < rawCitationEls.length; r++) {
    rawFormats.push({
      type: rawCitationEls[r].getAttribute('type'),
      content: rawCitationEls[r].textContent
    });
  }

  var citationId = state.nextId("plain_citation");
  var citationNode = {
    "id": citationId,
    "source_id": ref.getAttribute("id"),
    "type": "plain_citation",
    "label": "",
    "authors": [],
    "raw_formats": rawFormats,
    "content": ""
  };
  var labelEl = ref.querySelector("label");
  if (labelEl) {
    citationNode.label = labelEl.textContent;
  }

  var authors = citationNode.authors;
  var personGroups = citation.querySelectorAll("person-group");
  for (var g = 0; g < personGroups.length; g++) {
    var nameEls = personGroups[g].querySelectorAll("name");
    for (var n = 0; n < nameEls.length; n++) {
      authors.push(this.getName(nameEls[n]));
    }
    // Consider collab elements (treat them as authors)
    var collabEls = personGroups[g].querySelectorAll("collab");
    for (var c = 0; c < collabEls.length; c++) {
      authors.push(collabEls[c].textContent);
    }
  }
  citationNode.label = (citationNode.label + " " + authors.join(", ")).trim();

  // Don't treat the content at all, just make it simple HTML/CSS
  citationNode.content = this.annotatedText(state, citation, [citationId, "content"]);
  state.doc.create(citationNode);
  state.doc.show("citations", citationId);
};
/**
 * Converts an <aff> element into an 'affiliation' node and registers its id
 * in state.affiliations.
 *
 * The institution text is the concatenated text content of all child nodes
 * except <label>.
 *
 * @param {Object} state conversion state
 * @param {Element} aff the <aff> element
 */
this.affiliation = function(state, aff) {
  var labelEl = aff.querySelector("label");
  var institutionParts = [];
  for (var child = aff.firstChild; child; child = child.nextSibling) {
    var childType = util.dom.getNodeType(child);
    if (childType !== 'label' && child.textContent) {
      institutionParts.push(child.textContent);
    }
  }
  var specificUse = aff.getAttribute('specific-use');
  // TODO: we might add a property to the affiliation node that collects
  // data which is not handled here
  var affiliationNode = {
    id: state.nextId("affiliation"),
    type: "affiliation",
    source_id: aff.getAttribute("id"),
    label: labelEl ? labelEl.textContent : null,
    institution: institutionParts.join(''),
    specific_use: specificUse || null
  };
  state.affiliations.push(affiliationNode.id);
  state.doc.create(affiliationNode);
};
/**
 * Placeholder: logs a FIXME and yields no nodes.
 *
 * @param {Object} state unused
 * @param {Element} article unused
 * @returns {Array} always a new empty array
 */
this.extractAuthorImpactStatement = function(state, article) {
  /* jshint unused:false */
  var noNodes = [];
  console.error('FIXME: the default implementation is not useful and needs to be replaced.');
  return noNodes;
};
// Configuration
// -------------------
// Name of the panel that math nodes are shown in via doc.show(MATH_PANEL, ...).
// It equals the id of the 'view' node created in createDocument().
var MATH_PANEL = 'math';
/**
 * Adds raw citation formats, keywords, subjects, article type and links
 * to the 'publication_info' node, read from the article's <article-meta>.
 *
 * @param {Object} state conversion state (uses state.xmlDoc and state.doc)
 */
this.enhancePublicationInfo = function(state) {
  var self = this;
  var article = state.xmlDoc.querySelector("article");
  var articleMeta = article.querySelector("article-meta");
  var publicationInfo = state.doc.get('publication_info');

  // Converts each element into annotated text stored under
  // ["publication_info", <property>, <index>].
  function annotateAll(els, property) {
    var texts = [];
    for (var k = 0; k < els.length; k++) {
      texts.push(self.annotatedText(state, els[k], ["publication_info", property, k]));
    }
    return texts;
  }

  // Keywords, e.g.
  //   <kwd-group kwd-group-type="author-keywords">
  //     <title>Author keywords</title>
  //     <kwd>innate immunity</kwd>
  //   </kwd-group>
  var keywordEls = articleMeta.querySelectorAll("kwd-group kwd");

  // Subjects, e.g.
  //   <subj-group subj-group-type="heading">
  //     <subject>Immunology</subject>
  //   </subj-group>
  var subjectEls = articleMeta.querySelectorAll("subj-group[subj-group-type=heading] subject");

  // Article type, e.g.
  //   <subj-group subj-group-type="display-channel">
  //     <subject>Research article</subject>
  //   </subj-group>
  var articleTypeEl = articleMeta.querySelector("subj-group[subj-group-type=display-channel] subject");

  // PDF link, e.g.
  //   <self-uri content-type="pdf" xlink:href="elife00007.pdf"/>
  var pdfEl = article.querySelector("self-uri[content-type=pdf]");
  var links = pdfEl ? [{
    url: pdfEl.getAttribute("xlink:href"),
    name: "PDF",
    type: "pdf"
  }] : [];

  // Raw citation formats for the article
  var citationEls = articleMeta.querySelectorAll("article-citation");
  var rawFormats = [];
  for (var c = 0; c < citationEls.length; c++) {
    rawFormats.push({
      type: citationEls[c].getAttribute('type'),
      content: citationEls[c].textContent
    });
  }

  publicationInfo.raw_formats = rawFormats;
  publicationInfo.keywords = annotateAll(keywordEls, "keywords");
  publicationInfo.subjects = annotateAll(subjectEls, "subjects");
  publicationInfo.article_type = articleTypeEl ? articleTypeEl.textContent : "";
  publicationInfo.links = links;
};
// Overridden, as we want to show nodes in-flow ('content') depending on whether they have been
// referenced or not. This information is available after postProcessAnnotations().
// Instead of showing nodes right away we keep them for post-processing.
/**
 * Remembers the given nodes in state.shownNodes (in order of occurrence)
 * instead of putting them into a panel right away; see populatePanels().
 *
 * @param {Object} state conversion state
 * @param {Array} nodes nodes to be shown later; ignored when missing or empty
 */
this.show = function(state, nodes) {
  if (!nodes || !(nodes.length > 0)) {
    return;
  }
  state.shownNodes = state.shownNodes.concat(nodes);
};
/**
 * Final conversion step, run in two phases.
 *
 * @param {Object} state conversion state
 */
this.postProcess = function(state) {
  var converter = this;
  // Phase 1: the addressed nodes exist by now, so the annotation targets can be set properly.
  converter.resolveAnnotationTargets(state);
  // Phase 2: everything needed to decide which node is shown where is available.
  // Note: show(state, nodes) only collected the nodes in order of occurrence;
  // here they are actually put into the according panels.
  converter.populatePanels(state);
};
/**
 * Resolves the target id of a formula reference.
 *
 * `refId` is tried as a label id first (state.labels), then as a formula id
 * (state.formulas). When neither matches, an error is logged and all fields stay null.
 *
 * @param {Object} state conversion state
 * @param {String} refId the referenced id
 * @returns {Object} { envId, formulaId, labelId }
 */
function _getMathReferenceInfo(state, refId) {
  var info = { envId: null, formulaId: null, labelId: null };
  if (state.labels[refId]) {
    // label -> formula -> environment
    info.labelId = refId;
    info.formulaId = state.formulaForLabel[refId];
    info.envId = state.envForFormula[info.formulaId];
  } else if (state.formulas[refId]) {
    // formula -> environment
    info.formulaId = refId;
    info.envId = state.envForFormula[refId];
  } else {
    console.error("Could not resolve target for formula reference", refId);
  }
  return info;
}
/**
 * Post-processing: resolves the targets of all collected annotations and creates them,
 * as at this moment all information is available (e.g. referenced nodes exist).
 *
 *  - formula_reference: the target becomes [envId?, formulaId, labelId?] when the formula
 *    could be resolved; the environment (or else the formula) is marked as referenced math.
 *  - math_environment_reference: the target becomes the node id; the node is marked as
 *    referenced math.
 *  - any other annotation with a target: the target becomes the node id and the node
 *    gets `isReferenced = true`.
 * Annotations of the last two kinds whose target node cannot be found are logged and not created.
 * The set of referenced math node ids is stored in state.referencedMath.
 *
 * @param {Object} state conversion state
 */
this.resolveAnnotationTargets = function(state) {
  var doc = state.doc;
  var referencedMath = {};

  // looks a node up by its source id first and by its document id second
  function lookupNode(target) {
    return state.doc.getNodeBySourceId(target) || state.doc.get(target);
  }

  for (var idx = 0; idx < state.annotations.length; idx++) {
    var anno = state.annotations[idx];
    if (anno.target) {
      var node;
      if (anno.type === "formula_reference") {
        var ref = _getMathReferenceInfo(state, anno.target[0]);
        if (ref.formulaId) {
          var path = [];
          if (ref.envId) path.push(ref.envId);
          path.push(ref.formulaId);
          if (ref.labelId) path.push(ref.labelId);
          anno.target = path;
        }
        // the environment takes precedence over the formula itself
        var mathId = ref.envId || ref.formulaId;
        if (mathId) {
          referencedMath[mathId] = true;
        }
      } else if (anno.type === "math_environment_reference") {
        node = lookupNode(anno.target);
        if (!node) {
          console.log("Could not lookup math environment for reference", anno);
          continue;
        }
        anno.target = node.id;
        referencedMath[node.id] = true;
      } else {
        node = lookupNode(anno.target);
        if (!node) {
          console.log("Could not lookup targetNode for annotation", anno);
          continue;
        }
        anno.target = node.id;
        node.isReferenced = true;
      }
    }
    doc.create(anno);
  }

  Object.keys(referencedMath).forEach(function(mathId) {
    var mathNode = doc.get(mathId);
    if (mathNode) {
      mathNode.isReferenced = true;
    } else {
      console.warn('Referenced math node does not exist:', mathId);
    }
  });
  state.referencedMath = referencedMath;
};
/**
 * Shows a figure: always in the 'figures' panel, and additionally in 'content'
 * unless it is referenced and not anchored.
 *
 * @param {Object} state conversion state
 * @param {Object} node the figure node
 */
function _showFigure(state, node) {
  var document = state.doc;
  document.show('figures', node.id);
  var hiddenFromContent = node.isReferenced && node.position !== 'anchor';
  if (!hiddenFromContent) {
    document.show('content', node.id);
  }
}
/**
 * Shows a formula or math environment.
 *
 * The node goes into the math panel if it is referenced or has specificUse 'resource';
 * it goes into 'content' unless it is nested in another node. The body of a math
 * environment is processed recursively.
 *
 * @param {Object} state conversion state (uses doc, referencedMath, nodeInfo)
 * @param {Object} node the formula or math_environment node
 * @param {*} [nested] truthy when the node is contained in another shown node
 */
function _showFormulaOrEnvironment(state, node, nested) {
  // `doc` must be taken from the state; there is no `doc` in the enclosing scope
  var doc = state.doc;
  var info = state.nodeInfo[node.id];
  // only show formulas and environments in the math panel
  // - if they are referenced
  // - or have specificUse='resource' set explicitly
  if (state.referencedMath[node.id] ||
      (info && info.specificUse === "resource")) {
    doc.show(MATH_PANEL, node.id);
  }
  if (!nested) {
    doc.show('content', node.id);
  }
  // a math environment can have nested content
  // such as figures or environments which need
  // to be processed recursively
  if (node.type === 'math_environment') {
    _showNestedContent(state, node.body);
  }
}
/**
 * Processes the nodes nested in an environment or proof.
 *
 *  - figures are shown in the 'figures' panel; referenced, non-anchored figures are
 *    removed from `nodeIds` (the array is modified in place)
 *  - formulas and math environments are handled by _showFormulaOrEnvironment as nested nodes
 *  - ids that do not resolve to a node are reported and skipped
 *
 * @param {Object} state conversion state
 * @param {Array} nodeIds ids of the nested nodes; may be modified
 */
function _showNestedContent(state, nodeIds) {
  for (var i = 0; i < nodeIds.length; i++) {
    var nodeId = nodeIds[i];
    var node = state.doc.get(nodeId);
    if (!node) {
      console.warn('Nested node does not exist:', nodeId);
      continue;
    }
    switch (node.type) {
      case 'figure':
        // show all figures in the figures panel
        state.doc.show('figures', nodeId);
        // hide referenced and unanchored figures from the environment
        if (node.isReferenced && node.position !== 'anchor') {
          nodeIds.splice(i, 1);
          // stay on the same index, the next id has moved into it
          i--;
        }
        break;
      case 'formula':
      case 'math_environment':
        _showFormulaOrEnvironment(state, node, 'nested');
        break;
      default:
        // nothing
    }
  }
}
/**
 * Shows a proof (always in 'content' only) and then processes its nested nodes.
 *
 * @param {Object} state conversion state
 * @param {Object} node the proof node
 */
function _showProof(state, node) {
  var proofId = node.id;
  state.doc.show('content', proofId);
  var nestedIds = node.children;
  _showNestedContent(state, nestedIds);
}
/**
 * Puts the nodes collected in state.shownNodes into their panels, in order.
 * Figures, formulas, math environments and proofs get special treatment;
 * every other node type is delegated to LensConverter's showNode().
 *
 * @param {Object} state conversion state
 */
this.populatePanels = function(state) {
  var doc = state.doc;
  var pending = state.shownNodes;
  for (var idx = 0; idx < pending.length; idx++) {
    // always work on the node as stored in the document
    var node = doc.get(pending[idx].id);
    var nodeType = node.type;
    if (nodeType === 'figure') {
      _showFigure(state, node);
    } else if (nodeType === 'formula' || nodeType === 'math_environment') {
      _showFormulaOrEnvironment(state, node);
    } else if (nodeType === 'proof') {
      _showProof(state, node);
    } else {
      LensConverter.prototype.showNode.call(this, state, node);
    }
  }
};
};
// Inheritance wiring: objects created by `new MathConverter.Prototype()` have
// LensConverter.prototype as their prototype, and one such object (carrying the
// members assigned to `this` inside MathConverter.Prototype) becomes MathConverter.prototype.
MathConverter.Prototype.prototype = LensConverter.prototype;
MathConverter.prototype = new MathConverter.Prototype();
// The constructor is the module's only export.
module.exports = MathConverter;