ldpm-pubmed
Version:
pubmed as package.jsonld
1,685 lines (1,329 loc) • 48.2 kB
JavaScript
var tools = require('./tools')
, path = require('path')
, isUrl = require('is-url')
, XMLSerializer = require('xmldom').XMLSerializer
, clone = require('clone')
, _ = require('underscore')
, url = require('url');
/**
* Cf. http://jats.nlm.nih.gov/archiving/tag-library/1.1d1/index.html
*/
exports.publisher = function($journalMeta){
if(! $journalMeta) return;
var publisher = { '@type': 'Organization' };
var $publisherName = $journalMeta.getElementsByTagName('publisher-name')[0];
if($publisherName){
publisher.name = tools.cleanText($publisherName.textContent);
}
var $publisherLoc = $journalMeta.getElementsByTagName('publisher-loc')[0];
if($publisherLoc){
publisher.location = {
'@type': 'PostalAddress',
description: tools.cleanText($publisherLoc.textContent)
}
}
if(Object.keys(publisher).length>1){
return publisher;
}
};
exports.issn = function($issns){
var issn;
for(var i=0; i<$issns.length; i++){ //epub if possible because digital age
issn = tools.cleanText($issns[i].textContent);
if($issns[i].getAttribute('pub-type') === 'epub'){
return issn;
}
}
return issn;
};
exports.periodical = function($journalMeta){
if(! $journalMeta) return;
var periodical = { '@type': 'Periodical' };
var $journalTitle = $journalMeta.getElementsByTagName('journal-title')[0];
if($journalTitle){
periodical.name = tools.cleanText($journalTitle.textContent);
}
var $journalId = $journalMeta.getElementsByTagName('journal-id');
for(i=0; i<$journalId.length; i++){
var journalIdType = $journalId[i].getAttribute('journal-id-type');
if(journalIdType === 'nlm-ta'){
periodical.alternateName = tools.cleanText($journalId[i].textContent);
break;
}
}
if(!periodical.alternateName){ //try again with <abbrev-journal-title>
var $abbrevJournalTitle = $journalMeta.getElementsByTagName('abbrev-journal-title');
if($abbrevJournalTitle && $abbrevJournalTitle.length){
for(i=0; i<$abbrevJournalTitle.length; i++){
var abbrevType = $abbrevJournalTitle[i].getAttribute('abbrev-type');
if(abbrevType === 'nlm-ta'){
periodical.alternateName = tools.cleanText($abbrevJournalTitle[i].textContent);
break;
}
}
}
}
var $issn = $journalMeta.getElementsByTagName('issn');
if($issn){
periodical.issn = exports.issn($issn);
}
if(Object.keys(periodical).length > 1){
return periodical;
}
};
exports.ids = function($articleMeta){
if(! $articleMeta) return;
var ids = {};
var $articleId = $articleMeta.getElementsByTagName('article-id');
if($articleId){
Array.prototype.forEach.call($articleId, function($el){
var t = $el.getAttribute('pub-id-type');
if(t === 'doi'){
ids.doi = $el.textContent;
} else if (t === 'pmid'){
ids.pmid = $el.textContent;
} else if (t === 'pmcid'){
ids.pmcid = $el.textContent;
}
});
}
if(Object.keys(ids).length){
return ids;
}
};
exports.keywords = function($article){
if(! $article) return;
var keywords = [];
//keywords from <article-categories>
var $articleMeta = $article.getElementsByTagName('article-meta')[0];
if($articleMeta){
var $articleCategories = $articleMeta.getElementsByTagName('article-categories')[0];
if($articleCategories){
var $subjects = $articleCategories.getElementsByTagName('subject');
if($subjects && $subjects.length){
keywords = keywords.concat(Array.prototype.map.call($subjects, function($s){
return tools.cleanText($s.textContent);
}));
}
}
}
//keywords from kw
var $kws = $article.getElementsByTagName('kw');
if($kws && $kws.length){
keywords = keywords.concat(Array.prototype.map.call($kws, function($kw){
return tools.cleanText($kw.textContent);
}));
}
if(keywords.length){
return _.uniq(keywords);
}
};
exports.affiliations = function($articleMeta){
if(! $articleMeta) return;
var affiliations = {}; // affiliations are generally defined independently of authors, with keys that the author spans point to.
var $affs = $articleMeta.getElementsByTagName('aff');
if($affs){
Array.prototype.forEach.call($affs, function($aff){
var id = $aff.getAttribute('id');
if(!id) return;
var affiliation = { '@type': 'Organization' };
var desc = '';
var $institution = $aff.getElementsByTagName('institution')[0];
var $addrLine = $aff.getElementsByTagName('addr-line')[0];
var $country = $aff.getElementsByTagName('country')[0];
var $fax = $aff.getElementsByTagName('fax')[0];
var $phone = $aff.getElementsByTagName('phone')[0];
var $email = $aff.getElementsByTagName('email')[0];
if($institution){
affiliation.name = $institution.textContent;
desc = affiliation.name + '. ';
}
if($addrLine){
desc += $addrLine.textContent + '. ';
}
if($country){
affiliation.address = {
'@type': 'PostalAddress',
addressCountry: $country.textContent
};
desc += $country.textContent + '. ';
}
if($fax){
affiliation.faxNumber = $fax.textContent;
}
if($phone){
affiliation.telephone = $phone.textContent;
}
if($email){
affiliation.email = $email.textContent;
}
if(!desc){
desc = _getTextExcludingTagNames($aff, ['sup', 'label']);
}
if(desc){
affiliation.description = tools.cleanText(desc);
}
if(affiliations[id]){
affiliations[id].push(affiliation);
} else {
affiliations[id] = [affiliation];
}
});
}
if(Object.keys(affiliations).length){
return affiliations;
}
};
exports.emails = function ($articleMeta){
if(! $articleMeta) return;
var emails = {};
var $authorNotes = $articleMeta.getElementsByTagName('author-notes');
if($authorNotes){
Array.prototype.forEach.call($authorNotes, function($el){
var $corresp = $el.getElementsByTagName('corresp')[0];
var id = $corresp.getAttribute('id');
var $email = $corresp.getElementsByTagName('email')[0];
if(id && $email){
emails[id] = $email.textContent;
}
});
}
if(Object.keys(emails).length){
return emails;
}
};
//TODO: refine (see) <collab>
exports.collab = function($collab){
if(! $collab) return;
return {
'@type': 'Organization',
name: tools.cleanText($collab.textContent)
};
};
exports.personName = function($name){
if(! $name) return;
var person = { '@type': 'Person' };
var $givenNames = $name.getElementsByTagName('given-names')[0];
if($givenNames){
person.givenName = tools.cleanText($givenNames.textContent);
}
var $surname = $name.getElementsByTagName('surname')[0];
if($surname){
person.familyName = tools.cleanText($surname.textContent);
}
var $prefix = $name.getElementsByTagName('prefix')[0];
if($prefix){
person.honorificPrefix = tools.cleanText($prefix.textContent);
}
var $suffix = $name.getElementsByTagName('suffix')[0];
if($suffix){
person.honorificSuffix = tools.cleanText($suffix.textContent);
}
return person;
};
exports.allContributors = function($articleMeta){
if(! $articleMeta) return;
var affiliations = exports.affiliations($articleMeta) || {};
var emails = exports.emails($articleMeta) || {};
var allContributors = {};
var author;
var contributor = [];
var accountablePerson = [];
var editor = [];
var $contribGroups = $articleMeta.getElementsByTagName('contrib-group');
if($contribGroups){
Array.prototype.forEach.call($contribGroups, function($contribGroup){
var authCnt = 0;
Array.prototype.forEach.call($contribGroup.childNodes, function($el){
if($el.tagName === 'contrib'){
var $contrib = $el;
var contribType = $contrib.getAttribute('contrib-type');
//try to get a Person
var person = exports.personName($contrib.getElementsByTagName('name')[0] || $citation.getElementsByTagName('string-name')[0]) || { '@type': 'Person' };
var $role = $contrib.getElementsByTagName('role')[0];
if($role){
person.jobTitle = tools.cleanText($role.textContent);
}
var affiliation = [];
var email;
var corresp = !!($contrib.getAttribute('corresp') === 'yes');
var $xrefs = $contrib.getElementsByTagName('xref');
if($xrefs){
Array.prototype.forEach.call($xrefs, function($xref){
var refType = $xref.getAttribute('ref-type');
var rid = $xref.getAttribute('rid');
if(refType === 'aff'){
if(affiliations[rid]){
affiliation = affiliation.concat(affiliations[rid]);
}
} else if(refType === 'corresp'){
if(emails[rid]){
email = emails[rid];
}
corresp = true;
}
});
}
var $email = $contrib.getElementsByTagName('email')[0];
if($email){
email = $email.textContent;
}
if(email){
person.email = email
}
if(affiliation.length){
person.affiliation = affiliation;
}
var collab = exports.collab($contrib.getElementsByTagName('collab')[0]);
if(Object.keys(person).length === 1 && collab){
person = collab;
}
if(!contribType || contribType === 'author'){
if(authCnt++ === 0){
author = person;
} else {
contributor.push(person);
}
if (corresp){
accountablePerson.push(person);
}
} else if(contribType === 'editor'){
editor.push(person);
}
}
});
});
}
if(author && Object.keys(author).length > 1){
allContributors.author = author;
}
if(contributor && contributor.length){
allContributors.contributor = contributor;
}
if(editor && editor.length){
allContributors.editor = editor;
}
if(accountablePerson && accountablePerson.length){
allContributors.accountablePerson = accountablePerson;
}
if(Object.keys(allContributors).length){
return allContributors;
}
};
exports.headline = function($articleMeta){
if(! $articleMeta) return;
var $articleTitle = $articleMeta.getElementsByTagName('article-title')[0];
if($articleTitle){
return tools.cleanText($articleTitle.textContent);
}
};
exports.alternativeHeadline = function($articleMeta){
if(! $articleMeta) return;
var $altTitle = $articleMeta.getElementsByTagName('alt-title')[0];
if($altTitle){
return tools.cleanText($altTitle.textContent);
}
};
/**
* Grants are put in http://www.schema.org/sourceOrganization
*/
exports.sourceOrganization = function($article){
var sourceOrganization = [];
//case 1: <funding-statement> without <funding-source> and without <award-id>
var $fundingStatements = $article.getElementsByTagName('funding-statement');
if($fundingStatements && $fundingStatements.length){
Array.prototype.forEach.call($fundingStatements, function($fundingStatement){
var isfundingSource = $fundingStatement.getElementsByTagName('funding-source')[0];
var isAwardId = $fundingStatement.getElementsByTagName('award-id')[0];
if(!isfundingSource || !isAwardId){
sourceOrganization.push({description: tools.cleanText($fundingStatement.textContent)});
}
});
}
//case 2: <funding-source> and <award-id> the "challenge" is to group the 2 together
var tmpGrant = {};
var $fundingSources = $article.getElementsByTagName('funding-source');
if($fundingSources && $fundingSources.length){
Array.prototype.forEach.call($fundingSources, function($fundingSource){
var id = $fundingSource.getAttribute('id');
var rid = $fundingSource.getAttribute('rid');
var country = $fundingSource.getAttribute('country');
var furl = $fundingSource.getAttribute('xlink:href');
var s = {};
if(furl){
s['@id'] = furl;
}
s['@type'] = 'Organization';
s['name'] = tools.cleanText($fundingSource.textContent);
if(country){
s.address = {'@type': 'PostalAddress', addressCountry: country };
}
if(id || rid){ //we will get <award-id> with matching id or rid later
tmpGrant[id || rid] = s;
} else if((Object.keys(tmpGrant)
.filter(function(k){ return tmpGrant[k].name; })
.map(function(k){ return tmpGrant[k].name; })
.indexOf(s.name) === -1) &&
(sourceOrganization
.filter(function(x){ return x.name; })
.map(function(x){ return x.name; })
.indexOf(s.name) === -1)) {
//check if we can find an associated <award-id>
var $group = $fundingSource.parentNode;
if($group && ($group.tagName === 'funding-group' || $group.tagName === 'award-group')){
var $awardIds = $group.getElementsByTagName('award-id');
if($awardIds){
if($awardIds.length === 1){
s.grantId = tools.cleanText($awardIds[0].textContent);
} else if ($awardIds.length >1){ //multiple <award-id>, we suppose they are all attached to the funding source => can only make association if there is only 1 funding source
var $localFoundingSources = $group.getElementsByTagName('founding-source');
if($localFoundingSources && $localFoundingSources.length ===1){
s.grantId = Array.prototype.map.call($awardIds, function($awardId){
return tools.cleanText($awardId.textContent);
});
}
}
}
}
sourceOrganization.push(s);
}
});
}
//get <award-id> with id or rid or orphan ones (the one grouped in <funding-group> or <award-group> have been handled above)
var $awardIds = $article.getElementsByTagName('award-id');
if($awardIds && $awardIds.length){
Array.prototype.forEach.call($awardIds, function($awardId){
var id = $awardId.getAttribute('id') || $awardId.getAttribute('rid');
var awardId = tools.cleanText($awardId.textContent);
if(id && (id in tmpGrant)){
if(tmpGrant[id]['grantId']){
if(Array.isArray(tmpGrant[id]['grantId'])){
if(tmpGrant[id]['grantId'].indexOf(awardId) === -1){
tmpGrant[id]['grantId'].push(awardId);
}
} else if(tmpGrant[id]['grantId'] !== awardId){
tmpGrant[id]['grantId'] = [tmpGrant[id]['grantId'], awardId];
}
} else {
tmpGrant[id]['grantId'] = awardId;
}
} else { //might be orphan i.e just an <award-id> without associated <funding-source>
var isFundingSources = !! $awardId.parentNode.getElementsByTagName('funding-source')[0];
if(!isFundingSources){ //orphan
if ((Object.keys(tmpGrant)
.filter(function(k){ return tmpGrant[k].grantId; })
.map(function(k){ return tmpGrant[k].grantId; })
.indexOf(awardId) === -1) &&
(sourceOrganization
.filter(function(x){ return x.grantId; })
.map(function(x){ return x.grantId; })
.indexOf(awardId) === -1)) {
sourceOrganization.push({grantId: awardId});
}
}
}
});
}
for(var keyId in tmpGrant){
sourceOrganization.push(tmpGrant[keyId]);
}
if(sourceOrganization.length){
return sourceOrganization;
}
};
exports.citation = function($ref){
var ref = {};
var id = $ref.getAttribute('id');
if(id){
ref.name = id;
}
var $label = $ref.getElementsByTagName('label')[0];
if($label && $label.parentNode.tagName === 'ref'){
ref.alternateName = tools.cleanText($label.textContent);
}
var $citation = $ref.getElementsByTagName('mixed-citation')[0] || $ref.getElementsByTagName('element-citation')[0];
if($citation){
var $pubIds = $citation.getElementsByTagName('pub-id');
if($pubIds && $pubIds.length){
Array.prototype.forEach.call($pubIds, function($pubId){
var pubIdType = $pubId.getAttribute('pub-id-type');
if(pubIdType){ //doi, pmid...
ref[pubIdType] = $pubId.textContent;
}
});
}
//try again to get doi
if(!ref.doi){
var $comment = $citation.getElementsByTagName('comment')[0];
if($comment){
var $extLinks = $comment.getElementsByTagName('ext-link');
if($extLinks){
Array.prototype.forEach.call($extLinks, function($extLink){
var href = $extLink.getAttribute('xlink:href');
if(href && isUrl(href)){
var purl = url.parse(href);
if(purl.host === 'dx.doi.org'){
ref.doi = purl.pathname.replace(/^\//, '');
}
}
});
}
}
}
//try to get ref.url
if(ref.doi){
ref.url = 'http://dx.doi.org/' + ref.doi;
if(ref.pmid){
ref.sameAs = 'http://www.ncbi.nlm.nih.gov/pubmed/' + ref.pmid;
}
} else if(ref.pmid){
ref.url = 'http://www.ncbi.nlm.nih.gov/pubmed/' + ref.pmid;
} else {
var $extLinks = $citation.getElementsByTagName('ext-link');
if($extLinks){
for(var i=0; i<$extLinks.length; i++){
if(['uri', 'ftp'].indexOf($extLinks[i].getAttribute('ext-link-type'))>-1){
var uriHref = $extLinks[i].getAttribute('xlink:href');
if(uriHref && isUrl(uriHref)){
ref.url = uriHref;
}
}
}
}
}
var publicationType = $citation.getAttribute('publication-type');
var $articleTitle = $citation.getElementsByTagName('article-title')[0];
var $source = $citation.getElementsByTagName('source')[0];
if(publicationType === 'journal'){
ref['@type'] = 'ScholarlyArticle';
if($articleTitle){
//QUESTION: use name instead of headline (but conflict with the id)
ref.headline = tools.cleanText($articleTitle.textContent);
}
var periodical = { '@type': 'Periodical' };
if($source){ //!!<source> in an article it's the peridical name, in a book, it's the book title
periodical.name = tools.cleanText($source.textContent);
}
var $issn = $citation.getElementsByTagName('issn')
if($issn){
periodical.issn = exports.issn($issn);
}
//issue, volume, periodical, all nested...
var isPartOf;
var publicationIssue = exports.publicationIssue($citation);
if(publicationIssue){
isPartOf = publicationIssue;
}
var publicationVolume = exports.publicationVolume($citation);
if(publicationVolume){
if(publicationIssue){
publicationIssue.isPartOf = publicationVolume;
} else {
isPartOf = publicationVolume;
}
}
if(Object.keys(periodical).length >1){
if(publicationVolume){
publicationVolume.isPartOf = periodical;
} else if (publicationIssue){
publicationIssue.isPartOf = periodical;
} else {
isPartOf = periodical;
}
}
if(isPartOf){
ref.isPartOf = isPartOf;
}
} else {
if(publicationType === 'book'){
ref['@type'] = 'Book';
} else {
ref['@type'] = 'CreativeWork';
}
//TODO <chapter-title> ??
if($source){
//QUESTION: use name instead of headline (but conflict with the id)
ref.headline = tools.cleanText($source.textContent);
} else if($articleTitle){ //try again... sometimes there are no <source> but <article-title>...
ref.headline = tools.cleanText($articleTitle.textContent);
}
var $isbn = $citation.getElementsByTagName('isbn')[0];
if($isbn){
ref.isbn = tools.cleanText($articleTitle.textContent);
}
}
var $comment = $citation.getElementsByTagName('comment')[0];
if ($comment){
ref.comment = { '@type': 'Comment', text: tools.cleanText($comment.textContent) };
}
var publisher = exports.publisher($ref);
if(publisher){
ref.publisher = publisher;
}
var pageStart = exports.pageStart($ref);
if(pageStart !== undefined){
ref.pageStart = pageStart;
}
var pageEnd = exports.pageEnd($ref);
if(pageEnd !== undefined){
ref.pageEnd = pageEnd;
}
var jsDate = _getDate($citation);
if(jsDate){
try{
ref.datePublished = jsDate.toISOString();
} catch(e){};
}
//authors
var $names = $citation.getElementsByTagName('name');
if(!($names && $names.length)){
$names = $citation.getElementsByTagName('string-name');
}
var $collabs = $citation.getElementsByTagName('collab');
if($names && $names.length){
Array.prototype.forEach.call($names, function($name, i){
var person = exports.personName($name) || { '@type': 'Person' };
if(i===0){
ref.author = person;
} else {
if(!ref.contributor){
ref.contributor = [];
}
ref.contributor.push(person);
}
});
if($citation.getElementsByTagName('etal')[0]){
ref.unnamedContributors = true; //indicates that more than the listed author and contributors.
}
} else if($collabs && $collabs.length) {
Array.prototype.forEach.call($collabs, function($collab, i){
var collab = exports.collab($collab) || { '@type': 'Organization' };
if(i === 0){
ref.author = collab;
} else {
if(ref.contributor){
ref.contributor.push(collab);
} else {
ref.contributor = [ collab ];
}
}
});
}
}
if(Object.keys(ref).length){
return ref;
}
};
exports.citations = function($article){
if(!$article) return;
var citations = [];
var $back = $article.getElementsByTagName('back')[0]; //http://jats.nlm.nih.gov/archiving/tag-library/1.1d1/index.html <back>Back Matter Back matter typically contains supporting material such as an appendix, acknowledgment, glossary, or bibliographic reference list.
var $refList;
if($back){
$refList = $back.getElementsByTagName('ref-list')[0];
} else {
$refList = $article.getElementsByTagName('ref-list')[0];
}
if($refList){
var $refs = $refList.getElementsByTagName('ref');
if($refs){
Array.prototype.forEach.call($refs, function($ref){
var ref = exports.citation($ref);
if(ref){
citations.push(ref);
}
});
}
}
if(citations.length){
return citations;
}
};
exports.inlines = function($article){
if(!$article) return;
//inline content (get a list of ids from xlink:href)
var inlines = [];
//inline-formula contain inline-graphic so no need to take special case of inline-formula into account
var $inlineGraphics = $article.getElementsByTagName('inline-graphic');
if($inlineGraphics && $inlineGraphics.length){
Array.prototype.forEach.call($inlineGraphics, function($inlineGraphic){
inlines.push($inlineGraphic.getAttribute('xlink:href'));
});
}
['chem-struct-wrap', 'disp-formula'].forEach(function(inlineTag){
var $els = $article.getElementsByTagName(inlineTag);
if($els && $els.length){
Array.prototype.forEach.call($els, function($el){
var $graphic = $el.getElementsByTagName('graphic')[0];
if($graphic){
inlines.push($graphic.getAttribute('xlink:href'));
}
});
}
});
if(inlines.length){
return _.uniq(inlines);
}
};
exports.datePublished = function($articleMeta){
if(! $articleMeta) return;
var $pubDate = $articleMeta.getElementsByTagName('pub-date');
var jsDate;
for(i=0; i<$pubDate.length; i++){
var iso = $pubDate[i].getAttribute('iso-8601-date');
if(iso){
jsDate = new Date(iso);
} else {
jsDate = _getDate($pubDate[i]);
}
if($pubDate[i].getAttribute('pub-type') === 'epub' || $pubDate[i].getAttribute('publication-format') === 'electronic'){
break;
}
}
if(jsDate){
return jsDate.toISOString();
}
};
exports.publicationVolume = function($articleMeta){
if(! $articleMeta) return;
var $volume = $articleMeta.getElementsByTagName('volume')[0];
if($volume){
return {
'@type': 'PublicationVolume',
volumeNumber: tools.parseInt($volume.textContent)
};
}
};
exports.publicationIssue = function($articleMeta){
if(! $articleMeta) return;
var $issue = $articleMeta.getElementsByTagName('issue')[0];
if($issue){
return {
'@type': 'PublicationIssue',
issueNumber: tools.parseInt($issue.textContent)
};
}
};
exports.pageStart = function($articleMeta){
if(! $articleMeta) return;
var $fpage = $articleMeta.getElementsByTagName('fpage')[0];
if($fpage){
return tools.parseInt($fpage.textContent);
}
};
exports.pageEnd = function($articleMeta){
if(! $articleMeta) return;
var $lpage = $articleMeta.getElementsByTagName('lpage')[0];
if($lpage){
return tools.parseInt($lpage.textContent);
}
};
exports.pageCount = function($articleMeta){
if(! $articleMeta) return;
var $pageCount = $articleMeta.getElementsByTagName('page-count')[0];
if($pageCount){
var pageCountCount = $pageCount.getAttribute('count');
if(pageCountCount){
return tools.parseInt(pageCountCount);
}
}
};
exports.copyrightYear = function($articleMeta){
if(! $articleMeta) return;
var $copyrightYear = $articleMeta.getElementsByTagName('copyright-year')[0];
if($copyrightYear){
return parseInt($copyrightYear.textContent, 10);
}
};
exports.copyrightHolder = function($articleMeta){
if(! $articleMeta) return;
var $copyrightHolder = $articleMeta.getElementsByTagName('copyright-holder')[0];
if($copyrightHolder){
return {name: tools.cleanText($copyrightHolder.textContent)};
}
};
exports.license = function($articleMeta){
if(! $articleMeta) return;
var $license = $articleMeta.getElementsByTagName('license')[0];
if($license){
var license = {};
var licenseLink = $license.getAttribute('xlink:href');
if(licenseLink && isUrl(licenseLink)){
license.url = licenseLink;
}
var licenseType = $license.getAttribute('license-type');
if(licenseType){
license.name = licenseType;
}
var $licenseP = $license.getElementsByTagName('license-p');
if($licenseP && $licenseP.length){
license.text = tools.cleanText(Array.prototype.map.call($licenseP, function(p){ return tools.cleanText(p.textContent);}).join(' '));
}
if(Object.keys(license).length){
return license;
}
}
};
exports.abstract = function($articleMeta){
if(! $articleMeta) return;
var $abstracts = $articleMeta.getElementsByTagName('abstract');
if($abstracts && $abstracts.length){
return Array.prototype.map.call($abstracts, function($abstract){
var myAbstract = { '@type': 'Abstract' };
var abstractType = $abstract.getAttribute('abstract-type');
if(abstractType){
myAbstract.name = abstractType;
}
var $secs = $abstract.getElementsByTagName('sec'); //NOTE: can be bad if nested <sec> TODO only check childNodes
if($secs && $secs.length){ //structured abstract
var parts = Array.prototype.map.call($secs, function($sec){
var part = { '@type': 'Abstract' };
var $title = $sec.getElementsByTagName('title')[0];
if($title){
part.headline = tools.cleanText($title.textContent);
}
part.abstractBody = _getTextExcludingTagNames($sec, ['title']);
return part;
});
if(parts.length === 1){
if(parts[0].headline){
myAbstract.headline = parts[0].headline;
}
myAbstract.abstractBody = parts[0].abstractBody;
} else {
myAbstract.hasPart = parts;
}
} else {
var $title = $abstract.getElementsByTagName('title')[0];
if($title){
myAbstract.headline = tools.cleanText($title.textContent);
}
myAbstract.abstractBody = _getTextExcludingTagNames($abstract, ['title']);
}
return myAbstract;
});
}
};
exports.resources = function($article, resources){
if(! $article) return;
var resourcesMeta = _findResourcesMeta($article);
var typeMap = { 'image': 'encoding', 'audio': 'encoding', 'video': 'encoding', 'sourceCode': 'targetProduct', 'dataset': 'distribution', 'article': 'encoding' };
var sresources = {};
var allMatchedNames = {
dataset: [],
sourceCode: [],
image: [],
audio: [],
video: [],
article: []
};
resourcesMeta.forEach(function(mr, mrId){ //mr: resource with meta data (hence the m...)
var hrefs = [];
['graphic', 'media', 'code', 'table', 'si', 'inlineSi'].forEach(function(type){
if(type in mr){
mr[type].forEach(function(x){
if(x.href){
hrefs.push(x.href);
}
});
}
});
var matched = [];
Object.keys(resources).forEach(function(type){
(resources[type] || []).forEach(function(r){
var tr = {type: type, value: r};
if(_match(r, type, hrefs)){
matched.push(tr);
allMatchedNames[type].push(r.name);
}
});
});
//get **main** ldpm type (dataset, image, audio, video or sourceCode)
var mainLdpmType;
var mainTypeFromFiles;
var typesFromFiles = {};
if(matched.length){
typesFromFiles = _.countBy(matched, function(x){ return x.type; });
}
if ('dataset' in typesFromFiles){
mainTypeFromFiles = 'dataset';
} else if ('sourceCode' in typesFromFiles){
mainTypeFromFiles = 'sourceCode';
} else if ('video' in typesFromFiles){
mainTypeFromFiles = 'video';
} else if ('audio' in typesFromFiles){
mainTypeFromFiles = 'audio';
} else if ('image' in typesFromFiles){
mainTypeFromFiles = 'image';
} else if ('article' in typesFromFiles){
mainTypeFromFiles = 'article';
} else {
mainTypeFromFiles = 'dataset';
}
//assign ```mainLdpmType```
if (mr.tag === 'table-wrap'){
mainLdpmType = 'dataset';
} else if ('code' in mr){
mainLdpmType = 'sourceCode';
} else if ( ('si' in mr) || ('inlineSi' in mr) ){ //if si or inlineSi => only 1 resource (Cf. findResourcesMeta)
var mymr = (mr.si && mr.si[0]) || (mr.inlineSi && mr.inlineSi[0]);
if(mymr.mimetype.indexOf('video') > -1){
mainLdpmType = 'video';
} else if (mymr.mimetype.indexOf('audio') > -1){
mainLdpmType = 'audio';
} else if (mymr.mimetype.indexOf('image') > -1){
mainLdpmType = 'image';
} else { //rely on typesFronFiles or default to dataset
mainLdpmType = mainTypeFromFiles;
}
} else if ('media' in mr){
var typesFromMedia = _.countBy(mr.media, function(x){
if(x.mimetype.indexOf('video') > -1){
return 'video';
} else if (x.mimetype.indexOf('audio') > -1){
return 'audio';
} else if (x.mimetype.indexOf('image') > -1){
return 'image';
} else {
return '?';
}
});
if ('video' in typesFromMedia){
mainLdpmType = 'video';
} else if ('audio' in typesFromMedia){
mainLdpmType = 'audio';
} else if ('image' in typesFromMedia){
mainLdpmType = 'image';
} else {
mainLdpmType = mainTypeFromFiles;
}
} else if ('graphic' in mr){
mainLdpmType = 'image';
} else {
mainLdpmType = mainTypeFromFiles || 'dataset';
}
var sr = {}; //the new resources we are creating. All matched resource will indeed be different representation of this same resource ```sr```...
sr.name = mr.id || (matched.length && matched[0].value.name) || ('resource-' + mrId);
if(mr.label) sr.alternateName = mr.label;
if(mr.caption){
if(mr.caption.title) sr.description = mr.caption.title;
if(mr.caption.content){
if(mainLdpmType === 'image' || mainLdpmType === 'video'){
sr.caption = mr.caption.content;
} else {
sr.description = tools.cleanText([sr.description || '', mr.caption.content].join(' '));
}
}
}
if(mr.ids){
for(var key in mr.ids){
sr[key] = mr.ids[key];
}
}
if(mr.fn && mr.fn.length){
var comments = [];
mr.fn.forEach(function(c){
var comment = {'@type': 'Comment'};
if(c.id) comment.name = c.id;
if(c.label) comment.alternateName = c.alternateName;
if(c.content) comment.text = c.content;
if(Object.keys(comment).length>1){
comments.push(comment);
}
});
if(comments.length){
sr.comment = comments;
}
}
var encodings = [];
if (matched.length) {
//add type specific props !== encoding infered from ldpm init (e.g about, programmingLanguage...)
var props = Object.keys(sr);
matched.forEach(function(x){
if(x.type === mainLdpmType){
Object.keys(x.value).forEach(function(p){
if(p !== typeMap[mainLdpmType] && props.indexOf(p)=== -1){
sr[p] = clone(x.value[p]);
}
});
}
});
//fill encoding
//special case for pubmed central: 2 representation one .jpg and .gif: in this case: .gif is the thumbnail
if (mainLdpmType === 'image' &&
matched[0].type === 'image' &&
matched.length === 1 &&
matched[0].value.encoding &&
matched[0].value.encoding.length === 2 &&
( ( matched[0].value.encoding[0].encodingFormat === 'image/gif' && matched[0].value.encoding[1].encodingFormat === 'image/jpeg' ) || ( matched[0].value.encoding[0].encodingFormat === 'image/jpeg' && matched[0].value.encoding[1].encodingFormat === 'image/gif') )
){
encodings = matched[0].value.encoding.filter(function(x){ return x.encodingFormat === 'image/jpeg'; });
sr.thumbnailPath = (sr.thumbnailPath || []).concat(
matched[0].value.encoding
.filter(function(x){ return x.encodingFormat === 'image/gif'; })
.map(function(x){ return x.contentPath; })
);
} else {
matched.forEach(function(x){
if(x.type === mainLdpmType){
encodings = encodings.concat(x.value[typeMap[mainLdpmType]]);
} else if(x.type === 'image') { //treats as thumbnailPath -> ldpm will transform thumbnailPath into thumbnailUrl
var thumbnailPaths = x.value.encoding.map(function(enc){ return enc.contentPath; });
if(thumbnailPaths.length){
sr.thumbnailPath = (sr.thumbnailPath || []).concat(thumbnailPaths);
}
}
});
}
}
//add HTML representation of a table or code snippet (sampleType)
if(mainLdpmType === 'dataset'){
if (mr.table && mr.table.length){
encodings = encodings.concat(mr.table.map(function(t){
return {
contentData: t.html,
encodingFormat: 'text/html'
};
}));
};
} else if (mainLdpmType === 'sourceCode'){ //code snippet
if(mr.code && mr.code.length === 1){
Object.keys(mr.code[0]).forEach(function(key){
if (mr.code[0][key]) {
sr[key] = mr.code[0][key];
}
});
}
}
if(encodings.length){
sr[typeMap[mainLdpmType]] = encodings;
}
if(mainLdpmType in sresources){
sresources[mainLdpmType].push(sr);
} else {
sresources[mainLdpmType] = [ sr ];
}
});
Object.keys(resources).forEach(function(type){
resources[type] = resources[type]
.filter(function(x){
return allMatchedNames[type].indexOf(x.name) === -1;
})
.concat(sresources[type] || []);
if(!resources[type].length){
delete resources[type];
}
});
return resources;
};
/**
* helper functions
*/
/**
* find figure, tables, supplementary materials and their captions.
*/
function _findResourcesMeta($article){
var resources = [];
var tags = [ 'fig', 'table-wrap', 'supplementary-material' ];
tags.forEach(function(tag){
Array.prototype.forEach.call($article.getElementsByTagName(tag), function($el){
var r = {
tag: tag,
id: $el.getAttribute('id')
};
//label -> alternateName
var $label = $el.getElementsByTagName('label')[0];
if($label){
r.label = tools.cleanText($label.textContent);
}
if(r.label){
if(r.label.match(/\d+$/)){
r.num = r.label.match(/\d+$/)[0];
}
}
//caption
//<title> -> description.
//<p>s -> caption if and only if it's ONLY plain text i.e does not contain (inline-graphic or formula)
var $caption = $el.getElementsByTagName('caption')[0];
if($caption){
r.caption = {};
var $title = $caption.getElementsByTagName('title')[0];
if($title){
r.caption.title = tools.cleanText($title.textContent);
}
var $ps = $caption.getElementsByTagName('p');
if($ps && $ps.length && _isPlainText($caption)){
//TODO replace <xref ref-type="bibr" rid="pcbi.1000960-Romijn1">[24]</xref> by the description of the ref
r.caption.content = Array.prototype.map.call($ps, function($p){
return tools.cleanText($p.textContent);
}).join(' ');
r.caption.content = tools.cleanText(r.caption.content);
}
}
//DOI and co.
//We only support figure level DOIs: check that parent is ```tag``` if not discard
var $objectIds = $el.getElementsByTagName('object-id');
if($objectIds && $objectIds.length){
r.ids = {};
Array.prototype.forEach.call($objectIds, function($o){
if($o.parentNode.tagName === tag){
var pubIdType = $o.getAttribute('pub-id-type');
if(pubIdType){
r.ids[pubIdType] = tools.cleanText($o.textContent);
}
}
});
}
//footnote -> Comment
r.fn = [];
var $fns = $el.getElementsByTagName('fn');
if($fns && $fns.length){
Array.prototype.forEach.call($fns, function($fn){
if(_isPlainText($fn)){
r.fn.push(_getFn($fn));
}
});
}
//<table-wrap-foot> e.g PMC3532326 http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:3532326&metadataPrefix=pmc
//!!<fn> already parsed...
var $tableWrapFoot = $el.getElementsByTagName('table-wrap-foot')[0];
if($tableWrapFoot && _isPlainText($tableWrapFoot)){
var istableWrapFootFns = $tableWrapFoot.getElementsByTagName('fn')[0];
if(!istableWrapFootFns){
r.fn.push(_getFn($tableWrapFoot));
}
}
if(tag === 'supplementary-material' && $el.getAttribute('xlink:href')){ //if no ```xlink:href```: => will be taken into account by graphic, media and code in the ```else```. The reason is that for example, a <supplementary-material> element could contain a description of an animation, including the first frame of the animation (tagged as a <graphic> element), a caption describing the animation, and a cross-reference made to the external file that held the full animation.
r.si = [ _getIdMimeHref($el) ];
} else {
//get figure, media, table or code. <alternatives> first. If no alternative check that only 1 graphic or 1 media or 1 table or 1 code
var $alternatives = $el.getElementsByTagName('alternatives');
if($alternatives){ //filter to alternatives direct descendant of $el (to avoid the one in caption for instance)
$alternatives = Array.prototype.filter.call($alternatives, function($alt){
return !! ($alt.parentNode.tagName === tag);
});
}
if($alternatives && $alternatives.length){
['graphic', 'media', 'code', 'table'].forEach(function(mtag){
var $mtags = $alternatives[0].getElementsByTagName(mtag);
if($mtags && $mtags.length){
r[mtag] = [];
array.prototype.forEach.call($mtags, function($m){
if(mtag === 'table'){
if(_isplaintext($m)){
r[mtag].push( _getTable($m) );
}
} else if (mtag === 'code') {
r[mtag].push( _getCode($m) );
} else {
r[mtag].push( _getIdMimeHref($m) );
}
});
}
});
} else { //there must be only 1 graphic, media, table or code
var mym = [];
//put mtag that direct descendant of ```tag``` in an array and check length == 1
['graphic', 'media', 'code', 'table'].forEach(function(mtag){
var $m = $el.getElementsByTagName(mtag);
if( $m && $m.length){
for(var i=0; i<$m.length; i++){
if( $m[i].parentNode.tagName === tag ){
mym.push( { mtag: mtag, value: $m[i] } );
}
}
}
});
if(mym.length === 1){
if(mym[0].mtag === 'table'){
if(_isPlainText(mym[0].value)){
r[mym[0].mtag] = [ _getTable(mym[0].value) ];
}
} else if (mym[0].mtag === 'code'){
r[mym[0].mtag] = [ _getCode(mym[0].value) ];
} else {
r[mym[0].mtag] = [ _getIdMimeHref(mym[0].value) ];
}
}
}
}
resources.push(r);
});
});
//<inline-supplementary-material>
//@xlink:title -> description
var $inlineSupplementaryMaterials = $article.getElementsByTagName('inline-supplementary-material');
if($inlineSupplementaryMaterials && $inlineSupplementaryMaterials.length){
Array.prototype.forEach.call($inlineSupplementaryMaterials, function($sup){
resources.push({
tag: 'inline-supplementary-material',
id: $sup.getAttribute('id'),
caption: { title: $sup.getattribute('xlink:title') },
inlineSi: [ _getIdMimeHref($sup) ]
});
});
}
return resources;
};
function _getDate($node){
var jsDate;
var $day = $node.getElementsByTagName('day')[0];
var $month = $node.getElementsByTagName('month')[0];
var $year = $node.getElementsByTagName('year')[0];
var jsDate, month;
if($month){
month = $month.textContent.toLowerCase().substring(0,3);
var month2int = {
'jan': 0,
'feb': 1,
'mar': 2,
'apr': 3,
'may': 4,
'jun': 5,
'jul': 6,
'aug': 7,
'sep': 8,
'oct': 9,
'nov': 10,
'dec': 11
};
if(month in month2int){
month = month2int[month];
} else {
month -= 1; //in JS date constructor, month start at 0...
}
}
if($year && month && $day){
jsDate = Date.UTC($year.textContent, month, $day.textContent, 0, 0, 0, 0);
} else if($year && month){
jsDate = Date.UTC($year.textContent, month, 1, 0, 0, 0, 0);
} else if($year){
jsDate = Date.UTC($year.textContent, 0, 1, 0, 0, 0, 0);
}
return new Date(jsDate - 1000*5*60*60); //UTC to Eastern Time Zone (UTC-05:00)
};
function _getTextExcludingTagNames($node, tagNamesToExclude){
var txt = '';
Array.prototype.forEach.call($node.childNodes, function($el){
if(tagNamesToExclude.indexOf($el.tagName) === -1){
if($el.nodeType === 3){
txt += $el.textContent;
} else if ($el.nodeType === 1){
txt += _getTextExcludingTagNames($el, tagNamesToExclude);
}
}
});
return txt;
};
/**
* return undefined if the table contains element that cannot be serialized (e.g graphics, media, formulaes)
* TODO replace <bold> and other tags...
*/
function _getTable($table){
_removeAttributes($table);
var serializer = new XMLSerializer();
return {
id: $table.getAttribute('id'),
html: [
'<!DOCTYPE html>',
'<html>',
'<head>',
'<meta charset="utf-8">',
'</head>',
'<body>',
serializer.serializeToString($table),
'</body>',
'</html>'
].join('')
};
};
function _getIdMimeHref($el){
return {
id: $el.getAttribute('id'),
mimetype: $el.getAttribute('mimetype'),
mimeSubtype: $el.getAttribute('mime-subtype'),
href: $el.getAttribute('xlink:href')
};
};
function _getCode($code){
return {
programmingLanguage: $code.getAttribute('code-type') || $code.getAttribute('language'),
runtime: $code.getAttribute('platforms'),
sampeType: $code.textContent
};
};
function _getFn($fn){
var fn = { id: $fn.getAttribute('id') };
var $title = $fn.getElementsByTagName('title')[0];
if($title){
fn.title = tools.cleanText($title.textContent);
}
var $label = $fn.getElementsByTagName('label')[0];
if($label){
fn.label = tools.cleanText($label.textContent);
}
var $ps = $fn.getElementsByTagName('p');
if($ps && $ps.length){
fn.content = Array.prototype.map.call($ps, function($p){
return tools.cleanText($p.textContent);
}).join(' ');
fn.content = tools.cleanText(fn.content);
}
return fn;
};
function _isPlainText($el){
//note: inline-formula contains inline-graphic so no need to check it.
var evilTags = ['inline-graphic', 'chem-struct-wrap', 'disp-formula', 'graphic', 'media'];
for(var i=0; i<evilTags.length; i++){
if ($el.getElementsByTagName(evilTags[i])[0]){
return false;
}
}
return true;
};
function _removeAttributes($el){
if($el.attributes && $el.attributes.length){
var atts = Array.prototype.map.call($el.attributes, function(x){return x.name;});
if(atts.length){
atts.forEach(function(att){
$el.removeAttribute(att);
})
}
}
if($el.childNodes && $el.childNodes.length){
for(var i=0; i<$el.childNodes.length; i++){
_removeAttributes($el.childNodes[i]);
}
}
};
function _match(r, type, hrefs){
var typeMap = { 'image': 'encoding', 'audio': 'encoding', 'video': 'encoding', 'sourceCode': 'targetProduct', 'dataset': 'distribution', 'article': 'encoding' };
if(r[typeMap[type]] && r[typeMap[type]].length){
for(var i=0; i<r[typeMap[type]].length; i++){
var mpath = r[typeMap[type]][i].contentPath || r[typeMap[type]][i].bundlePath || r[typeMap[type]][i].filePath; //note: bundlePath is first!
if(mpath){
var cext = ['.gz', '.gzip', '.tgz', '.zip', '.tar.gz'];
var mbase = path.basename(mpath);
var mname = path.basename(mpath, path.extname(mpath));
var mnamens = mname.replace(/ /g, '-');
//in case of compression of a single media file.
var mnamedir = path.dirname(mpath);
var mynames = [ mbase, mname, mnamens ];
if(mnamedir !== '.'){
cext.forEach(function(ext){
mynames.push(mnamedir + ext);
});
}
if(r[typeMap[type]][i].bundlePath){
cext.forEach(function(ext){
mynames.push(mbase + ext);
});
}
if(mynames.some(function(x){ return hrefs.indexOf(x) >-1; })){
return true;
}
}
}
}
return false;
};