UNPKG

ldpm-pubmed

Version:

pubmed as package.jsonld

1,461 lines (1,209 loc) 47 kB
var request = require('request') , fs = require('fs') , url = require('url') , async = require('async') , path = require('path') , temp = require('temp') , tar = require('tar') , once = require('once') , pubmed = require('./pubmed').pubmed , Client = require('ftp') , DecompressZip = require('decompress-zip') , zlib = require('zlib') , recursiveReaddir = require('recursive-readdir') , isUrl = require('is-url') , DOMParser = require('xmldom').DOMParser , tools = require('./lib/tools'); process.maxTickDepth = 10000; // to avoid warnings when using nextTick // https://groups.google.com/forum/#!topic/nodejs/9_uM04IDNWg temp.track(); module.exports = oapmc; /** * 'this' is an Ldpm instance */ function oapmc(uri, opts, callback){ callback = once(callback); if(arguments.length === 2){ callback = opts; opts = {}; } var that = this; var puri = url.parse(uri, true); // check url if (puri.hostname === 'www.pubmedcentral.nih.gov' && puri.pathname === '/utils/oa/oa.fcgi' && puri.query.id){ var pmcid = puri.query.id; // 0. Preliminary fetches that.logHttp('GET', uri); // Fetch the url of the tar.gz of the article request(uri, function(error, response, oaContentBody){ if(error) return callback(error); that.logHttp(response.statusCode, uri); if(response.statusCode >= 400){ var err = new Error(oaContentBody); err.code = response.statusCode; return callback(err); } var conversionUrl = 'http://www.pubmedcentral.nih.gov/utils/idconv/v1.0/?ids=' + pmcid + '&format=json'; that.logHttp('GET', conversionUrl); // For PMC article, the idconv api returns {pmid,pmcid,doi} when given any of the three. request(conversionUrl, function(error, response, idConversionBody) { if(error) return callback(error); that.logHttp(response.statusCode,conversionUrl); if(response.statusCode >= 400){ var err = new Error(idConversionBody); err.code = response.statusCode; return callback(err); } var res = JSON.parse(idConversionBody); var doi = res['records'][0]['doi']; var pmid = res['records'][0]['pmid']; if(pmid==undefined){ // OAPMC entries do not all have a PMID (eg PMC3875093) opts.noPubmed = true; } // 1. Fetch : resources, xml, and pubmed metadata // a. resources //get URI of the tarball var doc = new DOMParser().parseFromString(oaContentBody, 'text/xml'); var $links = doc.getElementsByTagName('link'); try { var $linkTgz = Array.prototype.filter.call($links, function(x){return x.getAttribute('format') === 'tgz';})[0]; var tgzUri = $linkTgz.getAttribute('href'); } catch(e) { return callback(new Error('could not get tar.gz URI')); } fetchTar(tgzUri, that, function(err, files){ if(err) return callback(err); var mainArticleName; // first way to get the name of the main article: from the pdf name in oaContentBody that contains pdf and tar.gz try { var $linkPdf = Array.prototype.filter.call($links, function(x){return x.getAttribute('format') === 'pdf';})[0]; mainArticleName = url.parse($linkPdf.getAttribute('href')).pathname; mainArticleName = path.basename(mainArticleName, path.extname(mainArticleName)); mainArticleName = mainArticleName.slice(0, mainArticleName.lastIndexOf('.')).replace(/ /g, '-'); //eg pone.0012255.PMC2924383 -> pone.0012255 } catch(e){ mainArticleName = undefined; } // Second way to get the name of the main article: from the name of the nxml file // in the tar.gz. OAPMC entries always have at least a pdf or an nxml. if(!mainArticleName){ mainArticleName = extractNXMLName(files); } // b. xml fetchXml('http://www.pubmedcentral.nih.gov/oai/oai.cgi?verb=GetRecord&identifier=oai:pubmedcentral.nih.gov:' + pmcid.slice(3) + '&metadataPrefix=pmc', that, function(err, xml){ if(err) return callback(err); // c. pubmed metadata (if opts.noPubmed will callback immediately) fetchPubmedMetadata('http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id='+pmid+'&rettype=abstract&retmode=xml', that, opts, function(err, pubmedPkg){ if(err) return callback(err); // 2. Parse and complete pkg // a. resources: identify different encodings, substitute plos urls to contentPaths parseResources({}, files, doi, that, function(err, pkg){ if(err) return callback(err); // b. xml: get captions, citations, authors, publishers etc from the xml try{ pkg = parseXml(xml, pkg, pmcid, mainArticleName, that, opts); } catch(err){ return callback(err); } if(err) return callback(err); var artInd = tools.getArtInd(pkg, mainArticleName); // index of the main article in pkg.article // 3. Convert xml + pkg to html // a. two steps conversion of the xml articleBody: xml -> json -> html var jsonBody = xml2json(xml); tools.json2html(that, jsonBody, pkg, function(err, htmlBody){ if(err) return callback(err); // b. if formulas have been inlined as base 64 in the text, // they're removed from the pkg resources removeInlineFormulas(pkg, that, function(err,pkg){ if(err) return callback(err); // c. integrate the html article as a resource of the pkg fs.writeFile(path.join(that.root, pkg.article[artInd].name + '.html'), htmlBody, function(err){ if(err) return callback(err); that.paths2resources([path.join(that.root,pkg.article[artInd].name + '.html')], function(err,resources){ if(err) return callback(err); if(pkg.article[artInd].encoding==undefined){ pkg.article[artInd].encoding = []; } pkg.article[artInd].encoding.push(resources.article[0].encoding[0]); // d. extract pubmed annotations, adapt the target, and add to the pkg tools.addPubmedAnnotations(pkg, pubmedPkg, that, function(err,pkg){ if(err) return callback(err); callback(null, pkg); }); }); }); }); }); }); }); }); }); }); }); } else { callback(new Error('unrecognized uri')); } }; function extractNXMLName(files){ for(var i=0; i<files.length; i++){ if(path.extname(path.basename(files[i])) === '.nxml'){ return path.basename(path.basename(files[i]), path.extname(path.basename(files[i]))).replace(/ /g, '-'); } } }; function fetchTar(tgzUri, ldpm, callback){ // return the list of files contained in the tar.gz of the article, // and move them to the current directory callback = once(callback); var puri = url.parse(tgzUri); var root = ldpm.root; var c = new Client(); c.connect({ host: puri.host }); ldpm.logHttp('GET', tgzUri, 'ftp'); c.on('ready', function() { temp.mkdir('__ldpmTmp', function(err, dirPath) { c.get(puri.path, function(err, stream) { if (err) return callback(err); ldpm.logHttp(200, tgzUri, 'ftp'); stream = stream .pipe(zlib.Unzip()) .pipe(tar.Extract({ path: dirPath, strip: 1 })); stream.on('end', function() { recursiveReaddir(path.resolve(dirPath), function (err, files) { if (err) return callback(err); var newFiles = []; async.each(files, function(file,cb){ var extname = path.extname(path.basename(file)); var basename = path.basename(file, extname); var newpath = path.join(ldpm.root, basename.replace(/ /g, '-') + extname); fs.rename(file, newpath, function(err){ if(err) return cb(err); newFiles.push(newpath); cb(null); }); }, function(err){ if(err) return callback(err); c.end(); return callback(null,newFiles); }); }); }); stream.on('error', callback); }) }); }); }; function fetchXml(uri, ldpm, callback){ ldpm.logHttp('GET', uri); request(uri, function(error, response, body){ if(error) return callback(error); ldpm.logHttp(response.statusCode, uri); if(response.statusCode >= 400){ var err = new Error(body); err.code = response.statusCode; return callback(err); } callback(null, body); }); }; function fetchPubmedMetadata(uri, ldpm, opts, callback){ if(opts.noPubmed){ callback(null, {}); } else { // call to the pubmed plugin. // writeHTML: false prevents the pubmed plugin from writing to write // the html article it generates on the disk, to avoid conflicts with // the one generated by oapmc. pubmed.call(ldpm, uri, { writeHTML: false }, function(err, pubmedPkg){ if(err) return callback(err); callback(null, pubmedPkg); }); } }; function parseResources(pkg, files, doi, ldpm, callback){ callback = once(callback); var codeBundles = []; var compressedBundles = []; var typeMap = { 'figure': 'figure', 'audio': 'audio', 'video': 'video', 'code': 'targetProduct', 'dataset': 'distribution', 'article': 'encoding'}; var toUnlink = []; // identify bundles var tmpAr = []; files.forEach(function(file,i){ if(['.gz', '.gzip', '.tgz','.zip'].indexOf(path.extname(file))>-1){ codeBundles.push(path.basename(file, path.extname(file))); compressedBundles.push(file); } else { tmpAr.push(file); } }); files = tmpAr; var opts = { codeBundles: codeBundles }; var ind = 0; async.each(compressedBundles, function(f, cb){ cb = once(cb); // uncompress bundles if((path.extname(f) === '.tgz')||(path.extname(f) === '.gz')){ var s = fs.createReadStream(path.join(ldpm.root, path.basename(f))); s = s.pipe(zlib.Unzip()).pipe(tar.Extract({ path: path.join(ldpm.root, path.basename(f, path.extname(f))) })); s.on('error', cb); s.on('end', function() { cb(null); }); } else if(path.extname(f)=='.zip') { var unzipper = new DecompressZip(f); unzipper.on('error', cb); unzipper.on('extract', function (log) { cb(null); }); unzipper.extract({ path: path.join(ldpm.root, path.basename(f, path.extname(f))) }); } else { zlib.unzip(f, cb); } }, function(err){ if(err) return callback(err); var urls = []; var plosJournalsList = ['pone.','pbio.','pmed.','pgen.','pcbi.','ppat.','pntd.']; var plosJournalsLinks = { 'pone.': 'http://www.plosone.org/article/info:doi/', 'pbio.': 'http://www.plosbiology.org/article/info:doi/', 'pmed.': 'http://www.Plasticine.org/article/info:doi/', 'pgen.': 'http://www.plosgenetics.org/article/info:doi/', 'pcbi.': 'http://www.ploscompbiol.org/article/info:doi', 'ppat.': 'http://www.plospathogens.org/article/info:doi', 'pntd.': 'http://www.plosntds.org/article/info:doi' }; var tmpfiles = []; // generate potential valid urls if resources identified as plos resources files.forEach(function(f, i){ var found = false; plosJournalsList.forEach(function(p, j){ var basename = path.basename(f,path.extname(f)); var extname = path.extname(f); basename = basename.replace(/ /g, '-'); if( (basename.slice(0,p.length) === p) && (extname !== '.nxml') && (basename.split('.')[basename.split('.').length-1][0] !== 'e') ) { // note: figures which index starts with e (eg: pcbi.1000960.e001.jpg) are inline formulas. We don't bother // to test urls for them as they will be inlined. found = true; if( extname === '.pdf' ){ var tmp = basename; tmp = '.'+tmp.split('.')[tmp.split('.').length-1]; var tmpind = plosJournalsLinks[p].indexOf('info:doi'); urls.push(plosJournalsLinks[p].slice(0,tmpind) + 'fetchObject.action?uri=info:doi/' + doi + tmp.slice(0,tmp.lastIndexOf('.')) + '&representation=PDF'); } else { var tmp = basename; tmp = '.' + tmp.split('.')[tmp.split('.').length - 1]; var tmpind = plosJournalsLinks[p].indexOf('info:doi'); urls.push(plosJournalsLinks[p].slice(0,tmpind) + 'fetchSingleRepresentation.action?uri=info:doi/' + doi + tmp ); if(['.gif', '.jpg', '.tif'].indexOf(extname) > -1){ if(urls.indexOf(plosJournalsLinks[p] + doi + tmp + '/' + 'powerpoint')==-1){ urls.push(plosJournalsLinks[p] + doi + tmp + '/' + 'powerpoint'); urls.push(plosJournalsLinks[p] + doi + tmp + '/' + 'largerimage'); urls.push(plosJournalsLinks[p] + doi + tmp + '/' + 'originalimage'); } } } } }); if(!found){ tmpfiles.push(f) } }); var validatedurls = []; async.each(urls, function(uri, cb){ // check which urls are valid ldpm.logHttp('HEAD', uri); request.head(uri, function (error, response, body) { if(error) return cb(error); ldpm.logHttp(response.statusCode, uri); if (response.statusCode == 200) { validatedurls.push(uri); } cb(null); }); }, function(err){ var files = tmpfiles; ldpm.paths2resources(files, opts, function(err, resources){ if(err) return callback(err); ldpm.urls2resources(validatedurls, function(err, resourcesFromUrls){ if(err) return callback(err); // plos resources need to be renamed: ldpm tools use the url basename while plos uses // that to specify the encoding ['figure','audio','video'].forEach(function(type){ resourcesFromUrls[type].forEach(function(x){ if(x.name.indexOf('SingleRepresentation')>-1){ x.name = x[type][0].contentUrl.split('/')[x[type][0].contentUrl.split('/').length-1]; } else if(x[type][0].contentUrl.indexOf('/powerpoint')>-1){ x.name = x[type][0].contentUrl.split('/')[x[type][0].contentUrl.split('/').length-2]; } else if(x[type][0].contentUrl.indexOf('/largerimage')>-1){ x.name = x[type][0].contentUrl.split('/')[x[type][0].contentUrl.split('/').length-2]; } else if(x[type][0].contentUrl.indexOf('/originalimage')>-1){ x.name = x[type][0].contentUrl.split('/')[x[type][0].contentUrl.split('/').length-2]; } else { x.name = x[type][0].contentUrl.split('/')[x[type][0].contentUrl.split('/').length-1]; } if( (x.name.slice(0,8)==='journal.') || (x.name.slice(0,8)==='journal-') ){ x.name = x.name.slice(8).replace(/ /g,'-'); } }) }); resourcesFromUrls['code'].forEach(function(x){ if(x.name.indexOf('SingleRepresentation')>-1){ x.name = x['targetProduct'][0].contentUrl.split('/')[x[['targetProduct']][0].contentUrl.split('/').length-1].replace(/ /g,'-'); } else { x.name = x[['targetProduct']][0].contentUrl.split('/')[x[['targetProduct']][0].contentUrl.split('/').length-2].replace(/ /g,'-'); } if( (x.name.slice(0,8)==='journal.') || (x.name.slice(0,8)==='journal-') ){ x.name = x.name.slice(8).replace(/ /g,'-'); } }); resourcesFromUrls['dataset'].forEach(function(x){ if(x.name.indexOf('SingleRepresentation')>-1){ x.name = x['distribution'][0].contentUrl.split('/')[x[['distribution']][0].contentUrl.split('/').length-1].replace(/ /g,'-'); } else { x.name = x[['distribution']][0].contentUrl.split('/')[x[['distribution']][0].contentUrl.split('/').length-2].replace(/ /g,'-'); } if( (x.name.slice(0,8)==='journal.') || (x.name.slice(0,8)==='journal-') ){ x.name = x.name.slice(8).replace(/ /g,'-'); } }); resourcesFromUrls['article'].forEach(function(x){ if(x.name.indexOf('fetchObject')>-1){ x.name = x['encoding'][0].contentUrl.slice(0,x['encoding'][0].contentUrl.indexOf('&representation=PDF')).split('/')[x[['encoding']][0].contentUrl.split('/').length-1].replace(/ /g,'-'); } else if(x['encoding'].indexOf("representation=PDF")>-1){ x.name = x['encoding'][0].contentUrl.slice(0,x['encoding'][0].contentUrl.indexOf('&representation=PDF')).split('/')[x[['encoding']][0].contentUrl.split('/').length-2].replace(/ /g,'-'); } else { x.name = x['encoding'][0].contentUrl.split('/')[x['encoding'][0].contentUrl.split('/').length-1].replace(/ /g,'-'); } if( (x.name.slice(0,8)==='journal.') || (x.name.slice(0,8)==='journal-') ){ x.name = x.name.slice(8).replace(/ /g,'-'); } }); //merge for (var type in resources){ resources[type] = resources[type].concat(resourcesFromUrls[type]); } tmpAr = []; resources.dataset.forEach(function(x,i){ if(!(path.extname(x.distribution[0].contentPath) === '.nxml')){ // remove the .nxml from pkg.dataset tmpAr.push(x); } else { toUnlink.push(path.join(ldpm.root,x.distribution[0].contentPath)); } }); resources.dataset = tmpAr; //TODO CHECK triple check splice because it affects length.. // -> Jo: that's ok, ind2 incremented only when no splice. // merge resources that are different encodings of the same content ['figure','audio','video','code','article'].forEach(function(type){ var ind=0; while(ind < resources[type].length){ var ind2=ind+1; while(ind2 < resources[type].length){ r2 = resources[type][ind2]; if(resources[type][ind].name === r2.name && r2[type]){ resources[type][ind][typeMap[type]].push(r2[type][0]); resources[type].splice(ind2, 1); } else { ind2 +=1; } } ind += 1; } }); // rm SingleRepresentation (PLOS) when there are alternatives ['figure','audio','video','code','article'].forEach(function(type){ if(resources[type]){ resources[type].forEach(function(r,i){ tmpAr = []; r[typeMap[type]].forEach(function(x,i){ if(x.contentUrl != undefined){ if( !((x.contentUrl.indexOf('fetchSingleRepresentation')>-1) && (r[typeMap[type]].length>1)) ){ tmpAr.push(x); } } else { tmpAr.push(x); } }); r[typeMap[type]] = tmpAr; }); } }); // create pkg var pkg = {}; if(resources!=undefined){ pkg = ldpm.addResources(pkg,resources); } // inline license and remove file var found = false; if(pkg.dataset){ tmpAr = []; pkg.dataset.forEach(function(d,i){ if(d.name === 'license'){ found = true; fs.readFile(path.join(ldpm.root,d.distribution[0].contentPath), {encoding: 'utf8'}, function(err,txt){ if(err) return callback(err); pkg.license = txt; toUnlink.push(path.join(ldpm.root,d.distribution[0].contentPath)); async.each(toUnlink, fs.unlink, function(err){ if(err) return callback(err); callback(null,pkg); }); }); } else { tmpAr.push(d); } }); pkg.dataset = tmpAr; } if(!found){ async.each(toUnlink, fs.unlink, function(err){ if(err) return callback(err); callback(null,pkg); }); } }); }); }); }); }; /** * Cf. http://jats.nlm.nih.gov/archiving/tag-library/1.1d1/index.html */ function parseXml(xml, pkg, pmcid, mainArticleName, ldpm, opts){ var artInd = tools.getArtInd(pkg, mainArticleName); if(artInd === -1){ if(!pkg.article){ pkg.article = []; } pkg.article.push({}); artInd = pkg.article.length-1; } if(Array.isArray(pkg.article[artInd]['@type']) && pkg.article[artInd]['@type'].indexOf('ScholarlyArticle') === -1){ pkg.article[artInd]['@type'].push('ScholarlyArticle'); } else { pkg.article[artInd]['@type'] = 'ScholarlyArticle'; } var doc = new DOMParser().parseFromString(xml, 'text/xml'); var meta = {}; var i; var $article = doc.getElementsByTagName('article')[0]; var articleType = $article.getAttribute('article-type'); if(articleType){ pkg.article[artInd].publicationType = articleType; } var $publisherName = $article.getElementsByTagName('publisher-name')[0]; if($publisherName){ meta.publisher = { '@type': 'Organization', name: $publisherName.textContent }; } var $publisherLoc = $article.getElementsByTagName('publisher-loc')[0]; if($publisherLoc){ if(!meta.publisher){ meta.publisher = {}; } meta.publisher.location = { '@type': 'PostalAddress', description: tools.cleanText($publisherLoc.textContent) } } var $journalTitle = $article.getElementsByTagName('journal-title')[0]; if($journalTitle){ meta.journal = { '@type': 'Journal', name: tools.cleanText($journalTitle.textContent) } } //get journalShortName: will be used as a prefix of the pkg name => lover case, no space var $journalId = $article.getElementsByTagName('journal-id'); for(i=0; i<$journalId.length; i++){ var journalIdType = $journalId[i].getAttribute('journal-id-type'); if(journalIdType === 'nlm-ta'){ meta.journalShortName = $journalId[i].textContent.split(' ').map(function(x){return x.trim().replace(/\W/g, '').toLowerCase();}).join('-'); break; } } if(!meta.journalShortName){ if(meta.journal && meta.journal.name){ meta.journalShortName = meta.journal.name.split(' ').map(function(x){return x.trim().replace(/\W/g, '').toLowerCase();}).join('-'); } else { meta.journalShortName = ''; } } var $issn = $article.getElementsByTagName('issn'); if($issn){ if(!meta.journal) meta.journal = {}; for(i=0; i<$issn.length; i++){ //epub if possible because digital age meta.journal.issn = $issn[i].textContent; if($issn[i].getAttribute('pub-type') === 'epub'){ break; } } } var $articleMeta = $article.getElementsByTagName('article-meta')[0]; var $articleId = $articleMeta.getElementsByTagName('article-id'); if($articleId){ Array.prototype.forEach.call($articleId, function($el){ var t = $el.getAttribute('pub-id-type'); if(t === 'doi'){ meta.doi = $el.textContent; } else if (t === 'pmid'){ meta.pmid = $el.textContent; } else if (t === 'pmcid'){ meta.pmcid = $el.textContent; } }); } meta.pmcid = pmcid; //always known -> can ensure pkg name in any case var $articleCategories = $articleMeta.getElementsByTagName('article-categories'); if($articleCategories){ var keywords = []; Array.prototype.forEach.call($articleCategories, function($ac){ Array.prototype.forEach.call($ac.childNodes, function($el){ if($el.tagName === 'subj-group'){ keywords = keywords.concat(tools.extractKeywords($el)); } }); }); if(keywords.length){ meta.keywords = keywords; } } var $articleTitle = $articleMeta.getElementsByTagName('article-title')[0]; if($articleTitle){ meta.title = tools.cleanText($articleTitle.textContent); } var $altTitle = $articleMeta.getElementsByTagName('alt-title')[0]; if($altTitle){ meta.shortTitle = $altTitle.textContent; } var affiliations = {}; // affiliations are generally defined independently of authors, with keys that the author spans point to. var $affs = $articleMeta.getElementsByTagName('aff'); if($affs){ Array.prototype.forEach.call($affs, function($aff){ var id = $aff.getAttribute('id'); if(!id) return; var affiliation = { '@type': 'Organization' }; var desc = ''; var $institution = $aff.getElementsByTagName('institution')[0]; var $addrLine = $aff.getElementsByTagName('addr-line')[0]; var $country = $aff.getElementsByTagName('country')[0]; var $fax = $aff.getElementsByTagName('fax')[0]; var $phone = $aff.getElementsByTagName('phone')[0]; var $email = $aff.getElementsByTagName('email')[0]; if($institution){ affiliation.name = $institution.textContent; desc = affiliation.name + '. '; } if($addrLine){ desc += $addrLine.textContent + '. '; } if($country){ affiliation.address = { '@type': 'PostalAddress', addressCountry: $country.textContent }; desc += $country.textContent + '. '; } if($fax){ affiliation.faxNumber = $fax.textContent; } if($phone){ affiliation.telephone = $phone.textContent; } if($email){ affiliation.email = $email.textContent; } if(desc){ affiliation.description = tools.cleanText(desc); } else { //avoid label or sup in description... Array.prototype.forEach.call($aff.childNodes, function($el){ if($el.tagName !== 'label' || $el.tagName !== 'sup'){ if($el.nodeType === 3){ desc += $el.nodeValue; } else if ($el.nodeType === 1){ Array.prototype.forEach.call($el.childNodes, function($subEl){ if($el.nodeType === 3){ desc += $el.nodeValue } }); } } }); if(desc){ affiliation.description = tools.cleanText(desc); } } if(affiliations[id]){ affiliations[id].push(affiliation); } else { affiliations[id] = [affiliation]; } }); } var emails = {}; var $authorNotes = $articleMeta.getElementsByTagName('author-notes'); if($authorNotes){ Array.prototype.forEach.call($authorNotes, function($el){ var $corresp = $el.getElementsByTagName('corresp')[0]; var id = $corresp.getAttribute('id'); var $email = $corresp.getElementsByTagName('email')[0]; if(id && $email){ emails[id] = $email.textContent; } }); } var author; var contributor = []; var accountablePerson = []; var editor = []; var $contribGroups = $articleMeta.getElementsByTagName('contrib-group'); if($contribGroups){ Array.prototype.forEach.call($contribGroups, function($contribGroup){ var authCnt = 0; Array.prototype.forEach.call($contribGroup.childNodes, function($el){ if($el.tagName === 'contrib'){ var $contrib = $el; var contribType = $contrib.getAttribute('contrib-type'); var $name = $contrib.getElementsByTagName('name')[0]; if($name){ var $givenNames = $name.getElementsByTagName('given-names')[0]; if($givenNames){ var givenName = $givenNames.textContent; } var $surname = $name.getElementsByTagName('surname')[0]; if($surname){ var familyName = $surname.textContent; } } var affiliation = []; var email; var corresp = !!($contrib.getAttribute('corresp') === 'yes'); var $xrefs = $contrib.getElementsByTagName('xref'); if($xrefs){ Array.prototype.forEach.call($xrefs, function($xref){ var refType = $xref.getAttribute('ref-type'); var rid = $xref.getAttribute('rid'); if(refType === 'aff'){ if(affiliations[rid]){ affiliation = affiliation.concat(affiliations[rid]); } } else if(refType === 'corresp'){ if(emails[rid]){ email = emails[rid]; } corresp = true; } }); } var $email = $contrib.getElementsByTagName('email')[0]; if($email){ email = $email.textContent; } var person = { '@type': 'Person' }; var tmpname = ''; if(givenName){ person.givenName = givenName; tmpname += givenName + ' '; } if(familyName){ person.familyName = familyName; tmpname += familyName; } if(tmpname.length){ person.name = tmpname; } if (email){ person.email = email } if(affiliation.length){ person.affiliation = affiliation; } if(contribType === 'author'){ if(authCnt++ === 0){ author = person; } else { contributor.push(person); } if (corresp){ accountablePerson.push(person); } } else if(contribType === 'editor'){ editor.push(person); } } }); }); } meta.author = author; meta.contributor = contributor; meta.editor = editor; meta.accountablePerson = accountablePerson; //TODO! funding and grants are put in http://www.schema.org/sourceOrganization // var sourceOrganisation = []; var $pubDate = $articleMeta.getElementsByTagName('pub-date'); var tmpDate; for(i=0; i<$pubDate.length; i++){ var iso = $pubDate[i].getAttribute('iso-8601-date'); if(iso){ tmpDate = iso } else { var $day = $pubDate[i].getElementsByTagName('day')[0]; var $month = $pubDate[i].getElementsByTagName('month')[0]; var $year = $pubDate[i].getElementsByTagName('year')[0]; if($year){ meta.year = $year.textContent; } if($day && $month && $year){ tmpDate = [$year.textContent, $month.textContent, $day.textContent].join('-'); } } if($pubDate[i].getAttribute('pub-type') === 'epub' || $pubDate[i].getAttribute('publication-format') === 'electronic'){ break; } } if(tmpDate){ var jsDate = new Date(tmpDate); meta.publicationDate = jsDate.toISOString(); //TODO fix timezone for bethesda DC because NLM meta.year = jsDate.getFullYear(); } var $volume = $articleMeta.getElementsByTagName('volume')[0]; if($volume){ meta.volume = parseInt($volume.textContent, 10); } var $issue = $articleMeta.getElementsByTagName('issue')[0]; if($issue){ meta.issue = parseInt($issue.textContent, 10); } var $fpage = $articleMeta.getElementsByTagName('fpage')[0]; if($fpage){ meta.pageStart = parseInt($fpage.textContent, 10); } var $lpage = $articleMeta.getElementsByTagName('lpage')[0]; if($lpage){ meta.pageEnd = parseInt($lpage.textContent, 10); } var $pageCount = $articleMeta.getElementsByTagName('page-count')[0]; if($pageCount){ var pageCountCount = $pageCount.getAttribute('count'); if(pageCountCount){ meta.pageCount = parseInt(pageCountCount, 10); } } var $copyrightYear = $articleMeta.getElementsByTagName('copyright-year')[0]; if($copyrightYear){ meta.copyrightYear = parseInt($copyrightYear.textContent, 10); } var $copyrightHolder = $articleMeta.getElementsByTagName('copyright-holder')[0]; if($copyrightHolder){ meta.copyrightHolder = $copyrightHolder.textContent; } var $license = $articleMeta.getElementsByTagName('license')[0]; if($license){ var licenseLink = $license.getAttribute('xlink:href'); if(licenseLink){ meta.license = licenseLink; } else { var $licenseP = $license.getElementsByTagName('license-p')[0]; if($licenseP){ meta.license = $licenseP.textContent; } } } //TODO different type of abstracts + structure var $abstract = $articleMeta.getElementsByTagName('abstract')[0]; if($abstract){ meta.abstract = { name: 'abstract', description: tools.cleanText($abstract.textContent) }; } //references var $back = $article.getElementsByTagName('back')[0]; //http://jats.nlm.nih.gov/archiving/tag-library/1.1d1/index.html <back>Back Matter Back matter typically contains supporting material such as an appendix, acknowledgment, glossary, or bibliographic reference list. var references = []; var $refList = $back.getElementsByTagName('ref-list')[0]; if($refList){ var $refs = $refList.getElementsByTagName('ref'); if($refs){ Array.prototype.forEach.call($refs, function($ref){ var ref = {}; var id = $ref.getAttribute('id'); if(id){ ref.name = id; } var $mixedCitation = $ref.getElementsByTagName('mixed-citation')[0]; if($mixedCitation){ ref.description = tools.cleanText($mixedCitation.textContent); var publicationType = $mixedCitation.getAttribute('publication-type'); var $articleTitle = $mixedCitation.getElementsByTagName('article-title')[0]; var $source = $mixedCitation.getElementsByTagName('source')[0]; if(publicationType === 'journal'){ ref['@type'] = 'ScholarlyArticle'; if($articleTitle){ ref.header = $articleTitle.textContent; } if($source){ ref.journal = $source.textContent; } } else { if($source){ ref.header = $source.textContent; } } var $volume = $mixedCitation.getElementsByTagName('volume')[0]; if($volume){ ref.volume = parseInt($volume.textContent, 10); } var $fpage = $mixedCitation.getElementsByTagName('fpage')[0]; if($fpage){ ref.pageStart = parseInt($fpage.textContent, 10); } var $lpage = $mixedCitation.getElementsByTagName('lpage')[0]; if($lpage){ ref.pageEnd = parseInt($lpage.textContent, 10); } var $day = $mixedCitation.getElementsByTagName('day')[0]; var $month = $mixedCitation.getElementsByTagName('month')[0]; var $year = $mixedCitation.getElementsByTagName('year')[0]; var jsDate; if($year && $month && $day){ jsDate = new Date($year.textContent, $month.textContent, $day.textContent); } else if($year && $month){ jsDate = new Date($year.textContent, $month.textContent); } else if($year){ jsDate = new Date($year.textContent); } if(jsDate){ ref.publicationDate = jsDate.toISOString(); } var $pubId = $mixedCitation.getElementsByTagName('pub-id')[0]; if($pubId){ var pubIdType = $pubId.getAttribute('pub-id-type'); if(pubIdType){ //doi, pmid... ref[pubIdType] = $pubId.textContent; } } //try again to get doi if(!ref.doi){ var $comment = $mixedCitation.getElementsByTagName('comment')[0]; if($comment){ var $extLinks = $comment.getElementsByTagName('ext-link'); if($extLinks){ Array.prototype.forEach.call($extLinks, function($extLink){ var href = $extLink.getAttribute('xlink:href'); if(href && isUrl(href)){ var purl = url.parse(href); if(purl.host === 'dx.doi.org'){ ref.doi = purl.pathname.replace(/^\//, ''); } } }); } } } //try to get ref.url if(ref.doi){ ref.url = 'http://dx.doi.org/' + ref.doi; if(ref.pmid){ ref.sameAs = 'http://www.ncbi.nlm.nih.gov/pubmed/' + ref.pmid; } } else if(ref.pmid){ ref.url = 'http://www.ncbi.nlm.nih.gov/pubmed/' + ref.pmid; } else { var $extLinks = $mixedCitation.getElementsByTagName('ext-link'); if($extLinks){ for(var i=0; i<$extLinks.length; i++){ if($extLinks[i].getAttribute('ext-link-type') === 'uri'){ var uriHref = $extLinks[i].getAttribute('xlink:href'); if(uriHref && isUrl(uriHref)){ ref.url = uriHref; } } } } } //authors var $names = $mixedCitation.getElementsByTagName('name'); if(!$names){ $names = $mixedCitation.getElementsByTagName('string-name'); } if($names){ Array.prototype.forEach.call($names, function($name, i){ var person = { '@type': 'Person' }; var $surname = $name.getElementsByTagName('surname')[0]; if($surname){ person.familyName = $surname.textContent; } var $givenName = $name.getElementsByTagName('given-names')[0]; if($givenName){ person.givenName = $givenName.textContent; } if(i===0){ ref.author = person; } else { if(!ref.contributor){ ref.contributor = []; } ref.contributor.push(person); } }); if($mixedCitation.getElementsByTagName('etal')[0]){ ref.unnamedContributors = true; //indicates that more than the listed author and contributors. } } } if(Object.keys(ref).length){ references.push(ref); } }); } } if(references.length){ meta.references = references; } //add extracted props from meta and resources to newPkg. We create a new Pkg to have control on the key order var newpkg = {}; //idealy name pkg with (journal-)lastname-year var pkgName = []; if(meta.journalShortName){ pkgName.push(meta.journalShortName); } if(meta.author && meta.author.familyName){ pkgName.push(tools.removeDiacritics(meta.author.familyName.toLowerCase()).replace(/\W/g, '')); } if(meta.year){ pkgName.push(meta.year); } if(pkgName.length>=2){ newpkg.name = pkgName.join('-'); } else { newpkg.name = pmcid; } newpkg.version = '0.0.0'; if(meta.keywords && meta.keywords.length){ newpkg.keywords = meta.keywords; } if(meta.title){ newpkg.description = meta.title; } if(meta.license){ newpkg.license = meta.license; } else if (pkg.license){ newpkg.license = pkg.license; } if(!pkg.sameAs && meta.url){ newpkg.sameAs = meta.url; } newpkg.author = meta.author; if(meta.contributor.length){ newpkg.contributor = meta.contributor; } newpkg.provider = { '@type': 'Organization', '@id': 'http://www.ncbi.nlm.nih.gov/pmc/', description: 'From PMC®, a database of the U.S. National Library of Medicine.' }; if(meta.editor.length){ if(Object.keys(meta.editor[0])){ newpkg.editor = meta.editor; } } if(meta.publisher){ newpkg.publisher = meta.publisher; } if(meta.journal){ newpkg.journal = meta.journal; } newpkg.accountablePerson = { '@type': 'Organization', name: 'Standard Analytics IO', email: 'contact@standardanalytics.io' }; if( meta.copyrightHolder ){ newpkg.copyrightHolder = meta.copyrightHolder; } else if (meta.publisher) { newpkg.copyrightHolder = meta.publisher; } var typeMap = { 'dataset': 'Dataset', 'code': 'Code', 'figure': 'ImageObject', 'audio': 'AudioObject', 'video': 'VideoObject', 'article': 'Article' }; //add the caption from the extracted ```resources``` var resources = findResources(doc); // finds the resources and their captions in the xml ['dataset', 'code', 'figure', 'audio', 'video', 'article'].forEach(function(type){ if(pkg[type]) { pkg[type].forEach(function(r, i){ if(!r.name){ r.name = type + '-' + i; } if(!r['@type']){ r['@type'] = typeMap[type]; } if(meta.publicationDate){ r.datePublished = meta.publicationDate; } resources.forEach(function(x){ var potentialNames = []; if(x.id){ potentialNames.push(x.id); potentialNames.push(x.id.replace(/ /g,'-')); potentialNames.push(path.basename(x.id).replace(/ /g,'-')); } if(x.href){ potentialNames.push(x.href); potentialNames.push(x.href.replace(/ /g,'-')); potentialNames.push(path.basename(url.parse(x.href).pathname).replace(/ /g,'-')); } if(potentialNames.indexOf(r.name) !==-1){ var descr = ''; if (x.label){ descr = x.label + '. '; } if (x.caption){ descr += x.caption; } descr = tools.cleanText(descr); if(descr){ if(type === 'figure' || type === 'video'){ r.caption = descr; } else { r.description = descr; } } if(x.alternateName){ r.alternateName = x.alternateName; } } }); }); newpkg[type] = pkg[type]; // delete resource types that have no entries. if(!newpkg[type].length){ delete newpkg[type]; } } }); // in plos, figures have a doi. We reconstruct it. var plosJournalsList = ['pone', 'pbio', 'pmed', 'pgen', 'pcbi', 'ppat', 'pntd']; if(newpkg.figure && meta.doi){ newpkg.figure.forEach(function(r){ plosJournalsList.forEach(function(p){ if(r.name.slice(0, p.length) === p){ r.doi = meta.doi + '.' + r.name.split('-')[r.name.split('-').length-1]; } }); }); } if ( artInd > -1){ if(meta.accountablePerson){ newpkg.article[artInd].accountablePerson = meta.accountablePerson; } if(meta.journal){ newpkg.article[artInd].journal = meta.journal; } if(meta.doi){ newpkg.article[artInd].doi = meta.doi; } if(meta.pmid){ newpkg.article[artInd].pmid = meta.pmid; } if(meta.pmcid){ newpkg.article[artInd].pmcid = meta.pmcid; } if(meta.title){ newpkg.article[artInd].headline = meta.title; } if (meta.abstract){ newpkg.article[artInd].about = meta.abstract; } if(meta.issue){ newpkg.article[artInd].issue = meta.issue; } if(meta.volume){ newpkg.article[artInd].volume = meta.volume; } if(meta.pageStart){ newpkg.article[artInd].pageStart = meta.pageStart; } if(meta.pageEnd){ newpkg.article[artInd].pageEnd = meta.pageEnd; } if(meta.references){ newpkg.article[artInd].citation = meta.references; } } return newpkg; }; function xml2json(xml){ var doc = new DOMParser().parseFromString(xml,'text/xml'); if(doc.getElementsByTagName('body').length){ var body = doc.getElementsByTagName('body')[0]; } else { var body = '<body>Emptybody</body>';//doc.getElementsByTagName('article')[0]; } return tools.parseXmlNodesRec(body,xml); }; function removeInlineFormulas(pkg, ldpm, callback){ // We assume that figures corresponding to inline formulas have an identifier // starting with 'e' (plos convention) var plosJournalsList = ['pone','pbio','pmed','pgen','pcbi','ppat','pntd']; var tmpFigure = []; var toUnlink = []; if(pkg.figure){ pkg.figure.forEach(function(fig){ var keep = true; plosJournalsList.forEach(function(p,j){ if(fig.name.slice(0,p.length)===p){ if(fig.name.split('-')[fig.name.split('-').length-1].slice(0,1)==='e'){ keep = false; } } }) if(keep){ tmpFigure.push(fig); } else { fig.figure.forEach(function(enc){ toUnlink.push(path.resolve(ldpm.root,enc.contentPath)); }) } }) } async.each(toUnlink, fs.unlink, function(err){ if(err) return callback(err); pkg.figure = tmpFigure; if(pkg.figure.length==0){ delete pkg.figure; } callback(null,pkg); }); }; function findResources(doc){ // find figure, tables, supplementary materials and their captions var resources = []; var tags = ['fig', 'table-wrap', 'supplementary-material']; tags.forEach(function(tag){ Array.prototype.forEach.call(doc.getElementsByTagName(tag), function(x){ var r = {}; if(x.getElementsByTagName('label')[0]){ r.label = x.getElementsByTagName('label')[0].textContent; } if(r.label){ if(r.label.match(/\d+$/)){ r.num = r.label.match(/\d+$/)[0]; } } if(x.getElementsByTagName('caption')[0]){ r.caption = tools.cleanText(x.getElementsByTagName('caption')[0].textContent); } r.id = x.getAttribute('id'); if(r.id){ r.alternateName = r.id; } if(x.getElementsByTagName('graphic')[0]){ r.href = x.getElementsByTagName('graphic')[0].getAttribute('xlink:href'); } resources.push(r); }); }); return resources; };