@cyanheads/pubmed-mcp-server
Version:
Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.
365 lines • 14.5 kB
JavaScript
/**
* @fileoverview Helper functions for parsing detailed PubMed Article XML structures,
* typically from EFetch results.
* @module src/services/ncbi/parsing/article-parser
*/
import { ensureArray, getAttribute, getText } from './xml-helpers.js';
/**
 * Builds the author list of an article together with one shared table of
 * affiliation strings.
 *
 * Each distinct affiliation text is stored once in `affiliations`; individual
 * authors point at it through `affiliationIndices`, so a long institutional
 * string shared by many authors is not repeated per author.
 *
 * @param authorListXml - The XML AuthorList element.
 * @returns `{ authors, affiliations }`. Both arrays are empty when
 *   `authorListXml` is falsy. A collective author is returned as
 *   `{ collectiveName }` only.
 */
export function extractAuthors(authorListXml) {
  const affiliations = [];
  if (!authorListXml) {
    return { authors: [], affiliations };
  }

  // Maps an affiliation string to its position in `affiliations`.
  const slotByAffiliation = new Map();
  const internAffiliation = (label) => {
    if (!slotByAffiliation.has(label)) {
      // push() returns the new length, so the new entry sits at length - 1.
      slotByAffiliation.set(label, affiliations.push(label) - 1);
    }
    return slotByAffiliation.get(label);
  };

  // First non-empty Identifier text whose Source attribute is "ORCID".
  const findOrcid = (authorXml) =>
    ensureArray(authorXml.Identifier)
      .filter((identifier) => getAttribute(identifier, 'Source') === 'ORCID')
      .map((identifier) => getText(identifier))
      .find(Boolean);

  const authors = [];
  for (const authorXml of ensureArray(authorListXml.Author)) {
    const collectiveName = getText(authorXml.CollectiveName);
    if (collectiveName) {
      authors.push({ collectiveName });
      continue;
    }

    const affiliationIndices = ensureArray(authorXml.AffiliationInfo)
      .map((info) => getText(info?.Affiliation))
      .filter(Boolean)
      .map((label) => internAffiliation(label));
    const orcid = findOrcid(authorXml);

    const author = {
      lastName: getText(authorXml.LastName),
      firstName: getText(authorXml.ForeName), // the XML element is ForeName
      initials: getText(authorXml.Initials),
    };
    if (affiliationIndices.length > 0) {
      author.affiliationIndices = affiliationIndices;
    }
    if (orcid) {
      author.orcid = orcid;
    }
    authors.push(author);
  }
  return { authors, affiliations };
}
/**
 * Summarizes the journal an article was published in.
 *
 * The publication year comes from PubDate/Year; when that is empty, the first
 * four-digit run found in PubDate/MedlineDate is used instead. The ISSN is
 * reported as `eIssn` when its IssnType attribute is "Electronic" and as
 * `issn` otherwise.
 *
 * @param journalXml - The XML Journal element from an Article.
 * @param articleXml - The XML Article element (read for Pagination/MedlinePgn).
 * @returns The journal description, or undefined when `journalXml` is falsy.
 */
export function extractJournalInfo(journalXml, articleXml) {
  if (!journalXml) {
    return undefined;
  }
  const issueXml = journalXml.JournalIssue;
  const dateXml = issueXml?.PubDate;

  const yearFromMedlineDate = getText(dateXml?.MedlineDate, '').match(/\d{4}/)?.[0];
  const dateParts = [
    ['year', getText(dateXml?.Year, yearFromMedlineDate)],
    ['month', getText(dateXml?.Month)],
    ['day', getText(dateXml?.Day)],
    ['medlineDate', getText(dateXml?.MedlineDate)],
  ];
  // Only non-empty date components are reported.
  const publicationDate = {};
  for (const [key, value] of dateParts) {
    if (value) {
      publicationDate[key] = value;
    }
  }

  const issnXml = journalXml.ISSN;
  const issnValue = getText(issnXml);
  const issnKey = getAttribute(issnXml, 'IssnType') === 'Electronic' ? 'eIssn' : 'issn';

  const info = {
    title: getText(journalXml.Title),
    isoAbbreviation: getText(journalXml.ISOAbbreviation),
  };
  if (issnValue) {
    info[issnKey] = issnValue;
  }
  info.volume = getText(issueXml?.Volume);
  info.issue = getText(issueXml?.Issue);
  info.pages = getText(articleXml?.Pagination?.MedlinePgn);
  info.publicationDate = publicationDate;
  return info;
}
/**
 * Converts a MeshHeadingList into a flat list of MeSH term records.
 *
 * Every qualifier with non-empty text is kept (not only the first one). A term
 * counts as a major topic when the MajorTopicYN attribute is "Y" on the
 * heading itself, on its descriptor, or on any of its retained qualifiers.
 *
 * @param meshHeadingListXml - The XML MeshHeadingList element.
 * @returns One record per MeshHeading; an empty array when the list is falsy.
 */
export function extractMeshTerms(meshHeadingListXml) {
  if (!meshHeadingListXml) {
    return [];
  }
  const isFlaggedMajor = (node) => getAttribute(node, 'MajorTopicYN') === 'Y';

  const terms = [];
  for (const heading of ensureArray(meshHeadingListXml.MeshHeading)) {
    const descriptorXml = heading.DescriptorName;

    const qualifiers = [];
    for (const qualifierXml of ensureArray(heading.QualifierName)) {
      const qualifierName = getText(qualifierXml);
      if (!qualifierName) {
        continue; // a qualifier without text carries no information
      }
      const qualifier = { qualifierName };
      const qualifierUi = getAttribute(qualifierXml, 'UI');
      if (qualifierUi) {
        qualifier.qualifierUi = qualifierUi;
      }
      qualifier.isMajorTopic = isFlaggedMajor(qualifierXml);
      qualifiers.push(qualifier);
    }

    const term = { descriptorName: getText(descriptorXml) };
    const descriptorUi = getAttribute(descriptorXml, 'UI');
    if (descriptorUi) {
      term.descriptorUi = descriptorUi;
    }
    if (qualifiers.length > 0) {
      term.qualifiers = qualifiers;
    }
    term.isMajorTopic =
      isFlaggedMajor(heading) ||
      isFlaggedMajor(descriptorXml) ||
      qualifiers.some((qualifier) => qualifier.isMajorTopic);
    terms.push(term);
  }
  return terms;
}
/**
 * Converts a GrantList into plain grant records.
 *
 * Each record carries only the fields that have non-empty text in the XML, so
 * a Grant element with none of the four fields yields an empty object.
 *
 * @param grantListXml - The XML GrantList element.
 * @returns One record per Grant element; an empty array when the list is falsy.
 */
export function extractGrants(grantListXml) {
  if (!grantListXml) {
    return [];
  }
  // Output key -> XML child element, in the order the keys are emitted.
  const fieldTable = [
    ['grantId', 'GrantID'],
    ['acronym', 'Acronym'],
    ['agency', 'Agency'],
    ['country', 'Country'],
  ];
  return ensureArray(grantListXml.Grant).map((grantXml) =>
    Object.fromEntries(
      fieldTable
        .map(([key, tag]) => [key, getText(grantXml[tag])])
        .filter(([, value]) => value),
    ),
  );
}
/**
 * Finds the DOI of an article, trying the possible XML locations in priority
 * order and returning the first non-empty text:
 *
 *   1. ELocationID with EIdType="doi" and ValidYN="Y"
 *   2. any ELocationID with EIdType="doi"
 *   3. Article.ArticleIdList entries with IdType="doi"
 *   4. PubmedData.ArticleIdList entries with IdType="doi"
 *
 * A missing `articleXml` does not short-circuit the lookup: the PubmedData
 * list is still consulted, which matches how `extractPmcId` treats the same
 * pair of arguments.
 *
 * @param articleXml - The XML Article element (may be undefined).
 * @param pubmedDataArticleIdList - Optional ArticleIdList from PubmedData (sibling of MedlineCitation).
 * @returns The DOI string, or undefined when none of the locations holds one.
 */
export function extractDoi(articleXml, pubmedDataArticleIdList) {
  const isDoiArticleId = (articleId) => getAttribute(articleId, 'IdType') === 'doi';
  const doiLocations = ensureArray(articleXml?.ELocationID).filter(
    (eLocation) => getAttribute(eLocation, 'EIdType') === 'doi',
  );

  // Candidates in priority order. Validated ELocationIDs appear twice (once in
  // each of the first two groups), which is harmless: the first hit wins.
  const candidates = [
    ...doiLocations.filter((eLocation) => getAttribute(eLocation, 'ValidYN') === 'Y'),
    ...doiLocations,
    ...ensureArray(articleXml?.ArticleIdList?.ArticleId).filter(isDoiArticleId),
    ...ensureArray(pubmedDataArticleIdList?.ArticleId).filter(isDoiArticleId),
  ];

  for (const candidate of candidates) {
    const doi = getText(candidate);
    if (doi) {
      return doi;
    }
  }
  return undefined;
}
/**
 * Looks up the PMC ID of an article.
 *
 * Article.ArticleIdList is searched first and PubmedData.ArticleIdList second;
 * the first ArticleId with IdType="pmc" and non-empty text wins.
 *
 * @param articleXml - The XML Article element.
 * @param pubmedDataArticleIdList - Optional ArticleIdList from PubmedData.
 * @returns The PMC ID string (e.g. 'PMC1234567') or undefined.
 */
export function extractPmcId(articleXml, pubmedDataArticleIdList) {
  // ArticleId collections to scan, most preferred first.
  const sources = [articleXml?.ArticleIdList?.ArticleId];
  if (pubmedDataArticleIdList) {
    sources.push(pubmedDataArticleIdList.ArticleId);
  }

  for (const source of sources) {
    const pmcId = ensureArray(source)
      .filter((articleId) => getAttribute(articleId, 'IdType') === 'pmc')
      .map((articleId) => getText(articleId))
      .find(Boolean);
    if (pmcId) {
      return pmcId;
    }
  }
  return undefined;
}
/**
 * Lists the publication types declared for an article.
 *
 * @param publicationTypeListXml - The XML PublicationTypeList element.
 * @returns The non-empty PublicationType texts in document order; an empty
 *   array when the list element is falsy.
 */
export function extractPublicationTypes(publicationTypeListXml) {
  const labels = [];
  if (!publicationTypeListXml) {
    return labels;
  }
  for (const typeXml of ensureArray(publicationTypeListXml.PublicationType)) {
    const label = getText(typeXml);
    if (label) {
      labels.push(label);
    }
  }
  return labels;
}
/**
 * Collects the keywords of an article across all of its KeywordList elements.
 *
 * @param keywordListsXml - A single XML KeywordList element or an array of them.
 * @returns The non-empty keyword texts, list by list in document order; an
 *   empty array when nothing was passed.
 */
export function extractKeywords(keywordListsXml) {
  if (!keywordListsXml) {
    return [];
  }
  return ensureArray(keywordListsXml).flatMap((keywordList) =>
    ensureArray(keywordList.Keyword)
      .map((keywordXml) => getText(keywordXml))
      .filter(Boolean),
  );
}
/**
 * Extracts the abstract of an article as a single string.
 *
 * AbstractText may be one element or several (structured abstract). Every
 * section is whitespace-trimmed; a section that has a Label attribute is
 * rendered as "Label: text". Sections whose text is empty after trimming are
 * dropped entirely, including their label, so the result never contains blank
 * paragraphs or a dangling "Label:". The remaining sections are joined with a
 * blank line between them.
 *
 * @param abstractXml - The XML Abstract element from an Article.
 * @returns The abstract text, or undefined if not found or empty.
 */
export function extractAbstractText(abstractXml) {
  if (!abstractXml?.AbstractText) {
    return undefined;
  }

  const sections = [];
  for (const sectionXml of ensureArray(abstractXml.AbstractText)) {
    // A section is either a bare string or an element object with attributes.
    const isPlainString = typeof sectionXml === 'string';
    const body = (isPlainString ? sectionXml : getText(sectionXml)).trim();
    if (!body) {
      continue;
    }
    const rawLabel = isPlainString ? '' : getAttribute(sectionXml, 'Label');
    const label = rawLabel ? rawLabel.trim() : '';
    sections.push(label ? `${label}: ${body}` : body);
  }

  return sections.length > 0 ? sections.join('\n\n') : undefined;
}
/**
 * Reads the PMID of a citation.
 *
 * @param medlineCitationXml - The XML MedlineCitation element.
 * @returns The text of the PMID element, or undefined when the citation or
 *   its PMID element is missing/falsy.
 */
export function extractPmid(medlineCitationXml) {
  const pmidXml = medlineCitationXml?.PMID;
  return pmidXml ? getText(pmidXml) : undefined;
}
/**
 * Lists the ArticleDate entries of an article.
 *
 * @param articleXml - The XML Article element.
 * @returns One `{ dateType, year, month, day }` record per ArticleDate element
 *   (`dateType` comes from the DateType attribute); an empty array when the
 *   article has no ArticleDate.
 */
export function extractArticleDates(articleXml) {
  const dates = [];
  if (!articleXml?.ArticleDate) {
    return dates;
  }
  for (const dateXml of ensureArray(articleXml.ArticleDate)) {
    const { Year, Month, Day } = dateXml;
    dates.push({
      dateType: getAttribute(dateXml, 'DateType'),
      year: getText(Year),
      month: getText(Month),
      day: getText(Day),
    });
  }
  return dates;
}
/**
 * Turns one raw PubmedArticle element into a ParsedArticle object by running
 * the individual extractors of this module and assembling their results.
 *
 * `pmid`, `title` and `authors` are always present (`pmid` falls back to an
 * empty string). Every other field is added only when its extractor produced
 * a value: scalars must be defined, lists must be non-empty.
 *
 * @param xmlArticle - The raw XML PubmedArticle element.
 * @param options - Options controlling which optional sections to include:
 *   `includeMesh` (default true) and `includeGrants` (default false).
 * @returns A fully parsed article object.
 */
export function parseFullArticle(xmlArticle, options = {}) {
  const citationXml = xmlArticle.MedlineCitation;
  const articleXml = citationXml?.Article;
  const { includeMesh = true, includeGrants = false } = options;
  // PubmedData is a sibling of MedlineCitation and serves as a secondary ID source.
  const secondaryIdList = xmlArticle.PubmedData?.ArticleIdList;
  const { authors, affiliations } = extractAuthors(articleXml?.AuthorList);

  const parsed = {
    pmid: extractPmid(citationXml) ?? '',
    title: getText(articleXml?.ArticleTitle),
  };
  const setIfDefined = (key, value) => {
    if (value !== undefined) {
      parsed[key] = value;
    }
  };
  const setIfNonEmpty = (key, list) => {
    if (list !== undefined && list.length > 0) {
      parsed[key] = list;
    }
  };

  // Fields are attached in the order they appear in the resulting object.
  setIfDefined('abstractText', extractAbstractText(articleXml?.Abstract));
  setIfNonEmpty('affiliations', affiliations);
  parsed.authors = authors;
  setIfDefined('journalInfo', extractJournalInfo(articleXml?.Journal, articleXml));
  setIfNonEmpty('publicationTypes', extractPublicationTypes(articleXml?.PublicationTypeList));
  setIfNonEmpty('keywords', extractKeywords(citationXml?.KeywordList ?? articleXml?.KeywordList));
  if (includeMesh) {
    setIfNonEmpty('meshTerms', extractMeshTerms(citationXml?.MeshHeadingList));
  }
  if (includeGrants) {
    setIfNonEmpty('grantList', extractGrants(articleXml?.GrantList));
  }
  setIfDefined('doi', extractDoi(articleXml, secondaryIdList));
  setIfDefined('pmcId', extractPmcId(articleXml, secondaryIdList));
  setIfNonEmpty('articleDates', extractArticleDates(articleXml));
  return parsed;
}
//# sourceMappingURL=article-parser.js.map