@cyanheads/pubmed-mcp-server
Version:
A Model Context Protocol (MCP) server enabling AI agents to intelligently search, retrieve, and analyze biomedical literature from PubMed via NCBI E-utilities. Built on the mcp-ts-template for robust, production-ready performance.
230 lines (229 loc) • 8.77 kB
JavaScript
/**
* @fileoverview Helper functions for parsing detailed PubMed Article XML structures,
* typically from EFetch results.
* @module src/utils/parsing/ncbi-parsing/pubmedArticleStructureParser
*/
import { ensureArray, getText, getAttribute } from "./xmlGenericHelpers.js";
/**
 * Extracts and formats author information from XML.
 *
 * Every `Author` entry becomes one plain object. Personal authors are
 * described by `LastName` / `ForeName` / `Initials`. Collective (group)
 * authors carry a `CollectiveName` element instead of personal name parts;
 * that value is surfaced as `collectiveName` so such entries are not returned
 * as records with only empty name fields.
 *
 * @param authorListXml - The XML AuthorList element.
 * @returns An array of formatted author objects. `affiliation` is undefined
 *   when no affiliation text is found; the `collectiveName` key is only
 *   present when the author element carries a non-empty CollectiveName.
 */
export function extractAuthors(authorListXml) {
  if (!authorListXml) {
    return [];
  }
  return ensureArray(authorListXml.Author).map((auth) => {
    // Only the first AffiliationInfo entry is reported.
    const affiliations = ensureArray(auth.AffiliationInfo);
    const affiliation =
      affiliations.length > 0 ? getText(affiliations[0]?.Affiliation) : "";
    const collectiveName = getText(auth.CollectiveName);
    return {
      lastName: getText(auth.LastName),
      firstName: getText(auth.ForeName), // XML uses ForeName
      initials: getText(auth.Initials),
      affiliation: affiliation || undefined, // Ensure undefined if empty
      // Added only when present so personal-author objects keep their shape.
      ...(collectiveName ? { collectiveName } : {}),
    };
  });
}
/**
 * Builds a journal summary from an Article's Journal element.
 *
 * The year is read from PubDate/Year; the first four-digit run found in
 * PubDate/MedlineDate (if any) is handed to `getText` as the fallback value.
 * Pages are read from `MedlinePgn` directly on the citation and, when that is
 * empty, from `Article.Pagination.MedlinePgn`.
 *
 * @param journalXml - The XML Journal element from an Article.
 * @param medlineCitationXml - The XML MedlineCitation element (for MedlinePgn).
 * @returns Formatted journal information, or undefined when no Journal
 *   element is supplied. Empty publication-date parts are reported as undefined.
 */
export function extractJournalInfo(journalXml, medlineCitationXml) {
  if (!journalXml) {
    return undefined;
  }

  const journalIssue = journalXml.JournalIssue;
  const pubDate = journalIssue?.PubDate;

  // Fallback year candidate: first 4-digit sequence inside MedlineDate.
  const medlineDateForYear = getText(pubDate?.MedlineDate, "");
  const yearFromMedlineDate = medlineDateForYear.match(/\d{4}/)?.[0];
  const year = getText(pubDate?.Year, yearFromMedlineDate);

  let pages = getText(medlineCitationXml?.MedlinePgn);
  if (!pages) {
    pages = getText(medlineCitationXml?.Article?.Pagination?.MedlinePgn);
  }

  const publicationDate = {
    year: year || undefined,
    month: getText(pubDate?.Month) || undefined,
    day: getText(pubDate?.Day) || undefined,
    medlineDate: getText(pubDate?.MedlineDate) || undefined,
  };

  return {
    title: getText(journalXml.Title),
    isoAbbreviation: getText(journalXml.ISOAbbreviation),
    volume: getText(journalIssue?.Volume),
    issue: getText(journalIssue?.Issue),
    pages,
    publicationDate,
  };
}
/**
 * Extracts and formats MeSH terms from XML.
 *
 * One object is produced per `MeshHeading`. Only the first `QualifierName`
 * is reported in `qualifierName` / `qualifierUi`, but `isMajorTopic` takes
 * every qualifier into account: a heading is a major topic when
 * `MajorTopicYN="Y"` appears on the heading itself, on its `DescriptorName`,
 * or on any of its `QualifierName` elements.
 *
 * @param meshHeadingListXml - The XML MeshHeadingList element.
 * @returns An array of formatted MeSH term objects.
 */
export function extractMeshTerms(meshHeadingListXml) {
  if (!meshHeadingListXml) {
    return [];
  }
  const isFlaggedMajor = (node) => getAttribute(node, "MajorTopicYN") === "Y";

  return ensureArray(meshHeadingListXml.MeshHeading).map((mh) => {
    const qualifiers = ensureArray(mh.QualifierName);
    const firstQualifier = qualifiers[0];

    // Some schemas might place MajorTopicYN directly on MeshHeading if no qualifiers.
    const isMajorRoot = isFlaggedMajor(mh);
    const isMajorDescriptor = isFlaggedMajor(mh.DescriptorName);
    // The major flag may sit on a qualifier other than the first one, so
    // every qualifier has to be inspected, not just qualifiers[0].
    const isMajorQualifier = qualifiers.some(
      (qualifier) => Boolean(qualifier) && isFlaggedMajor(qualifier),
    );

    return {
      descriptorName: getText(mh.DescriptorName),
      descriptorUi: getAttribute(mh.DescriptorName, "UI"),
      qualifierName: firstQualifier ? getText(firstQualifier) : undefined,
      qualifierUi: firstQualifier
        ? getAttribute(firstQualifier, "UI")
        : undefined,
      isMajorTopic: isMajorRoot || isMajorDescriptor || isMajorQualifier,
    };
  });
}
/**
 * Converts a GrantList element into a list of grant records.
 *
 * @param grantListXml - The XML GrantList element.
 * @returns An array of grant objects with `grantId`, `agency` and `country`;
 *   any field whose text is empty is reported as undefined. An empty array is
 *   returned when no GrantList is supplied.
 */
export function extractGrants(grantListXml) {
  if (!grantListXml) {
    return [];
  }
  // Normalises empty text to undefined, as every field below needs it.
  const textOrUndefined = (node) => getText(node) || undefined;

  const grantEntries = ensureArray(grantListXml.Grant);
  return grantEntries.map((grant) => {
    const grantId = textOrUndefined(grant.GrantID);
    const agency = textOrUndefined(grant.Agency);
    const country = textOrUndefined(grant.Country);
    return { grantId, agency, country };
  });
}
/**
 * Locates the article's DOI.
 *
 * Candidates are examined in priority order and the first one with non-empty
 * text wins:
 *   1. ELocationID entries with EIdType="doi" and ValidYN="Y"
 *   2. any ELocationID entry with EIdType="doi"
 *   3. ArticleIdList/ArticleId entries with IdType="doi"
 *
 * @param articleXml - The XML Article element.
 * @returns The DOI string or undefined.
 */
export function extractDoi(articleXml) {
  if (!articleXml) {
    return undefined;
  }

  const doiLocations = ensureArray(articleXml.ELocationID).filter(
    (location) => getAttribute(location, "EIdType") === "doi",
  );
  const validDoiLocations = doiLocations.filter(
    (location) => getAttribute(location, "ValidYN") === "Y",
  );
  const doiArticleIds = ensureArray(articleXml.ArticleIdList?.ArticleId).filter(
    (articleId) => getAttribute(articleId, "IdType") === "doi",
  );

  // Order encodes the priority described above; repeats are harmless because
  // the first candidate with text ends the search.
  const candidates = [...validDoiLocations, ...doiLocations, ...doiArticleIds];
  for (const candidate of candidates) {
    const text = getText(candidate);
    if (text) {
      return text;
    }
  }
  return undefined;
}
/**
 * Collects the text of every PublicationType entry.
 *
 * @param publicationTypeListXml - The XML PublicationTypeList element.
 * @returns An array of publication type strings; entries with empty text are
 *   left out. An empty array is returned when no list is supplied.
 */
export function extractPublicationTypes(publicationTypeListXml) {
  if (!publicationTypeListXml) {
    return [];
  }
  const typeNames = [];
  for (const entry of ensureArray(publicationTypeListXml.PublicationType)) {
    const name = getText(entry);
    if (name) {
      typeNames.push(name);
    }
  }
  return typeNames;
}
/**
 * Gathers keyword strings from one or more KeywordList elements.
 *
 * @param keywordListsXml - The XML KeywordList element or an array of them.
 * @returns A flat array of keyword strings in document order; keywords with
 *   empty text are omitted. An empty array is returned when nothing is supplied.
 */
export function extractKeywords(keywordListsXml) {
  if (!keywordListsXml) {
    return [];
  }
  return ensureArray(keywordListsXml).flatMap((keywordList) =>
    ensureArray(keywordList.Keyword)
      .map((keyword) => getText(keyword))
      .filter(Boolean),
  );
}
/**
 * Extracts abstract text from XML. Handles structured abstracts by
 * concatenating their sections.
 *
 * `AbstractText` may be a single value or an array, and each entry may be a
 * plain string or an element object. Every section is trimmed, whichever
 * form it arrives in, and sections that are empty after trimming are dropped.
 * For element objects, a non-blank `Label` attribute is prepended as
 * `"Label: text"`. Remaining sections are joined with a blank line.
 *
 * @param abstractXml - The XML Abstract element from an Article.
 * @returns The abstract text string, or undefined if not found or empty.
 */
export function extractAbstractText(abstractXml) {
  if (!abstractXml || !abstractXml.AbstractText) {
    return undefined;
  }

  const sections = ensureArray(abstractXml.AbstractText)
    .map((at) => {
      // AbstractText can be a string directly or an element object.
      if (typeof at === "string") {
        return at.trim();
      }
      const sectionText = getText(at).trim();
      // Trim before testing so a blank label never produces a ": text" prefix.
      const rawLabel = getAttribute(at, "Label");
      const label = typeof rawLabel === "string" ? rawLabel.trim() : "";
      return label && sectionText ? `${label}: ${sectionText}` : sectionText;
    })
    .filter(Boolean); // Drop sections that are empty after trimming

  // Every surviving section is non-empty and already trimmed.
  return sections.length > 0 ? sections.join("\n\n") : undefined;
}
/**
 * Extracts PMID from MedlineCitation.
 *
 * The result is normalised so that an empty text value is reported as
 * undefined rather than as an empty string, matching the documented contract
 * and the way other optional fields in this module are normalised.
 *
 * @param medlineCitationXml - The XML MedlineCitation element.
 * @returns The PMID string or undefined.
 */
export function extractPmid(medlineCitationXml) {
  const pmidNode = medlineCitationXml?.PMID;
  if (!pmidNode) {
    return undefined;
  }
  return getText(pmidNode) || undefined;
}
/**
 * Reads every ArticleDate entry of an Article element.
 *
 * @param articleXml - The XML Article element.
 * @returns An array of objects holding the `DateType` attribute plus the
 *   text of `Year`, `Month` and `Day`. An empty array is returned when the
 *   article or its ArticleDate is missing.
 */
export function extractArticleDates(articleXml) {
  const rawDates = articleXml?.ArticleDate;
  if (!rawDates) {
    return [];
  }
  return ensureArray(rawDates).map((dateNode) => {
    const dateType = getAttribute(dateNode, "DateType");
    const year = getText(dateNode.Year);
    const month = getText(dateNode.Month);
    const day = getText(dateNode.Day);
    return { dateType, year, month, day };
  });
}