UNPKG

@cyanheads/pubmed-mcp-server

Version:

Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.

290 lines 12.1 kB
/**
 * @fileoverview Parser for PMC full-text articles in JATS XML format.
 * Consumes the ordered tree shape from fast-xml-parser's `preserveOrder: true`
 * mode (see `pmc-xml-helpers.ts`) so that mixed-content elements — `<p>`,
 * `<abstract>`, `<title>` with inline `<italic>`, `<xref>`, `<sup>`, etc. —
 * read back in document order. Parsing in the default object shape scrambles
 * inline children and drops body sections for markup-heavy articles.
 *
 * NOTE(review): the traversal helpers (`findAll`, `findOne`, `textContent`, …)
 * live in `pmc-xml-helpers.js`. This module calls `findOne`/`textContent` with
 * possibly-missing nodes, so it assumes they tolerate `undefined` input and
 * that `textContent` yields an empty string when there is no text — confirm
 * against the helper module.
 * @module src/services/ncbi/parsing/pmc-article-parser
 */
import { attrOf, childrenOf, findAll, findOne, tagNameOf, textContent, } from './pmc-xml-helpers.js';

// ─── Article IDs ────────────────────────────────────────────────────────────

/**
 * Look up an `<article-id>` value by its `pub-id-type` attribute.
 *
 * @param {object | undefined} articleMeta - `<article-meta>` node, if present.
 * @param {string} pubIdType - Value of `pub-id-type` to match (e.g. `'doi'`).
 * @returns {string | undefined} Text of the first matching id that has
 *   non-empty text, or `undefined` when there is none.
 */
function extractArticleId(articleMeta, pubIdType) {
    if (!articleMeta)
        return;
    for (const idNode of findAll(articleMeta, 'article-id')) {
        if (attrOf(idNode, 'pub-id-type') !== pubIdType)
            continue;
        const value = textContent(idNode);
        // Keep scanning past an empty match so a later, populated id of the
        // same type is not lost.
        if (value)
            return value;
    }
    return;
}

// ─── Authors & Affiliations ─────────────────────────────────────────────────

/**
 * Extract authors from a single `<contrib-group>` node.
 *
 * A `<contrib>` is treated as an author when it has no `contrib-type`
 * attribute or when that attribute equals `'author'`; every other contributor
 * type is skipped. A `<collab>` child yields a `{ collectiveName }` entry;
 * otherwise `<name>` yields `{ lastName?, givenNames? }`.
 *
 * @param {object | undefined} contribGroup - `<contrib-group>` node.
 * @returns {Array<object>} Author entries; empty when the group is missing.
 */
export function extractJatsAuthors(contribGroup) {
    if (!contribGroup)
        return [];
    const authors = [];
    for (const contrib of findAll(contribGroup, 'contrib')) {
        const contribType = attrOf(contrib, 'contrib-type');
        if (contribType && contribType !== 'author')
            continue;
        const collab = findOne(contrib, 'collab');
        if (collab) {
            const collectiveName = textContent(collab);
            if (collectiveName)
                authors.push({ collectiveName });
            continue;
        }
        const nameNode = findOne(contrib, 'name');
        if (!nameNode)
            continue;
        const lastName = textContent(findOne(nameNode, 'surname')) || undefined;
        const givenNames = textContent(findOne(nameNode, 'given-names')) || undefined;
        // A `<name>` with no usable parts would otherwise produce an empty `{}`
        // author entry, which carries no information for consumers.
        if (!lastName && !givenNames)
            continue;
        authors.push({
            ...(lastName && { lastName }),
            ...(givenNames && { givenNames }),
        });
    }
    return authors;
}

/**
 * Collect the text of every `<aff>` found via `findAll` on `<article-meta>`.
 *
 * @param {object | undefined} articleMeta - `<article-meta>` node.
 * @returns {string[]} Non-empty affiliation strings.
 */
function extractAffiliations(articleMeta) {
    if (!articleMeta)
        return [];
    const result = [];
    for (const aff of findAll(articleMeta, 'aff')) {
        const text = textContent(aff);
        if (text)
            result.push(text);
    }
    return result;
}

// ─── Journal & Publication Date ─────────────────────────────────────────────

/**
 * Build the journal descriptor from `<journal-meta>` (title, ISSN) and
 * `<article-meta>` (volume, issue, pages).
 *
 * The title is read from `<journal-title-group>` first and falls back to a
 * `<journal-title>` directly under `<journal-meta>`. Only the first `<issn>`
 * is used. `pages` is `"fpage-lpage"` when both are present, `fpage` alone
 * when only the first page is known, and omitted otherwise.
 *
 * @param {object | undefined} journalMeta - `<journal-meta>` node.
 * @param {object | undefined} articleMeta - `<article-meta>` node.
 * @returns {object | undefined} Journal info with only the populated fields,
 *   or `undefined` when `journalMeta` is missing or nothing was found.
 */
function extractJournal(journalMeta, articleMeta) {
    if (!journalMeta)
        return;
    const titleGroup = findOne(journalMeta, 'journal-title-group');
    const title = textContent(findOne(titleGroup, 'journal-title')) ||
        textContent(findOne(journalMeta, 'journal-title')) ||
        undefined;
    const firstIssn = findAll(journalMeta, 'issn')[0];
    const issn = firstIssn ? textContent(firstIssn) || undefined : undefined;
    const volume = articleMeta
        ? textContent(findOne(articleMeta, 'volume')) || undefined
        : undefined;
    const issue = articleMeta
        ? textContent(findOne(articleMeta, 'issue')) || undefined
        : undefined;
    const fpage = articleMeta ? textContent(findOne(articleMeta, 'fpage')) : '';
    const lpage = articleMeta ? textContent(findOne(articleMeta, 'lpage')) : '';
    const pages = fpage && lpage ? `${fpage}-${lpage}` : fpage || undefined;
    if (!title && !issn && !volume && !issue && !pages)
        return;
    return {
        ...(title && { title }),
        ...(issn && { issn }),
        ...(volume && { volume }),
        ...(issue && { issue }),
        ...(pages && { pages }),
    };
}

/**
 * Pick one `<pub-date>` and return its year/month/day parts.
 *
 * Preference order: `pub-type="epub"`, then `pub-type="ppub"`, then
 * `date-type="pub"`, then the first `<pub-date>` encountered.
 *
 * @param {object | undefined} articleMeta - `<article-meta>` node.
 * @returns {{ year: string, month?: string, day?: string } | undefined}
 *   `undefined` when there is no date or the chosen date has no year.
 */
function extractPubDate(articleMeta) {
    if (!articleMeta)
        return;
    const dates = findAll(articleMeta, 'pub-date');
    if (dates.length === 0)
        return;
    const preferred = dates.find((d) => attrOf(d, 'pub-type') === 'epub') ??
        dates.find((d) => attrOf(d, 'pub-type') === 'ppub') ??
        dates.find((d) => attrOf(d, 'date-type') === 'pub') ??
        dates[0];
    if (!preferred)
        return;
    const year = textContent(findOne(preferred, 'year')) || undefined;
    if (!year)
        return;
    const month = textContent(findOne(preferred, 'month')) || undefined;
    const day = textContent(findOne(preferred, 'day')) || undefined;
    return {
        year,
        ...(month && { month }),
        ...(day && { day }),
    };
}

// ─── Abstract & Keywords ────────────────────────────────────────────────────

/**
 * Flatten the `<abstract>` into plain text.
 *
 * - Structured abstract (`<sec>` children): each section becomes
 *   `"Title: text"` (or just the text when untitled), joined by blank lines.
 * - Otherwise, `<p>` children are joined with single spaces.
 * - Otherwise, the whole abstract's text content is used.
 *
 * @param {object | undefined} articleMeta - `<article-meta>` node.
 * @returns {string | undefined} Abstract text, or `undefined` when empty.
 */
function extractAbstract(articleMeta) {
    if (!articleMeta)
        return;
    const abstractNode = findOne(articleMeta, 'abstract');
    if (!abstractNode)
        return;
    const sections = findAll(abstractNode, 'sec');
    if (sections.length > 0) {
        const parts = [];
        for (const sec of sections) {
            const title = textContent(findOne(sec, 'title'));
            const text = findAll(sec, 'p')
                .map((p) => textContent(p))
                .filter(Boolean)
                .join(' ');
            if (title && text)
                parts.push(`${title}: ${text}`);
            else if (text)
                parts.push(text);
        }
        return parts.join('\n\n').trim() || undefined;
    }
    const paragraphs = findAll(abstractNode, 'p');
    if (paragraphs.length > 0) {
        return (paragraphs
            .map((p) => textContent(p))
            .filter(Boolean)
            .join(' ') || undefined);
    }
    return textContent(abstractNode) || undefined;
}

/**
 * Collect `<kwd>` text from every `<kwd-group>` under `<article-meta>`.
 *
 * @param {object | undefined} articleMeta - `<article-meta>` node.
 * @returns {string[]} Non-empty keywords in encounter order (not de-duplicated).
 */
function extractKeywords(articleMeta) {
    if (!articleMeta)
        return [];
    const keywords = [];
    for (const group of findAll(articleMeta, 'kwd-group')) {
        for (const kwd of findAll(group, 'kwd')) {
            const text = textContent(kwd);
            if (text)
                keywords.push(text);
        }
    }
    return keywords;
}

// ─── Body Sections ──────────────────────────────────────────────────────────

/**
 * Extract body sections from a `<body>` node, walking children in document order.
 * Consecutive bare `<p>` siblings are collected into an untitled section so
 * articles with mixed structure (direct paragraphs + trailing `<sec>`, common
 * in manuscript-submitted PMC deposits) preserve their main text.
 *
 * Children other than `<p>` and `<sec>` are ignored and do not end a pending
 * run of paragraphs.
 *
 * @param {object | undefined} body - `<body>` node.
 * @returns {Array<object>} Sections in document order; empty when no body.
 */
export function extractBodySections(body) {
    if (!body)
        return [];
    const sections = [];
    let pendingParagraphs = [];
    // Emit the accumulated bare paragraphs as one untitled section.
    const flushPending = () => {
        if (pendingParagraphs.length > 0) {
            sections.push({ text: pendingParagraphs.join('\n\n') });
            pendingParagraphs = [];
        }
    };
    for (const child of childrenOf(body)) {
        const tag = tagNameOf(child);
        if (tag === 'p') {
            const text = textContent(child);
            if (text)
                pendingParagraphs.push(text);
        }
        else if (tag === 'sec') {
            // Flush first so untitled text keeps its position before this section.
            flushPending();
            const section = extractSection(child);
            if (section)
                sections.push(section);
        }
    }
    flushPending();
    return sections;
}

/**
 * Convert one `<sec>` node (recursively) into a section object.
 *
 * @param {object} sec - `<sec>` node.
 * @returns {object | null} `{ title?, label?, text, subsections? }`, or `null`
 *   when the section has neither paragraph text nor non-empty subsections.
 *   `text` is always present and may be an empty string when only
 *   subsections carry content.
 */
function extractSection(sec) {
    const title = textContent(findOne(sec, 'title')) || undefined;
    const label = textContent(findOne(sec, 'label')) || undefined;
    const paragraphs = findAll(sec, 'p');
    const textParts = paragraphs.map((p) => textContent(p)).filter(Boolean);
    const subsections = findAll(sec, 'sec')
        .map(extractSection)
        .filter((s) => s !== null);
    const text = textParts.join('\n\n');
    if (!text && subsections.length === 0)
        return null;
    return {
        ...(title && { title }),
        ...(label && { label }),
        text,
        ...(subsections.length > 0 && { subsections }),
    };
}

// ─── References ─────────────────────────────────────────────────────────────

/**
 * Extract references from a `<back>` node. Prefers mixed-citation over
 * element-citation. Only the first `<ref-list>` returned by `findOne` is read;
 * `<ref>` entries without citation text are skipped.
 *
 * @param {object | undefined} back - `<back>` node.
 * @returns {Array<{ id?: string, label?: string, citation: string }>}
 *   References in encounter order; empty when there is no `<ref-list>`.
 */
export function extractReferences(back) {
    if (!back)
        return [];
    const refList = findOne(back, 'ref-list');
    if (!refList)
        return [];
    const results = [];
    for (const ref of findAll(refList, 'ref')) {
        const citationNode = findOne(ref, 'mixed-citation') ?? findOne(ref, 'element-citation');
        if (!citationNode)
            continue;
        const citation = textContent(citationNode);
        if (!citation)
            continue;
        const id = attrOf(ref, 'id');
        const label = textContent(findOne(ref, 'label')) || undefined;
        results.push({
            ...(id && { id }),
            ...(label && { label }),
            citation,
        });
    }
    return results;
}

// ─── Main Parser ────────────────────────────────────────────────────────────

/**
 * Parse a single JATS `<article>` node (from PMC EFetch via the ordered parser)
 * into a structured `ParsedPmcArticle`. The input node is the element wrapper
 * itself — `{ article: [...], ':@': { '@_article-type': ... } }` — not the
 * outer `<pmc-articleset>`.
 *
 * The PMC id is taken from `pub-id-type="pmcid"`, falling back to
 * `"pmc-uid"`, and is normalised to carry a `PMC` prefix. When no id is found
 * `pmcId` is an empty string and `pmcUrl` is still emitted with an empty id
 * segment, so callers should check `pmcId` before using the URL.
 *
 * @param {object} articleNode - Ordered-parser node for `<article>`.
 * @returns {object} Parsed article. `pmcId`, `sections` and `pmcUrl` are
 *   always present; every other field is included only when populated.
 */
export function parsePmcArticle(articleNode) {
    const front = findOne(articleNode, 'front');
    const articleMeta = findOne(front, 'article-meta');
    const journalMeta = findOne(front, 'journal-meta');
    const body = findOne(articleNode, 'body');
    const back = findOne(articleNode, 'back');
    const pmcId = extractArticleId(articleMeta, 'pmcid') ?? extractArticleId(articleMeta, 'pmc-uid') ?? '';
    const pmid = extractArticleId(articleMeta, 'pmid');
    const doi = extractArticleId(articleMeta, 'doi');
    const titleGroup = findOne(articleMeta, 'title-group');
    const title = textContent(findOne(titleGroup, 'article-title')) || undefined;
    const authors = collectAuthors(articleMeta);
    const affiliations = extractAffiliations(articleMeta);
    const journal = extractJournal(journalMeta, articleMeta);
    const publicationDate = extractPubDate(articleMeta);
    const abstract = extractAbstract(articleMeta);
    const keywords = extractKeywords(articleMeta);
    const sections = extractBodySections(body);
    const references = extractReferences(back);
    // Ids may arrive with or without the "PMC" prefix; emit the prefixed form.
    const normalizedPmcId = !pmcId ? '' : pmcId.startsWith('PMC') ? pmcId : `PMC${pmcId}`;
    const articleType = attrOf(articleNode, 'article-type');
    return {
        pmcId: normalizedPmcId,
        ...(pmid && { pmid }),
        ...(doi && { doi }),
        ...(title && { title }),
        ...(authors.length > 0 && { authors }),
        ...(affiliations.length > 0 && { affiliations }),
        ...(journal && { journal }),
        ...(publicationDate && { publicationDate }),
        ...(abstract && { abstract }),
        ...(keywords.length > 0 && { keywords }),
        sections,
        ...(references.length > 0 && { references }),
        ...(articleType && { articleType }),
        pmcUrl: `https://www.ncbi.nlm.nih.gov/pmc/articles/${normalizedPmcId}/`,
        ...(pmid && { pubmedUrl: `https://pubmed.ncbi.nlm.nih.gov/${pmid}/` }),
    };
}

/**
 * Collect authors across every `<contrib-group>` under `<article-meta>`.
 *
 * @param {object | undefined} articleMeta - `<article-meta>` node.
 * @returns {Array<object>} Concatenated author entries from all groups.
 */
function collectAuthors(articleMeta) {
    if (!articleMeta)
        return [];
    const result = [];
    for (const group of findAll(articleMeta, 'contrib-group')) {
        result.push(...extractJatsAuthors(group));
    }
    return result;
}
//# sourceMappingURL=pmc-article-parser.js.map