UNPKG

@cyanheads/pubmed-mcp-server

Version:

Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.

1,103 lines 52 kB
/** * @fileoverview Full-text fetch tool. Resolves full-text articles through a * three-stage chain: NCBI PMC EFetch → Europe PMC `fullTextXML` → Unpaywall. * Accepts three mutually-exclusive input shapes: * * - `pmcids` — fetch directly by PMC ID. Articles not in PMC fall through to * EPMC by PMC ID, then to Unpaywall when the DOI is available. * - `pmids` — resolve PMID → PMCID via PMC ID Converter, then run the chain. * - `dois` — skip PMC EFetch (no PMCID); resolve via EPMC search-by-DOI → * fullTextXML, then Unpaywall. Covers EPMC-only OA and preprints with no * PubMed presence. * * Output uses a discriminated union on `source` (`pmc` | `unpaywall`) with an * extra `viaSource` discriminator that records which layer produced the * content. EPMC's JATS reuses the `pmc` schema shape because it's the same * DTD; `viaSource: 'europepmc'` distinguishes it from PMC EFetch output. * * @module src/mcp-server/tools/definitions/fetch-fulltext.tool */ import { tool, z } from '@cyanheads/mcp-ts-core'; import { htmlExtractor, pdfParser } from '@cyanheads/mcp-ts-core/utils'; import { EUROPEPMC_SERVICE_ERRORS, NCBI_SERVICE_ERRORS, UNPAYWALL_SERVICE_ERRORS, } from '../../../services/error-contracts.js'; import { getEuropePmcService, } from '../../../services/europe-pmc/europe-pmc-service.js'; import { getNcbiService } from '../../../services/ncbi/ncbi-service.js'; import { extractDoi, extractPmid } from '../../../services/ncbi/parsing/article-parser.js'; import { parsePmcArticle } from '../../../services/ncbi/parsing/pmc-article-parser.js'; import { findAll, findOne } from '../../../services/ncbi/parsing/pmc-xml-helpers.js'; import { ensureArray } from '../../../services/ncbi/parsing/xml-helpers.js'; import { getUnpaywallService, } from '../../../services/unpaywall/unpaywall-service.js'; import { conceptMeta, EDAM_DATA_RETRIEVAL, SCHEMA_SCHOLARLY_ARTICLE } from './_concepts.js'; import { pmidStringSchema } from './_schemas.js'; function normalizePmcId(id) { return id.replace(/^PMC/i, ''); } function withPmcPrefix(id) { return id.startsWith('PMC') ? id : `PMC${id}`; } function filterSections(sections, sectionFilter) { const lowerFilter = sectionFilter.map((s) => s.toLowerCase()); return sections.filter((s) => s.title && lowerFilter.some((f) => s.title?.toLowerCase().includes(f))); } function applyPmcFilters(article, filters) { let out = article; if (filters.sections?.length) { out = { ...out, sections: filterSections(out.sections, filters.sections) }; } if (filters.maxSections !== undefined) { out = { ...out, sections: out.sections.slice(0, filters.maxSections) }; } if (!filters.includeReferences) { const { references: _, ...rest } = out; out = rest; } return out; } // ─── Schemas ───────────────────────────────────────────────────────────────── const SubsectionSchema = z .object({ title: z.string().optional().describe('Subsection heading'), label: z.string().optional().describe('Subsection label'), text: z.string().describe('Subsection body text'), }) .describe('Article subsection'); const SectionSchema = z .object({ title: z.string().optional().describe('Section heading'), label: z.string().optional().describe('Section label'), text: z.string().describe('Section body text'), subsections: z.array(SubsectionSchema).optional().describe('Nested subsections'), }) .describe('Article body section'); const AuthorSchema = z .object({ collectiveName: z.string().optional().describe('Group name'), givenNames: z.string().optional().describe('Given names'), lastName: z.string().optional().describe('Last name'), }) .describe('Author entry'); const JournalSchema = z .object({ title: z.string().optional().describe('Journal title'), issn: z.string().optional().describe('ISSN'), volume: z.string().optional().describe('Volume number'), issue: z.string().optional().describe('Issue number'), pages: z.string().optional().describe('Page range'), }) .describe('Journal information'); const ReferenceSchema = z .object({ citation: z.string().describe('Citation text'), id: z.string().optional().describe('Reference ID'), label: z.string().optional().describe('Reference label'), }) .describe('Reference entry'); const PublicationDateSchema = z .object({ year: z.string().optional().describe('Publication year'), month: z.string().optional().describe('Publication month'), day: z.string().optional().describe('Publication day'), }) .describe('Publication date'); const PmcArticleSchema = z .object({ source: z .literal('pmc') .describe('Structured JATS — same DTD whether sourced from NCBI PMC or Europe PMC'), viaSource: z .enum(['pmc', 'europepmc']) .describe('Which layer produced the JATS: `pmc` for NCBI PMC EFetch (db=pmc), `europepmc` for Europe PMC `fullTextXML`. Both paths return the same JATS shape; the discriminator records origin for observability and license attribution.'), pmcId: z .string() .optional() .describe('PMC ID — present for NCBI PMC records and Europe PMC entries that have a PMC counterpart. Absent for EPMC-only records like preprints; use `epmcId` in that case.'), pmcUrl: z.string().optional().describe('PMC URL — derived from `pmcId` when present'), pmid: z.string().optional().describe('PubMed ID'), pubmedUrl: z.string().optional().describe('PubMed URL'), doi: z.string().optional().describe('DOI'), title: z.string().optional().describe('Article title'), abstract: z.string().optional().describe('Abstract'), authors: z.array(AuthorSchema).optional().describe('Authors'), affiliations: z.array(z.string()).optional().describe('Author affiliations'), journal: JournalSchema.optional(), keywords: z.array(z.string()).optional().describe('Keywords'), articleType: z.string().optional().describe('Article type'), publicationDate: PublicationDateSchema.optional(), sections: z.array(SectionSchema).describe('Article body sections'), references: z.array(ReferenceSchema).optional().describe('Reference list'), epmcId: z .string() .optional() .describe('Europe PMC record id — present when `viaSource` is `europepmc`'), epmcSource: z .string() .optional() .describe('Europe PMC source code when `viaSource` is `europepmc`. Common values: `MED` (PubMed-derived), `PMC` (PMC counterpart), `PPR` (preprint), `PAT` (patent), `AGR` (Agricola), plus less common codes (`CTX`, `CBA`, `ETH`, `HIR`). Treat as opaque — EPMC may introduce new codes.'), }) .describe('Structured JATS full-text article. `viaSource` records whether the JATS came from NCBI PMC or Europe PMC.'); const UnpaywallArticleSchema = z .object({ source: z .literal('unpaywall') .describe('Content fetched from an open-access copy indexed by Unpaywall. Best-effort — structural fidelity depends on `contentFormat`.'), viaSource: z .literal('unpaywall') .describe('Layer that produced this article. Constant `unpaywall` for this branch.'), contentFormat: z .enum(['html-markdown', 'pdf-text']) .describe('How `content` was extracted. html-markdown: Defuddle extracted Markdown from an HTML landing page; light section structure may survive but is not guaranteed. pdf-text: unpdf extracted plain text from a PDF; no section, reference, or heading structure.'), pmid: z .string() .optional() .describe('PubMed ID when input was `pmids`; absent for `dois` input'), pubmedUrl: z.string().optional().describe('PubMed URL — present when `pmid` is set'), doi: z.string().describe('DOI used to locate the open-access copy'), sourceUrl: z.string().describe('URL the content was fetched from'), title: z.string().optional().describe('Detected article title when present'), content: z.string().describe('Full article text — Markdown or plain text per `contentFormat`'), wordCount: z .number() .optional() .describe('Approximate word count reported by the HTML extractor; absent for PDFs'), totalPages: z .number() .optional() .describe('Page count reported by the PDF extractor; absent for HTML'), license: z.string().optional().describe('License identifier from Unpaywall (e.g. cc-by, cc0)'), hostType: z .string() .optional() .describe('`publisher` or `repository` — where the OA copy is hosted'), version: z .string() .optional() .describe('OA version: submittedVersion | acceptedVersion | publishedVersion'), }) .describe('Best-effort full text from an open-access copy'); const ArticleSchema = z .discriminatedUnion('source', [PmcArticleSchema, UnpaywallArticleSchema]) .describe('Full-text article; shape depends on `source` (pmc = structured JATS, unpaywall = best-effort)'); const UnavailableReasonSchema = z .enum([ 'not-found', 'no-pmc-fallback-disabled', 'no-epmc-fulltext', 'no-doi', 'no-oa', 'fetch-failed', 'parse-failed', 'service-error', ]) .describe('Why no full text was returned. not-found: upstream returned no record for this ID. no-pmc-fallback-disabled: every tier was skipped (`triedTiers` is all `not-attempted`) — typically because EPMC (`EUROPEPMC_ENABLED`) and Unpaywall (`UNPAYWALL_EMAIL`) are not configured. no-epmc-fulltext: EPMC indexed the record but publishes no fullTextXML. no-doi: no DOI to query Unpaywall. no-oa: Unpaywall has no OA copy. fetch-failed: download failed. parse-failed: extraction empty. service-error: upstream server failure (threw, timed out, or returned malformed data).'); const TierOutcomeSchema = z .enum([ 'not-attempted', 'miss', 'no-fulltext', 'no-doi', 'no-oa', 'fetch-failed', 'parse-failed', 'service-error', ]) .describe('Per-tier outcome. not-attempted: tier was skipped. miss: tier returned no record. no-fulltext: EPMC indexed the record but publishes no fullTextXML. no-doi: no DOI to query Unpaywall. no-oa: Unpaywall reports no open-access copy. fetch-failed: OA copy download failed. parse-failed: extraction produced empty content. service-error: tier service threw.'); const TriedTierSchema = z .object({ tier: z.enum(['pmc', 'europepmc', 'unpaywall']).describe('Which tier in the resolution chain'), outcome: TierOutcomeSchema, detail: z.string().optional().describe('Tier-specific context when available'), }) .describe('One tier the resolution chain attempted, with its outcome'); const UnavailableSchema = z .object({ id: z .string() .describe('Identifier the chain could not resolve — PMID, PMCID, or DOI per `idType`'), idType: z.enum(['pmid', 'pmcid', 'doi']).describe('Which input branch the id came from'), reason: UnavailableReasonSchema, triedTiers: z .array(TriedTierSchema) .describe('Per-tier outcomes the chain produced for this id, in execution order. Covers `pmc`, `europepmc`, and `unpaywall` — the same tiers the tool description references. Tiers that the chain skipped appear as `outcome: not-attempted` with a `detail` explaining why.'), }) .describe('One identifier that could not be returned, with the full chain it traversed'); // ─── Tool Definition ───────────────────────────────────────────────────────── export const fetchFulltextTool = tool('pubmed_fetch_fulltext', { description: 'Fetch full-text articles from PubMed Central with structured sections and references. When PMC misses, transparently falls back to Europe PMC `fullTextXML` (structured JATS for records with a PMC counterpart), then to Unpaywall — publisher-hosted or institutional open-access copies as HTML-as-Markdown or PDF-as-text. Provide exactly one of `pmcids` (PMC IDs directly), `pmids` (PubMed IDs, auto-resolved), or `dois` (preprints and EPMC-only OA records that lack PMID/PMCID).', annotations: { readOnlyHint: true, openWorldHint: true }, _meta: conceptMeta([SCHEMA_SCHOLARLY_ARTICLE, EDAM_DATA_RETRIEVAL]), sourceUrl: 'https://github.com/cyanheads/pubmed-mcp-server/blob/main/src/mcp-server/tools/definitions/fetch-fulltext.tool.ts', errors: [ ...NCBI_SERVICE_ERRORS, ...UNPAYWALL_SERVICE_ERRORS, ...EUROPEPMC_SERVICE_ERRORS, ], input: z .object({ pmcids: z .array(z.string()) .min(1) .max(10) .optional() .describe('PMC IDs to fetch (e.g. ["PMC9575052"]). Provide exactly one of `pmcids`, `pmids`, or `dois`.'), pmids: z .array(pmidStringSchema) .min(1) .max(10) .optional() .describe('PubMed IDs. Provide exactly one of `pmcids`, `pmids`, or `dois`. Articles in PMC are returned as structured JATS; articles not in PMC fall through to Europe PMC (when EPMC has a `fullTextXML`), then to Unpaywall when `UNPAYWALL_EMAIL` is set and a DOI is available.'), dois: z .array(z.string().min(3)) .min(1) .max(10) .optional() .describe('DOIs to resolve (e.g. ["10.21203/rs.3.rs-9010375/v1"]). Provide exactly one of `pmcids`, `pmids`, or `dois`. Covers preprints and EPMC-only OA records that lack PMID/PMCID. Chain: Europe PMC search-by-DOI → fullTextXML → Unpaywall.'), includeReferences: z .boolean() .default(false) .describe('Include reference list. Applies to `source=pmc` results only.'), maxSections: z .number() .int() .min(1) .max(50) .optional() .describe('Maximum top-level body sections. Applies to `source=pmc` results only.'), sections: z .array(z.string()) .optional() .describe('Filter to specific sections by title, case-insensitive (e.g. ["Introduction", "Methods", "Results", "Discussion"]). Applies to `source=pmc` results only.'), }) .refine((v) => [v.pmcids, v.pmids, v.dois].filter((b) => b !== undefined).length === 1, { message: 'Provide exactly one of `pmcids`, `pmids`, or `dois` (not zero, not more).', }), output: z.object({ articles: z.array(ArticleSchema).describe('Full-text articles'), totalReturned: z.number().describe('Number of articles returned'), unavailable: z .array(UnavailableSchema) .optional() .describe('Per-identifier explanations for any requested PMIDs, PMCIDs, or DOIs with no returnable full text. `idType` discriminates which branch the id came from.'), }), async handler(input, ctx) { ctx.log.info('Executing pubmed_fetch_fulltext', { hasPmcids: !!input.pmcids, hasPmids: !!input.pmids, hasDois: !!input.dois, idCount: (input.pmcids ?? input.pmids ?? input.dois)?.length, }); // ── Chain tracking ────────────────────────────────────────────────────── // Per-input-id tier history (the `triedTiers` array on unavailable entries). // Keys: pmid for `pmids` input, prefixed PMCID for `pmcids` input, doi for // `dois` input. `recoveredIds` collects ids the chain produced an article // for, so we can skip them when building `unavailable[]`. const chainByInput = new Map(); const recoveredIds = new Set(); // PMCIDs the converter resolved from a pmid → PMID, for back-mapping after // the PMC and EPMC stages. const pmcidToPmid = new Map(); // DOI hints captured during pmids→pmcid routing so PMC-misses on the pmids // branch can still reach Unpaywall without re-fetching from PubMed metadata. const pmidContext = new Map(); const idType = input.pmids ? 'pmid' : input.pmcids ? 'pmcid' : 'doi'; // ── Branch routing → produce buckets the staged chain consumes ────────── let pmcIds = []; let pmidFallbackCandidates = []; let pmcidFallbackCandidates = []; let doiCandidates = []; if (input.pmids) { for (const id of input.pmids) chainByInput.set(id, []); const records = await getNcbiService().idConvert(input.pmids, 'pmid', ctx.signal ? { signal: ctx.signal } : undefined); const seen = new Set(); for (const r of records) { if (r.pmid === undefined) continue; const pmid = String(r.pmid); seen.add(pmid); if (r.pmcid) { const normalized = normalizePmcId(String(r.pmcid)); pmcIds.push(normalized); pmcidToPmid.set(withPmcPrefix(normalized), pmid); pmidContext.set(pmid, { pmid, ...(r.doi && { doi: r.doi }) }); } else { chainByInput.get(pmid)?.push({ tier: 'pmc', outcome: 'not-attempted', detail: 'PMID has no PMC counterpart', }); pmidFallbackCandidates.push({ pmid, ...(r.doi && { doi: r.doi }) }); } } for (const requested of input.pmids) { if (!seen.has(requested)) { chainByInput.get(requested)?.push({ tier: 'pmc', outcome: 'not-attempted', detail: 'ID Converter returned no record for this PMID', }); pmidFallbackCandidates.push({ pmid: requested }); } } } else if (input.pmcids) { for (const id of input.pmcids) chainByInput.set(withPmcPrefix(normalizePmcId(id)), []); pmcIds = input.pmcids.map(normalizePmcId); } else if (input.dois) { for (const doi of input.dois) { chainByInput.set(doi, [ { tier: 'pmc', outcome: 'not-attempted', detail: 'DOI input bypasses PMC EFetch' }, ]); } doiCandidates = input.dois.map((doi) => ({ doi })); } // Route PMC-missed prefixed PMCIDs into the fallback buckets so EPMC and // (for pmids) Unpaywall still get a chance. For pmids input we look up the // captured DOI hint via `pmidContext` to avoid an extra PubMed eFetch when // available; the converter often returns the DOI alongside a PMCID match. const routePmcMissesToFallback = (missingPrefixed) => { if (missingPrefixed.length === 0) return; if (input.pmcids) { pmcidFallbackCandidates = missingPrefixed.map((pmcid) => ({ pmcid })); } else if (input.pmids) { for (const prefixed of missingPrefixed) { const pmid = pmcidToPmid.get(prefixed); if (pmid) pmidFallbackCandidates.push(pmidContext.get(pmid) ?? { pmid }); } } }; // ── Stage 1: PMC EFetch ───────────────────────────────────────────────── // Wrapped so transient NCBI failures fall through to EPMC/Unpaywall rather // than sinking the whole batch — the chain's contract is graceful fallback. let pmcArticles = []; if (pmcIds.length > 0) { try { const xmlData = await getNcbiService().eFetch({ db: 'pmc', id: pmcIds.join(','), retmode: 'xml' }, { retmode: 'xml', useOrderedParser: true, usePost: pmcIds.length > 5, signal: ctx.signal, }); const articleSet = findOne(xmlData, 'pmc-articleset'); if (!articleSet) { throw new Error('PMC EFetch response missing pmc-articleset wrapper'); } const parsed = findAll(articleSet, 'article') .map(parsePmcArticle) .map((a) => applyPmcFilters(a, input)); pmcArticles = parsed.map((a) => ({ source: 'pmc', viaSource: 'pmc', ...a, })); const returnedPmcIds = new Set(pmcArticles.map((a) => a.pmcId).filter((id) => !!id)); for (const prefixed of returnedPmcIds) { recoveredIds.add(pmcidToPmid.get(prefixed) ?? prefixed); } const missing = pmcIds .map((id) => withPmcPrefix(id)) .filter((id) => !returnedPmcIds.has(id)); for (const prefixed of missing) { const inputId = pmcidToPmid.get(prefixed) ?? prefixed; chainByInput.get(inputId)?.push({ tier: 'pmc', outcome: 'miss' }); } routePmcMissesToFallback(missing); } catch (error) { const detail = error instanceof Error ? error.message : String(error); ctx.log.warning('PMC EFetch failed; chain continues with next layer', { pmcIdCount: pmcIds.length, error: detail, }); const allPrefixed = pmcIds.map(withPmcPrefix); for (const prefixed of allPrefixed) { const inputId = pmcidToPmid.get(prefixed) ?? prefixed; chainByInput.get(inputId)?.push({ tier: 'pmc', outcome: 'service-error', detail }); } routePmcMissesToFallback(allPrefixed); } } // ── Stage 2: Europe PMC fullTextXML ───────────────────────────────────── const epmc = getEuropePmcService(); const epmcOutcomes = epmc ? await runEpmcStage(epmc, { pmidFallbackCandidates, pmcidFallbackCandidates, doiCandidates, input, ctx, }) : { articles: [], remainingPmid: pmidFallbackCandidates, remainingPmcid: pmcidFallbackCandidates, remainingDoi: doiCandidates, pmidOutcomes: new Map(), pmcidOutcomes: new Map(), doiOutcomes: new Map(), }; pmcArticles = pmcArticles.concat(epmcOutcomes.articles); // Fold EPMC outcomes into each id's chain. EPMC-served articles count as // recovered, so their ids are added to `recoveredIds` here. if (!epmc) { const epmcDisabledEntry = { tier: 'europepmc', outcome: 'not-attempted', detail: 'EUROPEPMC_ENABLED=false', }; for (const c of pmidFallbackCandidates) chainByInput.get(c.pmid)?.push(epmcDisabledEntry); for (const c of pmcidFallbackCandidates) { const prefixed = withPmcPrefix(c.pmcid); chainByInput.get(pmcidToPmid.get(prefixed) ?? prefixed)?.push(epmcDisabledEntry); } for (const c of doiCandidates) chainByInput.get(c.doi)?.push(epmcDisabledEntry); } else { for (const [pmid, outcome] of epmcOutcomes.pmidOutcomes) { if (outcome.kind === 'hit') { recoveredIds.add(pmid); continue; } chainByInput.get(pmid)?.push(epmcTierFromOutcome(outcome)); } for (const [prefixed, outcome] of epmcOutcomes.pmcidOutcomes) { const inputId = pmcidToPmid.get(prefixed) ?? prefixed; if (outcome.kind === 'hit') { recoveredIds.add(inputId); continue; } chainByInput.get(inputId)?.push(epmcTierFromOutcome(outcome)); } for (const [doi, outcome] of epmcOutcomes.doiOutcomes) { if (outcome.kind === 'hit') { recoveredIds.add(doi); continue; } chainByInput.get(doi)?.push(epmcTierFromOutcome(outcome)); } } pmidFallbackCandidates = epmcOutcomes.remainingPmid; pmcidFallbackCandidates = epmcOutcomes.remainingPmcid; doiCandidates = epmcOutcomes.remainingDoi; // ── Stage 3: Unpaywall fallback ───────────────────────────────────────── const unpaywall = getUnpaywallService(); const fallbackArticles = []; // PMC misses on `pmcids` input don't get an Unpaywall attempt — the current // implementation doesn't resolve PMCID → DOI for that branch. for (const c of pmcidFallbackCandidates) { const prefixed = withPmcPrefix(c.pmcid); chainByInput.get(pmcidToPmid.get(prefixed) ?? prefixed)?.push({ tier: 'unpaywall', outcome: 'not-attempted', detail: 'pmcids input does not resolve a DOI for Unpaywall', }); } if (pmidFallbackCandidates.length > 0) { // The PMC ID Converter only returns DOIs for articles it has in PMC, so // candidates here are missing DOIs by default. Pull them from PubMed // metadata (db=pubmed) before dispatching to Unpaywall. const needDoi = pmidFallbackCandidates.filter((c) => !c.doi).map((c) => c.pmid); if (needDoi.length > 0) { try { const doiMap = await fetchPubmedDois(needDoi, ctx.signal); pmidFallbackCandidates = pmidFallbackCandidates.map((c) => { if (c.doi) return c; const doi = doiMap.get(c.pmid); return doi ? { ...c, doi } : c; }); } catch (error) { ctx.log.warning('Failed to batch-fetch DOIs from PubMed for Unpaywall fallback', { error: error instanceof Error ? error.message : String(error), pmidCount: needDoi.length, }); } } if (!unpaywall) { for (const c of pmidFallbackCandidates) { chainByInput.get(c.pmid)?.push({ tier: 'unpaywall', outcome: 'not-attempted', detail: 'UNPAYWALL_EMAIL is not set', }); } } else { const outcomes = await Promise.all(pmidFallbackCandidates.map(async (candidate) => ({ candidate, result: candidate.doi ? await resolveUnpaywall({ pmid: candidate.pmid, doi: candidate.doi }, unpaywall, ctx) : { unavailable: { reason: 'no-doi' } }, }))); for (const { candidate, result } of outcomes) { if ('article' in result) { fallbackArticles.push(result.article); recoveredIds.add(candidate.pmid); } else { const u = result.unavailable; chainByInput.get(candidate.pmid)?.push({ tier: 'unpaywall', outcome: unpaywallReasonToTierOutcome(u.reason), ...(u.detail && { detail: u.detail }), }); } } } } if (doiCandidates.length > 0) { if (!unpaywall) { for (const c of doiCandidates) { chainByInput.get(c.doi)?.push({ tier: 'unpaywall', outcome: 'not-attempted', detail: 'UNPAYWALL_EMAIL is not set', }); } } else { // `resolveUnpaywall` catches its own failures so this Promise.all // doesn't reject under normal operation. const outcomes = await Promise.all(doiCandidates.map(async (c) => ({ doi: c.doi, result: await resolveUnpaywall({ doi: c.doi }, unpaywall, ctx), }))); for (const { doi, result } of outcomes) { if ('article' in result) { fallbackArticles.push(result.article); recoveredIds.add(doi); } else { const u = result.unavailable; chainByInput.get(doi)?.push({ tier: 'unpaywall', outcome: unpaywallReasonToTierOutcome(u.reason), ...(u.detail && { detail: u.detail }), }); } } } } // ── Assemble unavailable[] from chains ────────────────────────────────── const unavailable = []; for (const [id, chain] of chainByInput) { if (recoveredIds.has(id)) continue; unavailable.push({ id, idType, reason: reasonFromChain(chain), triedTiers: chain, }); } const articles = [...pmcArticles, ...fallbackArticles]; ctx.log.info('pubmed_fetch_fulltext completed', { requested: (input.pmids ?? input.pmcids ?? input.dois)?.length ?? 0, returned: articles.length, pmcHits: pmcArticles.filter((a) => a.viaSource === 'pmc').length, epmcHits: pmcArticles.filter((a) => a.viaSource === 'europepmc').length, unpaywallHits: fallbackArticles.length, unavailable: unavailable.length, }); return { articles, totalReturned: articles.length, ...(unavailable.length > 0 && { unavailable }), }; }, format: (result) => { const lines = [`## Full-Text Articles`, `**Articles Returned:** ${result.totalReturned}`]; if (result.unavailable?.length) { lines.push(`\n**Unavailable (${result.unavailable.length}):**`); for (const u of result.unavailable) { lines.push(`- [${u.idType}] ${u.id} — ${u.reason}`); const chain = u.triedTiers .map((t) => `${t.tier}:${t.outcome}${t.detail ? ` (${t.detail})` : ''}`) .join(' → '); if (chain) lines.push(` chain: ${chain}`); } } if (result.totalReturned === 0) { lines.push(`\n> No full-text articles returned. Articles must be open-access and indexed in PMC, Europe PMC, or recoverable via Unpaywall to retrieve full text. For metadata and abstracts only, use \`pubmed_fetch_articles\`.`); } for (const a of result.articles) { lines.push(''); if (a.source === 'pmc') formatPmcArticle(a, lines); else formatUnpaywallArticle(a, lines); } return [{ type: 'text', text: lines.join('\n') }]; }, }); /** * Run the Europe PMC step against everything that fell through PMC EFetch * plus any direct DOI input. Each candidate goes through search-by-best-id → * fullTextXML. Hits become `source: 'pmc'` articles with `viaSource: 'europepmc'`; * misses flow through to the Unpaywall stage unchanged. * * Candidates run in parallel — the EPMC request queue caps concurrency so this * stays polite without serializing. Errors are caught and logged inside the * helpers; a transient EPMC failure must not block the downstream Unpaywall * fallback. */ async function runEpmcStage(epmc, args) { const runOne = async (c, query, contextPmid) => { const search = await searchEpmcSafe(epmc, query, args.ctx); if (search.kind === 'error') { return { c, outcome: { kind: 'service-error', detail: search.detail } }; } if (search.kind === 'miss') return { c, outcome: { kind: 'miss' } }; const fetched = await fetchEpmcArticle(epmc, search.hit, args, contextPmid); if (fetched.kind === 'error') { return { c, outcome: { kind: 'service-error', detail: fetched.detail } }; } if (fetched.kind === 'no-fulltext') { return { c, outcome: { kind: 'no-fulltext', ...(fetched.detail && { detail: fetched.detail }) }, }; } return { c, outcome: { kind: 'hit' }, article: fetched.article }; }; const fetchForPmid = (c) => runOne(c, `EXT_ID:"${c.pmid}" AND SRC:MED`, c.pmid); const fetchForPmcid = (c) => { const normalized = withPmcPrefix(c.pmcid); return runOne({ c, normalized }, `PMCID:"${normalized}" AND SRC:PMC`, undefined); }; const fetchForDoi = (c) => runOne(c, `DOI:"${c.doi}"`, undefined); const [pmidResults, pmcidResults, doiResults] = await Promise.all([ Promise.all(args.pmidFallbackCandidates.map(fetchForPmid)), Promise.all(args.pmcidFallbackCandidates.map(fetchForPmcid)), Promise.all(args.doiCandidates.map(fetchForDoi)), ]); const articles = []; const remainingPmid = []; const remainingPmcid = []; const remainingDoi = []; const pmidOutcomes = new Map(); const pmcidOutcomes = new Map(); const doiOutcomes = new Map(); for (const { c, outcome, article } of pmidResults) { pmidOutcomes.set(c.pmid, outcome); if (article) articles.push(article); else remainingPmid.push(c); } for (const { c: pair, outcome, article } of pmcidResults) { pmcidOutcomes.set(pair.normalized, outcome); if (article) articles.push(article); else remainingPmcid.push(pair.c); } for (const { c, outcome, article } of doiResults) { doiOutcomes.set(c.doi, outcome); if (article) articles.push(article); else remainingDoi.push(c); } return { articles, remainingPmid, remainingPmcid, remainingDoi, pmidOutcomes, pmcidOutcomes, doiOutcomes, }; } /** * Single-hit Europe PMC search with discriminated outcomes so the chain can * record `miss` vs `service-error` separately. Errors are logged and swallowed * so transient EPMC failures fall through to the next stage instead of * aborting the chain. */ async function searchEpmcSafe(epmc, query, ctx) { try { const result = await epmc.search({ query, resultType: 'core', pageSize: 1, ...(ctx.signal && { signal: ctx.signal }), }); return result.hits[0] ? { kind: 'hit', hit: result.hits[0] } : { kind: 'miss' }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); ctx.log.warning('Europe PMC search failed; chain continues with next layer', { query, error: detail, }); return { kind: 'error', detail }; } } /** * Fetch and parse the JATS for an EPMC hit. Returns a discriminated outcome so * the chain can record `no-fulltext` (record exists but EPMC publishes no JATS) * separately from `service-error` (transient failure). Preprints/patents and * MED-only records without a PMC counterpart short-circuit to `no-fulltext` * since EPMC's fullTextXML endpoint is PMC-keyed. */ async function fetchEpmcArticle(epmc, hit, args, contextPmid) { // EPMC's fullTextXML endpoint is PMC-keyed (URL: `/{PMC<digits>}/fullTextXML`). // For PMC-source hits, `hit.id` already is the PMC ID; for MED hits, `hit.pmcid` // carries the counterpart when one exists. Preprints (PPR) and patents (PAT) // have no PMC ID, so fullTextXML is never available. const pmcLookupId = hit.pmcid ?? (hit.source === 'PMC' ? hit.id : undefined); if (!pmcLookupId) { return { kind: 'no-fulltext', detail: `EPMC source ${hit.source} has no PMC counterpart` }; } try { const result = await epmc.fullTextXml(pmcLookupId, hit.source, args.ctx.signal ?? undefined); if (result.kind === 'not-available') { return { kind: 'no-fulltext', detail: 'EPMC fullTextXML not available for this record' }; } const articleNode = epmc.parseFullTextXml(result.xml); if (!articleNode) { return { kind: 'no-fulltext', detail: 'EPMC fullTextXML payload had no <article> element' }; } const parsed = applyPmcFilters(parsePmcArticle(articleNode), args.input); // `parsePmcArticle` always returns string fields (sometimes empty). Strip // empty `pmcId`/`pmcUrl` for EPMC-only records (preprints) so the schema's // optional shape is respected — agents read `epmcId`/`epmcSource` for those. const { pmcId, pmcUrl, ...rest } = parsed; const pmid = rest.pmid ?? hit.pmid ?? contextPmid; const doi = rest.doi ?? hit.doi; return { kind: 'article', article: { source: 'pmc', viaSource: 'europepmc', ...rest, ...(pmcId && { pmcId, pmcUrl }), ...(pmid && { pmid, pubmedUrl: rest.pubmedUrl ?? `https://pubmed.ncbi.nlm.nih.gov/${pmid}/`, }), ...(doi && { doi }), epmcId: hit.id, epmcSource: hit.source, }, }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); args.ctx.log.warning('Europe PMC fullTextXML failed; chain continues with next layer', { epmcId: hit.id, source: hit.source, error: detail, }); return { kind: 'error', detail }; } } /** * Batch-fetch DOIs from PubMed metadata for PMIDs that lack one after the PMC * ID Converter roundtrip. The Converter only returns DOIs for articles already * in PMC, so non-PMC PMIDs arrive here with `doi: undefined` — yet the DOI is * present in PubMed's own record (ELocationID / ArticleIdList) and is required * to query Unpaywall. One eFetch call covers the whole batch. */ async function fetchPubmedDois(pmids, signal) { const out = new Map(); if (pmids.length === 0) return out; const xmlData = await getNcbiService().eFetch({ db: 'pubmed', id: pmids.join(','), retmode: 'xml' }, { retmode: 'xml', usePost: pmids.length >= 100, ...(signal && { signal }) }); const articles = xmlData?.PubmedArticleSet?.PubmedArticle ? ensureArray(xmlData.PubmedArticleSet.PubmedArticle) : []; for (const article of articles) { if (!article?.MedlineCitation) continue; const pmid = extractPmid(article.MedlineCitation); if (!pmid) continue; const doi = extractDoi(article.MedlineCitation.Article, article.PubmedData?.ArticleIdList); if (doi) out.set(pmid, doi); } return out; } /** * Resolve a DOI to an open-access article via Unpaywall. `pmid`, when set, * is stamped onto the resulting article so the pmid-input branch carries its * cross-reference through. */ async function resolveUnpaywall(args, service, ctx) { const { pmid, doi } = args; let resolution; try { resolution = await service.resolve(doi, ctx.signal); } catch (error) { const detail = error instanceof Error ? error.message : String(error); ctx.log.warning('Unpaywall DOI resolve failed', { doi, error: detail }); return { unavailable: { reason: 'service-error', detail } }; } if (resolution.kind === 'no-oa') { return { unavailable: { reason: 'no-oa', detail: resolution.reason } }; } let content; try { content = await service.fetchContent(resolution.location, ctx.signal); } catch (error) { const detail = error instanceof Error ? error.message : String(error); ctx.log.warning('Unpaywall content fetch failed', { doi, error: detail }); return { unavailable: { reason: 'fetch-failed', detail } }; } try { if (content.kind === 'html') { const extracted = await htmlExtractor.extract(content.body, { url: content.fetchedUrl, format: 'markdown', }); const body = extracted.content.trim(); if (!body) { return { unavailable: { reason: 'parse-failed', detail: 'HTML extraction produced empty content', }, }; } return { article: buildUnpaywallArticle({ ...(pmid && { pmid }), doi, sourceUrl: content.fetchedUrl, location: resolution.location, contentFormat: 'html-markdown', content: body, title: extracted.title, wordCount: extracted.wordCount, }), }; } const extracted = await pdfParser.extractText(content.body, { mergePages: true }); const text = typeof extracted.text === 'string' ? extracted.text.trim() : ''; if (!text) { return { unavailable: { reason: 'parse-failed', detail: 'PDF extraction produced empty text' }, }; } return { article: buildUnpaywallArticle({ ...(pmid && { pmid }), doi, sourceUrl: content.fetchedUrl, location: resolution.location, contentFormat: 'pdf-text', content: text, totalPages: extracted.totalPages, }), }; } catch (error) { const detail = error instanceof Error ? error.message : String(error); ctx.log.warning('Unpaywall content extraction failed', { pmid, doi, detail }); return { unavailable: { reason: 'parse-failed', detail } }; } } function buildUnpaywallArticle(args) { const { location } = args; return { source: 'unpaywall', viaSource: 'unpaywall', contentFormat: args.contentFormat, ...(args.pmid && { pmid: args.pmid, pubmedUrl: `https://pubmed.ncbi.nlm.nih.gov/${args.pmid}/`, }), doi: args.doi, sourceUrl: args.sourceUrl, content: args.content, ...(args.title && { title: args.title }), ...(args.wordCount !== undefined && { wordCount: args.wordCount }), ...(args.totalPages !== undefined && { totalPages: args.totalPages }), ...(location.license && { license: location.license }), ...(location.host_type && { hostType: location.host_type }), ...(location.version && { version: location.version }), }; } /** * Convert an EPMC stage outcome into the `triedTiers` entry stored on * `chainByInput`. `hit` is filtered before calling — the chain only records * failure outcomes since recovered ids never appear in `unavailable[]`. */ function epmcTierFromOutcome(outcome) { switch (outcome.kind) { case 'miss': return { tier: 'europepmc', outcome: 'miss' }; case 'no-fulltext': return { tier: 'europepmc', outcome: 'no-fulltext', ...(outcome.detail && { detail: outcome.detail }), }; case 'service-error': return { tier: 'europepmc', outcome: 'service-error', detail: outcome.detail }; } } /** * Map an Unpaywall-resolver `UnavailableReason` to its `TierOutcome` * counterpart. The two enums overlap on the values the Unpaywall path can * actually emit (`no-doi`, `no-oa`, `fetch-failed`, `parse-failed`, * `service-error`). Defensive branches cover values the resolver returns under * dead-code safety checks but never in normal flow. */ function unpaywallReasonToTierOutcome(reason) { switch (reason) { case 'no-doi': case 'no-oa': case 'fetch-failed': case 'parse-failed': case 'service-error': return reason; case 'no-pmc-fallback-disabled': return 'not-attempted'; case 'no-epmc-fulltext': return 'no-fulltext'; case 'not-found': return 'miss'; } } /** * Derive the terminal `reason` shown on the unavailable entry from its chain. * Skips `not-attempted` entries when summarizing — those record config state, * not content state, so they make a misleading `reason` when an earlier tier * produced a real signal (`pmc:miss`, `unpaywall:no-oa`, etc.). Only when every * tier was skipped does `reason` fall back to `no-pmc-fallback-disabled`. */ function reasonFromChain(chain) { let lastSignal; for (const t of chain) { if (t.outcome !== 'not-attempted') lastSignal = t; } if (!lastSignal) return 'no-pmc-fallback-disabled'; const key = `${lastSignal.tier}:${lastSignal.outcome}`; switch (key) { case 'pmc:miss': case 'europepmc:miss': return 'not-found'; case 'europepmc:no-fulltext': return 'no-epmc-fulltext'; case 'unpaywall:no-doi': return 'no-doi'; case 'unpaywall:no-oa': return 'no-oa'; case 'unpaywall:fetch-failed': return 'fetch-failed'; case 'unpaywall:parse-failed': return 'parse-failed'; case 'pmc:service-error': case 'unpaywall:service-error': case 'europepmc:service-error': return 'service-error'; default: return 'not-found'; } } // ─── format() helpers ──────────────────────────────────────────────────────── function formatPmcArticle(a, lines) { lines.push(`### ${a.title ?? a.pmcId}`); const sourceLabel = a.viaSource === 'europepmc' ? `Europe PMC (structured JATS${a.epmcSource ? `, source: ${a.epmcSource}` : ''})` : 'PMC (structured JATS)'; lines.push(`**Source:** ${sourceLabel}`); if (a.authors?.length) { lines.push(`\n**Authors (${a.authors.length}):**`); for (const au of a.authors) lines.push(`- ${formatPmcAuthor(au)}`); } if (a.affiliations?.length) { lines.push(`\n**Affiliations:**`); for (const [i, aff] of a.affiliations.entries()) lines.push(`${i + 1}. ${aff}`); } if (a.journal) { const parts = []; if (a.journal.title) parts.push(a.journal.title); if (a.journal.volume) parts.push(`**${a.journal.volume}**${a.journal.issue ? `(${a.journal.issue})` : ''}`); if (a.journal.pages) parts.push(a.journal.pages); if (a.journal.issn) parts.push(`ISSN ${a.journal.issn}`); if (parts.length) lines.push(`\n**Journal:** ${parts.join(', ')}`); } if (a.articleType) lines.push(`**Type:** ${a.articleType}`); if (a.publicationDate) { const d = a.publicationDate; const dateParts = [d.year, d.month, d.day].filter(Boolean); if (dateParts.length) lines.push(`**Published:** ${dateParts.join('-')}`); } if (a.pmcId) lines.push(`**PMCID:** ${a.pmcId}`); if (a.epmcId) lines.push(`**EPMC ID:** ${a.epmcId}${a.epmcSource ? ` (${a.epmcSource})` : ''}`); if (a.pmid) lines.push(`**PMID:** ${a.pmid}`); if (a.doi) lines.push(`**DOI:** ${a.doi}`); if (a.pmcUrl) lines.push(`**PMC:** ${a.pmcUrl}`); if (a.pubmedUrl) lines.push(`**PubMed:** ${a.pubmedUrl}`); if (a.keywords?.length) lines.push(`**Keywords:** ${a.keywords.join(', ')}`); if (a.abstract) lines.push(`\n#### Abstract\n${a.abstract}`); for (const sec of a.sections) { if (sec.title) lines.push(`\n#### ${formatHeading(sec.label, sec.title)}`); if (sec.text) lines.push(sec.text); if (sec.subsections?.length) { for (const sub of sec.subsections) { if (sub.title) lines.push(`\n##### ${formatHeading(sub.label, sub.title)}`); if (sub.text) lines.push(sub.text); } } } if (a.references?.length) { lines.push(`\n#### References (${a.references.length})`); for (const ref of a.references) { const tag = [ref.label, ref.id].filter(Boolean).join(' '); lines.push(`- ${tag ? `[${tag}] ` : ''}${ref.citation}`); } } } function formatUnpaywallArticle(a, lines) { const heading = a.title ?? (a.pmid ? `PMID ${a.pmid}` : `DOI ${a.doi}`); const formatLabel = a.contentFormat === 'html-markdown' ? 'Unpaywall (HTML → Markdown, best-effort)' : 'Unpaywall (PDF → plain text)'; lines.push(`### ${heading}`); lines.push(`**Source:** ${formatLabel}`); if (a.pmid) lines.push(`**PMID:** ${a.pmid}`); lines.push(`**DOI:** ${a.doi}`); if (a.pubmedUrl) lines.push(`**PubMed:** ${a.pubmedUrl}`); lines.push(`**OA Copy:** ${