@cyanheads/pubmed-mcp-server
Version:
Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.
1,103 lines • 52 kB
JavaScript
/**
* @fileoverview Full-text fetch tool. Resolves full-text articles through a
* three-stage chain: NCBI PMC EFetch → Europe PMC `fullTextXML` → Unpaywall.
* Accepts three mutually-exclusive input shapes:
*
* - `pmcids` — fetch directly by PMC ID. Articles not in PMC fall through to
* EPMC by PMC ID, then to Unpaywall when the DOI is available.
* - `pmids` — resolve PMID → PMCID via PMC ID Converter, then run the chain.
* - `dois` — skip PMC EFetch (no PMCID); resolve via EPMC search-by-DOI →
* fullTextXML, then Unpaywall. Covers EPMC-only OA and preprints with no
* PubMed presence.
*
* Output uses a discriminated union on `source` (`pmc` | `unpaywall`) with an
* extra `viaSource` discriminator that records which layer produced the
* content. EPMC's JATS reuses the `pmc` schema shape because it's the same
* DTD; `viaSource: 'europepmc'` distinguishes it from PMC EFetch output.
*
* @module src/mcp-server/tools/definitions/fetch-fulltext.tool
*/
import { tool, z } from '@cyanheads/mcp-ts-core';
import { htmlExtractor, pdfParser } from '@cyanheads/mcp-ts-core/utils';
import { EUROPEPMC_SERVICE_ERRORS, NCBI_SERVICE_ERRORS, UNPAYWALL_SERVICE_ERRORS, } from '../../../services/error-contracts.js';
import { getEuropePmcService, } from '../../../services/europe-pmc/europe-pmc-service.js';
import { getNcbiService } from '../../../services/ncbi/ncbi-service.js';
import { extractDoi, extractPmid } from '../../../services/ncbi/parsing/article-parser.js';
import { parsePmcArticle } from '../../../services/ncbi/parsing/pmc-article-parser.js';
import { findAll, findOne } from '../../../services/ncbi/parsing/pmc-xml-helpers.js';
import { ensureArray } from '../../../services/ncbi/parsing/xml-helpers.js';
import { getUnpaywallService, } from '../../../services/unpaywall/unpaywall-service.js';
import { conceptMeta, EDAM_DATA_RETRIEVAL, SCHEMA_SCHOLARLY_ARTICLE } from './_concepts.js';
import { pmidStringSchema } from './_schemas.js';
/**
 * Strip a leading `PMC` prefix (any letter case) from a PMC identifier.
 *
 * @param {string} id - PMC identifier, with or without the prefix.
 * @returns {string} The identifier without its leading `PMC`; unchanged when
 *   no prefix is present.
 */
function normalizePmcId(id) {
  const prefix = id.match(/^PMC/i);
  if (prefix === null) {
    return id;
  }
  return id.slice(prefix[0].length);
}
/**
 * Return the canonical, upper-case-prefixed form of a PMC identifier.
 *
 * The prefix check is case-insensitive so it agrees with `normalizePmcId`:
 * `pmc123`, `PMC123`, and `123` all yield `PMC123`. A case-sensitive check
 * would turn `pmc123` into `PMCpmc123`.
 *
 * @param {string} id - PMC identifier, with or without a `PMC` prefix.
 * @returns {string} The identifier carrying exactly one leading `PMC`.
 */
function withPmcPrefix(id) {
  // Drop one leading prefix of any case, then re-add the canonical form.
  return `PMC${id.replace(/^PMC/i, '')}`;
}
/**
 * Keep only the sections whose title contains at least one of the requested
 * names, compared case-insensitively. Sections without a title never match.
 *
 * @param {Array<{title?: string}>} sections - Sections to filter.
 * @param {string[]} sectionFilter - Title fragments to look for.
 * @returns {Array} A new array holding the matching sections in original order.
 */
function filterSections(sections, sectionFilter) {
  const needles = sectionFilter.map((name) => name.toLowerCase());
  return sections.filter((section) => {
    const { title } = section;
    if (!title) {
      return false;
    }
    for (const needle of needles) {
      if (title.toLowerCase().includes(needle)) {
        return true;
      }
    }
    return false;
  });
}
/**
 * Apply the caller's section and reference filters to a parsed PMC article.
 *
 * Steps run in order: title filter, then `maxSections` truncation, then
 * reference removal. The input article is never mutated; each step that
 * applies produces a shallow copy.
 *
 * @param {object} article - Parsed article with a `sections` array.
 * @param {object} filters - Reads `sections`, `maxSections`, and `includeReferences`.
 * @returns {object} The filtered article; the same object is returned when no
 *   step applies and references are kept.
 */
function applyPmcFilters(article, filters) {
  const { sections: titleFilter, maxSections, includeReferences } = filters;
  let result = article;
  if (titleFilter?.length) {
    result = { ...result, sections: filterSections(result.sections, titleFilter) };
  }
  if (maxSections !== undefined) {
    result = { ...result, sections: result.sections.slice(0, maxSections) };
  }
  if (includeReferences) {
    return result;
  }
  // References are opt-in: drop the key entirely rather than emitting an empty list.
  const { references: _dropped, ...withoutReferences } = result;
  return withoutReferences;
}
// ─── Schemas ─────────────────────────────────────────────────────────────────
/**
 * Zod schema for one nested subsection of an article section: optional
 * `title` and `label`, plus a required `text` body.
 */
const SubsectionSchema = z
.object({
title: z.string().optional().describe('Subsection heading'),
label: z.string().optional().describe('Subsection label'),
text: z.string().describe('Subsection body text'),
})
.describe('Article subsection');
/**
 * Zod schema for a top-level body section. Same fields as a subsection, plus
 * an optional `subsections` array of `SubsectionSchema` entries (one level of
 * nesting only).
 */
const SectionSchema = z
.object({
title: z.string().optional().describe('Section heading'),
label: z.string().optional().describe('Section label'),
text: z.string().describe('Section body text'),
subsections: z.array(SubsectionSchema).optional().describe('Nested subsections'),
})
.describe('Article body section');
/**
 * Zod schema for an author entry. Every field is optional, so an entry may
 * carry a personal name (`givenNames`/`lastName`), a `collectiveName`, or both.
 */
const AuthorSchema = z
.object({
collectiveName: z.string().optional().describe('Group name'),
givenNames: z.string().optional().describe('Given names'),
lastName: z.string().optional().describe('Last name'),
})
.describe('Author entry');
/**
 * Zod schema for journal metadata. All fields are optional strings, including
 * `volume`, `issue`, and `pages`, which are not coerced to numbers.
 */
const JournalSchema = z
.object({
title: z.string().optional().describe('Journal title'),
issn: z.string().optional().describe('ISSN'),
volume: z.string().optional().describe('Volume number'),
issue: z.string().optional().describe('Issue number'),
pages: z.string().optional().describe('Page range'),
})
.describe('Journal information');
/**
 * Zod schema for one reference-list entry: required `citation` text with
 * optional `id` and `label`.
 */
const ReferenceSchema = z
.object({
citation: z.string().describe('Citation text'),
id: z.string().optional().describe('Reference ID'),
label: z.string().optional().describe('Reference label'),
})
.describe('Reference entry');
/**
 * Zod schema for a publication date. `year`, `month`, and `day` are
 * independent optional strings, so partial dates validate.
 */
const PublicationDateSchema = z
.object({
year: z.string().optional().describe('Publication year'),
month: z.string().optional().describe('Publication month'),
day: z.string().optional().describe('Publication day'),
})
.describe('Publication date');
/**
 * Zod schema for the structured-JATS branch of the article union
 * (`source: 'pmc'`). `viaSource` distinguishes NCBI PMC from Europe PMC
 * origin. `sections` is the only required content field; identifiers and
 * metadata are optional. `epmcId`/`epmcSource` are optional additions
 * described as applying to Europe PMC results.
 */
const PmcArticleSchema = z
.object({
source: z
.literal('pmc')
.describe('Structured JATS — same DTD whether sourced from NCBI PMC or Europe PMC'),
viaSource: z
.enum(['pmc', 'europepmc'])
.describe('Which layer produced the JATS: `pmc` for NCBI PMC EFetch (db=pmc), `europepmc` for Europe PMC `fullTextXML`. Both paths return the same JATS shape; the discriminator records origin for observability and license attribution.'),
pmcId: z
.string()
.optional()
.describe('PMC ID — present for NCBI PMC records and Europe PMC entries that have a PMC counterpart. Absent for EPMC-only records like preprints; use `epmcId` in that case.'),
pmcUrl: z.string().optional().describe('PMC URL — derived from `pmcId` when present'),
pmid: z.string().optional().describe('PubMed ID'),
pubmedUrl: z.string().optional().describe('PubMed URL'),
doi: z.string().optional().describe('DOI'),
title: z.string().optional().describe('Article title'),
abstract: z.string().optional().describe('Abstract'),
authors: z.array(AuthorSchema).optional().describe('Authors'),
affiliations: z.array(z.string()).optional().describe('Author affiliations'),
journal: JournalSchema.optional(),
keywords: z.array(z.string()).optional().describe('Keywords'),
articleType: z.string().optional().describe('Article type'),
publicationDate: PublicationDateSchema.optional(),
sections: z.array(SectionSchema).describe('Article body sections'),
references: z.array(ReferenceSchema).optional().describe('Reference list'),
epmcId: z
.string()
.optional()
.describe('Europe PMC record id — present when `viaSource` is `europepmc`'),
epmcSource: z
.string()
.optional()
.describe('Europe PMC source code when `viaSource` is `europepmc`. Common values: `MED` (PubMed-derived), `PMC` (PMC counterpart), `PPR` (preprint), `PAT` (patent), `AGR` (Agricola), plus less common codes (`CTX`, `CBA`, `ETH`, `HIR`). Treat as opaque — EPMC may introduce new codes.'),
})
.describe('Structured JATS full-text article. `viaSource` records whether the JATS came from NCBI PMC or Europe PMC.');
/**
 * Zod schema for the best-effort branch of the article union
 * (`source: 'unpaywall'`). Required fields are `contentFormat`, `doi`,
 * `sourceUrl`, and `content`; `wordCount` and `totalPages` are optional
 * extractor-specific extras. `viaSource` is a constant literal here.
 */
const UnpaywallArticleSchema = z
.object({
source: z
.literal('unpaywall')
.describe('Content fetched from an open-access copy indexed by Unpaywall. Best-effort — structural fidelity depends on `contentFormat`.'),
viaSource: z
.literal('unpaywall')
.describe('Layer that produced this article. Constant `unpaywall` for this branch.'),
contentFormat: z
.enum(['html-markdown', 'pdf-text'])
.describe('How `content` was extracted. html-markdown: Defuddle extracted Markdown from an HTML landing page; light section structure may survive but is not guaranteed. pdf-text: unpdf extracted plain text from a PDF; no section, reference, or heading structure.'),
pmid: z
.string()
.optional()
.describe('PubMed ID when input was `pmids`; absent for `dois` input'),
pubmedUrl: z.string().optional().describe('PubMed URL — present when `pmid` is set'),
doi: z.string().describe('DOI used to locate the open-access copy'),
sourceUrl: z.string().describe('URL the content was fetched from'),
title: z.string().optional().describe('Detected article title when present'),
content: z.string().describe('Full article text — Markdown or plain text per `contentFormat`'),
wordCount: z
.number()
.optional()
.describe('Approximate word count reported by the HTML extractor; absent for PDFs'),
totalPages: z
.number()
.optional()
.describe('Page count reported by the PDF extractor; absent for HTML'),
license: z.string().optional().describe('License identifier from Unpaywall (e.g. cc-by, cc0)'),
hostType: z
.string()
.optional()
.describe('`publisher` or `repository` — where the OA copy is hosted'),
version: z
.string()
.optional()
.describe('OA version: submittedVersion | acceptedVersion | publishedVersion'),
})
.describe('Best-effort full text from an open-access copy');
/**
 * Union of the two article shapes, discriminated on the `source` literal
 * (`pmc` or `unpaywall`). `viaSource` is an ordinary field, not the union
 * discriminator.
 */
const ArticleSchema = z
.discriminatedUnion('source', [PmcArticleSchema, UnpaywallArticleSchema])
.describe('Full-text article; shape depends on `source` (pmc = structured JATS, unpaywall = best-effort)');
/**
 * Enum of the summary reasons reported on an `unavailable` entry. This is
 * the single per-identifier verdict; per-tier detail uses `TierOutcomeSchema`.
 */
const UnavailableReasonSchema = z
.enum([
'not-found',
'no-pmc-fallback-disabled',
'no-epmc-fulltext',
'no-doi',
'no-oa',
'fetch-failed',
'parse-failed',
'service-error',
])
.describe('Why no full text was returned. not-found: upstream returned no record for this ID. no-pmc-fallback-disabled: every tier was skipped (`triedTiers` is all `not-attempted`) — typically because EPMC (`EUROPEPMC_ENABLED`) and Unpaywall (`UNPAYWALL_EMAIL`) are not configured. no-epmc-fulltext: EPMC indexed the record but publishes no fullTextXML. no-doi: no DOI to query Unpaywall. no-oa: Unpaywall has no OA copy. fetch-failed: download failed. parse-failed: extraction empty. service-error: upstream server failure (threw, timed out, or returned malformed data).');
/**
 * Enum of outcomes a single tier can record in `triedTiers`. It overlaps
 * `UnavailableReasonSchema` but is a separate vocabulary: `not-attempted`,
 * `miss`, and `no-fulltext` exist only here.
 */
const TierOutcomeSchema = z
.enum([
'not-attempted',
'miss',
'no-fulltext',
'no-doi',
'no-oa',
'fetch-failed',
'parse-failed',
'service-error',
])
.describe('Per-tier outcome. not-attempted: tier was skipped. miss: tier returned no record. no-fulltext: EPMC indexed the record but publishes no fullTextXML. no-doi: no DOI to query Unpaywall. no-oa: Unpaywall reports no open-access copy. fetch-failed: OA copy download failed. parse-failed: extraction produced empty content. service-error: tier service threw.');
/**
 * Zod schema for one entry in an identifier's tier history: which tier ran,
 * its `TierOutcomeSchema` outcome, and an optional free-text `detail`.
 */
const TriedTierSchema = z
.object({
tier: z.enum(['pmc', 'europepmc', 'unpaywall']).describe('Which tier in the resolution chain'),
outcome: TierOutcomeSchema,
detail: z.string().optional().describe('Tier-specific context when available'),
})
.describe('One tier the resolution chain attempted, with its outcome');
/**
 * Zod schema for one identifier that produced no article: the `id`, the
 * input branch it came from (`idType`), a summary `reason`, and the required
 * `triedTiers` history.
 */
const UnavailableSchema = z
.object({
id: z
.string()
.describe('Identifier the chain could not resolve — PMID, PMCID, or DOI per `idType`'),
idType: z.enum(['pmid', 'pmcid', 'doi']).describe('Which input branch the id came from'),
reason: UnavailableReasonSchema,
triedTiers: z
.array(TriedTierSchema)
.describe('Per-tier outcomes the chain produced for this id, in execution order. Covers `pmc`, `europepmc`, and `unpaywall` — the same tiers the tool description references. Tiers that the chain skipped appear as `outcome: not-attempted` with a `detail` explaining why.'),
})
.describe('One identifier that could not be returned, with the full chain it traversed');
// ─── Tool Definition ─────────────────────────────────────────────────────────
/**
 * `pubmed_fetch_fulltext` tool definition.
 *
 * Input: exactly one of `pmcids`, `pmids`, or `dois` (enforced by the
 * `.refine` on the input schema), plus optional section/reference filters.
 *
 * Handler flow:
 *  1. Route the active id array into buckets (`pmcIds`, fallback candidates).
 *  2. Stage 1 — PMC EFetch for every id that has a PMCID.
 *  3. Stage 2 — Europe PMC for whatever Stage 1 did not return, plus all DOIs.
 *  4. Stage 3 — Unpaywall for remaining PMID and DOI candidates.
 *  5. Build `unavailable[]` from the per-id tier history in `chainByInput`.
 *
 * `reasonFromChain`, `epmcTierFromOutcome`, `unpaywallReasonToTierOutcome`,
 * `formatPmcArticle`, and `formatUnpaywallArticle` are referenced here but
 * defined outside this block.
 */
export const fetchFulltextTool = tool('pubmed_fetch_fulltext', {
description: 'Fetch full-text articles from PubMed Central with structured sections and references. When PMC misses, transparently falls back to Europe PMC `fullTextXML` (structured JATS for records with a PMC counterpart), then to Unpaywall — publisher-hosted or institutional open-access copies as HTML-as-Markdown or PDF-as-text. Provide exactly one of `pmcids` (PMC IDs directly), `pmids` (PubMed IDs, auto-resolved), or `dois` (preprints and EPMC-only OA records that lack PMID/PMCID).',
annotations: { readOnlyHint: true, openWorldHint: true },
_meta: conceptMeta([SCHEMA_SCHOLARLY_ARTICLE, EDAM_DATA_RETRIEVAL]),
sourceUrl: 'https://github.com/cyanheads/pubmed-mcp-server/blob/main/src/mcp-server/tools/definitions/fetch-fulltext.tool.ts',
errors: [
...NCBI_SERVICE_ERRORS,
...UNPAYWALL_SERVICE_ERRORS,
...EUROPEPMC_SERVICE_ERRORS,
],
input: z
.object({
pmcids: z
.array(z.string())
.min(1)
.max(10)
.optional()
.describe('PMC IDs to fetch (e.g. ["PMC9575052"]). Provide exactly one of `pmcids`, `pmids`, or `dois`.'),
pmids: z
.array(pmidStringSchema)
.min(1)
.max(10)
.optional()
.describe('PubMed IDs. Provide exactly one of `pmcids`, `pmids`, or `dois`. Articles in PMC are returned as structured JATS; articles not in PMC fall through to Europe PMC (when EPMC has a `fullTextXML`), then to Unpaywall when `UNPAYWALL_EMAIL` is set and a DOI is available.'),
dois: z
.array(z.string().min(3))
.min(1)
.max(10)
.optional()
.describe('DOIs to resolve (e.g. ["10.21203/rs.3.rs-9010375/v1"]). Provide exactly one of `pmcids`, `pmids`, or `dois`. Covers preprints and EPMC-only OA records that lack PMID/PMCID. Chain: Europe PMC search-by-DOI → fullTextXML → Unpaywall.'),
includeReferences: z
.boolean()
.default(false)
.describe('Include reference list. Applies to `source=pmc` results only.'),
maxSections: z
.number()
.int()
.min(1)
.max(50)
.optional()
.describe('Maximum top-level body sections. Applies to `source=pmc` results only.'),
sections: z
.array(z.string())
.optional()
.describe('Filter to specific sections by title, case-insensitive (e.g. ["Introduction", "Methods", "Results", "Discussion"]). Applies to `source=pmc` results only.'),
})
// Count how many of the three id arrays were supplied; exactly one must be defined.
.refine((v) => [v.pmcids, v.pmids, v.dois].filter((b) => b !== undefined).length === 1, {
message: 'Provide exactly one of `pmcids`, `pmids`, or `dois` (not zero, not more).',
}),
output: z.object({
articles: z.array(ArticleSchema).describe('Full-text articles'),
totalReturned: z.number().describe('Number of articles returned'),
unavailable: z
.array(UnavailableSchema)
.optional()
.describe('Per-identifier explanations for any requested PMIDs, PMCIDs, or DOIs with no returnable full text. `idType` discriminates which branch the id came from.'),
}),
async handler(input, ctx) {
ctx.log.info('Executing pubmed_fetch_fulltext', {
hasPmcids: !!input.pmcids,
hasPmids: !!input.pmids,
hasDois: !!input.dois,
idCount: (input.pmcids ?? input.pmids ?? input.dois)?.length,
});
// ── Chain tracking ──────────────────────────────────────────────────────
// Per-input-id tier history (the `triedTiers` array on unavailable entries).
// Keys: pmid for `pmids` input, prefixed PMCID for `pmcids` input, doi for
// `dois` input. `recoveredIds` collects ids the chain produced an article
// for, so we can skip them when building `unavailable[]`.
const chainByInput = new Map();
const recoveredIds = new Set();
// PMCIDs the converter resolved from a pmid → PMID, for back-mapping after
// the PMC and EPMC stages.
const pmcidToPmid = new Map();
// DOI hints captured during pmids→pmcid routing so PMC-misses on the pmids
// branch can still reach Unpaywall without re-fetching from PubMed metadata.
const pmidContext = new Map();
// Labels the single active input branch; stamped on every `unavailable` entry.
const idType = input.pmids ? 'pmid' : input.pmcids ? 'pmcid' : 'doi';
// ── Branch routing → produce buckets the staged chain consumes ──────────
let pmcIds = [];
let pmidFallbackCandidates = [];
let pmcidFallbackCandidates = [];
let doiCandidates = [];
if (input.pmids) {
for (const id of input.pmids)
chainByInput.set(id, []);
// One batched PMID → PMCID conversion. This call is not wrapped in try/catch,
// so a failure here rejects the whole handler.
const records = await getNcbiService().idConvert(input.pmids, 'pmid', ctx.signal ? { signal: ctx.signal } : undefined);
const seen = new Set();
for (const r of records) {
if (r.pmid === undefined)
continue;
const pmid = String(r.pmid);
seen.add(pmid);
if (r.pmcid) {
const normalized = normalizePmcId(String(r.pmcid));
pmcIds.push(normalized);
pmcidToPmid.set(withPmcPrefix(normalized), pmid);
pmidContext.set(pmid, { pmid, ...(r.doi && { doi: r.doi }) });
}
else {
chainByInput.get(pmid)?.push({
tier: 'pmc',
outcome: 'not-attempted',
detail: 'PMID has no PMC counterpart',
});
pmidFallbackCandidates.push({ pmid, ...(r.doi && { doi: r.doi }) });
}
}
// Requested PMIDs absent from the converter response still enter the
// fallback chain, just without a DOI hint.
for (const requested of input.pmids) {
if (!seen.has(requested)) {
chainByInput.get(requested)?.push({
tier: 'pmc',
outcome: 'not-attempted',
detail: 'ID Converter returned no record for this PMID',
});
pmidFallbackCandidates.push({ pmid: requested });
}
}
}
else if (input.pmcids) {
// Chain keys use the prefixed form; the EFetch id list uses the bare form.
for (const id of input.pmcids)
chainByInput.set(withPmcPrefix(normalizePmcId(id)), []);
pmcIds = input.pmcids.map(normalizePmcId);
}
else if (input.dois) {
for (const doi of input.dois) {
chainByInput.set(doi, [
{ tier: 'pmc', outcome: 'not-attempted', detail: 'DOI input bypasses PMC EFetch' },
]);
}
doiCandidates = input.dois.map((doi) => ({ doi }));
}
// Route PMC-missed prefixed PMCIDs into the fallback buckets so EPMC and
// (for pmids) Unpaywall still get a chance. For pmids input we look up the
// captured DOI hint via `pmidContext` to avoid an extra PubMed eFetch when
// available; the converter often returns the DOI alongside a PMCID match.
const routePmcMissesToFallback = (missingPrefixed) => {
if (missingPrefixed.length === 0)
return;
if (input.pmcids) {
pmcidFallbackCandidates = missingPrefixed.map((pmcid) => ({ pmcid }));
}
else if (input.pmids) {
for (const prefixed of missingPrefixed) {
const pmid = pmcidToPmid.get(prefixed);
if (pmid)
pmidFallbackCandidates.push(pmidContext.get(pmid) ?? { pmid });
}
}
};
// ── Stage 1: PMC EFetch ─────────────────────────────────────────────────
// Wrapped so transient NCBI failures fall through to EPMC/Unpaywall rather
// than sinking the whole batch — the chain's contract is graceful fallback.
let pmcArticles = [];
if (pmcIds.length > 0) {
try {
const xmlData = await getNcbiService().eFetch({ db: 'pmc', id: pmcIds.join(','), retmode: 'xml' }, {
retmode: 'xml',
useOrderedParser: true,
usePost: pmcIds.length > 5,
signal: ctx.signal,
});
const articleSet = findOne(xmlData, 'pmc-articleset');
if (!articleSet) {
throw new Error('PMC EFetch response missing pmc-articleset wrapper');
}
const parsed = findAll(articleSet, 'article')
.map(parsePmcArticle)
.map((a) => applyPmcFilters(a, input));
pmcArticles = parsed.map((a) => ({
source: 'pmc',
viaSource: 'pmc',
...a,
}));
// PMC IDs present in the parsed response; any requested id not in this set
// is recorded as a `miss` and routed to the fallback buckets.
const returnedPmcIds = new Set(pmcArticles.map((a) => a.pmcId).filter((id) => !!id));
for (const prefixed of returnedPmcIds) {
recoveredIds.add(pmcidToPmid.get(prefixed) ?? prefixed);
}
const missing = pmcIds
.map((id) => withPmcPrefix(id))
.filter((id) => !returnedPmcIds.has(id));
for (const prefixed of missing) {
const inputId = pmcidToPmid.get(prefixed) ?? prefixed;
chainByInput.get(inputId)?.push({ tier: 'pmc', outcome: 'miss' });
}
routePmcMissesToFallback(missing);
}
catch (error) {
const detail = error instanceof Error ? error.message : String(error);
ctx.log.warning('PMC EFetch failed; chain continues with next layer', {
pmcIdCount: pmcIds.length,
error: detail,
});
// On failure every requested PMC id is treated as missed and sent to fallback.
const allPrefixed = pmcIds.map(withPmcPrefix);
for (const prefixed of allPrefixed) {
const inputId = pmcidToPmid.get(prefixed) ?? prefixed;
chainByInput.get(inputId)?.push({ tier: 'pmc', outcome: 'service-error', detail });
}
routePmcMissesToFallback(allPrefixed);
}
}
// ── Stage 2: Europe PMC fullTextXML ─────────────────────────────────────
// A falsy service means the EPMC tier is skipped: candidates pass through
// unchanged and the outcome maps stay empty.
const epmc = getEuropePmcService();
const epmcOutcomes = epmc
? await runEpmcStage(epmc, {
pmidFallbackCandidates,
pmcidFallbackCandidates,
doiCandidates,
input,
ctx,
})
: {
articles: [],
remainingPmid: pmidFallbackCandidates,
remainingPmcid: pmcidFallbackCandidates,
remainingDoi: doiCandidates,
pmidOutcomes: new Map(),
pmcidOutcomes: new Map(),
doiOutcomes: new Map(),
};
pmcArticles = pmcArticles.concat(epmcOutcomes.articles);
// Fold EPMC outcomes into each id's chain. EPMC-served articles count as
// recovered, so their ids are added to `recoveredIds` here.
if (!epmc) {
const epmcDisabledEntry = {
tier: 'europepmc',
outcome: 'not-attempted',
detail: 'EUROPEPMC_ENABLED=false',
};
for (const c of pmidFallbackCandidates)
chainByInput.get(c.pmid)?.push(epmcDisabledEntry);
for (const c of pmcidFallbackCandidates) {
const prefixed = withPmcPrefix(c.pmcid);
chainByInput.get(pmcidToPmid.get(prefixed) ?? prefixed)?.push(epmcDisabledEntry);
}
for (const c of doiCandidates)
chainByInput.get(c.doi)?.push(epmcDisabledEntry);
}
else {
for (const [pmid, outcome] of epmcOutcomes.pmidOutcomes) {
if (outcome.kind === 'hit') {
recoveredIds.add(pmid);
continue;
}
chainByInput.get(pmid)?.push(epmcTierFromOutcome(outcome));
}
for (const [prefixed, outcome] of epmcOutcomes.pmcidOutcomes) {
const inputId = pmcidToPmid.get(prefixed) ?? prefixed;
if (outcome.kind === 'hit') {
recoveredIds.add(inputId);
continue;
}
chainByInput.get(inputId)?.push(epmcTierFromOutcome(outcome));
}
for (const [doi, outcome] of epmcOutcomes.doiOutcomes) {
if (outcome.kind === 'hit') {
recoveredIds.add(doi);
continue;
}
chainByInput.get(doi)?.push(epmcTierFromOutcome(outcome));
}
}
// Only candidates EPMC did not serve continue to the Unpaywall stage.
pmidFallbackCandidates = epmcOutcomes.remainingPmid;
pmcidFallbackCandidates = epmcOutcomes.remainingPmcid;
doiCandidates = epmcOutcomes.remainingDoi;
// ── Stage 3: Unpaywall fallback ─────────────────────────────────────────
const unpaywall = getUnpaywallService();
const fallbackArticles = [];
// PMC misses on `pmcids` input don't get an Unpaywall attempt — the current
// implementation doesn't resolve PMCID → DOI for that branch.
for (const c of pmcidFallbackCandidates) {
const prefixed = withPmcPrefix(c.pmcid);
chainByInput.get(pmcidToPmid.get(prefixed) ?? prefixed)?.push({
tier: 'unpaywall',
outcome: 'not-attempted',
detail: 'pmcids input does not resolve a DOI for Unpaywall',
});
}
if (pmidFallbackCandidates.length > 0) {
// The PMC ID Converter only returns DOIs for articles it has in PMC, so
// candidates here are missing DOIs by default. Pull them from PubMed
// metadata (db=pubmed) before dispatching to Unpaywall.
const needDoi = pmidFallbackCandidates.filter((c) => !c.doi).map((c) => c.pmid);
if (needDoi.length > 0) {
try {
const doiMap = await fetchPubmedDois(needDoi, ctx.signal);
pmidFallbackCandidates = pmidFallbackCandidates.map((c) => {
if (c.doi)
return c;
const doi = doiMap.get(c.pmid);
return doi ? { ...c, doi } : c;
});
}
catch (error) {
// Best-effort: on failure the candidates keep no DOI and resolve to `no-doi` below
// when Unpaywall is configured.
ctx.log.warning('Failed to batch-fetch DOIs from PubMed for Unpaywall fallback', {
error: error instanceof Error ? error.message : String(error),
pmidCount: needDoi.length,
});
}
}
if (!unpaywall) {
for (const c of pmidFallbackCandidates) {
chainByInput.get(c.pmid)?.push({
tier: 'unpaywall',
outcome: 'not-attempted',
detail: 'UNPAYWALL_EMAIL is not set',
});
}
}
else {
// Candidates still lacking a DOI short-circuit to `no-doi` without calling Unpaywall.
const outcomes = await Promise.all(pmidFallbackCandidates.map(async (candidate) => ({
candidate,
result: candidate.doi
? await resolveUnpaywall({ pmid: candidate.pmid, doi: candidate.doi }, unpaywall, ctx)
: { unavailable: { reason: 'no-doi' } },
})));
for (const { candidate, result } of outcomes) {
if ('article' in result) {
fallbackArticles.push(result.article);
recoveredIds.add(candidate.pmid);
}
else {
const u = result.unavailable;
chainByInput.get(candidate.pmid)?.push({
tier: 'unpaywall',
outcome: unpaywallReasonToTierOutcome(u.reason),
...(u.detail && { detail: u.detail }),
});
}
}
}
}
if (doiCandidates.length > 0) {
if (!unpaywall) {
for (const c of doiCandidates) {
chainByInput.get(c.doi)?.push({
tier: 'unpaywall',
outcome: 'not-attempted',
detail: 'UNPAYWALL_EMAIL is not set',
});
}
}
else {
// `resolveUnpaywall` catches its own failures so this Promise.all
// doesn't reject under normal operation.
const outcomes = await Promise.all(doiCandidates.map(async (c) => ({
doi: c.doi,
result: await resolveUnpaywall({ doi: c.doi }, unpaywall, ctx),
})));
for (const { doi, result } of outcomes) {
if ('article' in result) {
fallbackArticles.push(result.article);
recoveredIds.add(doi);
}
else {
const u = result.unavailable;
chainByInput.get(doi)?.push({
tier: 'unpaywall',
outcome: unpaywallReasonToTierOutcome(u.reason),
...(u.detail && { detail: u.detail }),
});
}
}
}
}
// ── Assemble unavailable[] from chains ──────────────────────────────────
// Every tracked input id without a recovered article becomes one entry,
// in `chainByInput` insertion order.
const unavailable = [];
for (const [id, chain] of chainByInput) {
if (recoveredIds.has(id))
continue;
unavailable.push({
id,
idType,
reason: reasonFromChain(chain),
triedTiers: chain,
});
}
// JATS articles (PMC, then EPMC) come first, followed by Unpaywall articles.
const articles = [...pmcArticles, ...fallbackArticles];
ctx.log.info('pubmed_fetch_fulltext completed', {
requested: (input.pmids ?? input.pmcids ?? input.dois)?.length ?? 0,
returned: articles.length,
pmcHits: pmcArticles.filter((a) => a.viaSource === 'pmc').length,
epmcHits: pmcArticles.filter((a) => a.viaSource === 'europepmc').length,
unpaywallHits: fallbackArticles.length,
unavailable: unavailable.length,
});
return {
articles,
totalReturned: articles.length,
// Omit the `unavailable` key entirely when every id was recovered.
...(unavailable.length > 0 && { unavailable }),
};
},
// Renders the result as one Markdown text block: summary line, the
// unavailable list with each id's tier chain, then every article.
format: (result) => {
const lines = [`## Full-Text Articles`, `**Articles Returned:** ${result.totalReturned}`];
if (result.unavailable?.length) {
lines.push(`\n**Unavailable (${result.unavailable.length}):**`);
for (const u of result.unavailable) {
lines.push(`- [${u.idType}] ${u.id} — ${u.reason}`);
const chain = u.triedTiers
.map((t) => `${t.tier}:${t.outcome}${t.detail ? ` (${t.detail})` : ''}`)
.join(' → ');
if (chain)
lines.push(` chain: ${chain}`);
}
}
if (result.totalReturned === 0) {
lines.push(`\n> No full-text articles returned. Articles must be open-access and indexed in PMC, Europe PMC, or recoverable via Unpaywall to retrieve full text. For metadata and abstracts only, use \`pubmed_fetch_articles\`.`);
}
for (const a of result.articles) {
lines.push('');
if (a.source === 'pmc')
formatPmcArticle(a, lines);
else
formatUnpaywallArticle(a, lines);
}
return [{ type: 'text', text: lines.join('\n') }];
},
});
/**
 * Europe PMC stage of the full-text chain.
 *
 * Every PMID, PMCID, and DOI candidate is resolved independently: a
 * single-hit EPMC search via `searchEpmcSafe`, then JATS retrieval via
 * `fetchEpmcArticle`. All candidates are started up front and awaited
 * together. Both helpers report failures as values rather than throwing, so
 * one failing candidate does not reject the batch.
 *
 * @param epmc - Europe PMC service handed through to the helpers.
 * @param args - `{ pmidFallbackCandidates, pmcidFallbackCandidates,
 *   doiCandidates, input, ctx }`.
 * @returns `articles` recovered here, the `remaining*` candidates that still
 *   need a later stage, and one outcome map per id type. The maps are keyed by
 *   PMID, prefixed PMCID, and DOI respectively.
 */
async function runEpmcStage(epmc, args) {
  // Resolve one candidate. `tag` is echoed back as `c` so each result can be
  // paired with the candidate that produced it.
  async function resolveCandidate(tag, query, contextPmid) {
    const found = await searchEpmcSafe(epmc, query, args.ctx);
    switch (found.kind) {
      case 'error':
        return { c: tag, outcome: { kind: 'service-error', detail: found.detail } };
      case 'miss':
        return { c: tag, outcome: { kind: 'miss' } };
      default:
        break;
    }
    const jats = await fetchEpmcArticle(epmc, found.hit, args, contextPmid);
    switch (jats.kind) {
      case 'error':
        return { c: tag, outcome: { kind: 'service-error', detail: jats.detail } };
      case 'no-fulltext': {
        const outcome = { kind: 'no-fulltext' };
        if (jats.detail) {
          outcome.detail = jats.detail;
        }
        return { c: tag, outcome };
      }
      default:
        return { c: tag, outcome: { kind: 'hit' }, article: jats.article };
    }
  }

  // Kick off every lookup: PMIDs first, then PMCIDs, then DOIs.
  const pmidTasks = args.pmidFallbackCandidates.map((candidate) =>
    resolveCandidate(candidate, `EXT_ID:"${candidate.pmid}" AND SRC:MED`, candidate.pmid));
  const pmcidTasks = args.pmcidFallbackCandidates.map((candidate) => {
    const normalized = withPmcPrefix(candidate.pmcid);
    return resolveCandidate(
      { c: candidate, normalized },
      `PMCID:"${normalized}" AND SRC:PMC`,
      undefined,
    );
  });
  const doiTasks = args.doiCandidates.map((candidate) =>
    resolveCandidate(candidate, `DOI:"${candidate.doi}"`, undefined));
  const [pmidResults, pmcidResults, doiResults] = await Promise.all([
    Promise.all(pmidTasks),
    Promise.all(pmcidTasks),
    Promise.all(doiTasks),
  ]);

  const articles = [];
  // Record each outcome under its id; hits contribute an article, everything
  // else keeps its original candidate for the next stage.
  const sortResults = (results, keyOf, candidateOf) => {
    const outcomes = new Map();
    const remaining = [];
    for (const { c: tag, outcome, article } of results) {
      outcomes.set(keyOf(tag), outcome);
      if (article) {
        articles.push(article);
      }
      else {
        remaining.push(candidateOf(tag));
      }
    }
    return { outcomes, remaining };
  };
  const sameCandidate = (tag) => tag;
  const byPmid = sortResults(pmidResults, (tag) => tag.pmid, sameCandidate);
  const byPmcid = sortResults(pmcidResults, (tag) => tag.normalized, (tag) => tag.c);
  const byDoi = sortResults(doiResults, (tag) => tag.doi, sameCandidate);

  return {
    articles,
    remainingPmid: byPmid.remaining,
    remainingPmcid: byPmcid.remaining,
    remainingDoi: byDoi.remaining,
    pmidOutcomes: byPmid.outcomes,
    pmcidOutcomes: byPmcid.outcomes,
    doiOutcomes: byDoi.outcomes,
  };
}
/**
 * Run a one-result Europe PMC search and report the outcome as a value.
 *
 * Any failure is logged as a warning and returned as `{ kind: 'error' }`
 * instead of being thrown, so the caller can record `miss` and
 * `service-error` separately and carry on with the next stage.
 *
 * @param epmc - Service exposing `search(options)`.
 * @param {string} query - Europe PMC query string.
 * @param ctx - Request context; `ctx.signal` is forwarded when set and
 *   `ctx.log.warning` receives failures.
 * @returns `{ kind: 'hit', hit }` for the first hit, `{ kind: 'miss' }` when
 *   there is none, or `{ kind: 'error', detail }` on failure.
 */
async function searchEpmcSafe(epmc, query, ctx) {
  try {
    const request = { query, resultType: 'core', pageSize: 1 };
    if (ctx.signal) {
      request.signal = ctx.signal;
    }
    const response = await epmc.search(request);
    const topHit = response.hits[0];
    if (!topHit) {
      return { kind: 'miss' };
    }
    return { kind: 'hit', hit: topHit };
  }
  catch (error) {
    let detail;
    if (error instanceof Error) {
      detail = error.message;
    }
    else {
      detail = String(error);
    }
    ctx.log.warning('Europe PMC search failed; chain continues with next layer', {
      query,
      error: detail,
    });
    return { kind: 'error', detail };
  }
}
/**
 * Fetch and parse the JATS for one Europe PMC search hit.
 *
 * Returns a discriminated outcome so the chain can record `no-fulltext`
 * (the record exists but EPMC publishes no JATS) separately from a failure.
 * Hits with no PMC identifier short-circuit to `no-fulltext` without any
 * network call, because the fullTextXML lookup is keyed by PMC ID.
 *
 * Identifier precedence for `pmid` is parsed JATS → search hit →
 * `contextPmid`; for `doi` it is parsed JATS → search hit. Empty strings
 * count as absent at every step, so an empty parsed value no longer masks
 * the hit metadata or leaks `''` into the article.
 *
 * @param epmc - Service exposing `fullTextXml(id, source, signal)` and
 *   `parseFullTextXml(xml)`.
 * @param hit - Search hit; `id`, `source`, `pmcid`, `pmid`, and `doi` are read.
 * @param args - Stage arguments: `args.input` feeds `applyPmcFilters`, and
 *   `args.ctx` supplies `signal` and `log`.
 * @param contextPmid - PMID of the originating candidate, if any.
 * @returns `{ kind: 'article', article }`, `{ kind: 'no-fulltext', detail }`,
 *   or `{ kind: 'error', detail }`.
 */
async function fetchEpmcArticle(epmc, hit, args, contextPmid) {
  // EPMC's fullTextXML endpoint is PMC-keyed (URL: `/{PMC<digits>}/fullTextXML`).
  // For PMC-source hits, `hit.id` already is the PMC ID; for MED hits, `hit.pmcid`
  // carries the counterpart when one exists. Preprints (PPR) and patents (PAT)
  // have no PMC ID, so fullTextXML is never available.
  const pmcLookupId = hit.pmcid ?? (hit.source === 'PMC' ? hit.id : undefined);
  if (!pmcLookupId) {
    return { kind: 'no-fulltext', detail: `EPMC source ${hit.source} has no PMC counterpart` };
  }
  try {
    const result = await epmc.fullTextXml(pmcLookupId, hit.source, args.ctx.signal ?? undefined);
    if (result.kind === 'not-available') {
      return { kind: 'no-fulltext', detail: 'EPMC fullTextXML not available for this record' };
    }
    const articleNode = epmc.parseFullTextXml(result.xml);
    if (!articleNode) {
      return { kind: 'no-fulltext', detail: 'EPMC fullTextXML payload had no <article> element' };
    }
    const parsed = applyPmcFilters(parsePmcArticle(articleNode), args.input);
    // The parser may hand back empty strings. Pull every identifier out of the
    // spread so empty values are dropped instead of emitted, and so they cannot
    // block the fallbacks below.
    const {
      pmcId,
      pmcUrl,
      pmid: parsedPmid,
      pubmedUrl: parsedPubmedUrl,
      doi: parsedDoi,
      ...rest
    } = parsed;
    // `||` rather than `??`: an empty string must fall through to the next source.
    const pmid = parsedPmid || hit.pmid || contextPmid;
    const doi = parsedDoi || hit.doi;
    // Only trust the parser's PubMed URL when it came with a parser PMID;
    // otherwise derive it from whichever PMID won.
    const pubmedUrl = (parsedPmid && parsedPubmedUrl) || `https://pubmed.ncbi.nlm.nih.gov/${pmid}/`;
    return {
      kind: 'article',
      article: {
        source: 'pmc',
        viaSource: 'europepmc',
        ...rest,
        ...(pmcId && { pmcId, pmcUrl }),
        ...(pmid && { pmid, pubmedUrl }),
        ...(doi && { doi }),
        epmcId: hit.id,
        epmcSource: hit.source,
      },
    };
  }
  catch (error) {
    const detail = error instanceof Error ? error.message : String(error);
    args.ctx.log.warning('Europe PMC fullTextXML failed; chain continues with next layer', {
      epmcId: hit.id,
      source: hit.source,
      error: detail,
    });
    return { kind: 'error', detail };
  }
}
/**
 * Look up DOIs for a batch of PMIDs with a single PubMed eFetch
 * (`db=pubmed`).
 *
 * Used for candidates that reached the Unpaywall stage without a DOI. An
 * empty `pmids` list returns immediately without touching the NCBI service.
 * Service errors are not caught here and propagate to the caller.
 *
 * @param {string[]} pmids - PMIDs to resolve.
 * @param signal - Optional abort signal, forwarded only when set.
 * @returns {Promise<Map>} PMID → DOI for every returned record that carries
 *   both; PMIDs without a DOI are simply absent from the map.
 */
async function fetchPubmedDois(pmids, signal) {
  const doiByPmid = new Map();
  if (pmids.length === 0) {
    return doiByPmid;
  }
  const ncbi = getNcbiService();
  const params = { db: 'pubmed', id: pmids.join(','), retmode: 'xml' };
  const options = { retmode: 'xml', usePost: pmids.length >= 100 };
  if (signal) {
    options.signal = signal;
  }
  const response = await ncbi.eFetch(params, options);
  const rawRecords = response?.PubmedArticleSet?.PubmedArticle;
  // The parsed payload may hold one record or a list; normalise to a list.
  const records = rawRecords ? ensureArray(rawRecords) : [];
  for (const record of records) {
    const citation = record?.MedlineCitation;
    if (citation) {
      const pmid = extractPmid(citation);
      if (pmid) {
        const doi = extractDoi(citation.Article, record.PubmedData?.ArticleIdList);
        if (doi) {
          doiByPmid.set(pmid, doi);
        }
      }
    }
  }
  return doiByPmid;
}
/**
 * Resolve a DOI to an open-access article via Unpaywall. `pmid`, when set,
 * is stamped onto the resulting article so the pmid-input branch carries its
 * cross-reference through.
 *
 * Runs three steps, each with its own failure mapping:
 *
 * 1. `service.resolve(doi)` — a thrown error becomes `service-error`; a
 *    `no-oa` resolution becomes `no-oa` with the resolver's reason as detail.
 * 2. `service.fetchContent(location)` — a thrown error becomes `fetch-failed`.
 * 3. Text extraction (HTML → Markdown, anything else treated as PDF → text) —
 *    a thrown error or empty output becomes `parse-failed`.
 *
 * Errors from these steps are caught, logged as warnings, and returned as an
 * `unavailable` entry. Every warning carries the message under the `error`
 * key so log queries can match all Unpaywall failures uniformly.
 *
 * @param args - `{ doi, pmid? }` identifying the article.
 * @param service - Unpaywall service exposing `resolve` and `fetchContent`.
 * @param ctx - Request context providing `signal` and `log.warning`.
 * @returns `{ article }` on success, otherwise `{ unavailable: { reason, detail } }`.
 */
async function resolveUnpaywall(args, service, ctx) {
    const { pmid, doi } = args;
    let resolution;
    try {
        resolution = await service.resolve(doi, ctx.signal);
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : String(error);
        ctx.log.warning('Unpaywall DOI resolve failed', { doi, error: detail });
        return { unavailable: { reason: 'service-error', detail } };
    }
    if (resolution.kind === 'no-oa') {
        return { unavailable: { reason: 'no-oa', detail: resolution.reason } };
    }
    let content;
    try {
        content = await service.fetchContent(resolution.location, ctx.signal);
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : String(error);
        ctx.log.warning('Unpaywall content fetch failed', { doi, error: detail });
        return { unavailable: { reason: 'fetch-failed', detail } };
    }
    try {
        if (content.kind === 'html') {
            const extracted = await htmlExtractor.extract(content.body, {
                url: content.fetchedUrl,
                format: 'markdown',
            });
            const body = extracted.content.trim();
            if (!body) {
                return {
                    unavailable: {
                        reason: 'parse-failed',
                        detail: 'HTML extraction produced empty content',
                    },
                };
            }
            return {
                article: buildUnpaywallArticle({
                    ...(pmid && { pmid }),
                    doi,
                    sourceUrl: content.fetchedUrl,
                    location: resolution.location,
                    contentFormat: 'html-markdown',
                    content: body,
                    title: extracted.title,
                    wordCount: extracted.wordCount,
                }),
            };
        }
        // Any non-HTML payload is handled as a PDF.
        const extracted = await pdfParser.extractText(content.body, { mergePages: true });
        // Guard against a non-string `text` so `.trim()` cannot throw here.
        const text = typeof extracted.text === 'string' ? extracted.text.trim() : '';
        if (!text) {
            return {
                unavailable: { reason: 'parse-failed', detail: 'PDF extraction produced empty text' },
            };
        }
        return {
            article: buildUnpaywallArticle({
                ...(pmid && { pmid }),
                doi,
                sourceUrl: content.fetchedUrl,
                location: resolution.location,
                contentFormat: 'pdf-text',
                content: text,
                totalPages: extracted.totalPages,
            }),
        };
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : String(error);
        // Same `error` key as the resolve/fetch warnings above.
        ctx.log.warning('Unpaywall content extraction failed', { pmid, doi, error: detail });
        return { unavailable: { reason: 'parse-failed', detail } };
    }
}
/**
 * Assemble the `source: 'unpaywall'` article payload from extracted content.
 *
 * Optional fields are attached only when they carry a value: `pmid` (together
 * with its derived `pubmedUrl`), `title`, and the location's `license`,
 * `host_type` and `version` are skipped when falsy; `wordCount` and
 * `totalPages` are skipped only when `undefined`, so a count of `0` is kept.
 * Properties are added in a fixed order so serialized output stays stable.
 *
 * @param args - `{ doi, sourceUrl, content, contentFormat, location, pmid?, title?, wordCount?, totalPages? }`.
 * @returns The article object.
 */
function buildUnpaywallArticle(args) {
    const oaLocation = args.location;
    const article = {
        source: 'unpaywall',
        viaSource: 'unpaywall',
        contentFormat: args.contentFormat,
    };
    if (args.pmid) {
        article.pmid = args.pmid;
        article.pubmedUrl = `https://pubmed.ncbi.nlm.nih.gov/${args.pmid}/`;
    }
    article.doi = args.doi;
    article.sourceUrl = args.sourceUrl;
    article.content = args.content;
    if (args.title) {
        article.title = args.title;
    }
    if (args.wordCount !== undefined) {
        article.wordCount = args.wordCount;
    }
    if (args.totalPages !== undefined) {
        article.totalPages = args.totalPages;
    }
    if (oaLocation.license) {
        article.license = oaLocation.license;
    }
    if (oaLocation.host_type) {
        article.hostType = oaLocation.host_type;
    }
    if (oaLocation.version) {
        article.version = oaLocation.version;
    }
    return article;
}
/**
 * Translate a Europe PMC stage outcome into the `triedTiers` entry recorded on
 * `chainByInput`. Only failure outcomes are translated — callers filter out
 * `hit` beforehand, because recovered ids never show up in `unavailable[]`.
 *
 * `no-fulltext` carries `detail` only when one is present; `service-error`
 * always forwards `detail` as-is. Any other kind yields `undefined`.
 *
 * @param outcome - Stage outcome with a `kind` and optional `detail`.
 * @returns The tier entry for the `europepmc` tier, or `undefined` for an unhandled kind.
 */
function epmcTierFromOutcome(outcome) {
    const tier = 'europepmc';
    if (outcome.kind === 'miss') {
        return { tier, outcome: 'miss' };
    }
    if (outcome.kind === 'no-fulltext') {
        const entry = { tier, outcome: 'no-fulltext' };
        if (outcome.detail) {
            entry.detail = outcome.detail;
        }
        return entry;
    }
    if (outcome.kind === 'service-error') {
        return { tier, outcome: 'service-error', detail: outcome.detail };
    }
    return undefined;
}
/**
 * Convert an `UnavailableReason` produced by the Unpaywall resolver into the
 * matching `TierOutcome`.
 *
 * The values the Unpaywall path can actually emit (`no-doi`, `no-oa`,
 * `fetch-failed`, `parse-failed`, `service-error`) exist in both enums and
 * pass through unchanged. The remaining reasons are translated defensively;
 * the resolver only returns them under dead-code safety checks, never in
 * normal flow. An unrecognized reason yields `undefined`.
 *
 * @param reason - Reason reported on the unavailable entry.
 * @returns The corresponding tier outcome.
 */
function unpaywallReasonToTierOutcome(reason) {
    // Defensive translations for reasons that belong to other tiers.
    if (reason === 'no-pmc-fallback-disabled') {
        return 'not-attempted';
    }
    if (reason === 'no-epmc-fulltext') {
        return 'no-fulltext';
    }
    if (reason === 'not-found') {
        return 'miss';
    }
    // Values shared verbatim between the two enums.
    const sharedValues = ['no-doi', 'no-oa', 'fetch-failed', 'parse-failed', 'service-error'];
    return sharedValues.includes(reason) ? reason : undefined;
}
/**
 * Compute the terminal `reason` for an unavailable entry from its tier chain.
 *
 * `not-attempted` entries describe configuration state rather than content
 * state, so they are ignored when an earlier tier produced a real signal
 * (`pmc:miss`, `unpaywall:no-oa`, ...). The last remaining entry decides the
 * reason. When every tier was skipped (or the chain is empty) the reason is
 * `no-pmc-fallback-disabled`; a tier/outcome pair without a dedicated mapping
 * falls back to `not-found`.
 *
 * @param chain - Ordered tier entries, each with `tier` and `outcome`.
 * @returns The reason string to surface on the unavailable entry.
 */
function reasonFromChain(chain) {
    const attempted = [...chain].filter((entry) => entry.outcome !== 'not-attempted');
    const decisive = attempted[attempted.length - 1];
    if (!decisive) {
        return 'no-pmc-fallback-disabled';
    }
    const reasonBySignal = new Map([
        ['pmc:miss', 'not-found'],
        ['europepmc:miss', 'not-found'],
        ['europepmc:no-fulltext', 'no-epmc-fulltext'],
        ['unpaywall:no-doi', 'no-doi'],
        ['unpaywall:no-oa', 'no-oa'],
        ['unpaywall:fetch-failed', 'fetch-failed'],
        ['unpaywall:parse-failed', 'parse-failed'],
        ['pmc:service-error', 'service-error'],
        ['unpaywall:service-error', 'service-error'],
        ['europepmc:service-error', 'service-error'],
    ]);
    const signalKey = `${decisive.tier}:${decisive.outcome}`;
    return reasonBySignal.get(signalKey) ?? 'not-found';
}
// ─── format() helpers ────────────────────────────────────────────────────────
/**
 * Render a PMC-shaped article (PMC EFetch or Europe PMC output) as Markdown,
 * appending the rendered pieces to `lines`.
 *
 * The heading uses the title when present and otherwise falls back through
 * the identifiers the article carries: `pmcId` → `epmcId` → `pmid` → `doi`.
 * Europe PMC results can lack a `pmcId`, so falling back to it alone would
 * render `### undefined` for an untitled Europe PMC article.
 *
 * Output order: heading, source label, authors, affiliations, journal, type,
 * publication date, identifiers and links, keywords, abstract, body sections
 * (one level of subsections), references.
 *
 * @param a - Article object; every metadata field is treated as optional.
 * @param lines - Output buffer of Markdown fragments, mutated in place.
 */
function formatPmcArticle(a, lines) {
    let headingFallback = a.pmcId ?? a.epmcId;
    if (headingFallback === undefined || headingFallback === null) {
        if (a.pmid) {
            headingFallback = `PMID ${a.pmid}`;
        }
        else if (a.doi) {
            headingFallback = `DOI ${a.doi}`;
        }
        else {
            headingFallback = 'Untitled article';
        }
    }
    lines.push(`### ${a.title ?? headingFallback}`);
    const sourceLabel = a.viaSource === 'europepmc'
        ? `Europe PMC (structured JATS${a.epmcSource ? `, source: ${a.epmcSource}` : ''})`
        : 'PMC (structured JATS)';
    lines.push(`**Source:** ${sourceLabel}`);
    if (a.authors?.length) {
        lines.push(`\n**Authors (${a.authors.length}):**`);
        for (const au of a.authors) {
            lines.push(`- ${formatPmcAuthor(au)}`);
        }
    }
    if (a.affiliations?.length) {
        lines.push(`\n**Affiliations:**`);
        for (const [i, aff] of a.affiliations.entries()) {
            lines.push(`${i + 1}. ${aff}`);
        }
    }
    if (a.journal) {
        const parts = [];
        if (a.journal.title) {
            parts.push(a.journal.title);
        }
        if (a.journal.volume) {
            // Issue is only meaningful alongside a volume: **12**(3).
            parts.push(`**${a.journal.volume}**${a.journal.issue ? `(${a.journal.issue})` : ''}`);
        }
        if (a.journal.pages) {
            parts.push(a.journal.pages);
        }
        if (a.journal.issn) {
            parts.push(`ISSN ${a.journal.issn}`);
        }
        if (parts.length) {
            lines.push(`\n**Journal:** ${parts.join(', ')}`);
        }
    }
    if (a.articleType) {
        lines.push(`**Type:** ${a.articleType}`);
    }
    if (a.publicationDate) {
        const d = a.publicationDate;
        // Drop missing components so partial dates render as e.g. `2020-05`.
        const dateParts = [d.year, d.month, d.day].filter(Boolean);
        if (dateParts.length) {
            lines.push(`**Published:** ${dateParts.join('-')}`);
        }
    }
    if (a.pmcId) {
        lines.push(`**PMCID:** ${a.pmcId}`);
    }
    if (a.epmcId) {
        lines.push(`**EPMC ID:** ${a.epmcId}${a.epmcSource ? ` (${a.epmcSource})` : ''}`);
    }
    if (a.pmid) {
        lines.push(`**PMID:** ${a.pmid}`);
    }
    if (a.doi) {
        lines.push(`**DOI:** ${a.doi}`);
    }
    if (a.pmcUrl) {
        lines.push(`**PMC:** ${a.pmcUrl}`);
    }
    if (a.pubmedUrl) {
        lines.push(`**PubMed:** ${a.pubmedUrl}`);
    }
    if (a.keywords?.length) {
        lines.push(`**Keywords:** ${a.keywords.join(', ')}`);
    }
    if (a.abstract) {
        lines.push(`\n#### Abstract\n${a.abstract}`);
    }
    // Tolerate an article without a `sections` array instead of throwing.
    for (const sec of a.sections ?? []) {
        if (sec.title) {
            lines.push(`\n#### ${formatHeading(sec.label, sec.title)}`);
        }
        if (sec.text) {
            lines.push(sec.text);
        }
        if (sec.subsections?.length) {
            for (const sub of sec.subsections) {
                if (sub.title) {
                    lines.push(`\n##### ${formatHeading(sub.label, sub.title)}`);
                }
                if (sub.text) {
                    lines.push(sub.text);
                }
            }
        }
    }
    if (a.references?.length) {
        lines.push(`\n#### References (${a.references.length})`);
        for (const ref of a.references) {
            const tag = [ref.label, ref.id].filter(Boolean).join(' ');
            lines.push(`- ${tag ? `[${tag}] ` : ''}${ref.citation}`);
        }
    }
}
function formatUnpaywallArticle(a, lines) {
const heading = a.title ?? (a.pmid ? `PMID ${a.pmid}` : `DOI ${a.doi}`);
const formatLabel = a.contentFormat === 'html-markdown'
? 'Unpaywall (HTML → Markdown, best-effort)'
: 'Unpaywall (PDF → plain text)';
lines.push(`### ${heading}`);
lines.push(`**Source:** ${formatLabel}`);
if (a.pmid)
lines.push(`**PMID:** ${a.pmid}`);
lines.push(`**DOI:** ${a.doi}`);
if (a.pubmedUrl)
lines.push(`**PubMed:** ${a.pubmedUrl}`);
lines.push(`**OA Copy:** ${