@cyanheads/pubmed-mcp-server
Version:
Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.
341 lines • 17.5 kB
JavaScript
/**
* @fileoverview PubMed search tool. Searches PubMed with full query syntax,
* field-specific filters, date ranges, pagination, and optional brief summaries.
* @module src/mcp-server/tools/definitions/search-articles.tool
*/
import { tool, z } from '@cyanheads/mcp-ts-core';
import { sanitization } from '@cyanheads/mcp-ts-core/utils';
import { NCBI_SERVICE_ERRORS } from '../../../services/error-contracts.js';
import { getNcbiService } from '../../../services/ncbi/ncbi-service.js';
import { extractBriefSummaries } from '../../../services/ncbi/parsing/esummary-parser.js';
import { conceptMeta, EDAM_DATABASE_SEARCH, EDAM_PUBMED_ID, SCHEMA_SEARCH_ACTION, } from './_concepts.js';
/**
* Accepts empty strings (treated as "no filter" by the handler) or dates in
* YYYY, YYYY/MM, or YYYY/MM/DD form with `/`, `-`, or `.` separators.
* Catches obvious typos at the edge so they don't degrade silently to 0 results.
*/
const DATE_RE = /^$|^\d{4}([/\-.]\d{1,2}([/\-.]\d{1,2})?)?$/;
/**
* Produces an optional human- and agent-readable hint for edge cases where
* bare empty arrays leave the caller without enough signal to recover:
* - No matches at all (suggest spell-check / removing filters)
* - Filters applied but nothing matched (suggest relaxing filters)
* - Pagination overshoot (offset ≥ totalFound)
*/
function buildNotice(args) {
const { totalFound, pmidCount, offset, hasFilters } = args;
if (totalFound === 0) {
return hasFilters
? 'No results matched your query with the applied filters. Try removing filters (e.g. dateRange, publicationTypes, meshTerms), broadening dates, or verifying author/journal spelling.'
: 'No results matched your query. Try running pubmed_spell_check for a suggested correction or broaden the query.';
}
if (pmidCount === 0 && offset > 0 && offset >= totalFound) {
return `Offset ${offset} exceeds totalFound (${totalFound}). Reset offset to 0 or reduce it below ${totalFound} to page through results.`;
}
return;
}
const AppliedFiltersSchema = z.object({
dateRange: z
.object({
minDate: z.string().describe('Applied minimum date'),
maxDate: z.string().describe('Applied maximum date'),
dateType: z
.enum(['pdat', 'mdat', 'edat'])
.describe('Applied date field used for the range filter'),
})
.optional()
.describe('Date range filter applied to the search'),
publicationTypes: z
.array(z.string())
.optional()
.describe('Publication type filters applied to the search'),
author: z.string().optional().describe('Author filter applied to the search'),
journal: z.string().optional().describe('Journal filter applied to the search'),
meshTerms: z.array(z.string()).optional().describe('MeSH term filters applied to the search'),
language: z.string().optional().describe('Language filter applied to the search'),
hasAbstract: z
.boolean()
.optional()
.describe('Whether results were restricted to articles with abstracts'),
freeFullText: z
.boolean()
.optional()
.describe('Whether results were restricted to free full-text articles'),
species: z
.enum(['humans', 'animals'])
.optional()
.describe('Species filter applied to the search'),
});
export const searchArticlesTool = tool('pubmed_search_articles', {
description: 'Search PubMed with full query syntax, filters, and date ranges. Returns PMIDs and optional brief summaries. Supports field-specific filters (author, journal, MeSH terms), common filters (language, species, free full text), and pagination via offset for paging through large result sets.',
annotations: { readOnlyHint: true, openWorldHint: true },
_meta: conceptMeta([SCHEMA_SEARCH_ACTION, EDAM_DATABASE_SEARCH, EDAM_PUBMED_ID]),
sourceUrl: 'https://github.com/cyanheads/pubmed-mcp-server/blob/main/src/mcp-server/tools/definitions/search-articles.tool.ts',
errors: [...NCBI_SERVICE_ERRORS],
input: z.object({
query: z.string().min(1).describe('PubMed search query (supports full NCBI syntax)'),
maxResults: z.number().int().min(1).max(1000).default(20).describe('Maximum results to return'),
offset: z.number().int().min(0).default(0).describe('Result offset for pagination (0-based)'),
sort: z
.enum(['relevance', 'pub_date', 'author', 'journal'])
.default('relevance')
.describe('Sort order: relevance (default), pub_date (newest first), author, or journal'),
dateRange: z
.object({
minDate: z
.string()
.regex(DATE_RE, 'Date must be YYYY, YYYY/MM, or YYYY/MM/DD (/, -, or . separators)')
.describe('Start date (YYYY/MM/DD, YYYY/MM, or YYYY); empty string disables this bound'),
maxDate: z
.string()
.regex(DATE_RE, 'Date must be YYYY, YYYY/MM, or YYYY/MM/DD (/, -, or . separators)')
.describe('End date (YYYY/MM/DD, YYYY/MM, or YYYY); empty string disables this bound'),
dateType: z
.enum(['pdat', 'mdat', 'edat'])
.default('pdat')
.describe('Date type: pdat (publication), mdat (modification), edat (entrez)'),
})
.optional()
.describe('Filter by date range. The filter is applied only when both `minDate` and `maxDate` are non-empty; either one empty disables the entire date range.'),
publicationTypes: z
.array(z.string())
.optional()
.describe('Filter by publication type (e.g. "Review", "Clinical Trial", "Meta-Analysis"). Multiple values are OR\'d — any match qualifies.'),
author: z.string().optional().describe('Filter by author name (e.g. "Smith J")'),
journal: z.string().optional().describe('Filter by journal name'),
meshTerms: z
.array(z.string())
.optional()
.describe("Filter by MeSH terms. Multiple terms are AND'd — all must match."),
language: z.string().optional().describe('Filter by language (e.g. "english")'),
hasAbstract: z.boolean().optional().describe('Only include articles with abstracts'),
freeFullText: z.boolean().optional().describe('Only include free full text articles'),
species: z.enum(['humans', 'animals']).optional().describe('Filter by species'),
summaryCount: z
.number()
.int()
.min(0)
.max(50)
.default(0)
.describe('Fetch brief summaries for top N results (0 = PMIDs only)'),
}),
output: z.object({
query: z.string().describe('Original query'),
effectiveQuery: z
.string()
.describe('Sanitized query sent to PubMed after applying all active filters'),
appliedFilters: AppliedFiltersSchema.describe('Normalized filter values that were applied to the PubMed query'),
totalFound: z.number().describe('Total matching articles'),
offset: z.number().describe('Result offset used'),
pmids: z.array(z.string()).describe('PubMed IDs'),
summaries: z
.array(z
.object({
pmid: z.string().describe('PubMed ID'),
title: z.string().optional().describe('Article title'),
authors: z.string().optional().describe('Formatted author string'),
source: z.string().optional().describe('Journal source'),
pubDate: z.string().optional().describe('Publication date'),
doi: z.string().optional().describe('DOI'),
pmcId: z.string().optional().describe('PMC ID'),
pmcUrl: z.string().optional().describe('PMC URL'),
pubmedUrl: z.string().optional().describe('PubMed URL'),
})
.describe('Brief article summary'))
.describe('Brief summaries (empty array when summaryCount is 0)'),
searchUrl: z.string().describe('PubMed search URL'),
notice: z
.string()
.optional()
.describe('Optional guidance when results are empty or paging overshot — e.g. how to broaden filters or reset offset. Absent on successful result pages.'),
}),
async handler(input, ctx) {
ctx.log.info('Executing pubmed_search', { query: input.query });
const ncbi = getNcbiService();
let effectiveQuery = await sanitization.sanitizeString(input.query, { context: 'text' });
// Build filters — capture normalized values for both query construction and appliedFilters
let normalizedDateRange;
if (input.dateRange?.minDate && input.dateRange?.maxDate) {
normalizedDateRange = {
minDate: input.dateRange.minDate.trim().replace(/[-.]/g, '/'),
maxDate: input.dateRange.maxDate.trim().replace(/[-.]/g, '/'),
dateType: input.dateRange.dateType,
};
effectiveQuery += ` AND (${normalizedDateRange.minDate}[${normalizedDateRange.dateType}] : ${normalizedDateRange.maxDate}[${normalizedDateRange.dateType}])`;
}
let sanitizedPubTypes;
if (input.publicationTypes?.length) {
sanitizedPubTypes = await Promise.all(input.publicationTypes.map((pt) => sanitization.sanitizeString(pt, { context: 'text' })));
effectiveQuery += ` AND (${sanitizedPubTypes.map((pt) => `"${pt}"[Publication Type]`).join(' OR ')})`;
}
let sanitizedAuthor;
if (input.author) {
sanitizedAuthor = await sanitization.sanitizeString(input.author, { context: 'text' });
effectiveQuery += ` AND ${sanitizedAuthor}[Author]`;
}
let sanitizedJournal;
if (input.journal) {
sanitizedJournal = await sanitization.sanitizeString(input.journal, { context: 'text' });
effectiveQuery += ` AND "${sanitizedJournal}"[Journal]`;
}
let sanitizedMeshTerms;
if (input.meshTerms?.length) {
sanitizedMeshTerms = await Promise.all(input.meshTerms.map((term) => sanitization.sanitizeString(term, { context: 'text' })));
effectiveQuery += ` AND (${sanitizedMeshTerms.map((term) => `"${term}"[MeSH Terms]`).join(' AND ')})`;
}
let sanitizedLanguage;
if (input.language) {
sanitizedLanguage = await sanitization.sanitizeString(input.language, { context: 'text' });
effectiveQuery += ` AND ${sanitizedLanguage}[Language]`;
}
if (input.hasAbstract)
effectiveQuery += ' AND hasabstract[text word]';
if (input.freeFullText)
effectiveQuery += ' AND free full text[filter]';
if (input.species)
effectiveQuery += ` AND ${input.species}[MeSH Terms]`;
const esResult = await ncbi.eSearch({
db: 'pubmed',
term: effectiveQuery,
retmax: input.maxResults,
retstart: input.offset,
sort: input.sort,
usehistory: input.summaryCount > 0 ? 'y' : undefined,
}, { signal: ctx.signal });
const pmids = esResult.idList;
let summaries = [];
if (input.summaryCount > 0 && pmids.length > 0) {
const eSummaryParams = {
db: 'pubmed',
version: '2.0',
retmode: 'xml',
};
if (esResult.webEnv && esResult.queryKey) {
eSummaryParams.WebEnv = esResult.webEnv;
eSummaryParams.query_key = esResult.queryKey;
eSummaryParams.retmax = Math.min(input.summaryCount, pmids.length);
eSummaryParams.retstart = input.offset;
}
else {
eSummaryParams.id = pmids.slice(0, input.summaryCount).join(',');
}
const eSummaryResult = await ncbi.eSummary(eSummaryParams, { signal: ctx.signal });
if (eSummaryResult) {
const briefSummaries = await extractBriefSummaries(eSummaryResult);
summaries = briefSummaries.map((s) => ({
pmid: s.pmid,
title: s.title,
authors: s.authors,
source: s.source,
pubDate: s.pubDate,
doi: s.doi,
pmcId: s.pmcId,
...(s.pmcId && { pmcUrl: `https://www.ncbi.nlm.nih.gov/pmc/articles/${s.pmcId}/` }),
pubmedUrl: `https://pubmed.ncbi.nlm.nih.gov/${s.pmid}/`,
}));
}
}
const searchUrl = `https://pubmed.ncbi.nlm.nih.gov/?term=${encodeURIComponent(effectiveQuery)}`;
const appliedFilters = {
...(normalizedDateRange && { dateRange: normalizedDateRange }),
...(sanitizedPubTypes?.length && { publicationTypes: sanitizedPubTypes }),
...(sanitizedAuthor && { author: sanitizedAuthor }),
...(sanitizedJournal && { journal: sanitizedJournal }),
...(sanitizedMeshTerms?.length && { meshTerms: sanitizedMeshTerms }),
...(sanitizedLanguage && { language: sanitizedLanguage }),
...(input.hasAbstract && { hasAbstract: true }),
...(input.freeFullText && { freeFullText: true }),
...(input.species && { species: input.species }),
};
ctx.log.info('pubmed_search completed', {
totalFound: esResult.count,
pmidCount: pmids.length,
});
const notice = buildNotice({
totalFound: esResult.count,
pmidCount: pmids.length,
offset: input.offset,
hasFilters: Object.keys(appliedFilters).length > 0,
});
return {
query: input.query,
effectiveQuery,
appliedFilters,
totalFound: esResult.count,
offset: input.offset,
pmids,
summaries,
searchUrl,
...(notice && { notice }),
};
},
format: (result) => {
const lines = [
`## PubMed Search Results`,
`**Query:** ${result.query}`,
`**Effective Query:** ${result.effectiveQuery}`,
`**Total Found:** ${result.totalFound} | **Returned:** ${result.pmids.length} | **Offset:** ${result.offset}`,
`**Search URL:** ${result.searchUrl}`,
];
if (Object.keys(result.appliedFilters).length > 0) {
lines.push('\n### Applied Filters');
if (result.appliedFilters.dateRange) {
lines.push(`- **Date Range (${result.appliedFilters.dateRange.dateType}):** ${result.appliedFilters.dateRange.minDate} to ${result.appliedFilters.dateRange.maxDate}`);
}
if (result.appliedFilters.publicationTypes?.length) {
lines.push(`- **Publication Types:** ${result.appliedFilters.publicationTypes.join(', ')}`);
}
if (result.appliedFilters.author) {
lines.push(`- **Author:** ${result.appliedFilters.author}`);
}
if (result.appliedFilters.journal) {
lines.push(`- **Journal:** ${result.appliedFilters.journal}`);
}
if (result.appliedFilters.meshTerms?.length) {
lines.push(`- **MeSH Terms:** ${result.appliedFilters.meshTerms.join(', ')}`);
}
if (result.appliedFilters.language) {
lines.push(`- **Language:** ${result.appliedFilters.language}`);
}
if (result.appliedFilters.hasAbstract) {
lines.push(`- **Has Abstract:** Yes`);
}
if (result.appliedFilters.freeFullText) {
lines.push(`- **Free Full Text:** Yes`);
}
if (result.appliedFilters.species) {
lines.push(`- **Species:** ${result.appliedFilters.species}`);
}
}
if (result.notice)
lines.push(`\n> ${result.notice}`);
if (result.pmids.length > 0)
lines.push(`\n**PMIDs:** ${result.pmids.join(', ')}`);
if (result.summaries?.length) {
if (result.summaries.length < result.pmids.length) {
lines.push(`\n> Summaries shown for top ${result.summaries.length} of ${result.pmids.length} PMIDs. Increase \`summaryCount\` (max 50) to fetch more.`);
}
lines.push('\n### Summaries');
for (const s of result.summaries) {
lines.push(`\n#### ${s.title ?? s.pmid}`);
lines.push(`**PMID:** ${s.pmid}`);
if (s.authors)
lines.push(`**Authors:** ${s.authors}`);
if (s.source)
lines.push(`**Source:** ${s.source}`);
if (s.pubDate)
lines.push(`**Published:** ${s.pubDate}`);
if (s.doi)
lines.push(`**DOI:** ${s.doi}`);
if (s.pmcId)
lines.push(`**PMCID:** ${s.pmcId}`);
if (s.pubmedUrl)
lines.push(`**PubMed:** ${s.pubmedUrl}`);
if (s.pmcUrl)
lines.push(`**PMC:** ${s.pmcUrl}`);
}
}
return [{ type: 'text', text: lines.join('\n') }];
},
});
//# sourceMappingURL=search-articles.tool.js.map