@cyanheads/pubmed-mcp-server
Version:
Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.
457 lines • 19.4 kB
JavaScript
/**
* @fileoverview Handles parsing of NCBI E-utility responses and NCBI-specific error extraction.
* Creates an NCBI-specific XMLParser instance with `isArray` callback support for handling
* NCBI's inconsistent XML structures where single-element lists are collapsed to scalars.
* @module src/services/ncbi/response-handler
*/
import { notFound, serializationError, serviceUnavailable } from '@cyanheads/mcp-ts-core/errors';
import { logger, requestContextService } from '@cyanheads/mcp-ts-core/utils';
/**
* fast-xml-parser is pinned to `~5.7.3` in package.json. 5.8.0 deprecates
* `XMLValidator` in favor of a new sibling package `fast-xml-validator`
* (published days before the deprecation, 0 dependents, by the same author).
* 5.8.0 carries no security fix or feature we need, so we stay on 5.7.x until
* `fast-xml-validator` has time in the wild. Re-evaluate during a future
* `maintenance` pass — if `bun update --latest` bumps this, restore the pin.
*/
import { XMLParser as FastXmlParser, XMLValidator } from 'fast-xml-parser';
import { recoveryFor } from '../../services/error-contracts.js';
/**
* jpaths that NCBI may return as either a single value or an array.
* The `isArray` callback forces these to always parse as arrays for consistency.
*/
const NCBI_ARRAY_JPATHS = new Set([
'IdList.Id',
'eSearchResult.IdList.Id',
'PubmedArticleSet.PubmedArticle',
'PubmedArticleSet.DeleteCitation.PMID',
'AuthorList.Author',
'AffiliationInfo',
'MeshHeadingList.MeshHeading',
'MeshHeading.QualifierName',
'GrantList.Grant',
'KeywordList.Keyword',
'PublicationTypeList.PublicationType',
'History.PubMedPubDate',
'LinkSet.LinkSetDb.Link',
'Link.Id',
'DbInfo.FieldList.Field',
'DbInfo.LinkList.Link',
'eSummaryResult.DocSum',
'DocSum.Item',
'DescriptorRecordSet.DescriptorRecord',
'ConceptList.Concept',
'TermList.Term',
'TreeNumberList.TreeNumber',
'pmc-articleset.article',
'article-meta.article-id',
'article-meta.pub-date',
'contrib-group.contrib',
'kwd-group.kwd',
'body.sec',
'sec.sec',
'sec.p',
'ref-list.ref',
]);
/**
* Ordered paths to check for NCBI error messages in parsed XML.
* More specific paths come first so they take precedence.
*/
const ERROR_PATHS = [
'eLinkResult.ERROR',
'eSummaryResult.ERROR',
'PubmedArticleSet.ErrorList.CannotRetrievePMID',
'ERROR',
];
/**
* NCBI error messages indicating the requested record doesn't exist (permanent
* failure). Throwing NotFound for these prevents the retry loop from hammering
* NCBI on what is fundamentally a "no such record" response.
*/
const NCBI_NOT_FOUND_PATTERNS = [
/cannot get document summary/i,
/UID=\S+:\s*not found/i,
/Empty id list/i,
];
const WARNING_PATHS = [
'eSearchResult.ErrorList.PhraseNotFound',
'eSearchResult.ErrorList.FieldNotFound',
'eSearchResult.WarningList.QuotedPhraseNotFound',
'eSearchResult.WarningList.OutputMessage',
];
/**
* NCBI responses routinely contain numeric character references for punctuation
* and diacritics, especially in page ranges and author names. Keep entity
* processing enabled, but raise the aggregate expansion ceiling high enough for
* trusted PubMed payloads.
*/
const NCBI_PROCESS_ENTITIES_OPTIONS = {
enabled: true,
maxTotalExpansions: 100_000,
};
function resolvePath(obj, path) {
let current = obj;
for (const part of path.split('.')) {
if (current && typeof current === 'object' && part in current) {
current = current[part];
}
else {
return;
}
}
return current;
}
function extractTextValues(source, prefix = '') {
const items = Array.isArray(source) ? source : [source];
const messages = [];
for (const item of items) {
if (typeof item === 'string' || typeof item === 'number' || typeof item === 'boolean') {
messages.push(`${prefix}${String(item)}`);
}
else if (item && typeof item['#text'] === 'string') {
messages.push(`${prefix}${item['#text']}`);
}
}
return messages;
}
/**
* Replaces raw NCBI C++ exception traces with a concise, actionable message.
* The internal details are logged but not surfaced to the caller.
*/
function sanitizeNcbiError(message) {
if (/NCBI C\+\+ Exception|CException|CTxRawClient/i.test(message)) {
if (/closed connection|EOF|Read failed/i.test(message)) {
return 'NCBI API temporarily unavailable (connection reset) — try again in a few seconds.';
}
return 'NCBI API returned an internal error — try again in a few seconds.';
}
return message;
}
/**
* Matches `<ERROR>` (uppercase, optionally with attributes) for a cheap
* pre-parse check in ordered mode. Intentionally case-sensitive: NCBI's
* E-utilities use `<ERROR>` for response-level failures, while PMC EFetch
* uses lowercase `<error id="…">` to flag a single unavailable PMCID. The
* latter is data (a missing ID), not a transport error, so it falls through
* to the caller which reports it via the unified `unavailable[]` list.
*/
const ERROR_TAG_REGEX = /<ERROR(?:\s[^>]*)?>/;
/**
* Unicode superscript map. Covers digits, common operators, and the few
* letters that have superscript codepoints (n, i). `−` (U+2212, the proper
* minus) is normalized to U+207B alongside ASCII `-`.
*/
const SUPERSCRIPT_MAP = {
'0': '⁰',
'1': '¹',
'2': '²',
'3': '³',
'4': '⁴',
'5': '⁵',
'6': '⁶',
'7': '⁷',
'8': '⁸',
'9': '⁹',
'+': '⁺',
'-': '⁻',
'−': '⁻',
'=': '⁼',
'(': '⁽',
')': '⁾',
n: 'ⁿ',
i: 'ⁱ',
};
/**
* Unicode subscript map. Covers digits and operators; alphabetic subscripts
* are limited in Unicode and rarely appear in MEDLINE so they fall through
* to the `_X` ASCII fallback.
*/
const SUBSCRIPT_MAP = {
'0': '₀',
'1': '₁',
'2': '₂',
'3': '₃',
'4': '₄',
'5': '₅',
'6': '₆',
'7': '₇',
'8': '₈',
'9': '₉',
'+': '₊',
'-': '₋',
'−': '₋',
'=': '₌',
'(': '₍',
')': '₎',
};
function mapInlineContent(content, table, asciiPrefix) {
let out = '';
for (const ch of content) {
const mapped = table[ch];
if (mapped === undefined)
return `${asciiPrefix}${content}`;
out += mapped;
}
return out;
}
/**
* Flattens inline mixed-content markup (`<sup>`, `<sub>`, `<inf>`, `<i>`,
* `<b>`, `<u>`, `<sc>`) inside PubMed/MEDLINE XML before fast-xml-parser
* runs. The non-ordered parser used for EFetch responses doesn't preserve
* mixed content — `1.73 m<sup>2</sup>` parses to `{ '#text': '1.73 m', sup:
* 2 }`, and `extractAbstractText` only reads `#text`, so the superscript
* digit is silently dropped from abstracts and titles.
*
* Numeric and operator characters map to Unicode (²/³/⁻²/₂…); anything else
* falls back to a `^X` / `_X` ASCII prefix so the content survives in a
* recognizable form. Italic / bold / underline / small-caps tags are
* stripped (content kept) since they don't carry meaning in our text
* rendering. Only invoked on the regular parser path; the PMC JATS path
* already preserves inline markup via `preserveOrder: true`.
*
* @internal exported for direct unit tests
*/
export function flattenInlineMarkup(xml) {
return xml
.replace(/<sup>([^<]*)<\/sup>/g, (_, c) => mapInlineContent(c, SUPERSCRIPT_MAP, '^'))
.replace(/<sub>([^<]*)<\/sub>/g, (_, c) => mapInlineContent(c, SUBSCRIPT_MAP, '_'))
.replace(/<inf>([^<]*)<\/inf>/g, (_, c) => mapInlineContent(c, SUBSCRIPT_MAP, '_'))
.replace(/<\/?(?:i|b|u|sc)>/g, '');
}
/**
* Parses NCBI E-utility responses (XML, JSON, text) and checks for NCBI-specific
* error structures embedded in response bodies.
*/
export class NcbiResponseHandler {
xmlParser;
/**
* Parser configured for JATS mixed content (PMC full-text). `preserveOrder`
* keeps document order so inline markup in `<p>`, `<abstract>`, `<title>`
* doesn't collapse into reordered text. `trimValues: false` retains spacing
* between text nodes and adjacent inline children.
*/
orderedXmlParser;
constructor() {
this.xmlParser = new FastXmlParser({
ignoreAttributes: false,
attributeNamePrefix: '@_',
parseTagValue: true,
processEntities: NCBI_PROCESS_ENTITIES_OPTIONS,
htmlEntities: true,
isArray: (_name, jpath) => NCBI_ARRAY_JPATHS.has(jpath),
});
this.orderedXmlParser = new FastXmlParser({
preserveOrder: true,
ignoreAttributes: false,
attributeNamePrefix: '@_',
parseTagValue: true,
trimValues: false,
processEntities: NCBI_PROCESS_ENTITIES_OPTIONS,
htmlEntities: true,
});
}
/**
* Extract a structured error from a parsed NCBI XML body and throw it as
* `notFound()` for permanent "no such record" responses or `serviceUnavailable()`
* for transient backend failures. Never returns.
*
* NCBI returns "cannot get document summary" / "Empty id list" for invalid
* UIDs — these are permanent (the record doesn't exist), so we surface them
* as NotFound so the retry loop short-circuits instead of hammering NCBI.
*/
throwNcbiError(parsedXml, endpoint) {
const errorMessages = this.extractNcbiErrorMessages(parsedXml);
logger.error('NCBI API returned an error in XML response.', requestContextService.createRequestContext({
operation: 'NcbiXmlError',
endpoint,
errors: errorMessages,
}));
if (errorMessages.some((msg) => NCBI_NOT_FOUND_PATTERNS.some((p) => p.test(msg)))) {
throw notFound(`NCBI API Error: ${errorMessages.join('; ')}`, {
reason: 'ncbi_resource_not_found',
endpoint,
ncbiErrors: errorMessages,
...recoveryFor('ncbi_resource_not_found'),
});
}
throw serviceUnavailable(`NCBI API Error: ${errorMessages.join('; ')}`, {
reason: 'ncbi_unreachable',
endpoint,
ncbiErrors: errorMessages,
...recoveryFor('ncbi_unreachable'),
});
}
extractNcbiErrorMessages(parsedXml) {
const messages = [];
for (const path of ERROR_PATHS) {
const value = resolvePath(parsedXml, path);
if (value !== undefined) {
messages.push(...extractTextValues(value));
}
}
if (messages.length === 0) {
for (const path of WARNING_PATHS) {
const value = resolvePath(parsedXml, path);
if (value !== undefined) {
messages.push(...extractTextValues(value, 'Warning: '));
}
}
}
return messages.length > 0 ? messages.map(sanitizeNcbiError) : ['Unknown NCBI API error.'];
}
parseAndHandleResponse(responseText, endpoint, options) {
const retmode = options?.retmode ?? 'xml';
if (retmode === 'text') {
logger.debug('Received text response from NCBI.', requestContextService.createRequestContext({
operation: 'NcbiParseText',
endpoint,
retmode,
}));
return responseText;
}
if (retmode === 'xml') {
logger.debug('Parsing XML response from NCBI.', requestContextService.createRequestContext({
operation: 'NcbiParseXml',
endpoint,
retmode,
}));
const isHtml = /^\s*<(!DOCTYPE\s+html|html[\s>])/i.test(responseText);
if (isHtml) {
logger.warning('NCBI returned HTML instead of XML (likely rate-limited).', requestContextService.createRequestContext({
operation: 'NcbiHtmlResponse',
endpoint,
}));
throw serviceUnavailable('NCBI API returned an HTML response instead of XML — likely rate-limited.', { reason: 'ncbi_unreachable', endpoint, ...recoveryFor('ncbi_unreachable') });
}
// NCBI's eLink (and occasionally other endpoints) drops the root element
// when the upstream backend connection fails mid-response, yielding just
// `<?xml ... ?>` + DOCTYPE with no body. JSON retmode reveals the same
// failure as a TXCLIENT EOF in an `ERROR` field; XML just truncates.
// Reclassify as transient ServiceUnavailable so the retry chain recovers,
// rather than SerializationError which short-circuits retries.
const bodyMinusProlog = responseText
.replace(/<\?xml[^?]*\?>/gi, '')
.replace(/<!DOCTYPE[^>]*>/gi, '')
.trim();
if (bodyMinusProlog.length === 0) {
logger.warning('NCBI returned a prolog-only XML response (upstream backend failure).', requestContextService.createRequestContext({
operation: 'NcbiEmptyResponse',
endpoint,
responseLength: responseText.length,
}));
throw serviceUnavailable('NCBI returned an empty response body — the upstream backend likely failed mid-request.', { reason: 'ncbi_unreachable', endpoint, ...recoveryFor('ncbi_unreachable') });
}
const xmlForValidation = responseText.replace(/<!DOCTYPE[^>]*>/gi, '');
const validationResult = XMLValidator.validate(xmlForValidation);
if (validationResult !== true) {
logger.error('Invalid XML response from NCBI.', requestContextService.createRequestContext({
operation: 'NcbiInvalidXml',
endpoint,
responseSnippet: responseText.substring(0, 500),
}));
throw serializationError('Received invalid XML from NCBI.', {
reason: 'ncbi_invalid_response',
endpoint,
responseSnippet: responseText.substring(0, 200),
...recoveryFor('ncbi_invalid_response'),
});
}
const useOrdered = options?.useOrderedParser ?? false;
if (useOrdered && ERROR_TAG_REGEX.test(responseText)) {
// Ordered parser lacks the named-key shape error extraction relies on.
// Errors are rare, so fall back to the regular parser just to surface a
// structured message.
const errorParsed = this.xmlParser.parse(responseText);
this.throwNcbiError(errorParsed, endpoint);
}
const parser = useOrdered ? this.orderedXmlParser : this.xmlParser;
// Pre-flatten <sup>/<sub>/<inf>/<i>/<b>/<u>/<sc> on the regular parser
// path. The ordered parser walks mixed content correctly via
// preserveOrder; the regular parser does not.
const xmlForParse = useOrdered ? responseText : flattenInlineMarkup(responseText);
let parsedXml;
try {
parsedXml = parser.parse(xmlForParse);
}
catch (error) {
const parserError = error instanceof Error ? error.message : String(error);
logger.error('Failed to parse validated XML response from NCBI.', requestContextService.createRequestContext({
operation: 'NcbiXmlParseError',
endpoint,
parserError,
responseSnippet: responseText.substring(0, 500),
}));
throw serializationError(`Failed to parse XML response from NCBI: ${parserError}`, {
reason: 'ncbi_invalid_response',
endpoint,
parserError,
responseSnippet: responseText.substring(0, 200),
...recoveryFor('ncbi_invalid_response'),
}, { cause: error });
}
if (!useOrdered) {
const parsedObj = parsedXml;
const hasError = ERROR_PATHS.some((path) => resolvePath(parsedObj, path) !== undefined);
if (hasError) {
this.throwNcbiError(parsedObj, endpoint);
}
}
if (options?.returnRawXml) {
logger.debug('Returning raw XML string after validation.', requestContextService.createRequestContext({ operation: 'NcbiRawXml', endpoint }));
return responseText;
}
logger.debug('Successfully parsed XML response.', requestContextService.createRequestContext({ operation: 'NcbiParseXmlOk', endpoint }));
return parsedXml;
}
if (retmode === 'json') {
logger.debug('Parsing JSON response from NCBI.', requestContextService.createRequestContext({
operation: 'NcbiParseJson',
endpoint,
retmode,
}));
let parsed;
try {
parsed = JSON.parse(responseText);
}
catch (error) {
throw serializationError('Failed to parse NCBI JSON response.', {
reason: 'ncbi_invalid_response',
endpoint,
responseSnippet: responseText.substring(0, 200),
...recoveryFor('ncbi_invalid_response'),
}, { cause: error });
}
if (parsed && typeof parsed === 'object' && 'error' in parsed) {
const errorMessage = String(parsed.error);
logger.error('NCBI API returned an error in JSON response.', requestContextService.createRequestContext({
operation: 'NcbiJsonError',
endpoint,
error: errorMessage,
}));
if (NCBI_NOT_FOUND_PATTERNS.some((p) => p.test(errorMessage))) {
throw notFound(`NCBI API Error: ${errorMessage}`, {
reason: 'ncbi_resource_not_found',
endpoint,
ncbiErrors: [errorMessage],
...recoveryFor('ncbi_resource_not_found'),
});
}
throw serviceUnavailable(`NCBI API Error: ${errorMessage}`, {
reason: 'ncbi_unreachable',
endpoint,
ncbiError: errorMessage,
...recoveryFor('ncbi_unreachable'),
});
}
logger.debug('Successfully parsed JSON response.', requestContextService.createRequestContext({ operation: 'NcbiParseJsonOk', endpoint }));
return parsed;
}
logger.warning(`Unhandled retmode "${retmode}". Returning raw response text.`, requestContextService.createRequestContext({
operation: 'NcbiUnknownRetmode',
endpoint,
retmode,
}));
return responseText;
}
}
//# sourceMappingURL=response-handler.js.map