UNPKG

@cyanheads/pubmed-mcp-server

Version:

Search PubMed/Europe PMC, fetch articles and full text (PMC/EPMC/Unpaywall), citations, MeSH terms via MCP. STDIO or Streamable HTTP.

346 lines 14.2 kB
/**
 * @fileoverview Helper functions for parsing ESummary results from NCBI.
 * Handles different ESummary XML structures and formats the data into
 * consistent ParsedBriefSummary objects.
 * @module src/services/ncbi/parsing/esummary-parser
 */
import { dateParser, logger, requestContextService } from '@cyanheads/mcp-ts-core/utils';
import { ensureArray, getAttribute, getText } from './xml-helpers.js';

/**
 * Formats an array of ESummary authors into a string.
 * Limits to the first 3 authors and adds 'et al.' if more exist.
 *
 * @param {Array<{ name: string }> | null | undefined} authors - Parsed author entries.
 * @returns {string} Comma-separated author names, or '' when there are no authors.
 */
export function formatESummaryAuthors(authors) {
  if (!authors || authors.length === 0) return '';
  const shown = authors
    .slice(0, 3)
    .map((author) => author.name)
    .join(', ');
  return authors.length > 3 ? `${shown}, et al.` : shown;
}

/** 3-letter month abbreviations used by NCBI ESummary PubDate/EPubDate fields. */
const MONTH_ABBREV = Object.freeze({
  Jan: '01',
  Feb: '02',
  Mar: '03',
  Apr: '04',
  May: '05',
  Jun: '06',
  Jul: '07',
  Aug: '08',
  Sep: '09',
  Oct: '10',
  Nov: '11',
  Dec: '12',
});

/**
 * Matches NCBI ESummary date formats:
 * - "2024"          → year only
 * - "2024 Jan"      → year + month
 * - "2024 Jan 15"   → year + month + day
 * - "2018 Jul-Aug"  → year + month range (dash)
 * - "2018 Jan/Feb"  → year + month range (slash)
 *
 * Groups: 1=year, 2=firstMonth?, 3=day? (second month in range is non-capturing)
 */
const NCBI_DATE_RE = /^(\d{4})(?:\s+([A-Za-z]{3})(?:[/-][A-Za-z]{3})?(?:\s+(\d{1,2}))?)?$/;

/**
 * Number of days in a calendar month (proleptic Gregorian leap-year rule).
 *
 * @param {number} year - Full year.
 * @param {number} month - Month number, 1-12.
 * @returns {number} 28, 29, 30 or 31.
 */
function daysInMonth(year, month) {
  if (month === 2) {
    const isLeap = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
    return isLeap ? 29 : 28;
  }
  return [4, 6, 9, 11].includes(month) ? 30 : 31;
}

/**
 * Parses NCBI ESummary date strings into YYYY-MM-DD (or YYYY-MM-01 / YYYY-01-01
 * for partial dates). Returns undefined for unrecognized formats, for unknown
 * month abbreviations, and for days that do not exist in the given month
 * (e.g. "2023 Feb 30"), so that no impossible calendar date is ever emitted.
 *
 * Month abbreviations are matched case-insensitively ("2024 JAN" and
 * "2024 jan" behave like "2024 Jan"), mirroring what NCBI_DATE_RE accepts.
 *
 * NCBI's ESummary API returns a small set of predictable date formats. chrono-node
 * (used by the framework's dateParser) mishandles most of them — its `forwardDate`
 * option causes "2023 Dec" to resolve to a future December, ignoring the year.
 *
 * @param {string | number | null | undefined} dateStr - Raw date value; non-string
 *   values are coerced with String(), null/undefined yield undefined.
 * @returns {string | undefined} Normalized date, or undefined when not recognized.
 */
export function parseNcbiDate(dateStr) {
  // Guard instead of throwing on `.trim()` for missing or non-string input.
  if (dateStr == null) return;
  const m = NCBI_DATE_RE.exec(String(dateStr).trim());
  if (!m) return;
  const [, year, monthAbbrev, day] = m;
  if (!monthAbbrev) return `${year}-01-01`;

  // The regex accepts any letter case, so normalize to the "Jan" form used as keys.
  const monthKey = `${monthAbbrev[0].toUpperCase()}${monthAbbrev.slice(1).toLowerCase()}`;
  const month = MONTH_ABBREV[monthKey];
  if (!month) return; // unrecognized month abbreviation
  if (!day) return `${year}-${month}-01`;

  const dayNumber = Number(day);
  if (dayNumber < 1 || dayNumber > daysInMonth(Number(year), Number(month))) {
    return; // day does not exist in this month
  }
  return `${year}-${month}-${day.padStart(2, '0')}`;
}

/**
 * Standardizes date strings from ESummary to 'YYYY-MM-DD' format.
 * Uses a dedicated NCBI date parser for known formats, falling back to
 * chrono-node via the framework's dateParser for anything unexpected.
 *
 * @param {unknown} dateStr - Raw date value; coerced with String() and trimmed.
 * @param {object} [parentContext] - Request context used for logging and for the
 *   fallback parser; a new one is created when omitted.
 * @returns {Promise<string | undefined>} Normalized date, or undefined when the
 *   input is empty or cannot be parsed. Parser errors are logged, not thrown.
 */
export async function standardizeESummaryDate(dateStr, parentContext) {
  if (dateStr == null) return;
  const dateInputString = String(dateStr).trim();
  if (!dateInputString) return;

  const ncbiResult = parseNcbiDate(dateInputString);
  if (ncbiResult) return ncbiResult;

  const currentContext =
    parentContext ||
    requestContextService.createRequestContext({
      operation: 'standardizeESummaryDateInternal',
      inputDate: dateInputString,
    });
  try {
    const parsedDate = await dateParser.parseDate(dateInputString, currentContext);
    if (parsedDate) {
      return parsedDate.toISOString().split('T')[0];
    }
    logger.debug(`standardizeESummaryDate: could not parse "${dateInputString}", returning undefined.`, currentContext);
  } catch (e) {
    // Best-effort: an unparseable date must not fail the whole summary.
    logger.warning(`standardizeESummaryDate: dateParser.parseDate error for "${dateInputString}", returning undefined.`, {
      ...currentContext,
      error: e instanceof Error ? e.message : String(e),
    });
  }
  return;
}

/**
 * Returns the id type (e.g. 'doi', 'pmc') from an ESummary ArticleId entry,
 * normalizing across JSON (`idtype`) and XML (`IdType`) casings.
 * Tolerates null/undefined entries by returning undefined.
 */
function getArticleIdType(id) {
  return id?.idtype ?? id?.IdType;
}

/**
 * Returns the raw value field from an ESummary ArticleId entry,
 * normalizing across JSON (`value`) and XML (`Value`) casings. The value may
 * be a string or number — `getText` handles both.
 * Tolerates null/undefined entries by returning undefined.
 */
function getArticleIdValue(id) {
  return id?.value ?? id?.Value;
}

/**
 * Converts a raw PMID value to a string, mapping null/undefined to ''.
 * A bare String(undefined) would produce the truthy text "undefined", which
 * would slip past the `pmid` truthiness filter in extractBriefSummaries.
 */
function toPmidString(rawPmid) {
  return rawPmid == null ? '' : String(rawPmid);
}

/**
 * Extracts authors from a DocumentSummary's `Authors` property, which may be
 * an array of entries, an object wrapping `Author` entries, or a string
 * (either a JSON array literal or a comma/semicolon separated list).
 *
 * @param {object} docSummary - A single DocumentSummary object.
 * @returns {Array<{ name: string, authtype?: unknown, clusterid?: unknown }>}
 *   Authors with non-empty, trimmed names; empty array when none are found.
 */
function parseESummaryAuthorsFromDocumentSummary(docSummary) {
  const authorsProp = docSummary.Authors;
  if (!authorsProp) return [];

  const parsedAuthors = [];

  // Normalizes one raw author entry (string or object) and appends it when it
  // yields a non-blank name.
  const processRawAuthor = (rawAuthInput) => {
    let name = '';
    let authtype;
    let clusterid;
    if (typeof rawAuthInput === 'string') {
      name = rawAuthInput;
    } else if (rawAuthInput && typeof rawAuthInput === 'object') {
      const authorObj = rawAuthInput;
      name = getText(authorObj, '');
      if (!name) {
        name = getText(authorObj.Name || authorObj.name, '');
      }
      authtype = getText(authorObj.AuthType || authorObj.authtype, undefined);
      clusterid = getText(authorObj.ClusterId || authorObj.clusterid, undefined);
      if (!name) {
        const authInputString = JSON.stringify(authorObj);
        logger.warning(`Unhandled author structure in parseESummaryAuthorsFromDocumentSummary. authInput: ${authInputString.substring(0, 100)}`, requestContextService.createRequestContext({
          operation: 'parseESummaryAuthorsFromDocumentSummary',
          detail: 'Unhandled author structure',
        }));
        // Last resort: a single string-valued key, or the short serialized object.
        const keys = Object.keys(authorObj);
        if (keys.length === 1 && keys[0] && typeof authorObj[keys[0]] === 'string') {
          name = authorObj[keys[0]];
        } else if (authInputString.length < 100) {
          name = authInputString;
        }
      }
    }
    if (name.trim()) {
      parsedAuthors.push({
        name: name.trim(),
        ...(authtype !== undefined && { authtype }),
        ...(clusterid !== undefined && { clusterid }),
      });
    }
  };

  if (Array.isArray(authorsProp)) {
    for (const item of authorsProp) {
      processRawAuthor(item);
    }
  } else if (typeof authorsProp === 'object' && 'Author' in authorsProp && authorsProp.Author) {
    for (const item of ensureArray(authorsProp.Author)) {
      processRawAuthor(item);
    }
  } else if (typeof authorsProp === 'string') {
    try {
      if (authorsProp.startsWith('[') && authorsProp.endsWith(']')) {
        const parsedJsonAuthors = JSON.parse(authorsProp);
        if (Array.isArray(parsedJsonAuthors)) {
          for (const authItem of parsedJsonAuthors) {
            if (typeof authItem === 'string') {
              // Routed through processRawAuthor so blank strings are dropped.
              processRawAuthor(authItem);
            } else if (typeof authItem === 'object' && authItem !== null && (authItem.name || authItem.Name)) {
              processRawAuthor(authItem);
            }
          }
          // The string was a valid JSON array: trust it even when it holds no
          // usable names, rather than splitting the JSON text on commas and
          // producing bogus authors such as "[]".
          return parsedAuthors;
        }
      }
    } catch (e) {
      logger.debug(`Failed to parse Authors string as JSON: ${authorsProp.substring(0, 100)}`, requestContextService.createRequestContext({
        operation: 'parseESummaryAuthorsFromString',
        input: authorsProp.substring(0, 100),
        error: e instanceof Error ? e.message : String(e),
      }));
    }
    // Plain delimited list, e.g. "Smith J, Doe A; Roe B".
    for (const namePart of authorsProp.split(/[,;]/)) {
      const trimmed = namePart.trim();
      if (trimmed) parsedAuthors.push({ name: trimmed });
    }
  }
  return parsedAuthors.filter((author) => author.name);
}

/**
 * Converts one DocumentSummary (newer ESummary structure) into a raw brief
 * summary. Optional fields are included only when they have a truthy value.
 *
 * @param {object} docSummary - A single DocumentSummary; the PMID is read from
 *   its `@_uid` key.
 * @returns {object} Raw summary with `pmid` ('' when the uid is missing),
 *   `authors` (formatted string) and, when available, `title`, `source`,
 *   `doi`, `pmcId`, `rawPubDate`, `rawEPubDate`.
 */
function parseSingleDocumentSummary(docSummary) {
  const pmid = docSummary['@_uid'];
  const authorsArray = parseESummaryAuthorsFromDocumentSummary(docSummary);

  let idsArray = [];
  const articleIdsProp = docSummary.ArticleIds;
  if (articleIdsProp) {
    idsArray = Array.isArray(articleIdsProp)
      ? articleIdsProp
      : ensureArray(articleIdsProp.ArticleId);
  }

  // Prefer the dedicated DOI field; otherwise look it up among the article ids.
  let doiValue = getText(docSummary.DOI, undefined);
  if (!doiValue) {
    const doiEntry = idsArray.find((id) => getArticleIdType(id) === 'doi');
    if (doiEntry) {
      doiValue = getText(getArticleIdValue(doiEntry), undefined);
    }
  }

  const pmcEntry = idsArray.find((id) => getArticleIdType(id) === 'pmc');
  const pmcIdValue = pmcEntry ? getText(getArticleIdValue(pmcEntry), undefined) : undefined;

  const title = getText(docSummary.Title);
  const source =
    getText(docSummary.Source) ||
    getText(docSummary.FullJournalName) ||
    getText(docSummary.SO);
  const rawPubDate = getText(docSummary.PubDate);
  const rawEPubDate = getText(docSummary.EPubDate);

  return {
    pmid: toPmidString(pmid),
    ...(title && { title }),
    authors: formatESummaryAuthors(authorsArray),
    ...(source && { source }),
    ...(doiValue && { doi: doiValue }),
    ...(pmcIdValue && { pmcId: pmcIdValue }),
    ...(rawPubDate && { rawPubDate }),
    ...(rawEPubDate && { rawEPubDate }),
  };
}

/**
 * Converts one DocSum (older ESummary structure, a list of named `Item`
 * entries) into a raw brief summary. Optional fields are included only when
 * they are not undefined.
 *
 * @param {object} docSum - A single DocSum; the PMID is read from `Id`.
 * @returns {object} Raw summary with `pmid` ('' when `Id` is missing),
 *   `authors` (formatted string) and, when available, `title`, `source`,
 *   `doi`, `pmcId`, `rawPubDate`, `rawEPubDate`.
 */
function parseSingleDocSumOldXml(docSum) {
  const pmid = docSum.Id;
  const items = ensureArray(docSum.Item);

  // Returns the text of the first non-ERROR item matching one of the given
  // names (tried in order) and, when provided, the given type.
  const getItemValue = (name, type) => {
    const namesToTry = ensureArray(name);
    for (const n of namesToTry) {
      const item = items.find(
        (i) =>
          i['@_Name'] === n &&
          (type ? i['@_Type'] === type : true) &&
          i['@_Type'] !== 'ERROR',
      );
      if (item) {
        const textVal = getText(item);
        if (textVal !== undefined) return String(textVal);
      }
    }
    return;
  };

  const isAuthorItem = (i) => i['@_Name'] === 'Author' && i['@_Type'] === 'String';
  const toAuthor = (a) => ({ name: getText(a, '') });

  // Authors live under an AuthorList item when present, otherwise at top level.
  const getAuthorList = () => {
    const authorListItem = items.find(
      (i) => i['@_Name'] === 'AuthorList' && i['@_Type'] === 'List',
    );
    if (authorListItem?.Item) {
      return ensureArray(authorListItem.Item).filter(isAuthorItem).map(toAuthor);
    }
    return items.filter(isAuthorItem).map(toAuthor);
  };
  const authorsArray = getAuthorList();

  const articleIdsItem = items.find(
    (i) => i['@_Name'] === 'ArticleIds' && i['@_Type'] === 'List',
  );
  const articleIdsList = articleIdsItem?.Item ? ensureArray(articleIdsItem.Item) : [];

  // Finds an article id entry by its `idtype` attribute or its item name.
  const findArticleId = (idType) =>
    articleIdsList.find(
      (id) => getAttribute(id, 'idtype') === idType || id['@_Name'] === idType,
    );

  let doiFromItems = getItemValue('DOI', 'String');
  if (!doiFromItems) {
    const doiIdItem = findArticleId('doi');
    if (doiIdItem) {
      doiFromItems = getText(doiIdItem);
    }
  }

  let pmcIdFromItems;
  const pmcIdItem = findArticleId('pmc');
  if (pmcIdItem) {
    pmcIdFromItems = getText(pmcIdItem);
  }

  const title = getItemValue('Title', 'String');
  const source = getItemValue(['Source', 'FullJournalName', 'SO'], 'String');
  const rawPubDate = getItemValue(['PubDate', 'ArticleDate'], 'Date');
  const rawEPubDate = getItemValue('EPubDate', 'Date');

  return {
    pmid: toPmidString(pmid),
    ...(title !== undefined && { title }),
    authors: formatESummaryAuthors(authorsArray),
    ...(source !== undefined && { source }),
    ...(doiFromItems !== undefined && { doi: doiFromItems }),
    ...(pmcIdFromItems !== undefined && { pmcId: pmcIdFromItems }),
    ...(rawPubDate !== undefined && { rawPubDate }),
    ...(rawEPubDate !== undefined && { rawEPubDate }),
  };
}

/**
 * Extracts and formats brief summaries from ESummary XML result.
 * Handles both DocumentSummarySet (newer) and older DocSum structures.
 * Entries without a PMID are dropped; raw publication dates are standardized
 * to 'YYYY-MM-DD' and omitted when they cannot be parsed.
 *
 * @param {object | null | undefined} eSummaryResult - Parsed ESummary payload.
 * @param {object} [context] - Request context for logging; created when omitted.
 * @returns {Promise<object[]>} Brief summaries; empty when the payload is
 *   missing, carries an `ERROR` field, or contains no summaries.
 */
export async function extractBriefSummaries(eSummaryResult, context) {
  if (!eSummaryResult) return [];

  const opContext =
    context ||
    requestContextService.createRequestContext({
      operation: 'extractBriefSummariesInternal',
    });

  if (eSummaryResult.ERROR) {
    logger.warning('ESummary result contains an error', {
      ...opContext,
      errorDetails: eSummaryResult.ERROR,
    });
    return [];
  }

  let rawSummaries = [];
  if (eSummaryResult.DocumentSummarySet?.DocumentSummary) {
    const docSummaries = ensureArray(eSummaryResult.DocumentSummarySet.DocumentSummary);
    rawSummaries = docSummaries
      .map((docSummary) => parseSingleDocumentSummary(docSummary))
      .filter((s) => s.pmid);
  } else if (eSummaryResult.DocSum) {
    const docSums = ensureArray(eSummaryResult.DocSum);
    rawSummaries = docSums
      .map((docSum) => parseSingleDocSumOldXml(docSum))
      .filter((s) => s.pmid);
  }

  const processedSummaries = await Promise.all(
    rawSummaries.map(async (rawSummary) => {
      // The two dates are independent, so standardize them in parallel.
      const [pubDate, epubDate] = await Promise.all([
        standardizeESummaryDate(rawSummary.rawPubDate, opContext),
        standardizeESummaryDate(rawSummary.rawEPubDate, opContext),
      ]);
      return {
        pmid: rawSummary.pmid,
        ...(rawSummary.title !== undefined && { title: rawSummary.title }),
        ...(rawSummary.authors !== undefined && { authors: rawSummary.authors }),
        ...(rawSummary.source !== undefined && { source: rawSummary.source }),
        ...(rawSummary.doi !== undefined && { doi: rawSummary.doi }),
        ...(rawSummary.pmcId !== undefined && { pmcId: rawSummary.pmcId }),
        ...(pubDate !== undefined && { pubDate }),
        ...(epubDate !== undefined && { epubDate }),
      };
    }),
  );
  return processedSummaries;
}
//# sourceMappingURL=esummary-parser.js.map