paper-search-mcp-nodejs
Version:
A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.
412 lines • 14.9 kB
JavaScript
/**
* PubMed E-utilities API integration module.
* Works without an API key (free tier, 3 req/s) and with an API key (enhanced tier, 10 req/s).
*/
import axios from 'axios';
import * as xml2js from 'xml2js';
import { PaperFactory } from '../models/Paper.js';
import { PaperSource } from './PaperSource.js';
import { RateLimiter } from '../utils/RateLimiter.js';
/**
 * Month number (1-12) keyed by lower-cased three-letter English abbreviation.
 * A Map is used so that arbitrary input can never match an inherited
 * Object.prototype member.
 */
const MONTH_BY_ABBREVIATION = new Map([
    ['jan', 1], ['feb', 2], ['mar', 3], ['apr', 4], ['may', 5], ['jun', 6],
    ['jul', 7], ['aug', 8], ['sep', 9], ['oct', 10], ['nov', 11], ['dec', 12]
]);

/**
 * ESearch `sort` value keyed by the generic `sortBy` option.
 * PubMed has no citation-count sort, so `citations` falls back to relevance.
 */
const SORT_FIELD_BY_OPTION = new Map([
    ['relevance', 'relevance'],
    ['date', 'pub_date'],
    ['citations', 'relevance']
]);

/**
 * Normalizes an xml2js value to an array.
 * The parser below is configured with `explicitArray: false`, so a repeated
 * element parses to an array, a single occurrence to a bare value, and an
 * absent or empty one to undefined / ''.
 */
function toArray(value) {
    if (value === undefined || value === null || value === '') {
        return [];
    }
    return Array.isArray(value) ? value : [value];
}

/**
 * Returns the character data of an xml2js node.
 * Elements without attributes parse to plain strings; elements carrying
 * attributes parse to objects whose text lives under `_`.
 */
function nodeText(node) {
    if (typeof node === 'string') {
        return node;
    }
    if (node !== null && typeof node === 'object' && typeof node._ === 'string') {
        return node._;
    }
    return '';
}

/**
 * Paper source backed by the NCBI PubMed E-utilities (ESearch + EFetch).
 *
 * Searching is a two-step process: ESearch resolves a query to a PMID list,
 * then EFetch retrieves the article records in batches of `retMax`.
 */
export class PubMedSearcher extends PaperSource {
    baseApiUrl;
    rateLimiter;
    retMax = 20; // Maximum number of PMIDs fetched per EFetch request

    /**
     * @param {string} [apiKey] Optional NCBI API key; raises the request rate
     *   used by the limiter from 3 to 10 requests per second.
     */
    constructor(apiKey) {
        super('pubmed', 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils', apiKey);
        this.baseApiUrl = this.baseUrl;
        // The rate limit depends on whether an API key is available.
        const requestsPerSecond = apiKey ? 10 : 3;
        this.rateLimiter = new RateLimiter({
            requestsPerSecond,
            burstCapacity: requestsPerSecond,
            debug: process.env.NODE_ENV === 'development'
        });
    }

    /**
     * Describes what this source supports.
     * @returns {object} Capability flags and the list of supported search options.
     */
    getCapabilities() {
        return {
            search: true,
            download: false, // PubMed does not offer direct PDF download
            fullText: false, // Abstracts only, no full text
            citations: false, // The basic API does not provide citation counts
            requiresApiKey: false, // Usable without a key, at a lower rate limit
            supportedOptions: ['maxResults', 'year', 'author', 'journal', 'sortBy']
        };
    }

    /**
     * Searches PubMed and returns the parsed papers.
     *
     * @param {string} query PubMed query string.
     * @param {object} [options] Search options (maxResults, year, author, journal, sortBy, publicationType).
     * @returns {Promise<Array>} Parsed papers; empty when nothing matches.
     *   Failures are delegated to `handleHttpError` (inherited; presumably it
     *   throws — confirm against the base class).
     */
    async search(query, options = {}) {
        try {
            console.error(`🔍 PubMed Search Starting: query="${query}", options=`, options);
            // Step 1: resolve the query to a list of PMIDs via ESearch.
            const pmids = await this.searchPMIDs(query, options);
            if (pmids.length === 0) {
                return [];
            }
            // Step 2: fetch the details batch by batch. The batches are awaited
            // sequentially so every request passes through the rate limiter in order.
            const papers = [];
            for (let start = 0; start < pmids.length; start += this.retMax) {
                const batch = pmids.slice(start, start + this.retMax);
                const batchPapers = await this.fetchPaperDetails(batch);
                papers.push(...batchPapers);
            }
            return papers;
        }
        catch (error) {
            console.error(`❌ PubMed Search Error:`, error?.message);
            // Log the response body or the stack only: dumping the whole axios
            // error would also print the request config, including `api_key`.
            console.error(`📍 PubMed Error Details:`, error?.response?.data || error?.stack || error);
            this.handleHttpError(error, 'search');
        }
    }

    /**
     * Runs ESearch and returns the matching PMIDs.
     *
     * @param {string} query PubMed query string.
     * @param {object} [options] Search options; `maxResults` defaults to 10.
     * @returns {Promise<string[]>} PMIDs in the order returned by the API.
     */
    async searchPMIDs(query, options = {}) {
        await this.rateLimiter.waitForPermission();
        const searchQuery = this.buildSearchQuery(query, options);
        const params = {
            db: 'pubmed',
            term: searchQuery,
            retmax: (options.maxResults || 10).toString(),
            retmode: 'xml',
            sort: this.mapSortField(options.sortBy || 'relevance')
        };
        // Attach the API key when one is configured.
        if (this.apiKey) {
            params.api_key = this.apiKey;
        }
        const url = `${this.baseApiUrl}/esearch.fcgi`;
        console.error(`🔍 PubMed ESearch Request: GET ${url}`);
        // Never write the secret to the log; show a redacted copy instead.
        const loggedParams = params.api_key ? { ...params, api_key: '***' } : params;
        console.error(`📋 PubMed ESearch params:`, loggedParams);
        const response = await axios.get(url, { params, timeout: 15000 });
        console.error(`✅ PubMed ESearch Response: ${response.status} ${response.statusText}`);
        console.error(`📄 PubMed ESearch Response data:`, String(response.data).substring(0, 500));
        const result = await this.parseXmlResponse(response.data);
        // A single hit parses to a bare string, several hits to an array.
        const pmids = toArray(result?.eSearchResult?.IdList?.Id);
        console.error(`🎯 PubMed Found ${pmids.length} PMIDs:`, pmids.slice(0, 5));
        return pmids;
    }

    /**
     * Runs EFetch for a batch of PMIDs and parses the returned records.
     *
     * @param {string[]} pmids PMIDs to fetch.
     * @returns {Promise<Array>} Parsed papers (records that fail to parse are skipped).
     */
    async fetchPaperDetails(pmids) {
        await this.rateLimiter.waitForPermission();
        const params = {
            db: 'pubmed',
            id: pmids.join(','),
            retmode: 'xml'
        };
        // Attach the API key when one is configured.
        if (this.apiKey) {
            params.api_key = this.apiKey;
        }
        const url = `${this.baseApiUrl}/efetch.fcgi`;
        const response = await axios.get(url, { params, timeout: 30000 });
        const result = await this.parseXmlResponse(response.data);
        // A single article parses to an object, several to an array.
        const articles = toArray(result?.PubmedArticleSet?.PubmedArticle);
        return this.parsePubMedArticles(articles);
    }

    /**
     * Builds the ESearch `term` from the query and the filter options.
     *
     * @param {string} query Base query.
     * @param {object} [options] Filters: `author`, `journal`, `year`
     *   (a single year or a `start-end` range, either side optional; string or
     *   number), `publicationType` (array of type names).
     * @returns {string} Query with the filters appended as `AND` clauses.
     */
    buildSearchQuery(query, options = {}) {
        let searchQuery = query;
        // Author filter
        if (options.author) {
            searchQuery += ` AND ${options.author}[Author]`;
        }
        // Journal filter
        if (options.journal) {
            searchQuery += ` AND "${options.journal}"[Journal]`;
        }
        // Year filter
        if (options.year) {
            // Callers may pass a number (2020) as well as a string ("2018-2020").
            const year = String(options.year).trim();
            if (year.includes('-')) {
                const [startYear, endYear] = year.split('-').map((part) => part.trim());
                if (startYear && endYear) {
                    searchQuery += ` AND ${startYear}:${endYear}[Publication Date]`;
                }
                else if (startYear) {
                    searchQuery += ` AND ${startYear}:3000[Publication Date]`;
                }
                else if (endYear) {
                    searchQuery += ` AND 1900:${endYear}[Publication Date]`;
                }
            }
            else {
                searchQuery += ` AND ${year}[Publication Date]`;
            }
        }
        // Publication type filter
        if (options.publicationType && options.publicationType.length > 0) {
            const typeQuery = options.publicationType
                .map((type) => `"${type}"[Publication Type]`)
                .join(' OR ');
            searchQuery += ` AND (${typeQuery})`;
        }
        return searchQuery;
    }

    /**
     * Maps the generic `sortBy` option to an ESearch `sort` value.
     *
     * @param {string} sortBy One of `relevance`, `date`, `citations`.
     * @returns {string} ESearch sort value; unknown input yields `relevance`.
     */
    mapSortField(sortBy) {
        // The value is passed through axios `params`, which percent-encodes it,
        // so the documented `pub_date` token is used rather than `pub+date`.
        return SORT_FIELD_BY_OPTION.get(sortBy) ?? 'relevance';
    }

    /**
     * Parses an E-utilities XML payload with xml2js.
     *
     * @param {string} xmlData Raw XML text.
     * @returns {Promise<object>} Parsed document. Because `explicitArray` is
     *   false, single child elements are bare values rather than one-item arrays.
     */
    async parseXmlResponse(xmlData) {
        const parser = new xml2js.Parser({
            explicitArray: false, // Keep single children as bare values
            mergeAttrs: false,
            normalize: true,
            normalizeTags: false,
            trim: true
        });
        console.error(`🔍 PubMed XML Parsing - Data preview:`, String(xmlData).substring(0, 200));
        const result = await parser.parseStringPromise(xmlData);
        // String(...) guards against JSON.stringify returning undefined.
        console.error(`📄 PubMed XML Parsed result structure:`, String(JSON.stringify(result, null, 2)).substring(0, 1000));
        return result;
    }

    /**
     * Parses a list of `PubmedArticle` nodes, skipping those that fail to parse.
     *
     * @param {object[]} articles Parsed `PubmedArticle` nodes.
     * @returns {Array} Successfully parsed papers.
     */
    parsePubMedArticles(articles) {
        return articles
            .map((article) => this.parsePubMedArticle(article))
            .filter((paper) => paper !== null);
    }

    /**
     * Converts one `PubmedArticle` node into a paper.
     *
     * @param {object} article Parsed `PubmedArticle` node.
     * @returns {object|null} Result of `PaperFactory.create`, or null when the
     *   record cannot be parsed (the error is logged).
     */
    parsePubMedArticle(article) {
        try {
            const medlineCitation = article.MedlineCitation;
            const articleData = medlineCitation.Article;
            const journalData = articleData.Journal ?? {};
            const journalIssue = journalData.JournalIssue ?? {};
            // PMID carries a Version attribute, so it normally parses to `{ _, $ }`.
            const pmid = nodeText(medlineCitation.PMID);
            // The title becomes an object when it contains inline markup.
            const title = nodeText(articleData.ArticleTitle) || 'No title available';
            const authors = this.extractAuthors(articleData.AuthorList?.Author);
            const abstract = this.extractAbstract(articleData.Abstract);
            const journal = journalData.Title || journalData.ISOAbbreviation || '';
            const publishedDate = this.extractPublishedDate(journalIssue.PubDate);
            // A lone ArticleId parses to a plain object, which cannot be spread
            // into an array; normalize both lists first. PubmedData may be absent.
            const { doi, pmc } = this.extractArticleIds([
                ...toArray(articleData.ArticleIdList?.ArticleId),
                ...toArray(article.PubmedData?.ArticleIdList?.ArticleId)
            ]);
            const pages = articleData.Pagination?.MedlinePgn || '';
            const url = `https://pubmed.ncbi.nlm.nih.gov/${pmid}/`;
            const pdfUrl = pmc ? `https://www.ncbi.nlm.nih.gov/pmc/articles/${pmc}/pdf/` : '';
            return PaperFactory.create({
                paperId: pmid,
                title: this.cleanText(title),
                authors: authors,
                abstract: this.cleanText(abstract),
                doi: doi,
                publishedDate: publishedDate,
                pdfUrl: pdfUrl,
                url: url,
                source: 'pubmed',
                journal: journal,
                volume: journalIssue.Volume || undefined,
                issue: journalIssue.Issue || undefined,
                pages: pages || undefined,
                year: publishedDate?.getFullYear(),
                extra: {
                    pmid: pmid,
                    pmc: pmc || undefined
                }
            });
        }
        catch (error) {
            console.error('Error parsing PubMed article:', error);
            return null;
        }
    }

    /**
     * Formats the authors as display names.
     *
     * @param {object|object[]} authorList `Author` node(s); a single author
     *   arrives as a bare object rather than an array.
     * @returns {string[]} `"LastName, ForeName"` (or whichever part exists, or
     *   the collective name); authors with no usable name are omitted.
     */
    extractAuthors(authorList) {
        return toArray(authorList)
            .map((author) => {
                if (author.CollectiveName) {
                    return author.CollectiveName;
                }
                const lastName = author.LastName || '';
                const foreName = author.ForeName || author.Initials || '';
                if (lastName && foreName) {
                    return `${lastName}, ${foreName}`;
                }
                return lastName || foreName;
            })
            .filter((name) => Boolean(name));
    }

    /**
     * Flattens an `Abstract` node into plain text.
     *
     * @param {object} abstractData Parsed `Abstract` node.
     * @returns {string} Abstract text. Sections of a structured abstract are
     *   joined with spaces and prefixed with their `Label` attribute when present.
     */
    extractAbstract(abstractData) {
        return toArray(abstractData?.AbstractText)
            .map((section) => {
                // Labelled sections parse to `{ _: text, $: { Label, ... } }`.
                const text = nodeText(section);
                const label = section?.$?.Label;
                return label && text ? `${label}: ${text}` : text;
            })
            .filter((text) => text !== '')
            .join(' ');
    }

    /**
     * Builds a Date from a `PubDate` node.
     *
     * @param {object} pubDate Node with `Year`/`Month`/`Day`, or a free-form
     *   `MedlineDate` such as "2000 Nov-Dec".
     * @returns {Date|null} Local-time date; month and day default to 1. Null
     *   when no year can be determined.
     */
    extractPublishedDate(pubDate) {
        if (!pubDate) {
            return null;
        }
        // Fall back to the first four-digit number in MedlineDate when Year is absent.
        const yearText = nodeText(pubDate.Year)
            || /\b(\d{4})\b/.exec(nodeText(pubDate.MedlineDate))?.[1];
        const year = Number.parseInt(yearText, 10);
        if (Number.isNaN(year)) {
            return null;
        }
        const monthNum = pubDate.Month ? this.parseMonth(pubDate.Month) : 1;
        const parsedDay = Number.parseInt(pubDate.Day, 10);
        const dayNum = Number.isNaN(parsedDay) ? 1 : parsedDay;
        return new Date(year, monthNum - 1, dayNum);
    }

    /**
     * Parses a month given as an English name, abbreviation or number.
     *
     * @param {string} month Month text, e.g. "Jan", "January" or "01".
     * @returns {number} Month number 1-12; 1 when the input is not recognized.
     */
    parseMonth(month) {
        const text = String(month).trim();
        const named = MONTH_BY_ABBREVIATION.get(text.slice(0, 3).toLowerCase());
        if (named !== undefined) {
            return named;
        }
        const num = Number.parseInt(text, 10);
        return (num >= 1 && num <= 12) ? num : 1;
    }

    /**
     * Picks the DOI and PMC identifiers out of `ArticleId` nodes.
     *
     * @param {object|object[]} articleIds `ArticleId` node(s), each shaped
     *   `{ _: value, $: { IdType } }`.
     * @returns {{doi: string, pmc: string}} First value of each type, '' when absent.
     */
    extractArticleIds(articleIds) {
        let doi = '';
        let pmc = '';
        for (const id of toArray(articleIds)) {
            const idType = id?.$?.IdType?.toLowerCase();
            const value = nodeText(id);
            if (idType === 'doi' && !doi) {
                doi = value;
            }
            else if (idType === 'pmc' && !pmc) {
                pmc = value;
            }
        }
        return { doi, pmc };
    }

    /**
     * PubMed does not support direct PDF download, so this always throws.
     * When the record has a PMC id, the error message points to the PMC PDF URL.
     *
     * @param {string} paperId PMID of the paper.
     * @param {object} [options] Unused; kept for interface compatibility.
     * @throws {Error} Always.
     */
    async downloadPdf(paperId, options) {
        // Try to find a PMC link to include in the message.
        const paper = await this.getPaperByPmid(paperId);
        if (paper?.extra?.pmc) {
            const pmcUrl = `https://www.ncbi.nlm.nih.gov/pmc/articles/${paper.extra.pmc}/pdf/`;
            throw new Error(`PubMed paper may be available as PDF at PMC: ${pmcUrl}. Direct download not supported through this API.`);
        }
        throw new Error('PubMed does not support direct PDF download. Please access the paper through the publisher or PMC.');
    }

    /**
     * PubMed does not provide full text, so this always throws.
     *
     * @param {string} paperId Unused.
     * @param {object} [options] Unused.
     * @throws {Error} Always.
     */
    async readPaper(paperId, options) {
        throw new Error('PubMed does not provide full-text content. Only abstracts and metadata are available.');
    }

    /**
     * Fetches a single paper by PMID.
     *
     * @param {string} pmid PubMed identifier.
     * @returns {Promise<object|null>} The paper, or null when it is not found
     *   or the request fails (the error is logged).
     */
    async getPaperByPmid(pmid) {
        try {
            const papers = await this.fetchPaperDetails([pmid]);
            return papers.length > 0 ? papers[0] : null;
        }
        catch (error) {
            console.error('Error getting paper by PMID:', error);
            return null;
        }
    }

    /**
     * Looks a paper up by DOI.
     *
     * @param {string} doi DOI to search for.
     * @returns {Promise<object|null>} The first match, or null when there is
     *   none or the search fails (the error is logged).
     */
    async getPaperByDoi(doi) {
        try {
            const results = await this.search(`"${doi}"[DOI]`, { maxResults: 1 });
            return results.length > 0 ? results[0] : null;
        }
        catch (error) {
            console.error('Error getting paper by DOI from PubMed:', error);
            return null;
        }
    }

    /**
     * Returns the current state of the rate limiter.
     * @returns {*} Whatever `RateLimiter#getStatus` reports.
     */
    getRateLimiterStatus() {
        return this.rateLimiter.getStatus();
    }

    /**
     * Checks the configured API key with a minimal ESearch request.
     *
     * @returns {Promise<boolean>} False only when the API answers 400 or 403.
     *   True when no key is configured, when the request succeeds, or when it
     *   fails for another reason (e.g. network trouble).
     */
    async validateApiKey() {
        if (!this.apiKey) {
            return true; // Without a key the free tier is always usable
        }
        try {
            // Call ESearch directly: search() routes failures through
            // handleHttpError (not visible in this file), which may not preserve
            // `error.response`, so the status check below needs the raw axios error.
            await this.searchPMIDs('test', { maxResults: 1 });
            return true;
        }
        catch (error) {
            // An invalid API key is expected to produce a 400 or 403 response.
            const status = error?.response?.status;
            if (status === 400 || status === 403) {
                return false;
            }
            // Other failures may be network problems; assume the key may be valid.
            return true;
        }
    }
}
//# sourceMappingURL=PubMedSearcher.js.map