UNPKG

@tan-yong-sheng/paper-search-mcp-nodejs

Version:

A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.

319 lines 13.1 kB
/**
 * Semantic Scholar API integration module.
 * Works against the free tier and, when an API key is supplied, the keyed tier.
 */
import axios from 'axios';
import * as fs from 'fs';
import * as path from 'path';
import { pipeline } from 'stream/promises';
import { PaperFactory } from '../models/Paper.js';
import { PaperSource } from './PaperSource.js';
import { RateLimiter } from '../utils/RateLimiter.js';

/** Comma-separated field list requested for every paper record. */
const PAPER_FIELDS = [
    'paperId',
    'title',
    'abstract',
    'venue',
    'year',
    'referenceCount',
    'citationCount',
    'influentialCitationCount',
    'isOpenAccess',
    'openAccessPdf',
    'fieldsOfStudy',
    's2FieldsOfStudy',
    'publicationTypes',
    'publicationDate',
    'journal',
    'authors',
    'externalIds',
    'url'
].join(',');

/**
 * Error codes for which `search` returns an empty result instead of throwing.
 * 'TIMEOUT' is kept from the original list; 'ECONNABORTED' and 'ETIMEDOUT' are
 * the codes axios reports when the configured `timeout` elapses.
 */
const NETWORK_ERROR_CODES = new Set([
    'ENOTFOUND',
    'ECONNREFUSED',
    'TIMEOUT',
    'ECONNABORTED',
    'ETIMEDOUT'
]);

/** Let 4xx responses resolve so callers can inspect the status themselves. */
const acceptClientErrors = (status) => status < 500;

/**
 * Build the request headers for a Semantic Scholar API call.
 *
 * @param {string|undefined} apiKey - Sent as `x-api-key` when truthy.
 * @param {Object} [extraHeaders] - Additional headers merged in before the API key.
 * @returns {Object} Header map.
 */
function buildApiHeaders(apiKey, extraHeaders = {}) {
    const headers = {
        'User-Agent': 'Paper-Search-MCP/1.0 (Academic Research Tool)',
        'Accept': 'application/json',
        ...extraHeaders
    };
    if (apiKey) {
        headers['x-api-key'] = apiKey;
    }
    return headers;
}

/**
 * Compute the local PDF path for a paper, replacing characters that are
 * invalid in file names with underscores.
 *
 * @param {string} paperId - Paper identifier (may contain ':' or '/', e.g. DOI ids).
 * @param {string} savePath - Target directory.
 * @returns {string} Path of the PDF inside `savePath`.
 */
function pdfFilePath(paperId, savePath) {
    const filename = `semantic_${paperId.replace(/[/\\:*?"<>|]/g, '_')}.pdf`;
    return path.join(savePath, filename);
}

export class SemanticScholarSearcher extends PaperSource {
    rateLimiter;
    baseApiUrl;

    /**
     * @param {string} [apiKey] - Optional Semantic Scholar API key.
     */
    constructor(apiKey) {
        super('semantic', 'https://api.semanticscholar.org/graph/v1', apiKey);
        this.baseApiUrl = this.baseUrl;
        // Original author's note: the free API allows 100 requests per 5 minutes and
        // the keyed API 1000 per 5 minutes; the limits below are deliberately more
        // conservative to avoid being blocked.
        const requestsPerMinute = apiKey ? 180 : 18; // more generous with an API key
        this.rateLimiter = new RateLimiter({
            requestsPerSecond: requestsPerMinute / 60,
            burstCapacity: Math.max(3, Math.floor(requestsPerMinute / 20)), // reduced burst capacity
            debug: process.env.NODE_ENV === 'development'
        });
    }

    /**
     * Describe what this source supports.
     *
     * @returns {Object} Capability flags and the list of supported search options.
     */
    getCapabilities() {
        return {
            search: true,
            download: true, // only some papers have an open-access PDF
            fullText: false, // only partial PDF availability
            citations: true, // citation statistics are provided
            requiresApiKey: false, // the free API works, with limits
            supportedOptions: ['maxResults', 'year', 'fieldsOfStudy', 'sortBy']
        };
    }

    /**
     * Search Semantic Scholar papers.
     *
     * @param {string} query - Search query.
     * @param {Object} [options] - `maxResults` (default 10, capped at 100), `year`,
     *   `fieldsOfStudy` (array, joined with commas).
     * @returns {Promise<Array>} Parsed papers; an empty array on network errors
     *   (DNS failure, connection refused, timeout). Other failures are passed to
     *   `handleHttpError`.
     */
    async search(query, options = {}) {
        await this.rateLimiter.waitForPermission();
        try {
            const params = {
                query: query,
                limit: Math.min(options.maxResults || 10, 100), // API maximum is 100
                fields: PAPER_FIELDS
            };
            // Year filter
            if (options.year) {
                params.year = options.year;
            }
            // Field-of-study filter
            if (options.fieldsOfStudy && options.fieldsOfStudy.length > 0) {
                params.fieldsOfStudy = options.fieldsOfStudy.join(',');
            }
            const url = `${this.baseApiUrl}/paper/search`;
            const headers = buildApiHeaders(this.apiKey, {
                'Accept-Language': 'en-US,en;q=0.9'
            });
            console.error(`🔍 Semantic Scholar API Request: GET ${url}`);
            console.error(`📋 Semantic Scholar Request params:`, params);
            const response = await axios.get(url, {
                params,
                headers,
                timeout: 30000,
                maxRedirects: 5,
                validateStatus: acceptClientErrors // 4xx is handled manually below
            });
            console.error(`✅ Semantic Scholar API Response: ${response.status} ${response.statusText}`);
            // 4xx responses do not make axios throw (see validateStatus), so the
            // rate-limit and access-denied diagnostics have to be emitted here,
            // where the response and its headers are available.
            if (response.status === 429) {
                const retryAfter = response.headers?.['retry-after'];
                console.error('⚠️ Rate limit exceeded for Semantic Scholar API. Please wait before making more requests.');
                if (retryAfter) {
                    console.error(`Rate limited by Semantic Scholar API. Retry after ${retryAfter} seconds.`);
                }
                throw new Error('Rate limit exceeded. Please try again later.');
            }
            if (response.status === 403) {
                console.error('Access denied. Please check your API key or ensure you are within the free tier limits.');
            }
            if (response.status >= 400) {
                console.error(`❌ Semantic Scholar API Error: ${response.status} - ${response.data?.message || 'Unknown error'}`);
                throw new Error(`API Error: ${response.status}`);
            }
            const papers = this.parseSearchResponse(response.data);
            console.error(`📄 Semantic Scholar Parsed ${papers.length} papers`);
            return papers;
        }
        catch (error) {
            console.error(`❌ Semantic Scholar Search Error:`, error.message);
            // Network errors yield an empty result instead of an exception.
            if (NETWORK_ERROR_CODES.has(error.code)) {
                console.error('⚠️ Network error accessing Semantic Scholar API, returning empty results.');
                return [];
            }
            this.handleHttpError(error, 'search');
        }
    }

    /**
     * Fetch the details of a single paper.
     *
     * @param {string} paperId - Semantic Scholar paper id, or a prefixed id such as `DOI:...`.
     * @returns {Promise<Object|null>} Parsed paper, or `null` when the request
     *   fails or the API answers with a 4xx status.
     */
    async getPaperDetails(paperId) {
        await this.rateLimiter.waitForPermission();
        try {
            const url = `${this.baseApiUrl}/paper/${paperId}`;
            const response = await axios.get(url, {
                params: { fields: PAPER_FIELDS },
                headers: buildApiHeaders(this.apiKey),
                timeout: 30000,
                maxRedirects: 5,
                validateStatus: acceptClientErrors
            });
            // A 4xx response carries an error payload, not a paper; parsing it
            // would produce a paper with no id or title.
            if (response.status >= 400) {
                const reason = response.data?.error || response.data?.message || 'Unknown error';
                console.error(`Error getting paper details from Semantic Scholar: ${response.status} - ${reason}`);
                return null;
            }
            return this.parseSemanticPaper(response.data);
        }
        catch (error) {
            console.error('Error getting paper details from Semantic Scholar:', error.message);
            return null;
        }
    }

    /**
     * Download the open-access PDF of a paper.
     *
     * @param {string} paperId - Paper identifier.
     * @param {Object} [options] - `savePath` (default './downloads'), `overwrite`.
     * @returns {Promise<string>} Path of the PDF on disk. Failures (including a
     *   missing PDF URL and stream errors) are passed to `handleHttpError`.
     */
    async downloadPdf(paperId, options = {}) {
        try {
            // Look up the paper first to obtain its PDF URL.
            const paper = await this.getPaperDetails(paperId);
            if (!paper?.pdfUrl) {
                throw new Error(`No PDF URL available for paper ${paperId}`);
            }
            const savePath = options.savePath || './downloads';
            // Make sure the target directory exists.
            if (!fs.existsSync(savePath)) {
                fs.mkdirSync(savePath, { recursive: true });
            }
            const filePath = pdfFilePath(paperId, savePath);
            // Reuse an existing file unless overwriting was requested.
            if (fs.existsSync(filePath) && !options.overwrite) {
                return filePath;
            }
            const response = await axios.get(paper.pdfUrl, {
                responseType: 'stream',
                timeout: 60000,
                headers: {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
                }
            });
            try {
                // pipeline rejects on errors from either stream, so a failing
                // download can no longer leave the promise pending.
                await pipeline(response.data, fs.createWriteStream(filePath));
            }
            catch (streamError) {
                // Remove the partial file; otherwise a later call would find it
                // and return it as a finished download.
                await fs.promises.rm(filePath, { force: true });
                throw streamError;
            }
            return filePath;
        }
        catch (error) {
            this.handleHttpError(error, 'download PDF');
        }
    }

    /**
     * Make sure the paper's PDF is on disk and report where it is.
     * Text extraction is not implemented; the returned string says so.
     *
     * @param {string} paperId - Paper identifier.
     * @param {Object} [options] - Same options as `downloadPdf`.
     * @returns {Promise<string>} Message containing the PDF path.
     */
    async readPaper(paperId, options = {}) {
        try {
            const savePath = options.savePath || './downloads';
            const filePath = pdfFilePath(paperId, savePath);
            // Download the PDF first when it is not present yet.
            if (!fs.existsSync(filePath)) {
                await this.downloadPdf(paperId, options);
            }
            return `PDF file downloaded at: ${filePath}. Full text extraction requires additional PDF parsing implementation.`;
        }
        catch (error) {
            this.handleHttpError(error, 'read paper');
        }
    }

    /**
     * Fetch a paper by DOI.
     *
     * @param {string} doi - DOI without the `DOI:` prefix.
     * @returns {Promise<Object|null>} Parsed paper or `null`.
     */
    async getPaperByDoi(doi) {
        try {
            return await this.getPaperDetails(`DOI:${doi}`);
        }
        catch (error) {
            console.error('Error getting paper by DOI from Semantic Scholar:', error);
            return null;
        }
    }

    /**
     * Parse a search response body.
     *
     * @param {Object} data - Response body; papers are expected under `data.data`.
     * @returns {Array} Parsed papers, with unparseable entries dropped.
     */
    parseSearchResponse(data) {
        if (!data.data || !Array.isArray(data.data)) {
            return [];
        }
        return data.data
            .map((item) => this.parseSemanticPaper(item))
            .filter((paper) => paper !== null);
    }

    /**
     * Convert one Semantic Scholar record into a paper object.
     *
     * @param {Object} item - Raw API record.
     * @returns {Object|null} Result of `PaperFactory.create`, or `null` when
     *   parsing throws.
     */
    parseSemanticPaper(item) {
        try {
            // Authors
            const authors = item.authors?.map((author) => author.name) || [];
            // Publication date: prefer the full date, fall back to Jan 1 of the year.
            const publishedDate = item.publicationDate
                ? this.parseDate(item.publicationDate)
                : (item.year ? new Date(item.year, 0, 1) : null);
            // PDF URL
            let pdfUrl = '';
            if (item.openAccessPdf?.url) {
                pdfUrl = item.openAccessPdf.url;
            }
            else if (item.openAccessPdf?.disclaimer) {
                // Try to pull a URL out of the disclaimer text.
                const urlMatch = item.openAccessPdf.disclaimer.match(/https?:\/\/[^\s,)]+/);
                if (urlMatch) {
                    pdfUrl = urlMatch[0];
                }
            }
            // DOI
            const doi = item.externalIds?.DOI || '';
            // Categories from both field-of-study lists
            const fieldsOfStudy = item.fieldsOfStudy || [];
            const s2Fields = item.s2FieldsOfStudy?.map((field) => field.category) || [];
            const categories = [...fieldsOfStudy, ...s2Fields];
            // Landing page URL
            const url = item.url || `https://www.semanticscholar.org/paper/${item.paperId}`;
            return PaperFactory.create({
                paperId: item.paperId,
                title: this.cleanText(item.title),
                authors: authors,
                abstract: this.cleanText(item.abstract || ''),
                doi: doi,
                publishedDate: publishedDate,
                pdfUrl: pdfUrl,
                url: url,
                source: 'semantic',
                categories: [...new Set(categories)], // de-duplicated
                keywords: [],
                citationCount: item.citationCount || 0,
                journal: item.venue || item.journal?.name || '',
                volume: item.journal?.volume || undefined,
                pages: item.journal?.pages || undefined,
                year: item.year,
                extra: {
                    semanticScholarId: item.paperId,
                    referenceCount: item.referenceCount || 0,
                    influentialCitationCount: item.influentialCitationCount || 0,
                    isOpenAccess: item.isOpenAccess || false,
                    publicationTypes: item.publicationTypes || [],
                    externalIds: item.externalIds || {}
                }
            });
        }
        catch (error) {
            console.error('Error parsing Semantic Scholar paper:', error);
            return null;
        }
    }

    /**
     * Report the rate limiter's current status.
     *
     * @returns {*} Whatever `RateLimiter#getStatus` returns.
     */
    getRateLimiterStatus() {
        return this.rateLimiter.getStatus();
    }

    /**
     * Check whether the configured API key is accepted.
     *
     * Issues a minimal search request and inspects the HTTP status directly;
     * going through `search` would lose the status, because `search` reports
     * 4xx responses as plain errors without a `response` attached.
     *
     * @returns {Promise<boolean>} `false` only when the API answers 401 or 403.
     *   `true` when no key is configured, the key is accepted, or the check
     *   could not be completed (e.g. network problems).
     */
    async validateApiKey() {
        if (!this.apiKey) {
            return true; // no key: the free tier is used
        }
        await this.rateLimiter.waitForPermission();
        try {
            const response = await axios.get(`${this.baseApiUrl}/paper/search`, {
                params: { query: 'test', limit: 1, fields: 'paperId' },
                headers: buildApiHeaders(this.apiKey),
                timeout: 30000,
                maxRedirects: 5,
                validateStatus: acceptClientErrors
            });
            return response.status !== 401 && response.status !== 403;
        }
        catch (error) {
            // Network failures and 5xx responses say nothing about the key.
            console.error('Could not validate Semantic Scholar API key:', error.message);
            return true;
        }
    }
}
//# sourceMappingURL=SemanticScholarSearcher.js.map