UNPKG

@tan-yong-sheng/paper-search-mcp-nodejs

Version:

A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.

401 lines 16.7 kB
/**
 * Sci-Hub paper searcher and downloader.
 * Supports rotating across multiple mirror sites, automatic health checks and failover.
 */
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as fs from 'fs';
import * as path from 'path';
import { PaperSource } from './PaperSource.js';
import { PaperFactory } from '../models/Paper.js';

/**
 * Paper source backed by Sci-Hub mirror sites.
 *
 * Keeps a list of mirrors, periodically probes them, orders them so that
 * working (and faster) mirrors come first, and fails over to the next mirror
 * when a request fails.
 */
export class SciHubSearcher extends PaperSource {
    /** Mirror records: { url, isWorking, failureCount, lastChecked?, responseTime? }. */
    mirrorSites;
    currentMirrorIndex = 0;
    axiosInstance;
    maxRetries = 3;
    mirrorTestTimeout = 5000; // 5 seconds
    lastHealthCheck = null;
    healthCheckInterval = 300000; // 5 minutes
    /** Promise of the health check currently running, or null when none is in flight. */
    healthCheckInFlight = null;

    constructor() {
        super('Sci-Hub', 'https://sci-hub.se');
        // Initial mirror list; every mirror is assumed to work until probed.
        this.mirrorSites = [
            { url: 'https://sci-hub.se', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.st', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.ru', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.ren', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.mksa.top', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.ee', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.wf', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.yt', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.sci-hub.se', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.sci-hub.st', isWorking: true, failureCount: 0 },
            { url: 'https://sci-hub.sci-hub.ru', isWorking: true, failureCount: 0 },
        ];
        this.axiosInstance = axios.create({
            timeout: 15000,
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.5',
                'Accept-Encoding': 'gzip, deflate, br',
                'DNT': '1',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
            },
            maxRedirects: 5,
            // 4xx responses are returned to the caller instead of throwing.
            validateStatus: (status) => status < 500
        });
        // Kick off a health check at start-up. A constructor cannot await, so the
        // promise is deliberately not awaited; failures are logged, never unhandled.
        void this.checkMirrorHealth().catch((error) => {
            console.error('Sci-Hub initial health check failed:', error?.message ?? error);
        });
    }

    /**
     * Describe what this source supports.
     * @returns {object} Capability flags and the list of supported search options.
     */
    getCapabilities() {
        return {
            search: true,
            download: true,
            fullText: false,
            citations: false,
            requiresApiKey: false,
            supportedOptions: ['maxResults']
        };
    }

    /**
     * Probe every mirror and reorder the mirror list (working first, fastest first).
     *
     * Only one check runs at a time: callers arriving while a check is in flight
     * share its promise. Overlapping checks would otherwise sort the list while
     * another check is still writing results into it.
     * @returns {Promise<void>}
     */
    async checkMirrorHealth() {
        if (!this.healthCheckInFlight) {
            this.healthCheckInFlight = this.runMirrorHealthCheck().finally(() => {
                this.healthCheckInFlight = null;
            });
        }
        return this.healthCheckInFlight;
    }

    /**
     * Perform one full health-check pass. Use checkMirrorHealth() instead of
     * calling this directly so that passes never overlap.
     * @returns {Promise<void>}
     */
    async runMirrorHealthCheck() {
        console.log('🔍 Checking Sci-Hub mirror sites health...');
        // Probes update each mirror object in place, so results stay attached to
        // the right mirror regardless of its position in the array.
        await Promise.allSettled(this.mirrorSites.map((mirror) => this.probeMirror(mirror)));
        // Working mirrors first; among working mirrors, lowest response time first.
        this.mirrorSites.sort((a, b) => {
            if (a.isWorking !== b.isWorking) {
                return a.isWorking ? -1 : 1;
            }
            if (!a.isWorking) {
                return 0;
            }
            const aTime = a.responseTime ?? Infinity;
            const bTime = b.responseTime ?? Infinity;
            // Avoid Infinity - Infinity (NaN) when neither mirror has a timing.
            return aTime === bTime ? 0 : aTime - bTime;
        });
        this.lastHealthCheck = new Date();
        const workingCount = this.mirrorSites.filter(m => m.isWorking).length;
        console.log(`✅ Health check complete: ${workingCount}/${this.mirrorSites.length} mirrors working`);
        if (workingCount === 0) {
            console.error('⚠️ Warning: No Sci-Hub mirrors are currently accessible!');
        }
    }

    /**
     * Probe a single mirror and record the outcome on the mirror object.
     * Never rejects: request failures are recorded as a non-working mirror.
     * @param {object} mirror - Entry of this.mirrorSites; mutated in place.
     * @returns {Promise<void>}
     */
    async probeMirror(mirror) {
        const startTime = Date.now();
        try {
            const response = await axios.get(mirror.url, {
                timeout: this.mirrorTestTimeout,
                headers: this.axiosInstance.defaults.headers,
                maxRedirects: 2
            });
            const responseTime = Date.now() - startTime;
            // The page must look like Sci-Hub; compare case-insensitively and
            // tolerate non-string bodies.
            const html = typeof response.data === 'string' ? response.data.toLowerCase() : '';
            const isValidSciHub = html.includes('sci-hub') || html.includes('alexandra elbakyan');
            Object.assign(mirror, {
                lastChecked: new Date(),
                responseTime,
                isWorking: response.status === 200 && isValidSciHub,
                failureCount: 0
            });
            if (mirror.isWorking) {
                console.log(`✅ ${mirror.url} - OK (${responseTime}ms)`);
            }
            else {
                console.log(`⚠️ ${mirror.url} - Invalid response`);
            }
        }
        catch (error) {
            Object.assign(mirror, {
                lastChecked: new Date(),
                isWorking: false,
                failureCount: mirror.failureCount + 1
            });
            console.log(`❌ ${mirror.url} - Failed`);
        }
    }

    /**
     * Get the URL of the first working mirror, refreshing the health data when
     * it is older than healthCheckInterval.
     * @returns {Promise<string>} Base URL of a working mirror.
     * @throws {Error} When no mirror works even after a fresh health check.
     */
    async getCurrentMirror() {
        const isStale = !this.lastHealthCheck ||
            Date.now() - this.lastHealthCheck.getTime() > this.healthCheckInterval;
        if (isStale) {
            await this.checkMirrorHealth();
        }
        const workingMirror = this.mirrorSites.find(m => m.isWorking);
        if (workingMirror) {
            return workingMirror.url;
        }
        // Nothing usable: probe once more before giving up.
        await this.checkMirrorHealth();
        const retryMirror = this.mirrorSites.find(m => m.isWorking);
        if (!retryMirror) {
            throw new Error('No working Sci-Hub mirrors available');
        }
        return retryMirror.url;
    }

    /**
     * Record a failure for a mirror and pick the mirror to try next.
     * A mirror is marked as not working once it has accumulated 3 failures.
     * @param {string} mirrorUrl - Base URL of the mirror that just failed.
     * @returns {Promise<string>} Base URL of the next mirror to try.
     * @throws {Error} When no working mirror can be found.
     */
    async markMirrorFailed(mirrorUrl) {
        const mirrorIndex = this.mirrorSites.findIndex(m => m.url === mirrorUrl);
        if (mirrorIndex !== -1) {
            const failedMirror = this.mirrorSites[mirrorIndex];
            failedMirror.failureCount++;
            if (failedMirror.failureCount >= 3) {
                failedMirror.isWorking = false;
                console.log(`❌ Mirror ${mirrorUrl} marked as failed after multiple attempts`);
            }
        }
        // Prefer the next working mirror after the failed one in the list.
        const nextWorkingMirror = this.mirrorSites.find((m, idx) => idx > mirrorIndex && m.isWorking);
        if (nextWorkingMirror) {
            return nextWorkingMirror.url;
        }
        // No mirrors left after this one: re-probe everything and start over.
        await this.checkMirrorHealth();
        return this.getCurrentMirror();
    }

    /**
     * Look up a paper by DOI or URL.
     * Sci-Hub works from a DOI or a direct URL, so any other query returns no results.
     * @param {string} query - DOI (optionally prefixed with "doi:") or http(s) URL.
     * @param {object} [options] - Accepted for interface compatibility; not used here.
     * @returns {Promise<Array>} A one-element array with the paper, or an empty array.
     */
    async search(query, options) {
        if (!this.isValidDOIOrURL(query)) {
            console.log('Sci-Hub requires a valid DOI or paper URL');
            return [];
        }
        try {
            const paperInfo = await this.fetchPaperInfo(query);
            if (paperInfo) {
                return [paperInfo];
            }
        }
        catch (error) {
            // Best effort: a failed lookup is reported as "no results".
            console.error('Sci-Hub search error:', error);
        }
        return [];
    }

    /**
     * Check whether the input looks like a DOI, a "doi:"-prefixed DOI or an http(s) URL.
     * @param {string} input
     * @returns {boolean}
     */
    isValidDOIOrURL(input) {
        // DOI pattern: 10.xxxx/xxxxx
        const doiPattern = /^10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+$/;
        // URL pattern
        const urlPattern = /^https?:\/\/.+/;
        // Also accept the "doi:" prefixed form
        const doiPrefixPattern = /^doi:\s*10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+$/i;
        return doiPattern.test(input) || urlPattern.test(input) || doiPrefixPattern.test(input);
    }

    /**
     * Fetch paper information from Sci-Hub, failing over between mirrors.
     * @param {string} doiOrUrl - DOI (optionally prefixed with "doi:") or URL.
     * @returns {Promise<object|null>} The paper, or null when no mirror yielded
     *   a PDF within maxRetries attempts.
     * @throws {Error} When no working mirror is available.
     */
    async fetchPaperInfo(doiOrUrl) {
        let currentMirror = await this.getCurrentMirror();
        // Strip an optional "doi:" prefix.
        const cleanedQuery = doiOrUrl.replace(/^doi:\s*/i, '');
        for (let attempt = 0; attempt < this.maxRetries; attempt++) {
            try {
                const paper = await this.fetchFromMirror(currentMirror, cleanedQuery);
                if (paper) {
                    return paper;
                }
            }
            catch (error) {
                console.error(`Error fetching from ${currentMirror}:`, error.message);
            }
            // NOTE(review): a "paper not found" page is counted against the mirror
            // exactly like a network failure — confirm this is intended.
            currentMirror = await this.markMirrorFailed(currentMirror);
        }
        return null;
    }

    /**
     * Request the paper page from one mirror and build a paper from it.
     * @param {string} mirrorUrl - Base URL of the mirror to query.
     * @param {string} cleanedQuery - DOI or URL without a "doi:" prefix.
     * @returns {Promise<object|null>} The paper, or null when the mirror did not
     *   answer with status 200 or the page exposes no PDF URL.
     */
    async fetchFromMirror(mirrorUrl, cleanedQuery) {
        const searchUrl = `${mirrorUrl}/${cleanedQuery}`;
        console.log(`🔍 Searching on ${mirrorUrl} for: ${cleanedQuery}`);
        const response = await this.axiosInstance.get(searchUrl);
        if (response.status !== 200) {
            console.log(`Unexpected status ${response.status} from ${mirrorUrl}`);
            return null;
        }
        const $ = cheerio.load(response.data);
        const pdfUrl = this.extractPdfUrl($, searchUrl);
        if (!pdfUrl) {
            console.log(`Paper not found on ${mirrorUrl}`);
            return null;
        }
        const title = this.extractTitle($);
        return PaperFactory.create({
            paperId: cleanedQuery,
            title: title || `Paper: ${cleanedQuery}`,
            source: 'Sci-Hub',
            authors: [],
            abstract: '',
            doi: this.extractDoi(cleanedQuery),
            publishedDate: null,
            pdfUrl: pdfUrl,
            url: searchUrl,
            extra: {
                mirror: mirrorUrl,
                fetchedAt: new Date().toISOString()
            }
        });
    }

    /**
     * Find the PDF URL on a Sci-Hub result page and make it absolute.
     * @param {Function} $ - cheerio root of the loaded page.
     * @param {string} pageUrl - URL the page was fetched from; base for relative URLs.
     * @returns {string} Absolute PDF URL, or '' when none was found.
     */
    extractPdfUrl($, pageUrl) {
        // The first selector that matches an element wins, even if it has no src.
        const embedSelectors = ['#pdf', 'embed[type="application/pdf"]', 'iframe[src*=".pdf"]'];
        const embedSelector = embedSelectors.find((selector) => $(selector).length > 0);
        let rawUrl = '';
        if (embedSelector) {
            rawUrl = $(embedSelector).attr('src') || '';
        }
        else {
            // Fall back to a download button whose onclick navigates to the PDF.
            const downloadButton = $('button[onclick*="download"]');
            if (downloadButton.length > 0) {
                const onclickAttr = downloadButton.attr('onclick') || '';
                const match = onclickAttr.match(/location\.href='([^']+)'/);
                if (match) {
                    rawUrl = match[1];
                }
            }
        }
        if (!rawUrl || /^https?:\/\//i.test(rawUrl)) {
            return rawUrl;
        }
        // Resolve "//host/x", "/x" and "x" against the page URL, like a browser would.
        try {
            return new URL(rawUrl, pageUrl).href;
        }
        catch {
            return '';
        }
    }

    /**
     * Derive a title from the page: the text up to the first period of the
     * #citation element when present, otherwise the <title> text, with Sci-Hub
     * branding removed.
     * @param {Function} $ - cheerio root of the loaded page.
     * @returns {string} Title, possibly empty.
     */
    extractTitle($) {
        let title = $('title').text();
        const citation = $('#citation').text();
        if (citation) {
            // NOTE(review): this takes the first sentence of the citation, which may
            // be the author list rather than the title — verify against real pages.
            const titleMatch = citation.match(/([^.]+)\./);
            if (titleMatch) {
                title = titleMatch[1].trim();
            }
        }
        return title.replace(/\s*\|\s*Sci-Hub.*$/, '')
            .replace(/Sci-Hub\s*:\s*/, '')
            .trim();
    }

    /**
     * Extract a DOI from a bare DOI or from a URL that embeds one.
     * @param {string} query - DOI or URL.
     * @returns {string} The DOI, or '' when the query contains none.
     */
    extractDoi(query) {
        const match = query.match(/\b10\.\d{4,}\/[-._;()\/:a-zA-Z0-9]+/);
        return match ? match[0] : '';
    }

    /**
     * Download the PDF of a paper.
     * @param {string} paperId - DOI or URL identifying the paper.
     * @param {object} [options]
     * @param {string} [options.savePath='./downloads'] - Target directory.
     * @param {boolean} [options.overwrite] - Re-download even if the file exists.
     * @returns {Promise<string>} Path of the downloaded (or already existing) file.
     * @throws {Error} When no PDF can be found or every download attempt fails.
     */
    async downloadPdf(paperId, options) {
        const savePath = options?.savePath || './downloads';
        // Make sure the download directory exists.
        if (!fs.existsSync(savePath)) {
            fs.mkdirSync(savePath, { recursive: true });
        }
        const fileName = `${paperId.replace(/[^a-zA-Z0-9]/g, '_')}.pdf`;
        const filePath = path.join(savePath, fileName);
        // Check for an existing file before doing any network work.
        if (fs.existsSync(filePath) && !options?.overwrite) {
            console.log(`File already exists: ${filePath}`);
            return filePath;
        }
        const paperInfo = await this.fetchPaperInfo(paperId);
        if (!paperInfo || !paperInfo.pdfUrl) {
            throw new Error(`Cannot find PDF for: ${paperId}`);
        }
        let currentPdfUrl = paperInfo.pdfUrl;
        for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
            try {
                console.log(`📥 Downloading PDF from: ${currentPdfUrl}`);
                const response = await this.axiosInstance.get(currentPdfUrl, {
                    responseType: 'stream',
                    timeout: 60000 // 60 seconds for download
                });
                if (response.status !== 200) {
                    // Discard the unwanted body so the connection is released.
                    response.data?.destroy?.();
                    throw new Error(`Failed to download PDF: status ${response.status}`);
                }
                // Awaited inside the try so that stream errors reach the retry logic.
                await this.writeStreamToFile(response.data, filePath);
                console.log(`✅ PDF downloaded successfully: ${filePath}`);
                return filePath;
            }
            catch (error) {
                console.error(`Download attempt ${attempt} failed:`, error.message);
                if (attempt < this.maxRetries) {
                    // Look the paper up again (the PDF URL may have changed).
                    const updatedInfo = await this.fetchPaperInfo(paperId);
                    if (updatedInfo?.pdfUrl && updatedInfo.pdfUrl !== currentPdfUrl) {
                        currentPdfUrl = updatedInfo.pdfUrl;
                        console.log('Trying updated PDF URL...');
                    }
                    else {
                        // Same URL: back off before retrying.
                        await new Promise(resolve => setTimeout(resolve, 2000 * attempt));
                    }
                }
            }
        }
        throw new Error(`Failed to download PDF after ${this.maxRetries} attempts`);
    }

    /**
     * Write a readable stream to a file.
     * Data goes to "<filePath>.part" first and is renamed on success, so a failed
     * transfer never leaves a truncated file at filePath.
     * @param {object} source - Readable stream with the file contents.
     * @param {string} filePath - Final destination path.
     * @returns {Promise<string>} Resolves with filePath once the file is complete;
     *   rejects on a read, write or rename error.
     */
    writeStreamToFile(source, filePath) {
        const partialPath = `${filePath}.part`;
        return new Promise((resolve, reject) => {
            const writer = fs.createWriteStream(partialPath);
            let settled = false;
            const fail = (error) => {
                if (settled) {
                    return;
                }
                settled = true;
                source.destroy();
                writer.destroy();
                // Best-effort cleanup; the original error is what gets reported.
                fs.rm(partialPath, { force: true }, () => reject(error));
            };
            // pipe() does not forward source errors to the writer, so listen on both.
            source.on('error', fail);
            writer.on('error', fail);
            writer.on('finish', () => {
                if (settled) {
                    return;
                }
                settled = true;
                fs.rename(partialPath, filePath, (error) => {
                    if (error) {
                        reject(error);
                    }
                    else {
                        resolve(filePath);
                    }
                });
            });
            source.pipe(writer);
        });
    }

    /**
     * "Read" a paper. Sci-Hub only provides PDF downloads, not text extraction,
     * so this downloads the PDF and returns a message with its location.
     * @param {string} paperId - DOI or URL identifying the paper.
     * @param {object} [options] - Passed through to downloadPdf.
     * @returns {Promise<string>} Message containing the path of the downloaded PDF.
     */
    async readPaper(paperId, options) {
        const filePath = await this.downloadPdf(paperId, options);
        return `PDF downloaded to: ${filePath}. Please use a PDF reader to view the content.`;
    }

    /**
     * Get a paper by DOI.
     * @param {string} doi
     * @returns {Promise<object|null>} The paper, or null when it was not found.
     */
    async getPaperByDoi(doi) {
        return this.fetchPaperInfo(doi);
    }

    /**
     * Report the status of every mirror.
     * @returns {Array<{url: string, status: string, responseTime: (number|undefined)}>}
     */
    getMirrorStatus() {
        return this.mirrorSites.map(mirror => ({
            url: mirror.url,
            status: mirror.isWorking ? 'Working' : 'Failed',
            responseTime: mirror.responseTime
        }));
    }

    /**
     * Manually trigger a health check.
     * @returns {Promise<void>}
     */
    async forceHealthCheck() {
        await this.checkMirrorHealth();
    }
}
//# sourceMappingURL=SciHubSearcher.js.map