UNPKG

@tan-yong-sheng/paper-search-mcp-nodejs

Version:

A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.

227 lines 8.41 kB
/**
 * arXiv API integration module.
 * Implements paper search and PDF download on top of the arXiv API (v1.1).
 */
import axios from 'axios';
import * as fs from 'fs';
import * as path from 'path';
import * as xml2js from 'xml2js';
import { PaperFactory } from '../models/Paper.js';
import { PaperSource } from './PaperSource.js';

/**
 * Generic sort keys accepted by this module mapped to arXiv `sortBy` values.
 * A Map is used so that unexpected keys (e.g. "constructor") never resolve
 * to members inherited from Object.prototype.
 */
const SORT_FIELDS = new Map([
    ['relevance', 'relevance'],
    ['date', 'submittedDate'],
    ['citations', 'submittedDate'] // arXiv cannot sort by citations; use date instead
]);

/** Path marker that precedes the identifier in an arXiv abstract URL. */
const ABS_MARKER = '/abs/';

/**
 * Build the local PDF file name for a paper id.
 * Path separators are replaced so that old-style ids such as
 * "hep-th/9901001" (and any hostile input) stay inside the save directory.
 *
 * @param {string} paperId - arXiv identifier.
 * @returns {string} File name ending in ".pdf" without any path separator.
 */
function pdfFileName(paperId) {
    return `${String(paperId).replace(/[\\/]/g, '_')}.pdf`;
}

export class ArxivSearcher extends PaperSource {
    constructor() {
        super('arxiv', 'https://export.arxiv.org/api');
    }

    /**
     * Describe what this source supports.
     *
     * @returns {object} Capability flags and the list of supported search options.
     */
    getCapabilities() {
        return {
            search: true,
            download: true,
            fullText: true,
            citations: false, // arXiv itself does not provide citation counts
            requiresApiKey: false,
            supportedOptions: ['maxResults', 'year', 'author', 'category', 'sortBy', 'sortOrder']
        };
    }

    /**
     * Search arXiv papers.
     *
     * @param {string} query - Search expression passed to arXiv `search_query`.
     * @param {object} [options] - Optional filters (`author`, `category`, `year`)
     *   and paging/sorting settings (`maxResults`, `sortBy`, `sortOrder`).
     * @returns {Promise<Array>} Parsed papers. Failures are delegated to the
     *   inherited `handleHttpError` (defined outside this file; presumably it throws).
     */
    async search(query, options = {}) {
        try {
            const searchQuery = this.buildSearchQuery(query, options);
            const url = `${this.baseUrl}/query`;
            const params = {
                search_query: searchQuery,
                max_results: options.maxResults || 10,
                sortBy: this.mapSortField(options.sortBy || 'relevance'),
                sortOrder: options.sortOrder || 'descending'
            };
            // Diagnostics go to stderr via console.error.
            console.error(`🔍 arXiv API Request: GET ${url}`);
            console.error(`📋 arXiv Request params:`, params);
            const response = await axios.get(url, {
                params,
                timeout: 30000,
                headers: {
                    'User-Agent': 'Paper-Search-MCP/1.0 (Academic Research Tool)'
                }
            });
            console.error(`✅ arXiv API Response: ${response.status} ${response.statusText}, Data length: ${response.data?.length || 0}`);
            const papers = await this.parseSearchResponse(response.data);
            console.error(`📄 arXiv Parsed ${papers.length} papers`);
            return papers;
        }
        catch (error) {
            console.error(`❌ arXiv Search Error:`, error.message);
            this.handleHttpError(error, 'search');
        }
    }

    /**
     * Download the PDF of a paper.
     *
     * The body is streamed into a temporary "<name>.pdf.part" file that is
     * renamed to its final name only after the write finished, so an
     * interrupted download can never be mistaken for a cached file later.
     *
     * @param {string} paperId - arXiv identifier (new or old style).
     * @param {object} [options] - `savePath` (default "./downloads") and `overwrite`.
     * @returns {Promise<string>} Path of the PDF on disk.
     */
    async downloadPdf(paperId, options = {}) {
        try {
            const savePath = options.savePath || './downloads';
            const pdfUrl = `https://arxiv.org/pdf/${paperId}.pdf`;
            // Make sure the target directory exists.
            if (!fs.existsSync(savePath)) {
                fs.mkdirSync(savePath, { recursive: true });
            }
            const filePath = path.join(savePath, pdfFileName(paperId));
            // Reuse an existing file unless the caller asked to overwrite it.
            if (fs.existsSync(filePath) && !options.overwrite) {
                return filePath;
            }
            const response = await axios.get(pdfUrl, {
                responseType: 'stream',
                timeout: 60000
            });
            const tempPath = `${filePath}.part`;
            const writer = fs.createWriteStream(tempPath);
            return new Promise((resolve, reject) => {
                let settled = false;
                const fail = (err) => {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    response.data.destroy();
                    writer.destroy();
                    // Best-effort removal of the partial file; the original error wins.
                    fs.unlink(tempPath, () => reject(err));
                };
                // pipe() does not forward source errors, so both ends are watched.
                response.data.on('error', fail);
                writer.on('error', fail);
                writer.on('finish', () => {
                    if (settled) {
                        return;
                    }
                    settled = true;
                    fs.rename(tempPath, filePath, (err) => (err ? reject(err) : resolve(filePath)));
                });
                response.data.pipe(writer);
            });
        }
        catch (error) {
            this.handleHttpError(error, 'download PDF');
        }
    }

    /**
     * Read the full text of a paper (to be extracted from its PDF).
     *
     * PDF parsing is not implemented yet: the PDF is downloaded when it is
     * missing and an informational message containing its path is returned.
     *
     * @param {string} paperId - arXiv identifier.
     * @param {object} [options] - Same options as `downloadPdf`.
     * @returns {Promise<string>} Informational message with the PDF location.
     */
    async readPaper(paperId, options = {}) {
        try {
            const savePath = options.savePath || './downloads';
            // Must match the file name chosen by downloadPdf.
            const filePath = path.join(savePath, pdfFileName(paperId));
            // Download the PDF first when it is not on disk yet.
            if (!fs.existsSync(filePath)) {
                await this.downloadPdf(paperId, options);
            }
            // A PDF parsing library is required here; return a hint for now.
            return `PDF file downloaded at: ${filePath}. Full text extraction requires additional PDF parsing implementation.`;
        }
        catch (error) {
            this.handleHttpError(error, 'read paper');
        }
    }

    /**
     * Build the arXiv `search_query` expression.
     *
     * Clauses are joined with " AND ". An empty query contributes no clause,
     * so filters alone still produce a well-formed expression.
     *
     * @param {string} query - Base query text.
     * @param {object} [options] - `author`, `category` and `year` filters.
     *   `year` may be a number or a string: "2020", "2020-2022" or "2020-".
     *   A range without a start year (e.g. "-2020") adds no date filter.
     * @returns {string} Combined query expression.
     */
    buildSearchQuery(query, options = {}) {
        const clauses = [];
        if (query) {
            clauses.push(query);
        }
        // Author filter.
        if (options.author) {
            clauses.push(`au:"${options.author}"`);
        }
        // Category filter.
        if (options.category) {
            clauses.push(`cat:${options.category}`);
        }
        // Year filter (arXiv works with submission date ranges).
        if (options.year) {
            // Callers may pass a number; normalise before using string methods.
            const year = String(options.year).trim();
            if (year.includes('-')) {
                // Year range.
                const [startYear, endYear] = year.split('-').map((part) => part.trim());
                if (startYear) {
                    const upperBound = endYear ? `${endYear}1231` : '*';
                    clauses.push(`submittedDate:[${startYear}0101 TO ${upperBound}]`);
                }
            }
            else {
                // Single year.
                clauses.push(`submittedDate:[${year}0101 TO ${year}1231]`);
            }
        }
        return clauses.join(' AND ');
    }

    /**
     * Map a generic sort key to the arXiv `sortBy` value.
     *
     * @param {string} sortBy - "relevance", "date" or "citations".
     * @returns {string} arXiv sort field; "relevance" for unknown keys.
     */
    mapSortField(sortBy) {
        return SORT_FIELDS.get(sortBy) ?? 'relevance';
    }

    /**
     * Parse the Atom XML returned by the search endpoint.
     *
     * @param {string} xmlData - Raw XML response body.
     * @returns {Promise<Array>} Parsed papers; an empty array when the feed has
     *   no entries or the XML cannot be parsed (the parse error is logged).
     */
    async parseSearchResponse(xmlData) {
        try {
            const parser = new xml2js.Parser();
            const result = await parser.parseStringPromise(xmlData);
            const rawEntries = result?.feed?.entry;
            if (!rawEntries) {
                return [];
            }
            const entries = Array.isArray(rawEntries) ? rawEntries : [rawEntries];
            return entries
                .map((entry) => this.parseArxivEntry(entry))
                .filter((paper) => paper !== null);
        }
        catch (error) {
            console.error('Error parsing arXiv response:', error);
            return [];
        }
    }

    /**
     * Convert a single parsed Atom entry into a paper object.
     *
     * @param {object} entry - Entry object produced by xml2js.
     * @returns {object|null} Result of `PaperFactory.create`, or null when the
     *   entry cannot be parsed (the error is logged).
     */
    parseArxivEntry(entry) {
        try {
            // Paper id: everything after "/abs/" so that old-style ids such as
            // "hep-th/9901001v1" keep their archive prefix.
            const arxivUrl = entry.id[0];
            const absIndex = arxivUrl.indexOf(ABS_MARKER);
            const paperId = absIndex >= 0
                ? arxivUrl.slice(absIndex + ABS_MARKER.length)
                : (arxivUrl.split('/').pop() || '');
            // Title.
            const title = entry.title[0];
            // Authors.
            const authorData = entry.author;
            const authors = Array.isArray(authorData)
                ? authorData.map((a) => a.name[0])
                : [authorData.name[0]];
            // Abstract.
            const abstract = entry.summary[0];
            // Dates (parseDate is inherited; it is not defined in this file).
            const publishedDate = this.parseDate(entry.published[0]);
            const updatedDate = this.parseDate(entry.updated[0]);
            // DOI.
            const doi = entry['arxiv:doi']?.[0] || '';
            // Categories.
            const primaryCategory = entry['arxiv:primary_category']?.[0]?.$?.term || '';
            const categories = entry.category?.map((cat) => cat.$.term) || [primaryCategory];
            // Links.
            const pdfLink = entry.link.find((link) => link.$.type === 'application/pdf');
            const pdfUrl = pdfLink?.$.href || `https://arxiv.org/pdf/${paperId}.pdf`;
            // Year.
            const year = publishedDate?.getFullYear();
            return PaperFactory.create({
                paperId: paperId,
                title: this.cleanText(title),
                authors: authors,
                abstract: this.cleanText(abstract),
                doi: doi,
                publishedDate: publishedDate,
                pdfUrl: pdfUrl,
                url: `https://arxiv.org/abs/${paperId}`,
                source: 'arxiv',
                updatedDate: updatedDate || undefined,
                categories: categories,
                keywords: [], // arXiv usually does not provide keywords
                citationCount: 0, // arXiv itself does not provide citation counts
                year: year,
                extra: {
                    primaryCategory: primaryCategory,
                    arxivId: paperId
                }
            });
        }
        catch (error) {
            console.error('Error parsing arXiv entry:', error);
            return null;
        }
    }
}
//# sourceMappingURL=ArxivSearcher.js.map