@tan-yong-sheng/paper-search-mcp-nodejs
Version:
A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.
319 lines • 13.1 kB
JavaScript
/**
* Semantic Scholar API集成模块
* 支持免费API和付费API密钥
*/
import axios from 'axios';
import * as fs from 'fs';
import * as path from 'path';
import { PaperFactory } from '../models/Paper.js';
import { PaperSource } from './PaperSource.js';
import { RateLimiter } from '../utils/RateLimiter.js';
/**
 * Paper source backed by the Semantic Scholar Graph API.
 *
 * Works without an API key (lower rate limit) and sends the key in the
 * `x-api-key` header when one is supplied.
 */
export class SemanticScholarSearcher extends PaperSource {
  /** Fields requested from the Graph API for every paper record. */
  static PAPER_FIELDS = Object.freeze([
    'paperId', 'title', 'abstract', 'venue', 'year',
    'referenceCount', 'citationCount', 'influentialCitationCount',
    'isOpenAccess', 'openAccessPdf', 'fieldsOfStudy', 's2FieldsOfStudy',
    'publicationTypes', 'publicationDate', 'journal', 'authors',
    'externalIds', 'url',
  ]);

  /**
   * Error codes for which `search` returns an empty result instead of failing.
   * axios reports timeouts as ECONNABORTED (or ETIMEDOUT); 'TIMEOUT' is kept
   * for backward compatibility with the previous check.
   */
  static NETWORK_ERROR_CODES = Object.freeze([
    'ENOTFOUND', 'ECONNREFUSED', 'ECONNABORTED', 'ETIMEDOUT', 'TIMEOUT',
  ]);

  rateLimiter;
  baseApiUrl;

  /**
   * @param {string} [apiKey] - Optional Semantic Scholar API key.
   */
  constructor(apiKey) {
    super('semantic', 'https://api.semanticscholar.org/graph/v1', apiKey);
    this.baseApiUrl = this.baseUrl;
    // Free tier: 100 requests per 5 minutes; with a key: 1000 per 5 minutes.
    // Stay deliberately below both limits to avoid being blocked.
    const requestsPerMinute = apiKey ? 180 : 18;
    this.rateLimiter = new RateLimiter({
      requestsPerSecond: requestsPerMinute / 60,
      // Keep the burst small so a batch of calls cannot exhaust the quota.
      burstCapacity: Math.max(3, Math.floor(requestsPerMinute / 20)),
      debug: process.env.NODE_ENV === 'development',
    });
  }

  /**
   * Describe what this source supports.
   * @returns {object} Capability flags and the list of supported search options.
   */
  getCapabilities() {
    return {
      search: true,
      download: true, // only some papers expose an open-access PDF
      fullText: false, // PDFs are available for a subset only
      citations: true, // citation statistics are part of every record
      requiresApiKey: false, // the free tier works, with a lower rate limit
      // NOTE(review): 'sortBy' is advertised here but search() does not forward it.
      supportedOptions: ['maxResults', 'year', 'fieldsOfStudy', 'sortBy'],
    };
  }

  /**
   * Build the HTTP headers shared by all Graph API requests.
   * @param {object} [extraHeaders] - Additional headers merged into the defaults.
   * @returns {object} Header map, including `x-api-key` when a key is configured.
   */
  buildSemanticHeaders(extraHeaders = {}) {
    const headers = {
      'User-Agent': 'Paper-Search-MCP/1.0 (Academic Research Tool)',
      'Accept': 'application/json',
      ...extraHeaders,
    };
    if (this.apiKey) {
      headers['x-api-key'] = this.apiKey;
    }
    return headers;
  }

  /**
   * Compute the local file path used to store a paper's PDF.
   * @param {string} paperId - Paper identifier (characters invalid in file names are replaced).
   * @param {string} savePath - Target directory.
   * @returns {string} Path of the form `<savePath>/semantic_<id>.pdf`.
   */
  semanticPdfPath(paperId, savePath) {
    const safeId = String(paperId).replace(/[/\\:*?"<>|]/g, '_');
    return path.join(savePath, `semantic_${safeId}.pdf`);
  }

  /**
   * Search Semantic Scholar for papers.
   *
   * @param {string} query - Free-text query.
   * @param {object} [options]
   * @param {number} [options.maxResults=10] - Result count, clamped to 1..100 (API maximum).
   * @param {string|number} [options.year] - Year filter passed through to the API.
   * @param {string[]|string} [options.fieldsOfStudy] - Field-of-study filter.
   * @returns {Promise<Array>} Parsed papers; an empty array on network errors.
   * @throws Whatever `handleHttpError` raises for HTTP and other failures.
   */
  async search(query, options = {}) {
    await this.rateLimiter.waitForPermission();
    try {
      const opts = options ?? {};
      const requested = Math.trunc(Number(opts.maxResults)) || 10;
      const params = {
        query,
        limit: Math.min(Math.max(requested, 1), 100),
        fields: SemanticScholarSearcher.PAPER_FIELDS.join(','),
      };
      if (opts.year) {
        params.year = opts.year;
      }
      if (opts.fieldsOfStudy && opts.fieldsOfStudy.length > 0) {
        params.fieldsOfStudy = Array.isArray(opts.fieldsOfStudy)
          ? opts.fieldsOfStudy.join(',')
          : String(opts.fieldsOfStudy);
      }
      const url = `${this.baseApiUrl}/paper/search`;
      const headers = this.buildSemanticHeaders({ 'Accept-Language': 'en-US,en;q=0.9' });
      console.error(`🔍 Semantic Scholar API Request: GET ${url}`);
      console.error(`📋 Semantic Scholar Request params:`, params);
      const response = await axios.get(url, {
        params,
        headers,
        timeout: 30000,
        maxRedirects: 5,
        // Let 4xx through so the status can be inspected and reported below.
        validateStatus: (status) => status < 500,
      });
      if (response.status >= 400) {
        const message = response.status === 429
          ? 'Rate limit exceeded. Please try again later.'
          : `API Error: ${response.status}`;
        // Attach the response so the catch block (and handleHttpError) can see the status.
        const httpError = new Error(message);
        httpError.response = response;
        throw httpError;
      }
      console.error(`✅ Semantic Scholar API Response: ${response.status} ${response.statusText}`);
      const papers = this.parseSearchResponse(response.data);
      console.error(`📄 Semantic Scholar Parsed ${papers.length} papers`);
      return papers;
    }
    catch (error) {
      console.error(`❌ Semantic Scholar Search Error:`, error.message);
      const status = error.response?.status;
      if (status === 429) {
        const retryAfter = error.response.headers?.['retry-after'];
        console.error(`Rate limited by Semantic Scholar API. ${retryAfter ? `Retry after ${retryAfter} seconds.` : 'Please wait before making more requests.'}`);
      }
      else if (status === 403) {
        console.error('Access denied. Please check your API key or ensure you are within the free tier limits.');
      }
      else if (status >= 400) {
        console.error(`❌ Semantic Scholar API Error: ${status} - ${error.response.data?.message || 'Unknown error'}`);
      }
      // Network problems are treated as "no results" rather than a hard failure.
      if (SemanticScholarSearcher.NETWORK_ERROR_CODES.includes(error.code)) {
        console.error('⚠️ Network error accessing Semantic Scholar API, returning empty results.');
        return [];
      }
      this.handleHttpError(error, 'search');
    }
  }

  /**
   * Fetch a single paper record.
   *
   * @param {string} paperId - Semantic Scholar paper id or a prefixed external id (e.g. `DOI:...`).
   * @returns {Promise<object|null>} Parsed paper, or null when the lookup fails
   *   (HTTP error status, network error, or unparsable record).
   */
  async getPaperDetails(paperId) {
    await this.rateLimiter.waitForPermission();
    try {
      const response = await axios.get(`${this.baseApiUrl}/paper/${paperId}`, {
        params: { fields: SemanticScholarSearcher.PAPER_FIELDS.join(',') },
        headers: this.buildSemanticHeaders(),
        timeout: 30000,
        maxRedirects: 5,
        validateStatus: (status) => status < 500,
      });
      // 4xx responses pass validateStatus; their body is an error object, not a paper.
      if (response.status >= 400) {
        const reason = response.data?.error || response.data?.message || 'Unknown error';
        console.error(`Error getting paper details from Semantic Scholar: ${response.status} - ${reason}`);
        return null;
      }
      return this.parseSemanticPaper(response.data);
    }
    catch (error) {
      console.error('Error getting paper details from Semantic Scholar:', error.message);
      return null;
    }
  }

  /**
   * Download the open-access PDF of a paper.
   *
   * The file is streamed to a temporary `.part` file and renamed on success so
   * an interrupted download is never mistaken for a complete one.
   *
   * @param {string} paperId - Paper identifier.
   * @param {object} [options]
   * @param {string} [options.savePath='./downloads'] - Target directory (created if missing).
   * @param {boolean} [options.overwrite] - Re-download even if the file already exists.
   * @returns {Promise<string>} Path of the downloaded (or already existing) file.
   * @throws Whatever `handleHttpError` raises when no PDF is available or the download fails.
   */
  async downloadPdf(paperId, options = {}) {
    try {
      // The PDF URL is only known from the paper record.
      const paper = await this.getPaperDetails(paperId);
      if (!paper?.pdfUrl) {
        throw new Error(`No PDF URL available for paper ${paperId}`);
      }
      const savePath = options.savePath || './downloads';
      fs.mkdirSync(savePath, { recursive: true });
      const filePath = this.semanticPdfPath(paperId, savePath);
      if (fs.existsSync(filePath) && !options.overwrite) {
        return filePath;
      }
      const response = await axios.get(paper.pdfUrl, {
        responseType: 'stream',
        timeout: 60000,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }
      });
      const tempPath = `${filePath}.part`;
      await new Promise((resolve, reject) => {
        const writer = fs.createWriteStream(tempPath);
        // Tear down both streams and remove the partial file on any failure.
        const abort = (streamError) => {
          response.data.destroy();
          writer.destroy();
          fs.unlink(tempPath, () => reject(streamError));
        };
        response.data.on('error', abort);
        writer.on('error', abort);
        writer.on('finish', resolve);
        response.data.pipe(writer);
      });
      fs.renameSync(tempPath, filePath);
      return filePath;
    }
    catch (error) {
      this.handleHttpError(error, 'download PDF');
    }
  }

  /**
   * Make sure the paper's PDF is available locally and report its location.
   * Text extraction is not implemented; only a status message is returned.
   *
   * @param {string} paperId - Paper identifier.
   * @param {object} [options] - Same options as `downloadPdf`.
   * @returns {Promise<string>} Message containing the local PDF path.
   * @throws Whatever `handleHttpError` raises when the PDF cannot be obtained.
   */
  async readPaper(paperId, options = {}) {
    try {
      const filePath = this.semanticPdfPath(paperId, options.savePath || './downloads');
      if (!fs.existsSync(filePath)) {
        const downloadedPath = await this.downloadPdf(paperId, options);
        if (!downloadedPath) {
          throw new Error(`PDF for paper ${paperId} could not be downloaded`);
        }
      }
      return `PDF file downloaded at: ${filePath}. Full text extraction requires additional PDF parsing implementation.`;
    }
    catch (error) {
      this.handleHttpError(error, 'read paper');
    }
  }

  /**
   * Look up a paper by DOI.
   *
   * @param {string} doi - Bare DOI; a `doi:` prefix or `https://doi.org/` URL form is also accepted.
   * @returns {Promise<object|null>} Parsed paper, or null when the DOI is empty or the lookup fails.
   */
  async getPaperByDoi(doi) {
    const bareDoi = String(doi ?? '')
      .trim()
      .replace(/^https?:\/\/(dx\.)?doi\.org\//i, '')
      .replace(/^doi:/i, '');
    if (!bareDoi) {
      return null;
    }
    try {
      return await this.getPaperDetails(`DOI:${bareDoi}`);
    }
    catch (error) {
      console.error('Error getting paper by DOI from Semantic Scholar:', error);
      return null;
    }
  }

  /**
   * Convert a search response body into papers.
   *
   * @param {object} data - Response body; papers are expected under `data.data`.
   * @returns {Array} Parsed papers; records that fail to parse are dropped.
   */
  parseSearchResponse(data) {
    if (!Array.isArray(data?.data)) {
      return [];
    }
    return data.data
      .map((item) => this.parseSemanticPaper(item))
      .filter((paper) => paper !== null);
  }

  /**
   * Convert one Semantic Scholar record into a paper via `PaperFactory.create`.
   *
   * @param {object} item - Raw record from the Graph API.
   * @returns {object|null} Created paper, or null when the record cannot be parsed.
   */
  parseSemanticPaper(item) {
    if (!item || typeof item !== 'object') {
      return null;
    }
    try {
      const authors = (item.authors ?? [])
        .map((author) => author?.name)
        .filter(Boolean);
      // Prefer the full publication date; fall back to January 1st of the year.
      let publishedDate = null;
      if (item.publicationDate) {
        publishedDate = this.parseDate(item.publicationDate);
      }
      else if (item.year) {
        publishedDate = new Date(item.year, 0, 1);
      }
      let pdfUrl = '';
      if (item.openAccessPdf?.url) {
        pdfUrl = item.openAccessPdf.url;
      }
      else if (item.openAccessPdf?.disclaimer) {
        // Some records only mention a link inside the disclaimer text.
        const urlMatch = item.openAccessPdf.disclaimer.match(/https?:\/\/[^\s,)]+/);
        if (urlMatch) {
          // Drop sentence punctuation that the pattern may have swallowed.
          pdfUrl = urlMatch[0].replace(/[.;:!?]+$/, '');
        }
      }
      const s2Fields = (item.s2FieldsOfStudy ?? []).map((field) => field?.category);
      const categories = [...(item.fieldsOfStudy ?? []), ...s2Fields].filter(Boolean);
      return PaperFactory.create({
        paperId: item.paperId,
        title: this.cleanText(item.title),
        authors,
        abstract: this.cleanText(item.abstract || ''),
        doi: item.externalIds?.DOI || '',
        publishedDate,
        pdfUrl,
        url: item.url || `https://www.semanticscholar.org/paper/${item.paperId}`,
        source: 'semantic',
        categories: [...new Set(categories)], // both field lists may overlap
        keywords: [],
        citationCount: item.citationCount || 0,
        journal: item.venue || item.journal?.name || '',
        volume: item.journal?.volume || undefined,
        pages: item.journal?.pages || undefined,
        year: item.year,
        extra: {
          semanticScholarId: item.paperId,
          referenceCount: item.referenceCount || 0,
          influentialCitationCount: item.influentialCitationCount || 0,
          isOpenAccess: item.isOpenAccess || false,
          publicationTypes: item.publicationTypes || [],
          externalIds: item.externalIds || {},
        },
      });
    }
    catch (error) {
      console.error('Error parsing Semantic Scholar paper:', error);
      return null;
    }
  }

  /**
   * Expose the current state of the rate limiter.
   * @returns {*} Whatever `RateLimiter#getStatus` reports.
   */
  getRateLimiterStatus() {
    return this.rateLimiter.getStatus();
  }

  /**
   * Check whether the configured API key is accepted.
   *
   * Issues a minimal search request and inspects the HTTP status directly,
   * because errors raised by `search` do not carry the response status.
   *
   * @returns {Promise<boolean>} false only when the API answers 401 or 403;
   *   true when no key is configured or the check itself fails (e.g. network error).
   */
  async validateApiKey() {
    if (!this.apiKey) {
      return true; // nothing to validate; the free tier is used
    }
    try {
      await this.rateLimiter.waitForPermission();
      const response = await axios.get(`${this.baseApiUrl}/paper/search`, {
        params: { query: 'test', limit: 1, fields: 'paperId' },
        headers: this.buildSemanticHeaders(),
        timeout: 30000,
        maxRedirects: 5,
        // Accept every status so authentication failures can be read from it.
        validateStatus: () => true,
      });
      return response.status !== 401 && response.status !== 403;
    }
    catch (error) {
      // Failures here are most likely network problems, not a bad key.
      console.error('Could not validate Semantic Scholar API key:', error.message);
      return true;
    }
  }
}
//# sourceMappingURL=SemanticScholarSearcher.js.map