@tan-yong-sheng/paper-search-mcp-nodejs
Version:
A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.
330 lines • 13.2 kB
JavaScript
/**
* IACR ePrint Archive集成模块
* 密码学和相关领域的学术论文搜索
*/
import axios from 'axios';
import * as cheerio from 'cheerio';
import * as fs from 'fs';
import * as path from 'path';
import { PaperFactory } from '../models/Paper.js';
import { PaperSource } from './PaperSource.js';
/**
 * Paper source for the IACR Cryptology ePrint Archive.
 *
 * The archive is queried over plain HTTP and the returned HTML is parsed with
 * cheerio; there is no JSON API involved. `baseUrl`, `parseDate`, `cleanText`
 * and `handleHttpError` are not defined in this class — presumably they are
 * inherited from `PaperSource` (TODO confirm their exact contracts there).
 */
export class IACRSearcher extends PaperSource {
    searchUrl;
    userAgents;
    constructor() {
        super('iacr', 'https://eprint.iacr.org');
        this.searchUrl = `${this.baseUrl}/search`;
        // Pool of browser-like User-Agent strings; one is picked at random per request.
        this.userAgents = [
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        ];
    }
    /**
     * Describe what this source supports.
     *
     * NOTE(review): `fullText` is reported as true, but `readPaper` currently
     * only downloads the PDF and returns a placeholder message.
     *
     * @returns {object} Capability flags and the list of supported search options.
     */
    getCapabilities() {
        return {
            search: true,
            download: true,
            fullText: true,
            citations: false,
            requiresApiKey: false,
            supportedOptions: ['maxResults', 'fetchDetails']
        };
    }
    /**
     * Search the IACR ePrint Archive.
     *
     * @param {string} query - Free-text search query, sent as the `q` parameter.
     * @param {object} [options]
     * @param {number} [options.maxResults=10] - Maximum number of papers to return.
     * @param {boolean} [options.fetchDetails] - Also fetch each paper's detail page.
     * @returns {Promise<Array>} At most `maxResults` papers. On failure the error is
     *   logged and handed to `handleHttpError`.
     */
    async search(query, options = {}) {
        try {
            const params = {
                q: query
            };
            const maxResults = options.maxResults || 10;
            console.error(`🔍 IACR API Request: GET ${this.searchUrl}`);
            console.error(`📋 IACR Request params:`, params);
            const response = await axios.get(this.searchUrl, {
                params,
                timeout: 30000,
                headers: {
                    'User-Agent': this.getRandomUserAgent(),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.9'
                }
            });
            console.error(`✅ IACR API Response: ${response.status} ${response.statusText}`);
            // Forward the effective limit so detail pages are only fetched for
            // papers that will actually be returned.
            const papers = await this.parseSearchResponse(response.data, { ...options, maxResults });
            console.error(`📄 IACR Parsed ${papers.length} papers`);
            return papers.slice(0, maxResults);
        }
        catch (error) {
            console.error(`❌ IACR Search Error:`, error.message);
            this.handleHttpError(error, 'search');
        }
    }
    /**
     * Fetch and parse the detail page of a single paper.
     *
     * @param {string} paperId - ePrint id (e.g. `2023/123`) or a full `http(s)` URL.
     * @returns {Promise<object|null>} The parsed paper, or `null` when the request
     *   or the parsing fails (errors are logged, not thrown).
     */
    async getPaperDetails(paperId) {
        try {
            const paperUrl = paperId.startsWith('http') ? paperId : `${this.baseUrl}/${paperId}`;
            const response = await axios.get(paperUrl, {
                timeout: 30000,
                headers: {
                    'User-Agent': this.getRandomUserAgent(),
                    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                    'Accept-Language': 'en-US,en;q=0.9'
                }
            });
            if (response.status !== 200) {
                console.error(`Failed to fetch paper details: HTTP ${response.status}`);
                return null;
            }
            return this.parseIACRPaperDetails(response.data, paperId);
        }
        catch (error) {
            console.error(`Error fetching paper details for ${paperId}:`, error.message);
            return null;
        }
    }
    /**
     * Download a paper's PDF to disk.
     *
     * @param {string} paperId - ePrint id such as `2023/123`.
     * @param {object} [options]
     * @param {string} [options.savePath='./downloads'] - Target directory (created if missing).
     * @param {boolean} [options.overwrite] - Re-download even if the file already exists.
     * @returns {Promise<string>} Path of the saved file. The promise rejects when the
     *   response stream or the file write fails; in that case the partial file is removed.
     */
    async downloadPdf(paperId, options = {}) {
        try {
            const pdfUrl = `${this.baseUrl}/${paperId}.pdf`;
            const savePath = options.savePath || './downloads';
            // Make sure the target directory exists.
            if (!fs.existsSync(savePath)) {
                fs.mkdirSync(savePath, { recursive: true });
            }
            const filename = `iacr_${paperId.replace(/\//g, '_')}.pdf`;
            const filePath = path.join(savePath, filename);
            // Reuse a previous download unless the caller asked to overwrite it.
            if (fs.existsSync(filePath) && !options.overwrite) {
                return filePath;
            }
            const response = await axios.get(pdfUrl, {
                responseType: 'stream',
                timeout: 60000,
                headers: {
                    'User-Agent': this.getRandomUserAgent()
                }
            });
            const writer = fs.createWriteStream(filePath);
            return new Promise((resolve, reject) => {
                let failed = false;
                const fail = (error) => {
                    if (failed) {
                        return;
                    }
                    failed = true;
                    response.data.destroy();
                    // Remove the partial file only after the writer has released it;
                    // otherwise a truncated PDF would be returned as "already
                    // downloaded" by the existence check on the next call.
                    writer.once('close', () => {
                        fs.unlink(filePath, () => reject(error));
                    });
                    writer.destroy();
                };
                writer.on('finish', () => resolve(filePath));
                writer.on('error', fail);
                // `pipe` does not forward source errors to the destination, so the
                // response stream has to be observed separately.
                response.data.on('error', fail);
                response.data.pipe(writer);
            });
        }
        catch (error) {
            this.handleHttpError(error, 'download PDF');
        }
    }
    /**
     * Ensure the paper's PDF is available locally and report where it is.
     *
     * No text extraction is performed; the returned string is a placeholder message.
     *
     * @param {string} paperId - ePrint id such as `2023/123`.
     * @param {object} [options] - Same options as `downloadPdf`.
     * @returns {Promise<string>} Message containing the local PDF path.
     */
    async readPaper(paperId, options = {}) {
        try {
            const savePath = options.savePath || './downloads';
            const filename = `iacr_${paperId.replace(/\//g, '_')}.pdf`;
            const filePath = path.join(savePath, filename);
            // Download the PDF first if it is not on disk yet.
            if (!fs.existsSync(filePath)) {
                await this.downloadPdf(paperId, options);
            }
            return `PDF file downloaded at: ${filePath}. Full text extraction requires additional PDF parsing implementation.`;
        }
        catch (error) {
            this.handleHttpError(error, 'read paper');
        }
    }
    /**
     * Parse a search-result page into papers.
     *
     * @param {string} html - Raw HTML of the search-result page.
     * @param {object} [options]
     * @param {boolean} [options.fetchDetails] - When true, each paper is replaced by
     *   the result of `getPaperDetails` (falling back to the search-result data).
     * @param {number} [options.maxResults] - When `fetchDetails` is set, only this
     *   many papers are looked up and returned.
     * @returns {Promise<Array>} Parsed papers in page order.
     */
    async parseSearchResponse(html, options = {}) {
        const $ = cheerio.load(html);
        const papers = [];
        // Every search hit lives in an element carrying the `mb-4` class.
        $('.mb-4').each((index, element) => {
            try {
                const paper = this.parseSearchResultEntry($, element);
                if (paper) {
                    papers.push(paper);
                }
            }
            catch (error) {
                console.error('Error parsing IACR search result:', error);
            }
        });
        if (!options.fetchDetails || papers.length === 0) {
            return papers;
        }
        // Detail lookups cost one request plus a delay each, so cap them up front.
        const limit = Number(options.maxResults);
        const candidates = Number.isFinite(limit) && limit > 0 ? papers.slice(0, limit) : papers;
        console.error('Fetching detailed information for IACR papers...');
        const detailedPapers = [];
        for (const [position, paper] of candidates.entries()) {
            if (position > 0) {
                // Pause between requests to avoid hitting the site too quickly.
                await this.delay(1000);
            }
            try {
                // NOTE(review): the detail parser returns `categories: []`, so the
                // category found on the search page is not carried over.
                const detailedPaper = await this.getPaperDetails(paper.paperId);
                detailedPapers.push(detailedPaper || paper);
            }
            catch (error) {
                console.error(`Error fetching details for ${paper.paperId}:`, error);
                detailedPapers.push(paper);
            }
        }
        return detailedPapers;
    }
    /**
     * Convert one search-result element into a paper.
     *
     * @param {Function} $ - cheerio instance loaded with the search page.
     * @param {object} element - The result element to parse.
     * @returns {object|null} The paper, or `null` when the element has no paper
     *   link or no content area (i.e. it is not a search hit).
     */
    parseSearchResultEntry($, element) {
        const $entry = $(element);
        const paperLink = $entry.find('.d-flex .paperlink').first();
        if (!paperLink.length) {
            return null;
        }
        const contentDiv = $entry.find('.ms-md-4');
        if (!contentDiv.length) {
            return null;
        }
        // The link text is the paper id; its href is the site-relative paper URL.
        const paperId = paperLink.text().trim();
        const paperUrl = this.toAbsoluteUrl(paperLink.attr('href')) || `${this.baseUrl}/${paperId}`;
        const pdfUrl = this.toAbsoluteUrl($entry.find('a[href$=".pdf"]').first().attr('href'));
        // "Last updated: <date>" label.
        const lastUpdatedElem = $entry.find('small.ms-auto');
        let updatedDate = null;
        if (lastUpdatedElem.length) {
            const dateText = lastUpdatedElem.text().replace('Last updated:', '').trim();
            updatedDate = this.parseDate(dateText);
        }
        const title = contentDiv.find('strong').first().text().trim();
        const authors = this.splitAuthors(contentDiv.find('span.fst-italic').first().text());
        const categoryElem = contentDiv.find('small.badge').first();
        const categories = categoryElem.length ? [categoryElem.text().trim()] : [];
        const abstract = contentDiv.find('p.search-abstract').first().text().trim();
        return PaperFactory.create({
            paperId: paperId,
            title: this.cleanText(title),
            authors: authors,
            abstract: this.cleanText(abstract),
            doi: '',
            // The search page only exposes the last-updated date; it doubles as
            // the published date, with "now" as the fallback.
            publishedDate: updatedDate || new Date(),
            pdfUrl: pdfUrl,
            url: paperUrl,
            source: 'iacr',
            updatedDate: updatedDate || undefined,
            categories: categories,
            keywords: [],
            citationCount: 0,
            year: updatedDate?.getFullYear(),
            extra: {
                iacrId: paperId
            }
        });
    }
    /**
     * Parse a paper detail page.
     *
     * @param {string} html - Raw HTML of the detail page.
     * @param {string} paperId - ePrint id or full paper URL; URLs are reduced to the id.
     * @returns {object|null} The parsed paper, or `null` if parsing throws.
     */
    parseIACRPaperDetails(html, paperId) {
        try {
            const $ = cheerio.load(html);
            const iacrId = this.extractPaperId(paperId);
            const title = $('h3.mb-3').text().trim();
            // Authors may be written as "A, B, and C"; treat " and " like a comma
            // and drop the empty entries this can produce.
            const authors = this.splitAuthors($('p.fst-italic').text().replace(/ and /g, ','));
            const abstract = $('p[style*="white-space: pre-wrap"]').text().trim();
            const keywords = [];
            $('a.badge.bg-secondary.keyword').each((index, element) => {
                keywords.push($(element).text().trim());
            });
            // Publication info and history are located by scanning the page's
            // text line by line rather than through selectors.
            const lines = $.text().split('\n').map(line => line.trim()).filter(line => line);
            let publicationInfo = '';
            const labelIndex = lines.findIndex(line => line.includes('Publication info'));
            if (labelIndex !== -1 && labelIndex + 1 < lines.length) {
                // The value is the first non-empty line after the label.
                publicationInfo = lines[labelIndex + 1];
            }
            const historyEntries = [];
            let lastUpdated = null;
            let historyFound = false;
            for (const line of lines) {
                if (!historyFound) {
                    historyFound = line === 'History';
                    continue;
                }
                // The history section ends at the "Short URL" / "License" rows.
                if (line.startsWith('Short URL') || line.startsWith('License')) {
                    break;
                }
                if (!line.includes(':')) {
                    continue;
                }
                historyEntries.push(line);
                // The text before the first ':' of the first entry is used as the
                // last-updated date.
                if (!lastUpdated) {
                    lastUpdated = this.parseDate(line.split(':')[0].trim());
                }
            }
            // Fall back to the current date when no history date was found.
            const publishedDate = lastUpdated || new Date();
            return PaperFactory.create({
                paperId: iacrId,
                title: this.cleanText(title),
                authors: authors,
                abstract: this.cleanText(abstract),
                doi: '',
                publishedDate: publishedDate,
                pdfUrl: `${this.baseUrl}/${iacrId}.pdf`,
                url: `${this.baseUrl}/${iacrId}`,
                source: 'iacr',
                updatedDate: lastUpdated || undefined,
                categories: [],
                keywords: keywords,
                citationCount: 0,
                year: publishedDate.getFullYear(),
                extra: {
                    iacrId: iacrId,
                    publicationInfo: publicationInfo,
                    history: historyEntries.join('; ')
                }
            });
        }
        catch (error) {
            console.error('Error parsing IACR paper details:', error);
            return null;
        }
    }
    /**
     * Reduce a paper reference to its bare ePrint id.
     *
     * @param {string} paperId - Either an id (`2023/123`) or an `http(s)` URL.
     * @returns {string} The URL path without surrounding slashes or a `.pdf`
     *   suffix; non-URL input (or a URL without a path) is returned unchanged.
     */
    extractPaperId(paperId) {
        if (typeof paperId !== 'string' || !paperId.startsWith('http')) {
            return paperId;
        }
        try {
            const id = new URL(paperId).pathname
                .replace(/^\/+|\/+$/g, '')
                .replace(/\.pdf$/i, '');
            return id || paperId;
        }
        catch {
            return paperId;
        }
    }
    /**
     * Resolve a link found on the site against `baseUrl`.
     *
     * @param {string|undefined} href - Relative or absolute link target.
     * @returns {string} Absolute URL, or `''` when `href` is missing or unparsable.
     */
    toAbsoluteUrl(href) {
        if (!href) {
            return '';
        }
        try {
            return new URL(href, this.baseUrl).toString();
        }
        catch {
            return '';
        }
    }
    /**
     * Split a comma-separated author list into trimmed, non-empty names.
     *
     * @param {string} text - Author list text.
     * @returns {string[]} Author names in their original order.
     */
    splitAuthors(text) {
        return text
            .split(',')
            .map(author => author.trim())
            .filter(author => author.length > 0);
    }
    /**
     * Pick a random entry from the User-Agent pool.
     *
     * @returns {string} A User-Agent header value.
     */
    getRandomUserAgent() {
        return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
    }
    /**
     * Resolve after the given number of milliseconds.
     *
     * @param {number} ms - Delay in milliseconds.
     * @returns {Promise<void>}
     */
    delay(ms) {
        return new Promise(resolve => setTimeout(resolve, ms));
    }
}
//# sourceMappingURL=IACRSearcher.js.map