UNPKG

paper-search-mcp-nodejs

Version:

A Node.js MCP server for searching and downloading academic papers from multiple sources, including arXiv, PubMed, bioRxiv, Web of Science, and more.

259 lines 9.53 kB
/** * Google Scholar搜索器 - 网页抓取实现 * 基于HTML解析,包含反检测机制 */ import axios from 'axios'; import * as cheerio from 'cheerio'; import { PaperFactory } from '../models/Paper.js'; import { PaperSource } from './PaperSource.js'; export class GoogleScholarSearcher extends PaperSource { scholarUrl = 'https://scholar.google.com/scholar'; userAgents = [ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' ]; constructor() { super('google_scholar', 'https://scholar.google.com'); } getCapabilities() { return { search: true, download: false, // Google Scholar不提供直接下载 fullText: false, // 只有元数据和摘要 citations: true, // 可以获取引用次数 requiresApiKey: false, // 不需要API密钥,但可能被限制 supportedOptions: ['maxResults', 'year', 'author'] }; } /** * 搜索Google Scholar论文 */ async search(query, options = {}) { console.error(`🔍 Google Scholar Search: query="${query}"`); try { const papers = []; let start = 0; const resultsPerPage = 10; const maxResults = options.maxResults || 10; while (papers.length < maxResults) { // 添加随机延迟避免检测 await this.randomDelay(); const params = this.buildSearchParams(query, start, options); const response = await this.makeScholarRequest(params); if (response.status !== 200) { console.error(`❌ Google Scholar HTTP Error: ${response.status}`); break; } const $ = cheerio.load(response.data); const results = $('.gs_ri'); // 搜索结果容器 if (results.length === 0) { console.error('📋 No more results found'); break; } console.error(`📊 Found ${results.length} results on page`); // 解析每个结果 results.each((index, element) => { if (papers.length >= maxResults) return false; // 停止遍历 const paper = this.parseScholarResult($, $(element)); if (paper) { papers.push(paper); } }); start += resultsPerPage; } console.error(`📄 Google Scholar Results: Found ${papers.length} papers`); return papers; } catch (error) { this.handleHttpError(error, 'search'); } } /** * Google Scholar不支持直接PDF下载 */ async downloadPdf(paperId, options) { throw new Error('Google Scholar does not support direct PDF download. Please use the paper URL to access the publisher.'); } /** * Google Scholar不提供全文内容 */ async readPaper(paperId, options) { throw new Error('Google Scholar does not provide full-text content. Please use the paper URL to access the full text.'); } /** * 构建搜索参数 */ buildSearchParams(query, start, options) { const params = { q: query, start: start, hl: options.language || 'en', as_sdt: '0,5', // 包括文章和引用 as_vis: '1' // 排除引用,只显示学术论文 }; // 添加年份过滤 if (options.yearLow || options.yearHigh) { params.as_ylo = options.yearLow || ''; params.as_yhi = options.yearHigh || ''; } // 添加作者过滤 if (options.author) { params.as_sauthors = options.author; } return params; } /** * 发起Scholar请求 */ async makeScholarRequest(params) { const userAgent = this.getRandomUserAgent(); const config = { params, headers: { 'User-Agent': userAgent, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate', 'Connection': 'keep-alive', 'Upgrade-Insecure-Requests': '1' }, timeout: 30000 }; console.error(`🔍 Google Scholar Request: GET ${this.scholarUrl}`); console.error(`📋 Scholar params:`, params); return await axios.get(this.scholarUrl, config); } /** * 解析单个Scholar搜索结果 */ parseScholarResult($, element) { try { // 提取标题和链接 const titleElement = element.find('h3.gs_rt'); const titleLink = titleElement.find('a'); const title = titleElement.text().replace(/^\[PDF\]|\[HTML\]|\[BOOK\]|\[B\]/, '').trim(); const url = titleLink.attr('href') || ''; if (!title) { return null; } // 过滤掉书籍结果,优先学术论文 const titleText = titleElement.text(); if (titleText.includes('[BOOK]') || titleText.includes('[B]') || url.includes('books.google.com')) { return null; // 跳过书籍结果 } // 提取作者和出版信息 const infoElement = element.find('div.gs_a'); const infoText = infoElement.text(); const authors = this.extractAuthors(infoText); const year = this.extractYear(infoText); // 提取摘要 const abstractElement = element.find('div.gs_rs'); const abstract = abstractElement.text() || ''; // 提取引用次数 const citationElement = element.find('div.gs_fl a').filter((i, el) => { return $(el).text().includes('Cited by'); }); const citationText = citationElement.text(); const citationCount = this.extractCitationCount(citationText); // 生成论文ID const paperId = this.generatePaperId(title, authors); return PaperFactory.create({ paperId, title: this.cleanText(title), authors, abstract: this.cleanText(abstract), doi: '', // Google Scholar通常不直接提供DOI publishedDate: year ? new Date(year, 0, 1) : null, pdfUrl: '', // 需要额外处理PDF链接 url, source: 'google_scholar', categories: [], keywords: [], citationCount, journal: this.extractJournal(infoText), year, extra: { scholarId: paperId, infoText } }); } catch (error) { console.error('Error parsing Google Scholar result:', error); return null; } } /** * 提取作者信息 */ extractAuthors(infoText) { const parts = infoText.split(' - '); if (parts.length > 0) { const authorPart = parts[0]; return authorPart.split(',').map(author => author.trim()).filter(a => a.length > 0); } return []; } /** * 提取年份 */ extractYear(text) { const yearMatch = text.match(/\b(19|20)\d{2}\b/); return yearMatch ? parseInt(yearMatch[0], 10) : undefined; } /** * 提取期刊信息 */ extractJournal(infoText) { const parts = infoText.split(' - '); if (parts.length > 1) { // 通常期刊在第二部分 return parts[1].split(',')[0].trim(); } return ''; } /** * 提取引用次数 */ extractCitationCount(citationText) { const match = citationText.match(/Cited by (\d+)/); return match ? parseInt(match[1], 10) : 0; } /** * 生成论文ID */ generatePaperId(title, authors) { const titleHash = this.simpleHash(title); const authorHash = this.simpleHash(authors.join('')); return `gs_${titleHash}_${authorHash}`; } /** * 简单哈希函数 */ simpleHash(str) { let hash = 0; for (let i = 0; i < str.length; i++) { const char = str.charCodeAt(i); hash = ((hash << 5) - hash) + char; hash = hash & hash; // 转换为32位整数 } return Math.abs(hash).toString(36); } /** * 获取随机User-Agent */ getRandomUserAgent() { return this.userAgents[Math.floor(Math.random() * this.userAgents.length)]; } /** * 随机延迟 */ async randomDelay() { const delay = Math.random() * 2000 + 1000; // 1-3秒随机延迟 await new Promise(resolve => setTimeout(resolve, delay)); } } //# sourceMappingURL=GoogleScholarSearcher.js.map