// periodicos-capes-mcp
// Version:
// MCP server for querying scientific journals on the CAPES Periodicals Portal (Portal de Periódicos CAPES)
// 486 lines (485 loc) • 21.3 kB
// JavaScript
import * as cheerio from 'cheerio';
import { QualisService } from './qualis-service.js';
import { OpenAlexService } from './openalex-service.js';
/**
 * Scraper for the search ("buscador") pages of the CAPES Periodicals Portal.
 *
 * Builds search URLs, downloads result and detail pages with `fetch`, parses
 * them with cheerio, and optionally enriches the articles with Qualis and
 * OpenAlex metrics through the imported services.
 *
 * Option objects are plain objects; the keys read by this class are
 * `query`, `advanced`, `document_types`, `open_access_only`,
 * `peer_reviewed_only`, `year_min`, `year_max`, `languages`, `timeout`,
 * `max_workers`, `max_pages`, `max_results`, `full_details` and
 * `include_metrics`.
 */
export class CAPESScraper {
  static BASE_URL = 'https://www.periodicos.capes.gov.br/index.php/acervo/buscador.html';
  static DETAIL_URL_PATTERN = 'https://www.periodicos.capes.gov.br/index.php/acervo/buscador.html?task=detalhes&source=all&id={}';
  static USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36';
  static DEFAULT_HEADERS = {
    'User-Agent': CAPESScraper.USER_AGENT,
    'Accept-Language': 'en-US,en;q=0.9,pt-BR;q=0.8,pt;q=0.7',
  };

  /** CAPES default items per page; used to turn a hit count into a page count. */
  static #RESULTS_PER_PAGE = 30;

  /** Matches a DOI that follows `doi.org/`, `doi=` or `/doi/` inside a URL. */
  static #DOI_PATTERN = /(?:doi\.org\/|doi=|\/doi\/)(10\.\d{4,9}\/[-._;()/:A-Z0-9]+)/i;

  /**
   * Remembers, for each array returned by a successful getListingsForTerm
   * call, how many result pages were requested. This lets search() report
   * the real page count without changing the array return type.
   */
  static #pagesByListings = new WeakMap();

  defaultTimeout;
  defaultMaxWorkers;

  /**
   * @param {number} [timeout=30000] Default request timeout in milliseconds.
   * @param {number} [maxWorkers=5] Default number of requests per batch.
   */
  constructor(timeout = 30000, maxWorkers = 5) {
    this.defaultTimeout = timeout;
    this.defaultMaxWorkers = maxWorkers;
  }

  /**
   * Downloads a page and returns its body as text.
   *
   * The abort timer covers both the request and the body read, and is always
   * cleared (even when the request fails) so no timer outlives the call.
   *
   * @param {string} url Page to download.
   * @param {number} timeoutMs Milliseconds before the request is aborted.
   * @returns {Promise<string>} The response body.
   * @throws {Error} On network failure, abort, or a non-2xx status.
   */
  static async #fetchHtml(url, timeoutMs) {
    const controller = new AbortController();
    const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
    try {
      const response = await fetch(url, {
        signal: controller.signal,
        headers: CAPESScraper.DEFAULT_HEADERS,
      });
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}: ${response.statusText}`);
      }
      // `return await` keeps the timer armed until the body is fully read.
      return await response.text();
    } finally {
      clearTimeout(timeoutId);
    }
  }

  /**
   * Parses a counter such as "1.234" ('.' used as thousands separator).
   *
   * @param {string} text Raw counter text.
   * @returns {number} The parsed integer, or 0 when the text is not numeric.
   */
  static #parseCount(text) {
    const value = Number.parseInt(String(text).trim().replace(/\./g, ''), 10);
    return Number.isNaN(value) ? 0 : value;
  }

  /**
   * Resolves the batch size as an integer >= 1. A non-positive or
   * non-numeric batch size would stall the batching loops.
   *
   * @param {number|undefined} requested Value from the options, if any.
   * @param {number} fallback Value used when `requested` is falsy.
   * @returns {number} A usable batch size.
   */
  static #resolveMaxWorkers(requested, fallback) {
    const workers = Math.floor(Number(requested || fallback));
    return Number.isFinite(workers) && workers >= 1 ? workers : 1;
  }

  /**
   * Builds the portal search URL for a term, page and set of filters.
   *
   * @param {string} searchTerm Text to search for.
   * @param {object} [options] Search filters (see class description).
   * @param {number} [page=1] 1-based result page; only added when > 1.
   * @returns {string} The complete search URL.
   */
  constructSearchUrl(searchTerm, options, page = 1) {
    const opts = options ?? {};
    // The portal expects '+' for spaces.
    const encodeValue = (value) => encodeURIComponent(value).replace(/%20/g, '+');
    const listOf = (value) => (Array.isArray(value) ? value : []);

    const searchQuery = opts.advanced ? `all:contains(${searchTerm})` : searchTerm;
    // encodeURIComponent leaves parentheses untouched; the portal expects them escaped.
    const encodedTerm = encodeValue(searchQuery).replace(/\(/g, '%28').replace(/\)/g, '%29');

    // Source is always "all".
    const params = [`q=${encodedTerm}`, 'source=all'];
    if (opts.advanced) {
      params.push('mode=advanced');
    }
    if (page > 1) {
      params.push(`page=${page}`);
    }
    for (const type of listOf(opts.document_types)) {
      params.push(`type%5B%5D=type%3D%3D${encodeValue(type)}`);
    }
    // The boolean filters are tri-state: true, false, or absent (no filter).
    if (opts.open_access_only === true) {
      params.push('open_access%5B%5D=open_access%3D%3D1');
    } else if (opts.open_access_only === false) {
      params.push('open_access%5B%5D=open_access%3D%3D0');
    }
    if (opts.peer_reviewed_only === true) {
      params.push('peer_reviewed%5B%5D=peer_reviewed%3D%3D1');
    } else if (opts.peer_reviewed_only === false) {
      params.push('peer_reviewed%5B%5D=peer_reviewed%3D%3D0');
    }
    // Years are encoded so a string value cannot inject extra parameters.
    if (opts.year_min) {
      params.push(`publishyear_min%5B%5D=${encodeURIComponent(String(opts.year_min))}`);
    }
    if (opts.year_max) {
      params.push(`publishyear_max%5B%5D=${encodeURIComponent(String(opts.year_max))}`);
    }
    for (const lang of listOf(opts.languages)) {
      params.push(`language%5B%5D=language%3D%3D${encodeValue(lang)}`);
    }
    return `${CAPESScraper.BASE_URL}?${params.join('&')}`;
  }

  /**
   * Extracts a DOI from a URL.
   *
   * @param {string} url URL that may embed a DOI.
   * @returns {string|undefined} The DOI, or undefined when none is found.
   */
  extractDoi(url) {
    if (typeof url !== 'string') {
      return undefined;
    }
    const match = url.match(CAPESScraper.#DOI_PATTERN);
    return match ? match[1] : undefined;
  }

  /**
   * Extracts the value of the first `id=` occurrence (uppercase letters and
   * digits only) from a URL.
   *
   * @param {string} url Detail-page URL.
   * @returns {string|undefined} The id, or undefined when none is found.
   */
  extractArticleIdFromUrl(url) {
    if (typeof url !== 'string') {
      return undefined;
    }
    const match = url.match(/id=([A-Z0-9]+)/);
    return match ? match[1] : undefined;
  }

  /**
   * Computes the number of result pages from the total shown on a page.
   *
   * @param {Function} $ cheerio root of a search result page.
   * @returns {number} Page count; 0 when the total is missing or not
   *   numeric, 1 when reading the page throws.
   */
  getTotalPages($) {
    try {
      const totalSpan = $('div.pagination-information span.total');
      const rawTotal = totalSpan.length ? totalSpan.text().trim() : '';
      if (!rawTotal) {
        return 0;
      }
      const totalItems = CAPESScraper.#parseCount(rawTotal);
      return Math.ceil(totalItems / CAPESScraper.#RESULTS_PER_PAGE);
    } catch {
      return 1;
    }
  }

  /**
   * Extracts one listing per `.result-busca` section of a result page.
   * Sections without an article id are skipped.
   *
   * @param {Function} $ cheerio root of a search result page.
   * @param {string} theme Value copied to each listing's `theme`.
   * @param {string} searchTerm Value copied to each listing's `search_term`.
   * @returns {object[]} Listings with title, article_id, detail_url, theme,
   *   search_term, journal, publisher, authors, document_type,
   *   is_open_access and is_peer_reviewed.
   */
  extractBasicArticleInfo($, theme, searchTerm) {
    const listings = [];
    $('.result-busca').each((_, section) => {
      try {
        const $section = $(section);
        const titleElem = $section.find('.titulo-busca');
        const title = titleElem.text().trim() || 'No title found';

        const href = titleElem.attr('href');
        if (!href) {
          return; // No link means no article id; nothing to record.
        }
        const detailUrl = href.startsWith('http')
          ? href
          : `https://www.periodicos.capes.gov.br${href}`;
        const articleId = this.extractArticleIdFromUrl(detailUrl);
        if (!articleId) {
          return;
        }

        // The first paragraph containing "| " is read as
        // "<something> - <publisher> | <journal>".
        let publisher;
        let journal;
        $section.find('p.text-down-01').each((__, p) => {
          const text = $(p).text();
          if (!text.includes('| ')) {
            return undefined; // Keep looking.
          }
          const parts = text.split('|');
          if (parts.length >= 2) {
            journal = parts[1].trim();
            const leftSide = parts[0];
            if (leftSide.includes('-')) {
              publisher = leftSide.split('-', 2)[1]?.trim();
            }
          }
          return false; // Stop at first match.
        });

        // Author names, de-duplicated, in document order.
        const authors = [];
        $section.find('a.view-autor').each((__, authorLink) => {
          const authorName = $(authorLink).text().trim();
          if (authorName && !authors.includes(authorName)) {
            authors.push(authorName);
          }
        });

        // Flags are detected from marker classes/attributes, with a
        // plain-text fallback on the section content.
        const sectionText = $section.text().toLowerCase();
        const isOpenAccess =
          $section.find('.text-green-cool-vivid-50, .open-access, [title*="open access"], [alt*="open access"]').length > 0 ||
          sectionText.includes('open access');
        const isPeerReviewed =
          $section.find('.text-violet-50, .peer-reviewed, [title*="peer"], [alt*="peer"]').length > 0 ||
          sectionText.includes('peer') ||
          sectionText.includes('reviewed');

        const documentTypeElement = $section.find('.fw-semibold').first();
        const documentType = documentTypeElement.length ? documentTypeElement.text().trim() : undefined;

        listings.push({
          title,
          article_id: articleId,
          detail_url: detailUrl,
          theme,
          search_term: searchTerm,
          journal,
          publisher,
          authors,
          document_type: documentType,
          is_open_access: isOpenAccess,
          is_peer_reviewed: isPeerReviewed,
        });
      } catch {
        // Best effort: one malformed section must not discard the whole page.
      }
    });
    return listings;
  }

  /**
   * Downloads and parses the detail page of one article.
   *
   * @param {string} articleId Portal article id.
   * @param {number} [timeout] Timeout in ms; falls back to the instance default.
   * @returns {Promise<object>} Metadata with `detail_url`, `is_open_access`,
   *   `is_peer_reviewed` and, when present on the page, `abstract`, `issn`,
   *   `publisher`, `publication_date`, `volume`, `issue`, `language`,
   *   `authors` and `doi`. Resolves to `{}` when the page cannot be fetched
   *   or parsed; this method never rejects.
   */
  async scrapeArticleDetail(articleId, timeout) {
    // Function replacement avoids `$`-pattern expansion; the id is URL-encoded.
    const detailUrl = CAPESScraper.DETAIL_URL_PATTERN.replace('{}', () => encodeURIComponent(articleId));
    try {
      const html = await CAPESScraper.#fetchHtml(detailUrl, timeout || this.defaultTimeout);
      const $ = cheerio.load(html);
      const metadata = { detail_url: detailUrl };

      const abstractElem = $('#item-resumo');
      if (abstractElem.length) {
        metadata.abstract = abstractElem.text().trim();
      }

      // The ISSN value is the <p> immediately following the "ISSN" label.
      const issnValue = $('strong:contains("ISSN")').first().next('p');
      if (issnValue.length) {
        metadata.issn = issnValue.text().trim();
      }

      const publisherElem = $('#item-instituicao');
      if (publisherElem.length) {
        // Trim first so a trailing ';' followed by whitespace is removed too.
        metadata.publisher = publisherElem.text().trim().replace(/;$/, '').trim();
      }

      const yearMatch = $('#item-ano').text().match(/(\d{4})/);
      if (yearMatch) {
        metadata.publication_date = yearMatch[1];
      }

      const pubInfo = $('p.small.text-muted');
      if (pubInfo.length) {
        const text = pubInfo.text();
        const volumeMatch = text.match(/Volume:\s*([^;]+)/);
        if (volumeMatch) {
          metadata.volume = volumeMatch[1].trim();
        }
        const issueMatch = text.match(/Issue:\s*(\d+)/);
        if (issueMatch) {
          metadata.issue = issueMatch[1].trim();
        }
        const langMatch = text.match(/Linguagem:\s*([^;]+)/);
        if (langMatch) {
          metadata.language = langMatch[1].trim();
        }
      }

      metadata.is_open_access = $('.text-green-cool-vivid-50').length > 0;
      metadata.is_peer_reviewed = $('.text-violet-50').length > 0;

      const authors = [];
      $('.view-autor').each((_, elem) => {
        const authorText = $(elem).text().trim();
        if (authorText) {
          authors.push(authorText);
        }
      });
      if (authors.length > 0) {
        metadata.authors = authors;
      }

      // The first absolute link that embeds a DOI wins.
      $('a[href^="http"]').each((_, elem) => {
        const doi = this.extractDoi($(elem).attr('href'));
        if (doi) {
          metadata.doi = doi;
          return false; // Stop at first match.
        }
        return undefined;
      });
      return metadata;
    } catch {
      // Best effort: callers treat an empty object as "no details available".
      return {};
    }
  }

  /**
   * Collects the listings of every result page for a search term.
   *
   * The first page is fetched to learn the page count (capped by
   * `options.max_pages` when positive); remaining pages are fetched in
   * batches of `options.max_workers` concurrent requests.
   *
   * @param {string} searchTerm Text to search for.
   * @param {string} theme Value copied to each listing's `theme`.
   * @param {object} options Search options (see class description).
   * @returns {Promise<object[]>} All listings found, or `[]` when the first
   *   page cannot be fetched or parsed.
   */
  async getListingsForTerm(searchTerm, theme, options) {
    const opts = options ?? {};
    const firstPageUrl = this.constructSearchUrl(searchTerm, opts, 1);
    try {
      const html = await CAPESScraper.#fetchHtml(firstPageUrl, opts.timeout || this.defaultTimeout);
      const $ = cheerio.load(html);

      let totalPages = this.getTotalPages($);
      if (opts.max_pages && opts.max_pages > 0) {
        totalPages = Math.min(totalPages, opts.max_pages);
      }

      const listings = [...this.extractBasicArticleInfo($, theme, searchTerm)];

      // Remaining pages: one batch of at most `maxWorkers` requests at a time.
      const maxWorkers = CAPESScraper.#resolveMaxWorkers(opts.max_workers, this.defaultMaxWorkers);
      for (let batchStart = 2; batchStart <= totalPages; batchStart += maxWorkers) {
        const batchEnd = Math.min(batchStart + maxWorkers - 1, totalPages);
        const pagePromises = [];
        for (let page = batchStart; page <= batchEnd; page++) {
          pagePromises.push(this.fetchPage(searchTerm, theme, page, opts));
        }
        const batchResults = await Promise.all(pagePromises);
        for (const pageResults of batchResults) {
          listings.push(...pageResults);
        }
      }

      // The first page is always processed, even when no total was found.
      CAPESScraper.#pagesByListings.set(listings, Math.max(1, Math.floor(totalPages)));
      return listings;
    } catch {
      // Best effort: a failed search yields no listings instead of an error.
      return [];
    }
  }

  /**
   * Fetches one result page and extracts its listings.
   *
   * @param {string} searchTerm Text to search for.
   * @param {string} theme Value copied to each listing's `theme`.
   * @param {number} page 1-based page number.
   * @param {object} options Search options (see class description).
   * @returns {Promise<object[]>} Listings of the page, or `[]` on failure;
   *   this method never rejects.
   */
  async fetchPage(searchTerm, theme, page, options) {
    try {
      const opts = options ?? {};
      const pageUrl = this.constructSearchUrl(searchTerm, opts, page);
      const html = await CAPESScraper.#fetchHtml(pageUrl, opts.timeout || this.defaultTimeout);
      return this.extractBasicArticleInfo(cheerio.load(html), theme, searchTerm);
    } catch {
      // Best effort: one failed page must not abort the whole crawl.
      return [];
    }
  }

  /**
   * Turns listings into articles by fetching each detail page, in batches of
   * `options.max_workers` concurrent requests.
   *
   * Values from the detail page take precedence; values from the listing are
   * used for anything the detail page did not provide (including when the
   * detail page could not be fetched at all).
   *
   * @param {object[]} articleListings Listings from extractBasicArticleInfo.
   * @param {object} [options] Reads `max_workers`, `timeout`, `include_metrics`.
   * @returns {Promise<object[]>} Articles; listings without an id are dropped.
   */
  async fetchArticleDetails(articleListings, options) {
    const opts = options ?? {};
    const maxWorkers = CAPESScraper.#resolveMaxWorkers(opts.max_workers, this.defaultMaxWorkers);

    const processArticle = async (listing) => {
      try {
        if (!listing.article_id) {
          return null;
        }
        const details = await this.scrapeArticleDetail(listing.article_id, opts.timeout);
        return {
          title: listing.title,
          authors: listing.authors ?? [],
          search_term: listing.theme,
          article_id: listing.article_id,
          detail_url: listing.detail_url,
          journal: listing.journal,
          publisher: listing.publisher,
          document_type: listing.document_type,
          is_open_access: listing.is_open_access ?? false,
          is_peer_reviewed: listing.is_peer_reviewed ?? false,
          // Anything found on the detail page overrides the listing value.
          ...details,
        };
      } catch {
        return null;
      }
    };

    const articles = [];
    for (let start = 0; start < articleListings.length; start += maxWorkers) {
      const batch = articleListings.slice(start, start + maxWorkers);
      const batchResults = await Promise.all(batch.map(processArticle));
      articles.push(...batchResults.filter(Boolean));
    }

    // Add metrics (OpenAlex + Qualis) if requested.
    if (opts.include_metrics) {
      await this.enrichWithMetrics(articles);
    }
    return articles;
  }

  /**
   * Adds a `metrics` object to articles, in place: a Qualis classification
   * for articles with an ISSN known to QualisService, and OpenAlex metrics
   * for articles with a DOI known to OpenAlexService.
   *
   * @param {object[]} articles Articles to enrich (mutated).
   * @returns {Promise<void>}
   */
  async enrichWithMetrics(articles) {
    // First, add Qualis data for articles with ISSN.
    const qualisService = QualisService.getInstance();
    for (const article of articles) {
      if (!article.issn) {
        continue;
      }
      const qualisInfo = qualisService.getQualisByISSN(article.issn);
      if (!qualisInfo) {
        continue;
      }
      // Placeholder metrics built from what is known about the article; they
      // are replaced by the OpenAlex metrics below when those are available.
      article.metrics ??= {
        cited_by_count: 0,
        publication_year: Number.parseInt(article.publication_date, 10) || new Date().getFullYear(),
        is_open_access: Boolean(article.is_open_access),
      };
      article.metrics.qualis = {
        classification: qualisInfo.classification,
        area: qualisInfo.area,
      };
    }

    // Then, add OpenAlex citation metrics for articles with DOI.
    const articlesWithDOI = articles.filter((article) => article.doi);
    if (articlesWithDOI.length === 0) {
      return;
    }
    try {
      const openAlexMetricsMap = await OpenAlexService.getMetricsByDOIs(
        articlesWithDOI.map((article) => article.doi),
      );
      for (const article of articlesWithDOI) {
        const openAlexMetrics = openAlexMetricsMap.get(article.doi);
        if (!openAlexMetrics) {
          continue;
        }
        article.metrics = article.metrics
          ? { ...openAlexMetrics, qualis: article.metrics.qualis } // Preserve Qualis data.
          : openAlexMetrics;
      }
    } catch {
      // Best effort: articles keep whatever metrics they already have.
    }
  }

  /**
   * Runs a full search: collects listings, applies `max_results`, then
   * either fetches details (`full_details`) or converts listings directly.
   *
   * @param {object} options Search options; `query` is the search term.
   * @returns {Promise<{articles: object[], total_found: number,
   *   pages_processed: number, query: string}>} `total_found` counts the
   *   listings before `max_results` is applied; `pages_processed` is the
   *   number of result pages requested (0 when the search failed).
   */
  async search(options) {
    // Phase 1: get all article listings (the query doubles as the theme).
    const articleListings = await this.getListingsForTerm(options.query, options.query, options);

    // Apply max_results before the expensive detail requests.
    const limitedListings =
      options.max_results && options.max_results > 0
        ? articleListings.slice(0, options.max_results)
        : articleListings;

    let articles;
    if (options.full_details) {
      // Phase 2: fetch detailed metadata (only for the limited set).
      articles = await this.fetchArticleDetails(limitedListings, options);
    } else {
      articles = limitedListings.map((listing) => ({
        title: listing.title,
        authors: listing.authors || [],
        search_term: listing.theme,
        article_id: listing.article_id,
        detail_url: listing.detail_url,
        journal: listing.journal,
        publisher: listing.publisher,
        document_type: listing.document_type,
        is_open_access: listing.is_open_access || false,
        is_peer_reviewed: listing.is_peer_reviewed || false,
      }));
    }

    return {
      articles,
      total_found: articleListings.length, // Original count, not the limited one.
      pages_processed: CAPESScraper.#pagesByListings.get(articleListings) ?? 0,
      query: options.query,
    };
  }

  /**
   * Fetches only the first result page to report the hit count and a rough
   * time estimate for a full search.
   *
   * @param {object} options Search options; `query` is the search term.
   * @returns {Promise<object>} `query`, `total_found`,
   *   `estimated_time_seconds`, `search_url` and `filters_applied`.
   * @throws {Error} When the page cannot be fetched or parsed.
   */
  async searchPreview(options) {
    const url = this.constructSearchUrl(options.query, options, 1);
    const html = await CAPESScraper.#fetchHtml(url, options.timeout || this.defaultTimeout);
    const $ = cheerio.load(html);

    const totalPages = this.getTotalPages($);
    const totalSpan = $('div.pagination-information span.total');
    const totalFound = totalSpan.length ? CAPESScraper.#parseCount(totalSpan.text()) : 0;

    // Estimate assumes ~2 seconds per page, spread over the workers.
    const maxWorkers = CAPESScraper.#resolveMaxWorkers(options.max_workers, this.defaultMaxWorkers);
    const estimatedTimeSeconds = Math.ceil((totalPages * 2) / maxWorkers);

    return {
      query: options.query,
      total_found: totalFound,
      estimated_time_seconds: estimatedTimeSeconds,
      search_url: url,
      filters_applied: {
        document_types: options.document_types,
        open_access_only: options.open_access_only,
        peer_reviewed_only: options.peer_reviewed_only,
        year_min: options.year_min,
        year_max: options.year_max,
        languages: options.languages,
      },
    };
  }
}