aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

aiwg.io

jmagly/aiwg

337 lines (296 loc) • 8.69 kB

text/typescript

/**
 * Discovery service for unified research paper search
 *
 * @module research/services/discovery
 */

import { ResearchPaper } from '../types.js';
import { SemanticScholarClient } from '../clients/semantic-scholar.js';
import { CrossRefClient } from '../clients/crossref.js';
import { ArxivClient } from '../clients/arxiv.js';
import { CacheManager } from '../cache/manager.js';
import { SearchOptionsExtended, GapReport } from './types.js';

/**
 * Configuration for discovery service.
 *
 * Every field is optional; the service constructs a default instance for any
 * client or cache that is not supplied.
 */
export interface DiscoveryConfig {
  /** Semantic Scholar client */
  semanticScholar?: SemanticScholarClient;

  /** CrossRef client */
  crossref?: CrossRefClient;

  /** arXiv client */
  arxiv?: ArxivClient;

  /** Cache manager */
  cache?: CacheManager;

  /** Corpus path for gap analysis (not currently read by DiscoveryService) */
  corpusPath?: string;
}

/**
 * Discovery service for unified search across API clients.
 *
 * Fans a query out to Semantic Scholar, CrossRef and arXiv, merges the
 * results, then filters, deduplicates, ranks and truncates them.
 */
export class DiscoveryService {
  private readonly semanticScholar: SemanticScholarClient;
  private readonly crossref: CrossRefClient;
  private readonly arxiv: ArxivClient;
  private readonly cache: CacheManager;

  /**
   * @param config - Optional pre-built clients/cache; defaults are created
   *   for anything omitted.
   */
  constructor(config: DiscoveryConfig = {}) {
    this.semanticScholar =
      config.semanticScholar ?? new SemanticScholarClient();
    this.crossref = config.crossref ?? new CrossRefClient();
    this.arxiv = config.arxiv ?? new ArxivClient();
    this.cache = config.cache ?? new CacheManager();
  }

  /**
   * Search across all configured API clients.
   *
   * `limit` and `offset` are forwarded to each backend individually; the
   * merged list is then filtered, optionally deduplicated, ranked and cut to
   * at most `limit` entries.
   *
   * @param query - Free-text search query.
   * @param options - Paging, caching, year-range, threshold and dedup options.
   * @returns Up to `limit` papers ordered by descending relevance.
   */
  async search(
    query: string,
    options: SearchOptionsExtended = {}
  ): Promise<ResearchPaper[]> {
    const {
      limit = 10,
      offset = 0,
      useCache = true,
      minYear,
      maxYear,
      relevanceThreshold = 0.0,
      deduplicate = true,
    } = options;

    // Build the key once. It must cover every option that influences the
    // returned list; otherwise a result cached under one relevanceThreshold /
    // deduplicate setting would be served for a different one.
    const cacheKey = useCache
      ? this.cache.generateKey('discovery:search', {
          query,
          limit,
          offset,
          minYear,
          maxYear,
          relevanceThreshold,
          deduplicate,
        })
      : undefined;

    // Check cache
    if (cacheKey !== undefined) {
      const cached = await this.cache.get<ResearchPaper[]>(cacheKey);
      if (cached) {
        return cached;
      }
    }

    // Search all APIs in parallel. The per-source helpers already catch their
    // own errors, so allSettled is a second line of defence: one failing
    // backend never discards the others' results.
    const settled = await Promise.allSettled([
      this.searchSemanticScholar(query, limit, offset),
      this.searchCrossRef(query, limit, offset),
      this.searchArxiv(query, limit, offset),
    ]);

    // Collect successful results, preserving source order
    const allPapers: ResearchPaper[] = settled.flatMap((outcome) =>
      outcome.status === 'fulfilled' ? outcome.value : []
    );

    // Filter by year range
    let filtered = allPapers;
    if (minYear !== undefined) {
      filtered = filtered.filter((p) => p.year >= minYear);
    }
    if (maxYear !== undefined) {
      filtered = filtered.filter((p) => p.year <= maxYear);
    }

    // Deduplicate by DOI/arXiv ID/title
    if (deduplicate) {
      filtered = this.deduplicatePapers(filtered);
    }

    // Score each paper once, then rank, threshold and truncate
    const results = this.scoreAndSort(filtered, query)
      .filter((entry) => entry.relevance >= relevanceThreshold)
      .slice(0, limit)
      .map((entry) => entry.paper);

    // Cache results
    if (cacheKey !== undefined) {
      // NOTE(review): the third argument is passed as 'semantic-scholar' even
      // though the results are merged from three sources - confirm against
      // CacheManager.set what this value selects.
      await this.cache.set(cacheKey, results, 'semantic-scholar');
    }

    return results;
  }

  /**
   * Analyze gaps in research corpus.
   *
   * Placeholder: the corpus reference IDs are not inspected yet and a fixed
   * report with generic recommendations is returned.
   *
   * @param _corpusRefIds - Reference IDs of the corpus (currently unused).
   * @returns A report with empty analysis fields and static recommendations.
   */
  async analyzeGaps(_corpusRefIds: string[]): Promise<GapReport> {
    // For now, return a basic report structure
    // In a full implementation, this would analyze corpus metadata
    return {
      underrepresentedTopics: [],
      yearGaps: [],
      sourceTypeDistribution: {},
      recommendations: [
        'Corpus gap analysis requires corpus metadata',
        'Consider adding more recent publications',
        'Balance source types (journal vs conference vs preprint)',
      ],
    };
  }

  /**
   * Follow citation network starting from a paper.
   *
   * The recursive walker is still a stub, so the returned list is currently
   * always empty.
   *
   * @param paperId - Identifier of the starting paper.
   * @param depth - Maximum number of citation hops to follow.
   * @returns Papers discovered while walking the citation graph.
   */
  async followCitations(
    paperId: string,
    depth: number = 1
  ): Promise<ResearchPaper[]> {
    const results: ResearchPaper[] = [];
    const visited = new Set<string>();

    await this.followCitationsRecursive(paperId, depth, visited, results);

    return results;
  }

  /**
   * Search Semantic Scholar; logs a warning and yields `[]` on failure.
   */
  private async searchSemanticScholar(
    query: string,
    limit: number,
    offset: number
  ): Promise<ResearchPaper[]> {
    try {
      const result = await this.semanticScholar.search(query, {
        limit,
        offset,
      });
      return result.papers;
    } catch (error) {
      console.warn('Semantic Scholar search failed:', error);
      return [];
    }
  }

  /**
   * Search CrossRef; logs a warning and yields `[]` on failure.
   */
  private async searchCrossRef(
    query: string,
    limit: number,
    offset: number
  ): Promise<ResearchPaper[]> {
    try {
      const result = await this.crossref.search(query, { limit, offset });
      return result.papers;
    } catch (error) {
      console.warn('CrossRef search failed:', error);
      return [];
    }
  }

  /**
   * Search arXiv; logs a warning and yields `[]` on failure.
   */
  private async searchArxiv(
    query: string,
    limit: number,
    offset: number
  ): Promise<ResearchPaper[]> {
    try {
      const result = await this.arxiv.search(query, { limit, offset });
      return result.papers;
    } catch (error) {
      console.warn('arXiv search failed:', error);
      return [];
    }
  }

  /**
   * Deduplicate papers by DOI, arXiv ID, or normalized title.
   *
   * Each paper is keyed by the first identifier it has (DOI, then arXiv ID,
   * then title); the first paper seen for a key wins. Papers are only matched
   * within the same key kind, so a DOI-keyed paper is never merged with a
   * title-keyed one.
   *
   * @param papers - Papers to deduplicate, in priority order.
   * @returns The first occurrence of each distinct paper, in input order.
   */
  private deduplicatePapers(papers: ResearchPaper[]): ResearchPaper[] {
    const seen = new Map<string, ResearchPaper>();

    for (const paper of papers) {
      // Use DOI as primary key
      if (paper.doi) {
        const key = `doi:${paper.doi.toLowerCase()}`;
        if (!seen.has(key)) {
          seen.set(key, paper);
        }
        continue;
      }

      // Use arXiv ID as secondary key
      if (paper.arxivId) {
        const key = `arxiv:${paper.arxivId.toLowerCase()}`;
        if (!seen.has(key)) {
          seen.set(key, paper);
        }
        continue;
      }

      // Use normalized title as fallback: lower-case, strip punctuation,
      // collapse whitespace
      const normalizedTitle = paper.title
        .toLowerCase()
        .replace(/[^\w\s]/g, '')
        .replace(/\s+/g, ' ')
        .trim();
      const key = `title:${normalizedTitle}`;
      if (!seen.has(key)) {
        seen.set(key, paper);
      }
    }

    return Array.from(seen.values());
  }

  /**
   * Rank papers by relevance to query (highest first).
   *
   * @returns A new array; the input is not mutated.
   */
  private rankByRelevance(
    papers: ResearchPaper[],
    query: string
  ): ResearchPaper[] {
    return this.scoreAndSort(papers, query).map((entry) => entry.paper);
  }

  /**
   * Pair every paper with its relevance score and sort by descending score.
   */
  private scoreAndSort(
    papers: ResearchPaper[],
    query: string
  ): Array<{ paper: ResearchPaper; relevance: number }> {
    const scored = papers.map((paper) => ({
      paper,
      relevance: this.calculateRelevance(paper, query),
    }));

    scored.sort((a, b) => b.relevance - a.relevance);

    return scored;
  }

  /**
   * Calculate relevance score (0-1) for a paper.
   *
   * The score is the sum of the fraction of query terms found in the title
   * (weight 0.5), the fraction found in the abstract (weight 0.3), and a
   * log-scaled citation bonus (weight 0.2), capped at 1.0.
   *
   * @param paper - Paper to score.
   * @param query - Free-text query; split on whitespace into terms.
   */
  private calculateRelevance(paper: ResearchPaper, query: string): number {
    // Drop empty tokens: ''.split(/\s+/) yields [''], and every string
    // "includes" the empty string, which would inflate the text-match score
    // for empty or whitespace-padded queries.
    const queryTerms = query
      .toLowerCase()
      .split(/\s+/)
      .filter((term) => term.length > 0);

    let score = 0;

    if (queryTerms.length > 0) {
      // Title match (weight: 0.5)
      const titleLower = paper.title.toLowerCase();
      const titleMatches = queryTerms.filter((term) =>
        titleLower.includes(term)
      ).length;
      score += (titleMatches / queryTerms.length) * 0.5;

      // Abstract match (weight: 0.3)
      if (paper.abstract) {
        const abstractLower = paper.abstract.toLowerCase();
        const abstractMatches = queryTerms.filter((term) =>
          abstractLower.includes(term)
        ).length;
        score += (abstractMatches / queryTerms.length) * 0.3;
      }
    }

    // Citation count bonus (weight: 0.2)
    if (paper.citationCount) {
      // Normalize citation count on a log scale; reaches 1.0 at roughly
      // 1000 citations and is clamped there.
      const citationScore = Math.log10(paper.citationCount + 1) / 3;
      score += Math.min(citationScore, 1.0) * 0.2;
    }

    return Math.min(score, 1.0);
  }

  /**
   * Recursively follow citations.
   *
   * Stub: only records the visit; nothing is appended to `_results` yet.
   *
   * @param paperId - Paper to expand.
   * @param remainingDepth - Hops left; non-positive values stop the walk.
   * @param visited - IDs already expanded, to avoid cycles.
   * @param _results - Accumulator for discovered papers (currently unused).
   */
  private async followCitationsRecursive(
    paperId: string,
    remainingDepth: number,
    visited: Set<string>,
    _results: ResearchPaper[]
  ): Promise<void> {
    // `<= 0` rather than `=== 0` so a negative depth cannot slip through
    if (remainingDepth <= 0 || visited.has(paperId)) {
      return;
    }

    visited.add(paperId);

    // For now, just stub - full implementation would query Semantic Scholar
    // citation graph API endpoint
    // This would require additional API calls to get references/citations
  }
}