aiwg

Version:

Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.

aiwg.io

jmagly/aiwg

387 lines (333 loc) • 9.9 kB

text/typescript

/**
 * Acquisition service for downloading and validating research papers
 *
 * @module research/services/acquisition
 */

import { promises as fs } from 'fs';
import { join, dirname } from 'path';
import { createHash } from 'crypto';
import { ResearchPaper } from '../types.js';
import { UnpaywallClient } from '../clients/unpaywall.js';
import { ArxivClient } from '../clients/arxiv.js';
import { AcquiredSource, FAIRScore } from './types.js';

/**
 * Configuration for acquisition service
 */
export interface AcquisitionConfig {
  /** Unpaywall client; when omitted, one is constructed from `email`. */
  unpaywall?: UnpaywallClient;
  /** arXiv client. Accepted here but not read by AcquisitionService in this file. */
  arxiv?: ArxivClient;
  /** Download directory (defaults to `.aiwg/research/sources`). */
  downloadDir?: string;
  /** Timeout for downloads in ms (defaults to 300000, i.e. 5 minutes). */
  downloadTimeout?: number;
  /** Email for Unpaywall API; only used when `unpaywall` is not supplied. */
  email?: string;
}

/**
 * Acquisition service for downloading research papers.
 *
 * Downloads a paper's PDF into the download directory, computes a SHA-256
 * checksum, assigns a `REF-XXX` identifier and scores the result against a
 * fixed FAIR checklist.
 */
export class AcquisitionService {
  private unpaywall: UnpaywallClient;
  private downloadDir: string;
  private downloadTimeout: number;

  /**
   * @param config - Optional overrides; every field has a default.
   */
  constructor(config: AcquisitionConfig = {}) {
    this.unpaywall =
      config.unpaywall ?? new UnpaywallClient({ email: config.email });
    // `||` is kept on purpose for the next two: an empty directory string or a
    // timeout of 0 falls back to the default rather than being taken literally.
    this.downloadDir = config.downloadDir || '.aiwg/research/sources';
    this.downloadTimeout = config.downloadTimeout || 300000; // 5 minutes
  }

  /**
   * Acquire a paper (download + metadata extraction).
   *
   * Uses `paper.pdfUrl` when present; otherwise, if the paper has a DOI, asks
   * the Unpaywall client for an open-access URL.
   *
   * @param paper - Paper to acquire.
   * @returns The acquired source record including its FAIR score.
   * @throws Error when no PDF URL can be determined, or when the download
   *   fails or times out.
   */
  async acquire(paper: ResearchPaper): Promise<AcquiredSource> {
    // Determine PDF URL
    let pdfUrl = paper.pdfUrl;

    if (!pdfUrl && paper.doi) {
      // Try to get open access PDF from Unpaywall
      const oaUrl = await this.unpaywall.checkOpenAccess(paper.doi);
      if (oaUrl) {
        pdfUrl = oaUrl;
      }
    }

    if (!pdfUrl) {
      throw new Error(`No PDF URL available for paper: ${paper.id}`);
    }

    // Download PDF
    const filename = this.generateFilename(paper);
    const filePath = join(this.downloadDir, filename);
    await this.ensureDir(dirname(filePath));
    await this.downloadFile(pdfUrl, filePath);

    // Compute checksum
    const checksum = await this.computeChecksum(filePath);

    // Get file size
    const stats = await fs.stat(filePath);
    const sizeBytes = stats.size;

    // Get next REF-XXX ID
    const refId = await this.assignRefId();

    // Take the timestamp once so the record that is validated and the record
    // that is returned carry the same acquisition time.
    const acquiredAt = new Date().toISOString();

    // Validate FAIR compliance
    const fairScore = await this.validateFAIR({
      paper,
      filePath,
      checksum,
      refId,
      acquiredAt,
      sizeBytes,
    });

    const source: AcquiredSource = {
      paper,
      filePath,
      checksum,
      refId,
      acquiredAt,
      sizeBytes,
      fairScore,
    };

    return source;
  }

  /**
   * Compute SHA-256 checksum of a file.
   *
   * @param filePath - File to hash; it is read fully into memory.
   * @returns Lower-case hex digest.
   */
  async computeChecksum(filePath: string): Promise<string> {
    const content = await fs.readFile(filePath);
    return createHash('sha256').update(content).digest('hex');
  }

  /**
   * Assign next REF-XXX identifier.
   *
   * Only identifiers matching exactly `REF-` plus three digits are counted;
   * anything else in the list is ignored.
   *
   * NOTE(review): `acquire` stores files under the name produced by
   * `generateFilename` (author-year-title.pdf), while the directory scan only
   * recognises `REF-*.pdf`. Unless something outside this file places
   * REF-prefixed PDFs in the download directory, the scan finds nothing and
   * every acquisition receives `REF-001` - confirm.
   *
   * @param existingRefs - Known identifiers; when omitted the download
   *   directory is scanned instead.
   * @returns The identifier one above the highest existing number, zero-padded
   *   to at least three digits (`REF-001` when none exist).
   */
  async assignRefId(existingRefs?: string[]): Promise<string> {
    // If not provided, scan sources directory
    const refs = existingRefs ?? (await this.scanExistingRefs());

    // Extract numbers from REF-XXX format
    const refNumbers = refs
      .filter((ref) => /^REF-\d{3}$/.test(ref))
      .map((ref) => parseInt(ref.substring(4), 10))
      .filter((n) => !isNaN(n));

    // Find next available number (0 when there are no valid refs)
    const maxNumber = refNumbers.reduce((max, n) => Math.max(max, n), 0);
    const nextNumber = maxNumber + 1;

    // Format as REF-XXX with zero padding
    return `REF-${String(nextNumber).padStart(3, '0')}`;
  }

  /**
   * Validate FAIR compliance.
   *
   * Scores the four FAIR dimensions independently; `overall` is their
   * unweighted mean. A note is added for every dimension scoring below 1.0.
   *
   * @param source - Acquired source to assess.
   * @returns Per-dimension scores with criteria, the overall score and notes.
   */
  async validateFAIR(source: AcquiredSource): Promise<FAIRScore> {
    const findable = this.assessFindable(source);
    const accessible = this.assessAccessible(source);
    const interoperable = this.assessInteroperable(source);
    const reusable = this.assessReusable(source);

    const overall =
      (findable.score +
        accessible.score +
        interoperable.score +
        reusable.score) /
      4;

    const notes: string[] = [];

    if (findable.score < 1.0) {
      notes.push('Findability can be improved by adding more metadata');
    }
    if (accessible.score < 1.0) {
      notes.push('Accessibility requires proper file permissions');
    }
    if (interoperable.score < 1.0) {
      notes.push('Interoperability requires standard metadata format');
    }
    if (reusable.score < 1.0) {
      notes.push('Reusability requires license information');
    }

    return {
      overall,
      findable,
      accessible,
      interoperable,
      reusable,
      notes,
    };
  }

  /**
   * Turn a list of criteria into a dimension result whose score is the
   * fraction of criteria that are met.
   */
  private scoreCriteria<T extends { met: boolean }>(
    criteria: T[]
  ): { score: number; criteria: T[] } {
    const metCount = criteria.filter((c) => c.met).length;
    const score = metCount / criteria.length;
    return { score, criteria };
  }

  /**
   * Assess Findable dimension (F1-F4). F4 is always counted as met.
   */
  private assessFindable(source: AcquiredSource) {
    return this.scoreCriteria([
      {
        id: 'F1',
        description: 'Assigned globally unique identifier (REF-XXX)',
        met: !!source.refId && /^REF-\d{3}$/.test(source.refId),
      },
      {
        id: 'F2',
        description: 'Data described with rich metadata',
        met: !!source.paper.title && source.paper.authors.length > 0,
      },
      {
        id: 'F3',
        description: 'Metadata includes identifier',
        met: !!(source.paper.doi || source.paper.arxivId),
      },
      {
        id: 'F4',
        description: 'Indexed in searchable resource',
        met: true, // Assumed for papers from APIs
      },
    ]);
  }

  /**
   * Assess Accessible dimension (A1-A2). A2 is always counted as met.
   */
  private assessAccessible(source: AcquiredSource) {
    return this.scoreCriteria([
      {
        id: 'A1',
        description: 'Retrievable by identifier using standard protocol',
        met: !!source.filePath,
      },
      {
        id: 'A2',
        description: 'Metadata accessible even when data unavailable',
        met: true, // Paper metadata stored separately
      },
    ]);
  }

  /**
   * Assess Interoperable dimension (I1-I3). I1 and I2 are always counted as
   * met; only I3 depends on the paper.
   */
  private assessInteroperable(source: AcquiredSource) {
    return this.scoreCriteria([
      {
        id: 'I1',
        description: 'Uses formal, accessible knowledge representation',
        met: true, // JSON metadata
      },
      {
        id: 'I2',
        description: 'Uses FAIR-compliant vocabularies',
        met: true, // Standard paper metadata fields
      },
      {
        id: 'I3',
        description: 'Includes qualified references to other data',
        met: !!(source.paper.doi || source.paper.arxivId),
      },
    ]);
  }

  /**
   * Assess Reusable dimension (R1-R3). R3 is always counted as met.
   */
  private assessReusable(source: AcquiredSource) {
    return this.scoreCriteria([
      {
        id: 'R1',
        description: 'Described with accurate metadata',
        met: !!source.paper.abstract,
      },
      {
        id: 'R2',
        description: 'Detailed provenance',
        met: !!source.acquiredAt && !!source.paper.source,
      },
      {
        id: 'R3',
        description: 'Meets community standards',
        met: true, // Academic paper from recognized source
      },
    ]);
  }

  /**
   * Download file from URL.
   *
   * The whole response body is buffered in memory and then written to
   * `destination`.
   *
   * @param url - Location of the file.
   * @param destination - Path to write the downloaded bytes to.
   * @throws Error `Download failed: <status>` on a non-OK HTTP status, or
   *   `Download timeout after <n>ms` when the request is aborted by the timer.
   */
  private async downloadFile(url: string, destination: string): Promise<void> {
    const controller = new AbortController();
    const timeoutId = setTimeout(
      () => controller.abort(),
      this.downloadTimeout
    );

    try {
      const response = await fetch(url, {
        signal: controller.signal,
      });

      if (!response.ok) {
        throw new Error(`Download failed: ${response.status}`);
      }

      // The timer is still armed here: fetch() resolves once headers arrive,
      // so the body transfer must stay covered by the timeout as well.
      const buffer = await response.arrayBuffer();
      await fs.writeFile(destination, Buffer.from(buffer));
    } catch (error) {
      if (error instanceof Error && error.name === 'AbortError') {
        throw new Error(`Download timeout after ${this.downloadTimeout}ms`);
      }
      throw error;
    } finally {
      // Single cleanup point for success, HTTP error and abort paths.
      clearTimeout(timeoutId);
    }
  }

  /**
   * Generate filename for paper in the form `<author>-<year>-<title-slug>.pdf`.
   *
   * - author: last whitespace-separated token of the first author's name,
   *   lower-cased, or `unknown` when unavailable
   * - year: `paper.year`, or `unknown` when falsy
   * - title slug: first three words of the title, lower-cased with every
   *   character other than word characters and whitespace removed, or
   *   `untitled` when nothing remains
   */
  private generateFilename(paper: ResearchPaper): string {
    // Use first author's last name if available
    let authorPart = 'unknown';
    const firstAuthorName = paper.authors[0]?.name;
    if (firstAuthorName) {
      const lastName = firstAuthorName.trim().split(/\s+/).pop() ?? '';
      // The result is joined into a filesystem path, so drop path separators
      // and other characters that are unsafe in file names.
      const safeName = lastName
        .toLowerCase()
        .replace(/[\/\\:*?"<>|\x00-\x1f]/g, '');
      if (safeName) {
        authorPart = safeName;
      }
    }

    // Use year
    const yearPart = paper.year || 'unknown';

    // Create slug from title (first 3 words); filter(Boolean) discards the
    // empty token that an empty or punctuation-only title would leave behind.
    const titleWords =
      paper.title
        .toLowerCase()
        .replace(/[^\w\s]/g, '')
        .trim()
        .split(/\s+/)
        .filter(Boolean)
        .slice(0, 3)
        .join('-') || 'untitled';

    return `${authorPart}-${yearPart}-${titleWords}.pdf`;
  }

  /**
   * Scan existing REF-XXX identifiers.
   *
   * Collects the first seven characters of every entry in the download
   * directory whose name starts with `REF-` and ends with `.pdf`.
   *
   * @returns The identifiers found; empty when the directory cannot be read.
   */
  private async scanExistingRefs(): Promise<string[]> {
    const refs: string[] = [];

    try {
      const files = await fs.readdir(this.downloadDir);

      for (const file of files) {
        if (file.startsWith('REF-') && file.endsWith('.pdf')) {
          const refId = file.substring(0, 7); // REF-XXX
          refs.push(refId);
        }
      }
    } catch (error) {
      // Best effort: typically the directory doesn't exist yet. Any read
      // failure is treated as "no existing refs".
    }

    return refs;
  }

  /**
   * Ensure directory exists, creating missing parents as needed.
   *
   * @throws Any mkdir error other than `EEXIST`.
   */
  private async ensureDir(dir: string): Promise<void> {
    try {
      await fs.mkdir(dir, { recursive: true });
    } catch (error) {
      if ((error as NodeJS.ErrnoException).code !== 'EEXIST') {
        throw error;
      }
    }
  }
}