UNPKG

@quantumai/quantum-cli-core

Version:

Quantum CLI Core - Multi-LLM Collaboration System

307 lines 17 kB
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { PerformanceTracker, } from './performance-tracker.js';
import { RealTimeMetricsCollector } from './real-time-metrics.js';
import { UseCaseAnalyzer } from './use-case-analyzer.js';
import { ModelCharacteristicsService } from './model-characteristics.js';
import { QueryType } from './types.js';

/**
 * Returns a promise that resolves after `ms` milliseconds.
 * Used to simulate processing time and to give the collector a chance to
 * process emitted events before assertions run.
 *
 * @param {number} ms - Time to wait, in milliseconds.
 * @returns {Promise<void>}
 */
const delay = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Integration tests that exercise PerformanceTracker, RealTimeMetricsCollector,
 * UseCaseAnalyzer and ModelCharacteristicsService together.
 */
describe('Model Characteristics System Integration', () => {
  let performanceTracker;
  let metricsCollector;

  // A fresh tracker/collector pair per test keeps recorded metrics isolated.
  beforeEach(() => {
    performanceTracker = new PerformanceTracker();
    metricsCollector = new RealTimeMetricsCollector(performanceTracker);
  });

  afterEach(() => {
    performanceTracker.destroy();
    metricsCollector.destroy();
  });

  describe('End-to-End Model Selection and Tracking', () => {
    it('should recommend models and track their performance', async () => {
      // 1. Analyze a query and get recommendations
      const query =
        'Implement a secure payment processing system with PCI compliance';
      const recommendations = UseCaseAnalyzer.getRecommendations(query);
      expect(recommendations.analysis.queryType).toBe(QueryType.SECURITY);
      expect(recommendations.primaryRecommendation).toBeDefined();
      const selectedModel = recommendations.primaryRecommendation.modelId;

      // 2. Simulate using the recommended model
      const requestId = 'test-request-1';
      metricsCollector.startRequest(requestId, selectedModel, query);

      // 3. Simulate completion with performance data
      await delay(100); // Simulate processing time
      metricsCollector.completeRequest(requestId, {
        content: 'Mock secure payment implementation...',
        inputTokens: 500,
        outputTokens: 800,
        cost: 0.024,
        queryType: QueryType.SECURITY,
        userRating: 4,
      });

      // 4. Verify metrics were recorded
      await delay(50); // Allow event processing
      const dashboardData = metricsCollector.getLiveDashboardData();
      const modelData = dashboardData.models.find(
        (m) => m.modelId === selectedModel,
      );
      expect(modelData).toBeDefined();
      expect(modelData.currentRps).toBeGreaterThanOrEqual(0);
      expect(dashboardData.systemMetrics.totalRequests).toBeGreaterThan(0);
    });

    it('should handle model comparison and selection based on real performance', async () => {
      // 1. Compare models for a specific use case
      const query = 'Write a creative marketing campaign for a new product';
      const modelIds = ['gemini-2.5-pro', 'gpt-4', 'claude-3-sonnet'];
      const comparison = UseCaseAnalyzer.compareModelsForUseCase(
        modelIds,
        query,
      );
      expect(comparison).toHaveLength(3);

      // 2. Simulate performance data for each model
      const performanceData = [
        { modelId: 'gpt-4', latency: 2500, cost: 0.045, success: true },
        {
          modelId: 'gemini-2.5-pro',
          latency: 3200,
          cost: 0.032,
          success: true,
        },
        {
          modelId: 'claude-3-sonnet',
          latency: 2800,
          cost: 0.018,
          success: false,
        },
      ];
      for (const [i, data] of performanceData.entries()) {
        // Offset by 2 so the ids continue after 'test-request-1'.
        const requestId = `test-request-${i + 2}`;
        metricsCollector.startRequest(requestId, data.modelId, query);
        if (data.success) {
          metricsCollector.completeRequest(requestId, {
            content: 'Mock creative content...',
            inputTokens: 300,
            outputTokens: 600,
            cost: data.cost,
            queryType: QueryType.CREATIVE,
            userRating: 4,
          });
        } else {
          metricsCollector.failRequest(requestId, new Error('API Error'));
        }
      }

      // 3. Verify performance tracking
      await delay(100);
      const dashboardData = metricsCollector.getLiveDashboardData();

      // Check that models with errors have warning/error status.
      // Assert presence first: `undefined` is also "not healthy", so without
      // this check the status assertion would pass for a missing model.
      const claudeModel = dashboardData.models.find(
        (m) => m.modelId === 'claude-3-sonnet',
      );
      expect(claudeModel).toBeDefined();
      expect(claudeModel.status).not.toBe('healthy');

      // Check that successful models have reasonable metrics
      const gptModel = dashboardData.models.find((m) => m.modelId === 'gpt-4');
      expect(gptModel).toBeDefined();
      expect(gptModel.successRate).toBe(1);
    });

    it('should adapt recommendations based on historical performance', () => {
      // 1. Get baseline recommendations
      const codeQuery = 'Optimize this Python algorithm for better performance';
      const initialRecs = UseCaseAnalyzer.getRecommendations(codeQuery);
      const primary = initialRecs.primaryRecommendation;
      expect(initialRecs.analysis.queryType).toBe(QueryType.CODE);
      expect(primary.score).toBeGreaterThan(0);

      // 2. Verify that model characteristics are being used
      const primaryModel = ModelCharacteristicsService.getModelCharacteristics(
        primary.modelId,
      );
      expect(primaryModel).toBeDefined();
      expect(primaryModel.useCases).toBeDefined();

      // 3. Check that cost and latency estimates are reasonable
      expect(primary.estimatedCost).toBeGreaterThan(0);
      expect(primary.estimatedLatency).toBeGreaterThan(0);

      // 4. Verify reasoning is provided
      expect(primary.reasoning.length).toBeGreaterThan(0);
      expect(initialRecs.explanation).toBeDefined();
    });
  });

  describe('Performance Tracking and Alerts', () => {
    it('should generate cost alerts when limits are exceeded', async () => {
      const modelId = 'gpt-4';

      // Set a low cost limit
      performanceTracker.setCostLimits(modelId, 0.05); // $0.05 daily limit

      // Recorded for inspection only: the flag is deliberately NOT asserted,
      // because the alert may not be triggered in the test due to timing.
      let alertReceived = false;
      metricsCollector.on('metric_event', (event) => {
        if (event.type === 'cost_alert') {
          alertReceived = true;
        }
      });

      // Simulate multiple expensive requests
      for (let i = 0; i < 5; i++) {
        const requestId = `cost-test-${i}`;
        metricsCollector.startRequest(requestId, modelId, 'Expensive query');
        metricsCollector.completeRequest(requestId, {
          content: 'Expensive response...',
          inputTokens: 1000,
          outputTokens: 1500,
          cost: 0.015, // This should trigger alert after a few requests
          queryType: QueryType.GENERAL,
        });
      }
      await delay(100);

      // Check current cost usage
      const costUsage = performanceTracker.getCurrentCostUsage(modelId);
      expect(costUsage.daily).toBeGreaterThan(0.05);
      // Note: Alert may not be triggered in test due to timing, but cost tracking should work
    });

    it('should track performance degradation', async () => {
      const modelId = 'gemini-2.5-pro';

      // Nominal, increasing latencies. Only the number of entries drives the
      // loop below; the values themselves are not applied to the requests.
      const latencies = [1500, 2000, 3000, 4500, 6000];
      for (let i = 0; i < latencies.length; i++) {
        const requestId = `perf-test-${i}`;
        metricsCollector.startRequest(requestId, modelId, 'Performance test');
        // Fixed short wait between start and completion (keeps the test fast).
        await delay(10);
        metricsCollector.completeRequest(requestId, {
          content: 'Test response',
          inputTokens: 200,
          outputTokens: 300,
          cost: 0.008,
          queryType: QueryType.GENERAL,
        });
      }
      await delay(100);

      // Check dashboard data for performance issues
      const dashboardData = metricsCollector.getLiveDashboardData();
      const modelData = dashboardData.models.find((m) => m.modelId === modelId);
      expect(modelData).toBeDefined();
      expect(modelData.averageLatency).toBeGreaterThan(0);
    });

    it('should provide real-time health status', async () => {
      const modelIds = ['gpt-4', 'gemini-2.5-pro', 'claude-3-sonnet'];

      // Perform health checks
      for (const modelId of modelIds) {
        const healthCheck = await metricsCollector.performHealthCheck(modelId);
        expect(healthCheck.modelId).toBe(modelId);
        expect(healthCheck.latency).toBeGreaterThanOrEqual(0);
        expect(healthCheck.lastChecked).toBeInstanceOf(Date);
      }

      // Simulate some errors for one model
      const problematicModel = 'claude-3-sonnet';
      for (let i = 0; i < 3; i++) {
        const requestId = `error-test-${i}`;
        metricsCollector.startRequest(requestId, problematicModel, 'Error test');
        metricsCollector.failRequest(
          requestId,
          new Error('Simulated API error'),
        );
      }
      await delay(100);

      // Check health status after errors
      const healthCheck =
        await metricsCollector.performHealthCheck(problematicModel);
      // Health check might still pass depending on error threshold
      expect(healthCheck).toBeDefined();
    });
  });

  describe('Use Case Analysis Integration', () => {
    it('should classify complex multi-domain queries correctly', () => {
      const complexQuery = ` Build a secure e-commerce platform using React and Node.js with the following requirements: - Real-time inventory tracking - PCI-compliant payment processing - Machine learning recommendations - High-performance search with Elasticsearch - Docker containerization for deployment `;
      const analysis = UseCaseAnalyzer.analyzeQuery(complexQuery);
      expect(analysis.queryType).toBe(QueryType.CODE);
      expect(analysis.complexity).toBe('complex');
      expect(analysis.domain).toBe('web-development');
      expect(analysis.frameworks).toContain('React');
      expect(analysis.frameworks).toContain('Node.js');
      expect(analysis.specialRequirements).toContain('real-time');
      expect(analysis.specialRequirements).toContain('secure');
      expect(analysis.specialRequirements).toContain('high-performance');
    });

    it('should provide different recommendations for different complexity levels', () => {
      const simpleQuery = 'What is a variable in Python?';
      const complexQuery =
        'Design a distributed microservices architecture with event sourcing';
      const simpleRecs = UseCaseAnalyzer.getRecommendations(simpleQuery);
      const complexRecs = UseCaseAnalyzer.getRecommendations(complexQuery);
      expect(simpleRecs.analysis.complexity).toBe('simple');
      expect(complexRecs.analysis.complexity).toBe('complex');

      // Complex queries might prefer different models than simple ones
      expect(simpleRecs.primaryRecommendation).toBeDefined();
      expect(complexRecs.primaryRecommendation).toBeDefined();

      // Both should have valid cost estimates
      expect(simpleRecs.primaryRecommendation.estimatedCost).toBeGreaterThan(0);
      expect(complexRecs.primaryRecommendation.estimatedCost).toBeGreaterThan(0);
    });

    it('should respect budget constraints in recommendations', () => {
      const query = 'Write a comprehensive technical documentation';
      const lowBudgetRecs = UseCaseAnalyzer.getRecommendations(query, 0.005);
      const highBudgetRecs = UseCaseAnalyzer.getRecommendations(query, 0.1);
      expect(
        lowBudgetRecs.primaryRecommendation.estimatedCost,
      ).toBeLessThanOrEqual(0.005);
      expect(
        highBudgetRecs.primaryRecommendation.estimatedCost,
      ).toBeLessThanOrEqual(0.1);

      // High budget should allow for potentially better models
      expect(highBudgetRecs.primaryRecommendation.score).toBeGreaterThanOrEqual(
        lowBudgetRecs.primaryRecommendation.score,
      );
    });
  });

  describe('Model Characteristics Data Integrity', () => {
    it('should have consistent data across all models', () => {
      const models = ModelCharacteristicsService.getAllModels();
      expect(models.length).toBeGreaterThan(0);
      for (const model of models) {
        // Verify all required fields are present
        expect(model.id).toBeDefined();
        expect(model.name).toBeDefined();
        expect(model.provider).toBeDefined();
        expect(model.capabilities).toBeDefined();
        expect(model.strengths).toBeDefined();
        expect(model.weaknesses).toBeDefined();
        expect(model.performance).toBeDefined();
        expect(model.cost).toBeDefined();
        expect(model.useCases).toBeDefined();
        expect(model.qualityScores).toBeDefined();
        expect(model.constraints).toBeDefined();

        // Verify use cases cover all query types
        const queryTypes = model.useCases.map((uc) => uc.queryType);
        expect(queryTypes).toContain(QueryType.CODE);
        expect(queryTypes).toContain(QueryType.CREATIVE);
        expect(queryTypes).toContain(QueryType.ANALYSIS);
        expect(queryTypes).toContain(QueryType.SECURITY);
        expect(queryTypes).toContain(QueryType.GENERAL);

        // Verify quality scores are in valid range
        for (const score of Object.values(model.qualityScores)) {
          expect(score).toBeGreaterThan(0);
          expect(score).toBeLessThanOrEqual(1);
        }

        // Verify cost data makes sense
        expect(model.cost.inputTokenCost).toBeGreaterThan(0);
        expect(model.cost.outputTokenCost).toBeGreaterThan(0);
        expect(model.cost.costEfficiencyRank).toBeGreaterThan(0);
        expect(model.cost.costEfficiencyRank).toBeLessThanOrEqual(10);
      }
    });

    it('should provide consistent model ranking across different metrics', () => {
      const models = ModelCharacteristicsService.getAllModels();

      // Test ranking by different query types
      const queryTypes = [
        QueryType.CODE,
        QueryType.CREATIVE,
        QueryType.ANALYSIS,
      ];
      for (const queryType of queryTypes) {
        const bestModel =
          ModelCharacteristicsService.getBestModelForQueryType(queryType);
        expect(bestModel).toBeDefined();
        if (bestModel) {
          const useCase = bestModel.useCases.find(
            (uc) => uc.queryType === queryType,
          );
          expect(useCase).toBeDefined();
          expect(useCase.suitabilityScore).toBeGreaterThan(0.5); // Should be reasonably suitable
        }
      }

      // Test cost efficiency ranking
      const costEfficient =
        ModelCharacteristicsService.rankModelsByCostEfficiency(
          QueryType.GENERAL,
        );
      expect(costEfficient.length).toBe(models.length);

      // Verify ranking is actually sorted: each entry's quality-per-cost ratio
      // must be at least that of the entry after it.
      const efficiencyOf = (model) =>
        model.qualityScores.overall / model.cost.costPerQualityPoint;
      for (let i = 0; i < costEfficient.length - 1; i++) {
        const currentEfficiency = efficiencyOf(costEfficient[i]);
        const nextEfficiency = efficiencyOf(costEfficient[i + 1]);
        expect(currentEfficiency).toBeGreaterThanOrEqual(nextEfficiency);
      }
    });
  });
});
//# sourceMappingURL=model-characteristics-integration.test.js.map