// @quantumai/quantum-cli-core
// Version:
// Quantum CLI Core - Multi-LLM Collaboration System
// 307 lines • 17 kB
// JavaScript
/**
* @license
* Copyright 2025 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { PerformanceTracker, } from './performance-tracker.js';
import { RealTimeMetricsCollector } from './real-time-metrics.js';
import { UseCaseAnalyzer } from './use-case-analyzer.js';
import { ModelCharacteristicsService } from './model-characteristics.js';
import { QueryType } from './types.js';
describe('Model Characteristics System Integration', () => {
// Shared fixtures. Both are rebuilt before every test so that metrics
// recorded by one test never leak into the next.
let performanceTracker;
let metricsCollector;
beforeEach(() => {
  performanceTracker = new PerformanceTracker();
  // The collector is constructed on top of the tracker instance.
  metricsCollector = new RealTimeMetricsCollector(performanceTracker);
});
afterEach(() => {
  // Tear down in reverse construction order: the collector was built on the
  // tracker, so it is released first. The try/finally guarantees the tracker
  // is still destroyed when the collector's destroy() throws, so a failing
  // teardown cannot leave the other fixture alive for later tests.
  try {
    metricsCollector.destroy();
  } finally {
    performanceTracker.destroy();
  }
});
describe('End-to-End Model Selection and Tracking', () => {
it('should recommend models and track their performance', async () => {
  // Small local helper: resolve after `ms` milliseconds of real time.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  // Step 1: classify the prompt and pick the top recommendation.
  const prompt = 'Implement a secure payment processing system with PCI compliance';
  const recommendationResult = UseCaseAnalyzer.getRecommendations(prompt);
  expect(recommendationResult.analysis.queryType).toBe(QueryType.SECURITY);
  expect(recommendationResult.primaryRecommendation).toBeDefined();
  const { modelId: chosenModelId } = recommendationResult.primaryRecommendation;

  // Step 2: open a request against the recommended model.
  const trackedRequestId = 'test-request-1';
  metricsCollector.startRequest(trackedRequestId, chosenModelId, prompt);

  // Step 3: let some wall-clock time pass, then report the completion payload.
  await pause(100);
  const completionPayload = {
    content: 'Mock secure payment implementation...',
    inputTokens: 500,
    outputTokens: 800,
    cost: 0.024,
    queryType: QueryType.SECURITY,
    userRating: 4,
  };
  metricsCollector.completeRequest(trackedRequestId, completionPayload);

  // Step 4: give event handlers a moment to run, then inspect the dashboard.
  await pause(50);
  const { models: liveModels, systemMetrics } = metricsCollector.getLiveDashboardData();
  const chosenModelEntry = liveModels.find((entry) => entry.modelId === chosenModelId);
  expect(chosenModelEntry).toBeDefined();
  expect(chosenModelEntry?.currentRps).toBeGreaterThanOrEqual(0);
  expect(systemMetrics.totalRequests).toBeGreaterThan(0);
});
it('should handle model comparison and selection based on real performance', async () => {
  // 1. Compare models for a specific use case
  const query = 'Write a creative marketing campaign for a new product';
  const modelIds = ['gemini-2.5-pro', 'gpt-4', 'claude-3-sonnet'];
  const comparison = UseCaseAnalyzer.compareModelsForUseCase(modelIds, query);
  expect(comparison).toHaveLength(3);

  // 2. Replay one request per model: two succeed, one fails.
  // Note: the `latency` field is descriptive only; nothing below passes it
  // to the collector.
  const performanceData = [
    { modelId: 'gpt-4', latency: 2500, cost: 0.045, success: true },
    { modelId: 'gemini-2.5-pro', latency: 3200, cost: 0.032, success: true },
    { modelId: 'claude-3-sonnet', latency: 2800, cost: 0.018, success: false },
  ];
  for (const [index, data] of performanceData.entries()) {
    // Request ids continue from 2, matching the original numbering.
    const requestId = `test-request-${index + 2}`;
    metricsCollector.startRequest(requestId, data.modelId, query);
    if (data.success) {
      metricsCollector.completeRequest(requestId, {
        content: 'Mock creative content...',
        inputTokens: 300,
        outputTokens: 600,
        cost: data.cost,
        queryType: QueryType.CREATIVE,
        userRating: 4,
      });
    } else {
      metricsCollector.failRequest(requestId, new Error('API Error'));
    }
  }

  // 3. Verify performance tracking
  await new Promise((resolve) => setTimeout(resolve, 100));
  const dashboardData = metricsCollector.getLiveDashboardData();
  const findModel = (modelId) => dashboardData.models.find((m) => m.modelId === modelId);

  // The failing model must be present AND not healthy. Without the explicit
  // presence check, `undefined?.status` is trivially "not 'healthy'" and the
  // assertion would pass even if the model were never tracked.
  const claudeModel = findModel('claude-3-sonnet');
  expect(claudeModel).toBeDefined();
  expect(claudeModel.status).not.toBe('healthy');

  // The successful model must be present with a perfect success rate.
  const gptModel = findModel('gpt-4');
  expect(gptModel).toBeDefined();
  expect(gptModel.successRate).toBe(1);
});
it('should adapt recommendations based on historical performance', () => {
  // Baseline: a code-optimisation prompt should be classified as CODE and
  // yield a positively scored primary recommendation.
  const optimisationPrompt = 'Optimize this Python algorithm for better performance';
  const baseline = UseCaseAnalyzer.getRecommendations(optimisationPrompt);
  expect(baseline.analysis.queryType).toBe(QueryType.CODE);
  expect(baseline.primaryRecommendation.score).toBeGreaterThan(0);

  // The recommended model must resolve to a characteristics record that
  // carries use-case data.
  const topPick = baseline.primaryRecommendation;
  const characteristics = ModelCharacteristicsService.getModelCharacteristics(topPick.modelId);
  expect(characteristics).toBeDefined();
  expect(characteristics?.useCases).toBeDefined();

  // Cost and latency estimates must both be positive.
  expect(topPick.estimatedCost).toBeGreaterThan(0);
  expect(topPick.estimatedLatency).toBeGreaterThan(0);

  // Some reasoning and an overall explanation must be supplied.
  expect(topPick.reasoning.length).toBeGreaterThan(0);
  expect(baseline.explanation).toBeDefined();
});
});
describe('Performance Tracking and Alerts', () => {
it('should generate cost alerts when limits are exceeded', async () => {
  const modelId = 'gpt-4';
  const dailyLimitUsd = 0.05;
  const expensiveRequestCount = 5;

  // Configure a deliberately low daily cost limit for the model.
  performanceTracker.setCostLimits(modelId, dailyLimitUsd);

  // Record whether a cost alert event is observed.
  // NOTE(review): this flag is written but never asserted. The original
  // author notes that the alert may not fire in tests due to timing, so
  // only cost tracking is verified below. TODO: assert once deterministic.
  let alertReceived = false;
  metricsCollector.on('metric_event', (event) => {
    if (event.type !== 'cost_alert') {
      return;
    }
    alertReceived = true;
  });

  // Five requests at $0.015 each add up to more than the $0.05 limit.
  const requestIds = Array.from({ length: expensiveRequestCount }, (_, index) => `cost-test-${index}`);
  for (const requestId of requestIds) {
    metricsCollector.startRequest(requestId, modelId, 'Expensive query');
    metricsCollector.completeRequest(requestId, {
      content: 'Expensive response...',
      inputTokens: 1000,
      outputTokens: 1500,
      cost: 0.015,
      queryType: QueryType.GENERAL,
    });
  }

  // Allow queued event processing to finish before reading usage.
  await new Promise((resolve) => setTimeout(resolve, 100));

  // Accumulated daily spend must exceed the configured limit.
  const { daily: dailySpend } = performanceTracker.getCurrentCostUsage(modelId);
  expect(dailySpend).toBeGreaterThan(0.05);
});
it('should track performance degradation', async () => {
  const modelId = 'gemini-2.5-pro';

  // Nominal latencies (ms) of a gradually degrading model. To keep the test
  // fast, each one is replayed as a real delay scaled down by this factor
  // (15, 20, 30, 45, 60 ms), so successive requests really do take longer.
  // Previously the values were ignored and every request waited a flat 10 ms.
  const latencies = [1500, 2000, 3000, 4500, 6000];
  const LATENCY_SCALE_DIVISOR = 100;

  for (const [index, nominalLatencyMs] of latencies.entries()) {
    const requestId = `perf-test-${index}`;
    metricsCollector.startRequest(requestId, modelId, 'Performance test');
    // Hold the request open for the scaled-down latency before completing it.
    const delayMs = nominalLatencyMs / LATENCY_SCALE_DIVISOR;
    await new Promise((resolve) => setTimeout(resolve, delayMs));
    metricsCollector.completeRequest(requestId, {
      content: 'Test response',
      inputTokens: 200,
      outputTokens: 300,
      cost: 0.008,
      queryType: QueryType.GENERAL,
    });
  }

  // Allow queued event processing to finish before reading the dashboard.
  await new Promise((resolve) => setTimeout(resolve, 100));

  // The model must appear on the dashboard with a positive average latency.
  const dashboardData = metricsCollector.getLiveDashboardData();
  const modelData = dashboardData.models.find((m) => m.modelId === modelId);
  expect(modelData).toBeDefined();
  expect(modelData?.averageLatency).toBeGreaterThan(0);
});
it('should provide real-time health status', async () => {
  const monitoredModels = ['gpt-4', 'gemini-2.5-pro', 'claude-3-sonnet'];

  // Run the health checks one after another and validate each report.
  for (const candidate of monitoredModels) {
    const report = await metricsCollector.performHealthCheck(candidate);
    expect(report.modelId).toBe(candidate);
    expect(report.latency).toBeGreaterThanOrEqual(0);
    expect(report.lastChecked).toBeInstanceOf(Date);
  }

  // Register three failed requests against a single model.
  const problematicModel = 'claude-3-sonnet';
  const failingRequestIds = [0, 1, 2].map((attempt) => `error-test-${attempt}`);
  for (const failingId of failingRequestIds) {
    metricsCollector.startRequest(failingId, problematicModel, 'Error test');
    metricsCollector.failRequest(failingId, new Error('Simulated API error'));
  }

  // Allow queued event processing to finish.
  await new Promise((resolve) => setTimeout(resolve, 100));

  // A health check after the failures must still produce a result; whether
  // it reports healthy depends on the collector's error threshold.
  const postFailureReport = await metricsCollector.performHealthCheck(problematicModel);
  expect(postFailureReport).toBeDefined();
});
});
describe('Use Case Analysis Integration', () => {
it('should classify complex multi-domain queries correctly', () => {
  // The prompt text is kept flush-left so the string content is unchanged.
  const multiDomainPrompt = `
Build a secure e-commerce platform using React and Node.js with the following requirements:
- Real-time inventory tracking
- PCI-compliant payment processing
- Machine learning recommendations
- High-performance search with Elasticsearch
- Docker containerization for deployment
`;
  const result = UseCaseAnalyzer.analyzeQuery(multiDomainPrompt);

  // Top-level classification.
  expect(result.queryType).toBe(QueryType.CODE);
  expect(result.complexity).toBe('complex');
  expect(result.domain).toBe('web-development');

  // Each framework named in the prompt must be detected.
  for (const framework of ['React', 'Node.js']) {
    expect(result.frameworks).toContain(framework);
  }

  // Each special requirement implied by the prompt must be detected.
  for (const requirement of ['real-time', 'secure', 'high-performance']) {
    expect(result.specialRequirements).toContain(requirement);
  }
});
it('should provide different recommendations for different complexity levels', () => {
  // One trivial prompt and one architecture-level prompt.
  const trivialPrompt = 'What is a variable in Python?';
  const architecturePrompt = 'Design a distributed microservices architecture with event sourcing';
  const trivialResult = UseCaseAnalyzer.getRecommendations(trivialPrompt);
  const architectureResult = UseCaseAnalyzer.getRecommendations(architecturePrompt);

  // Complexity must be classified at opposite ends of the scale.
  expect(trivialResult.analysis.complexity).toBe('simple');
  expect(architectureResult.analysis.complexity).toBe('complex');

  // Both prompts must yield a primary recommendation (the chosen model may
  // or may not differ between them).
  const trivialPick = trivialResult.primaryRecommendation;
  const architecturePick = architectureResult.primaryRecommendation;
  expect(trivialPick).toBeDefined();
  expect(architecturePick).toBeDefined();

  // Both recommendations must carry a positive cost estimate.
  expect(trivialPick.estimatedCost).toBeGreaterThan(0);
  expect(architecturePick.estimatedCost).toBeGreaterThan(0);
});
it('should respect budget constraints in recommendations', () => {
  const prompt = 'Write a comprehensive technical documentation';
  const tightBudget = 0.005;
  const generousBudget = 0.1;

  // Ask for recommendations under each budget, tight one first.
  const tightPick = UseCaseAnalyzer.getRecommendations(prompt, tightBudget).primaryRecommendation;
  const generousPick = UseCaseAnalyzer.getRecommendations(prompt, generousBudget).primaryRecommendation;

  // Each recommendation must stay within the budget it was given.
  expect(tightPick.estimatedCost).toBeLessThanOrEqual(0.005);
  expect(generousPick.estimatedCost).toBeLessThanOrEqual(0.1);

  // A larger budget must never produce a lower-scoring recommendation.
  expect(generousPick.score).toBeGreaterThanOrEqual(tightPick.score);
});
});
describe('Model Characteristics Data Integrity', () => {
it('should have consistent data across all models', () => {
  // Fields every model record must define, checked in this order.
  const requiredFields = [
    'id',
    'name',
    'provider',
    'capabilities',
    'strengths',
    'weaknesses',
    'performance',
    'cost',
    'useCases',
    'qualityScores',
    'constraints',
  ];
  // Query types every model must declare a use case for.
  const requiredQueryTypes = [
    QueryType.CODE,
    QueryType.CREATIVE,
    QueryType.ANALYSIS,
    QueryType.SECURITY,
    QueryType.GENERAL,
  ];

  const catalogue = ModelCharacteristicsService.getAllModels();
  expect(catalogue.length).toBeGreaterThan(0);

  for (const entry of catalogue) {
    // Presence of all required fields.
    for (const field of requiredFields) {
      expect(entry[field]).toBeDefined();
    }

    // Use-case coverage across all query types.
    const coveredTypes = entry.useCases.map((useCase) => useCase.queryType);
    for (const requiredType of requiredQueryTypes) {
      expect(coveredTypes).toContain(requiredType);
    }

    // Quality scores must fall in the interval (0, 1].
    for (const qualityScore of Object.values(entry.qualityScores)) {
      expect(qualityScore).toBeGreaterThan(0);
      expect(qualityScore).toBeLessThanOrEqual(1);
    }

    // Token costs must be positive; efficiency rank must fall in (0, 10].
    const { inputTokenCost, outputTokenCost } = entry.cost;
    expect(inputTokenCost).toBeGreaterThan(0);
    expect(outputTokenCost).toBeGreaterThan(0);
    expect(entry.cost.costEfficiencyRank).toBeGreaterThan(0);
    expect(entry.cost.costEfficiencyRank).toBeLessThanOrEqual(10);
  }
});
it('should provide consistent model ranking across different metrics', () => {
  // Efficiency figure used to verify the ordering of the ranking below.
  const efficiencyOf = (model) => model.qualityScores.overall / model.cost.costPerQualityPoint;

  const catalogue = ModelCharacteristicsService.getAllModels();

  // The best model for each query type must be reasonably suitable for it.
  for (const queryType of [QueryType.CODE, QueryType.CREATIVE, QueryType.ANALYSIS]) {
    const topModel = ModelCharacteristicsService.getBestModelForQueryType(queryType);
    expect(topModel).toBeDefined();
    if (!topModel) {
      continue;
    }
    const matchingUseCase = topModel.useCases.find((candidate) => candidate.queryType === queryType);
    expect(matchingUseCase).toBeDefined();
    expect(matchingUseCase?.suitabilityScore).toBeGreaterThan(0.5);
  }

  // The cost-efficiency ranking must include every known model...
  const ranking = ModelCharacteristicsService.rankModelsByCostEfficiency(QueryType.GENERAL);
  expect(ranking.length).toBe(catalogue.length);

  // ...and be ordered so that efficiency never increases from one entry to
  // the next.
  for (let position = 1; position < ranking.length; position++) {
    const earlierEfficiency = efficiencyOf(ranking[position - 1]);
    const laterEfficiency = efficiencyOf(ranking[position]);
    expect(earlierEfficiency).toBeGreaterThanOrEqual(laterEfficiency);
  }
});
});
});
//# sourceMappingURL=model-characteristics-integration.test.js.map