UNPKG

@quantumai/quantum-cli-core

Version:

Quantum CLI Core - Multi-LLM Collaboration System

250 lines 14.6 kB
/**
 * @license
 * Copyright 2025 Google LLC
 * SPDX-License-Identifier: Apache-2.0
 */
import { describe, it, expect } from 'vitest';
import {
  ModelCharacteristicsService,
  MODEL_CHARACTERISTICS,
  ModelCapability,
  ModelStrength,
  QueryType,
} from './model-characteristics.js';

/**
 * Tests for ModelCharacteristicsService and the MODEL_CHARACTERISTICS table.
 *
 * The first suite exercises the service's lookup, ranking, tiering and
 * comparison helpers; the second suite sanity-checks the static model data
 * (latency percentiles, cost/quality ratios, capability/strength pairing).
 */
describe('ModelCharacteristicsService', () => {
  describe('getModelCharacteristics', () => {
    it('should return characteristics for existing models', () => {
      const geminiChars =
        ModelCharacteristicsService.getModelCharacteristics('gemini-2.5-pro');
      expect(geminiChars).toBeDefined();
      expect(geminiChars?.id).toBe('gemini-2.5-pro');
      expect(geminiChars?.provider).toBe('google');

      const gptChars =
        ModelCharacteristicsService.getModelCharacteristics('gpt-4');
      expect(gptChars).toBeDefined();
      expect(gptChars?.id).toBe('gpt-4');
      expect(gptChars?.provider).toBe('openai');
    });

    it('should return undefined for non-existent models', () => {
      const chars =
        ModelCharacteristicsService.getModelCharacteristics('non-existent-model');
      expect(chars).toBeUndefined();
    });
  });

  describe('getAllModels', () => {
    it('should return all available models', () => {
      const models = ModelCharacteristicsService.getAllModels();
      expect(models).toHaveLength(3); // Gemini, GPT-4, Claude
      expect(models.map((m) => m.id)).toContain('gemini-2.5-pro');
      expect(models.map((m) => m.id)).toContain('gpt-4');
      expect(models.map((m) => m.id)).toContain('claude-3-sonnet');
    });
  });

  describe('getBestModelForQueryType', () => {
    it('should recommend best model for code queries', () => {
      const bestModel = ModelCharacteristicsService.getBestModelForQueryType(
        QueryType.CODE,
      );
      expect(bestModel).toBeDefined();
      expect(bestModel?.id).toBe('gemini-2.5-pro'); // Based on high code suitability score
    });

    it('should recommend best model for creative queries', () => {
      const bestModel = ModelCharacteristicsService.getBestModelForQueryType(
        QueryType.CREATIVE,
      );
      expect(bestModel).toBeDefined();
      expect(bestModel?.id).toBe('gpt-4'); // GPT-4 has highest creative score
    });

    it('should respect cost constraints', () => {
      const lowCostModel = ModelCharacteristicsService.getBestModelForQueryType(
        QueryType.GENERAL,
        0.01,
      );
      expect(lowCostModel).toBeDefined();
      // Should exclude very expensive models
      expect(lowCostModel?.id).not.toBe('gpt-4');
    });

    it('should return undefined if no models meet cost requirements', () => {
      const model = ModelCharacteristicsService.getBestModelForQueryType(
        QueryType.GENERAL,
        0.001,
      );
      expect(model).toBeUndefined();
    });
  });

  describe('rankModelsByCostEfficiency', () => {
    it('should rank models by cost efficiency for analysis tasks', () => {
      const rankedModels = ModelCharacteristicsService.rankModelsByCostEfficiency(
        QueryType.ANALYSIS,
      );
      expect(rankedModels).toHaveLength(3);

      // Verify that ranking considers both quality and cost
      const first = rankedModels[0];
      const last = rankedModels[rankedModels.length - 1];
      const firstUseCase = first.useCases.find(
        (uc) => uc.queryType === QueryType.ANALYSIS,
      );
      const lastUseCase = last.useCases.find(
        (uc) => uc.queryType === QueryType.ANALYSIS,
      );
      // Fall back to the overall quality score when a model has no
      // ANALYSIS-specific use case entry.
      const firstQuality = firstUseCase
        ? firstUseCase.suitabilityScore
        : first.qualityScores.overall;
      const lastQuality = lastUseCase
        ? lastUseCase.suitabilityScore
        : last.qualityScores.overall;
      const firstEfficiency = firstQuality / first.cost.costPerQualityPoint;
      const lastEfficiency = lastQuality / last.cost.costPerQualityPoint;
      expect(firstEfficiency).toBeGreaterThanOrEqual(lastEfficiency);
    });
  });

  describe('getModelsByPerformanceTier', () => {
    it('should return fast models for fast tier', () => {
      const fastModels =
        ModelCharacteristicsService.getModelsByPerformanceTier('fast');
      fastModels.forEach((model) => {
        expect(model.performance.averageLatency).toBeLessThan(2000);
      });
      // Should be sorted by latency
      for (let i = 0; i < fastModels.length - 1; i++) {
        expect(fastModels[i].performance.averageLatency).toBeLessThanOrEqual(
          fastModels[i + 1].performance.averageLatency,
        );
      }
    });

    it('should return balanced models for balanced tier', () => {
      const balancedModels =
        ModelCharacteristicsService.getModelsByPerformanceTier('balanced');
      balancedModels.forEach((model) => {
        expect(model.performance.averageLatency).toBeLessThan(3500);
        expect(model.qualityScores.overall).toBeGreaterThanOrEqual(0.85);
      });
    });

    it('should return high-quality models for quality tier', () => {
      const qualityModels =
        ModelCharacteristicsService.getModelsByPerformanceTier('quality');
      qualityModels.forEach((model) => {
        expect(model.qualityScores.overall).toBeGreaterThanOrEqual(0.88);
      });
      // Should be sorted by quality score (descending)
      for (let i = 0; i < qualityModels.length - 1; i++) {
        expect(qualityModels[i].qualityScores.overall).toBeGreaterThanOrEqual(
          qualityModels[i + 1].qualityScores.overall,
        );
      }
    });
  });

  describe('compareModels', () => {
    it('should compare two existing models', () => {
      const comparison = ModelCharacteristicsService.compareModels(
        'gemini-2.5-pro',
        'gpt-4',
      );
      expect(comparison).toBeDefined();
      expect(comparison?.models).toHaveLength(2);
      expect(comparison?.comparison.quality.winner).toBeDefined();
      expect(comparison?.comparison.speed.winner).toBeDefined();
      expect(comparison?.comparison.cost.winner).toBeDefined();
      expect(comparison?.recommendation).toBeDefined();
    });

    it('should return undefined for non-existent models', () => {
      const comparison = ModelCharacteristicsService.compareModels(
        'non-existent',
        'gpt-4',
      );
      expect(comparison).toBeUndefined();
    });

    it('should provide meaningful recommendations', () => {
      const comparison = ModelCharacteristicsService.compareModels(
        'gemini-2.5-pro',
        'claude-3-sonnet',
      );
      // Check the type first, then require actual text. The previous
      // `toContain('')` check matched every string, including an empty one,
      // so it could never fail on a missing recommendation message.
      expect(typeof comparison?.recommendation).toBe('string');
      expect(comparison?.recommendation).not.toBe('');
    });
  });

  describe('Model Characteristics Validation', () => {
    it('should have valid characteristics for all models', () => {
      Object.values(MODEL_CHARACTERISTICS).forEach((model) => {
        // Basic structure validation
        expect(model.id).toBeDefined();
        expect(model.name).toBeDefined();
        expect(model.provider).toBeDefined();

        // Capabilities validation
        expect(Array.isArray(model.capabilities)).toBe(true);
        expect(model.capabilities.length).toBeGreaterThan(0);

        // Strengths and weaknesses validation
        expect(Array.isArray(model.strengths)).toBe(true);
        expect(Array.isArray(model.weaknesses)).toBe(true);

        // Performance metrics validation
        expect(model.performance.averageLatency).toBeGreaterThan(0);
        expect(model.performance.accuracy).toBeGreaterThan(0);
        expect(model.performance.accuracy).toBeLessThanOrEqual(1);

        // Cost validation
        expect(model.cost.inputTokenCost).toBeGreaterThan(0);
        expect(model.cost.outputTokenCost).toBeGreaterThan(0);
        expect(model.cost.costEfficiencyRank).toBeGreaterThan(0);
        expect(model.cost.costEfficiencyRank).toBeLessThanOrEqual(10);

        // Use cases validation
        expect(Array.isArray(model.useCases)).toBe(true);
        model.useCases.forEach((useCase) => {
          expect(useCase.suitabilityScore).toBeGreaterThan(0);
          expect(useCase.suitabilityScore).toBeLessThanOrEqual(1);
          expect(useCase.confidence).toBeGreaterThan(0);
          expect(useCase.confidence).toBeLessThanOrEqual(1);
          expect(Array.isArray(useCase.reasoning)).toBe(true);
        });

        // Quality scores validation
        Object.values(model.qualityScores).forEach((score) => {
          expect(score).toBeGreaterThan(0);
          expect(score).toBeLessThanOrEqual(1);
        });

        // Constraints validation
        expect(model.constraints.maxTokens).toBeGreaterThan(0);
        expect(model.constraints.rateLimits.requestsPerMinute).toBeGreaterThan(0);
      });
    });

    it('should have all query types covered in use cases', () => {
      Object.values(MODEL_CHARACTERISTICS).forEach((model) => {
        const queryTypes = model.useCases.map((uc) => uc.queryType);
        // Each model should have ratings for all query types
        expect(queryTypes).toContain(QueryType.CODE);
        expect(queryTypes).toContain(QueryType.CREATIVE);
        expect(queryTypes).toContain(QueryType.ANALYSIS);
        expect(queryTypes).toContain(QueryType.SECURITY);
        expect(queryTypes).toContain(QueryType.GENERAL);
      });
    });

    it('should have logical relationships between strengths and use case scores', () => {
      const gptModel = MODEL_CHARACTERISTICS['gpt-4'];
      // GPT-4 has CREATIVE_TASKS as strength, should score high on creative queries
      expect(gptModel.strengths).toContain(ModelStrength.CREATIVE_TASKS);
      const creativeUseCase = gptModel.useCases.find(
        (uc) => uc.queryType === QueryType.CREATIVE,
      );
      expect(creativeUseCase?.suitabilityScore).toBeGreaterThan(0.9);

      const geminiModel = MODEL_CHARACTERISTICS['gemini-2.5-pro'];
      // Gemini has CODE_UNDERSTANDING as strength, should score high on code queries
      expect(geminiModel.strengths).toContain(ModelStrength.CODE_UNDERSTANDING);
      const codeUseCase = geminiModel.useCases.find(
        (uc) => uc.queryType === QueryType.CODE,
      );
      expect(codeUseCase?.suitabilityScore).toBeGreaterThan(0.9);
    });
  });

  describe('Performance Tier Logic', () => {
    it('should correctly categorize models by speed', () => {
      const fastModels =
        ModelCharacteristicsService.getModelsByPerformanceTier('fast');
      const balancedModels =
        ModelCharacteristicsService.getModelsByPerformanceTier('balanced');

      // Only the fast tier's own latency ceiling is asserted. A cross-tier
      // comparison (slowest fast vs. fastest balanced) is deliberately left
      // out because it might not always hold.
      if (fastModels.length > 0 && balancedModels.length > 0) {
        const slowestFast = Math.max(
          ...fastModels.map((m) => m.performance.averageLatency),
        );
        expect(slowestFast).toBeLessThan(2000); // By definition of fast tier
      }
    });
  });
});

describe('Model Characteristics Data Quality', () => {
  it('should have realistic performance metrics', () => {
    Object.values(MODEL_CHARACTERISTICS).forEach((model) => {
      // Latency should be reasonable (not 0, not > 30 seconds)
      expect(model.performance.averageLatency).toBeGreaterThan(100);
      expect(model.performance.averageLatency).toBeLessThan(30000);

      // P95 should be higher than average
      expect(model.performance.p95Latency).toBeGreaterThan(
        model.performance.averageLatency,
      );
      // P99 should be higher than P95
      expect(model.performance.p99Latency).toBeGreaterThan(
        model.performance.p95Latency,
      );

      // Tokens per second should be reasonable
      expect(model.performance.tokensPerSecond).toBeGreaterThan(1);
      expect(model.performance.tokensPerSecond).toBeLessThan(1000);

      // Error rates should be low
      expect(model.performance.errorRate).toBeGreaterThanOrEqual(0);
      expect(model.performance.errorRate).toBeLessThan(0.1); // Less than 10%
    });
  });

  it('should have consistent cost-quality relationships', () => {
    const models = Object.values(MODEL_CHARACTERISTICS);
    // Generally, higher quality should correlate with higher cost per quality point
    // (though this isn't always true due to efficiency differences)
    models.forEach((model) => {
      const qualityToCostRatio =
        model.qualityScores.overall / model.cost.costPerQualityPoint;
      expect(qualityToCostRatio).toBeGreaterThan(0);
    });
  });

  it('should have logical capability-strength mappings', () => {
    const codeRelatedStrengths = [
      ModelStrength.CODE_UNDERSTANDING,
      ModelStrength.PROBLEM_SOLVING,
    ];
    Object.values(MODEL_CHARACTERISTICS).forEach((model) => {
      // If a model has CODE_GENERATION capability, it should have some code-related strength
      if (model.capabilities.includes(ModelCapability.CODE_GENERATION)) {
        const hasCodeStrength = model.strengths.some((strength) =>
          codeRelatedStrengths.includes(strength),
        );
        expect(hasCodeStrength).toBe(true);
      }
      // CREATIVE_WRITING capability is intentionally not tied to the
      // CREATIVE_TASKS strength: not all models with creative capability
      // excel at it, so nothing is asserted for that pairing.
    });
  });
});
//# sourceMappingURL=model-characteristics.test.js.map