@hivetechs/hive-ai
Version:
Real-time streaming AI consensus platform with HTTP+SSE MCP integration for Claude Code, VS Code, Cursor, and Windsurf - powered by OpenRouter's unified API
315 lines (309 loc) • 13.3 kB
JavaScript
/**
* Confidence Scoring System for Consensus Pipeline
* Implements mathematical confidence calculation based on stage agreement and content quality
*/
/**
 * Heuristic confidence scorer for a four-stage consensus pipeline.
 *
 * Every score is derived from plain-text heuristics on the stage outputs
 * (word counts, keyword overlap, regex indicators) plus a static provider
 * table; no model calls or embeddings are involved.
 */
export class ConfidenceCalculator {
    /**
     * Calculate confidence for an individual stage.
     *
     * The score is the sum of a content-quality score and four bonuses,
     * capped at 0.98.
     *
     * @param {{output: string}} stageResult - Stage result; only `output` is read.
     * @param {number} [contextQuality=0.5] - Scaled by 0.2 and limited to 0-0.2.
     * @param {number} [providerReliability=0.8] - Scaled by 0.3 and limited to 0-0.3.
     * @returns {number} Confidence, at most 0.98.
     */
    calculateStageConfidence(stageResult, contextQuality = 0.5, providerReliability = 0.8) {
        const output = stageResult.output;
        // Base confidence from content analysis
        const contentScore = this.analyzeContentQuality(output);
        // Context utilization bonus (0-0.2); the lower clamp keeps a negative
        // input from turning the "bonus" into a penalty.
        const contextBonus = Math.max(0, Math.min(contextQuality * 0.2, 0.2));
        // Provider reliability factor (0-0.3), clamped the same way
        const reliabilityFactor = Math.max(0, Math.min(providerReliability * 0.3, 0.3));
        // Length and detail factor (0-0.1): saturates at 500 characters
        const detailFactor = Math.min((output.length / 500) * 0.1, 0.1);
        // Technical depth bonus (0-0.1)
        const technicalBonus = this.calculateTechnicalDepth(output) * 0.1;
        const totalScore = contentScore + contextBonus + reliabilityFactor + detailFactor + technicalBonus;
        // Cap at 98% to avoid overconfidence
        return Math.min(totalScore, 0.98);
    }

    /**
     * Calculate overall consensus confidence from all stages.
     *
     * @param {Array<{output: string, provider: string, confidence?: number}>} stageResults
     *   Exactly four stage results.
     * @returns {{content_quality: number, stage_agreement: number, provider_reliability: number,
     *   context_utilization: number, final_confidence: number, agreement_matrix: number[][]}}
     *   `stage_agreement` is the multiplier returned by calculateOverallAgreement
     *   (not a 0-1 ratio); `final_confidence` is capped at 0.98.
     * @throws {Error} When `stageResults` is not an array of exactly four entries.
     */
    calculateConsensusConfidence(stageResults) {
        if (!Array.isArray(stageResults) || stageResults.length !== 4) {
            throw new Error('Consensus requires exactly 4 stage results');
        }
        // A falsy stored confidence (missing, 0, NaN) is recomputed from the output.
        const stageConfidences = stageResults.map((stage) => stage.confidence || this.calculateStageConfidence(stage));
        // Pairwise similarity between all stage outputs
        const agreementMatrix = this.calculateAgreementMatrix(stageResults);
        // Weighted average with progressive weighting
        const weightedConfidence = this.calculateWeightedConfidence(stageConfidences);
        // Agreement multiplier derived from the matrix
        const agreementScore = this.calculateOverallAgreement(agreementMatrix);
        const providerReliability = this.calculateProviderReliability(stageResults);
        const contentQuality = this.calculateContentQualityProgression(stageResults);
        const contextUtilization = this.calculateContextUtilization(stageResults);
        // Final confidence: weighted confidence scaled by the agreement
        // multiplier and a fixed 1.1 factor, capped at 0.98.
        const finalConfidence = Math.min(weightedConfidence * agreementScore * 1.1, 0.98);
        return {
            content_quality: contentQuality,
            stage_agreement: agreementScore,
            provider_reliability: providerReliability,
            context_utilization: contextUtilization,
            final_confidence: finalConfidence,
            agreement_matrix: agreementMatrix
        };
    }

    /**
     * Count whitespace-separated words, ignoring leading/trailing whitespace.
     *
     * @param {string} text
     * @returns {number} Number of words; 0 for an empty or whitespace-only string.
     */
    countWords(text) {
        const trimmed = text.trim();
        return trimmed === '' ? 0 : trimmed.split(/\s+/).length;
    }

    /**
     * Analyze content quality based on length, structure, technical
     * vocabulary, completeness and clarity.
     *
     * @param {string} output - Stage output text.
     * @returns {number} Score starting from 0.5, adjusted in 0.1 steps, capped at 0.9.
     */
    analyzeContentQuality(output) {
        let score = 0.5; // Base score
        // Length factor (optimal range 100-800 words)
        const wordCount = this.countWords(output);
        if (wordCount >= 100 && wordCount <= 800) {
            score += 0.1;
        }
        else if (wordCount < 50) {
            score -= 0.1; // Penalty for too short
        }
        // Each indicator that fires is worth the same 0.1 bonus.
        const indicators = [
            this.hasGoodStructure(output),
            this.hasTechnicalContent(output),
            this.isComplete(output),
            this.hasGoodClarity(output)
        ];
        for (const passed of indicators) {
            if (passed) {
                score += 0.1;
            }
        }
        return Math.min(score, 0.9);
    }

    /**
     * Calculate the agreement matrix between all stages.
     *
     * @param {Array<{output: string}>} stageResults
     * @returns {number[][]} Square matrix; the diagonal is 1.0 and entry [i][j]
     *   is the similarity of outputs i and j.
     */
    calculateAgreementMatrix(stageResults) {
        const size = stageResults.length;
        // Start from all-ones so the diagonal (self-agreement) is already set.
        const matrix = Array.from({ length: size }, () => Array.from({ length: size }, () => 1.0));
        for (let i = 0; i < size; i++) {
            for (let j = i + 1; j < size; j++) {
                // The similarity is symmetric, so compute each pair once and mirror it.
                const similarity = this.calculateSemanticSimilarity(stageResults[i].output, stageResults[j].output);
                matrix[i][j] = similarity;
                matrix[j][i] = similarity;
            }
        }
        return matrix;
    }

    /**
     * Calculate similarity between two outputs from keyword overlap and
     * relative length (a simple approach; no embeddings).
     *
     * @param {string} output1
     * @param {string} output2
     * @returns {number} 0.7 * Jaccard(keywords) + 0.3 * length similarity, in
     *   the range 0-1; 0 when neither output yields any keyword.
     */
    calculateSemanticSimilarity(output1, output2) {
        // Compare *distinct* keywords: counting duplicates in the intersection
        // while de-duplicating the union would push the ratio above 1.
        const keywords1 = new Set(this.extractKeywords(output1.toLowerCase()));
        const keywords2 = new Set(this.extractKeywords(output2.toLowerCase()));
        let sharedCount = 0;
        for (const word of keywords1) {
            if (keywords2.has(word)) {
                sharedCount++;
            }
        }
        const unionCount = keywords1.size + keywords2.size - sharedCount;
        if (unionCount === 0)
            return 0;
        const jaccardSimilarity = sharedCount / unionCount;
        // Length similarity factor; at least one output is non-empty here
        // (it produced a keyword), so the divisor is never zero.
        const length1 = output1.length;
        const length2 = output2.length;
        const lengthSimilarity = 1 - Math.abs(length1 - length2) / Math.max(length1, length2);
        // Combined similarity score
        return (jaccardSimilarity * 0.7) + (lengthSimilarity * 0.3);
    }

    /**
     * Calculate weighted confidence with progressive stage weighting.
     *
     * @param {number[]} confidences - Four confidences in pipeline order.
     * @returns {number} Weighted average, or 0 when the input does not have
     *   exactly four entries.
     */
    calculateWeightedConfidence(confidences) {
        if (confidences.length !== 4)
            return 0;
        // Progressive weighting: later stages have more influence
        const weights = [0.15, 0.25, 0.30, 0.30]; // Generator, Refiner, Validator, Curator
        let weightedSum = 0;
        let totalWeight = 0;
        weights.forEach((weight, index) => {
            weightedSum += confidences[index] * weight;
            totalWeight += weight;
        });
        return weightedSum / totalWeight;
    }

    /**
     * Turn the agreement matrix into a confidence multiplier.
     *
     * @param {number[][]} matrix - Agreement matrix; only entries above the
     *   diagonal are read.
     * @returns {number} 1.1 when the average pairwise agreement is above 0.8,
     *   1.05 above 0.6, 0.9 below 0.3, otherwise 1.0; 0.5 when the matrix has
     *   no off-diagonal pairs.
     */
    calculateOverallAgreement(matrix) {
        let totalAgreement = 0;
        let comparisons = 0;
        // Average agreement excluding self-agreements (upper triangle only)
        for (let i = 0; i < matrix.length; i++) {
            for (let j = i + 1; j < matrix[i].length; j++) {
                totalAgreement += matrix[i][j];
                comparisons++;
            }
        }
        if (comparisons === 0)
            return 0.5;
        const averageAgreement = totalAgreement / comparisons;
        // Agreement bonus: high agreement increases confidence
        if (averageAgreement > 0.8)
            return 1.1; // 10% bonus
        if (averageAgreement > 0.6)
            return 1.05; // 5% bonus
        if (averageAgreement < 0.3)
            return 0.9; // 10% penalty
        return 1.0; // Neutral
    }

    /**
     * Calculate the average provider reliability score from a static table.
     *
     * @param {Array<{provider?: string}>} stageResults
     * @returns {number} Mean table score over all stages. Providers that are
     *   missing or not in the table score the default 0.7; an empty list also
     *   yields the default.
     */
    calculateProviderReliability(stageResults) {
        // Provider reliability mapping (can be enhanced with real performance data)
        const providerScores = {
            'openai': 0.9,
            'anthropic': 0.9,
            'google': 0.85,
            'mistral': 0.8,
            'groq': 0.8,
            'default': 0.7
        };
        if (stageResults.length === 0) {
            return providerScores['default']; // avoid 0/0
        }
        let totalScore = 0;
        for (const stage of stageResults) {
            const provider = typeof stage.provider === 'string' ? stage.provider.toLowerCase() : 'default';
            // Own-property check: names such as "constructor" must not resolve
            // to members inherited from Object.prototype.
            const isKnown = Object.prototype.hasOwnProperty.call(providerScores, provider);
            totalScore += isKnown ? providerScores[provider] : providerScores['default'];
        }
        return totalScore / stageResults.length;
    }

    /**
     * Calculate content quality progression through stages.
     *
     * @param {Array<{output: string}>} stageResults
     * @returns {number} 0.5 plus up to 0.3, proportional to the share of
     *   consecutive stage transitions whose content-quality score increased;
     *   0.5 when there are fewer than two stages.
     */
    calculateContentQualityProgression(stageResults) {
        const baseScore = 0.5;
        // With fewer than two stages there is no transition to measure
        // (and the ratio below would divide by zero).
        if (stageResults.length < 2) {
            return baseScore;
        }
        const qualityScores = stageResults.map((stage) => this.analyzeContentQuality(stage.output));
        // Progressive improvement bonus
        let improvements = 0;
        for (let i = 1; i < qualityScores.length; i++) {
            if (qualityScores[i] > qualityScores[i - 1])
                improvements++;
        }
        const progressionScore = baseScore + (improvements / (qualityScores.length - 1)) * 0.3;
        return Math.min(progressionScore, 1.0);
    }

    /**
     * Estimate context utilization from the technical depth of each output.
     *
     * @param {Array<{output: string}>} stageResults
     * @returns {number} (0.5 + sum of 0.1 * technical depth) / stage count,
     *   capped at 1.0.
     */
    calculateContextUtilization(stageResults) {
        // This would be enhanced with actual context tracking.
        // NOTE(review): the 0.5 base is divided by the stage count together
        // with the per-stage bonuses, so four stages can never score above
        // 0.225. Confirm whether the base was meant to stay outside the
        // average before changing the formula.
        let utilizationScore = 0.5;
        for (const stage of stageResults) {
            const technicalDepth = this.calculateTechnicalDepth(stage.output);
            utilizationScore += technicalDepth * 0.1;
        }
        return Math.min(utilizationScore / stageResults.length, 1.0);
    }

    /**
     * Check for structural markers: a blank line, or a line starting with a
     * markdown header, a `* ` bullet, or a numbered-list item.
     *
     * @param {string} output
     * @returns {boolean}
     */
    hasGoodStructure(output) {
        return /(?:\n\n|\n#{1,6}\s|\n\*\s|\n\d+\.)/.test(output);
    }

    /**
     * Check whether the text mentions any of a fixed list of technical terms
     * (case-insensitive substring match).
     *
     * @param {string} output
     * @returns {boolean}
     */
    hasTechnicalContent(output) {
        const technicalIndicators = /(?:algorithm|function|class|method|API|database|framework|library|implementation)/i;
        return technicalIndicators.test(output);
    }

    /**
     * Check if the response seems complete: it ends with `.` or `!`, or it
     * contains a concluding word anywhere in the text.
     *
     * @param {string} output
     * @returns {boolean}
     */
    isComplete(output) {
        const trimmed = output.trim();
        if (trimmed.endsWith('.') || trimmed.endsWith('!')) {
            return true;
        }
        return /(?:conclusion|summary|finally|in summary)/i.test(output);
    }

    /**
     * Check that filler words make up less than 2% of the words.
     *
     * @param {string} output
     * @returns {boolean}
     */
    hasGoodClarity(output) {
        // Word boundaries are required: without them "um" matches inside
        // "document" or "number" and "like" inside "likely".
        const fillerWords = /\b(?:um|uh|like|you know|actually|basically)\b/gi;
        const fillerCount = (output.match(fillerWords) || []).length;
        // An empty text has no fillers; use 1 as the divisor to avoid 0/0.
        const wordCount = Math.max(this.countWords(output), 1);
        return (fillerCount / wordCount) < 0.02; // Less than 2% filler words
    }

    /**
     * Score technical depth by how many terms from a fixed vocabulary appear
     * as whole words.
     *
     * @param {string} output
     * @returns {number} 0.1 per distinct term found, capped at 1.0.
     */
    calculateTechnicalDepth(output) {
        const technicalTerms = [
            'algorithm', 'implementation', 'architecture', 'framework', 'methodology',
            'optimization', 'scalability', 'performance', 'efficiency', 'complexity'
        ];
        // Replace punctuation with spaces before splitting so that
        // "algorithm," and "performance." still count as their term.
        const tokens = new Set(output.toLowerCase().replace(/[^\w\s]/g, ' ').split(/\s+/));
        let technicalScore = 0;
        for (const term of technicalTerms) {
            if (tokens.has(term))
                technicalScore += 0.1;
        }
        return Math.min(technicalScore, 1.0);
    }

    /**
     * Simple keyword extraction: lower-case the text, replace punctuation with
     * spaces, drop stop words and words of two characters or fewer, and keep
     * the first 20 remaining words in order of appearance (duplicates kept).
     *
     * @param {string} text
     * @returns {string[]}
     */
    extractKeywords(text) {
        const stopWords = new Set([
            'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
            'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'have',
            'has', 'had', 'do', 'does', 'did', 'will', 'would', 'could', 'should'
        ]);
        const tokens = text
            .toLowerCase()
            .replace(/[^\w\s]/g, ' ')
            .split(/\s+/);
        const keywords = tokens.filter((word) => word.length > 2 && !stopWords.has(word));
        return keywords.slice(0, 20);
    }

    /**
     * Generate a human-readable confidence report for debugging and transparency.
     *
     * @param {object} metrics - Result of calculateConsensusConfidence.
     * @param {Array<{stage_name: string, provider: string, model: string, confidence?: number}>} stageResults
     *   Stages to list; a falsy `confidence` is displayed as 50.0%.
     * @returns {string} Multi-line report text.
     */
    generateConfidenceReport(metrics, stageResults) {
        const asPercent = (value) => (value * 100).toFixed(1);
        const stageLines = stageResults.map((stage, i) => `• ${stage.stage_name}: ${((stage.confidence || 0.5) * 100).toFixed(1)}% (${stage.provider}:${stage.model})`).join('\n');
        let verdict;
        if (metrics.final_confidence > 0.8) {
            verdict = '✅ HIGH CONFIDENCE - Result is highly reliable';
        }
        else if (metrics.final_confidence > 0.6) {
            verdict = '⚠️ MODERATE CONFIDENCE - Result is reasonably reliable';
        }
        else {
            verdict = '❌ LOW CONFIDENCE - Result should be verified';
        }
        // The template content is kept flush-left on purpose: any indentation
        // here would become part of the report text.
        const report = `
🎯 CONSENSUS CONFIDENCE REPORT
═══════════════════════════════════════
📊 OVERALL CONFIDENCE: ${asPercent(metrics.final_confidence)}%
📈 COMPONENT SCORES:
• Content Quality: ${asPercent(metrics.content_quality)}%
• Stage Agreement: ${asPercent(metrics.stage_agreement)}%
• Provider Reliability: ${asPercent(metrics.provider_reliability)}%
• Context Utilization: ${asPercent(metrics.context_utilization)}%
🔄 STAGE ANALYSIS:
${stageLines}
🤝 AGREEMENT MATRIX:
${this.formatAgreementMatrix(metrics.agreement_matrix)}
💡 CONFIDENCE FACTORS:
• High agreement between stages boosts confidence
• Technical depth and structure improve quality scores
• Provider diversity enhances reliability
• Progressive improvement through stages is positive
${verdict}
`;
        return report;
    }

    /**
     * Render the agreement matrix as text: a header row of stage
     * abbreviations, then one row per stage with each value shown as a
     * whole-number percentage padded to three characters.
     *
     * @param {number[][]} matrix
     * @returns {string}
     */
    formatAgreementMatrix(matrix) {
        const stages = ['Gen', 'Ref', 'Val', 'Cur'];
        let formatted = ' ' + stages.join(' ') + '\n';
        matrix.forEach((row, i) => {
            formatted += stages[i] + ' ';
            for (const value of row) {
                formatted += (value * 100).toFixed(0).padStart(3) + ' ';
            }
            formatted += '\n';
        });
        return formatted;
    }
}
// Shared module-level instance, created once when this module is first
// imported, so importers can use the calculator without constructing one.
export const confidenceCalculator = new ConfidenceCalculator();
//# sourceMappingURL=confidence-calculator.js.map