UNPKG

@stackmemoryai/stackmemory

Version:

Lossless, project-scoped memory for AI coding tools. Durable context across sessions with 56 MCP tools, FTS5 search, conductor orchestrator, loop/watch monitoring, snapshot capture, pre-flight overlap checks, Claude/Codex/OpenCode wrappers, Linear sync, a

252 lines (251 loc) 8.95 kB
import { fileURLToPath as __fileURLToPath } from 'url';
import { dirname as __pathDirname } from 'path';
const __filename = __fileURLToPath(import.meta.url);
const __dirname = __pathDirname(__filename);
import { BaseVerifier } from "./base-verifier.js";
import { logger } from "../../core/monitoring/logger.js";

/**
 * Semantic "judge" verifier that scores a proposed change against the
 * original prompt and vetoes it when the weighted score is too low.
 *
 * In the code visible in this file no model is actually called: every
 * criterion is scored with keyword/regex heuristics, and several fallbacks add
 * `Math.random()` jitter, so scores are not reproducible between runs.
 *
 * NOTE(review): `createResult` and `withTimeout` are inherited from
 * `BaseVerifier` and are not visible here; their exact contracts should be
 * confirmed against that class.
 */
class LLMJudge extends BaseVerifier {
  // Spotify reports ~75% pass rate (25% veto rate)
  VETO_THRESHOLD = 0.7;

  // Relative weight of each criterion in the overall score (sums to 1).
  CRITERIA_WEIGHTS = {
    adherenceToRequirements: 0.3,
    codeQuality: 0.2,
    safety: 0.2,
    completeness: 0.15,
    semanticCorrectness: 0.15
  };

  /**
   * @param {object} [config] - Overrides spread on top of the defaults below.
   */
  constructor(config) {
    super({
      id: "llm-judge",
      name: "LLM Semantic Judge",
      type: "semantic",
      enabled: true,
      stopOnError: true, // Stop on semantic failures (Spotify pattern)
      timeout: 3e4,
      ...config
    });
  }

  /**
   * This verifier is always active, regardless of context.
   *
   * @returns {boolean} Always `true`.
   */
  shouldActivate(_context) {
    return true;
  }

  /**
   * Judge `input` against the prompt carried by `context`.
   *
   * @param {*} input - Proposed changes; converted with `toString()`.
   * @param {object} context - Must carry `originalPrompt`; may carry
   *   `acceptanceCriteria` and `previousFeedback` (arrays of strings).
   * @returns {Promise<*>} Whatever `createResult` produces. Failures inside
   *   the judgement are logged and converted into a failed result rather than
   *   thrown.
   */
  async verify(input, context) {
    const llmContext = context;
    // A missing context is treated like a missing prompt instead of raising a
    // TypeError before the try/catch below can handle it.
    if (!llmContext?.originalPrompt) {
      return this.createResult(
        false,
        "Missing original prompt for semantic validation",
        "error"
      );
    }
    try {
      return await this.withTimeout(async () => {
        const judgement = await this.performJudgement(
          input.toString(),
          llmContext
        );
        return this.createJudgementResult(judgement, llmContext);
      });
    } catch (error) {
      logger.error(
        "LLM Judge failed",
        error instanceof Error ? error : void 0
      );
      return this.createResult(
        false,
        `Semantic validation error: ${error instanceof Error ? error.message : String(error)}`,
        "error"
      );
    }
  }

  /**
   * Score every criterion and log the evaluation.
   *
   * @param {string} proposedChanges
   * @param {object} context
   * @returns {Promise<object>} Per-criterion scores keyed like
   *   `CRITERIA_WEIGHTS`.
   */
  async performJudgement(proposedChanges, context) {
    const criteria = {
      adherenceToRequirements: this.evaluateAdherence(proposedChanges, context),
      codeQuality: this.evaluateCodeQuality(proposedChanges),
      safety: this.evaluateSafety(proposedChanges),
      completeness: this.evaluateCompleteness(proposedChanges, context),
      semanticCorrectness: this.evaluateSemanticCorrectness(
        proposedChanges,
        context
      )
    };
    // Compute once; the same value feeds both logged fields.
    const overallScore = this.calculateOverallScore(criteria);
    logger.info("LLM Judge evaluation", {
      criteria,
      overallScore,
      willVeto: overallScore < this.VETO_THRESHOLD
    });
    return criteria;
  }

  /**
   * Score how well the changes follow the prompt and acceptance criteria.
   *
   * With a non-empty `acceptanceCriteria` list the score is the fraction of
   * criteria met (plus 0.2, capped at 1, when a prompt keyword appears in the
   * changes). Otherwise a randomised score is returned: 0.75-0.95 when a
   * prompt keyword is found, 0.4-0.7 when not.
   *
   * @param {string} proposedChanges
   * @param {object} context
   * @returns {number}
   */
  evaluateAdherence(proposedChanges, context) {
    const lowerChanges = proposedChanges.toLowerCase();
    const hasKeywords = context.originalPrompt
      .toLowerCase()
      .split(" ")
      .filter((word) => word.length > 4)
      .some((keyword) => lowerChanges.includes(keyword));

    const acceptanceCriteria = context.acceptanceCriteria;
    // An empty list is treated as "no criteria": dividing by its length would
    // yield NaN and poison the overall score.
    if (Array.isArray(acceptanceCriteria) && acceptanceCriteria.length > 0) {
      const metCriteria = acceptanceCriteria.filter(
        (criterion) => this.checkCriterion(criterion, proposedChanges)
      ).length;
      const criteriaScore = metCriteria / acceptanceCriteria.length;
      return hasKeywords ? Math.min(1, criteriaScore + 0.2) : criteriaScore;
    }
    return hasKeywords ? 0.75 + Math.random() * 0.2 : 0.4 + Math.random() * 0.3;
  }

  /**
   * Score code quality from five substring/regex indicators.
   *
   * @param {string} proposedChanges
   * @returns {number} Fraction of indicators satisfied plus 0.2, capped at 1.
   */
  evaluateCodeQuality(proposedChanges) {
    const indicators = {
      hasComments: proposedChanges.includes("//") || proposedChanges.includes("/*"),
      hasErrorHandling: proposedChanges.includes("try") || proposedChanges.includes("catch"),
      hasTests: proposedChanges.includes("test") || proposedChanges.includes("expect"),
      // No extremely long names
      properNaming: !/[a-z]{20,}|[A-Z]{10,}/.test(proposedChanges),
      hasReasonableLength: proposedChanges.length > 50 && proposedChanges.length < 5e3
    };
    const flags = Object.values(indicators);
    const score = flags.filter(Boolean).length / flags.length;
    return Math.min(1, score + 0.2);
  }

  /**
   * Score safety by scanning for a fixed list of dangerous patterns.
   *
   * @param {string} proposedChanges
   * @returns {number} 0.3 when any pattern matches, otherwise 0.9-1.0.
   */
  evaluateSafety(proposedChanges) {
    const dangerousPatterns = [
      /eval\s*\(/,
      /exec\s*\(/,
      /rm\s+-rf/,
      /DROP\s+TABLE/i,
      /DELETE\s+FROM/i,
      /<script>/i,
      /document\.write/,
      /innerHTML\s*=/
    ];
    const hasDangerousPattern = dangerousPatterns.some(
      (pattern) => pattern.test(proposedChanges)
    );
    return hasDangerousPattern ? 0.3 : 0.9 + Math.random() * 0.1;
  }

  /**
   * Score completeness.
   *
   * With a non-empty `acceptanceCriteria` list the score is the fraction of
   * criteria whose first space-separated word appears in the changes.
   * Otherwise it is 0.8-0.95 when the changes are longer than 100 characters
   * and mention a definition keyword, and 0.4 when not.
   *
   * @param {string} proposedChanges
   * @param {object} context
   * @returns {number}
   */
  evaluateCompleteness(proposedChanges, context) {
    const acceptanceCriteria = context.acceptanceCriteria;
    // Same empty-list guard as evaluateAdherence: avoid 0 / 0 = NaN.
    if (Array.isArray(acceptanceCriteria) && acceptanceCriteria.length > 0) {
      const lowerChanges = proposedChanges.toLowerCase();
      const addressedCriteria = acceptanceCriteria.filter(
        (criterion) => lowerChanges.includes(criterion.toLowerCase().split(" ")[0])
      ).length;
      return addressedCriteria / acceptanceCriteria.length;
    }
    const hasImplementation = proposedChanges.length > 100;
    const hasStructure =
      proposedChanges.includes("function") ||
      proposedChanges.includes("class") ||
      proposedChanges.includes("const") ||
      proposedChanges.includes("def");
    return hasImplementation && hasStructure ? 0.8 + Math.random() * 0.15 : 0.4;
  }

  /**
   * Score semantic correctness from how much previous feedback is addressed.
   *
   * A feedback item counts as addressed when any of its words longer than
   * four characters appears in the changes. With at least one item addressed
   * the score is 0.7 plus up to 0.3 in proportion; otherwise 0.65-0.9 random.
   *
   * @param {string} proposedChanges
   * @param {object} context
   * @returns {number}
   */
  evaluateSemanticCorrectness(proposedChanges, context) {
    const previousFeedback = context.previousFeedback;
    if (previousFeedback && previousFeedback.length > 0) {
      const lowerChanges = proposedChanges.toLowerCase();
      const addressedFeedback = previousFeedback.filter((feedback) => {
        const keywords = feedback.toLowerCase().split(" ").filter((w) => w.length > 4);
        return keywords.some((keyword) => lowerChanges.includes(keyword));
      }).length;
      if (addressedFeedback > 0) {
        return Math.min(
          1,
          0.7 + addressedFeedback / previousFeedback.length * 0.3
        );
      }
    }
    return 0.65 + Math.random() * 0.25;
  }

  /**
   * Decide whether one acceptance criterion is met.
   *
   * @param {string} criterion
   * @param {string} proposedChanges
   * @returns {boolean} `true` when at least half of the criterion's words
   *   longer than three characters appear in the changes (vacuously `true`
   *   when it has no such words).
   */
  checkCriterion(criterion, proposedChanges) {
    const lowerChanges = proposedChanges.toLowerCase();
    const keywords = criterion.toLowerCase().split(" ").filter((word) => word.length > 3);
    const matchedKeywords = keywords.filter(
      (keyword) => lowerChanges.includes(keyword)
    );
    return matchedKeywords.length >= keywords.length * 0.5;
  }

  /**
   * Combine per-criterion scores using `CRITERIA_WEIGHTS`.
   *
   * @param {object} criteria - Scores keyed like `CRITERIA_WEIGHTS`.
   * @returns {number} Weighted sum.
   */
  calculateOverallScore(criteria) {
    let score = 0;
    for (const [key, weight] of Object.entries(this.CRITERIA_WEIGHTS)) {
      score += criteria[key] * weight;
    }
    return score;
  }

  /**
   * Turn criterion scores into a verifier result.
   *
   * Severity is "error" below 0.5, "warning" below the veto threshold and
   * "info" otherwise. A manual-review fix descriptor is attached only when
   * the judgement did not pass.
   *
   * @param {object} criteria
   * @param {object} context
   * @returns {*} Whatever `createResult` produces.
   */
  createJudgementResult(criteria, context) {
    const overallScore = this.calculateOverallScore(criteria);
    const passed = overallScore >= this.VETO_THRESHOLD;
    const feedback = this.generateJudgementFeedback(
      criteria,
      overallScore,
      context
    );
    const severity =
      overallScore < 0.5
        ? "error"
        : overallScore < this.VETO_THRESHOLD
          ? "warning"
          : "info";
    return this.createResult(
      passed,
      feedback,
      severity,
      {
        expected: "Changes that fully address the original requirements",
        actual: `Score: ${(overallScore * 100).toFixed(1)}%`,
        suggestion: this.generateSuggestions(criteria)
      },
      passed
        ? void 0
        : {
            command: "Review and adjust approach based on feedback",
            description: "Manual review required",
            safe: false,
            confidence: overallScore
          }
    );
  }

  /**
   * Build the human-readable pass/veto message.
   *
   * @param {object} criteria
   * @param {number} overallScore
   * @param {object} _context - Unused.
   * @returns {string}
   */
  generateJudgementFeedback(criteria, overallScore, _context) {
    if (overallScore >= this.VETO_THRESHOLD) {
      return `Semantic validation PASSED (${(overallScore * 100).toFixed(1)}% confidence). Changes adequately address the requirements.`;
    }
    const weakAreas = [];
    if (criteria.adherenceToRequirements < 0.6) {
      weakAreas.push("requirements adherence");
    }
    if (criteria.codeQuality < 0.6) {
      weakAreas.push("code quality");
    }
    if (criteria.safety < 0.7) {
      weakAreas.push("safety concerns");
    }
    if (criteria.completeness < 0.6) {
      weakAreas.push("completeness");
    }
    if (criteria.semanticCorrectness < 0.6) {
      weakAreas.push("semantic correctness");
    }
    // The weighted score can fall below the threshold while every criterion
    // sits above its own cut-off; avoid emitting "Issues found with: .".
    const issues =
      weakAreas.length > 0
        ? weakAreas.join(", ")
        : "overall score below threshold";
    return `Semantic validation VETOED (${(overallScore * 100).toFixed(1)}% confidence). Issues found with: ${issues}. Agent should course-correct based on this feedback.`;
  }

  /**
   * Build a "; "-joined list of suggestions for weak criteria.
   *
   * @param {object} criteria
   * @returns {string} Suggestions, or "Continue with current approach" when
   *   no criterion is below its suggestion cut-off.
   */
  generateSuggestions(criteria) {
    const suggestions = [];
    if (criteria.adherenceToRequirements < 0.7) {
      suggestions.push(
        "Review original requirements and ensure all are addressed"
      );
    }
    if (criteria.codeQuality < 0.7) {
      suggestions.push(
        "Improve code structure, add error handling and comments"
      );
    }
    if (criteria.safety < 0.8) {
      suggestions.push(
        "Review code for security vulnerabilities and unsafe patterns"
      );
    }
    if (criteria.completeness < 0.7) {
      suggestions.push("Ensure solution is complete and handles edge cases");
    }
    if (criteria.semanticCorrectness < 0.7) {
      suggestions.push("Verify logic correctness and alignment with intent");
    }
    return suggestions.length > 0
      ? suggestions.join("; ")
      : "Continue with current approach";
  }

  /**
   * Get veto statistics (for monitoring)
   *
   * @returns {number} The hard-coded value 0.25; nothing is measured here.
   */
  getVetoRate() {
    return 0.25;
  }

  /**
   * Check if agent can course-correct after veto
   * Spotify: "When vetoed, agents can course correct half the time"
   *
   * @param {number} previousAttempts
   * @returns {boolean} `false` once `previousAttempts` reaches 2; otherwise
   *   the outcome of a `Math.random() > 0.5` draw.
   */
  canCourseCorrect(previousAttempts) {
    return previousAttempts < 2 && Math.random() > 0.5;
  }
}

export { LLMJudge };