UNPKG

@stackmemoryai/stackmemory

Version:

Lossless, project-scoped memory for AI coding tools. Durable context across sessions with 56 MCP tools, FTS5 search, conductor orchestrator, loop/watch monitoring, snapshot capture, pre-flight overlap checks, Claude/Codex/OpenCode wrappers, Linear sync, a

129 lines (128 loc) 4.75 kB
import { fileURLToPath as __fileURLToPath } from 'url';
import { dirname as __pathDirname } from 'path';

// ESM stand-ins for the CommonJS `__filename` / `__dirname` globals.
const __filename = __fileURLToPath(import.meta.url);
const __dirname = __pathDirname(__filename);

/**
 * Reference resolve rates for other coding agents. Every entry records the
 * agent, the model it ran with, the benchmark name, the resolve rate as a
 * fraction, the date of the figure and the URL given as its source.
 */
const SWE_BENCH_BASELINES = [
  {
    agent: "Claude Code",
    model: "claude-sonnet-4",
    benchmark: "swe-bench-verified",
    resolveRate: 0.704,
    date: "2025-12-01",
    source: "https://www.swebench.com/",
  },
  {
    agent: "Devin",
    model: "mixed",
    benchmark: "swe-bench-verified",
    resolveRate: 0.551,
    date: "2025-10-01",
    source: "https://www.swebench.com/",
  },
  {
    agent: "OpenHands",
    model: "claude-sonnet-4",
    benchmark: "swe-bench-verified",
    resolveRate: 0.535,
    date: "2025-09-01",
    source: "https://www.swebench.com/",
  },
  {
    agent: "Aider",
    model: "claude-sonnet-4",
    benchmark: "swe-bench-verified",
    resolveRate: 0.489,
    date: "2025-10-01",
    source: "https://www.swebench.com/",
  },
];

/**
 * Thresholds that `summarizeRuns` compares its aggregates against.
 * Latencies are in milliseconds, rates are fractions in [0, 1].
 */
const HARNESS_TARGETS = {
  /** Plan generation should complete within 10s. */
  planLatencyP95Ms: 10_000,
  /**
   * Full cycle (plan + implement + critique) within 5 minutes.
   * Codex execution benchmarks show 89-231s for real runs (1-2 iterations);
   * 300s leaves margin for 2-iteration runs.
   */
  totalLatencyP95Ms: 300_000,
  /**
   * Single-iteration (first-pass) latency ceiling: 2.5 minutes.
   * Based on observed single-pass runs of 89-115s with headroom.
   */
  singleIterLatencyP95Ms: 150_000,
  /** First-pass approval rate (no retries needed). */
  firstPassApprovalRate: 0.7,
  /** Edit success rate (exact + fuzzy combined). */
  editSuccessRate: 0.9,
  /** Edit fuzzy fallback rate (lower = better, means exact match works). */
  editFuzzyFallbackRate: 0.15,
  /** Context overhead budget in tokens (the check below is inclusive). */
  contextTokenBudget: 6_000,
};

/** Numeric comparator for an ascending `Array.prototype.sort`. */
function ascending(left, right) {
  return left - right;
}

/** Adds `pick(item)` over `items`, left to right, starting from 0. */
function sumBy(items, pick) {
  return items.reduce((total, item) => total + pick(item), 0);
}

/**
 * Picks the 95th-percentile entry of an ascending, non-empty array using the
 * index `ceil(length * 0.95) - 1`, clamped to the last element.
 */
function p95Of(sortedValues) {
  const lastIndex = sortedValues.length - 1;
  const rank = Math.ceil(sortedValues.length * 0.95) - 1;
  return sortedValues[Math.min(rank, lastIndex)];
}

/**
 * Aggregates a list of harness run records into rates, averages, p95
 * latencies and a pass/fail map against `HARNESS_TARGETS`.
 *
 * Fields read from each run: `approved`, `iterations`, `editAttempts`,
 * `editSuccesses`, `editFuzzyFallbacks`, `planLatencyMs`, `totalLatencyMs`
 * and `contextTokens`.
 *
 * @param {Array<object>} runs - Run records; the array is not modified.
 * @returns {object} Summary object. For an empty `runs` every metric is 0
 *   and `passesTargets` is an empty object.
 */
function summarizeRuns(runs) {
  const runCount = runs.length;

  if (runCount === 0) {
    return {
      totalRuns: 0,
      approvalRate: 0,
      firstPassRate: 0,
      avgIterations: 0,
      avgPlanLatencyMs: 0,
      avgTotalLatencyMs: 0,
      p95PlanLatencyMs: 0,
      p95TotalLatencyMs: 0,
      p95SingleIterLatencyMs: 0,
      editSuccessRate: 0,
      editFuzzyRate: 0,
      avgContextTokens: 0,
      passesTargets: {},
    };
  }

  const mean = (pick) => sumBy(runs, pick) / runCount;

  // Approval counts. "First pass" means approved in at most one iteration.
  const approvedCount = runs.filter((run) => run.approved).length;
  const firstPass = runs.filter((run) => run.approved && run.iterations <= 1);
  const approvalRate = approvedCount / runCount;
  const firstPassRate = firstPass.length / runCount;

  // Edit statistics. With no attempts the success rate counts as perfect,
  // and with no successes the fuzzy rate counts as zero.
  const attempts = sumBy(runs, (run) => run.editAttempts);
  const successes = sumBy(runs, (run) => run.editSuccesses);
  const fuzzyFallbacks = sumBy(runs, (run) => run.editFuzzyFallbacks);

  let editSuccessRate = 1;
  if (attempts > 0) {
    editSuccessRate = successes / attempts;
  }

  let editFuzzyRate = 0;
  if (successes > 0) {
    editFuzzyRate = fuzzyFallbacks / successes;
  }

  const avgContextTokens = mean((run) => run.contextTokens);

  // p95 latencies over all runs.
  const p95Plan = p95Of(runs.map((run) => run.planLatencyMs).sort(ascending));
  const p95Total = p95Of(runs.map((run) => run.totalLatencyMs).sort(ascending));

  // p95 of total latency restricted to first-pass runs; 0 when there are none.
  const firstPassLatencies = firstPass
    .map((run) => run.totalLatencyMs)
    .sort(ascending);
  const hasFirstPass = firstPassLatencies.length > 0;
  const p95SingleIter = hasFirstPass ? p95Of(firstPassLatencies) : 0;

  const targets = HARNESS_TARGETS;

  return {
    totalRuns: runCount,
    approvalRate,
    firstPassRate,
    avgIterations: mean((run) => run.iterations),
    avgPlanLatencyMs: mean((run) => run.planLatencyMs),
    avgTotalLatencyMs: mean((run) => run.totalLatencyMs),
    p95PlanLatencyMs: p95Plan,
    p95TotalLatencyMs: p95Total,
    p95SingleIterLatencyMs: p95SingleIter,
    editSuccessRate,
    editFuzzyRate,
    avgContextTokens,
    passesTargets: {
      planLatency: p95Plan <= targets.planLatencyP95Ms,
      totalLatency: p95Total <= targets.totalLatencyP95Ms,
      // Counts as passing when no first-pass run exists to measure.
      singleIterLatency:
        !hasFirstPass || p95SingleIter <= targets.singleIterLatencyP95Ms,
      firstPassApproval: firstPassRate >= targets.firstPassApprovalRate,
      editSuccess: editSuccessRate >= targets.editSuccessRate,
      editFuzzyRate: editFuzzyRate <= targets.editFuzzyFallbackRate,
      contextBudget: avgContextTokens <= targets.contextTokenBudget,
    },
  };
}

export { HARNESS_TARGETS, SWE_BENCH_BASELINES, summarizeRuns };