@stackmemoryai/stackmemory
Version:
Lossless, project-scoped memory for AI coding tools. Durable context across sessions with 56 MCP tools, FTS5 search, conductor orchestrator, loop/watch monitoring, snapshot capture, pre-flight overlap checks, Claude/Codex/OpenCode wrappers, Linear sync, and more.
129 lines (128 loc) • 4.75 kB
JavaScript
import { fileURLToPath as __fileURLToPath } from 'url';
import { dirname as __pathDirname } from 'path';
// CommonJS-style __filename/__dirname shim for ES modules, derived from
// import.meta.url (these globals do not exist natively in ESM scope).
const __filename = __fileURLToPath(import.meta.url);
const __dirname = __pathDirname(__filename);
/**
 * Published SWE-bench Verified resolve rates for well-known coding agents,
 * used as external reference points when judging harness results.
 * Each entry: { agent, model, benchmark, resolveRate, date, source }.
 */
const SWE_BENCH_BASELINES = [
  ["Claude Code", "claude-sonnet-4", 0.704, "2025-12-01"],
  ["Devin", "mixed", 0.551, "2025-10-01"],
  ["OpenHands", "claude-sonnet-4", 0.535, "2025-09-01"],
  ["Aider", "claude-sonnet-4", 0.489, "2025-10-01"]
].map(([agent, model, resolveRate, date]) => ({
  agent,
  model,
  benchmark: "swe-bench-verified",
  resolveRate,
  date,
  source: "https://www.swebench.com/"
}));
/**
 * Performance and quality targets the harness is measured against.
 * Latencies are p95 milliseconds; rates are fractions in [0, 1].
 */
const HARNESS_TARGETS = {
  // Plan generation must finish within 10 seconds (p95).
  planLatencyP95Ms: 10_000,
  // Full cycle (plan + implement + critique) within 5 minutes (p95).
  // Observed Codex execution runs take 89-231s for 1-2 iterations,
  // so 300s leaves margin for a 2-iteration run.
  totalLatencyP95Ms: 300_000,
  // Single-iteration (first-pass) ceiling: 2.5 minutes (p95), based on
  // observed single-pass runs of 89-115s plus headroom.
  singleIterLatencyP95Ms: 150_000,
  // Share of runs approved with no retries.
  firstPassApprovalRate: 0.7,
  // Share of edits that land (exact and fuzzy matches combined).
  editSuccessRate: 0.9,
  // Share of successful edits that needed the fuzzy fallback
  // (lower is better: exact matching is doing its job).
  editFuzzyFallbackRate: 0.15,
  // Average context overhead must stay under 6000 tokens.
  contextTokenBudget: 6_000
};
/**
 * Aggregate harness run records into summary metrics and pass/fail flags
 * against HARNESS_TARGETS.
 *
 * Fixes over the previous version: the "approved on first pass" predicate
 * was duplicated (two separate filters that could drift apart), and the
 * p95-index arithmetic appeared inline twice; both are now computed once.
 *
 * @param {Array<{approved: boolean, iterations: number, editAttempts: number,
 *   editSuccesses: number, editFuzzyFallbacks: number, planLatencyMs: number,
 *   totalLatencyMs: number, contextTokens: number}>} runs - run records;
 *   may be empty.
 * @returns {object} Summary with rates, averages, p95 latencies, and a
 *   `passesTargets` map of boolean checks ({} when `runs` is empty).
 */
function summarizeRuns(runs) {
  if (runs.length === 0) {
    // No data: report zeros and an empty target map rather than NaN.
    return {
      totalRuns: 0,
      approvalRate: 0,
      firstPassRate: 0,
      avgIterations: 0,
      avgPlanLatencyMs: 0,
      avgTotalLatencyMs: 0,
      p95PlanLatencyMs: 0,
      p95TotalLatencyMs: 0,
      p95SingleIterLatencyMs: 0,
      editSuccessRate: 0,
      editFuzzyRate: 0,
      avgContextTokens: 0,
      passesTargets: {}
    };
  }
  // Sum a numeric field across all runs.
  const sum = (select) => runs.reduce((acc, r) => acc + select(r), 0);
  // Nearest-rank p95 of an ascending-sorted, non-empty array.
  const p95 = (sorted) => sorted[Math.min(Math.ceil(sorted.length * 0.95) - 1, sorted.length - 1)];

  const approvedRuns = runs.filter((r) => r.approved);
  // Approved with at most one iteration; also the population for the
  // single-iteration latency percentile below.
  const firstPassRuns = runs.filter((r) => r.approved && r.iterations <= 1);

  const totalEdits = sum((r) => r.editAttempts);
  const totalEditSuccesses = sum((r) => r.editSuccesses);
  const totalFuzzy = sum((r) => r.editFuzzyFallbacks);

  const planLatencies = runs.map((r) => r.planLatencyMs).sort((a, b) => a - b);
  const totalLatencies = runs.map((r) => r.totalLatencyMs).sort((a, b) => a - b);
  const singleIterLatencies = firstPassRuns.map((r) => r.totalLatencyMs).sort((a, b) => a - b);

  const approvalRate = approvedRuns.length / runs.length;
  const firstPassRate = firstPassRuns.length / runs.length;
  // Vacuously perfect when no edits were attempted.
  const editSuccessRate = totalEdits > 0 ? totalEditSuccesses / totalEdits : 1;
  // Fuzzy rate is measured against successful edits, not attempts.
  const editFuzzyRate = totalEditSuccesses > 0 ? totalFuzzy / totalEditSuccesses : 0;
  const avgContextTokens = sum((r) => r.contextTokens) / runs.length;

  const p95Plan = p95(planLatencies);
  const p95Total = p95(totalLatencies);
  const p95SingleIter = singleIterLatencies.length > 0 ? p95(singleIterLatencies) : 0;

  return {
    totalRuns: runs.length,
    approvalRate,
    firstPassRate,
    avgIterations: sum((r) => r.iterations) / runs.length,
    avgPlanLatencyMs: sum((r) => r.planLatencyMs) / runs.length,
    avgTotalLatencyMs: sum((r) => r.totalLatencyMs) / runs.length,
    p95PlanLatencyMs: p95Plan,
    p95TotalLatencyMs: p95Total,
    p95SingleIterLatencyMs: p95SingleIter,
    editSuccessRate,
    editFuzzyRate,
    avgContextTokens,
    passesTargets: {
      planLatency: p95Plan <= HARNESS_TARGETS.planLatencyP95Ms,
      totalLatency: p95Total <= HARNESS_TARGETS.totalLatencyP95Ms,
      // Vacuously true when no single-iteration runs exist.
      singleIterLatency: singleIterLatencies.length === 0 || p95SingleIter <= HARNESS_TARGETS.singleIterLatencyP95Ms,
      firstPassApproval: firstPassRate >= HARNESS_TARGETS.firstPassApprovalRate,
      editSuccess: editSuccessRate >= HARNESS_TARGETS.editSuccessRate,
      editFuzzyRate: editFuzzyRate <= HARNESS_TARGETS.editFuzzyFallbackRate,
      contextBudget: avgContextTokens <= HARNESS_TARGETS.contextTokenBudget
    }
  };
}
export {
HARNESS_TARGETS,
SWE_BENCH_BASELINES,
summarizeRuns
};