UNPKG

scai

Version:

> **A local-first AI CLI for understanding, querying, and iterating on large codebases.**
> **100% local • No token costs • No cloud • No prompt injection • Private by design**

311 lines (310 loc) 12.6 kB
import fs from "fs";
import path from "node:path";
import chalk from "chalk";
import { RUN_LOG_PATH } from "../constants.js";

// ---------------- Test Queries ----------------

/**
 * Canned queries exercised by the /test and /test-random shell commands.
 * They span the main agent capabilities: commenting, refactoring, tracing,
 * architecture questions, test-coverage audits, and gate/loop diagnostics.
 */
export const testQueries = [
  "Add concise comments to semanticAnalysisModule.ts and finalAnswerModule.ts describing phase boundaries.",
  "Refactor MainAgent runVerify flow to reduce nesting while preserving behavior.",
  "Explain how resolveExecutionModeStep, routingDecisionStep, and canExecuteRoute interact.",
  "Add stronger validation and safer fallback behavior in contextReviewStep.ts.",
  "Summarize CLI architecture from index.ts, commands/factory.ts, and runQueryWithDaemonControl.ts.",
  "Where are SQLite queries defined in db/fileIndex.ts and db/client.ts?",
  "Which agent step files under cli/src/agents are missing test coverage?",
  "What parts of runWorkLoop could cause repeated pending task steps?",
  "How does runSearch seed relatedFiles and how does verify prune missing paths?",
  "Refactor buildContextualPrompt.ts into smaller focused helpers without behavior change.",
  "Is error handling consistent across MainAgent.ts, evidenceVerifierStep.ts, and semanticAnalysisModule.ts?",
  "What important usage examples are missing from README for shell commands and /edit flow?",
  "How does this repo run tests and where are the test entry points configured?",
  "Are there flaky-test signals in __tests__ or test scripts, and where would retries be added?",
  "Is there dead code in cli/src/agents that is never referenced by MainAgent or command flow?",
  "Map the full run lifecycle in MainAgent and point out where routing decisions are made.",
  "For repo-wide analysis questions, which method decides verify wave budget and why?",
  "Compare runSearch, runVerify, and runResearch responsibilities and identify overlap risks.",
  "Trace how selectedFiles and candidateFiles evolve across verify waves for this query.",
  "Show where taskStep status can get stuck pending and propose a bounded completion rule.",
  "Refactor evidenceVerifierStep so verify data is separated from semantic analysis outputs.",
  "Identify all places where action.shouldModify is set and flag analyze-mode violations.",
  "Which modules still read FileAnalysis.evidence or searchScore after verify-state split?",
  "Given a multi-question prompt, how should planning split research vs execution steps?",
  "Find the minimal file set required to answer: how do I run tests and where are flaky tests detected?",
  "Explain why a repo-wide question might still end up analyzing only a few files.",
  "List all gate checks (phase/scope/route/readiness/research) and the exact stop conditions.",
  "Audit finalAnswer analyzed-files output for mismatch against actually executed taskSteps.",
  "Where does semanticAnalysis merge prior state, and what fields should remain semantic-only?",
];

// ---------------- Local Evals ----------------

/**
 * Declarative eval cases run by /test-evals.
 *
 * Each case has:
 * - id:     stable identifier printed in PASS/FAIL/SKIP lines.
 * - query:  the prompt sent through runQuery.
 * - when:   optional applicability guard — if its mustContainInRunLog /
 *           mustNotContainInRunLog conditions are unmet, the case is SKIPPED
 *           rather than failed (see isEvalApplicable).
 * - checks: assertions over the run log (mustContain / mustNotContain /
 *           maxOccurrences) and over the extracted final answer text
 *           (mustContainAnswer / mustNotContainAnswer).
 */
export const localEvalCases = [
  // ---------------- Routing / Verify ----------------
  {
    id: "routing_repo_wide_question",
    query: "summarize this repo architecture and identify weak coupling points",
    checks: {
      mustContain: [
        "OUTPUT | readinessGateStep",
        "OUTPUT | runResearch",
        "OUTPUT | reasonNextTaskStep",
      ],
    },
  },
  {
    id: "verify_pipeline_sqlite_query",
    query: "Where are SQLite queries defined in db/fileIndex.ts and db/client.ts?",
    checks: {
      mustContain: [
        "OUTPUT | evidenceVerifier",
        "OUTPUT | preFileSearchCheckStep",
        "OUTPUT | readinessGateStep",
      ],
    },
  },
  {
    id: "verify_focus_evolution",
    query: "Trace how selectedFiles and candidateFiles evolve across verify waves for this query.",
    checks: {
      mustContain: [
        "OUTPUT | evidenceVerifier",
        "OUTPUT | preFileSearchCheckStep",
        "OUTPUT | selectRelevantSources",
      ],
    },
  },
  // ---------------- Research ----------------
  {
    id: "research_generation_present",
    query: "Is error handling consistent across the codebase and how do I run the test suite?",
    when: {
      mustContainInRunLog: ['"allowResearch": true'],
    },
    checks: {
      mustContain: [
        "OUTPUT | researchPlanGen",
        "research-impact-map",
        "research-symbol-trace",
        "research-risk-check",
      ],
    },
  },
  {
    id: "research_execution_artifacts_present",
    query: "Is error handling consistent across the codebase and how do I run the test suite?",
    when: {
      mustContainInRunLog: ['"allowResearch": true'],
    },
    checks: {
      mustContain: [
        "OUTPUT | runResearch",
        "OUTPUT | runResearchStep",
        "\"collectedData\"",
      ],
    },
  },
  {
    id: "research_synthesis_present",
    query: "summarize this repo architecture and identify weak coupling points",
    when: {
      mustContainInRunLog: ['"allowResearch": true'],
    },
    checks: {
      mustContain: [
        "research-architecture-synthesis",
        "OUTPUT | runResearchStep",
        "\"problemStatement\"",
      ],
    },
  },
  // ---------------- Workloop / Analysis ----------------
  {
    id: "verify_store_used",
    query: "For repo-wide analysis questions, which method decides verify wave budget and why?",
    checks: {
      mustContain: [
        "OUTPUT | evidenceVerifier",
        "OUTPUT | preFileSearchCheckStep",
      ],
    },
  },
  {
    id: "analysis_plan_and_semantic_pass",
    query: "Where does semanticAnalysis merge prior state, and what fields should remain semantic-only?",
    checks: {
      mustContain: [
        "OUTPUT | analysisPlanGen",
        "OUTPUT | semanticAnalysisStep - per-file",
        "OUTPUT | reasonNextStep",
      ],
    },
  },
  {
    id: "loop_safety_for_pending_steps",
    query: "Show where taskStep status can get stuck pending and propose a bounded completion rule.",
    checks: {
      mustContain: [
        "OUTPUT | reasonNextTaskStep",
        "OUTPUT | reasonNextStep",
      ],
      // Guards against an unbounded reasoning loop on stuck pending steps.
      maxOccurrences: {
        "OUTPUT | reasonNextStep": 40,
      },
    },
  },
  // ---------------- Final Answer ----------------
  {
    id: "final_answer_for_test_entrypoints",
    query: "How does this repo run tests and where are the test entry points configured?",
    checks: {
      mustContain: [
        "OUTPUT | finalAnswerModule",
      ],
      mustContainAnswer: [
        "test",
      ],
    },
  },
  {
    id: "final_answer_analyzed_files_reported",
    query: "Audit finalAnswer analyzed-files output for mismatch against actually executed taskSteps.",
    checks: {
      mustContain: [
        "OUTPUT | finalAnswerModule",
        "\"analyzedFiles\"",
      ],
    },
  },
];

/**
 * Returns a uniformly random element of `items`.
 * @param {Array} items - Non-empty array to pick from.
 */
function pickRandom(items) {
  return items[Math.floor(Math.random() * items.length)];
}

/**
 * Prints `prefix` and `query` in bold white for readability on dark terminals.
 * @param {string} prefix - Label such as "[TEST QUERY]".
 * @param {string} query - The query text being run.
 */
function logHighContrastQuery(prefix, query) {
  // Bright white + bold for dark terminal readability.
  console.log(chalk.bold.white(`${prefix} ${query}`));
}

/**
 * Counts non-overlapping occurrences of `needle` in `haystack`.
 * Returns 0 for an empty/falsy needle (splitting on "" would be meaningless).
 * @param {string} haystack
 * @param {string} needle
 * @returns {number}
 */
function countOccurrences(haystack, needle) {
  if (!needle) return 0;
  return haystack.split(needle).length - 1;
}

/**
 * Extracts the body of the LAST finalAnswerModule output block from the run log.
 * Blocks are delimited by lines of 68 "=" characters. If the body parses as
 * JSON, returns its `data` field (stringified when not already a string);
 * otherwise returns the raw body text. Returns "" when no block is found.
 * @param {string} runLog - Full contents of the run log file.
 * @returns {string}
 */
function extractFinalAnswerText(runLog) {
  const blockRegex = /📂 OUTPUT \| finalAnswerModule[\s\S]*?\n={68}\n([\s\S]*?)\n={68}\n/g;
  let lastBody = "";
  let match = null;
  // Iterate all matches so the most recent final answer wins.
  while ((match = blockRegex.exec(runLog)) !== null) {
    lastBody = match[1] ?? "";
  }
  if (!lastBody) return "";
  try {
    const parsed = JSON.parse(lastBody.trim());
    if (typeof parsed?.data === "string") return parsed.data;
    return JSON.stringify(parsed?.data ?? "");
  } catch {
    // Not JSON — treat the raw block body as the answer text.
    return lastBody;
  }
}

/**
 * Evaluates a test's optional `when` guard against the run log.
 * A test only applies when every mustContainInRunLog token is present and no
 * mustNotContainInRunLog token is; otherwise it is skipped, not failed.
 * @param {object} test - An entry from localEvalCases.
 * @param {string} runLog - Full run log contents.
 * @returns {{ok: boolean, reason?: string}}
 */
function isEvalApplicable(test, runLog) {
  const mustContain = test.when?.mustContainInRunLog ?? [];
  for (const token of mustContain) {
    if (!runLog.includes(token)) {
      return { ok: false, reason: `condition unmet: missing "${token}"` };
    }
  }
  const mustNotContain = test.when?.mustNotContainInRunLog ?? [];
  for (const token of mustNotContain) {
    if (runLog.includes(token)) {
      return { ok: false, reason: `condition unmet: found forbidden "${token}"` };
    }
  }
  return { ok: true };
}

/**
 * Runs every localEvalCases entry through `runQuery`, checking the produced
 * run log (and extracted final answer) against each case's checks, then
 * prints a pass/skip/fail summary.
 * @param {(query: string) => Promise<void>} runQuery - Executes one query end-to-end.
 */
async function runLocalEvals(runQuery) {
  const startedAt = Date.now();
  let passCount = 0;
  let skipCount = 0;

  // Ensure the log directory exists once up front rather than per iteration.
  fs.mkdirSync(requireRunLogDir(), { recursive: true });

  for (const test of localEvalCases) {
    // Truncate the run log so each eval only sees its own output.
    fs.writeFileSync(RUN_LOG_PATH, "", { flag: "w" });
    console.log(chalk.cyan(`\n[EVAL] ${test.id}`));
    logHighContrastQuery("[EVAL QUERY]", test.query);
    fs.appendFileSync(RUN_LOG_PATH, `\n[EVAL] ${test.id}\n[EVAL QUERY] ${test.query}\n`, "utf-8");

    // A throwing query should fail this one eval, not abort the whole suite.
    let runError = null;
    try {
      await runQuery(test.query);
    } catch (err) {
      runError = err;
    }

    const runLog = fs.existsSync(RUN_LOG_PATH) ? fs.readFileSync(RUN_LOG_PATH, "utf-8") : "";
    const finalAnswerText = extractFinalAnswerText(runLog);

    const applicability = isEvalApplicable(test, runLog);
    if (!applicability.ok) {
      skipCount++;
      console.log(chalk.blue(`[SKIP] ${test.id} (${applicability.reason})`));
      continue;
    }

    const failures = [];
    if (runError) {
      failures.push(`query execution threw: ${runError?.message ?? String(runError)}`);
    }
    for (const token of test.checks.mustContain ?? []) {
      if (!runLog.includes(token)) {
        failures.push(`missing token: ${token}`);
      }
    }
    for (const token of test.checks.mustNotContain ?? []) {
      if (runLog.includes(token)) {
        failures.push(`unexpected token: ${token}`);
      }
    }
    for (const [token, max] of Object.entries(test.checks.maxOccurrences ?? {})) {
      const count = countOccurrences(runLog, token);
      if (count > max) {
        failures.push(`too many occurrences: "${token}" => ${count} (max ${max})`);
      }
    }
    // Answer checks are case-insensitive on both sides.
    for (const token of test.checks.mustContainAnswer ?? []) {
      if (!finalAnswerText.toLowerCase().includes(token.toLowerCase())) {
        failures.push(`final answer missing token: ${token}`);
      }
    }
    for (const token of test.checks.mustNotContainAnswer ?? []) {
      if (finalAnswerText.toLowerCase().includes(token.toLowerCase())) {
        failures.push(`final answer contains forbidden token: ${token}`);
      }
    }

    if (failures.length === 0) {
      passCount++;
      console.log(chalk.green(`[PASS] ${test.id}`));
    } else {
      console.log(chalk.red(`[FAIL] ${test.id}`));
      for (const failure of failures) {
        console.log(chalk.red(`  - ${failure}`));
      }
    }
  }

  const durationSec = ((Date.now() - startedAt) / 1000).toFixed(1);
  const total = localEvalCases.length;
  const failed = total - passCount - skipCount;
  const color = failed === 0 ? chalk.green : chalk.yellow;
  console.log(color(`\n[EVAL SUMMARY] pass=${passCount}/${total}, skip=${skipCount}, fail=${failed}, duration=${durationSec}s`));
}

/**
 * Returns the directory that should contain RUN_LOG_PATH.
 * Uses path.dirname so both POSIX and Windows separators are handled
 * (the previous lastIndexOf("/") approach was POSIX-only).
 * @returns {string}
 */
function requireRunLogDir() {
  return path.dirname(RUN_LOG_PATH);
}

/**
 * Registers testing-related shell commands.
 * Example: /test, /test-random, /test-evals.
 * @param {Record<string, () => Promise<void>>} customCommands - Mutable command registry.
 * @param {(query: string) => Promise<void>} runQuery - Executes one query end-to-end.
 */
export function registerTestingCommands(customCommands, runQuery) {
  // /test: always runs the first canned query (deterministic smoke test).
  customCommands.test = async () => {
    const query = testQueries[0];
    logHighContrastQuery("[TEST QUERY]", query);
    await runQuery(query);
  };
  // /test-random: runs one randomly selected canned query.
  customCommands["test-random"] = async () => {
    const query = pickRandom(testQueries);
    console.log(`\n🎲 [test-random] Selected query:\n→ ${query}\n`);
    await runQuery(query);
  };
  // /test-evals: runs the full localEvalCases suite.
  customCommands["test-evals"] = async () => {
    await runLocalEvals(runQuery);
  };
}