UNPKG

@blundergoat/goat-flow

Version:

AI coding agent harness and local dashboard for Claude Code, OpenAI Codex, Google Antigravity, and GitHub Copilot - setup audits, guardrails, structured skills, deny hooks, and persistent learning loops.

github.com/blundergoat/goat-flow

blundergoat/goat-flow

156 lines • 11.1 kB

JavaScript

import { markdownTableCell } from "./compose-quality-common.js";

/**
 * Render the per-subtype Examples dimension criteria for the Semantic
 * Dimensions section. Playbook/index subtypes weight Examples higher; meta
 * subtype permits an explicit `n/a` with justification.
 *
 * @param subtype - artifact subtype; "workflow", "playbook", "index" and
 *   "meta" each add one extra criterion, any other value adds none
 * @returns the criteria as an array of nested-bullet prompt lines
 */
function renderExamplesDimensionCriteria(subtype) {
  const lines = [
    " - Are examples concrete (real values, real expected output) or hypothetical placeholders?",
    " - Do examples show edge cases, not just the happy path?",
    " - Are BAD/GOOD pairs, labelled counter-examples, or expected-output annotations present where the artifact enforces a convention?",
  ];
  if (subtype === "workflow") {
    lines.push(" - Workflow subtype: is at least one full phase walked through with real artifacts (file path + action + expected outcome)?");
  } else if (subtype === "playbook" || subtype === "index") {
    lines.push(` - **${subtype} subtype: weight Examples HIGHER.** Playbooks and indexes live or die by examples; treat thin example coverage as a deduction even if other dimensions are strong.`);
  } else if (subtype === "meta") {
    lines.push(" - Meta subtype: Examples may legitimately be `n/a`. If you mark it n/a, justify why (the artifact is a contract loaded by skills, not a procedure that needs runnable examples) and exclude it from `semanticMax`.");
  }
  return lines;
}

/**
 * Append the opening "what to read" instruction. When the report lists no
 * composed files the reviewer is pointed at the artifact path alone;
 * otherwise every composed file is listed and one of them is called out as
 * an example of content that counts toward the assessment.
 *
 * @param lines - prompt line buffer, mutated in place
 * @param report - skill-quality report; reads `artifact` and `composedFrom`
 * @param kindLower - lower-cased artifact kind label used in the sentences
 */
function appendComposedFromInstruction(lines, report, kindLower) {
  const { artifact, composedFrom } = report;
  if (composedFrom.length === 0) {
    lines.push(`Assess the ${kindLower} artifact at \`${artifact.path}\`. The engine composed no additional files (single-file scoring path); read this file in full before scoring.`);
    return;
  }
  // Pick the file named in the example clause: prefer a `references/` entry,
  // then any entry that is neither the artifact name nor SKILL.md, and fall
  // back to the last composed file.
  const exampleReferenced =
    composedFrom.find((s) => s.startsWith("references/")) ??
    composedFrom.find((s) => s !== artifact.name && !/^SKILL\.md$/i.test(s)) ??
    composedFrom[composedFrom.length - 1];
  const exampleClause = exampleReferenced
    ? ` Structural signals from referenced files count toward your assessment: content documented in \`${exampleReferenced}\` is part of the ${kindLower}, not bonus material.`
    : "";
  lines.push(`Assess the ${kindLower} **${artifact.name}**. Read every file in **Composed from** below - the engine composes ${composedFrom.length} file${composedFrom.length === 1 ? "" : "s"} into the runtime surface (\`${composedFrom.join("`, `")}\`).${exampleClause}`);
  lines.push("");
  lines.push('If a `composition truncated` fit note appears below, the composed bundle is incomplete. Read the actual files listed in **Composed from** directly when context allows; otherwise note the skipped files in the final "What was not verified" section.');
}

/**
 * Append the "Deterministic Baseline" section: artifact facts, the metric
 * table, and the optional fit notes.
 *
 * @param lines - prompt line buffer, mutated in place
 * @param report - skill-quality report supplying scores, metrics and notes
 * @param kindLabel - display label for the artifact kind
 */
function appendDeterministicBaseline(lines, report, kindLabel) {
  const { artifact, totalScore, maxTotalScore, subtype, recommendation, metrics, composedFrom, fitNotes } = report;
  // Guard the division so a zero maximum reports 0% instead of NaN.
  const pct = maxTotalScore > 0 ? Math.round((totalScore / maxTotalScore) * 100) : 0;
  lines.push("## Deterministic Baseline");
  lines.push("");
  lines.push(`- **Artifact:** ${artifact.name} (${kindLabel})`);
  lines.push(`- **Path:** \`${artifact.path}\``);
  lines.push(`- **Subtype:** ${subtype}`);
  if (report.shapeMismatch) {
    lines.push(`- **Detected shape:** ${report.detectedShape} (${Math.round(report.shapeConfidence * 100)}% confidence)`);
  }
  lines.push(`- **Score:** ${totalScore}/${maxTotalScore} (${pct}%)`);
  lines.push(`- **Recommendation:** ${recommendation}`);
  if (composedFrom.length > 0) {
    lines.push(`- **Composed from:** ${composedFrom.join(", ")}`);
  }
  lines.push("");
  lines.push("### Metric Breakdown");
  lines.push("");
  lines.push("| Metric | Score | Severity | Detail |");
  lines.push("|--------|-------|----------|--------|");
  for (const m of metrics) {
    lines.push(`| ${markdownTableCell(m.label)} | ${m.score}/${m.maxScore} | ${m.severity} | ${markdownTableCell(m.detail)} |`);
  }
  lines.push("");
  if (fitNotes.length > 0) {
    lines.push("### Fit Notes");
    lines.push("");
    for (const note of fitNotes) {
      lines.push(`- ${note}`);
    }
    lines.push("");
  }
}

/**
 * Append the "Anti-Bias Guidance" and "Semantic Dimensions" sections,
 * including the subtype-specific Examples criteria.
 *
 * @param lines - prompt line buffer, mutated in place
 * @param subtype - artifact subtype forwarded to the Examples criteria
 */
function appendSemanticDimensions(lines, subtype) {
  lines.push("## Anti-Bias Guidance");
  lines.push("");
  lines.push('Score against absolute criteria, not relative to other artifacts you have reviewed. Do not let one strong dimension halo over a weak one - score each dimension independently. Recent sections of the file should not weigh more than earlier ones; read everything before scoring. If you are tempted to round up because the artifact "seems okay," round down - leniency is the most common failure mode of this rubric.');
  lines.push("");
  lines.push("## Semantic Dimensions");
  lines.push("");
  lines.push('Score each applicable dimension on a 1-5 scale. For every dimension provide: (a) the score, (b) one sentence of evidence, (c) the lowest-quality span supporting any deduction, cited as `path/file (search: "semantic-anchor")`. If a dimension is legitimately `n/a` for this subtype, mark it `n/a`, justify why, and exclude it from `semanticMax`.');
  lines.push("");
  lines.push("- **Clarity (1-5)** - instruction precision, imperative voice,");
  lines.push(" ambiguity, technical-term grounding. Can a fresh reader execute the workflow without guessing?");
  lines.push("- **Examples (1-5)** -");
  lines.push(...renderExamplesDimensionCriteria(subtype));
  lines.push('- **Focus (1-5)** - single purpose vs kitchen-sink. Apply the "describe in one sentence" test: if the artifact\'s purpose needs three clauses joined by AND, focus is weak.');
  lines.push("- **Coherence (1-5)** - does the bundle (SKILL.md + references) tell one coherent story, or do parts contradict, drift, or duplicate?");
  lines.push("");
  lines.push("Compute `semanticTotal`, `semanticMax` (default `/20` across the four dimensions; subtract any `n/a` dimension's max), and `semanticPct = semanticTotal / semanticMax`.");
  lines.push("");
}

/**
 * Append the "Your Assessment" section with its six follow-up questions.
 *
 * @param lines - prompt line buffer, mutated in place
 * @param recommendation - deterministic recommendation quoted in question 1
 * @param kindLower - lower-cased artifact kind label
 */
function appendAssessmentQuestions(lines, recommendation, kindLower) {
  lines.push("## Your Assessment");
  lines.push("");
  lines.push("After the structured semantic scoring above, deepen the assessment with these questions:");
  lines.push("");
  lines.push(`1. **Classification challenge:** Is \`${recommendation}\` the right recommendation? If the artifact is currently a ${kindLower}, should it be reclassified? Provide evidence.`);
  lines.push("2. **Metric verification:** For each metric scored below max, is the deduction justified? Are there structural signals the static scorer missed?");
  lines.push("3. **Quality gaps:** What quality issues exist that the deterministic metrics do not capture (e.g., misleading instructions, outdated references, unclear boundaries)?");
  lines.push("4. **Top 3 improvements:** Actionable changes with file path and semantic anchor evidence. **Each improvement must cite the dimension it addresses; if a deduction has no improvement listed, that is itself a finding.**");
  lines.push("5. **What was not verified:** State any aspects you could not assess from a file read.");
  lines.push('6. **Scope check:** Write a one-sentence summary of what this artifact does. If you cannot, that is a Clarity finding (record in semantic dimensions). If your sentence describes 3+ distinct concerns ("formats code AND evaluates it AND documents it"), recommend splitting and propose the boundaries. Note: this answers "is the scope right within the assigned subtype?" - distinct from the structural `consider-reclassifying` recommendation, which answers "is the subtype right?"');
  lines.push("");
}

/**
 * Append the "Final Gate" section describing the ship / revise / block rules.
 *
 * @param lines - prompt line buffer, mutated in place
 */
function appendFinalGate(lines) {
  lines.push("## Final Gate");
  lines.push("");
  lines.push("Return one of `ship` / `revise` / `block` based on the deterministic recommendation, the semantic percentage, and per-dimension floors:");
  lines.push("");
  lines.push("- **ship** - deterministic recommendation is `keep-skill` OR `reference-playbook`, AND `semanticPct >= 0.8`, AND no applicable dimension scores below 3.");
  lines.push("- **revise** - deterministic recommendation is `keep-skill` / `reference-playbook` AND (`semanticPct < 0.8` OR any applicable dimension < 3); OR deterministic recommendation is `consider-revision` AND `semanticPct >= 0.5` AND no applicable dimension equals 1.");
  lines.push("- **block** - deterministic recommendation is `needs-human-review` / `retire` / `consider-reclassifying`; OR `semanticPct < 0.5`; OR any applicable dimension equals 1.");
  lines.push("");
  lines.push("The deterministic recommendation is the structural floor. You may override it (e.g. gate `block` when the engine said `keep-skill`) only with explicit reasoning that cites specific evidence the engine missed.");
  lines.push("");
}

/**
 * Append the "Required JSON Verdict" section with the example verdict block.
 *
 * @param lines - prompt line buffer, mutated in place
 */
function appendJsonVerdictTemplate(lines) {
  lines.push("## Required JSON Verdict");
  lines.push("");
  lines.push("End your response with a fenced ```json``` block matching this schema. The block must be the final content; the dashboard parses it best-effort.");
  lines.push("");
  lines.push("```json");
  lines.push("{");
  lines.push(' "semanticScores": {');
  lines.push(' "clarity": 4,');
  lines.push(' "examples": 3,');
  lines.push(' "focus": 5,');
  lines.push(' "coherence": 4,');
  lines.push(' "total": 16,');
  lines.push(' "max": 20');
  lines.push(" },");
  lines.push(' "gateDecision": "revise",');
  lines.push(' "gateRationale": "Examples score 3/5; only one realistic scenario shown.",');
  lines.push(' "blockers": [],');
  lines.push(' "improvements": [');
  lines.push(" {");
  lines.push(' "dimension": "examples",');
  lines.push(' "priority": "high",');
  lines.push(' "action": "Add 2 concrete scenarios showing expected output for the ambiguous-input case (cite file + semantic anchor)."');
  lines.push(" }");
  lines.push(" ]");
  lines.push("}");
  lines.push("```");
}

/**
 * Compose a focused quality prompt for a single skill or reference artifact.
 * The prompt requires four scored semantic dimensions, an
 * anti-bias preamble, an explicit per-file `composedFrom` reading
 * instruction, a focus/scope probe, a final gate decision, and a fenced JSON
 * block summarising the verdict.
 *
 * @param report - skill-quality report to turn into a focused reviewer prompt
 * @returns prompt text for reviewing the selected skill or reference artifact
 */
export function composeArtifactQualityPrompt(report) {
  const { artifact, subtype, recommendation } = report;
  // Any kind other than "skill" is labelled as a shared reference.
  const kindLabel = artifact.kind === "skill" ? "Skill" : "Shared Reference";
  const kindLower = kindLabel.toLowerCase();
  const lines = [];
  lines.push(`# ${kindLabel} Quality Review: ${artifact.name}`);
  lines.push("");
  lines.push("REPORTING-ONLY ASSESSMENT MODE. Do not edit tracked files. This prompt is the full assessment contract. You may read files and run read-only commands.");
  lines.push("");
  appendComposedFromInstruction(lines, report, kindLower);
  lines.push("");
  appendDeterministicBaseline(lines, report, kindLabel);
  appendSemanticDimensions(lines, subtype);
  appendAssessmentQuestions(lines, recommendation, kindLower);
  appendFinalGate(lines);
  appendJsonVerdictTemplate(lines);
  return lines.join("\n");
}
//# sourceMappingURL=compose-quality-artifact.js.map