UNPKG

@blundergoat/goat-flow

Version:

AI coding agent harness and local dashboard for Claude Code, OpenAI Codex, Google Antigravity, and GitHub Copilot - setup audits, guardrails, structured skills, deny hooks, and persistent learning loops.

434 lines 18.7 kB
/**
 * Scoring rubric for skill/reference artifacts.
 *
 * Defines one scorer per dimension - trigger clarity, workflow completeness, gate quality,
 * evidence/testability, cold-start executability, token cost, tool dependencies, write risk, and
 * skill-vs-reference fit - and exports them, in execution order, as `ALL_METRICS`.
 *
 * Every scorer takes a single MetricInput, applies regex/heading heuristics to the artifact text,
 * and hands its raw score to `finalizeMetric`, which applies subtype-specific capping so that a
 * dimension that does not apply to a subtype reports `n/a` instead of a low score. Some scorers
 * also attach promote/demote/meta signals that feed recommendations without changing the numeric
 * total. The heuristics are deliberately conservative (calibrated against the in-tree
 * `.claude/skills` corpus) to keep false positives low; where noted they are advisory tips, not
 * hard deductions.
 */
import { compilePatternList } from "./quality-config.js";
import { countHeadings, countSubReferences, estimateTokens, hasSection, stripYamlFrontmatter, } from "./skill-quality-content.js";
import { finalizeMetric, } from "./skill-quality-types.js";
/**
 * Rule helper shared by the scorers: yields `worth` when `met` is truthy; otherwise appends `gap`
 * to `gaps` and yields 0. Keeps each rule's weight on the same line as its note text.
 */
function earn(met, worth, gaps, gap) {
    if (met) {
        return worth;
    }
    gaps.push(gap);
    return 0;
}
/** Builds the metric detail string: collected notes joined by "; ", or `fallback` when there are none. */
function describe(gaps, fallback) {
    return gaps.length > 0 ? gaps.join("; ") : fallback;
}
/** Counts the truthy entries of a list of flags. */
function countTrue(flags) {
    return flags.filter(Boolean).length;
}
/**
 * Patterns used to spot workflow summaries in skill descriptions. Sourced from the prime
 * writing-skills corpus (search: `Testing revealed that when a description summarizes`): a
 * description that names *what the skill does internally* (procedural verbs, "X then Y"
 * connectives) instead of *when to trigger* tends to make agents follow the description and skip
 * the skill body. This is a yellow signal only - it surfaces as a tip in the trigger-clarity
 * detail string and never deducts score. The verb list is narrowed to keep a <10% false-positive
 * rate on the in-tree `.claude/skills` corpus.
 */
const WORKFLOW_VERB_RE = /\b(dispatches?|implements?(?:ing|ed)?|executes?(?:ing|ed)?|generates?|runs?|produces?|creates?|builds?|refactors?|writes?)\b/i;
const WORKFLOW_CONNECTIVE_RE = /\b(then|between)\b/i;
/**
 * Checks whether the double-quoted frontmatter `description` reads like a workflow summary.
 *
 * A leading "Use when ..." clause (up to its first `,`, `.`, `;` or `-`) is dropped first, so only
 * the remainder of the description is tested against the verb and connective patterns.
 * Returns false when no quoted description is found.
 */
function descriptionSummarizesWorkflow(content) {
    const found = /^---[\s\S]*?description:\s*"([^"]+)"[\s\S]*?---/m.exec(content);
    const description = found ? found[1] : "";
    if (!description) {
        return false;
    }
    const remainder = description.replace(/^Use when [^,.;-]*[,.;-]?\s*/i, "");
    return [WORKFLOW_VERB_RE, WORKFLOW_CONNECTIVE_RE].some((pattern) => pattern.test(remainder));
}
/**
 * Trigger clarity: does the artifact say when it should be used?
 *
 * Skills earn 5 each for a quoted frontmatter description, a "When to Use" signal, and either a
 * Route Map (dispatcher subtype) or exclusion wording (all other subtypes). A description that
 * summarizes workflow adds a note but costs nothing. References earn 10 for a purpose/version
 * header and 5 for an Availability Check, which meta and index subtypes receive automatically.
 */
// eslint-disable-next-line complexity -- intentional because exhaustive structural signal scoring keeps each trigger-clarity rule beside its note text
const triggerClarity = (input) => {
    const { artifact, rawContent: text, subtype } = input;
    const gaps = [];
    let points = 0;
    if (artifact.kind === "skill") {
        const describedInFrontmatter = /^---[\s\S]*?description:\s*".+"[\s\S]*?---/m.test(text);
        const statesWhenToUse = hasSection(text, /##\s+When to Use/i) || /\bUse when\b/i.test(text);
        const statesExclusions = /NOT this skill/i.test(text) ||
            /If the user names a skill explicitly/i.test(text);
        points += earn(describedInFrontmatter, 5, gaps, "missing frontmatter description");
        points += earn(statesWhenToUse, 5, gaps, 'missing "When to Use" signal');
        if (subtype === "dispatcher") {
            points += earn(hasSection(text, /##\s+Route Map/i), 5, gaps, "dispatcher missing Route Map for trigger disambiguation");
        }
        else {
            points += earn(statesExclusions, 5, gaps, 'missing "NOT this skill" exclusion list');
        }
        // Advisory only: recorded as a note, never as a deduction.
        if (describedInFrontmatter && descriptionSummarizesWorkflow(text)) {
            gaps.push("description summarizes workflow rather than triggering conditions");
        }
    }
    else {
        const statesPurpose = hasSection(text, /##\s+Purpose/i) ||
            hasSection(text, /##\s+When to (load|use)/i) ||
            /^---[\s\S]*?goat-flow-reference-version/m.test(text);
        points += earn(statesPurpose, 10, gaps, "missing purpose or version header");
        // meta and index subtypes get the availability points without the section.
        const availabilityCovered = hasSection(text, /Availability Check/i) ||
            subtype === "meta" ||
            subtype === "index";
        points += earn(availabilityCovered, 5, gaps, "missing Availability Check");
    }
    return finalizeMetric(input, "trigger-clarity", points, describe(gaps, "clear trigger definition"));
};
/**
 * Workflow completeness: does the artifact lay out a full procedure?
 *
 * Skills: dispatchers are scored on their Route Map alone (5); other skills earn 5 each for an
 * intake section (Step 0 or Quick Scan Path), at least four level-2/level-3 headings, and a
 * human-stop pattern from the configured gate vocabulary (report subtype gets that one for free).
 * References: playbooks earn 3 each across five structural checks; other references earn 5 each
 * across three, with index/meta subtypes exempt from some of them.
 */
// eslint-disable-next-line complexity -- intentional because exhaustive structural signal scoring keeps each workflow-completeness rule beside its note text
const workflowCompleteness = (input) => {
    const { artifact, rawContent: text, subtype, config } = input;
    const gaps = [];
    let points = 0;
    if (artifact.kind === "skill") {
        const startsWithStepZero = hasSection(text, /##\s+Step 0/i);
        const headingTotal = countHeadings(text, 2) + countHeadings(text, 3);
        const humanStopPattern = compilePatternList(config.gateVocabulary.humanStop);
        const stopsForHuman = humanStopPattern.test(text);
        const mapsRoutes = hasSection(text, /##\s+Route Map/i);
        const offersQuickScan = hasSection(text, /##\s+Quick Scan Path/i);
        if (subtype === "dispatcher") {
            points += earn(mapsRoutes, 5, gaps, "missing dispatcher Route Map");
        }
        else {
            points += earn(startsWithStepZero || offersQuickScan, 5, gaps, "missing Step 0 intake");
            points += earn(headingTotal >= 4, 5, gaps, `only ${headingTotal} sections (expected 4+)`);
            points += earn(stopsForHuman || subtype === "report", 5, gaps, "no checkpoint or blocking gate stops");
        }
    }
    else {
        const describesWorkflow = hasSection(text, /##\s+.*Workflow/i) ||
            hasSection(text, /##\s+Steps/i) ||
            hasSection(text, /###\s+Step\s+\d/i);
        const coversTroubleshooting = hasSection(text, /Troubleshoot/i) || hasSection(text, /Fallback/i);
        const hasVerification = hasSection(text, /##\s+(Verification Gate|Verification|Acceptance)/i);
        const drawsBoundaries = hasSection(text, /##\s+(Boundary|Scope|When to Load|When to Use)/i) ||
            /\b(In scope|Out of scope|Do not use when|read-only)\b/i.test(text);
        const topLevelTotal = countHeadings(text, 2);
        if (subtype === "playbook") {
            points += earn(describesWorkflow, 3, gaps, "no workflow/steps section");
            points += earn(coversTroubleshooting, 3, gaps, "no troubleshooting/fallback");
            points += earn(hasVerification, 3, gaps, "missing verification gate");
            points += earn(drawsBoundaries, 3, gaps, "missing boundary language");
            points += earn(topLevelTotal >= 4, 3, gaps, `only ${topLevelTotal} top-level sections`);
        }
        else {
            const workflowExempt = subtype === "index" || subtype === "meta";
            points += earn(describesWorkflow || workflowExempt, 5, gaps, "no workflow/steps section");
            points += earn(coversTroubleshooting || subtype === "meta", 5, gaps, "no troubleshooting/fallback");
            points += earn(topLevelTotal >= 3, 5, gaps, `only ${topLevelTotal} top-level sections`);
        }
    }
    return finalizeMetric(input, "workflow-completeness", points, describe(gaps, "complete workflow"));
};
/**
 * Gate quality: tests the composed content against three configured vocabularies -
 * verification gates (5), explicit pass criteria (3), and human stops (2).
 */
const gateQuality = (input) => {
    const { composedContent: text, config } = input;
    // All three patterns are compiled before any of them is tested.
    const checks = [
        [compilePatternList(config.gateVocabulary.verificationGate), 5, "no verification gates or checklists"],
        [compilePatternList(config.gateVocabulary.explicitPass), 3, "no explicit pass/fail criteria"],
        [compilePatternList(config.gateVocabulary.humanStop), 2, "no explicit human stop or checkpoint"],
    ];
    const gaps = [];
    let points = 0;
    for (const [pattern, worth, gap] of checks) {
        points += earn(pattern.test(text), worth, gaps, gap);
    }
    return finalizeMetric(input, "gate-quality", points, describe(gaps, "strong gates"));
};
/**
 * Evidence/testability: looks in the composed content for evidence-quality tags (4), an evidence
 * gate (3), and `search:`-style semantic anchors (3).
 */
const evidenceTestability = (input) => {
    const text = input.composedContent;
    const tagsEvidence = /\b(?:OBSERVED|INFERRED)\b/i.test(text) ||
        /\bevidence[_-]quality\b/i.test(text);
    const gatesOnEvidence = /\bProof Gate\b/i.test(text) ||
        /\bevidence\b.*\brequired\b/i.test(text);
    const anchorsSemantically = /\(search:\s*"[^"]+"\)/i.test(text) ||
        /search:.*`[^`]+`/i.test(text);
    const gaps = [];
    let points = 0;
    points += earn(tagsEvidence, 4, gaps, "no evidence quality tags");
    points += earn(gatesOnEvidence, 3, gaps, "no evidence gate");
    points += earn(anchorsSemantically, 3, gaps, "no semantic anchors");
    return finalizeMetric(input, "evidence-testability", points, describe(gaps, "strong evidence contract"));
};
/**
 * Cold-start executability: can the artifact be picked up without prior context?
 *
 * Skills earn 5 for any read-first/context-setup/startup-section signal and 5 for any
 * prerequisite or operating-context wording. References earn 5 for a purpose statement and 5 for
 * prerequisite or Availability Check wording.
 */
// eslint-disable-next-line complexity -- intentional because exhaustive structural signal scoring keeps each cold-start rule beside its note text
const coldStartExecutability = (input) => {
    const { artifact, rawContent: text } = input;
    const gaps = [];
    let points = 0;
    if (artifact.kind === "skill") {
        const saysReadFirst = /\bRead First\b/i.test(text) || /\bread\b.*\bbefore\b/i.test(text);
        const setsUpContext = /\bcontext\b.*\bsetup\b/i.test(text) ||
            /\bload\b.*\bbefore\b/i.test(text) ||
            /\bread\b.*\b(?:files|docs|references|context)\b/i.test(text);
        const hasStartupHeading = hasSection(text, /##\s+(Step 0|Read First|Prerequisites|Inputs?|Context|Before You Start)/i);
        const namesPrerequisites = /\bprerequisites?\b|\brequires?\b|\bassumptions?\b|\binputs?\b|\bdependencies\b|\bavailable\b|before acting|before proceeding/i.test(text);
        const namesOperatingContext = /\bmodes?\b|\bscope\b|\bconstraints?\b|\ballowed\b|\bapproval\b|\bread-only\b|\bfile-write\b/i.test(text);
        points += earn(saysReadFirst || setsUpContext || hasStartupHeading, 5, gaps, "no Read First or context setup");
        points += earn(namesPrerequisites || namesOperatingContext, 5, gaps, "no prerequisites or operating context");
    }
    else {
        const statesPurpose = hasSection(text, /##\s+Purpose/i) ||
            /^This (reference|playbook|document)/im.test(text);
        const namesPrerequisites = /prerequisite/i.test(text) ||
            /requires?:/i.test(text) ||
            /Availability Check/i.test(text);
        points += earn(statesPurpose, 5, gaps, "no clear purpose statement");
        points += earn(namesPrerequisites, 5, gaps, "no prerequisites or availability check");
    }
    return finalizeMetric(input, "cold-start", points, describe(gaps, "good cold-start"));
};
/**
 * Token cost: starts from a size tier based on the estimated token count of the raw content
 * (10 up to 5000, 7 above 5000, 3 above 10000, 0 above 20000), then subtracts 3 (floored at 0)
 * when more than five sub-references are counted.
 */
const tokenCost = (input) => {
    const tokens = estimateTokens(input.rawContent);
    const subRefs = countSubReferences(input.projectRoot, input.artifact);
    const gaps = [];
    let points = 10;
    if (tokens > 20000) {
        points = 0;
        gaps.push(`~${tokens} tokens - very large`);
    }
    else if (tokens > 10000) {
        points = 3;
        gaps.push(`~${tokens} tokens - large`);
    }
    else if (tokens > 5000) {
        points = 7;
        gaps.push(`~${tokens} tokens`);
    }
    if (subRefs > 5) {
        points = Math.max(0, points - 3);
        gaps.push(`${subRefs} sub-references loaded`);
    }
    else if (subRefs > 0) {
        // Informational only: a handful of sub-references does not cost points.
        gaps.push(`${subRefs} sub-reference(s)`);
    }
    return finalizeMetric(input, "token-cost", points, describe(gaps, `~${tokens} tokens`));
};
/**
 * Tool dependency handling: with YAML frontmatter stripped from the composed content, an artifact
 * that matches none of the configured tool keywords scores 10. One that does starts at 5 and
 * earns 3 for an Availability Check mention and 2 for fallback wording.
 */
const toolDependencyHandling = (input) => {
    const { composedContent, config } = input;
    const text = stripYamlFrontmatter(composedContent);
    const checksAvailability = /\bAvailability Check\b/i.test(text);
    const offersFallback = /\bfallback\b/i.test(text) || /\bif\b.*\bunavailable\b/i.test(text);
    const toolPattern = new RegExp(config.toolKeywordsRegex, "i");
    if (!toolPattern.test(text)) {
        return finalizeMetric(input, "tool-deps", 10, "no external tool dependencies");
    }
    const gaps = [];
    let points = 5;
    points += earn(checksAvailability, 3, gaps, "references tools without availability check");
    points += earn(offersFallback, 2, gaps, "no fallback for tool dependencies");
    return finalizeMetric(input, "tool-deps", points, describe(gaps, "tools handled"));
};
/**
 * Write risk: starts at 10 and deducts. Skills lose 4 without mode wording (a mode keyword plus
 * the word "mode") and 3 without approval/ask-before wording. References lose 2 when they mention
 * writing files and never mention read-only.
 */
const writeRisk = (input) => {
    const { artifact, composedContent: text } = input;
    const gaps = [];
    let points = 10;
    if (artifact.kind === "skill") {
        const namesModes = /\b(?:Read-Only|File-Write|Plan|Implement)\b/i.test(text) &&
            /\bmode\b/i.test(text);
        const escalates = /\bapproval\b/i.test(text) || /\bask\b.*\bbefore\b/i.test(text);
        if (!namesModes) {
            points -= 4;
            gaps.push("no read-only vs write mode system");
        }
        if (!escalates) {
            points -= 3;
            gaps.push("no escalation gate for writes");
        }
    }
    else {
        const mentionsWrites = /\b(?:write|create|modify)\b/i.test(text) ||
            /\bedit\b.*\bfile\b/i.test(text);
        const declaresReadOnly = /\bread-only\b/i.test(text);
        if (mentionsWrites && !declaresReadOnly) {
            points -= 2;
            gaps.push("reference mentions file writes");
        }
    }
    return finalizeMetric(input, "write-risk", points, describe(gaps, "controlled write risk"));
};
/**
 * Skill-vs-reference fit: counts skill-like and reference-like structural signals in the raw
 * content and scores how well they match the artifact's current classification.
 *
 * meta/index subtypes always score 10 and carry `isMetaReference`. Skills score 10/7/3 by
 * skill-signal count (dispatchers with a Route Map and reports with a Quick Scan Path score 10
 * outright) and lose 3 when a workflow-subtype skill shows three or more reference signals.
 * References score 10/7/5 by reference-signal count and lose 3 when they show three or more
 * skill signals. `shouldDemote` / `shouldPromote` are passed to `finalizeMetric` as extra signals.
 */
// eslint-disable-next-line complexity -- intentional because exhaustive structural signal scoring keeps each skill-vs-reference fit rule beside its note text
const skillReferenceFit = (input) => {
    const { artifact, rawContent: text, subtype } = input;
    const hasFrontmatterName = /^---[\s\S]*?name:\s*.+[\s\S]*?---/m.test(text);
    const hasIntake = hasSection(text, /##\s+Step 0/i);
    const hasCheckpoint = /\bCHECKPOINT\b/i.test(text);
    const hasModes = /\b(?:Read-Only|File-Write)\b|\bPlan\b.*\bmode\b|\bImplement\b.*\bmode\b/i.test(text);
    const hasAvailCheck = /\bAvailability Check\b/i.test(text);
    const isToolProtocol = /\btool\b.*\bprotocol\b|\bobservation\b.*\bworkflow\b|\bcapture\b.*\bworkflow\b/i.test(text);
    const hasRefVersion = /goat-flow-reference-version/i.test(text);
    const hasSkillVersion = /goat-flow-skill-version/i.test(text);
    const hasRouteMap = hasSection(text, /##\s+Route Map/i);
    const hasQuickScan = hasSection(text, /##\s+Quick Scan Path/i);
    const skillSignals = countTrue([
        hasFrontmatterName,
        hasIntake,
        hasCheckpoint,
        hasModes,
        hasSkillVersion,
    ]);
    // Absence of a frontmatter name or intake section counts toward the reference side.
    const refSignals = countTrue([
        hasAvailCheck,
        isToolProtocol,
        hasRefVersion,
        !hasFrontmatterName,
        !hasIntake,
    ]);
    if (subtype === "meta" || subtype === "index") {
        const remark = subtype === "index"
            ? "index reference; routes to sibling files"
            : "shared meta-reference; not user-invocable";
        return finalizeMetric(input, "skill-reference-fit", 10, remark, { isMetaReference: true });
    }
    const resultSignals = {};
    const gaps = [];
    let points;
    if (artifact.kind === "skill") {
        const anchoredBySubtype = (subtype === "dispatcher" && hasRouteMap) ||
            (subtype === "report" && hasQuickScan);
        if (anchoredBySubtype || skillSignals >= 3) {
            points = 10;
        }
        else if (skillSignals >= 2) {
            points = 7;
            gaps.push("weak skill identity - missing some structural signals");
        }
        else {
            points = 3;
            resultSignals.shouldDemote = true;
            gaps.push("artifact lacks skill structure - may belong in skill-docs/");
        }
        if (refSignals >= 3 && subtype === "workflow") {
            points = Math.max(0, points - 3);
            resultSignals.shouldDemote = true;
            gaps.push("strong reference signals - consider demoting to reference");
        }
    }
    else {
        if (refSignals >= 3) {
            points = 10;
        }
        else if (refSignals >= 2) {
            points = 7;
            gaps.push("adequate reference identity");
        }
        else {
            points = 5;
            gaps.push("reference lacks typical structural signals");
        }
        if (skillSignals >= 3) {
            points = Math.max(0, points - 3);
            resultSignals.shouldPromote = true;
            gaps.push("strong skill signals - consider promoting to skill");
        }
    }
    return finalizeMetric(input, "skill-reference-fit", points, describe(gaps, "good fit for current classification"), resultSignals);
};
/** Every metric scorer, in the order the scorer runs them. */
export const ALL_METRICS = [
    triggerClarity,
    workflowCompleteness,
    gateQuality,
    evidenceTestability,
    coldStartExecutability,
    tokenCost,
    toolDependencyHandling,
    writeRisk,
    skillReferenceFit,
];
//# sourceMappingURL=skill-quality-metrics.js.map