@debugg-ai/debugg-ai-mcp
Version:
Zero-Config, Fully AI-Managed End-to-End Testing for all code gen platforms.
83 lines (82 loc) • 3.69 kB
JavaScript
/**
* Detect well-known transient failure signatures in completed workflow
* executions. When `isTransientWorkflowError` returns true, the MCP handler
* auto-retries the workflow (cost: one extra quota unit) — saving the caller
* from the 'pure infrastructure noise' failure mode the original client
* called out in their feedback (Pydantic JSON parse errors, etc.).
*
* Be CONSERVATIVE: only patterns documented as transient. False positives
* waste quota; false negatives leave existing behavior, which is fine — the
* caller still gets a clear error and can decide what to do.
*
* Bead `kbxy`. Patterns are extracted (not inlined) so they're easy to audit
* + extend as new transient signatures get observed in production.
*/
/**
* Patterns that match transient backend failures worth retrying. Each entry
* is a regex tested against `errorMessage` AND `state.error`. Matching ANY
* pattern in EITHER field flags the execution as transient.
*
* To add a new pattern: confirm by sampling production telemetry that the
* signature recovers on retry (a one-shot reproduce-then-retry test is
* sufficient evidence). Document the source in the comment.
*/
const TRANSIENT_PATTERNS = [
// The original client complaint. Backend agent's brain.step occasionally
// returns malformed JSON for the structured output — Pydantic chokes on
// EOF / partial JSON. A fresh agent invocation reliably recovers.
{ pattern: /Invalid JSON.*EOF while parsing/i, reason: 'pydantic-eof' },
{ pattern: /Failed to parse AgentOutput/i, reason: 'agent-output-parse' },
// Backend-side infrastructure flakes (nginx 502 from upstream + timeouts).
// Both observed in production during 2026-04-26 + 2026-04-27 deploys —
// recovery on next request is the rule, not the exception.
{ pattern: /502 Bad Gateway/i, reason: 'nginx-502' },
{ pattern: /upstream connect timeout/i, reason: 'upstream-timeout' },
// Network-layer transient — TCP reset between MCP↔backend or backend↔model.
{ pattern: /ECONNRESET|connection reset by peer/i, reason: 'econnreset' },
];
/**
* @returns true if the execution's error fields contain a known transient
* signature, indicating a retry has a reasonable chance of succeeding.
*/
export function isTransientWorkflowError(execution) {
if (!execution)
return false;
const candidates = [];
if (typeof execution.errorMessage === 'string' && execution.errorMessage) {
candidates.push(execution.errorMessage);
}
if (typeof execution.state?.error === 'string' && execution.state.error) {
candidates.push(execution.state.error);
}
if (candidates.length === 0)
return false;
for (const text of candidates) {
for (const { pattern } of TRANSIENT_PATTERNS) {
if (pattern.test(text))
return true;
}
}
return false;
}
/**
* @returns the reason tag for the matched transient pattern (for telemetry),
* or undefined if no pattern matched. Useful when you want to attach a
* classifier to a `workflow.transient_retry` event.
*/
export function transientReasonTag(execution) {
if (!execution)
return undefined;
const fields = [];
if (typeof execution.errorMessage === 'string' && execution.errorMessage)
fields.push(execution.errorMessage);
if (typeof execution.state?.error === 'string' && execution.state.error)
fields.push(execution.state.error);
for (const text of fields) {
for (const { pattern, reason } of TRANSIENT_PATTERNS) {
if (pattern.test(text))
return reason;
}
}
return undefined;
}