/**
 * gepa-spo: Genetic-Pareto prompt optimizer that evolves system prompts from a few
 * rollouts, with modular prompt support and intelligent crossover.
 */
export interface Module {
/** Unique identifier for the module */
id: string;
/** System prompt for this module */
prompt: string;
}
export interface Candidate {
/** Current system prompt under evaluation (backward compatible) */
system?: string;
/** Modular alternative: a list of module prompts (used instead of `system`) */
modules?: Module[];
}
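// Illustrative candidate values (assumed usage, not part of the published typings):
// either a single `system` prompt, or a set of named module prompts.
const singleCandidate: Candidate = {
  system: 'You are a concise assistant. Answer in at most three sentences.',
};
const modularCandidate: Candidate = {
  modules: [
    { id: 'planner', prompt: 'Break the task into numbered steps before answering.' },
    { id: 'answerer', prompt: 'Answer using only the steps produced by the planner.' },
  ],
};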
export interface TaskItem {
/** Stable identifier for logging and reproducibility */
id: string;
/** User prompt text */
user: string;
/** Optional metadata for metrics/safety topics, etc. */
meta?: Record<string, unknown> | null;
}
export interface ExecuteResult {
/** Raw assistant output text produced under the candidate's prompt(s) (`system` or `modules`) */
output: string;
/** Optional trace info (tool logs, timing, etc.) */
traces?: Record<string, unknown> | null;
}
/** Function that runs the LLM (actor) with a candidate's prompt(s) against an item */
export type SystemExecute = (args: {
candidate: Candidate;
item: TaskItem;
}) => Promise<ExecuteResult>;
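// Minimal illustrative SystemExecute (assumed wiring, not part of the package): it
// joins module prompts (falling back to `system`) and delegates to a hypothetical
// `callModel(system, user)` helper that returns the assistant's text.
declare function callModel(system: string, user: string): Promise<string>;

const execute: SystemExecute = async ({ candidate, item }) => {
  const system =
    candidate.modules?.map((m) => m.prompt).join('\n\n') ?? candidate.system ?? '';
  const output = await callModel(system, item.user);
  return { output, traces: { systemLength: system.length } };
};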
/** Numeric metric on [0,1]: maps the model output `y` and item metadata `m` to a score (for the Pareto set or ground-truthable tasks) */
export type MetricMu = (y: string, m: unknown) => number;
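// Example metric (illustrative sketch; assumes the item's `meta` carries an
// `expected` answer string): exact-match scoring, already on [0, 1].
const mu: MetricMu = (y, m) => {
  const expected = (m as { expected?: string } | null)?.expected;
  if (!expected) return 0;
  return y.trim() === expected.trim() ? 1 : 0;
};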
/** Judge score + textual feedback derived from the output */
export type FeedbackMuF = (args: {
item: TaskItem;
output: string;
traces?: Record<string, unknown> | null;
}) => Promise<{
score: number;
feedbackText: string;
}> | {
score: number;
feedbackText: string;
};
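// Example feedback function (illustrative sketch): a cheap heuristic judge that
// rewards brevity and explains the score in `feedbackText` for the reflection step.
const muf: FeedbackMuF = ({ item, output }) => {
  const score = Math.max(0, Math.min(1, 1 - output.length / 2000));
  return {
    score,
    feedbackText:
      `Answer to "${item.id}" scored ${score.toFixed(2)} for brevity; ` +
      `trim boilerplate to improve.`,
  };
};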
/** Minimal single-turn LLM for reflection/updating the system prompt */
export interface LLM {
complete(prompt: string): Promise<string>;
}
/** Multi-message chat LLM (used by judge) */
export interface ChatMessage {
role: 'system' | 'user' | 'assistant';
content: string;
}
export interface ChatLLM {
chat(messages: ChatMessage[], opts?: {
temperature?: number;
maxTokens?: number;
}): Promise<string>;
}
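// Illustrative adapters (assumed provider shape, not part of the package): both wrap
// a hypothetical `chatCompletion` call against whatever model endpoint you use.
declare function chatCompletion(
  messages: ChatMessage[],
  opts?: { temperature?: number; maxTokens?: number }
): Promise<string>;

const reflectionLlm: LLM = {
  complete: (prompt) => chatCompletion([{ role: 'user', content: prompt }]),
};

const judgeLlm: ChatLLM = {
  chat: (messages, opts) => chatCompletion(messages, opts),
};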
/** Core configuration */
export interface GepaOptions {
execute: SystemExecute;
mu: MetricMu;
muf: FeedbackMuF;
llm: LLM;
budget: number;
minibatchSize: number;
paretoSize: number;
holdoutSize?: number;
epsilonHoldout?: number;
strategiesPath?: string;
/** Adaptive strategy scheduling options */
strategySchedule?: StrategyScheduleOptions;
/** Which scorer populates the Pareto score matrix S; default 'muf' (judge). */
scoreForPareto?: 'muf' | 'mu';
/** Whether judge calls (muf) should consume budget; default true */
mufCosts?: boolean;
/** Probability of using crossover (merge) instead of mutation; default 0 */
crossoverProbability?: number;
}
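// Putting the pieces together (illustrative values only; `execute`, `mu`, `muf`, and
// `reflectionLlm` come from the sketches above). Sensible numbers depend on your task.
const options: GepaOptions = {
  execute,
  mu,
  muf,
  llm: reflectionLlm,
  budget: 200,
  minibatchSize: 4,
  paretoSize: 12,
  holdoutSize: 8,
  scoreForPareto: 'muf',
  mufCosts: true,
  crossoverProbability: 0.25,
};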
/** Serializable bandit state */
export interface Ucb1State {
t: number;
stats: Array<{
id: string;
n: number;
mean: number;
}>;
}
/** Serializable optimizer state for resume */
export interface GEPAState {
version: 2;
budgetLeft: number;
iter: number;
Psystems: string[];
S: number[][];
DparetoIdx: number[];
DfbIdx: number[];
DholdIdx: number[];
bestIdx: number;
seeded: boolean;
bandit: Ucb1State | null;
/** Current module index for round-robin mutation (0-based) */
moduleIndex?: number;
/** Number of modules in the system (for round-robin scheduling) */
moduleCount?: number;
/** Lineage metadata: tracks which modules changed in each candidate */
lineage?: Array<{
/** Candidate index */
candidateIndex: number;
/** Array of module indices that were last changed in this candidate */
changedModules: number[];
/** Parent candidate index (for tracking ancestry) */
parentIndex?: number;
}>;
}
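// GEPAState is plain JSON, so checkpointing is ordinary (de)serialization; the
// optimizer's own save/resume entry points are not exposed in these typings.
// Illustrative round trip with a version guard:
function serializeState(state: GEPAState): string {
  return JSON.stringify(state);
}

function deserializeState(json: string): GEPAState {
  const parsed = JSON.parse(json) as GEPAState;
  if (parsed.version !== 2) {
    throw new Error(`Unsupported GEPAState version: ${parsed.version}`);
  }
  return parsed;
}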
/**
* Controls how frequently we explore strategies vs exploit the bandit, and how often
* we perform pure GEPA reflection without any strategy hint. The explore probability
* scales up when recent uplifts are small.
*/
export interface StrategyScheduleOptions {
/** Window size for recent uplifts moving average */
windowSize?: number;
/** If avg uplift within window is below this, boost exploration */
slowdownThreshold?: number;
/** Base exploration probability when improving well */
baseExploreProb?: number;
/** Max exploration probability when stagnating */
maxExploreProb?: number;
/** Probability to drop hints entirely (pure reflection) under normal conditions */
baseNoHintProb?: number;
/** Max probability to drop hints entirely when stagnating */
maxNoHintProb?: number;
/** Number of core strategies to consider for exploration when the strategies JSON lacks an explicit "core" list */
defaultCoreTopK?: number;
/** Minimum applicability score to keep a strategy after LLM prefilter */
prefilterThreshold?: number;
/** Max number of strategies to keep after prefilter (0 = keep all) */
prefilterTopK?: number;
/** Cooldown in iterations before re-running prefilter when stagnating */
reprefilterCooldownIters?: number;
}
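// Illustrative schedule (assumed values): explore more aggressively once the moving
// average of recent uplifts falls below the slowdown threshold.
const schedule: StrategyScheduleOptions = {
  windowSize: 5,
  slowdownThreshold: 0.01,
  baseExploreProb: 0.2,
  maxExploreProb: 0.6,
  baseNoHintProb: 0.1,
  maxNoHintProb: 0.4,
  defaultCoreTopK: 8,
  prefilterThreshold: 0.3,
  prefilterTopK: 0, // 0 = keep all
  reprefilterCooldownIters: 3,
};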