/**
 * gepa-spo: Genetic-Pareto prompt optimizer that evolves system prompts from a few
 * rollouts, with modular prompt support and intelligent crossover.
 */
export interface Module {
/** Unique identifier for the module */
id: string;
/** System prompt for this module */
prompt: string;
}
export interface Candidate {
/** Current system prompt under evaluation (backward compatible) */
system?: string;
/** Modular alternative: a list of module prompts (used instead of `system`) */
modules?: Module[];
}
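// Illustrative candidate values (assumed usage, not part of the published typings):
// either a single `system` prompt, or a set of named module prompts.
const singleCandidate: Candidate = {
  system: 'You are a concise assistant. Answer in at most three sentences.',
};
const modularCandidate: Candidate = {
  modules: [
    { id: 'planner', prompt: 'Break the task into numbered steps before answering.' },
    { id: 'answerer', prompt: 'Answer using only the steps produced by the planner.' },
  ],
};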
export interface TaskItem {
/** Stable identifier for logging and reproducibility */
id: string;
/** User prompt text */
user: string;
/** Optional metadata for metrics/safety topics, etc. */
meta?: Record<string, unknown> | null;
}
export interface ExecuteResult {
/** Raw assistant output text produced under the candidate's prompt(s) (`system` or `modules`) */
output: string;
/** Optional trace info (tool logs, timing, etc.) */
traces?: Record<string, unknown> | null;
}
/** Function that runs the LLM (actor) with a candidate's prompt(s) against an item */
export type SystemExecute = (args: {
candidate: Candidate;
item: TaskItem;
}) => Promise<ExecuteResult>;
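// Minimal illustrative SystemExecute (assumed wiring, not part of the package): it
// joins module prompts (falling back to `system`) and delegates to a hypothetical
// `callModel(system, user)` helper that returns the assistant's text.
declare function callModel(system: string, user: string): Promise<string>;

const execute: SystemExecute = async ({ candidate, item }) => {
  const system =
    candidate.modules?.map((m) => m.prompt).join('\n\n') ?? candidate.system ?? '';
  const output = await callModel(system, item.user);
  return { output, traces: { systemLength: system.length } };
};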
/** Numeric metric on [0,1]: maps the model output `y` and item metadata `m` to a score (for the Pareto set or ground-truthable tasks) */
export type MetricMu = (y: string, m: unknown) => number;
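// Example metric (illustrative sketch; assumes the item's `meta` carries an
// `expected` answer string): exact-match scoring, already on [0, 1].
const mu: MetricMu = (y, m) => {
  const expected = (m as { expected?: string } | null)?.expected;
  if (!expected) return 0;
  return y.trim() === expected.trim() ? 1 : 0;
};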
/** Judge score + textual feedback derived from the output */
export type FeedbackMuF = (args: {
item: TaskItem;
output: string;
traces?: Record<string, unknown> | null;
}) => Promise<{
score: number;
feedbackText: string;
}> | {
score: number;
feedbackText: string;
};
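// Example feedback function (illustrative sketch): a cheap heuristic judge that
// rewards brevity and explains the score in `feedbackText` for the reflection step.
const muf: FeedbackMuF = ({ item, output }) => {
  const score = Math.max(0, Math.min(1, 1 - output.length / 2000));
  return {
    score,
    feedbackText:
      `Answer to "${item.id}" scored ${score.toFixed(2)} for brevity; ` +
      `trim boilerplate to improve.`,
  };
};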
/** Minimal single-turn LLM for reflection/updating the system prompt */
export interface LLM {
complete(prompt: string): Promise<string>;
}
/** Multi-message chat LLM (used by judge) */
export interface ChatMessage {
role: 'system' | 'user' | 'assistant';
content: string;
}
export interface ChatLLM {
chat(messages: ChatMessage[], opts?: {
temperature?: number;
maxTokens?: number;
}): Promise<string>;
}
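// Illustrative adapters (assumed provider shape, not part of the package): both wrap
// a hypothetical `chatCompletion` call against whatever model endpoint you use.
declare function chatCompletion(
  messages: ChatMessage[],
  opts?: { temperature?: number; maxTokens?: number }
): Promise<string>;

const reflectionLlm: LLM = {
  complete: (prompt) => chatCompletion([{ role: 'user', content: prompt }]),
};

const judgeLlm: ChatLLM = {
  chat: (messages, opts) => chatCompletion(messages, opts),
};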
/** Core configuration */
export interface GepaOptions {
execute: SystemExecute;
mu: MetricMu;
muf: FeedbackMuF;
llm: LLM;
budget: number;
minibatchSize: number;
paretoSize: number;
holdoutSize?: number;
epsilonHoldout?: number;
strategiesPath?: string;
/** Adaptive strategy scheduling options */
strategySchedule?: StrategyScheduleOptions;
/** Which scorer populates the Pareto score matrix S; default 'muf' (judge). */
scoreForPareto?: 'muf' | 'mu';
/** Whether judge calls (muf) should consume budget; default true */
mufCosts?: boolean;
/** Probability of using crossover (merge) instead of mutation; default 0 */
crossoverProbability?: number;
}
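// Putting the pieces together (illustrative values only; `execute`, `mu`, `muf`, and
// `reflectionLlm` come from the sketches above). Sensible numbers depend on your task.
const options: GepaOptions = {
  execute,
  mu,
  muf,
  llm: reflectionLlm,
  budget: 200,
  minibatchSize: 4,
  paretoSize: 12,
  holdoutSize: 8,
  scoreForPareto: 'muf',
  mufCosts: true,
  crossoverProbability: 0.25,
};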
/** Serializable bandit state */
export interface Ucb1State {
t: number;
stats: Array<{
id: string;
n: number;
mean: number;
}>;
}
/** Serializable optimizer state for resume */
export interface GEPAState {
version: 2;
budgetLeft: number;
iter: number;
Psystems: string[];
S: number[][];
DparetoIdx: number[];
DfbIdx: number[];
DholdIdx: number[];
bestIdx: number;
seeded: boolean;
bandit: Ucb1State | null;
/** Current module index for round-robin mutation (0-based) */
moduleIndex?: number;
/** Number of modules in the system (for round-robin scheduling) */
moduleCount?: number;
/** Lineage metadata: tracks which modules changed in each candidate */
lineage?: Array<{
/** Candidate index */
candidateIndex: number;
/** Array of module indices that were last changed in this candidate */
changedModules: number[];
/** Parent candidate index (for tracking ancestry) */
parentIndex?: number;
}>;
}
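// GEPAState is plain JSON, so checkpointing is ordinary (de)serialization; the
// optimizer's own save/resume entry points are not exposed in these typings.
// Illustrative round trip with a version guard:
function serializeState(state: GEPAState): string {
  return JSON.stringify(state);
}

function deserializeState(json: string): GEPAState {
  const parsed = JSON.parse(json) as GEPAState;
  if (parsed.version !== 2) {
    throw new Error(`Unsupported GEPAState version: ${parsed.version}`);
  }
  return parsed;
}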
/**
* Controls how frequently we explore strategies vs exploit the bandit, and how often
* we perform pure GEPA reflection without any strategy hint. The explore probability
* scales up when recent uplifts are small.
*/
export interface StrategyScheduleOptions {
/** Window size for recent uplifts moving average */
windowSize?: number;
/** If avg uplift within window is below this, boost exploration */
slowdownThreshold?: number;
/** Base exploration probability when improving well */
baseExploreProb?: number;
/** Max exploration probability when stagnating */
maxExploreProb?: number;
/** Probability to drop hints entirely (pure reflection) under normal conditions */
baseNoHintProb?: number;
/** Max probability to drop hints entirely when stagnating */
maxNoHintProb?: number;
/** Number of core strategies to consider for exploration when the strategies JSON lacks an explicit "core" list */
defaultCoreTopK?: number;
/** Minimum applicability score to keep a strategy after LLM prefilter */
prefilterThreshold?: number;
/** Max number of strategies to keep after prefilter (0 = keep all) */
prefilterTopK?: number;
/** Cooldown in iterations before re-running prefilter when stagnating */
reprefilterCooldownIters?: number;
}
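// Illustrative schedule (assumed values): explore more aggressively once the moving
// average of recent uplifts falls below the slowdown threshold.
const schedule: StrategyScheduleOptions = {
  windowSize: 5,
  slowdownThreshold: 0.01,
  baseExploreProb: 0.2,
  maxExploreProb: 0.6,
  baseNoHintProb: 0.1,
  maxNoHintProb: 0.4,
  defaultCoreTopK: 8,
  prefilterThreshold: 0.3,
  prefilterTopK: 0, // 0 = keep all
  reprefilterCooldownIters: 3,
};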