/*
 * judgeval
 * Version: (not captured in this listing)
 * Judgment SDK for TypeScript/JavaScript
 * 250 lines (249 loc) • 7.97 kB
 * TypeScript
 */
import OpenAI from 'openai';
import Anthropic from '@anthropic-ai/sdk';
import { APIJudgmentScorer, Scorer } from '../scorers/base-scorer.js';
/**
 * Optional notification settings that can be attached to a {@link Rule}
 * through its `notification` field. Every field is optional.
 */
interface NotificationConfig {
/** Whether notifications are turned on. */
enabled?: boolean;
/** Names of the channels to notify through; the accepted values are not declared here. */
communication_methods?: string[];
/** Recipient e-mail addresses. */
email_addresses?: string[];
/** NOTE(review): presumably a timestamp for scheduled delivery — units are not declared here; confirm. */
send_at?: number;
}
/**
 * A single condition of a {@link Rule}. It wraps exactly one scorer.
 */
interface Condition {
/** The scorer this condition is based on. */
metric: Scorer;
}
/**
 * How the conditions of a {@link Rule} are combined.
 * NOTE(review): presumably "all" requires every condition and "any" requires
 * at least one — the evaluation logic is not visible in this file; confirm.
 */
type CombineType = "all" | "any";
/**
 * A named rule made of one or more scorer-based conditions, with an optional
 * notification configuration. Rules are accepted by `Tracer`, `TraceClient`,
 * and are carried in `EvaluationRunPayload.rules`.
 */
interface Rule {
/** Optional identifier for the rule. */
rule_id?: string;
/** Rule name (required). */
name: string;
/** Optional free-text description. */
description?: string;
/** Conditions that make up the rule. */
conditions: Condition[];
/** How `conditions` are combined; see {@link CombineType}. */
combine_type: CombineType;
/** Optional notification settings for this rule. */
notification?: NotificationConfig;
}
/**
 * Client types accepted by {@link wrap}.
 * NOTE(review): because the union includes `any`, the whole type collapses to
 * `any`; the `OpenAI` and `Anthropic` members only document intent and add no
 * type checking.
 */
type ApiClient = OpenAI | Anthropic | any;
/** Label for the kind of a span. Plain `string` alias; no fixed set of values is declared here. */
type SpanType = string;
/**
 * A single raw event recorded in a trace. `TraceClient.addEntry` accepts a
 * `Partial` of this shape.
 */
interface TraceEntry {
/** Kind of event this entry records. */
type: 'enter' | 'exit' | 'input' | 'output' | 'error';
/** NOTE(review): presumably the name of the function/span the entry belongs to — confirm. */
function: string;
/** Identifier of the span this entry belongs to. */
span_id: string;
/** Nesting depth of the span. */
depth: number;
/** Creation time as a number. NOTE(review): units/epoch are not declared here — confirm. */
created_at: number;
/** Optional duration. NOTE(review): units are not declared here — confirm. */
duration?: number;
/** Optional recorded output value (untyped). */
output?: any;
/** Optional recorded inputs, keyed by name. */
inputs?: Record<string, any>;
/** Kind of the span; see {@link SpanType}. */
span_type: SpanType;
/** Identifier of the parent span, when there is one. */
parent_span_id?: string;
/** Identifier of the owning trace, when set. */
trace_id?: string;
/** Optional free-text message. */
message?: string;
}
/**
 * Shape of one evaluation run as carried in `TraceSavePayload.evaluation_runs`.
 * Field names are snake_case. NOTE(review): presumably this mirrors the wire
 * format expected by the backend — confirm against the API.
 */
interface EvaluationRunPayload {
/** Organization the run belongs to. */
organization_id: string;
/** Whether results should be logged. */
log_results: boolean;
/** Project the run belongs to. */
project_name: string;
/** Name of the evaluation run. */
eval_name: string;
/** Examples to evaluate; every field except `trace_id` is optional. */
examples: {
input?: string;
actual_output?: string;
expected_output?: string;
context?: string[];
retrieval_context?: string[];
tools_called?: string[];
expected_tools?: string[];
additional_metadata?: Record<string, any>;
/** Trace the example is associated with (required). */
trace_id: string;
}[];
/** Scorers to run, in their API form. */
scorers: APIJudgmentScorer[];
/** Optional model name. */
model?: string;
/** Optional free-form metadata. */
metadata?: Record<string, any>;
/** API key sent with the run. */
judgment_api_key: string;
/** NOTE(review): presumably allows replacing an existing run with the same name — confirm. */
override?: boolean;
/** Optional rules to apply to the run. */
rules?: Rule[];
/** Optional identifier of the span the run is attached to. */
trace_span_id?: string;
}
/**
 * Full trace record. It is the argument type of `TraceManagerClient.saveTrace`
 * and the `traceData` member of the value `TraceClient.save` resolves to.
 */
interface TraceSavePayload {
/** Identifier of the trace. */
trace_id: string;
/** Name of the trace. */
name: string;
/** Project the trace belongs to. */
project_name: string;
/** Creation time as a string. NOTE(review): format (e.g. ISO 8601) is not declared here — confirm. */
created_at: string;
/** Total duration. NOTE(review): units are not declared here — confirm. */
duration: number;
/** Token usage and cost totals; cost fields are named as USD amounts. */
token_counts: {
prompt_tokens: number;
completion_tokens: number;
total_tokens: number;
prompt_tokens_cost_usd: number;
completion_tokens_cost_usd: number;
total_cost_usd: number;
};
/** Condensed span entries of the trace. */
entries: CondensedSpanEntry[];
/** Evaluation runs embedded in the trace. */
evaluation_runs: EvaluationRunPayload[];
/** Overwrite flag (required here; optional on the `TraceClient` constructor config). */
overwrite: boolean;
/** Identifier of a parent trace, if any; may be `null`. */
parent_trace_id?: string | null;
/** Name of a parent trace, if any; may be `null`. */
parent_name?: string | null;
}
/**
 * One span in condensed form, as stored in `TraceSavePayload.entries`.
 * The type is recursive through `children`.
 */
interface CondensedSpanEntry {
/** Identifier of the span. */
span_id: string;
/** NOTE(review): presumably the name of the function/span — confirm. */
function: string;
/** Nesting depth of the span. */
depth: number;
/** Creation time as a string (note: `TraceEntry.created_at` is a number). */
created_at: string;
/** Identifier of the parent span; may be absent or `null`. */
parent_span_id?: string | null;
/** Kind of the span; see {@link SpanType}. */
span_type: SpanType;
/** Recorded inputs, or `null` when there are none. */
inputs: Record<string, any> | null;
/** Recorded output (untyped); `| null` is redundant with `any` but documents that `null` is expected. */
output: any | null;
/** Duration, or `null` when not available. NOTE(review): units are not declared here — confirm. */
duration: number | null;
/** Identifier of the owning trace, when set. */
trace_id?: string;
/** Optional nested child spans. */
children?: CondensedSpanEntry[];
}
/**
 * Result type of `TraceManagerClient.calculateTokenCosts`: token counts plus
 * cost fields named as USD amounts, for a given model.
 */
interface TokenCostResponse {
/** Model name the costs apply to. */
model: string;
/** Number of prompt tokens. */
prompt_tokens: number;
/** Number of completion tokens. */
completion_tokens: number;
/** Total number of tokens. */
total_tokens: number;
/** Cost of the prompt tokens, in USD. */
prompt_tokens_cost_usd: number;
/** Cost of the completion tokens, in USD. */
completion_tokens_cost_usd: number;
/** Total cost, in USD. */
total_cost_usd: number;
}
/**
 * Client for interacting with Judgment trace API endpoints.
 *
 * Constructed from an API key and an organization id. Most methods resolve to
 * `any`; the response shapes are not declared in this file.
 */
declare class TraceManagerClient {
/** API key supplied to the constructor (private; type elided in the declaration). */
private apiKey;
/** Organization id supplied to the constructor (private; type elided in the declaration). */
private organizationId;
/**
 * @param apiKey API key used by this client
 * @param organizationId Organization the client acts for
 */
constructor(apiKey: string, organizationId: string);
/** Private request helper; its signature is elided in the declaration. */
private _fetch;
/**
 * Fetch a trace by id.
 * @param traceId Identifier of the trace
 * @returns Promise resolving to an untyped value
 */
fetchTrace(traceId: string): Promise<any>;
/**
 * Save a trace.
 * @param traceData Full trace payload to save
 * @returns Promise resolving to an untyped value
 */
saveTrace(traceData: TraceSavePayload): Promise<any>;
/**
 * Delete a single trace.
 * @param traceId Identifier of the trace
 * @returns Promise resolving to an untyped value
 */
deleteTrace(traceId: string): Promise<any>;
/**
 * Delete several traces.
 * @param traceIds Identifiers of the traces
 * @returns Promise resolving to an untyped value
 */
deleteTraces(traceIds: string[]): Promise<any>;
/**
 * Calculate token costs directly using the API endpoint.
 * This is more accurate than client-side calculation as it uses the most up-to-date pricing.
 *
 * @param model The model name (e.g. 'gpt-4', 'claude-3-opus-20240229')
 * @param promptTokens Number of tokens in the prompt/input
 * @param completionTokens Number of tokens in the completion/output
 * @returns Object containing token counts and calculated costs in USD, or
 * `null`. NOTE(review): the conditions that produce `null` are not visible
 * here — presumably a failed request; confirm.
 */
calculateTokenCosts(model: string, promptTokens: number, completionTokens: number): Promise<TokenCostResponse | null>;
}
/**
 * Represents an ongoing trace context.
 *
 * Holds the trace's identity and configuration as read-only fields, and
 * exposes methods to record entries and spans, save or delete the trace, and
 * attach evaluation requests to it.
 */
declare class TraceClient {
/** Identifier of this trace. */
readonly traceId: string;
/** Name of this trace. */
readonly name: string;
/** Project this trace belongs to. */
readonly projectName: string;
/** Overwrite flag for this trace. */
readonly overwrite: boolean;
/** Rules associated with this trace. */
readonly rules: Rule[];
/** Whether monitoring is enabled for this trace. */
readonly enableMonitoring: boolean;
/** Whether evaluations are enabled for this trace. */
readonly enableEvaluations: boolean;
/** Identifier of a parent trace, if any; may be `null`. */
readonly parentTraceId?: string | null;
/** Name of a parent trace, if any; may be `null`. */
readonly parentName?: string | null;
/** Private start-time bookkeeping (type elided in the declaration). */
private startTime;
/** API client used for this trace, or `null`. Publicly accessible and mutable. */
traceManager: TraceManagerClient | null;
private apiKey;
private organizationId;
/** Private backing field; see `getOriginalName()`. */
private originalName;
private _spanDepths;
/** Private store of evaluation runs awaiting inclusion (type elided in the declaration). */
private pendingEvaluationRuns;
/**
 * @param config Trace configuration. `tracer`, `apiKey` and `organizationId`
 * are required; every other field is optional.
 */
constructor(config: {
tracer: Tracer;
traceId?: string;
name?: string;
projectName?: string;
overwrite?: boolean;
rules?: Rule[];
enableMonitoring?: boolean;
enableEvaluations?: boolean;
parentTraceId?: string | null;
parentName?: string | null;
apiKey: string;
organizationId: string;
});
/**
 * Add an entry to the trace.
 * @param entry Any subset of the {@link TraceEntry} fields
 */
addEntry(entry: Partial<TraceEntry>): void;
/** Record input values. NOTE(review): presumably against the current span — confirm. */
recordInput(inputs: any): void;
/** Record an output value. NOTE(review): presumably against the current span — confirm. */
recordOutput(output: any): void;
/** Record an error. NOTE(review): presumably against the current span — confirm. */
recordError(error: any): void;
/**
 * Start a span.
 * @param name Span name
 * @param options Optional settings; `spanType` sets the span's kind
 */
startSpan(name: string, options?: {
spanType?: SpanType;
}): void;
/** End a span. Takes no arguments; NOTE(review): presumably ends the most recently started span — confirm. */
endSpan(): void;
/**
 * Generator-based form of span handling; yields a `TraceClient`.
 * @param name Span name
 * @param options Optional settings; `spanType` sets the span's kind
 */
span(name: string, options?: {
spanType?: SpanType;
}): Generator<TraceClient>;
/**
 * Retrieves the ID of the currently active span in this trace context.
 * Relies on AsyncLocalStorage context established by observe/span.
 * @returns The ID of the current span, or undefined if none is active.
 */
getCurrentSpanId(): string | undefined;
/** Returns the trace duration as a number. NOTE(review): units are not declared here — confirm. */
getDuration(): number;
/** Private helper; signature elided in the declaration. */
private condenseTrace;
/**
 * Save the trace.
 * @param emptySave Optional flag. NOTE(review): meaning is not visible here — confirm.
 * @returns Promise resolving to the trace id and the saved payload, or
 * `null`. NOTE(review): the conditions that produce `null` are not visible
 * here — confirm.
 */
save(emptySave?: boolean): Promise<{
traceId: string;
traceData: TraceSavePayload;
} | null>;
/** Print the trace. NOTE(review): output destination and format are not visible here. */
print(): void;
/** Delete the trace. Resolves to an untyped value. */
delete(): Promise<any>;
/**
 * Asynchronously evaluate an example using the provided scorers,
 * embedding the evaluation request into the trace data.
 * Ported from the Python SDK's async_evaluate method.
 *
 * @param scorers Array of scorers to use for evaluation
 * @param options Evaluation options including input, outputs, and metadata
 * @returns Promise that resolves when the evaluation entry has been added to the trace
 */
asyncEvaluate(scorers: Scorer[], options?: {
input?: string;
actualOutput?: string;
expectedOutput?: string;
context?: string[];
retrievalContext?: string[];
toolsCalled?: string[];
expectedTools?: string[];
additionalMetadata?: Record<string, any>;
model?: string;
logResults?: boolean;
}): Promise<void>;
/** Returns the trace's original name as a string. */
getOriginalName(): string;
}
/**
 * Singleton Tracer class. Manages overall tracing configuration and trace creation.
 *
 * The constructor is private; instances are obtained through
 * `Tracer.getInstance()`.
 */
declare class Tracer {
/** Private static holder for the singleton instance. */
private static instance;
/** API key in use. */
readonly apiKey: string;
/** Organization id in use. */
readonly organizationId: string;
/** Default project name. */
readonly projectName: string;
/** Rules held as defaults by the tracer. */
readonly defaultRules: Rule[];
/** Whether monitoring is enabled. */
readonly enableMonitoring: boolean;
/** Whether evaluations are enabled. */
readonly enableEvaluations: boolean;
private initialized;
/** Private, optional reference to the current trace; see `getCurrentTrace()`. */
private currentTrace?;
private constructor();
/**
 * Obtain the Tracer instance.
 * @param config Optional configuration; every field is optional.
 * NOTE(review): how `config` is treated when an instance already exists is
 * not visible here — confirm.
 */
static getInstance(config?: {
apiKey?: string;
organizationId?: string;
projectName?: string;
rules?: Rule[];
enableMonitoring?: boolean;
enableEvaluations?: boolean;
}): Tracer;
/** Returns the current trace, or `undefined` when there is none. */
getCurrentTrace(): TraceClient | undefined;
/**
 * Start a trace and return its client.
 * @param name Trace name
 * @param config Per-trace settings. NOTE(review): this parameter is required
 * even though all of its fields are optional (unlike `trace()`, whose
 * `options` is optional), so callers must pass at least `{}`.
 */
startTrace(name: string, config: {
projectName?: string;
overwrite?: boolean;
rules?: Rule[];
}): TraceClient;
/**
 * Generator-based form of trace handling; yields a `TraceClient`.
 * @param name Trace name
 * @param options Optional per-trace settings, including `createRootSpan`
 */
trace(name: string, options?: {
projectName?: string;
overwrite?: boolean;
createRootSpan?: boolean;
rules?: Rule[];
}): Generator<TraceClient>;
/**
 * Build a function wrapper: the returned function takes `func` and returns
 * a function with the same parameters whose result is a `Promise`.
 * @param options Optional `name` and `spanType` for the wrapper
 * NOTE(review): the result is typed `Promise<S>`, so wrapping an async
 * function (where `S` is already a `Promise<X>`) is typed as
 * `Promise<Promise<X>>`.
 */
observe(options?: {
name?: string;
spanType?: SpanType;
}): <T extends any[], S>(func: (...args: T) => S) => (...args: T) => Promise<S>;
}
/**
 * Wrap an API client. The return type is the same type `T` that was passed in.
 * NOTE(review): presumably instruments the client's calls for tracing, and
 * may return the same object or a new one — neither is visible here; confirm.
 * @param client Client to wrap; `ApiClient` is effectively `any`, so any value type-checks
 * @returns A value of the same type as `client`
 */
export declare function wrap<T extends ApiClient>(client: T): T;
// Public surface of this module: the three classes plus their supporting types.
// EvaluationRunPayload and TokenCostResponse are declared above but are not
// exported here, even though exported signatures reference them.
export { Tracer, TraceClient, TraceManagerClient, Rule, Condition, NotificationConfig, CombineType, TraceEntry, SpanType, ApiClient, TraceSavePayload, CondensedSpanEntry };