@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications.

github.com/juspay/neurolink

juspay/neurolink

652 lines (651 loc) • 20.3 kB

TypeScript

/**
 * Evaluation type definitions for NeuroLink
 * Provider performance tracking, evaluation configurations, and provider optimization types
 */
import type { LanguageModelV3CallOptions } from "@ai-sdk/provider";
import type { TokenUsage } from "./analytics.js";
import type { GenerateResult } from "./generate.js";
import type { ToolExecution } from "./tools.js";
import type { JsonObject } from "./common.js";
import type {
  AggregatedScores,
  EvaluationTraceContext,
  PipelineConfig,
  ReportConfig,
  ReportFormat,
  ScoreResult,
  ScorerInput,
} from "./scorer.js";

/**
 * Evaluation provider type as specified in core module refactoring
 */
export type EvaluationProvider =
  | "openai"
  | "anthropic"
  | "vertex"
  | "google-ai"
  | "local";

/**
 * Evaluation modes
 */
export type EvaluationMode = "basic" | "detailed" | "domain-aware" | "disabled";

/**
 * Alert severity levels
 */
export type AlertSeverity = "low" | "medium" | "high" | "none";

/**
 * Response quality evaluation scores - Comprehensive evaluation type
 */
export type EvaluationData = {
  // Core score dimensions.
  relevance: number;
  accuracy: number;
  completeness: number;
  overall: number;

  // Optional additional score dimensions.
  domainAlignment?: number;
  terminologyAccuracy?: number;
  toolEffectiveness?: number;

  // Optional copies of the evaluated response and query text.
  responseContent?: string;
  queryContent?: string;

  // Judgement summary.
  isOffTopic: boolean;
  alertSeverity: AlertSeverity;
  reasoning: string;
  suggestedImprovements?: string;

  // Evaluation run metadata.
  evaluationModel: string;
  evaluationTime: number;
  evaluationDomain?: string;
  evaluationProvider?: string;
  evaluationAttempt?: number;
  evaluationConfig?: {
    mode: string;
    fallbackUsed: boolean;
    costEstimate: number;
  };

  // Optional domain description and domain-specific scores.
  domainConfig?: {
    domainName: string;
    domainDescription: string;
    keyTerms: string[];
    failurePatterns: string[];
    successPatterns: string[];
    evaluationCriteria?: Record<string, unknown>;
  };
  domainEvaluation?: {
    domainRelevance: number;
    terminologyAccuracy: number;
    domainExpertise: number;
    domainSpecificInsights: string[];
  };
};

/**
 * Enhanced evaluation context for comprehensive response assessment
 */
export type EvaluationContext = {
  userQuery: string;
  aiResponse: string;
  context?: Record<string, unknown>;
  primaryDomain?: string;
  assistantRole?: string;
  conversationHistory?: Array<{
    role: "user" | "assistant";
    content: string;
    timestamp?: string;
  }>;
  toolUsage?: Array<{
    toolName: string;
    input: unknown;
    output: unknown;
    executionTime: number;
  }>;
  expectedOutcome?: string;
  evaluationCriteria?: string[];
};

/**
 * Evaluation result type
 * Extends EvaluationData with additional fields.
 *
 * `domainAlignment`, `terminologyAccuracy`, `toolEffectiveness`, `isOffTopic`,
 * `alertSeverity` and `reasoning` come from EvaluationData with the same types,
 * so they are not re-declared here; the resulting type is structurally the same.
 */
export type EnhancedEvaluationResult = EvaluationData & {
  contextUtilization?: {
    conversationUsed: boolean;
    toolsUsed: boolean;
    domainKnowledgeUsed: boolean;
  };
  evaluationContext?: {
    domain: string;
    toolsEvaluated: string[];
    conversationTurns: number;
  };
};

/**
 * Evaluation request type as specified in core module refactoring
 */
export type EvaluationRequest = {
  content: string;
  context?: string;
  domain?: string;
  criteria: EvaluationCriteria;
};

/**
 * Evaluation criteria type as specified in core module refactoring
 */
export type EvaluationCriteria = {
  relevance: boolean;
  accuracy: boolean;
  completeness: boolean;
  domainSpecific?: boolean;
};

/**
 * Represents the analysis of the user's query intent.
 * This provides a basic understanding of what the user is trying to achieve.
 */
export type QueryIntentAnalysis = {
  /** The type of query, e.g., asking a question or giving a command. */
  type: "question" | "command" | "greeting" | "unknown";
  /** The estimated complexity of the query. */
  complexity: "low" | "medium" | "high";
  /** Whether the query likely required the use of tools to be answered correctly. */
  shouldHaveUsedTools: boolean;
};

/**
 * Represents a single turn in an enhanced conversation history,
 * including tool executions and evaluations for richer context.
 */
export type EnhancedConversationTurn = {
  /** The role of the speaker, either 'user' or 'assistant'. */
  role: "user" | "assistant";
  /** The content of the message. */
  content: string;
  /** The timestamp of the message. */
  timestamp: string;
  /** Any tools that were executed as part of this turn. */
  toolExecutions?: ToolExecution[];
  /** The evaluation result for this turn, if applicable. */
  evaluation?: EvaluationResult;
};

/**
 * Contains all the rich context needed for a thorough, RAGAS-style evaluation.
 * This object is constructed by the `ContextBuilder` and used by the `RAGASEvaluator`.
 */
export type EnhancedEvaluationContext = {
  /** The original user query. */
  userQuery: string;
  /** An analysis of the user's query intent. */
  queryAnalysis: QueryIntentAnalysis;
  /** The AI's response that is being evaluated. */
  aiResponse: string;
  /** The AI provider that generated the response. */
  provider: string;
  /** The specific model that generated the response. */
  model: string;
  /** The parameters used for the generation call. */
  generationParams: {
    temperature?: number;
    maxTokens?: number;
    systemPrompt?: string;
  };
  /** A list of tools that were executed. */
  toolExecutions: ToolExecution[];
  /** The history of the conversation leading up to this turn. */
  conversationHistory: EnhancedConversationTurn[];
  /** The response time of the AI in milliseconds. */
  responseTime: number;
  /** The token usage for the generation. */
  tokenUsage: TokenUsage;
  /** The results of any previous evaluation attempts for this response. */
  previousEvaluations?: EvaluationResult[];
  /** The current attempt number for this evaluation (1-based). */
  attemptNumber: number;
};

/**
 * Represents the result of a single evaluation attempt, based on RAGAS principles.
 */
export type EvaluationResult = {
  /** The final, overall score for the response, typically from 1 to 10. */
  finalScore: number;
  /** How well the response addresses the user's query. */
  relevanceScore: number;
  /** The factual accuracy of the information in the response. */
  accuracyScore: number;
  /** How completely the response answers the user's query. */
  completenessScore: number;
  /** Whether the final score meets the passing threshold. */
  isPassing: boolean;
  /** Constructive response from the judge LLM on how to improve the response. */
  reasoning: string;
  /** Specific suggestions for improving the response. */
  suggestedImprovements: string;
  /** The raw, unparsed response from the judge LLM. */
  rawEvaluationResponse: string;
  /** The model used to perform the evaluation. */
  evaluationModel: string;
  /** The time taken for the evaluation in milliseconds. */
  evaluationTime: number;
  /** The attempt number for this evaluation. */
  attemptNumber: number;
};

/**
 * Provides detailed information when a response fails quality assurance checks.
 */
export type QualityErrorDetails = {
  /** The history of all evaluation attempts for this response. */
  evaluationHistory: EvaluationResult[];
  /** The final score of the last attempt. */
  finalScore: number;
  /** The total number of evaluation attempts made. */
  attempts: number;
  /** A summary message of the failure. */
  message: string;
};

/**
 * Configuration for the main `Evaluator` class.
 */
export type EvaluationConfig = {
  /** The minimum score (1-10) for a response to be considered passing. */
  threshold?: number;
  /**
   * The evaluation strategy to use.
   * NOTE(review): the type admits both 'ragas' and 'custom', while the earlier
   * doc text said only 'ragas' is supported - confirm against the evaluator.
   */
  evaluationStrategy?: "ragas" | "custom";
  /** The model to use for the LLM-as-judge evaluation. */
  evaluationModel?: string;
  /** The maximum number of evaluation attempts before failing. */
  maxAttempts?: number;
  /** The provider to use for the evaluation model. */
  provider?: string;
  /** A custom evaluator function to override the default behavior. */
  customEvaluator?: (
    options: LanguageModelV3CallOptions,
    result: GenerateResult,
  ) => Promise<{
    evaluationResult: EvaluationResult;
    evalContext: EnhancedEvaluationContext;
  }>;
  /** The score below which a response is considered off-topic. */
  offTopicThreshold?: number;
  /** The score below which a failing response is considered a high severity alert. */
  highSeverityThreshold?: number;
  /** An optional function to generate custom evaluation prompts. */
  promptGenerator?: GetPromptFunction;
};

/**
 * A function that generates the main body of an evaluation prompt.
 */
export type GetPromptFunction = (context: {
  userQuery: string;
  history: string;
  tools: string;
  retryInfo: string;
  aiResponse: string;
}) => string;

/**
 * Pipeline execution options
 */
export type PipelineExecutionOptions = {
  /** Correlation ID for tracing */
  correlationId?: string;
  /** Custom timeout override */
  timeout?: number;
  /** Skip specific scorers. Mutually exclusive with onlyScorers. */
  skipScorers?: string[];
  /** Only run specific scorers. Mutually exclusive with skipScorers. */
  onlyScorers?: string[];
  /** Additional metadata to attach */
  metadata?: JsonObject;
};

/**
 * Pipeline execution result
 */
export type PipelineResult = AggregatedScores & {
  /** Pipeline configuration used */
  pipelineConfig: PipelineConfig;
  /** Execution options used */
  executionOptions?: PipelineExecutionOptions;
  /** Errors that occurred during execution */
  errors: Array<{
    scorerId: string;
    error: string;
  }>;
  /** Scorers that were skipped */
  skippedScorers: string[];
};

/**
 * Report data structure
 */
export type ReportData = {
  /** Report title */
  title: string;
  /** Timestamp */
  timestamp: number;
  /** Evaluation result */
  result: PipelineResult | AggregatedScores;
  /** Optional custom sections */
  customSections?: Array<{
    title: string;
    content: string | JsonObject;
  }>;
};

/**
 * Function scorer - a simple function-based scorer
 */
export type ScorerFunction = (input: ScorerInput) => Promise<{
  score: number;
  reasoning: string;
  metadata?: JsonObject;
}>;

/**
 * Superset batch progress. `pending` is canonical; `remaining` in the
 * pipeline's batchStrategy was renamed during consolidation (same value).
 */
export type BatchProgress = {
  total: number;
  completed: number;
  failed: number;
  pending: number;
  percentComplete: number;
  succeeded?: number;
  estimatedTimeRemaining?: number;
};

/** Input item for BatchEvaluator. */
export type BatchEvaluationItem = {
  id: string;
  options: LanguageModelV3CallOptions;
  result: GenerateResult;
  threshold?: number;
};

/** Result of a single item in BatchEvaluator. */
export type BatchEvaluationItemResult = {
  id: string;
  success: boolean;
  data?: EvaluationData;
  error?: {
    message: string;
    code?: string;
    retryable?: boolean;
  };
  duration: number;
  retryCount: number;
};

/** Result of a single item in the pipeline batchStrategy. */
export type BatchItemResult = {
  index: number;
  input: ScorerInput;
  result?: PipelineResult;
  error?: string;
  duration: number;
};

/**
 * Superset batch evaluation config. Union of pre-consolidation types
 * (BatchEvaluationConfig in BatchEvaluator, BatchConfig in batchStrategy).
 */
export type BatchEvaluationConfig = EvaluationConfig & {
  concurrency?: number;
  continueOnError?: boolean;
  onProgress?: (progress: BatchProgress) => void;
  maxRetries?: number;
  retryDelay?: number;
  onItemComplete?: (result: BatchEvaluationItemResult) => void;
  batchDelay?: number;
  onResult?: (result: BatchItemResult) => void;
};

/**
 * Superset batch-result. `results` is a union of both item-result flavors;
 * summary field names chosen from BatchEvaluator (`succeeded`, `passingRate`).
 */
export type BatchEvaluationResult = {
  results: BatchEvaluationItemResult[] | BatchItemResult[];
  summary: {
    total: number;
    succeeded: number;
    failed: number;
    averageScore: number;
    averageDuration: number;
    totalDuration: number;
    passingRate: number;
  };
  allSucceeded?: boolean;
};

/** Statistical summary of evaluation scores. */
export type ScoreStatistics = {
  min: number;
  max: number;
  mean: number;
  median: number;
  stdDev: number;
  variance: number;
  p25: number;
  p75: number;
  p90: number;
  p95: number;
};

/** Score distribution across ranges. */
export type ScoreDistribution = {
  /** Items scoring 1-3 (poor) */
  poor: number;
  /** Items scoring 4-5 (below average) */
  belowAverage: number;
  /** Items scoring 6-7 (average) */
  average: number;
  /** Items scoring 8-9 (good) */
  good: number;
  /** Items scoring 10 (excellent) */
  excellent: number;
};

/** Trend analysis results. */
export type TrendAnalysis = {
  direction: "improving" | "declining" | "stable";
  slope: number;
  rSquared: number;
  percentChange: number;
  movingAverage: number;
};

/** Dimension-specific analysis for RAGAS metrics. */
export type DimensionAnalysis = {
  relevance: ScoreStatistics;
  accuracy: ScoreStatistics;
  completeness: ScoreStatistics;
  overall: ScoreStatistics;
  correlations: {
    relevanceAccuracy: number;
    relevanceCompleteness: number;
    accuracyCompleteness: number;
  };
};

/** Quality alerts summary. */
export type AlertSummary = {
  total: number;
  high: number;
  medium: number;
  offTopic: number;
  alertRate: number;
};

/** Comprehensive aggregation result. */
export type AggregationResult = {
  count: number;
  statistics: ScoreStatistics;
  distribution: ScoreDistribution;
  dimensions: DimensionAnalysis;
  sequenceTrend?: TrendAnalysis;
  alerts: AlertSummary;
  passingRate: number;
  avgEvaluationTime: number;
  metadata: {
    aggregatedAt: string;
    threshold: number;
    evaluationModels: string[];
  };
};

/** Configuration preset for common evaluation scenarios. */
export type EvaluatorPreset = {
  name: string;
  description: string;
  config?: EvaluationConfig;
};

/** Configuration for evaluation strategies. */
export type EvaluationStrategyConfig = {
  evaluationModel?: string;
  provider?: string;
  threshold?: number;
  /** Same signature as `EvaluationConfig.promptGenerator`. */
  promptGenerator?: GetPromptFunction;
  options?: Record<string, unknown>;
};

/** Function that performs evaluation and returns results. */
export type EvaluationStrategyFunction = (
  options: LanguageModelV3CallOptions,
  result: GenerateResult,
  config?: EvaluationStrategyConfig,
) => Promise<{
  evaluationResult: EvaluationResult;
  evalContext: EnhancedEvaluationContext;
}>;

/** Metadata for registered evaluation strategies. */
export type EvaluationStrategyMetadata = {
  name: string;
  description: string;
  requiresLLM: boolean;
  defaultModel?: string;
  defaultProvider?: string;
  version: string;
  features: string[];
};

/** Canonical evaluation error code. */
export type EvaluationErrorCode =
  | "EVALUATION_FAILED"
  | "PARSE_ERROR"
  | "STRATEGY_NOT_FOUND"
  | "PROVIDER_ERROR"
  | "CONFIGURATION_ERROR"
  | "CUSTOM_EVALUATOR_ERROR"
  | "BATCH_EVALUATION_ERROR"
  | "AGGREGATION_ERROR"
  | "REGISTRY_ERROR"
  | "MAX_RETRIES_EXCEEDED"
  | "TIMEOUT_ERROR"
  | "RATE_LIMIT_ERROR";

/** Extended evaluation context for error details. */
export type EvaluationErrorContext = {
  userQueryLength?: number;
  aiResponseLength?: number;
  attemptNumber?: number;
  previousScores?: number[];
  strategy?: string;
  evaluationModel?: string;
  provider?: string;
  rawResponseLength?: number;
  additionalContext?: Record<string, unknown>;
};

/** Minimal Langfuse client interface for evaluation hooks. */
export type LangfuseClient = {
  score: (params: {
    name: string;
    value: number;
    traceId?: string;
    observationId?: string;
    comment?: string;
    metadata?: Record<string, unknown>;
  }) => Promise<unknown>;
  trace?: (params: {
    name: string;
    metadata?: Record<string, unknown>;
    tags?: string[];
  }) => {
    id: string;
  };
  shutdown?: () => Promise<void>;
};

/** Langfuse adapter configuration. */
export type LangfuseAdapterConfig = {
  client: LangfuseClient;
  scorePrefix?: string;
  includeMetadata?: boolean;
  tags?: string[];
  sendPipelineScores?: boolean;
  sendScorerScores?: boolean;
};

/** Events emitted by the evaluation pipeline. */
export type EvaluationEvents = {
  "scorer:start": {
    scorerId: string;
    scorerName: string;
    timestamp: number;
    traceContext?: EvaluationTraceContext;
  };
  "scorer:end": {
    scorerId: string;
    scorerName: string;
    result: ScoreResult;
    timestamp: number;
    duration: number;
    traceContext?: EvaluationTraceContext;
  };
  "scorer:error": {
    scorerId: string;
    scorerName: string;
    error: string;
    timestamp: number;
    traceContext?: EvaluationTraceContext;
  };
  "pipeline:start": {
    pipelineName: string;
    scorerCount: number;
    timestamp: number;
    correlationId: string;
    traceContext?: EvaluationTraceContext;
  };
  "pipeline:end": {
    pipelineName: string;
    result: PipelineResult;
    timestamp: number;
    duration: number;
    traceContext?: EvaluationTraceContext;
  };
  "pipeline:error": {
    pipelineName: string;
    error: string;
    timestamp: number;
    traceContext?: EvaluationTraceContext;
  };
};

/**
 * Flat span attribute map used by the evaluation observability layer.
 * Named EvaluationSpanAttributes to disambiguate from the richer telemetry
 * SpanAttributes in span.ts (§Rule 9 domain prefix).
 */
export type EvaluationSpanAttributes = Record<string, string | number | boolean>;

/** Metrics captured per scorer execution. */
export type ScorerMetrics = {
  scorerId: string;
  scorerName: string;
  totalExecutions: number;
  successfulExecutions: number;
  failedExecutions: number;
  passedCount: number;
  failedCount: number;
  totalScore: number;
  minScore: number;
  maxScore: number;
  totalDuration: number;
  averageDuration: number;
  averageScore: number;
  passRate: number;
  lastExecutionTime: number;
};

/** Metrics captured per evaluation pipeline. */
export type PipelineMetrics = {
  pipelineName: string;
  totalExecutions: number;
  passedCount: number;
  failedCount: number;
  totalScore: number;
  minScore: number;
  maxScore: number;
  totalDuration: number;
  averageDuration: number;
  averageScore: number;
  passRate: number;
  lastExecutionTime: number;
  /** Per-scorer metrics; key presumably the scorer id - confirm against the collector. */
  scorerMetrics: Map<string, ScorerMetrics>;
};

/** Aggregated metrics across pipelines and scorers. */
export type AggregatedMetrics = {
  totalEvaluations: number;
  overallPassRate: number;
  averageScore: number;
  averageDuration: number;
  scoreDistribution: {
    excellent: number;
    good: number;
    fair: number;
    poor: number;
    failing: number;
  };
  pipelineMetrics: Map<string, PipelineMetrics>;
  scorerMetrics: Map<string, ScorerMetrics>;
  collectionStartTime: number;
  lastUpdateTime: number;
};

/** Generated evaluation report envelope. */
export type GeneratedReport = {
  format: ReportFormat;
  content: string;
  metadata: {
    generatedAt: number;
    format: ReportFormat;
    config: ReportConfig;
  };
};