@juspay/neurolink
Universal AI Development Platform with working MCP integration, multi-provider support, and professional CLI. Built-in tools operational, 58+ external MCP servers discoverable. Connect to filesystem, GitHub, database operations, and more.
/**
* NeuroLink Evaluation System
* Prompts a secondary provider to score an AI response (relevance, accuracy,
* completeness, overall), parses the reply (JSON first, regex fallback),
* validates scores with Zod, and falls back to a default evaluation on failure.
*/
import { logger } from "../utils/logger.js";
import { AIProviderFactory } from "./factory.js";
import { z } from "zod";
import { ProviderRegistry } from "../factories/providerRegistry.js";
import { modelConfig } from "./modelConfiguration.js";
import { normalizeEvaluationData } from "../utils/evaluationUtils.js";
// Zod schema for validation
const EvaluationSchema = z.object({
relevance: z.number().min(1).max(10),
accuracy: z.number().min(1).max(10),
completeness: z.number().min(1).max(10),
overall: z.number().min(1).max(10),
domainAlignment: z.number().min(1).max(10).optional(),
terminologyAccuracy: z.number().min(1).max(10).optional(),
toolEffectiveness: z.number().min(1).max(10).optional(),
});
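// Illustrative only (not executed here): the schema rejects scores outside the
// 1-10 range, which is why parsed values are rounded and range-checked before
// validation further down.
//
//   EvaluationSchema.safeParse({ relevance: 8, accuracy: 7, completeness: 9, overall: 8 }).success; // true
//   EvaluationSchema.safeParse({ relevance: 0, accuracy: 7, completeness: 9, overall: 8 }).success; // false (0 < min)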
/**
* Get the default (all-ones) evaluation returned when evaluation fails.
* @param reason - why the fallback was used (echoed in the reasoning text)
* @param evaluationTime - elapsed evaluation time in milliseconds
* @param context - evaluation context; only primaryDomain is read here
*/
function getDefaultEvaluation(reason, evaluationTime, context) {
const functionTag = "getDefaultEvaluation";
logger.debug(`[${functionTag}] Creating default evaluation`, {
reason,
evaluationTime,
hasContext: !!context,
});
return {
relevance: 1,
accuracy: 1,
completeness: 1,
overall: 1,
domainAlignment: 1,
terminologyAccuracy: 1,
toolEffectiveness: 1,
isOffTopic: false,
alertSeverity: "low",
reasoning: `Default evaluation used due to: ${reason}`,
contextUtilization: {
conversationUsed: false,
toolsUsed: false,
domainKnowledgeUsed: false,
},
evaluationContext: {
domain: context?.primaryDomain || "general", // optional chaining: context may be missing in failure paths
toolsEvaluated: [],
conversationTurns: 0,
},
evaluationModel: "default",
evaluationTime,
evaluationProvider: "default",
evaluationAttempt: 1,
evaluationConfig: {
mode: "fallback",
fallbackUsed: true,
costEstimate: 0,
},
};
}
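// Illustrative fallback shape (the reason string and timing are hypothetical):
//
//   const fb = getDefaultEvaluation("no-provider", 12, { primaryDomain: "finance" });
//   // fb.overall === 1
//   // fb.reasoning === "Default evaluation used due to: no-provider"
//   // fb.evaluationContext.domain === "finance"
//   // fb.evaluationConfig.fallbackUsed === true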
/**
* Parse unified evaluation result from text response.
* Tries to parse a JSON object first, then falls back to per-field regex
* extraction; scores are validated against EvaluationSchema.
*/
function parseEvaluationResult(response, context) {
const functionTag = "parseEvaluationResult";
try {
logger.debug(`[${functionTag}] Parsing evaluation response`, {
responseLength: response.length,
domain: context.primaryDomain,
hasToolUsage: !!context.toolUsage?.length,
hasConversationHistory: !!context.conversationHistory?.length,
});
// Try JSON parsing first (the regex captures the first flat, non-nested JSON object)
const jsonMatch = response.match(/\{[^}]*\}/s);
if (jsonMatch) {
try {
const parsed = JSON.parse(jsonMatch[0]);
return parsed;
}
catch (jsonError) {
logger.debug(`[${functionTag}] JSON parsing failed, trying regex`, {
error: jsonError instanceof Error ? jsonError.message : String(jsonError),
jsonContent: jsonMatch[0].substring(0, 100), // First 100 chars for debugging
});
}
}
// Fallback to regex parsing
const result = {};
const patterns = {
relevance: /relevance[:\s]*([0-9]+(?:\.[0-9]+)?)/i,
accuracy: /accuracy[:\s]*([0-9]+(?:\.[0-9]+)?)/i,
completeness: /completeness[:\s]*([0-9]+(?:\.[0-9]+)?)/i,
overall: /overall[:\s]*([0-9]+(?:\.[0-9]+)?)/i,
reasoning: /reasoning[:\s]*(.+?)(?=\n\s*\w+:|\n\s*$|$)/is,
};
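// Illustrative fallback parse (hypothetical model reply):
//   "Relevance: 8\nAccuracy: 7.5\nCompleteness: 9\nOverall: 8\nReasoning: Covers the main points."
// yields relevance=8, accuracy=8 (7.5 is rounded), completeness=9, overall=8,
// and reasoning="Covers the main points." via the loop below.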
for (const [key, pattern] of Object.entries(patterns)) {
const match = response.match(pattern);
if (match) {
if (key === "reasoning") {
// Extract reasoning text
result.reasoning = match[1].trim();
}
else {
// Extract numerical scores
const value = parseFloat(match[1]);
if (value >= 1 && value <= 10) {
const roundedValue = Math.round(value);
if (key === "relevance") {
result.relevance = roundedValue;
}
else if (key === "accuracy") {
result.accuracy = roundedValue;
}
else if (key === "completeness") {
result.completeness = roundedValue;
}
else if (key === "overall") {
result.overall = roundedValue;
}
}
}
}
}
// Ensure minimum valid scores and validate with schema
// Use context to enhance evaluation data
const evaluationData = {
relevance: result.relevance || 1,
accuracy: result.accuracy || 1,
completeness: result.completeness || 1,
overall: result.overall || 1,
domainAlignment: result.domainAlignment || (context.primaryDomain ? 5 : undefined), // Default to 5 if domain-specific
terminologyAccuracy: result.terminologyAccuracy || (context.primaryDomain ? 5 : undefined),
toolEffectiveness: result.toolEffectiveness || (context.toolUsage?.length ? 5 : undefined), // Default to 5 if tools were used
};
// Validate against schema
try {
const validated = EvaluationSchema.parse(evaluationData);
// Enhance reasoning with context information
let enhancedReasoning = result.reasoning || "No detailed reasoning provided";
if (context.primaryDomain) {
enhancedReasoning += ` (Domain: ${context.primaryDomain})`;
}
if (context.toolUsage?.length) {
enhancedReasoning += ` (Tools used: ${context.toolUsage.map((t) => t.toolName).join(", ")})`;
}
if (context.conversationHistory?.length) {
enhancedReasoning += ` (Conversation turns: ${context.conversationHistory.length})`;
}
return {
...validated,
reasoning: enhancedReasoning,
};
}
catch (validationError) {
logger.warn(`[${functionTag}] Schema validation failed, using fallback`, {
validationError,
originalData: evaluationData,
});
return {
relevance: Math.max(1, Math.min(10, result.relevance || 1)),
accuracy: Math.max(1, Math.min(10, result.accuracy || 1)),
completeness: Math.max(1, Math.min(10, result.completeness || 1)),
overall: Math.max(1, Math.min(10, result.overall || 1)),
reasoning: result.reasoning || "No detailed reasoning provided",
};
}
}
catch (error) {
logger.error(`[${functionTag}] Failed to parse evaluation result`, {
error,
});
return {
relevance: 1,
accuracy: 1,
completeness: 1,
overall: 1,
reasoning: "Error occurred during evaluation parsing",
};
}
}
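// Usage sketch (values are hypothetical): a reply containing a JSON blob such as
//   '{"relevance": 8, "accuracy": 7, "completeness": 9, "overall": 8, "reasoning": "Good answer"}'
// is returned as-is by the JSON branch (skipping the context-based reasoning
// enhancement of the regex path); its scores are still schema-validated later
// in generateEvaluation.
//
//   const parsed = parseEvaluationResult(rawModelText, { primaryDomain: "healthcare" });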
/**
* Main unified evaluation function: prompts the configured evaluation provider
* to score the response in `context`, parses and validates the result, and
* returns a default evaluation if any step fails.
*/
export async function generateEvaluation(context) {
const functionTag = "generateEvaluation";
const startTime = Date.now();
logger.debug(`[${functionTag}] Starting evaluation`, {
hasUserQuery: !!context.userQuery,
hasAiResponse: !!context.aiResponse,
domain: context.primaryDomain,
});
try {
// Ensure providers are registered
await ProviderRegistry.registerAllProviders();
// Get evaluation provider
const evaluationProvider = process.env.NEUROLINK_EVALUATION_PROVIDER || "google-ai";
// Use configurable model selection instead of hardcoded default
const evaluationModel = process.env.NEUROLINK_EVALUATION_MODEL ||
modelConfig.getModelForTier(evaluationProvider, "fast") ||
"gemini-2.5-flash"; // Ultimate fallback
logger.debug(`[${functionTag}] Using provider: ${evaluationProvider}, model: ${evaluationModel}`);
const provider = await AIProviderFactory.createProvider(evaluationProvider, evaluationModel);
if (!provider) {
logger.debug(`[${functionTag}] No evaluation provider available, returning defaults`);
return getDefaultEvaluation("no-provider", Date.now() - startTime, context);
}
// Create evaluation prompt
const prompt = `
Evaluate this AI response on a scale of 1-10 for each criterion:
User Query: ${context.userQuery}
AI Response: ${context.aiResponse}
Rate on these criteria (1-10 scale):
- Relevance: How well does the response address the user's question?
- Accuracy: How factually correct and precise is the information?
- Completeness: How thoroughly does it cover the topic?
- Overall: General quality assessment
Respond in this exact format:
Relevance: [score]
Accuracy: [score]
Completeness: [score]
Overall: [score]
Reasoning: [Provide a detailed explanation of your evaluation, explaining why you gave these scores. Include specific observations about the response's strengths and any areas for improvement.]
`;
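// A well-formed evaluator reply would look like this (illustrative):
//   Relevance: 9
//   Accuracy: 8
//   Completeness: 7
//   Overall: 8
//   Reasoning: Directly answers the question with accurate details but omits edge cases.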
// Generate evaluation
const result = await provider.generate(prompt);
if (!result) {
logger.debug(`[${functionTag}] No response from provider`);
return getDefaultEvaluation("no-response", Date.now() - startTime, context);
}
// Extract text from result
const response = typeof result === "string" ? result : result?.content || String(result);
// Parse evaluation result
const parsed = parseEvaluationResult(response, context);
// Validate and enhance result using schema
const baseResult = {
relevance: parsed.relevance || 1,
accuracy: parsed.accuracy || 1,
completeness: parsed.completeness || 1,
overall: parsed.overall || 1,
domainAlignment: parsed.domainAlignment,
terminologyAccuracy: parsed.terminologyAccuracy,
toolEffectiveness: parsed.toolEffectiveness,
};
// Validate against schema before finalizing
try {
const validatedScores = EvaluationSchema.parse(baseResult);
const validatedResult = normalizeEvaluationData({
...parsed,
...validatedScores,
evaluationModel: `${evaluationProvider}/${evaluationModel}`,
evaluationTime: Date.now() - startTime,
evaluationProvider,
evaluationAttempt: 1,
evaluationConfig: {
mode: "standard",
fallbackUsed: false,
costEstimate: 0.001, // Rough estimate
},
});
logger.debug(`[${functionTag}] Schema validation passed`, {
validatedScores,
});
return validatedResult;
}
catch (validationError) {
logger.warn(`[${functionTag}] Schema validation failed in main evaluation`, {
validationError,
baseResult,
});
// Fallback with clamped values using normalizer
const validatedResult = normalizeEvaluationData({
...parsed,
relevance: parsed.relevance || 1,
accuracy: parsed.accuracy || 1,
completeness: parsed.completeness || 1,
overall: parsed.overall || 1,
evaluationModel: `${evaluationProvider}/${evaluationModel}`,
evaluationTime: Date.now() - startTime,
evaluationProvider,
evaluationAttempt: 1,
evaluationConfig: {
mode: "standard",
fallbackUsed: true, // this branch is the clamped-value fallback path
costEstimate: 0.001, // Rough estimate
},
});
return validatedResult;
}
}
catch (error) {
logger.error(`[${functionTag}] Evaluation failed`, {
error: error instanceof Error ? error.message : String(error),
});
return getDefaultEvaluation(error instanceof Error ? error.message : "unknown-error", Date.now() - startTime, context);
}
}
// Simplified evaluation entry point (thin wrapper around generateEvaluation)
export async function evaluateResponse(context) {
return generateEvaluation(context);
}
// Export additional utilities
export { getDefaultEvaluation, parseEvaluationResult };
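// Usage sketch (context field names follow this module; the values and the
// import path are hypothetical):
//
//   import { evaluateResponse } from "./evaluation.js";
//
//   const evaluation = await evaluateResponse({
//     userQuery: "What is the capital of France?",
//     aiResponse: "The capital of France is Paris.",
//     primaryDomain: "geography",   // optional
//     toolUsage: [],                // optional
//     conversationHistory: [],      // optional
//   });
//   // evaluation.relevance through evaluation.overall are 1-10 scores;
//   // evaluation.evaluationConfig.fallbackUsed indicates whether defaults were returned.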