@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
955 lines • 38.4 kB
JavaScript
#!/usr/bin/env node
/**
* NeuroLink CLI Evaluate Command
*
* Evaluate AI responses using configured scorers and pipelines.
* Supports subcommands: run, score, report, presets, scorers (list-scorers)
*/
import chalk from "chalk";
import ora from "ora";
import fs from "node:fs";
import { EvaluationPipeline, getPreset, getPresetNames, PipelinePresets, } from "../../lib/evaluation/pipeline/index.js";
import { ScorerRegistry } from "../../lib/evaluation/scorers/index.js";
import { ReportGenerator } from "../../lib/evaluation/reporting/reportGenerator.js";
import { logger } from "../../lib/utils/logger.js";
/**
* Format score result for display
*/
function formatScoreResult(result, verbose) {
const passIcon = result.passed ? chalk.green("PASS") : chalk.red("FAIL");
const scoreColor = result.passed ? chalk.green : chalk.red;
let output = ` ${passIcon} ${chalk.cyan(result.scorerName)}: ${scoreColor(result.score.toFixed(2))}`;
if (verbose) {
output += `\n ${chalk.gray(result.reasoning)}`;
output += `\n ${chalk.gray(`(${result.computeTime}ms)`)}`;
}
return output;
}
/**
* Check if a preset name is valid
*/
function isValidPreset(name) {
return name in PipelinePresets;
}
/**
* Create scorer input from command arguments
*/
function createScorerInput(argv) {
// Handle context - can be array of strings or path to file
let contextArray;
if (argv.context) {
if (typeof argv.context === "string") {
// Check if it's a file path
if (fs.existsSync(argv.context)) {
try {
const content = fs.readFileSync(argv.context, "utf-8");
const parsed = JSON.parse(content);
contextArray = Array.isArray(parsed) ? parsed : [content];
}
catch {
contextArray = [argv.context];
}
}
else {
contextArray = [argv.context];
}
}
else {
contextArray = argv.context;
}
}
return {
query: argv.query ?? argv.input ?? "",
response: argv.output ?? argv.input ?? "",
context: contextArray,
groundTruth: argv.groundTruth,
};
}
/**
* List-scorers subcommand - List all available scorers
*/
const listScorersCommand = {
command: "list-scorers",
describe: "List all available scorers",
builder: (yargs) => yargs
.option("category", {
type: "string",
describe: "Filter by category (accuracy, relevancy, safety, quality, faithfulness)",
})
.option("type", {
type: "string",
describe: "Filter by type (llm, rule)",
choices: ["llm", "rule"],
})
.option("detailed", {
type: "boolean",
describe: "Show detailed scorer information",
default: false,
})
.option("json", {
type: "boolean",
describe: "Output as JSON",
default: false,
})
.example("$0 evaluate list-scorers", "List all scorers")
.example("$0 evaluate list-scorers --category safety", "List safety scorers")
.example("$0 evaluate list-scorers --type rule --detailed", "List rule-based scorers with details"),
handler: async (argv) => {
const { category, type, json, detailed } = argv;
await ScorerRegistry.registerBuiltInScorers();
let scorerList = ScorerRegistry.list();
// Apply filters
if (category) {
scorerList = scorerList.filter((s) => s.category === category);
}
if (type) {
scorerList = scorerList.filter((s) => s.type === type);
}
if (json) {
logger.always(JSON.stringify(scorerList, null, 2));
}
else {
logger.always("");
logger.always(chalk.bold("Available Scorers:"));
logger.always(chalk.gray("-".repeat(60)));
// Group by category
const byCategory = new Map();
for (const s of scorerList) {
const cat = s.category;
if (!byCategory.has(cat)) {
byCategory.set(cat, []);
}
const categoryList = byCategory.get(cat);
if (categoryList) {
categoryList.push(s);
}
}
for (const [cat, scorers] of byCategory) {
logger.always("");
logger.always(chalk.bold.underline(cat.toUpperCase()));
for (const metadata of scorers) {
const typeIcon = metadata.type === "llm" ? "AI" : "Rule";
logger.always("");
logger.always(` ${chalk.cyan(metadata.id)} [${typeIcon}]`);
logger.always(` ${chalk.gray(metadata.description)}`);
if (detailed) {
logger.always(` Required: ${metadata.requiredInputs.join(", ") || "none"}`);
if (metadata.optionalInputs.length > 0) {
logger.always(` Optional: ${metadata.optionalInputs.join(", ")}`);
}
}
}
}
logger.always("");
logger.always(chalk.gray(`Total: ${scorerList.length} scorers`));
}
},
};
/**
* Run-pipeline subcommand - Run evaluation using a predefined pipeline preset
*/
const runPipelineCommand = {
command: "run-pipeline",
describe: "Run evaluation using a predefined pipeline preset",
builder: (yargs) => yargs
.option("preset", {
type: "string",
describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
alias: "p",
demandOption: true,
})
.option("input", {
type: "string",
describe: "AI response text to evaluate",
alias: "i",
demandOption: true,
})
.option("query", {
type: "string",
describe: "Original user query",
alias: "q",
})
.option("context", {
type: "string",
describe: "Path to context file (JSON format) or context string",
alias: "c",
})
.option("threshold", {
type: "number",
describe: "Custom pass threshold (0-1)",
alias: "t",
})
.option("format", {
type: "string",
describe: "Output format",
choices: ["text", "json", "table"],
default: "text",
})
.option("json", {
type: "boolean",
describe: "Output results as JSON (shorthand for --format json)",
default: false,
})
.option("verbose", {
type: "boolean",
describe: "Show detailed reasoning and timing",
alias: "v",
default: false,
})
.example('$0 evaluate run-pipeline --preset quality --input "The capital of France is Paris."', "Run quality evaluation")
.example('$0 evaluate run-pipeline --preset rag --input "Response" --query "Question" --context ./context.json', "Run RAG evaluation with context file"),
handler: async (argv) => {
const { preset, input, query, context, threshold, json, verbose, format } = argv;
const outputFormat = json ? "json" : format;
const spinner = outputFormat === "json"
? null
: ora(`Running ${preset} evaluation pipeline...`).start();
try {
if (!isValidPreset(preset)) {
spinner?.fail(`Unknown pipeline preset: ${preset}`);
logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
process.exit(1);
}
const presetConfig = getPreset(preset);
// Apply custom threshold if provided
if (threshold !== undefined) {
presetConfig.passThreshold = threshold;
}
const evaluationPipeline = new EvaluationPipeline(presetConfig);
const scorerInput = createScorerInput({
input: query,
output: input,
context,
});
await evaluationPipeline.initialize();
const result = await evaluationPipeline.execute(scorerInput);
spinner?.stop();
if (outputFormat === "json") {
logger.always(JSON.stringify(result, null, 2));
}
else if (outputFormat === "table") {
logger.always("");
logger.always(chalk.bold(`Pipeline: ${preset}`));
logger.always(chalk.gray("-".repeat(50)));
// Table header
logger.always(`${chalk.bold("Scorer".padEnd(25))} ${chalk.bold("Score".padEnd(10))} ${chalk.bold("Status")}`);
logger.always(chalk.gray("-".repeat(50)));
for (const score of result.scores) {
const status = score.passed ? chalk.green("PASS") : chalk.red("FAIL");
const scoreColor = score.passed ? chalk.green : chalk.red;
logger.always(`${score.scorerName.padEnd(25)} ${scoreColor(score.score.toFixed(2).padEnd(10))} ${status}`);
}
logger.always(chalk.gray("-".repeat(50)));
const overallColor = result.passed ? chalk.green : chalk.red;
logger.always(`${"Overall".padEnd(25)} ${overallColor(result.overallScore.toFixed(2).padEnd(10))} ${result.passed ? chalk.green("PASS") : chalk.red("FAIL")}`);
}
else {
// Text format
logger.always("");
logger.always(chalk.bold(`Pipeline: ${preset} Evaluation Results`));
logger.always(chalk.gray("-".repeat(50)));
const overallColor = result.passed ? chalk.green : chalk.red;
const overallIcon = result.passed ? "PASS" : "FAIL";
logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
logger.always("");
logger.always(chalk.bold("Individual Scores:"));
for (const score of result.scores) {
logger.always(formatScoreResult(score, verbose ?? false));
}
if (result.errors.length > 0) {
logger.always("");
logger.always(chalk.yellow("Errors:"));
for (const error of result.errors) {
logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
}
}
logger.always("");
logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
}
}
catch (error) {
spinner?.fail("Pipeline evaluation failed");
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(chalk.red(`Error: ${errorMessage}`));
process.exit(1);
}
},
};
/**
* Run subcommand - Execute evaluation pipeline (legacy support)
*/
const runCommand = {
command: "run",
describe: "Run evaluation pipeline on a response",
builder: (yargs) => yargs
.option("input", {
type: "string",
describe: "Input query/question that was asked",
alias: "i",
})
.option("output", {
type: "string",
describe: "Output/answer to evaluate",
alias: "o",
})
.option("context", {
type: "array",
string: true,
describe: "Context documents for RAG evaluation (can be used multiple times)",
alias: "c",
})
.option("ground-truth", {
type: "string",
describe: "Expected/correct answer for accuracy evaluation",
alias: "g",
})
.option("pipeline", {
type: "string",
describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
alias: "p",
})
.option("scorer", {
type: "array",
string: true,
describe: "Specific scorers to use (can be used multiple times)",
alias: "s",
})
.option("json", {
type: "boolean",
describe: "Output results as JSON",
default: false,
})
.option("verbose", {
type: "boolean",
describe: "Show detailed reasoning and timing",
alias: "v",
default: false,
})
.example('$0 evaluate run -i "What is the capital of France?" -o "Paris" -p quality', "Evaluate a response using the quality pipeline"),
handler: async (argv) => {
const { input, output, context, groundTruth, pipeline, scorer, json, verbose, } = argv;
if (!input || !output) {
logger.error(chalk.red("Error: Both --input and --output are required"));
logger.always(chalk.gray("Use --help for usage information"));
process.exit(1);
}
const spinner = json ? null : ora("Initializing evaluation...").start();
try {
const scorerInput = createScorerInput({
input,
output,
context,
groundTruth,
});
let evaluationPipeline;
if (pipeline) {
if (!isValidPreset(pipeline)) {
spinner?.fail(`Unknown pipeline preset: ${pipeline}`);
logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
process.exit(1);
}
const presetConfig = getPreset(pipeline);
evaluationPipeline = new EvaluationPipeline(presetConfig);
}
else if (scorer && scorer.length > 0) {
const pipelineConfig = {
name: "CLI Custom Pipeline",
description: "Custom pipeline from CLI scorer arguments",
scorers: scorer.map((s) => ({ id: s })),
executionMode: "parallel",
};
evaluationPipeline = new EvaluationPipeline(pipelineConfig);
}
else {
const defaultPreset = getPreset("quality");
evaluationPipeline = new EvaluationPipeline(defaultPreset);
}
if (spinner) {
spinner.text = "Running evaluation...";
}
await evaluationPipeline.initialize();
const result = await evaluationPipeline.execute(scorerInput);
spinner?.stop();
if (json) {
logger.always(JSON.stringify(result, null, 2));
}
else {
logger.always("");
logger.always(chalk.bold("Evaluation Results"));
logger.always(chalk.gray("-".repeat(50)));
const overallColor = result.passed ? chalk.green : chalk.red;
const overallIcon = result.passed ? "PASS" : "FAIL";
logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
logger.always("");
logger.always(chalk.bold("Individual Scores:"));
for (const score of result.scores) {
logger.always(formatScoreResult(score, verbose ?? false));
}
if (result.errors.length > 0) {
logger.always("");
logger.always(chalk.yellow("Errors:"));
for (const error of result.errors) {
logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
}
}
if (result.skippedScorers.length > 0 && verbose) {
logger.always("");
logger.always(chalk.gray(`Skipped: ${result.skippedScorers.join(", ")}`));
}
logger.always("");
logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
}
}
catch (error) {
spinner?.fail("Evaluation failed");
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(chalk.red(`Error: ${errorMessage}`));
process.exit(1);
}
},
};
/**
* Score subcommand - Score a single response with a specific scorer
*/
const scoreCommand = {
command: "score <scorer>",
describe: "Score a response using a single scorer",
builder: (yargs) => yargs
.positional("scorer", {
type: "string",
describe: "Scorer ID to use (e.g., hallucination, toxicity)",
demandOption: true,
})
.option("input", {
type: "string",
describe: "Input query/question that was asked",
alias: "i",
})
.option("output", {
type: "string",
describe: "Output/answer to evaluate",
alias: "o",
})
.option("context", {
type: "array",
string: true,
describe: "Context documents for evaluation",
alias: "c",
})
.option("ground-truth", {
type: "string",
describe: "Expected answer for comparison",
alias: "g",
})
.option("json", {
type: "boolean",
describe: "Output results as JSON",
default: false,
})
.option("verbose", {
type: "boolean",
describe: "Show detailed output",
alias: "v",
default: false,
})
.example('$0 evaluate score toxicity -o "This is a test response"', "Score a response for toxicity")
.example('$0 evaluate score hallucination -i "What is 2+2?" -o "2+2 equals 4" --json', "Score for hallucinations and output JSON"),
handler: async (argv) => {
const { scorer, input, output, context, groundTruth, json, verbose } = argv;
if (!output) {
logger.error(chalk.red("Error: --output is required"));
logger.always(chalk.gray("Use --help for usage information"));
process.exit(1);
}
const spinnerInstance = json
? null
: ora(`Loading scorer: ${scorer}...`).start();
try {
await ScorerRegistry.registerBuiltInScorers();
const scorerInstance = await ScorerRegistry.getScorer(scorer);
if (!scorerInstance) {
spinnerInstance?.fail(`Scorer not found: ${scorer}`);
const available = ScorerRegistry.list().map((s) => s.id);
logger.always(chalk.gray(`Available scorers: ${available.join(", ")}`));
process.exit(1);
}
if (spinnerInstance) {
spinnerInstance.text = "Running scorer...";
}
const scorerInput = createScorerInput({
input: input ?? "",
output,
context,
groundTruth,
});
const validation = scorerInstance.validateInput(scorerInput);
if (!validation.valid) {
spinnerInstance?.fail("Input validation failed");
for (const err of validation.errors) {
logger.always(chalk.red(` - ${err}`));
}
process.exit(1);
}
const result = await scorerInstance.score(scorerInput);
spinnerInstance?.stop();
if (json) {
logger.always(JSON.stringify(result, null, 2));
}
else {
logger.always("");
logger.always(chalk.bold(`${result.scorerName} Score: ${result.score.toFixed(2)}/10`));
logger.always(result.passed
? chalk.green(" Status: PASSED")
: chalk.red(" Status: FAILED"));
logger.always(` Threshold: ${result.threshold}`);
logger.always(` Time: ${result.computeTime}ms`);
if (verbose || !result.passed) {
logger.always("");
logger.always(chalk.gray("Reasoning:"));
logger.always(chalk.gray(` ${result.reasoning}`));
}
if (result.confidence !== undefined) {
logger.always("");
logger.always(chalk.gray(`Confidence: ${(result.confidence * 100).toFixed(1)}%`));
}
if (verbose && result.metadata) {
logger.always("");
logger.always(chalk.gray("Metadata:"));
logger.always(chalk.gray(JSON.stringify(result.metadata, null, 2)));
}
}
}
catch (error) {
spinnerInstance?.fail("Scoring failed");
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(chalk.red(`Error: ${errorMessage}`));
process.exit(1);
}
},
};
/**
* Report subcommand - Generate evaluation report
*/
const reportCommand = {
command: "report",
describe: "Generate an evaluation report",
builder: (yargs) => yargs
.option("input", {
type: "string",
describe: "Input query/question that was asked",
alias: "i",
})
.option("output", {
type: "string",
describe: "Output/answer to evaluate",
alias: "o",
})
.option("context", {
type: "array",
string: true,
describe: "Context documents for evaluation",
alias: "c",
})
.option("ground-truth", {
type: "string",
describe: "Expected answer for comparison",
alias: "g",
})
.option("pipeline", {
type: "string",
describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
alias: "p",
})
.option("scorer", {
type: "array",
string: true,
describe: "Specific scorers to use",
alias: "s",
})
.option("format", {
type: "string",
describe: "Report format (text, json, markdown, html)",
choices: ["text", "json", "markdown", "html"],
default: "text",
})
.option("output-file", {
type: "string",
describe: "Save report to file",
alias: "f",
})
.option("verbose", {
type: "boolean",
describe: "Include detailed information in report",
alias: "v",
default: true,
})
.example('$0 evaluate report -i "Question" -o "Answer" -p quality --format markdown', "Generate markdown report")
.example('$0 evaluate report -i "Question" -o "Answer" -p rag --format html -f report.html', "Generate HTML report and save to file"),
handler: async (argv) => {
const { input, output, context, groundTruth, pipeline, scorer, format, outputFile, verbose, } = argv;
if (!input || !output) {
logger.error(chalk.red("Error: Both --input and --output are required"));
logger.always(chalk.gray("Use --help for usage information"));
process.exit(1);
}
const spinnerInstance = ora("Running evaluation...").start();
try {
const scorerInput = createScorerInput({
input,
output,
context,
groundTruth,
});
let evaluationPipeline;
if (pipeline && isValidPreset(pipeline)) {
evaluationPipeline = new EvaluationPipeline(getPreset(pipeline));
}
else if (scorer && scorer.length > 0) {
const pipelineConfig = {
name: "CLI Custom Pipeline",
scorers: scorer.map((s) => ({ id: s })),
executionMode: "parallel",
};
evaluationPipeline = new EvaluationPipeline(pipelineConfig);
}
else {
evaluationPipeline = new EvaluationPipeline(getPreset("quality"));
}
await evaluationPipeline.initialize();
const result = await evaluationPipeline.execute(scorerInput);
spinnerInstance.text = "Generating report...";
const reportData = {
title: `Evaluation Report - ${pipeline ?? "Custom Pipeline"}`,
timestamp: Date.now(),
result,
customSections: [
{
title: "Input",
content: { query: input, responseLength: output.length },
},
],
};
const validFormats = ["text", "json", "markdown", "html"];
const reportFormat = validFormats.includes(format)
? format
: "text";
const generator = new ReportGenerator({
format: reportFormat,
includeReasoning: verbose ?? true,
includeMetadata: verbose ?? true,
includeTiming: true,
});
const report = generator.generate(reportData);
spinnerInstance.stop();
if (outputFile) {
const fsPromises = await import("node:fs/promises");
await fsPromises.writeFile(outputFile, report.content, "utf-8");
logger.always(chalk.green(`Report saved to: ${outputFile}`));
}
else {
logger.always(report.content);
}
}
catch (error) {
spinnerInstance.fail("Report generation failed");
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(chalk.red(`Error: ${errorMessage}`));
process.exit(1);
}
},
};
/**
* Presets subcommand - List available pipeline presets
*/
const presetsCommand = {
command: "presets [preset]",
describe: "List available pipeline presets or show details of a specific preset",
builder: (yargs) => yargs
.positional("preset", {
type: "string",
describe: "Specific preset to show details for",
})
.option("json", {
type: "boolean",
describe: "Output as JSON",
default: false,
})
.example("$0 evaluate presets", "List all available presets")
.example("$0 evaluate presets rag", "Show details of the RAG preset"),
handler: async (argv) => {
const { preset, json } = argv;
if (preset) {
// Show specific preset details
if (!isValidPreset(preset)) {
logger.error(chalk.red(`Unknown preset: ${preset}`));
logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
process.exit(1);
}
const config = getPreset(preset);
if (json) {
logger.always(JSON.stringify(config, null, 2));
}
else {
logger.always("");
logger.always(chalk.bold(`Preset: ${chalk.cyan(preset)}`));
logger.always(chalk.gray("-".repeat(50)));
if (config.description) {
logger.always(`Description: ${config.description}`);
}
logger.always(`Pass Threshold: ${config.passThreshold ?? 0.7}`);
logger.always(`Execution Mode: ${config.executionMode ?? "parallel"}`);
logger.always("");
logger.always(chalk.bold("Scorers:"));
for (const s of config.scorers) {
const weight = s.config?.weight ?? 1.0;
const threshold = s.config?.threshold ?? "default";
logger.always(` - ${chalk.cyan(s.id)} (weight: ${weight}, threshold: ${threshold})`);
}
if (config.requiredScorers && config.requiredScorers.length > 0) {
logger.always("");
logger.always(chalk.bold("Required Scorers: ") +
config.requiredScorers.join(", "));
}
if (config.aggregation) {
logger.always("");
logger.always(chalk.bold("Aggregation: ") + config.aggregation.method);
}
}
}
else {
// List all presets
const presets = getPresetNames();
if (json) {
const presetData = Object.fromEntries(presets.filter(isValidPreset).map((p) => [p, getPreset(p)]));
logger.always(JSON.stringify(presetData, null, 2));
}
else {
logger.always("");
logger.always(chalk.bold("Available Pipeline Presets:"));
logger.always(chalk.gray("-".repeat(50)));
for (const p of presets) {
if (isValidPreset(p)) {
const config = getPreset(p);
logger.always("");
logger.always(` ${chalk.cyan(p)}`);
if (config.description) {
logger.always(` ${chalk.gray(config.description)}`);
}
logger.always(` Scorers: ${config.scorers.map((s) => s.id).join(", ")}`);
}
}
logger.always("");
logger.always(chalk.gray('Use "neurolink evaluate presets <name>" for more details'));
}
}
},
};
/**
* Main evaluate command with subcommands
*/
export const evaluateCommand = {
command: "evaluate [subcommand]",
describe: "Evaluate AI responses using RAGAS-style scorers and pipelines",
builder: (yargs) => yargs
.command(listScorersCommand)
.command(runPipelineCommand)
.command(runCommand)
.command(scoreCommand)
.command(reportCommand)
.command(presetsCommand)
.option("input", {
type: "string",
describe: "AI response text to evaluate",
alias: "i",
})
.option("query", {
type: "string",
describe: "Original user query",
alias: "q",
})
.option("scorers", {
type: "array",
string: true,
describe: "List of scorers to use for evaluation",
alias: "s",
})
.option("context", {
type: "string",
describe: "Path to context file (JSON format)",
alias: "c",
})
.option("threshold", {
type: "number",
describe: "Minimum score threshold for passing (0-1)",
alias: "t",
})
.option("format", {
type: "string",
describe: "Output format",
choices: ["text", "json", "table"],
default: "text",
})
.option("json", {
type: "boolean",
describe: "Output results as JSON (shorthand for --format json)",
default: false,
})
.option("verbose", {
type: "boolean",
describe: "Show detailed reasoning and timing",
alias: "v",
default: false,
})
.example('$0 evaluate --input "Response text" --query "User question" --scorers hallucination toxicity', "Evaluate with specific scorers")
.example('$0 evaluate --input "Response" --query "Query" --context ./context.json --format json', "Evaluate with context file and JSON output")
.example("$0 evaluate list-scorers", "List all available scorers")
.example('$0 evaluate run-pipeline --preset quality --input "Response"', "Run quality pipeline evaluation"),
handler: async (argv) => {
const { input, query, scorers, context, threshold, json, verbose, format } = argv;
// If no input provided and no subcommand executed, show help
if (!input) {
return;
}
const outputFormat = json ? "json" : format;
const spinner = outputFormat === "json" ? null : ora("Running evaluation...").start();
try {
// Load context if provided
let contextArray;
if (context) {
if (fs.existsSync(context)) {
try {
const content = fs.readFileSync(context, "utf-8");
const parsed = JSON.parse(content);
contextArray = Array.isArray(parsed) ? parsed : [content];
}
catch {
contextArray = [context];
}
}
else {
contextArray = [context];
}
}
const scorerInput = {
query: query ?? "",
response: input,
context: contextArray,
};
let evaluationPipeline;
if (scorers && scorers.length > 0) {
const pipelineConfig = {
name: "CLI Custom Pipeline",
description: "Custom pipeline from CLI scorer arguments",
scorers: scorers.map((s) => ({ id: s })),
executionMode: "parallel",
passThreshold: threshold ?? 0.7,
};
evaluationPipeline = new EvaluationPipeline(pipelineConfig);
}
else {
const defaultPreset = getPreset("quality");
if (threshold !== undefined) {
defaultPreset.passThreshold = threshold;
}
evaluationPipeline = new EvaluationPipeline(defaultPreset);
}
await evaluationPipeline.initialize();
const result = await evaluationPipeline.execute(scorerInput);
spinner?.stop();
if (outputFormat === "json") {
logger.always(JSON.stringify(result, null, 2));
}
else if (outputFormat === "table") {
logger.always("");
logger.always(chalk.bold("Evaluation Results"));
logger.always(chalk.gray("-".repeat(50)));
logger.always(`${chalk.bold("Scorer".padEnd(25))} ${chalk.bold("Score".padEnd(10))} ${chalk.bold("Status")}`);
logger.always(chalk.gray("-".repeat(50)));
for (const score of result.scores) {
const status = score.passed ? chalk.green("PASS") : chalk.red("FAIL");
const scoreColor = score.passed ? chalk.green : chalk.red;
logger.always(`${score.scorerName.padEnd(25)} ${scoreColor(score.score.toFixed(2).padEnd(10))} ${status}`);
}
logger.always(chalk.gray("-".repeat(50)));
const overallColor = result.passed ? chalk.green : chalk.red;
logger.always(`${"Overall".padEnd(25)} ${overallColor(result.overallScore.toFixed(2).padEnd(10))} ${result.passed ? chalk.green("PASS") : chalk.red("FAIL")}`);
logger.always("");
logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
}
else {
logger.always("");
logger.always(chalk.bold("Evaluation Results"));
logger.always(chalk.gray("-".repeat(50)));
const overallColor = result.passed ? chalk.green : chalk.red;
const overallIcon = result.passed ? "PASS" : "FAIL";
logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
logger.always("");
logger.always(chalk.bold("Individual Scores:"));
for (const score of result.scores) {
logger.always(formatScoreResult(score, verbose ?? false));
}
if (result.errors.length > 0) {
logger.always("");
logger.always(chalk.yellow("Errors:"));
for (const error of result.errors) {
logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
}
}
logger.always("");
logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
}
}
catch (error) {
spinner?.fail("Evaluation failed");
const errorMessage = error instanceof Error ? error.message : String(error);
logger.error(chalk.red(`Error: ${errorMessage}`));
process.exit(1);
}
},
};
/**
* Create evaluate command factory for CLICommandFactory
*/
export class EvaluateCommandFactory {
/**
* Create the evaluate command module
*/
static createEvaluateCommand() {
return evaluateCommand;
}
/**
* List available scorers (utility method)
*/
static async listScorers() {
await ScorerRegistry.registerBuiltInScorers();
const scorerList = ScorerRegistry.list();
logger.always(chalk.bold("Available Scorers:"));
logger.always("");
for (const metadata of scorerList) {
logger.always(` ${chalk.cyan(metadata.id)}`);
logger.always(` ${chalk.gray(metadata.description)}`);
logger.always(` Type: ${metadata.type}, Category: ${metadata.category}`);
logger.always("");
}
}
/**
* List available pipeline presets (utility method)
*/
static listPipelines() {
const presets = getPresetNames();
logger.always(chalk.bold("Available Pipeline Presets:"));
logger.always("");
for (const preset of presets) {
if (isValidPreset(preset)) {
const config = getPreset(preset);
logger.always(` ${chalk.cyan(preset)}`);
if (config.description) {
logger.always(` ${chalk.gray(config.description)}`);
}
logger.always(` Scorers: ${config.scorers.map((s) => s.id).join(", ")}`);
logger.always("");
}
}
}
}
//# sourceMappingURL=evaluate.js.map