claude-flow
Version:
Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration
179 lines • 6.49 kB
JavaScript
/**
* GAIA Hardness Predictor — Training Data Loader (ADR-136 Track Q)
*
* Loads labelled training examples from prior bench-run result JSONs
* (iter-15, iter-23, iter-28 outputs) and converts them into the
* `LabeledExample[]` format consumed by `HardnessPredictor.train()`.
*
* Expected result JSON schema (matches gaia-bench --output json):
* {
* level: number,
* model: string,
* summary: { total, passed, passRate, estCostUsd, meanTurns, meanWallMs },
* results: [
* {
* task_id: string, question: string, model: string, correct: boolean,
* answer: string | null, expected_output: string, error?: string,
* turns?: number, wallMs?: number, inputTokens?: number, outputTokens?: number
* }
* ]
* }
*
* The file may contain either:
* (a) a single JSON object (one model run), or
* (b) a JSON array of objects (multi-model run from --models a,b,c), or
* (c) a text preamble followed by JSON (raw output from gaia-bench text mode
* — we scan for the first '[' or '{' and parse from there).
*
* Missing files are silently skipped (returns empty array).
* Malformed files emit a warning to stderr and are skipped.
*
* Default search paths (scanned in this order; when a task_id appears in
* more than one file, the first occurrence is kept):
* /tmp/gaia-l1-full.json
* /tmp/gaia-l1-haiku.json
* /tmp/gaia-all-p1b.json
* /tmp/gaia-all-p2.json
* /tmp/gaia-all-probe.json
* <custom paths passed by caller>
*
* Refs: ADR-136, #2156
*/
import * as fs from 'node:fs';
// ---------------------------------------------------------------------------
// Default result-file search paths
// ---------------------------------------------------------------------------
/**
 * Built-in candidate locations of historical bench-run result JSONs.
 *
 * Every entry has the form `/tmp/gaia-<run>.json`; the exported paths keep
 * the order of the run names listed below.
 */
export const DEFAULT_RESULT_PATHS = [
    'l1-full',
    'l1-haiku',
    'all-p1b',
    'all-p2',
    'all-probe',
].map((run) => `/tmp/gaia-${run}.json`);
// ---------------------------------------------------------------------------
// JSON extraction helper
// ---------------------------------------------------------------------------
/**
 * Extract and parse the JSON value (object or array) embedded in a string
 * that may carry a text preamble before the JSON.
 *
 * Strategy: visit every '[' and '{' in the order they appear and try to
 * `JSON.parse` the remainder of the string from that position. The first
 * position that parses wins. Trying every opener (rather than only the first
 * '[' and the first '{') lets the preamble itself contain brackets or braces,
 * e.g. log lines that start with a "[tag]" prefix.
 *
 * Text after the JSON value is not tolerated: `JSON.parse` must accept
 * everything from the chosen opener to the end of the string.
 *
 * @param {string} content - Raw text that may contain a JSON payload.
 * @returns {*} The parsed value, or null when no opener position parses.
 */
function extractJson(content) {
    const openers = /[\[{]/g;
    for (const { index } of content.matchAll(openers)) {
        try {
            return JSON.parse(content.slice(index));
        }
        catch {
            // This opener is not the start of a valid payload — try the next one.
        }
    }
    return null;
}
// ---------------------------------------------------------------------------
// Parse a single file into RawBenchOutput[]
// ---------------------------------------------------------------------------
/**
 * Read one result file and return the bench outputs it contains.
 *
 * A bench output is any non-null object whose `results` property is an
 * array. The file may hold a single such object or an array of them; array
 * items that do not match are dropped.
 *
 * Never throws for I/O or parse problems:
 *  - a missing file (ENOENT) yields `[]` silently;
 *  - any other read failure (e.g. permission denied, path is a directory)
 *    yields `[]` and a warning on stderr;
 *  - content with no extractable JSON, or JSON without any valid bench
 *    output, yields `[]` and a warning on stderr.
 *
 * @param {string} filePath - Path of the result file to read.
 * @returns {object[]} Bench outputs found in the file (possibly empty).
 */
function parseBenchFile(filePath) {
    let content;
    try {
        content = fs.readFileSync(filePath, 'utf-8');
    }
    catch (err) {
        // Only "file does not exist" is an expected, silent condition; other
        // failures would otherwise make a present-but-unreadable file vanish
        // without any trace.
        const code = err !== null && typeof err === 'object' ? err.code : undefined;
        if (code !== 'ENOENT') {
            const reason = err instanceof Error ? err.message : String(err);
            process.stderr.write(`[gaia-hardness] Warning: could not read ${filePath}: ${reason}\n`);
        }
        return [];
    }
    const parsed = extractJson(content);
    if (parsed === null) {
        process.stderr.write(`[gaia-hardness] Warning: could not extract JSON from ${filePath}\n`);
        return [];
    }
    // Normalise to an array of candidates, then keep only valid bench outputs.
    const isBenchOutput = (value) => value !== null &&
        typeof value === 'object' &&
        Array.isArray(value.results);
    const candidates = Array.isArray(parsed) ? parsed : [parsed];
    const outputs = candidates.filter(isBenchOutput);
    if (outputs.length === 0) {
        process.stderr.write(`[gaia-hardness] Warning: no valid bench outputs found in ${filePath}\n`);
    }
    return outputs;
}
// ---------------------------------------------------------------------------
// Convert RawQuestionResult → LabeledExample
// ---------------------------------------------------------------------------
/**
 * Build a question record from one raw per-question bench result.
 *
 * Copies `task_id` and `question`, exposes `expected_output` as
 * `final_answer`, and sets both file fields to null.
 *
 * NOTE(review): `level` is always 1 because the per-question record carries
 * no level. The file header's schema lists a top-level `level` on the bench
 * output — confirm whether it should be threaded through instead.
 *
 * @param {object} r - Raw per-question result.
 * @returns {object} Question record with a fixed `level` of 1.
 */
function toGaiaQuestion(r) {
    const { task_id, question, expected_output } = r;
    return {
        task_id,
        level: 1,
        question,
        final_answer: expected_output,
        file_name: null,
        file_path: null,
    };
}
/**
 * Wrap one raw per-question bench result as a labelled training example.
 *
 * @param {object} r - Raw per-question result.
 * @returns {object} `{ question, wasCorrect, turns }`, where `question` comes
 *   from `toGaiaQuestion(r)`, `wasCorrect` is `r.correct` coerced to a
 *   boolean, and `turns` is passed through unchanged (may be undefined).
 */
function toBenchLabeledExample(r) {
    const question = toGaiaQuestion(r);
    const wasCorrect = !!r.correct;
    const { turns } = r;
    return { question, wasCorrect, turns };
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Load labelled training examples from historical bench-run result JSONs.
 *
 * Scans `DEFAULT_RESULT_PATHS` first, then `additionalPaths`, in order.
 * Paths that do not exist are skipped. Within the files that do exist, a
 * result is used only when it is an object with a truthy `task_id` and a
 * boolean `correct`; anything else (including null or non-object entries in
 * a `results` array) is ignored rather than aborting the load.
 *
 * @param additionalPaths - Extra file paths to scan beyond the defaults.
 * @param verbose - If true, log loaded example counts to stderr.
 * @returns Deduplicated array of LabeledExample (dedup by task_id; the first
 *   occurrence in scan order wins, later duplicates are dropped).
 */
export function loadTrainingData(additionalPaths = [], verbose = false) {
    const allPaths = [...DEFAULT_RESULT_PATHS, ...additionalPaths];
    const seen = new Set();
    const examples = [];
    for (const filePath of allPaths) {
        if (!fs.existsSync(filePath)) {
            continue;
        }
        let fileCount = 0;
        for (const output of parseBenchFile(filePath)) {
            for (const result of output.results) {
                // Only the `results` container is validated upstream, so an
                // individual entry may still be null or a primitive.
                if (result === null || typeof result !== 'object') {
                    continue;
                }
                if (!result.task_id || typeof result.correct !== 'boolean') {
                    continue;
                }
                const key = result.task_id;
                if (seen.has(key)) {
                    continue; // first occurrence wins (files are scanned in order)
                }
                seen.add(key);
                examples.push(toBenchLabeledExample(result));
                fileCount++;
            }
        }
        if (verbose && fileCount > 0) {
            process.stderr.write(`[gaia-hardness] Loaded ${fileCount} examples from ${filePath}\n`);
        }
    }
    if (verbose) {
        process.stderr.write(`[gaia-hardness] Total training examples: ${examples.length}\n`);
    }
    return examples;
}
//# sourceMappingURL=train-data-loader.js.map