UNPKG

claude-flow

Version:

Ruflo - Enterprise AI agent orchestration for Claude Code. Deploy 60+ specialized agents in coordinated swarms with self-learning, fault-tolerant consensus, vector memory, and MCP integration

170 lines 9.13 kB
/**
 * GAIA Hardness Predictor — Feature Extraction (ADR-136 Track Q)
 *
 * Extracts a 17-dimensional feature vector from a GaiaQuestion for use
 * in the linear hardness classifier. Deliberately avoids external
 * dependencies (no spacy, no heavy NLP) — all extraction is regex-based
 * and uses a fixed number of regex passes over the question text.
 *
 * Feature vector layout (17 dims):
 *   [0]  question length in characters (normalised / 500)
 *   [1]  question length in words (normalised / 100)
 *   [2]  sentence count (normalised / 5)
 *   [3]  question starts with "what/which" (0/1)
 *   [4]  question starts with "how" (0/1)
 *   [5]  question starts with "who/when/where/why/whom" (0/1)
 *   [6]  contains "calculate/compute/how many/how much/what percentage" (0/1)
 *   [7]  has numeric token (0/1)
 *   [8]  has year token (4-digit, 1000-2099) (0/1)
 *   [9]  has comparison keyword (0/1)
 *   [10] estimated named-entity count (normalised / 5)
 *   [11] digit-token count (normalised / 5)
 *   [12] multi-hop signal (0/1) — bridge/relative clause pattern
 *   [13] requires math (0/1) — "how many/much/percentage/calculate/compute/..."
 *   [14] temporal chain (0/1) — "before/after/since/until/between <number>/..."
 *   [15] tool implication count (normalised / 4) — file/image/video/URL markers
 *   [16] file attachment present (0/1)
 *
 * All continuous values are normalised to [0, 1] using fixed divisors
 * chosen so typical GAIA questions fall in [0.1, 0.9]; larger values are
 * capped at 1.
 *
 * Refs: ADR-136, #2156
 */
// ---------------------------------------------------------------------------
// Feature labels (in order, matches values array)
// ---------------------------------------------------------------------------
export const FEATURE_LABELS = [
    'len_chars_norm',
    'len_words_norm',
    'sentence_count_norm',
    'qword_what',
    'qword_how',
    'qword_who_when_where',
    'qword_calc_compute',
    'has_numeric',
    'has_year',
    'has_comparison',
    'entity_count_norm',
    'digit_token_norm',
    'multi_hop_signal',
    'requires_math',
    'temporal_chain',
    'tool_implication_norm',
    'has_file_attachment',
];
// ---------------------------------------------------------------------------
// Regex catalogue (compiled once at module load)
//
// Global (/g) regexes are only ever used through String.prototype.match,
// which resets lastIndex itself; the non-global ones are safe to reuse with
// .test().
// ---------------------------------------------------------------------------
// Named entity proxy: capitalised word sequences, e.g. "Barack Obama",
// "New York City". The lookbehind rejects a match that starts at the very
// beginning of the text or right after a period plus 1-3 whitespace chars.
const RE_ENTITY = /(?<!^|\.\s{1,3})(?:[A-Z][a-z]+(?:\s+[A-Z][a-z]+){0,4})/g;
const RE_YEAR = /\b(1[0-9]{3}|20[0-9]{2})\b/g;
const RE_DIGIT_TOKEN = /\b\d[\d,._]*/g;
const RE_NUMERIC = /\b\d/;
const RE_COMPARISON = /\b(more|fewer|greater|less|compare|versus|vs\.?|differ|increase|decrease|ratio|proportion)\b/i;
const RE_MULTI_HOP = /\b(the\s+\w+\s+that\s+|which\s+was\s+|who\s+was\s+|whose\s+|of\s+the\s+\w+\s+that\s+)/i;
// Requires math: stems without trailing \b to handle "calculate", "multiply",
// "multiplication", "division", etc. `multipl[iy]` covers "multiply" and
// "multiplied" without also matching the unrelated word "multiple".
const RE_MATH = /\b(how\s+many|how\s+much|what\s+percentage|what\s+fraction|what\s+proportion|calculat|comput|multipl[iy]|divis|subtract|sum\s+of|total\s+of|average\s+of|product\s+of|express\s+as\s+a\s+decimal)/i;
// `between\s+\d+` consumes the whole number so the trailing \b can succeed on
// multi-digit values such as "between 2000 and 2009".
const RE_TEMPORAL = /\b(before|after|since|until|during|between\s+\d+|in\s+the\s+year|by\s+the\s+time|at\s+the\s+time)\b/i;
// Tool implication markers
const RE_TOOL_FILE = /\b(pdf|\.docx|\.xlsx|\.csv|file|attachment|document|spreadsheet)\b/i;
const RE_TOOL_IMAGE = /\b(image|photo|picture|screenshot|figure|diagram|chart)\b/i;
const RE_TOOL_VIDEO = /\b(video|youtube|clip|footage)\b/i;
const RE_TOOL_URL = /https?:\/\//i;
// Question-word detection (anchored at the start of the trimmed question)
const RE_QWORD_WHAT = /^(what|which)\b/i;
const RE_QWORD_HOW = /^how\b/i;
const RE_QWORD_WHO_WHEN_WHERE = /^(who|when|where|why|whom)\b/i;
// calc/compute: fires if the question contains one of these anywhere (stems, no trailing \b)
const RE_QWORD_CALC = /\b(calculat|comput|how\s+many|what\s+percentage|how\s+much)/i;
// Sentence boundary (rough — period/exclamation/question-mark followed by space+capital)
const RE_SENTENCE = /[.!?]\s+[A-Z]/g;
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Scale `v` by `max` and cap the result at 1. Callers only pass
 * non-negative counts, so no lower bound is applied.
 */
function clamp(v, max) {
    return Math.min(v / max, 1.0);
}
/** Number of matches of the global regex `re` in `text` (0 when none). */
function countMatches(text, re) {
    const found = text.match(re);
    return found ? found.length : 0;
}
/** 1 when the non-global regex `re` matches `text`, else 0. */
function flag(re, text) {
    return re.test(text) ? 1 : 0;
}
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Extract the 17-dimensional feature vector from a GaiaQuestion.
 * All features are in [0, 1]. Never throws: a missing question object, a
 * non-string `question`, or a non-string `file_name` is treated as absent.
 *
 * @param {{question?: string, file_name?: string} | null | undefined} q
 *   Question record; only `question` and `file_name` are read.
 * @returns {{values: number[], labels: string[]}} `values` in the order
 *   given by `labels`, which is the shared FEATURE_LABELS array (not a copy).
 */
export function extractFeatures(q) {
    const text = typeof q?.question === 'string' ? q.question : '';
    const fileName = typeof q?.file_name === 'string' ? q.file_name.trim() : '';
    const trimmed = text.trim();
    // ── Syntactic scalars ──────────────────────────────────────────────────
    const lenChars = text.length;
    const lenWords = trimmed.split(/\s+/).filter(Boolean).length;
    // One sentence is assumed even for empty text; each boundary adds one.
    const sentenceCount = 1 + countMatches(text, RE_SENTENCE);
    // ── Question-word one-hot (anchored regexes see the trimmed text) ─────
    const qwWhat = flag(RE_QWORD_WHAT, trimmed);
    const qwHow = flag(RE_QWORD_HOW, trimmed);
    const qwWhoWhenWhere = flag(RE_QWORD_WHO_WHEN_WHERE, trimmed);
    const qwCalcCompute = flag(RE_QWORD_CALC, text);
    // ── Lexical booleans ──────────────────────────────────────────────────
    const hasNumeric = flag(RE_NUMERIC, text);
    const hasYear = countMatches(text, RE_YEAR) > 0 ? 1 : 0;
    const hasComparison = flag(RE_COMPARISON, text);
    // ── Counts ────────────────────────────────────────────────────────────
    const entityCount = countMatches(text, RE_ENTITY);
    const digitTokenCount = countMatches(text, RE_DIGIT_TOKEN);
    // ── Reasoning-shape signals ───────────────────────────────────────────
    const multiHopSignal = flag(RE_MULTI_HOP, text);
    const requiresMath = flag(RE_MATH, text);
    const temporalChain = flag(RE_TEMPORAL, text);
    // ── File attachment ───────────────────────────────────────────────────
    // A blank / whitespace-only name is not an attachment.
    const hasFileAttachment = fileName !== '' ? 1 : 0;
    // ── Tool implication (file / image / video / URL markers) ─────────────
    // An actual attachment counts as a file marker even if the text never
    // mentions one; the same attachment test is used as for feature [16].
    let toolImplication = 0;
    if (RE_TOOL_FILE.test(text) || hasFileAttachment === 1) toolImplication++;
    if (RE_TOOL_IMAGE.test(text)) toolImplication++;
    if (RE_TOOL_VIDEO.test(text)) toolImplication++;
    if (RE_TOOL_URL.test(text)) toolImplication++;
    // ── Assemble normalised vector ────────────────────────────────────────
    const values = [
        clamp(lenChars, 500), // [0]
        clamp(lenWords, 100), // [1]
        clamp(sentenceCount, 5), // [2]
        qwWhat, // [3]
        qwHow, // [4]
        qwWhoWhenWhere, // [5]
        qwCalcCompute, // [6]
        hasNumeric, // [7]
        hasYear, // [8]
        hasComparison, // [9]
        clamp(entityCount, 5), // [10]
        clamp(digitTokenCount, 5), // [11]
        multiHopSignal, // [12]
        requiresMath, // [13]
        temporalChain, // [14]
        clamp(toolImplication, 4), // [15]
        hasFileAttachment, // [16]
    ];
    return { values, labels: FEATURE_LABELS };
}
//# sourceMappingURL=features.js.map