UNPKG

llmverify

Version:

AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.

122 lines 12.6 kB
"use strict";
/**
 * Short Reasoning Test
 *
 * Tests if the LLM can perform basic logical reasoning.
 * Uses simple, verifiable reasoning tasks.
 *
 * WHAT THIS TESTS:
 * ✅ Basic logical reasoning
 * ✅ Step-by-step thinking
 * ✅ Correct conclusion derivation
 *
 * LIMITATIONS:
 * - Tests very basic reasoning only
 * - May not detect subtle reasoning errors
 * - Answer extraction may miss valid formats (e.g. "three" instead of "3")
 * - A response that merely echoes the expected token is still counted as correct
 *
 * @module sentinel/shortReasoningTest
 * @author Haiec
 * @license MIT
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.shortReasoningTest = shortReasoningTest;

// Caveats attached to every result. Callers receive a copy, never this array.
const LIMITATIONS = [
    'Tests very basic reasoning only',
    'May not detect subtle reasoning errors',
    'Answer extraction may miss valid response formats',
    'Does not test complex multi-step reasoning'
];

// Simple reasoning problems with known answers
const REASONING_PROBLEMS = [
    {
        prompt: 'If all cats are animals, and Whiskers is a cat, is Whiskers an animal? Answer yes or no.',
        expectedAnswer: 'yes',
        type: 'syllogism'
    },
    {
        prompt: 'I have 5 apples. I give away 2. How many do I have left? Answer with just the number.',
        expectedAnswer: '3',
        type: 'arithmetic'
    },
    {
        prompt: 'Which is larger: 100 or 99? Answer with just the number.',
        expectedAnswer: '100',
        type: 'comparison'
    }
];

/**
 * Splits text into lowercase word and number tokens.
 *
 * Runs of ASCII letters form one token; runs of digits (with an optional
 * decimal part) form one token. Everything else is treated as a separator.
 *
 * @param {string} text - Text to tokenize.
 * @returns {string[]} Tokens in order of appearance (empty array if none).
 */
function tokenize(text) {
    return text.toLowerCase().match(/[a-z]+|\d+(?:\.\d+)?/g) || [];
}

/**
 * Checks whether the expected answer occurs in the response as whole tokens.
 *
 * A plain substring test would accept "13" for "3", "1000" for "100" and
 * "yesterday" for "yes"; comparing whole tokens avoids those false positives.
 *
 * @param {string} responseText - Raw model response.
 * @param {string} expectedAnswer - Known correct answer (one or more tokens).
 * @returns {boolean} True if the expected tokens appear contiguously.
 */
function containsAnswer(responseText, expectedAnswer) {
    const haystack = tokenize(responseText);
    const needle = tokenize(expectedAnswer);
    if (needle.length === 0) {
        return false;
    }
    return haystack.some((_, start) => needle.every((token, offset) => haystack[start + offset] === token));
}

/**
 * Tests if the LLM can perform basic reasoning.
 *
 * Sends each problem in REASONING_PROBLEMS to `config.client.generate`
 * sequentially and marks it correct when the expected answer appears as a
 * whole token in the response. The test passes when at least two thirds of
 * the problems are answered correctly. Any thrown error is caught and
 * reported as a failed result instead of being propagated.
 *
 * @param config - Sentinel configuration with LLM client; `config.client.generate`
 *   is called with `{ prompt, model }` and must resolve to an object with a
 *   string `text` property.
 * @returns Test result with reasoning analysis
 *
 * @example
 * const result = await shortReasoningTest({
 *   client: myLLMClient,
 *   model: 'gpt-4'
 * });
 *
 * if (!result.passed) {
 *   console.error('Reasoning test failed');
 * }
 */
async function shortReasoningTest(config) {
    const results = [];
    try {
        // Test each reasoning problem
        for (const problem of REASONING_PROBLEMS) {
            const response = await config.client.generate({
                prompt: problem.prompt,
                model: config.model
            });
            const trimmed = response.text.trim();
            // Whole-token comparison, not substring search (see containsAnswer)
            const correct = containsAnswer(trimmed, problem.expectedAnswer);
            results.push({
                problem,
                // Keep only a short excerpt so results stay compact
                response: trimmed.substring(0, 100),
                correct
            });
        }
        // Calculate success rate
        const correctCount = results.filter(r => r.correct).length;
        const successRate = correctCount / REASONING_PROBLEMS.length;
        // Pass if at least 2/3 problems are correct. Compared with integers so
        // the threshold is exactly two thirds regardless of the problem count.
        const passed = correctCount * 3 >= REASONING_PROBLEMS.length * 2;
        return {
            test: 'shortReasoningTest',
            passed,
            message: passed
                ? `LLM passed ${correctCount}/${REASONING_PROBLEMS.length} reasoning tests`
                : `LLM failed reasoning tests: ${correctCount}/${REASONING_PROBLEMS.length} correct`,
            details: {
                totalProblems: REASONING_PROBLEMS.length,
                correctCount,
                successRate: Math.round(successRate * 100) / 100,
                results: results.map(r => ({
                    type: r.problem.type,
                    expected: r.problem.expectedAnswer,
                    response: r.response,
                    correct: r.correct
                }))
            },
            confidence: successRate > 0.9 ? 0.9 : 0.75,
            // Copy so callers cannot mutate the shared module-level list
            limitations: [...LIMITATIONS]
        };
    }
    catch (error) {
        return {
            test: 'shortReasoningTest',
            passed: false,
            message: `Test failed with error: ${error instanceof Error ? error.message : 'Unknown error'}`,
            details: {
                error: error instanceof Error ? error.message : 'Unknown error',
                // Number of problems fully evaluated before the error occurred
                completedTests: results.length
            },
            confidence: 0.5,
            limitations: [...LIMITATIONS, 'Test failed due to error']
        };
    }
}
// NOTE(review): the inline source map below was generated for the previous build output; regenerate it from the TypeScript source.
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"shortReasoningTest.js","sourceRoot":"","sources":["../../src/sentinel/shortReasoningTest.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;AA8CH,gDAmEC;AA7GD,MAAM,WAAW,GAAG;IAClB,iCAAiC;IACjC,wCAAwC;IACxC,mDAAmD;IACnD,4CAA4C;CAC7C,CAAC;AAEF,+CAA+C;AAC/C,MAAM,kBAAkB,GAAG;IACzB;QACE,MAAM,EAAE,0FAA0F;QAClG,cAAc,EAAE,KAAK;QACrB,IAAI,EAAE,WAAW;KAClB;IACD;QACE,MAAM,EAAE,uFAAuF;QAC/F,cAAc,EAAE,GAAG;QACnB,IAAI,EAAE,YAAY;KACnB;IACD;QACE,MAAM,EAAE,0DAA0D;QAClE,cAAc,EAAE,KAAK;QACrB,IAAI,EAAE,YAAY;KACnB;CACF,CAAC;AAEF;;;;;;;;;;;;;;;GAeG;AACI,KAAK,UAAU,kBAAkB,CAAC,MAAsB;IAC7D,MAAM,OAAO,GAIR,EAAE,CAAC;IAER,IAAI,CAAC;QACH,8BAA8B;QAC9B,KAAK,MAAM,OAAO,IAAI,kBAAkB,EAAE,CAAC;YACzC,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC;gBAC5C,MAAM,EAAE,OAAO,CAAC,MAAM;gBACtB,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC,CAAC;YAEH,MAAM,IAAI,GAAG,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC;YAEhD,6CAA6C;YAC7C,MAAM,OAAO,GAAG,IAAI,CAAC,QAAQ,CAAC,OAAO,CAAC,cAAc,CAAC,WAAW,EAAE,CAAC,CAAC;YAEpE,OAAO,CAAC,IAAI,CAAC;gBACX,OAAO;gBACP,QAAQ,EAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,SAAS,CAAC,CAAC,EAAE,GAAG,CAAC;gBAChD,OAAO;aACR,CAAC,CAAC;QACL,CAAC;QAED,yBAAyB;QACzB,MAAM,YAAY,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC;QAC3D,MAAM,WAAW,GAAG,YAAY,GAAG,kBAAkB,CAAC,MAAM,CAAC;QAE7D,4CAA4C;QAC5C,MAAM,MAAM,GAAG,WAAW,IAAI,IAAI,CAAC;QAEnC,OAAO;YACL,IAAI,EAAE,oBAAoB;YAC1B,MAAM;YACN,OAAO,EAAE,MAAM;gBACb,CAAC,CAAC,cAAc,YAAY,IAAI,kBAAkB,CAAC,MAAM,kBAAkB;gBAC3E,CAAC,CAAC,+BAA+B,YAAY,IAAI,kBAAkB,CAAC,MAAM,UAAU;YACtF,OAAO,EAAE;gBACP,aAAa,EAAE,kBAAkB,CAAC,MAAM;gBACxC,YAAY;gBACZ,WAAW,EAAE,IAAI,CAAC,KAAK,CAAC,WAAW,GAAG,GAAG,CAAC,GAAG,GAAG;gBAChD,OAAO,EAAE,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;oBACzB,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,IAAI;oBACpB,QAAQ,EAAE,CAAC,CAAC,OAA
O,CAAC,cAAc;oBAClC,QAAQ,EAAE,CAAC,CAAC,QAAQ;oBACpB,OAAO,EAAE,CAAC,CAAC,OAAO;iBACnB,CAAC,CAAC;aACJ;YACD,UAAU,EAAE,WAAW,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI;YAC1C,WAAW,EAAE,WAAW;SACzB,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,IAAI,EAAE,oBAAoB;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE,2BAA2B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;YAC9F,OAAO,EAAE;gBACP,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe;gBAC/D,cAAc,EAAE,OAAO,CAAC,MAAM;aAC/B;YACD,UAAU,EAAE,GAAG;YACf,WAAW,EAAE,CAAC,GAAG,WAAW,EAAE,0BAA0B,CAAC;SAC1D,CAAC;IACJ,CAAC;AACH,CAAC","sourcesContent":["/**\n * Short Reasoning Test\n * \n * Tests if the LLM can perform basic logical reasoning.\n * Uses simple, verifiable reasoning tasks.\n * \n * WHAT THIS TESTS:\n * ✅ Basic logical reasoning\n * ✅ Step-by-step thinking\n * ✅ Correct conclusion derivation\n * \n * LIMITATIONS:\n * - Tests very basic reasoning only\n * - May not detect subtle reasoning errors\n * - Answer extraction may miss valid formats\n * \n * @module sentinel/shortReasoningTest\n * @author Haiec\n * @license MIT\n */\n\nimport { SentinelTestResult, SentinelConfig } from '../types/runtime';\n\nconst LIMITATIONS = [\n  'Tests very basic reasoning only',\n  'May not detect subtle reasoning errors',\n  'Answer extraction may miss valid response formats',\n  'Does not test complex multi-step reasoning'\n];\n\n// Simple reasoning problems with known answers\nconst REASONING_PROBLEMS = [\n  {\n    prompt: 'If all cats are animals, and Whiskers is a cat, is Whiskers an animal? Answer yes or no.',\n    expectedAnswer: 'yes',\n    type: 'syllogism'\n  },\n  {\n    prompt: 'I have 5 apples. I give away 2. How many do I have left? Answer with just the number.',\n    expectedAnswer: '3',\n    type: 'arithmetic'\n  },\n  {\n    prompt: 'Which is larger: 100 or 99? 
Answer with just the number.',\n    expectedAnswer: '100',\n    type: 'comparison'\n  }\n];\n\n/**\n * Tests if the LLM can perform basic reasoning.\n * \n * @param config - Sentinel configuration with LLM client\n * @returns Test result with reasoning analysis\n * \n * @example\n * const result = await shortReasoningTest({\n *   client: myLLMClient,\n *   model: 'gpt-4'\n * });\n * \n * if (!result.passed) {\n *   console.error('Reasoning test failed');\n * }\n */\nexport async function shortReasoningTest(config: SentinelConfig): Promise<SentinelTestResult> {\n  const results: Array<{\n    problem: typeof REASONING_PROBLEMS[0];\n    response: string;\n    correct: boolean;\n  }> = [];\n\n  try {\n    // Test each reasoning problem\n    for (const problem of REASONING_PROBLEMS) {\n      const response = await config.client.generate({\n        prompt: problem.prompt,\n        model: config.model\n      });\n\n      const text = response.text.trim().toLowerCase();\n      \n      // Check if response contains expected answer\n      const correct = text.includes(problem.expectedAnswer.toLowerCase());\n\n      results.push({\n        problem,\n        response: response.text.trim().substring(0, 100),\n        correct\n      });\n    }\n\n    // Calculate success rate\n    const correctCount = results.filter(r => r.correct).length;\n    const successRate = correctCount / REASONING_PROBLEMS.length;\n\n    // Pass if at least 2/3 problems are correct\n    const passed = successRate >= 0.66;\n\n    return {\n      test: 'shortReasoningTest',\n      passed,\n      message: passed\n        ? 
`LLM passed ${correctCount}/${REASONING_PROBLEMS.length} reasoning tests`\n        : `LLM failed reasoning tests: ${correctCount}/${REASONING_PROBLEMS.length} correct`,\n      details: {\n        totalProblems: REASONING_PROBLEMS.length,\n        correctCount,\n        successRate: Math.round(successRate * 100) / 100,\n        results: results.map(r => ({\n          type: r.problem.type,\n          expected: r.problem.expectedAnswer,\n          response: r.response,\n          correct: r.correct\n        }))\n      },\n      confidence: successRate > 0.9 ? 0.9 : 0.75,\n      limitations: LIMITATIONS\n    };\n  } catch (error) {\n    return {\n      test: 'shortReasoningTest',\n      passed: false,\n      message: `Test failed with error: ${error instanceof Error ? error.message : 'Unknown error'}`,\n      details: {\n        error: error instanceof Error ? error.message : 'Unknown error',\n        completedTests: results.length\n      },\n      confidence: 0.5,\n      limitations: [...LIMITATIONS, 'Test failed due to error']\n    };\n  }\n}\n"]}