UNPKG

llmverify

Version:

AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.

122 lines 12.6 kB
"use strict";
/**
 * Short Reasoning Test
 *
 * Tests if the LLM can perform basic logical reasoning.
 * Uses simple, verifiable reasoning tasks.
 *
 * WHAT THIS TESTS:
 * ✅ Basic logical reasoning
 * ✅ Step-by-step thinking
 * ✅ Correct conclusion derivation
 *
 * LIMITATIONS:
 * - Tests very basic reasoning only
 * - May not detect subtle reasoning errors
 * - Answer extraction may miss valid formats
 *
 * HOW ANSWERS ARE CHECKED:
 * The expected answer must appear in the response as a standalone token,
 * i.e. not glued to other letters or digits. "The answer is 3." matches "3";
 * "13", "30" and "1000" do not match "3" / "100"; "eyes" does not match "yes".
 *
 * @module sentinel/shortReasoningTest
 * @author Haiec
 * @license MIT
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.shortReasoningTest = shortReasoningTest;

/** Caveats attached to every result produced by this test. */
const LIMITATIONS = [
    'Tests very basic reasoning only',
    'May not detect subtle reasoning errors',
    'Answer extraction may miss valid response formats',
    'Does not test complex multi-step reasoning'
];

/** Simple reasoning problems with known answers. */
const REASONING_PROBLEMS = [
    {
        prompt: 'If all cats are animals, and Whiskers is a cat, is Whiskers an animal? Answer yes or no.',
        expectedAnswer: 'yes',
        type: 'syllogism'
    },
    {
        prompt: 'I have 5 apples. I give away 2. How many do I have left? Answer with just the number.',
        expectedAnswer: '3',
        type: 'arithmetic'
    },
    {
        prompt: 'Which is larger: 100 or 99? Answer with just the number.',
        expectedAnswer: '100',
        type: 'comparison'
    }
];

/** Minimum fraction of correct answers required to pass (roughly 2 out of 3). */
const PASS_THRESHOLD = 0.66;

/**
 * Escapes regular-expression metacharacters so `value` is matched literally.
 *
 * @param {string} value - Text to embed in a RegExp source.
 * @returns {string} The escaped text.
 */
function escapeRegExp(value) {
    return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Checks whether `text` contains `expectedAnswer` as a standalone token.
 *
 * A plain substring test would accept "13" for "3" or "eyes" for "yes", so the
 * answer must be delimited by the start/end of the text or by a character that
 * is neither an ASCII letter nor a digit. Matching is case-insensitive.
 *
 * @param {string} text - Model response.
 * @param {string} expectedAnswer - Known correct answer.
 * @returns {boolean} True when the answer appears as its own token.
 */
function containsAnswer(text, expectedAnswer) {
    // No `g` flag: the RegExp is used once, so there is no lastIndex state.
    const pattern = new RegExp(
        `(?:^|[^a-z0-9])${escapeRegExp(expectedAnswer)}(?:$|[^a-z0-9])`,
        'i'
    );
    return pattern.test(text);
}

/**
 * Tests if the LLM can perform basic reasoning.
 *
 * Sends each problem in REASONING_PROBLEMS to `config.client.generate` with
 * `{ prompt, model: config.model }` and reads the `text` property of the
 * resolved response. The test passes when the fraction of correct answers is
 * at least PASS_THRESHOLD.
 *
 * This function does not reject: any error thrown while querying the client
 * or reading a response is caught and reported as a failed result whose
 * `details.completedTests` says how many problems finished before the error.
 *
 * @param config - Sentinel configuration with LLM client
 * @returns Test result with reasoning analysis: `test`, `passed`, `message`,
 *   `details`, `confidence` and `limitations`.
 *
 * @example
 * const result = await shortReasoningTest({
 *   client: myLLMClient,
 *   model: 'gpt-4'
 * });
 *
 * if (!result.passed) {
 *   console.error('Reasoning test failed');
 * }
 */
async function shortReasoningTest(config) {
    const results = [];
    try {
        // Problems run one at a time so that, if a request fails,
        // `completedTests` reflects exactly how many finished before it.
        for (const problem of REASONING_PROBLEMS) {
            const response = await config.client.generate({
                prompt: problem.prompt,
                model: config.model
            });
            const answerText = response.text.trim();
            results.push({
                problem,
                // Keep only a short excerpt of the response for reporting.
                response: answerText.substring(0, 100),
                correct: containsAnswer(answerText, problem.expectedAnswer)
            });
        }

        const totalProblems = REASONING_PROBLEMS.length;
        const correctCount = results.filter((r) => r.correct).length;
        const successRate = correctCount / totalProblems;
        const passed = successRate >= PASS_THRESHOLD;

        return {
            test: 'shortReasoningTest',
            passed,
            message: passed
                ? `LLM passed ${correctCount}/${totalProblems} reasoning tests`
                : `LLM failed reasoning tests: ${correctCount}/${totalProblems} correct`,
            details: {
                totalProblems,
                correctCount,
                // Rounded to two decimals for display.
                successRate: Math.round(successRate * 100) / 100,
                results: results.map((r) => ({
                    type: r.problem.type,
                    expected: r.problem.expectedAnswer,
                    response: r.response,
                    correct: r.correct
                }))
            },
            confidence: successRate > 0.9 ? 0.9 : 0.75,
            // Copy so callers cannot mutate the shared module-level list.
            limitations: [...LIMITATIONS]
        };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : 'Unknown error';
        return {
            test: 'shortReasoningTest',
            passed: false,
            message: `Test failed with error: ${reason}`,
            details: {
                error: reason,
                completedTests: results.length
            },
            confidence: 0.5,
            limitations: [...LIMITATIONS, 'Test failed due to error']
        };
    }
}