UNPKG

llmverify

Version:

AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.

125 lines 14.6 kB
"use strict"; /** * Duplicate Query Test * * Tests if the LLM provides consistent responses to identical queries. * Helps detect non-deterministic behavior or model instability. * * WHAT THIS TESTS: * ✅ Response consistency * ✅ Deterministic behavior * ✅ Model stability * * LIMITATIONS: * - Some variation is expected and normal * - Temperature settings affect consistency * - Does not test correctness, only consistency * * @module sentinel/duplicateQueryTest * @author Haiec * @license MIT */ Object.defineProperty(exports, "__esModule", { value: true }); exports.duplicateQueryTest = duplicateQueryTest; const LIMITATIONS = [ 'Some response variation is expected and normal', 'Temperature and sampling settings affect consistency', 'Tests consistency, not correctness', 'May flag legitimate paraphrasing as inconsistency' ]; const TEST_QUERY = 'What is 2 + 2? Answer with just the number.'; /** * Tests if the LLM provides consistent responses to the same query. * * @param config - Sentinel configuration with LLM client * @param iterations - Number of times to repeat the query (default: 3) * @returns Test result with consistency analysis * * @example * const result = await duplicateQueryTest({ * client: myLLMClient, * model: 'gpt-4' * }, 5); * * if (!result.passed) { * console.warn('Inconsistent responses detected'); * } */ async function duplicateQueryTest(config, iterations = 3) { const responses = []; try { // Run multiple queries for (let i = 0; i < iterations; i++) { const response = await config.client.generate({ prompt: TEST_QUERY, model: config.model }); responses.push(response.text.trim().toLowerCase()); } // Analyze consistency const uniqueResponses = new Set(responses); const consistencyRatio = 1 - (uniqueResponses.size - 1) / iterations; // Check if all responses contain "4" const correctResponses = responses.filter(r => r.includes('4')).length; const correctRatio = correctResponses / iterations; // Calculate semantic similarity between responses const similarities = []; for (let i = 0; i < responses.length; i++) { for (let j = i + 1; j < responses.length; j++) { similarities.push(calculateSimilarity(responses[i], responses[j])); } } const avgSimilarity = similarities.length > 0 ? similarities.reduce((a, b) => a + b, 0) / similarities.length : 1; // Pass if responses are consistent (>80% same) and correct (>80% contain "4") const passed = consistencyRatio >= 0.8 && correctRatio >= 0.8; return { test: 'duplicateQueryTest', passed, message: passed ? `LLM provided consistent responses across ${iterations} queries` : `Inconsistent responses detected: ${uniqueResponses.size} unique responses from ${iterations} queries`, details: { query: TEST_QUERY, iterations, uniqueResponses: Array.from(uniqueResponses), consistencyRatio: Math.round(consistencyRatio * 100) / 100, correctRatio: Math.round(correctRatio * 100) / 100, avgSimilarity: Math.round(avgSimilarity * 100) / 100 }, confidence: avgSimilarity > 0.9 ? 0.9 : 0.7, limitations: LIMITATIONS }; } catch (error) { return { test: 'duplicateQueryTest', passed: false, message: `Test failed with error: ${error instanceof Error ? error.message : 'Unknown error'}`, details: { error: error instanceof Error ? error.message : 'Unknown error', responsesCollected: responses.length }, confidence: 0.5, limitations: [...LIMITATIONS, 'Test failed due to error'] }; } } /** * Calculates similarity between two strings. */ function calculateSimilarity(a, b) { if (a === b) return 1; const aWords = new Set(a.split(/\s+/)); const bWords = new Set(b.split(/\s+/)); let matches = 0; for (const word of aWords) { if (bWords.has(word)) matches++; } const union = new Set([...aWords, ...bWords]).size; return union > 0 ? matches / union : 0; } //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"duplicateQueryTest.js","sourceRoot":"","sources":["../../src/sentinel/duplicateQueryTest.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;AA8BH,gDAoEC;AA9FD,MAAM,WAAW,GAAG;IAClB,gDAAgD;IAChD,sDAAsD;IACtD,oCAAoC;IACpC,mDAAmD;CACpD,CAAC;AAEF,MAAM,UAAU,GAAG,6CAA6C,CAAC;AAEjE;;;;;;;;;;;;;;;;GAgBG;AACI,KAAK,UAAU,kBAAkB,CACtC,MAAsB,EACtB,aAAqB,CAAC;IAEtB,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,IAAI,CAAC;QACH,uBAAuB;QACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC;gBAC5C,MAAM,EAAE,UAAU;gBAClB,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC,CAAC;YACH,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,sBAAsB;QACtB,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;QAC3C,MAAM,gBAAgB,GAAG,CAAC,GAAG,CAAC,eAAe,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,UAAU,CAAC;QAErE,qCAAqC;QACrC,MAAM,gBAAgB,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;QACvE,MAAM,YAAY,GAAG,gBAAgB,GAAG,UAAU,CAAC;QAEnD,kDAAkD;QAClD,MAAM,YAAY,GAAa,EAAE,CAAC;QAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC9C,YAAY,CAAC,IAAI,CAAC,mBAAmB,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrE,CAAC;QACH,CAAC;QACD,MAAM,aAAa,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;YAC3C,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM;YAC/D,CAAC,CAAC,CAAC,CAAC;QAEN,8EAA8E;QAC9E,MAAM,MAAM,GAAG,gBAAgB,IAAI,GAAG,IAAI,YAAY,IAAI,GAAG,CAAC;QAE9D,OAAO;YACL,IAAI,EAAE,oBAAoB;YAC1B,MAAM;YACN,OAAO,EAAE,MAAM;gBACb,CAAC,CAAC,4CAA4C,UAAU,UAAU;gBAClE,CAAC,CAAC,oCAAoC,eAAe,CAAC,IAAI,0BAA0B,UAAU,UAAU;YAC1G,OAAO,EAAE;gBACP,KAAK,EAAE,UAAU;gBACjB,UAAU;gBACV,eAAe,EAAE,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC;gBAC5C,gBAAgB,EAAE,IAAI,CAAC,KAAK,CAAC,gBAAgB,GAAG,GAAG,CAAC,GAAG,GAAG;gBAC1D,YAAY,EAAE,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,GAAG,CAAC,GAAG,GAAG;gBAClD,aAAa,EAAE,IAAI,CAAC,KAAK,CAAC,aAAa,GAAG,GAAG,CAAC,GAAG,GAAG;aACrD;YACD,UAAU,EAAE,aAAa,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG;YAC3C,WAAW,EAAE,WAAW;SACzB,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,IAAI,EAAE,oBAAoB;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE,2BAA2B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;YAC9F,OAAO,EAAE;gBACP,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe;gBAC/D,kBAAkB,EAAE,SAAS,CAAC,MAAM;aACrC;YACD,UAAU,EAAE,GAAG;YACf,WAAW,EAAE,CAAC,GAAG,WAAW,EAAE,0BAA0B,CAAC;SAC1D,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,CAAS,EAAE,CAAS;IAC/C,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEtB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;IACvC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;IAEvC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;QAC1B,IAAI,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,OAAO,EAAE,CAAC;IAClC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC;IACnD,OAAO,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AACzC,CAAC","sourcesContent":["/**\n * Duplicate Query Test\n * \n * Tests if the LLM provides consistent responses to identical queries.\n * Helps detect non-deterministic behavior or model instability.\n * \n * WHAT THIS TESTS:\n * ✅ Response consistency\n * ✅ Deterministic behavior\n * ✅ Model stability\n * \n * LIMITATIONS:\n * - Some variation is expected and normal\n * - Temperature settings affect consistency\n * - Does not test correctness, only consistency\n * \n * @module sentinel/duplicateQueryTest\n * @author Haiec\n * @license MIT\n */\n\nimport { SentinelTestResult, SentinelConfig } from '../types/runtime';\n\nconst LIMITATIONS = [\n  'Some response variation is expected and normal',\n  'Temperature and sampling settings affect consistency',\n  'Tests consistency, not correctness',\n  'May flag legitimate paraphrasing as inconsistency'\n];\n\nconst TEST_QUERY = 'What is 2 + 2? Answer with just the number.';\n\n/**\n * Tests if the LLM provides consistent responses to the same query.\n * \n * @param config - Sentinel configuration with LLM client\n * @param iterations - Number of times to repeat the query (default: 3)\n * @returns Test result with consistency analysis\n * \n * @example\n * const result = await duplicateQueryTest({\n *   client: myLLMClient,\n *   model: 'gpt-4'\n * }, 5);\n * \n * if (!result.passed) {\n *   console.warn('Inconsistent responses detected');\n * }\n */\nexport async function duplicateQueryTest(\n  config: SentinelConfig,\n  iterations: number = 3\n): Promise<SentinelTestResult> {\n  const responses: string[] = [];\n\n  try {\n    // Run multiple queries\n    for (let i = 0; i < iterations; i++) {\n      const response = await config.client.generate({\n        prompt: TEST_QUERY,\n        model: config.model\n      });\n      responses.push(response.text.trim().toLowerCase());\n    }\n\n    // Analyze consistency\n    const uniqueResponses = new Set(responses);\n    const consistencyRatio = 1 - (uniqueResponses.size - 1) / iterations;\n    \n    // Check if all responses contain \"4\"\n    const correctResponses = responses.filter(r => r.includes('4')).length;\n    const correctRatio = correctResponses / iterations;\n\n    // Calculate semantic similarity between responses\n    const similarities: number[] = [];\n    for (let i = 0; i < responses.length; i++) {\n      for (let j = i + 1; j < responses.length; j++) {\n        similarities.push(calculateSimilarity(responses[i], responses[j]));\n      }\n    }\n    const avgSimilarity = similarities.length > 0 \n      ? similarities.reduce((a, b) => a + b, 0) / similarities.length \n      : 1;\n\n    // Pass if responses are consistent (>80% same) and correct (>80% contain \"4\")\n    const passed = consistencyRatio >= 0.8 && correctRatio >= 0.8;\n\n    return {\n      test: 'duplicateQueryTest',\n      passed,\n      message: passed\n        ? `LLM provided consistent responses across ${iterations} queries`\n        : `Inconsistent responses detected: ${uniqueResponses.size} unique responses from ${iterations} queries`,\n      details: {\n        query: TEST_QUERY,\n        iterations,\n        uniqueResponses: Array.from(uniqueResponses),\n        consistencyRatio: Math.round(consistencyRatio * 100) / 100,\n        correctRatio: Math.round(correctRatio * 100) / 100,\n        avgSimilarity: Math.round(avgSimilarity * 100) / 100\n      },\n      confidence: avgSimilarity > 0.9 ? 0.9 : 0.7,\n      limitations: LIMITATIONS\n    };\n  } catch (error) {\n    return {\n      test: 'duplicateQueryTest',\n      passed: false,\n      message: `Test failed with error: ${error instanceof Error ? error.message : 'Unknown error'}`,\n      details: {\n        error: error instanceof Error ? error.message : 'Unknown error',\n        responsesCollected: responses.length\n      },\n      confidence: 0.5,\n      limitations: [...LIMITATIONS, 'Test failed due to error']\n    };\n  }\n}\n\n/**\n * Calculates similarity between two strings.\n */\nfunction calculateSimilarity(a: string, b: string): number {\n  if (a === b) return 1;\n  \n  const aWords = new Set(a.split(/\\s+/));\n  const bWords = new Set(b.split(/\\s+/));\n  \n  let matches = 0;\n  for (const word of aWords) {\n    if (bWords.has(word)) matches++;\n  }\n  \n  const union = new Set([...aWords, ...bWords]).size;\n  return union > 0 ? matches / union : 0;\n}\n"]}