llmverify
Version:
AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.
125 lines • 14.6 kB
JavaScript
;
/**
* Duplicate Query Test
*
* Tests if the LLM provides consistent responses to identical queries.
* Helps detect non-deterministic behavior or model instability.
*
* WHAT THIS TESTS:
* ✅ Response consistency
* ✅ Deterministic behavior
* ✅ Model stability
*
* LIMITATIONS:
* - Some variation is expected and normal
* - Temperature settings affect consistency
* - Primarily a consistency test; correctness is only spot-checked (each response must contain "4")
*
* @module sentinel/duplicateQueryTest
* @author Haiec
* @license MIT
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.duplicateQueryTest = duplicateQueryTest;
/**
 * Prompt sent to the model on every iteration. A trivial arithmetic question
 * is used so that responses can be compared and checked for the digit "4".
 */
const TEST_QUERY = 'What is 2 + 2? Answer with just the number.';
/**
 * Caveats attached to every result returned by this test so that consumers
 * can see what the outcome does and does not establish.
 */
const LIMITATIONS = [
    'Some response variation is expected and normal',
    'Temperature and sampling settings affect consistency',
    'Tests consistency, not correctness',
    'May flag legitimate paraphrasing as inconsistency',
];
/**
 * Tests if the LLM provides consistent responses to the same query.
 *
 * The fixed test prompt is sent `iterations` times, one request after another.
 * Each reply is trimmed and lower-cased, then three figures are derived:
 *  - consistencyRatio: 1 - (distinct replies - 1) / queries run
 *  - correctRatio: share of replies that contain the character "4"
 *  - avgSimilarity: mean pairwise word-overlap score between replies
 *
 * The test passes when both consistencyRatio and correctRatio are >= 0.8.
 * Any error (a throwing client, a reply without `text`, or no query being
 * run at all) is reported as a failed result rather than thrown.
 *
 * @param config - Sentinel configuration with LLM client
 * @param iterations - Number of times to repeat the query (default: 3)
 * @returns Test result with consistency analysis
 *
 * @example
 * const result = await duplicateQueryTest({
 *   client: myLLMClient,
 *   model: 'gpt-4'
 * }, 5);
 *
 * if (!result.passed) {
 *   console.warn('Inconsistent responses detected');
 * }
 */
async function duplicateQueryTest(config, iterations = 3) {
    const responses = [];
    try {
        // Run the queries sequentially so a failure stops the run early and
        // `responses.length` reflects how many replies were collected.
        for (let i = 0; i < iterations; i++) {
            const response = await config.client.generate({
                prompt: TEST_QUERY,
                model: config.model
            });
            responses.push(response.text.trim().toLowerCase());
        }
        // Use the number of replies actually collected as the denominator.
        // Dividing by the raw `iterations` argument produced NaN/Infinity for
        // 0 or NaN and ratios above 1 for fractional values.
        const total = responses.length;
        if (total === 0) {
            // Routed through the catch below so callers still receive a
            // failed result object instead of an exception.
            throw new RangeError(`iterations must be at least 1 (received ${iterations})`);
        }
        // Analyze consistency: every distinct reply beyond the first lowers the ratio
        const uniqueResponses = new Set(responses);
        const consistencyRatio = 1 - (uniqueResponses.size - 1) / total;
        // Count replies that contain "4"
        const correctResponses = responses.filter((r) => r.includes('4')).length;
        const correctRatio = correctResponses / total;
        // Average pairwise word-overlap similarity between replies
        const similarities = [];
        for (let i = 0; i < responses.length; i++) {
            for (let j = i + 1; j < responses.length; j++) {
                similarities.push(calculateSimilarity(responses[i], responses[j]));
            }
        }
        // A single reply has no pairs to compare; treat it as fully similar
        const avgSimilarity = similarities.length > 0
            ? similarities.reduce((a, b) => a + b, 0) / similarities.length
            : 1;
        // Pass if responses are consistent (ratio >= 0.8) and correct (>= 80% contain "4")
        const isConsistent = consistencyRatio >= 0.8;
        const passed = isConsistent && correctRatio >= 0.8;
        let message;
        if (passed) {
            message = `LLM provided consistent responses across ${total} queries`;
        }
        else if (!isConsistent) {
            message = `Inconsistent responses detected: ${uniqueResponses.size} unique responses from ${total} queries`;
        }
        else {
            // Replies agree with each other but are wrong; do not report this
            // as an inconsistency.
            message = `Incorrect responses detected: ${correctResponses} of ${total} responses contained the expected answer "4"`;
        }
        return {
            test: 'duplicateQueryTest',
            passed,
            message,
            details: {
                query: TEST_QUERY,
                iterations: total,
                uniqueResponses: Array.from(uniqueResponses),
                consistencyRatio: Math.round(consistencyRatio * 100) / 100,
                correctRatio: Math.round(correctRatio * 100) / 100,
                avgSimilarity: Math.round(avgSimilarity * 100) / 100
            },
            confidence: avgSimilarity > 0.9 ? 0.9 : 0.7,
            // Copy so callers cannot mutate the shared module-level array
            limitations: [...LIMITATIONS]
        };
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : 'Unknown error';
        return {
            test: 'duplicateQueryTest',
            passed: false,
            message: `Test failed with error: ${reason}`,
            details: {
                error: reason,
                responsesCollected: responses.length
            },
            confidence: 0.5,
            limitations: [...LIMITATIONS, 'Test failed due to error']
        };
    }
}
/**
 * Scores how alike two strings are by word overlap (Jaccard index).
 *
 * Each string is split on runs of whitespace into a set of distinct words;
 * the score is the number of words present in both sets divided by the
 * number of distinct words across both. Identical strings short-circuit to 1.
 *
 * @param a - First string
 * @param b - Second string
 * @returns A value from 0 (no shared words) to 1 (same word set)
 */
function calculateSimilarity(a, b) {
    if (a === b) {
        return 1;
    }
    const [left, right] = [a, b].map((text) => new Set(text.split(/\s+/)));
    const shared = [...left].filter((token) => right.has(token)).length;
    const combined = new Set([...left, ...right]).size;
    if (combined > 0) {
        return shared / combined;
    }
    return 0;
}
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"duplicateQueryTest.js","sourceRoot":"","sources":["../../src/sentinel/duplicateQueryTest.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;;;;;GAmBG;;AA8BH,gDAoEC;AA9FD,MAAM,WAAW,GAAG;IAClB,gDAAgD;IAChD,sDAAsD;IACtD,oCAAoC;IACpC,mDAAmD;CACpD,CAAC;AAEF,MAAM,UAAU,GAAG,6CAA6C,CAAC;AAEjE;;;;;;;;;;;;;;;;GAgBG;AACI,KAAK,UAAU,kBAAkB,CACtC,MAAsB,EACtB,aAAqB,CAAC;IAEtB,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,IAAI,CAAC;QACH,uBAAuB;QACvB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,UAAU,EAAE,CAAC,EAAE,EAAE,CAAC;YACpC,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,MAAM,CAAC,QAAQ,CAAC;gBAC5C,MAAM,EAAE,UAAU;gBAClB,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC,CAAC;YACH,SAAS,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,CAAC;QACrD,CAAC;QAED,sBAAsB;QACtB,MAAM,eAAe,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,CAAC;QAC3C,MAAM,gBAAgB,GAAG,CAAC,GAAG,CAAC,eAAe,CAAC,IAAI,GAAG,CAAC,CAAC,GAAG,UAAU,CAAC;QAErE,qCAAqC;QACrC,MAAM,gBAAgB,GAAG,SAAS,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC;QACvE,MAAM,YAAY,GAAG,gBAAgB,GAAG,UAAU,CAAC;QAEnD,kDAAkD;QAClD,MAAM,YAAY,GAAa,EAAE,CAAC;QAClC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;YAC1C,KAAK,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;gBAC9C,YAAY,CAAC,IAAI,CAAC,mBAAmB,CAAC,SAAS,CAAC,CAAC,CAAC,EAAE,SAAS,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YACrE,CAAC;QACH,CAAC;QACD,MAAM,aAAa,GAAG,YAAY,CAAC,MAAM,GAAG,CAAC;YAC3C,CAAC,CAAC,YAAY,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM;YAC/D,CAAC,CAAC,CAAC,CAAC;QAEN,8EAA8E;QAC9E,MAAM,MAAM,GAAG,gBAAgB,IAAI,GAAG,IAAI,YAAY,IAAI,GAAG,CAAC;QAE9D,OAAO;YACL,IAAI,EAAE,oBAAoB;YAC1B,MAAM;YACN,OAAO,EAAE,MAAM;gBACb,CAAC,CAAC,4CAA4C,UAAU,UAAU;gBAClE,CAAC,CAAC,oCAAoC,eAAe,CAAC,IAAI,0BAA0B,UAAU,UAAU;YAC1G,OAAO,EAAE;gBACP,KAAK,EAAE,UAAU;gBACjB,UAAU;gBACV,eAAe,EAAE,KAAK,CAAC,IAAI,CAAC,eAAe,CAAC;gBAC5C,gBAAgB,EAAE,IAAI,CAAC,KAAK,CAAC,gBAAgB,
GAAG,GAAG,CAAC,GAAG,GAAG;gBAC1D,YAAY,EAAE,IAAI,CAAC,KAAK,CAAC,YAAY,GAAG,GAAG,CAAC,GAAG,GAAG;gBAClD,aAAa,EAAE,IAAI,CAAC,KAAK,CAAC,aAAa,GAAG,GAAG,CAAC,GAAG,GAAG;aACrD;YACD,UAAU,EAAE,aAAa,GAAG,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG;YAC3C,WAAW,EAAE,WAAW;SACzB,CAAC;IACJ,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO;YACL,IAAI,EAAE,oBAAoB;YAC1B,MAAM,EAAE,KAAK;YACb,OAAO,EAAE,2BAA2B,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe,EAAE;YAC9F,OAAO,EAAE;gBACP,KAAK,EAAE,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,eAAe;gBAC/D,kBAAkB,EAAE,SAAS,CAAC,MAAM;aACrC;YACD,UAAU,EAAE,GAAG;YACf,WAAW,EAAE,CAAC,GAAG,WAAW,EAAE,0BAA0B,CAAC;SAC1D,CAAC;IACJ,CAAC;AACH,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,CAAS,EAAE,CAAS;IAC/C,IAAI,CAAC,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAEtB,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;IACvC,MAAM,MAAM,GAAG,IAAI,GAAG,CAAC,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC;IAEvC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,IAAI,IAAI,MAAM,EAAE,CAAC;QAC1B,IAAI,MAAM,CAAC,GAAG,CAAC,IAAI,CAAC;YAAE,OAAO,EAAE,CAAC;IAClC,CAAC;IAED,MAAM,KAAK,GAAG,IAAI,GAAG,CAAC,CAAC,GAAG,MAAM,EAAE,GAAG,MAAM,CAAC,CAAC,CAAC,IAAI,CAAC;IACnD,OAAO,KAAK,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;AACzC,CAAC","sourcesContent":["/**\n * Duplicate Query Test\n * \n * Tests if the LLM provides consistent responses to identical queries.\n * Helps detect non-deterministic behavior or model instability.\n * \n * WHAT THIS TESTS:\n * ✅ Response consistency\n * ✅ Deterministic behavior\n * ✅ Model stability\n * \n * LIMITATIONS:\n * - Some variation is expected and normal\n * - Temperature settings affect consistency\n * - Does not test correctness, only consistency\n * \n * @module sentinel/duplicateQueryTest\n * @author Haiec\n * @license MIT\n */\n\nimport { SentinelTestResult, SentinelConfig } from '../types/runtime';\n\nconst LIMITATIONS = [\n  'Some response variation is expected and normal',\n  'Temperature and sampling settings affect 
consistency',\n  'Tests consistency, not correctness',\n  'May flag legitimate paraphrasing as inconsistency'\n];\n\nconst TEST_QUERY = 'What is 2 + 2? Answer with just the number.';\n\n/**\n * Tests if the LLM provides consistent responses to the same query.\n * \n * @param config - Sentinel configuration with LLM client\n * @param iterations - Number of times to repeat the query (default: 3)\n * @returns Test result with consistency analysis\n * \n * @example\n * const result = await duplicateQueryTest({\n *   client: myLLMClient,\n *   model: 'gpt-4'\n * }, 5);\n * \n * if (!result.passed) {\n *   console.warn('Inconsistent responses detected');\n * }\n */\nexport async function duplicateQueryTest(\n  config: SentinelConfig,\n  iterations: number = 3\n): Promise<SentinelTestResult> {\n  const responses: string[] = [];\n\n  try {\n    // Run multiple queries\n    for (let i = 0; i < iterations; i++) {\n      const response = await config.client.generate({\n        prompt: TEST_QUERY,\n        model: config.model\n      });\n      responses.push(response.text.trim().toLowerCase());\n    }\n\n    // Analyze consistency\n    const uniqueResponses = new Set(responses);\n    const consistencyRatio = 1 - (uniqueResponses.size - 1) / iterations;\n    \n    // Check if all responses contain \"4\"\n    const correctResponses = responses.filter(r => r.includes('4')).length;\n    const correctRatio = correctResponses / iterations;\n\n    // Calculate semantic similarity between responses\n    const similarities: number[] = [];\n    for (let i = 0; i < responses.length; i++) {\n      for (let j = i + 1; j < responses.length; j++) {\n        similarities.push(calculateSimilarity(responses[i], responses[j]));\n      }\n    }\n    const avgSimilarity = similarities.length > 0 \n      ? 
similarities.reduce((a, b) => a + b, 0) / similarities.length \n      : 1;\n\n    // Pass if responses are consistent (>80% same) and correct (>80% contain \"4\")\n    const passed = consistencyRatio >= 0.8 && correctRatio >= 0.8;\n\n    return {\n      test: 'duplicateQueryTest',\n      passed,\n      message: passed\n        ? `LLM provided consistent responses across ${iterations} queries`\n        : `Inconsistent responses detected: ${uniqueResponses.size} unique responses from ${iterations} queries`,\n      details: {\n        query: TEST_QUERY,\n        iterations,\n        uniqueResponses: Array.from(uniqueResponses),\n        consistencyRatio: Math.round(consistencyRatio * 100) / 100,\n        correctRatio: Math.round(correctRatio * 100) / 100,\n        avgSimilarity: Math.round(avgSimilarity * 100) / 100\n      },\n      confidence: avgSimilarity > 0.9 ? 0.9 : 0.7,\n      limitations: LIMITATIONS\n    };\n  } catch (error) {\n    return {\n      test: 'duplicateQueryTest',\n      passed: false,\n      message: `Test failed with error: ${error instanceof Error ? error.message : 'Unknown error'}`,\n      details: {\n        error: error instanceof Error ? error.message : 'Unknown error',\n        responsesCollected: responses.length\n      },\n      confidence: 0.5,\n      limitations: [...LIMITATIONS, 'Test failed due to error']\n    };\n  }\n}\n\n/**\n * Calculates similarity between two strings.\n */\nfunction calculateSimilarity(a: string, b: string): number {\n  if (a === b) return 1;\n  \n  const aWords = new Set(a.split(/\\s+/));\n  const bWords = new Set(b.split(/\\s+/));\n  \n  let matches = 0;\n  for (const word of aWords) {\n    if (bWords.has(word)) matches++;\n  }\n  \n  const union = new Set([...aWords, ...bWords]).size;\n  return union > 0 ? matches / union : 0;\n}\n"]}