UNPKG

llmverify

Version:

AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.

189 lines 20.4 kB
"use strict"; /** * Hallucination Risk Heuristic Module * * Detects potential hallucination signals in LLM output. * Uses heuristics - not definitive hallucination detection. * * LIMITATIONS: * - Heuristic-based, not ground-truth verification * - May produce false positives/negatives * - Cannot detect factually incorrect but plausible statements * - Requires prompt context for best results * * @module engines/classification/hallucination * @author Haiec * @license MIT */ Object.defineProperty(exports, "__esModule", { value: true }); exports.DEFAULT_HALLUCINATION_WEIGHTS = void 0; exports.calculateHallucinationSignals = calculateHallucinationSignals; exports.calculateHallucinationRisk = calculateHallucinationRisk; exports.getHallucinationLabel = getHallucinationLabel; const utils_1 = require("./utils"); /** * Overconfident language patterns. */ const OVERCONFIDENT_PATTERNS = [ 'definitely', 'certainly', 'guaranteed', 'proven', 'no doubt', 'undeniable', 'absolutely', 'without question', '100%', 'always', 'never fails' ]; /** * Contradiction patterns (simplified). */ const CONTRADICTION_PATTERNS = [ { positive: /is required/i, negative: /is optional/i }, { positive: /must/i, negative: /does not need to/i }, { positive: /always/i, negative: /never/i }, { positive: /can/i, negative: /cannot/i }, { positive: /will/i, negative: /will not/i } ]; /** * Calculates speculative facts score. * Looks for capitalized entities in output not present in prompt. */ function calculateSpeculativeFactsScore(prompt, output) { const promptEntities = (0, utils_1.extractCapitalizedTokens)(prompt); const outputEntities = (0, utils_1.extractCapitalizedTokens)(output); let newEntities = 0; for (const entity of outputEntities) { if (!promptEntities.has(entity)) { newEntities++; } } // Score: 0 if no new entities, 1 if 5+ new entities return (0, utils_1.clamp)(newEntities / 5, 0, 1); } /** * Calculates overconfident language score. */ function calculateOverconfidentScore(text) { const matches = (0, utils_1.countMatches)(text, OVERCONFIDENT_PATTERNS); if (matches === 0) return 0; if (matches === 1) return 0.5; return 0.8; } /** * Calculates fabricated JSON keys score. * Looks for JSON keys not mentioned in prompt. */ function calculateFabricatedKeysScore(prompt, normalizedJson) { if (!normalizedJson || typeof normalizedJson !== 'object' || Array.isArray(normalizedJson)) { return 0; } const obj = normalizedJson; const keys = Object.keys(obj); const promptLower = prompt.toLowerCase(); let extraKeys = 0; for (const key of keys) { if (!promptLower.includes(key.toLowerCase())) { extraKeys++; } } return (0, utils_1.clamp)(extraKeys / 5, 0, 1); } /** * Calculates contradiction score. * Looks for simple contradictory patterns. */ function calculateContradictionScore(text) { for (const pattern of CONTRADICTION_PATTERNS) { const hasPositive = pattern.positive.test(text); const hasNegative = pattern.negative.test(text); if (hasPositive && hasNegative) { // Found potential contradiction return 0.7; } } return 0; } /** * Calculates hallucination risk signals. * * @param prompt - The original prompt * @param output - The LLM output * @param normalizedJson - Parsed JSON if available * @param customHooks - Optional custom detection hooks * @returns Hallucination signals and scores */ function calculateHallucinationSignals(prompt, output, normalizedJson, customHooks) { const speculativeFactsScore = calculateSpeculativeFactsScore(prompt, output); const overconfidentScore = calculateOverconfidentScore(output); const fabricatedKeysScore = normalizedJson ? calculateFabricatedKeysScore(prompt, normalizedJson) : 0; const contradictionScore = calculateContradictionScore(output); return { speculativeFactsScore, fabricatedKeysScore, overconfidentScore, contradictionScore, customHooksCount: customHooks?.length ?? 0 }; } /** Default weights for hallucination signals (tuned to reduce false positives) */ exports.DEFAULT_HALLUCINATION_WEIGHTS = { speculative: 0.35, // Reduced from 0.4 - new entities are often legitimate fabricated: 0.25, // Reduced from 0.3 - JSON keys often aren't in prompt overconfident: 0.25, // Increased from 0.2 - overconfident language is a stronger signal contradiction: 0.15 // Increased from 0.1 - contradictions are meaningful }; /** * Calculates overall hallucination risk score. * * @param signals - Hallucination signals * @param prompt - Original prompt * @param output - LLM output * @param customHooks - Optional custom hooks * @param weights - Optional weight overrides * @returns Risk score (0-1) */ function calculateHallucinationRisk(signals, prompt, output, customHooks, weights) { // Merge weights with defaults const w = { speculative: weights?.speculative ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.speculative, fabricated: weights?.fabricated ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.fabricated, overconfident: weights?.overconfident ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.overconfident, contradiction: weights?.contradiction ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.contradiction }; // Base risk from internal signals let risk = w.speculative * signals.speculativeFactsScore + w.fabricated * signals.fabricatedKeysScore + w.overconfident * signals.overconfidentScore + w.contradiction * signals.contradictionScore; // Apply custom hooks if (customHooks) { for (const hook of customHooks) { try { const hookScore = (0, utils_1.clamp)(hook(prompt, output), 0, 1); risk += 0.2 * hookScore; } catch { // Ignore hook errors } } } return (0, utils_1.clamp)(risk, 0, 1); } /** * Gets hallucination risk label from score. */ function getHallucinationLabel(risk) { if (risk <= 0.3) return 'low'; if (risk <= 0.6) return 'medium'; return 'high'; } //# sourceMappingURL=data:application/json;base64,{"version":3,"file":"hallucination.js","sourceRoot":"","sources":["../../../src/engines/classification/hallucination.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;GAeG;;;AAoHH,sEAoBC;AAoBD,gEAwCC;AAKD,sDAIC;AA1MD,mCAAqF;AAErF;;GAEG;AACH,MAAM,sBAAsB,GAAG;IAC7B,YAAY;IACZ,WAAW;IACX,YAAY;IACZ,QAAQ;IACR,UAAU;IACV,YAAY;IACZ,YAAY;IACZ,kBAAkB;IAClB,MAAM;IACN,QAAQ;IACR,aAAa;CACd,CAAC;AAEF;;GAEG;AACH,MAAM,sBAAsB,GAAG;IAC7B,EAAE,QAAQ,EAAE,cAAc,EAAE,QAAQ,EAAE,cAAc,EAAE;IACtD,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,mBAAmB,EAAE;IACpD,EAAE,QAAQ,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC3C,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE;IACzC,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE;CAC7C,CAAC;AAEF;;;GAGG;AACH,SAAS,8BAA8B,CAAC,MAAc,EAAE,MAAc;IACpE,MAAM,cAAc,GAAG,IAAA,gCAAwB,EAAC,MAAM,CAAC,CAAC;IACxD,MAAM,cAAc,GAAG,IAAA,gCAAwB,EAAC,MAAM,CAAC,CAAC;IAExD,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE,CAAC;QACpC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YAChC,WAAW,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAED,oDAAoD;IACpD,OAAO,IAAA,aAAK,EAAC,WAAW,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;AACtC,CAAC;AAED;;GAEG;AACH,SAAS,2BAA2B,CAAC,IAAY;IAC/C,MAAM,OAAO,GAAG,IAAA,oBAAY,EAAC,IAAI,EAAE,sBAAsB,CAAC,CAAC;IAE3D,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAC5B,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,GAAG,CAAC;IAC9B,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;GAGG;AACH,SAAS,4BAA4B,CACnC,MAAc,EACd,cAAuB;IAEvB,IAAI,CAAC,cAAc,IAAI,OAAO,cAAc,KAAK,QAAQ,IAAI,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,CAAC;QAC3F,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,GAAG,GAAG,cAAyC,CAAC;IACtD,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9B,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC;IAEzC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YAC7C,SAAS,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,IAAA,aAAK,EAAC,SAAS,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;AACpC,CAAC;AAED;;;GAGG;AACH,SAAS,2BAA2B,CAAC,IAAY;IAC/C,KAAK,MAAM,OAAO,IAAI,sBAAsB,EAAE,CAAC;QAC7C,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEhD,IAAI,WAAW,IAAI,WAAW,EAAE,CAAC;YAC/B,gCAAgC;YAChC,OAAO,GAAG,CAAC;QACb,CAAC;IACH,CAAC;IAED,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,SAAgB,6BAA6B,CAC3C,MAAc,EACd,MAAc,EACd,cAAmC,EACnC,WAA+D;IAE/D,MAAM,qBAAqB,GAAG,8BAA8B,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC7E,MAAM,kBAAkB,GAAG,2BAA2B,CAAC,MAAM,CAAC,CAAC;IAC/D,MAAM,mBAAmB,GAAG,cAAc;QACxC,CAAC,CAAC,4BAA4B,CAAC,MAAM,EAAE,cAAc,CAAC;QACtD,CAAC,CAAC,CAAC,CAAC;IACN,MAAM,kBAAkB,GAAG,2BAA2B,CAAC,MAAM,CAAC,CAAC;IAE/D,OAAO;QACL,qBAAqB;QACrB,mBAAmB;QACnB,kBAAkB;QAClB,kBAAkB;QAClB,gBAAgB,EAAE,WAAW,EAAE,MAAM,IAAI,CAAC;KAC3C,CAAC;AACJ,CAAC;AAED,kFAAkF;AACrE,QAAA,6BAA6B,GAAG;IAC3C,WAAW,EAAE,IAAI,EAAI,uDAAuD;IAC5E,UAAU,EAAE,IAAI,EAAK,sDAAsD;IAC3E,aAAa,EAAE,IAAI,EAAE,mEAAmE;IACxF,aAAa,EAAE,IAAI,CAAE,qDAAqD;CAC3E,CAAC;AAEF;;;;;;;;;GASG;AACH,SAAgB,0BAA0B,CACxC,OAA6B,EAC7B,MAAc,EACd,MAAc,EACd,WAA+D,EAC/D,OAKC;IAED,8BAA8B;IAC9B,MAAM,CAAC,GAAG;QACR,WAAW,EAAE,OAAO,EAAE,WAAW,IAAI,qCAA6B,CAAC,WAAW;QAC9E,UAAU,EAAE,OAAO,EAAE,UAAU,IAAI,qCAA6B,CAAC,UAAU;QAC3E,aAAa,EAAE,OAAO,EAAE,aAAa,IAAI,qCAA6B,CAAC,aAAa;QACpF,aAAa,EAAE,OAAO,EAAE,aAAa,IAAI,qCAA6B,CAAC,aAAa;KACrF,CAAC;IAEF,kCAAkC;IAClC,IAAI,IAAI,GACN,CAAC,CAAC,WAAW,GAAG,OAAO,CAAC,qBAAqB;QAC7C,CAAC,CAAC,UAAU,GAAG,OAAO,CAAC,mBAAmB;QAC1C,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,kBAAkB;QAC5C,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAE/C,qBAAqB;IACrB,IAAI,WAAW,EAAE,CAAC;QAChB,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,IAAA,aAAK,EAAC,IAAI,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;gBACpD,IAAI,IAAI,GAAG,GAAG,SAAS,CAAC;YAC1B,CAAC;YAAC,MAAM,CAAC;gBACP,qBAAqB;YACvB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,IAAA,aAAK,EAAC,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;AAC3B,CAAC;AAED;;GAEG;AACH,SAAgB,qBAAqB,CAAC,IAAY;IAChD,IAAI,IAAI,IAAI,GAAG;QAAE,OAAO,KAAK,CAAC;IAC9B,IAAI,IAAI,IAAI,GAAG;QAAE,OAAO,QAAQ,CAAC;IACjC,OAAO,MAAM,CAAC;AAChB,CAAC","sourcesContent":["/**\n * Hallucination Risk Heuristic Module\n * \n * Detects potential hallucination signals in LLM output.\n * Uses heuristics - not definitive hallucination detection.\n * \n * LIMITATIONS:\n * - Heuristic-based, not ground-truth verification\n * - May produce false positives/negatives\n * - Cannot detect factually incorrect but plausible statements\n * - Requires prompt context for best results\n * \n * @module engines/classification/hallucination\n * @author Haiec\n * @license MIT\n */\n\nimport { HallucinationSignals, HallucinationLabel } from './types';\nimport { clamp, extractCapitalizedTokens, containsAny, countMatches } from './utils';\n\n/**\n * Overconfident language patterns.\n */\nconst OVERCONFIDENT_PATTERNS = [\n  'definitely',\n  'certainly',\n  'guaranteed',\n  'proven',\n  'no doubt',\n  'undeniable',\n  'absolutely',\n  'without question',\n  '100%',\n  'always',\n  'never fails'\n];\n\n/**\n * Contradiction patterns (simplified).\n */\nconst CONTRADICTION_PATTERNS = [\n  { positive: /is required/i, negative: /is optional/i },\n  { positive: /must/i, negative: /does not need to/i },\n  { positive: /always/i, negative: /never/i },\n  { positive: /can/i, negative: /cannot/i },\n  { positive: /will/i, negative: /will not/i }\n];\n\n/**\n * Calculates speculative facts score.\n * Looks for capitalized entities in output not present in prompt.\n */\nfunction calculateSpeculativeFactsScore(prompt: string, output: string): number {\n  const promptEntities = extractCapitalizedTokens(prompt);\n  const outputEntities = extractCapitalizedTokens(output);\n  \n  let newEntities = 0;\n  for (const entity of outputEntities) {\n    if (!promptEntities.has(entity)) {\n      newEntities++;\n    }\n  }\n  \n  // Score: 0 if no new entities, 1 if 5+ new entities\n  return clamp(newEntities / 5, 0, 1);\n}\n\n/**\n * Calculates overconfident language score.\n */\nfunction calculateOverconfidentScore(text: string): number {\n  const matches = countMatches(text, OVERCONFIDENT_PATTERNS);\n  \n  if (matches === 0) return 0;\n  if (matches === 1) return 0.5;\n  return 0.8;\n}\n\n/**\n * Calculates fabricated JSON keys score.\n * Looks for JSON keys not mentioned in prompt.\n */\nfunction calculateFabricatedKeysScore(\n  prompt: string,\n  normalizedJson: unknown\n): number {\n  if (!normalizedJson || typeof normalizedJson !== 'object' || Array.isArray(normalizedJson)) {\n    return 0;\n  }\n  \n  const obj = normalizedJson as Record<string, unknown>;\n  const keys = Object.keys(obj);\n  const promptLower = prompt.toLowerCase();\n  \n  let extraKeys = 0;\n  for (const key of keys) {\n    if (!promptLower.includes(key.toLowerCase())) {\n      extraKeys++;\n    }\n  }\n  \n  return clamp(extraKeys / 5, 0, 1);\n}\n\n/**\n * Calculates contradiction score.\n * Looks for simple contradictory patterns.\n */\nfunction calculateContradictionScore(text: string): number {\n  for (const pattern of CONTRADICTION_PATTERNS) {\n    const hasPositive = pattern.positive.test(text);\n    const hasNegative = pattern.negative.test(text);\n    \n    if (hasPositive && hasNegative) {\n      // Found potential contradiction\n      return 0.7;\n    }\n  }\n  \n  return 0;\n}\n\n/**\n * Calculates hallucination risk signals.\n * \n * @param prompt - The original prompt\n * @param output - The LLM output\n * @param normalizedJson - Parsed JSON if available\n * @param customHooks - Optional custom detection hooks\n * @returns Hallucination signals and scores\n */\nexport function calculateHallucinationSignals(\n  prompt: string,\n  output: string,\n  normalizedJson: unknown | undefined,\n  customHooks?: Array<(prompt: string, output: string) => number>\n): HallucinationSignals {\n  const speculativeFactsScore = calculateSpeculativeFactsScore(prompt, output);\n  const overconfidentScore = calculateOverconfidentScore(output);\n  const fabricatedKeysScore = normalizedJson \n    ? calculateFabricatedKeysScore(prompt, normalizedJson) \n    : 0;\n  const contradictionScore = calculateContradictionScore(output);\n  \n  return {\n    speculativeFactsScore,\n    fabricatedKeysScore,\n    overconfidentScore,\n    contradictionScore,\n    customHooksCount: customHooks?.length ?? 0\n  };\n}\n\n/** Default weights for hallucination signals (tuned to reduce false positives) */\nexport const DEFAULT_HALLUCINATION_WEIGHTS = {\n  speculative: 0.35,   // Reduced from 0.4 - new entities are often legitimate\n  fabricated: 0.25,    // Reduced from 0.3 - JSON keys often aren't in prompt\n  overconfident: 0.25, // Increased from 0.2 - overconfident language is a stronger signal\n  contradiction: 0.15  // Increased from 0.1 - contradictions are meaningful\n};\n\n/**\n * Calculates overall hallucination risk score.\n * \n * @param signals - Hallucination signals\n * @param prompt - Original prompt\n * @param output - LLM output\n * @param customHooks - Optional custom hooks\n * @param weights - Optional weight overrides\n * @returns Risk score (0-1)\n */\nexport function calculateHallucinationRisk(\n  signals: HallucinationSignals,\n  prompt: string,\n  output: string,\n  customHooks?: Array<(prompt: string, output: string) => number>,\n  weights?: {\n    speculative?: number;\n    fabricated?: number;\n    overconfident?: number;\n    contradiction?: number;\n  }\n): number {\n  // Merge weights with defaults\n  const w = {\n    speculative: weights?.speculative ?? DEFAULT_HALLUCINATION_WEIGHTS.speculative,\n    fabricated: weights?.fabricated ?? DEFAULT_HALLUCINATION_WEIGHTS.fabricated,\n    overconfident: weights?.overconfident ?? DEFAULT_HALLUCINATION_WEIGHTS.overconfident,\n    contradiction: weights?.contradiction ?? DEFAULT_HALLUCINATION_WEIGHTS.contradiction\n  };\n  \n  // Base risk from internal signals\n  let risk = \n    w.speculative * signals.speculativeFactsScore +\n    w.fabricated * signals.fabricatedKeysScore +\n    w.overconfident * signals.overconfidentScore +\n    w.contradiction * signals.contradictionScore;\n  \n  // Apply custom hooks\n  if (customHooks) {\n    for (const hook of customHooks) {\n      try {\n        const hookScore = clamp(hook(prompt, output), 0, 1);\n        risk += 0.2 * hookScore;\n      } catch {\n        // Ignore hook errors\n      }\n    }\n  }\n  \n  return clamp(risk, 0, 1);\n}\n\n/**\n * Gets hallucination risk label from score.\n */\nexport function getHallucinationLabel(risk: number): HallucinationLabel {\n  if (risk <= 0.3) return 'low';\n  if (risk <= 0.6) return 'medium';\n  return 'high';\n}\n"]}