llmverify
Version:
AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.
189 lines • 20.4 kB
JavaScript
;
/**
* Hallucination Risk Heuristic Module
*
* Detects potential hallucination signals in LLM output.
* Uses heuristics - not definitive hallucination detection.
*
* LIMITATIONS:
* - Heuristic-based, not ground-truth verification
* - May produce false positives/negatives
* - Cannot detect factually incorrect but plausible statements
* - Requires prompt context for best results
*
* @module engines/classification/hallucination
* @author Haiec
* @license MIT
*/
Object.defineProperty(exports, "__esModule", { value: true });
exports.DEFAULT_HALLUCINATION_WEIGHTS = void 0;
exports.calculateHallucinationSignals = calculateHallucinationSignals;
exports.calculateHallucinationRisk = calculateHallucinationRisk;
exports.getHallucinationLabel = getHallucinationLabel;
const utils_1 = require("./utils");
/**
 * Phrases treated as markers of overconfident language.
 *
 * The list is handed to `countMatches` by `calculateOverconfidentScore`;
 * how each phrase is matched (case handling, word boundaries) is decided
 * by that helper, not here.
 */
const OVERCONFIDENT_PATTERNS = [
    'definitely', 'certainly', 'guaranteed', 'proven',
    'no doubt', 'undeniable', 'absolutely', 'without question',
    '100%', 'always', 'never fails'
];
/**
 * Contradiction patterns (simplified).
 *
 * Each entry pairs a "positive" phrasing with its "negative" counterpart;
 * a text that matches BOTH sides of one entry is treated as potentially
 * self-contradictory by `calculateContradictionScore`.
 *
 * The positive side must not be satisfied by the negated form itself,
 * otherwise a lone "cannot" / "will not" would match both sides and be
 * reported as a contradiction. Hence:
 * - `\b` word boundaries keep "can" from matching inside "cannot", "scan",
 *   "candidate", and "must" from matching "mustard", etc.
 * - negative lookaheads keep "can't" / "can not" / "will not" from counting
 *   as the positive form.
 *
 * None of the expressions use the `g` or `y` flag, so `.test()` is stateless.
 */
const CONTRADICTION_PATTERNS = [
    { positive: /is required/i, negative: /is optional/i },
    { positive: /\bmust\b/i, negative: /does not need to/i },
    { positive: /\balways\b/i, negative: /\bnever\b/i },
    { positive: /\bcan\b(?!['\u2019]t|\s*not\b)/i, negative: /\bcannot\b/i },
    { positive: /\bwill\b(?!\s+not\b)/i, negative: /\bwill\s+not\b/i }
];
/**
 * Scores how many capitalized tokens appear in the output without having
 * appeared in the prompt.
 *
 * Both texts are passed through `extractCapitalizedTokens` (assumed to
 * return a Set-like collection: it is iterated and queried with `.has`).
 *
 * @param prompt - The original prompt
 * @param output - The LLM output
 * @returns Number of output-only tokens divided by 5, clamped to 0..1
 */
function calculateSpeculativeFactsScore(prompt, output) {
    const { clamp, extractCapitalizedTokens } = utils_1;
    const knownTokens = extractCapitalizedTokens(prompt);
    const candidateTokens = extractCapitalizedTokens(output);
    const unseenCount = [...candidateTokens].filter((token) => !knownTokens.has(token)).length;
    // 0 when nothing new appears, saturating at 1 once 5 or more new tokens appear.
    return clamp(unseenCount / 5, 0, 1);
}
/**
 * Scores overconfident language in a text.
 *
 * The count comes from `countMatches(text, OVERCONFIDENT_PATTERNS)`; the
 * matching rules are those of that helper.
 *
 * @param text - Text to inspect
 * @returns 0 for a count of exactly 0, 0.5 for exactly 1, 0.8 for any other count
 */
function calculateOverconfidentScore(text) {
    const hitCount = (0, utils_1.countMatches)(text, OVERCONFIDENT_PATTERNS);
    switch (hitCount) {
        case 0:
            return 0;
        case 1:
            return 0.5;
        default:
            return 0.8;
    }
}
/**
 * Scores top-level JSON keys that are not mentioned in the prompt.
 *
 * A key counts as "mentioned" when its lower-cased form occurs anywhere
 * (as a substring) in the lower-cased prompt. Only the object's own
 * top-level keys are examined.
 *
 * @param prompt - The original prompt
 * @param normalizedJson - Parsed JSON value; anything that is falsy, not an
 *   object, or an array yields 0
 * @returns Number of unmentioned keys divided by 5, clamped to 0..1
 */
function calculateFabricatedKeysScore(prompt, normalizedJson) {
    const isKeyedObject = Boolean(normalizedJson) &&
        typeof normalizedJson === 'object' &&
        !Array.isArray(normalizedJson);
    if (!isKeyedObject) {
        return 0;
    }
    const haystack = prompt.toLowerCase();
    const unmentioned = Object.keys(normalizedJson)
        .filter((key) => !haystack.includes(key.toLowerCase()));
    return (0, utils_1.clamp)(unmentioned.length / 5, 0, 1);
}
/**
 * Scores potential self-contradiction in a text.
 *
 * The text is checked against every pair in CONTRADICTION_PATTERNS; it is
 * considered contradictory as soon as one pair has both its positive and
 * its negative expression matching.
 *
 * @param text - Text to inspect
 * @returns 0.7 when at least one pair matches on both sides, otherwise 0
 */
function calculateContradictionScore(text) {
    const contradictory = CONTRADICTION_PATTERNS.some(({ positive, negative }) => {
        const positiveHit = positive.test(text);
        const negativeHit = negative.test(text);
        return positiveHit && negativeHit;
    });
    return contradictory ? 0.7 : 0;
}
/**
 * Collects the individual hallucination signals for one prompt/output pair.
 *
 * The custom hooks are NOT executed here; only their count is recorded.
 *
 * @param prompt - The original prompt
 * @param output - The LLM output
 * @param normalizedJson - Parsed JSON if available; when falsy the
 *   fabricated-keys score is 0
 * @param customHooks - Optional custom detection hooks
 * @returns Object with speculativeFactsScore, fabricatedKeysScore,
 *   overconfidentScore, contradictionScore and customHooksCount
 */
function calculateHallucinationSignals(prompt, output, normalizedJson, customHooks) {
    const speculative = calculateSpeculativeFactsScore(prompt, output);
    const overconfident = calculateOverconfidentScore(output);
    let fabricated = 0;
    if (normalizedJson) {
        fabricated = calculateFabricatedKeysScore(prompt, normalizedJson);
    }
    const contradiction = calculateContradictionScore(output);
    const hookCount = customHooks?.length ?? 0;
    return {
        speculativeFactsScore: speculative,
        fabricatedKeysScore: fabricated,
        overconfidentScore: overconfident,
        contradictionScore: contradiction,
        customHooksCount: hookCount
    };
}
/**
 * Default weights for hallucination signals (tuned to reduce false positives).
 *
 * Each weight multiplies the matching signal score in
 * `calculateHallucinationRisk`; any weight can be overridden per call via
 * that function's `weights` argument. The four defaults sum to 1.0.
 */
exports.DEFAULT_HALLUCINATION_WEIGHTS = {
speculative: 0.35, // Reduced from 0.4 - new entities are often legitimate
fabricated: 0.25, // Reduced from 0.3 - JSON keys often aren't in prompt
overconfident: 0.25, // Increased from 0.2 - overconfident language is a stronger signal
contradiction: 0.15 // Increased from 0.1 - contradictions are meaningful
};
/**
 * Calculates overall hallucination risk score.
 *
 * The base risk is the weighted sum of the four signal scores. Each custom
 * hook then adds up to 0.2 (0.2 x its score clamped to 0..1), and the total
 * is clamped to 0..1.
 *
 * Hooks are best-effort: a hook that throws, or whose clamped score is not
 * a finite number (e.g. it returned `undefined` or `NaN`), contributes
 * nothing instead of turning the whole result into NaN.
 *
 * @param signals - Hallucination signals
 * @param prompt - Original prompt (passed through to the hooks)
 * @param output - LLM output (passed through to the hooks)
 * @param customHooks - Optional custom hooks, each called as hook(prompt, output)
 * @param weights - Optional weight overrides; missing (null/undefined) entries
 *   fall back to DEFAULT_HALLUCINATION_WEIGHTS, an explicit 0 is respected
 * @returns Risk score (0-1)
 */
function calculateHallucinationRisk(signals, prompt, output, customHooks, weights) {
    // Merge weights with defaults (`??` so that an explicit 0 override is kept)
    const w = {
        speculative: weights?.speculative ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.speculative,
        fabricated: weights?.fabricated ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.fabricated,
        overconfident: weights?.overconfident ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.overconfident,
        contradiction: weights?.contradiction ?? exports.DEFAULT_HALLUCINATION_WEIGHTS.contradiction
    };
    // Base risk from internal signals
    let risk = w.speculative * signals.speculativeFactsScore +
        w.fabricated * signals.fabricatedKeysScore +
        w.overconfident * signals.overconfidentScore +
        w.contradiction * signals.contradictionScore;
    // Apply custom hooks
    if (customHooks) {
        for (const hook of customHooks) {
            try {
                const hookScore = (0, utils_1.clamp)(hook(prompt, output), 0, 1);
                // NOTE(review): clamp presumably propagates NaN for undefined/NaN
                // input; a single such value would poison `risk` for good, so
                // non-finite scores are skipped like a throwing hook would be.
                if (Number.isFinite(hookScore)) {
                    risk += 0.2 * hookScore;
                }
            }
            catch {
                // Ignore hook errors: hooks must never break the risk calculation
            }
        }
    }
    return (0, utils_1.clamp)(risk, 0, 1);
}
/**
 * Maps a risk score to a coarse label.
 *
 * Both thresholds are inclusive: up to and including 0.3 is 'low', above
 * that up to and including 0.6 is 'medium', everything else is 'high'
 * (this includes NaN, for which both comparisons are false).
 *
 * @param risk - Risk score
 * @returns 'low', 'medium' or 'high'
 */
function getHallucinationLabel(risk) {
    return risk <= 0.3
        ? 'low'
        : risk <= 0.6
            ? 'medium'
            : 'high';
}
//# sourceMappingURL=data:application/json;base64,{"version":3,"file":"hallucination.js","sourceRoot":"","sources":["../../../src/engines/classification/hallucination.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;;;;;;GAeG;;;AAoHH,sEAoBC;AAoBD,gEAwCC;AAKD,sDAIC;AA1MD,mCAAqF;AAErF;;GAEG;AACH,MAAM,sBAAsB,GAAG;IAC7B,YAAY;IACZ,WAAW;IACX,YAAY;IACZ,QAAQ;IACR,UAAU;IACV,YAAY;IACZ,YAAY;IACZ,kBAAkB;IAClB,MAAM;IACN,QAAQ;IACR,aAAa;CACd,CAAC;AAEF;;GAEG;AACH,MAAM,sBAAsB,GAAG;IAC7B,EAAE,QAAQ,EAAE,cAAc,EAAE,QAAQ,EAAE,cAAc,EAAE;IACtD,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,mBAAmB,EAAE;IACpD,EAAE,QAAQ,EAAE,SAAS,EAAE,QAAQ,EAAE,QAAQ,EAAE;IAC3C,EAAE,QAAQ,EAAE,MAAM,EAAE,QAAQ,EAAE,SAAS,EAAE;IACzC,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,WAAW,EAAE;CAC7C,CAAC;AAEF;;;GAGG;AACH,SAAS,8BAA8B,CAAC,MAAc,EAAE,MAAc;IACpE,MAAM,cAAc,GAAG,IAAA,gCAAwB,EAAC,MAAM,CAAC,CAAC;IACxD,MAAM,cAAc,GAAG,IAAA,gCAAwB,EAAC,MAAM,CAAC,CAAC;IAExD,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,KAAK,MAAM,MAAM,IAAI,cAAc,EAAE,CAAC;QACpC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC;YAChC,WAAW,EAAE,CAAC;QAChB,CAAC;IACH,CAAC;IAED,oDAAoD;IACpD,OAAO,IAAA,aAAK,EAAC,WAAW,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;AACtC,CAAC;AAED;;GAEG;AACH,SAAS,2BAA2B,CAAC,IAAY;IAC/C,MAAM,OAAO,GAAG,IAAA,oBAAY,EAAC,IAAI,EAAE,sBAAsB,CAAC,CAAC;IAE3D,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IAC5B,IAAI,OAAO,KAAK,CAAC;QAAE,OAAO,GAAG,CAAC;IAC9B,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;GAGG;AACH,SAAS,4BAA4B,CACnC,MAAc,EACd,cAAuB;IAEvB,IAAI,CAAC,cAAc,IAAI,OAAO,cAAc,KAAK,QAAQ,IAAI,KAAK,CAAC,OAAO,CAAC,cAAc,CAAC,EAAE,CAAC;QAC3F,OAAO,CAAC,CAAC;IACX,CAAC;IAED,MAAM,GAAG,GAAG,cAAyC,CAAC;IACtD,MAAM,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;IAC9B,MAAM,WAAW,GAAG,MAAM,CAAC,WAAW,EAAE,CAAC;IAEzC,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,IAAI,CAAC,WAAW,CAAC,QAAQ,CAAC,GAAG,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YAC7C,SAAS,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IAED,OAAO,IAAA,aAAK,EAAC,SAAS,GAAG,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;AACpC,CAAC;AAED;;;GAGG;AACH,SAAS,2BAA2B,CAAC,IAAY;IAC/C,KAAK,MAAM
,OAAO,IAAI,sBAAsB,EAAE,CAAC;QAC7C,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChD,MAAM,WAAW,GAAG,OAAO,CAAC,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAEhD,IAAI,WAAW,IAAI,WAAW,EAAE,CAAC;YAC/B,gCAAgC;YAChC,OAAO,GAAG,CAAC;QACb,CAAC;IACH,CAAC;IAED,OAAO,CAAC,CAAC;AACX,CAAC;AAED;;;;;;;;GAQG;AACH,SAAgB,6BAA6B,CAC3C,MAAc,EACd,MAAc,EACd,cAAmC,EACnC,WAA+D;IAE/D,MAAM,qBAAqB,GAAG,8BAA8B,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAC7E,MAAM,kBAAkB,GAAG,2BAA2B,CAAC,MAAM,CAAC,CAAC;IAC/D,MAAM,mBAAmB,GAAG,cAAc;QACxC,CAAC,CAAC,4BAA4B,CAAC,MAAM,EAAE,cAAc,CAAC;QACtD,CAAC,CAAC,CAAC,CAAC;IACN,MAAM,kBAAkB,GAAG,2BAA2B,CAAC,MAAM,CAAC,CAAC;IAE/D,OAAO;QACL,qBAAqB;QACrB,mBAAmB;QACnB,kBAAkB;QAClB,kBAAkB;QAClB,gBAAgB,EAAE,WAAW,EAAE,MAAM,IAAI,CAAC;KAC3C,CAAC;AACJ,CAAC;AAED,kFAAkF;AACrE,QAAA,6BAA6B,GAAG;IAC3C,WAAW,EAAE,IAAI,EAAI,uDAAuD;IAC5E,UAAU,EAAE,IAAI,EAAK,sDAAsD;IAC3E,aAAa,EAAE,IAAI,EAAE,mEAAmE;IACxF,aAAa,EAAE,IAAI,CAAE,qDAAqD;CAC3E,CAAC;AAEF;;;;;;;;;GASG;AACH,SAAgB,0BAA0B,CACxC,OAA6B,EAC7B,MAAc,EACd,MAAc,EACd,WAA+D,EAC/D,OAKC;IAED,8BAA8B;IAC9B,MAAM,CAAC,GAAG;QACR,WAAW,EAAE,OAAO,EAAE,WAAW,IAAI,qCAA6B,CAAC,WAAW;QAC9E,UAAU,EAAE,OAAO,EAAE,UAAU,IAAI,qCAA6B,CAAC,UAAU;QAC3E,aAAa,EAAE,OAAO,EAAE,aAAa,IAAI,qCAA6B,CAAC,aAAa;QACpF,aAAa,EAAE,OAAO,EAAE,aAAa,IAAI,qCAA6B,CAAC,aAAa;KACrF,CAAC;IAEF,kCAAkC;IAClC,IAAI,IAAI,GACN,CAAC,CAAC,WAAW,GAAG,OAAO,CAAC,qBAAqB;QAC7C,CAAC,CAAC,UAAU,GAAG,OAAO,CAAC,mBAAmB;QAC1C,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,kBAAkB;QAC5C,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,kBAAkB,CAAC;IAE/C,qBAAqB;IACrB,IAAI,WAAW,EAAE,CAAC;QAChB,KAAK,MAAM,IAAI,IAAI,WAAW,EAAE,CAAC;YAC/B,IAAI,CAAC;gBACH,MAAM,SAAS,GAAG,IAAA,aAAK,EAAC,IAAI,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;gBACpD,IAAI,IAAI,GAAG,GAAG,SAAS,CAAC;YAC1B,CAAC;YAAC,MAAM,CAAC;gBACP,qBAAqB;YACvB,CAAC;QACH,CAAC;IACH,CAAC;IAED,OAAO,IAAA,aAAK,EAAC,IAAI,EAAE,CAAC,EAAE,CAAC,CAAC,CAAC;AAC3B,CAAC;AAED;;GAEG;AACH,SAAgB,qBAAqB,CAAC,IAAY;IAChD,IAAI,IAAI,IAAI,GAAG;QAAE,OAAO,KAAK,CAAC;IAC9B,IAAI,IAAI,IAAI,GAAG;QAAE,OAAO,QAAQ,CAAC;IACjC,OAAO,MAAM,CAAC;AACh
B,CAAC","sourcesContent":["/**\n * Hallucination Risk Heuristic Module\n * \n * Detects potential hallucination signals in LLM output.\n * Uses heuristics - not definitive hallucination detection.\n * \n * LIMITATIONS:\n * - Heuristic-based, not ground-truth verification\n * - May produce false positives/negatives\n * - Cannot detect factually incorrect but plausible statements\n * - Requires prompt context for best results\n * \n * @module engines/classification/hallucination\n * @author Haiec\n * @license MIT\n */\n\nimport { HallucinationSignals, HallucinationLabel } from './types';\nimport { clamp, extractCapitalizedTokens, containsAny, countMatches } from './utils';\n\n/**\n * Overconfident language patterns.\n */\nconst OVERCONFIDENT_PATTERNS = [\n  'definitely',\n  'certainly',\n  'guaranteed',\n  'proven',\n  'no doubt',\n  'undeniable',\n  'absolutely',\n  'without question',\n  '100%',\n  'always',\n  'never fails'\n];\n\n/**\n * Contradiction patterns (simplified).\n */\nconst CONTRADICTION_PATTERNS = [\n  { positive: /is required/i, negative: /is optional/i },\n  { positive: /must/i, negative: /does not need to/i },\n  { positive: /always/i, negative: /never/i },\n  { positive: /can/i, negative: /cannot/i },\n  { positive: /will/i, negative: /will not/i }\n];\n\n/**\n * Calculates speculative facts score.\n * Looks for capitalized entities in output not present in prompt.\n */\nfunction calculateSpeculativeFactsScore(prompt: string, output: string): number {\n  const promptEntities = extractCapitalizedTokens(prompt);\n  const outputEntities = extractCapitalizedTokens(output);\n  \n  let newEntities = 0;\n  for (const entity of outputEntities) {\n    if (!promptEntities.has(entity)) {\n      newEntities++;\n    }\n  }\n  \n  // Score: 0 if no new entities, 1 if 5+ new entities\n  return clamp(newEntities / 5, 0, 1);\n}\n\n/**\n * Calculates overconfident language score.\n */\nfunction calculateOverconfidentScore(text: string): number {\n  const matches = 
countMatches(text, OVERCONFIDENT_PATTERNS);\n  \n  if (matches === 0) return 0;\n  if (matches === 1) return 0.5;\n  return 0.8;\n}\n\n/**\n * Calculates fabricated JSON keys score.\n * Looks for JSON keys not mentioned in prompt.\n */\nfunction calculateFabricatedKeysScore(\n  prompt: string,\n  normalizedJson: unknown\n): number {\n  if (!normalizedJson || typeof normalizedJson !== 'object' || Array.isArray(normalizedJson)) {\n    return 0;\n  }\n  \n  const obj = normalizedJson as Record<string, unknown>;\n  const keys = Object.keys(obj);\n  const promptLower = prompt.toLowerCase();\n  \n  let extraKeys = 0;\n  for (const key of keys) {\n    if (!promptLower.includes(key.toLowerCase())) {\n      extraKeys++;\n    }\n  }\n  \n  return clamp(extraKeys / 5, 0, 1);\n}\n\n/**\n * Calculates contradiction score.\n * Looks for simple contradictory patterns.\n */\nfunction calculateContradictionScore(text: string): number {\n  for (const pattern of CONTRADICTION_PATTERNS) {\n    const hasPositive = pattern.positive.test(text);\n    const hasNegative = pattern.negative.test(text);\n    \n    if (hasPositive && hasNegative) {\n      // Found potential contradiction\n      return 0.7;\n    }\n  }\n  \n  return 0;\n}\n\n/**\n * Calculates hallucination risk signals.\n * \n * @param prompt - The original prompt\n * @param output - The LLM output\n * @param normalizedJson - Parsed JSON if available\n * @param customHooks - Optional custom detection hooks\n * @returns Hallucination signals and scores\n */\nexport function calculateHallucinationSignals(\n  prompt: string,\n  output: string,\n  normalizedJson: unknown | undefined,\n  customHooks?: Array<(prompt: string, output: string) => number>\n): HallucinationSignals {\n  const speculativeFactsScore = calculateSpeculativeFactsScore(prompt, output);\n  const overconfidentScore = calculateOverconfidentScore(output);\n  const fabricatedKeysScore = normalizedJson \n    ? 
calculateFabricatedKeysScore(prompt, normalizedJson) \n    : 0;\n  const contradictionScore = calculateContradictionScore(output);\n  \n  return {\n    speculativeFactsScore,\n    fabricatedKeysScore,\n    overconfidentScore,\n    contradictionScore,\n    customHooksCount: customHooks?.length ?? 0\n  };\n}\n\n/** Default weights for hallucination signals (tuned to reduce false positives) */\nexport const DEFAULT_HALLUCINATION_WEIGHTS = {\n  speculative: 0.35,   // Reduced from 0.4 - new entities are often legitimate\n  fabricated: 0.25,    // Reduced from 0.3 - JSON keys often aren't in prompt\n  overconfident: 0.25, // Increased from 0.2 - overconfident language is a stronger signal\n  contradiction: 0.15  // Increased from 0.1 - contradictions are meaningful\n};\n\n/**\n * Calculates overall hallucination risk score.\n * \n * @param signals - Hallucination signals\n * @param prompt - Original prompt\n * @param output - LLM output\n * @param customHooks - Optional custom hooks\n * @param weights - Optional weight overrides\n * @returns Risk score (0-1)\n */\nexport function calculateHallucinationRisk(\n  signals: HallucinationSignals,\n  prompt: string,\n  output: string,\n  customHooks?: Array<(prompt: string, output: string) => number>,\n  weights?: {\n    speculative?: number;\n    fabricated?: number;\n    overconfident?: number;\n    contradiction?: number;\n  }\n): number {\n  // Merge weights with defaults\n  const w = {\n    speculative: weights?.speculative ?? DEFAULT_HALLUCINATION_WEIGHTS.speculative,\n    fabricated: weights?.fabricated ?? DEFAULT_HALLUCINATION_WEIGHTS.fabricated,\n    overconfident: weights?.overconfident ?? DEFAULT_HALLUCINATION_WEIGHTS.overconfident,\n    contradiction: weights?.contradiction ?? 
DEFAULT_HALLUCINATION_WEIGHTS.contradiction\n  };\n  \n  // Base risk from internal signals\n  let risk = \n    w.speculative * signals.speculativeFactsScore +\n    w.fabricated * signals.fabricatedKeysScore +\n    w.overconfident * signals.overconfidentScore +\n    w.contradiction * signals.contradictionScore;\n  \n  // Apply custom hooks\n  if (customHooks) {\n    for (const hook of customHooks) {\n      try {\n        const hookScore = clamp(hook(prompt, output), 0, 1);\n        risk += 0.2 * hookScore;\n      } catch {\n        // Ignore hook errors\n      }\n    }\n  }\n  \n  return clamp(risk, 0, 1);\n}\n\n/**\n * Gets hallucination risk label from score.\n */\nexport function getHallucinationLabel(risk: number): HallucinationLabel {\n  if (risk <= 0.3) return 'low';\n  if (risk <= 0.6) return 'medium';\n  return 'high';\n}\n"]}