UNPKG

llmverify

Version:

AI Output Verification Toolkit — Local-first LLM safety, hallucination detection, PII redaction, prompt injection defense, and runtime monitoring. Zero telemetry. OWASP LLM Top 10 aligned.

180 lines 18.4 kB
"use strict";
/**
 * Harmful Content Detection
 *
 * Detects potentially harmful content patterns.
 * Pattern-based detection with honest limitations.
 *
 * NOTE: this is compiled output. The inline source map that used to trail
 * this file was removed because it no longer matches the code below;
 * regenerate it from src/csm6/security/harmful-content.ts when rebuilding.
 *
 * @module csm6/security/harmful-content
 * @author Haiec
 * @license MIT
 */
Object.defineProperty(exports, "__esModule", { value: true });
exports.checkHarmfulContent = checkHarmfulContent;
const text_1 = require("../../utils/text");

/** Caveats attached verbatim to every finding (the same array instance is shared). */
const LIMITATIONS = [
    'Keyword-based detection only',
    'High false positive rate on educational content',
    'Cannot detect context or intent',
    'May miss euphemisms or coded language',
    'English language only'
];

/** Human-readable description of the detection method, attached to every finding. */
const METHODOLOGY = 'Keyword pattern matching for harmful content categories. ' +
    'Detects violence, threats, self-harm, and dangerous instructions. ' +
    'Accuracy: ~60% recall, requires human validation.';

/**
 * Detection table. Each entry describes one category:
 *   - patterns:   global, case-insensitive regexes tried in order
 *   - severity:   copied onto the finding
 *   - confidence: base value fed to calculateConfidence()
 *   - message:    copied onto the finding
 */
const HARM_PATTERNS = [
    {
        category: 'VIOLENCE',
        patterns: [
            /\b(kill|murder|assassinate|execute)\s+(him|her|them|you|everyone)\b/gi,
            /\bhow\s+to\s+(kill|murder|harm|hurt)\b/gi,
            /\b(bomb|explosive|weapon)\s+(making|instructions|how\s+to)\b/gi
        ],
        severity: 'critical',
        confidence: 0.7,
        message: 'Violent content detected'
    },
    {
        category: 'THREATS',
        patterns: [
            /\bi\s+will\s+(kill|hurt|harm|destroy)\s+you\b/gi,
            /\byou\s+will\s+(die|suffer|regret)\b/gi,
            /\b(threat|threaten|threatening)\s+to\b/gi
        ],
        severity: 'high',
        confidence: 0.65,
        message: 'Threatening language detected'
    },
    {
        category: 'SELF_HARM',
        patterns: [
            /\bhow\s+to\s+(commit\s+suicide|kill\s+myself|end\s+my\s+life)\b/gi,
            /\b(suicide|self-harm)\s+(methods?|ways?|instructions?)\b/gi
        ],
        severity: 'critical',
        confidence: 0.75,
        message: 'Self-harm content detected'
    },
    {
        category: 'DANGEROUS_INSTRUCTIONS',
        patterns: [
            /\bhow\s+to\s+(hack|break\s+into|steal)\b/gi,
            /\b(synthesize|make|create)\s+(drugs?|meth|cocaine|heroin)\b/gi,
            /\b(poison|poisoning)\s+(someone|food|water)\b/gi
        ],
        severity: 'high',
        confidence: 0.6,
        message: 'Dangerous instructions detected'
    },
    {
        category: 'HATE_SPEECH',
        patterns: [
            /\b(hate|kill|exterminate)\s+all\s+(jews|muslims|christians|blacks|whites|gays)\b/gi,
            /\b(inferior|subhuman)\s+(race|people|group)\b/gi
        ],
        severity: 'critical',
        confidence: 0.8,
        message: 'Hate speech detected'
    }
];

/*
 * Context markers that cause a match to be treated as a likely false
 * positive. They are defined once at module level instead of being rebuilt
 * for every match. None of them carries the `g` flag, so `.test()` is
 * stateless.
 *
 * NOTE(review): these are plain substring matches without word boundaries,
 * e.g. /story/i also matches inside "history". Any marker within the
 * context window suppresses the finding.
 */

/** Educational / warning context. */
const EDUCATIONAL_MARKERS = [
    /warning/i,
    /example\s+of\s+what\s+not/i,
    /never\s+say/i,
    /avoid/i,
    /don'?t\s+do/i,
    /harmful\s+content\s+includes/i,
    /detecting/i,
    /prevention/i,
    /awareness/i,
    /training/i,
    /policy/i
];

/** Fiction / story context. */
const FICTION_MARKERS = [
    /story/i,
    /novel/i,
    /fiction/i,
    /character/i,
    /plot/i,
    /movie/i,
    /game/i
];

/** News / reporting context. */
const NEWS_MARKERS = [
    /reported/i,
    /according\s+to/i,
    /news/i,
    /article/i,
    /investigation/i
];

/** Number of characters inspected on each side of a match for context markers. */
const CONTEXT_RADIUS = 100;

/**
 * Check for harmful content.
 *
 * Walks HARM_PATTERNS in table order and reports at most one finding per
 * category: the first match (pattern order, then position in `content`)
 * that isLikelyFalsePositive() does not dismiss.
 *
 * @param content Text to scan.
 * @returns Array of findings, empty when nothing was detected. Each finding
 *          carries id `HARMFUL_<CATEGORY>`, category 'safety', surface
 *          'output', the table entry's severity and message, evidence built
 *          from the match, a confidence object, and the shared
 *          LIMITATIONS / METHODOLOGY values.
 */
function checkHarmfulContent(content) {
    const findings = [];
    const foundCategories = new Set();
    for (const harmPattern of HARM_PATTERNS) {
        // One finding per category, even if the table ever lists a category twice.
        if (foundCategories.has(harmPattern.category)) {
            continue;
        }
        const match = findFirstReportableMatch(content, harmPattern.patterns);
        if (match === null) {
            continue;
        }
        foundCategories.add(harmPattern.category);
        findings.push({
            id: `HARMFUL_${harmPattern.category}`,
            category: 'safety',
            severity: harmPattern.severity,
            surface: 'output',
            message: harmPattern.message,
            recommendation: 'Block or flag content for human review. Do not display to users.',
            evidence: {
                // truncate/extractContext come from utils/text (not visible here);
                // presumably they cap the sample length and cut a window around
                // the match position - confirm against that module.
                textSample: (0, text_1.truncate)(match[0], 50),
                pattern: harmPattern.category,
                context: (0, text_1.extractContext)(content, match.index || 0, 30)
            },
            confidence: calculateConfidence(harmPattern.confidence),
            limitations: LIMITATIONS,
            methodology: METHODOLOGY
        });
    }
    return findings;
}

/**
 * Return the first regex match, in pattern order then text order, that is
 * not dismissed as a likely false positive, or null when there is none.
 *
 * @param content  Text to scan.
 * @param patterns Global regexes to try in order.
 */
function findFirstReportableMatch(content, patterns) {
    for (const pattern of patterns) {
        // The regexes are shared module-level objects with the `g` flag;
        // matchAll starts from the regex's current lastIndex, so reset it.
        pattern.lastIndex = 0;
        for (const match of content.matchAll(pattern)) {
            // Pass the real match offset so the context window is taken
            // around THIS occurrence rather than the first textual one.
            if (!isLikelyFalsePositive(content, match[0], match.index || 0)) {
                return match;
            }
        }
    }
    return null;
}

/**
 * Calculate confidence score.
 *
 * @param baseConfidence Base confidence taken from the pattern table.
 * @returns Object with the base value, an asymmetric interval
 *          [base - 0.2, base + 0.1] clamped to [0, 1], method 'heuristic',
 *          and the contributing factors.
 */
function calculateConfidence(baseConfidence) {
    return {
        value: baseConfidence,
        interval: [Math.max(0, baseConfidence - 0.2), Math.min(1, baseConfidence + 0.1)],
        method: 'heuristic',
        factors: {
            patternStrength: baseConfidence,
            contextClarity: 0.5 // Low because keyword-based
        }
    };
}

/**
 * Check for false positives.
 *
 * Looks for educational, fiction, or news markers within CONTEXT_RADIUS
 * characters on either side of the match.
 *
 * @param fullText   The complete text that was scanned.
 * @param match      The matched substring.
 * @param matchIndex Offset of the match within `fullText`. Optional for
 *                   backward compatibility: when omitted, the first textual
 *                   occurrence of `match` is used, which can be the wrong
 *                   place if the phrase is repeated or also appears inside
 *                   a longer word.
 * @returns true when any marker appears in the surrounding context.
 */
function isLikelyFalsePositive(fullText, match, matchIndex) {
    const start = typeof matchIndex === 'number' ? matchIndex : fullText.indexOf(match);
    const context = fullText
        .substring(Math.max(0, start - CONTEXT_RADIUS), start + match.length + CONTEXT_RADIUS)
        .toLowerCase();
    const hasMarker = (markers) => markers.some((marker) => marker.test(context));
    return hasMarker(EDUCATIONAL_MARKERS) ||
        hasMarker(FICTION_MARKERS) ||
        hasMarker(NEWS_MARKERS);
}