# @dawans/promptshield
# Version:
# Secure your LLM stack with enterprise-grade RulePacks for AI safety scanning
# 150 lines (142 loc) • 4.18 kB
# YAML
# Copyright (c) 2025 Sawyer0
# Licensed under proprietary terms. See LICENSE for details.
---
# Rule pack metadata. The version and date are quoted so they load as
# strings rather than being re-typed by the YAML parser.
version: '1.0.0'
last_updated: '2025-01-15'
name: Hallucination Detection Rules
description: Detects potential AI hallucination patterns and unreliable content

# Rule definitions. Every entry below carries an id, a description, either
# match_keywords or match_regex, and severity/category/enabled/case_sensitive.
rules:
# Uncertain or speculative language
# Low-severity rule listing hedging phrases ("I think", "probably", ...)
# that the description treats as a possible sign of hallucination.
- id: uncertain-language
  description: Detects uncertain or speculative language that may indicate hallucination
  # Hedging phrases; case_sensitive is false for this rule (see below).
  match_keywords:
  - 'I think'
  - 'I believe'
  - 'probably'
  - 'maybe'
  - 'possibly'
  - 'might be'
  - 'could be'
  - 'seems like'
  - 'appears to be'
  - 'I guess'
  - 'not sure'
  - 'uncertain'
  - 'unclear'
  - 'I imagine'
  - 'presumably'
  - 'supposedly'
  - 'allegedly'
  severity: low
  category: hallucination
  enabled: true
  case_sensitive: false
# Fabricated data patterns
# Medium-severity rule: regexes for vague appeals to studies, statistics
# or experts that name no concrete source.
- id: fabricated-data
  description: Detects patterns that suggest fabricated specific data
  match_regex:
  # Unattributed research / study references.
  - 'According to (a )?study (by|from|conducted)'
  - 'Research shows? that'
  - 'Studies have shown'
  - 'Data indicates?'
  - 'Statistics show'
  # Percentage claims such as "87% of users". Double-quoted, so the YAML
  # escape \\d yields the regex token \d.
  - "\\d{1,3}% of (people|users|respondents)"
  - 'In a recent (study|survey|poll)'
  # Appeals to unnamed scientists or experts.
  - 'Scientists have (found|discovered|proven)'
  - 'Experts say'
  - 'According to experts'
  severity: medium
  category: hallucination
  enabled: true
  case_sensitive: false
# Specific false claims indicators
# Medium-severity rule listing absolute-certainty phrases ("definitely",
# "proven fact", ...) that the description associates with false claims.
- id: false-claims
  description: Detects language patterns that often accompany false claims
  # Certainty markers; case_sensitive is false for this rule (see below).
  match_keywords:
  - 'definitely'
  - 'absolutely'
  - 'certainly'
  - 'without a doubt'
  - 'guaranteed'
  - 'proven fact'
  - 'scientific fact'
  - 'it is known that'
  - 'everyone knows'
  - 'obviously'
  - 'clearly'
  - 'undoubtedly'
  severity: medium
  category: hallucination
  enabled: true
  case_sensitive: false
# Temporal inconsistencies
# Medium-severity rule: each regex looks for two or three time references
# appearing in sequence (e.g. "yesterday ... today ... tomorrow").
# Time words are anchored with \b so they only match as whole words;
# unanchored, 'now' also fired inside "know", "snow" and "nowhere", and
# 'currently' fired inside "concurrently".
- id: temporal-inconsistencies
  description: Detects temporal references that may be inconsistent
  match_regex:
  - "\\byesterday\\b.*\\btoday\\b.*\\btomorrow\\b"
  # Only a leading \b here, so plural forms such as "last months" still match.
  - "\\blast (week|month|year).*\\bnext (week|month|year)"
  - "\\bcurrently\\b.*\\bin the past\\b"
  - "\\bnow\\b.*\\bpreviously\\b"
  severity: medium
  category: hallucination
  enabled: true
  case_sensitive: false
# Contradictory statements
# High-severity rule: each regex pairs a term with a later, opposing term
# (e.g. "always ... sometimes", "cannot ... can").
# Every term is anchored with \b so it only matches as a whole word. The
# unanchored originals fired on substrings of ordinary words: 'but' in
# "attribute", 'all' in "call", 'some' in "awesome", 'can' in "scan".
# 'impossible.*possible' also matched two occurrences of "impossible".
- id: contradictions
  description: Detects potentially contradictory language patterns
  match_regex:
  - "\\b(however|but|although)\\b.*\\b(?:however|but|although)\\b"
  - "\\b(always|never)\\b.*\\b(?:sometimes|occasionally)\\b"
  - "\\b(all|every|none)\\b.*\\b(?:some|few|many)\\b"
  - "\\bimpossible\\b.*\\bpossible\\b"
  - "\\bcannot\\b.*\\bcan\\b"
  severity: high
  category: hallucination
  enabled: true
  case_sensitive: false
# Overgeneralization patterns
# Medium-severity rule: one structural regex for sweeping statements plus
# a handful of fixed phrases asserting universal agreement.
- id: overgeneralization
  description: Detects overgeneralization that may indicate hallucination
  match_regex:
  # Quantifier + one word + verb, e.g. "all users are" or "never things do".
  # Double-quoted, so each \\ yields a single regex backslash.
  - "\\b(all|every|no|never|always)\\s+\\w+\\s+(are|is|do|does|will|have|has)\\b"
  # Fixed phrases (no regex metacharacters).
  - 'without exception'
  - 'in all cases'
  - 'universally'
  - 'everyone agrees'
  - 'no one disagrees'
  severity: medium
  category: hallucination
  enabled: true
  case_sensitive: false
# Unsupported claims
# Medium-severity rule listing superlative or hype phrases ("breakthrough",
# "overnight success", ...) that the description treats as typically
# unsupported.
- id: unsupported-claims
  description: Detects claims that are typically unsupported
  # Hype phrases; case_sensitive is false for this rule (see below).
  match_keywords:
  - 'breakthrough'
  - 'revolutionary'
  - 'never before seen'
  - 'first time ever'
  - 'unprecedented'
  - 'game-changing'
  - 'life-changing'
  - 'miraculous'
  - 'instantly'
  - 'overnight success'
  severity: medium
  category: hallucination
  enabled: true
  case_sensitive: false
# Memory confusion indicators
# Medium-severity rule listing phrases that refer back to remembered or
# earlier conversation ("I recall", "as I mentioned before", ...).
- id: memory-confusion
  description: Detects language suggesting memory confusion or fabrication
  # Back-reference phrases; case_sensitive is false for this rule (see below).
  match_keywords:
  - 'I remember'
  - 'I recall'
  - 'as I mentioned before'
  - 'like I said earlier'
  - 'previously discussed'
  - 'as we talked about'
  - 'from our last conversation'
  severity: medium
  category: hallucination
  enabled: true
  case_sensitive: false