# aiwg
# Version:
# Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.
# 376 lines (338 loc) • 12 kB
# YAML
# Hallucination Detection Schema
# Based on REF-059 LitLLM Citation Processing
# Issue: #247
# Root schema: a JSON Schema (draft 2020-12) written as YAML. A conforming
# document is an object with a version string and the three configuration
# sections listed under `required`; each section resolves to a definition
# under `$defs`.
$schema: "https://json-schema.org/draft/2020-12/schema"
$id: "https://aiwg.io/schemas/hallucination-detection/v1"
title: "Hallucination Detection Schema"
description: |
  Automated detection of fabricated or hallucinated citations implementing
  pattern recognition per REF-059 LitLLM.
type: object
required:
  - version
  - detection_methods
  - detection_pipeline
  - severity_levels
properties:
  version:
    type: string
    # Three dot-separated digit groups (e.g. "1.0.0"); no suffixes allowed.
    pattern: "^\\d+\\.\\d+\\.\\d+$"
    default: "1.0.0"
  detection_methods:
    $ref: "#/$defs/DetectionMethods"
  detection_pipeline:
    $ref: "#/$defs/DetectionPipeline"
  severity_levels:
    $ref: "#/$defs/SeverityLevels"
# Reusable definitions referenced by the root `properties` via $ref.
$defs:
  # One sub-schema per detection method. The property names below are the
  # method identifiers used by DetectionPipeline stage defaults (`method`)
  # and by SeverityLevels.*.methods defaults.
  # Note: inside each method, `description` / `severity` / `detection` are
  # ordinary property names of the configured object, not schema keywords.
  DetectionMethods:
    type: object
    description: "Methods for detecting citation hallucinations"
    properties:
      corpus_mismatch:
        type: object
        properties:
          description: { type: string, default: "Citation REF-XXX not found in research corpus" }
          severity: { type: string, default: "critical" }
          detection:
            type: string
            default: "Check if REF-XXX exists in .aiwg/research/corpus/"
          false_positive_rate: { type: number, default: 0.01 }
      metadata_inconsistency:
        type: object
        properties:
          description: { type: string, default: "Authors, year, or title mismatch with corpus entry" }
          severity: { type: string, default: "critical" }
          checks:
            type: array
            items: { type: string }
            default:
              - "Title match (normalized comparison)"
              - "Authors match (last name comparison)"
              - "Year match (exact)"
              - "Venue match (fuzzy)"
          # The similarity metric itself is not specified by this schema.
          similarity_threshold: { type: number, default: 0.85 }
      context_mismatch:
        type: object
        properties:
          description: { type: string, default: "Claim keywords have low semantic overlap with paper content" }
          severity: { type: string, default: "warning" }
          detection:
            type: string
            default: "Compute semantic similarity between claim and paper abstract/summary"
          similarity_threshold: { type: number, default: 0.5 }
      training_data_leakage:
        type: object
        properties:
          description: { type: string, default: "Pre-cutoff citations without corpus entry" }
          severity: { type: string, default: "warning" }
          detection:
            type: string
            # NOTE(review): the cutoff year is hard-coded in this default
            # text; confirm it matches the model actually in use.
            default: "Citation year < 2022 AND not in corpus (likely from LLM training data)"
          indicators:
            type: array
            items: { type: string }
            default:
              - "Publication date before LLM training cutoff"
              - "Citation not in corpus but plausible-looking"
              - "Authors are well-known but paper unfindable"
              - "DOI doesn't resolve"
      format_anomaly:
        type: object
        properties:
          description: { type: string, default: "Non-standard REF format or duplicate REF with different metadata" }
          severity: { type: string, default: "info" }
          checks:
            type: array
            items: { type: string }
            default:
              - "REF-XXX format compliance"
              - "No duplicate REF numbers with different metadata"
              - "Consistent citation style"

  # Ordered stages plus the rules that fold per-citation detections into a
  # single pass / warn / fail status.
  DetectionPipeline:
    type: object
    description: "Multi-stage detection pipeline"
    properties:
      enabled:
        type: boolean
        default: true
      pipeline_stages:
        type: array
        items:
          type: object
          properties:
            stage: { type: integer }
            name: { type: string }
            description: { type: string }
            method: { type: string }
        # Default pipeline: one stage per DetectionMethods entry.
        default:
          - stage: 1
            name: "corpus_check"
            description: "Check corpus membership (whitelist)"
            method: "corpus_mismatch"
          - stage: 2
            name: "metadata_verify"
            description: "Verify metadata consistency"
            method: "metadata_inconsistency"
          - stage: 3
            name: "context_analyze"
            description: "Analyze context-claim alignment"
            method: "context_mismatch"
          - stage: 4
            name: "leakage_check"
            description: "Check for training data leakage patterns"
            method: "training_data_leakage"
          - stage: 5
            name: "format_validate"
            description: "Validate format compliance"
            method: "format_anomaly"
      aggregation:
        type: object
        properties:
          strategy: { type: string, default: "severity_based" }
          critical_count_threshold: { type: integer, default: 1 }
          warning_count_threshold: { type: integer, default: 3 }
          # NOTE(review): nesting of final_status_rules under aggregation is
          # reconstructed (the rules refer to the thresholds above); confirm
          # against consumers that read this path.
          final_status_rules:
            type: object
            properties:
              fail: { type: string, default: "Any critical detection OR warning_count >= threshold" }
              warn: { type: string, default: "warning_count > 0 AND warning_count < threshold" }
              # Info detections are log_only (see SeverityLevels.info), so an
              # info-only result must still resolve to a status. The previous
              # wording ("No detections") left that case matching no rule.
              pass: { type: string, default: "No critical or warning detections (info-only detections still pass)" }

  # Per-severity presentation symbol, resulting action, and the methods
  # that report at that severity by default.
  SeverityLevels:
    type: object
    description: "Detection severity classification"
    properties:
      critical:
        type: object
        properties:
          symbol: { type: string, default: "✗" }
          action: { type: string, default: "block_merge" }
          methods:
            type: array
            items: { type: string }
            default:
              - "corpus_mismatch"
              - "metadata_inconsistency"
      warning:
        type: object
        properties:
          symbol: { type: string, default: "⚠" }
          action: { type: string, default: "allow_with_warning" }
          methods:
            type: array
            items: { type: string }
            default:
              - "context_mismatch"
              - "training_data_leakage"
      info:
        type: object
        properties:
          symbol: { type: string, default: "ℹ" }
          action: { type: string, default: "log_only" }
          methods:
            type: array
            items: { type: string }
            default:
              - "format_anomaly"
# Detection result schema
# Shape of the report for one analyzed file: the file path, how many
# citations were examined, one entry per detection, and a summary with
# per-severity counts and the overall status.
detection_result:
  type: object
  properties:
    file:
      type: string
    citations_analyzed:
      type: integer
    detections:
      type: array
      items:
        type: object
        properties:
          citation:
            type: string
            description: "REF-XXX identifier"
          method:
            type: string
            description: "Detection method that triggered"
          severity:
            type: string
            enum: [critical, warning, info]
          message:
            type: string
          evidence:
            type: object
            description: "Supporting evidence for detection"
          suggestion:
            type: string
            description: "How to fix the issue"
    summary:
      type: object
      properties:
        critical: { type: integer }
        warning: { type: integer }
        info: { type: integer }
        # NOTE(review): `status` is placed inside `summary` because it is
        # written in the same one-line style as the counters; confirm.
        status: { type: string, enum: [pass, fail, warn] }
# CLI commands
# Declares the command line and its options. Each option entry has a
# `name` and `description`; `short` and `default` appear only where the
# option defines them.
cli_commands:
  detect_hallucinations:
    command: "aiwg detect-hallucinations [path]"
    description: "Detect citation hallucinations in documents"
    options:
      - name: "--all"
        short: "-a"
        description: "Check all markdown files"
      - name: "--fail-on"
        description: "Fail on severity level (critical, warning, info)"
        default: "critical"
      - name: "--format"
        short: "-f"
        description: "Output format (text, json, github)"
        default: "text"
      - name: "--fix"
        description: "Suggest fixes for detected issues"
# Agent protocol
# Step lists for the two agent procedures. A plain string is a single
# step; a one-key mapping (`for_each_*`, `if_*`) is a loop or branch whose
# value is the nested list of steps it governs.
agent_protocol:
  detect_hallucinations:
    description: "Run hallucination detection on document"
    steps:
      - read_document
      - extract_all_citations
      - for_each_citation:
          - stage_1_corpus_check
          - if_in_corpus:
              - stage_2_metadata_verify
              - stage_3_context_analyze
          - if_not_in_corpus:
              - stage_4_leakage_check
          # NOTE(review): stage 5 and record_detections are placed at loop
          # level (run for every citation, whichever branch was taken).
          # This nesting is reconstructed; confirm.
          - stage_5_format_validate
          - record_detections
      - aggregate_results
      - calculate_severity_summary
      - determine_final_status
      - return_detection_result
  generate_fix_suggestions:
    description: "Generate suggestions for detected hallucinations"
    steps:
      - for_each_detection:
          - if_corpus_mismatch:
              - suggest_search_for_similar_paper
              - suggest_add_to_corpus_if_real
              - suggest_remove_if_fabricated
          - if_metadata_inconsistency:
              - suggest_correct_metadata
              - show_expected_vs_actual
          - if_context_mismatch:
              - suggest_revise_claim
              - suggest_find_supporting_paper
          # NOTE(review): this branch is named `if_training_leakage` while
          # the method id used elsewhere is `training_data_leakage`, and
          # there is no branch for `format_anomaly`. Left unchanged here;
          # confirm whether both are intentional.
          - if_training_leakage:
              - suggest_verify_source
              - suggest_add_to_corpus_if_verified
      - return_suggestions
# Integration with writing-validator
# Settings for running detection as part of the writing-validator.
# NOTE(review): `order` is presumably the position within the validator's
# pipeline; confirm against the validator's own configuration.
writing_validator_integration:
  hook_point: "validation_pipeline"
  order: 3
  enabled: true
  fail_on: "critical"
  report_format: "inline"
# CI/CD integration
# `workflow_snippet` is a literal block holding one workflow step (a list
# item with `name` and `run`), stored as text rather than parsed
# structure. `run:` is indented under `- name:` so the snippet stays a
# single, well-formed step when pasted into a workflow.
ci_integration:
  github_actions:
    workflow_snippet: |
      - name: Detect Citation Hallucinations
        run: aiwg detect-hallucinations --all --fail-on critical
# Storage
# Paths for detection output: a directory for detection logs and a JSONL
# file for recorded false positives.
storage:
  detection_logs: ".aiwg/logs/hallucination-detection/"
  false_positive_log: ".aiwg/logs/hallucination-false-positives.jsonl"
# Research targets (from REF-059)
# One goal statement per detection concern; the five entries line up with
# the five detection methods defined in this schema.
research_targets:
  corpus_validation: "Detect citations not in approved corpus"
  metadata_verification: "Verify citation metadata accuracy"
  context_analysis: "Detect claim-paper misalignment"
  leakage_prevention: "Identify training data hallucinations"
  format_compliance: "Enforce citation format standards"
# Success metrics
# Human-readable targets. The values are quoted so the comparison
# operators and percent signs stay plain strings.
success_metrics:
  false_positive_rate: "< 5%"
  detection_rate: "> 95% on synthetic hallucinations"
  production_target: "Zero fabricated citations in production docs"
  manual_override_rate: "< 1%"
# Example detection report
# Sample Markdown output stored as a literal block scalar. The body must
# stay indented: an unindented `---` line would be read as a YAML document
# separator. Blank lines separate Markdown blocks so that `---` renders as
# a rule rather than turning the preceding line into a heading.
# NOTE(review): blank-line positions are reconstructed; confirm against
# the reporter's real output.
example_detection_report: |
  ## Hallucination Detection Report

  **File**: docs/research/synthesis.md
  **Citations Analyzed**: 15

  ### Detections

  ✗ **CRITICAL**: REF-999 (Corpus Mismatch)
  Citation "REF-999: Smith et al., 2024" not found in research corpus.
  Suggestion: Verify this is a real paper and add to corpus, or remove citation.

  ✗ **CRITICAL**: REF-043 (Metadata Inconsistency)
  Year mismatch: Document says 2024, corpus entry says 2023.
  Suggestion: Update citation to match corpus: Year → 2023

  ⚠ **WARNING**: REF-012 (Context Mismatch)
  Claim "LLMs achieve 95% accuracy" has low overlap with paper content (similarity: 0.32).
  Paper discusses methodology, not accuracy metrics.
  Suggestion: Verify claim is supported by cited paper.

  ⚠ **WARNING**: REF-008 (Training Data Leakage)
  Citation from 2019 not in corpus. May be from LLM training data.
  Suggestion: Verify source exists and add to corpus if real.

  ---

  **Summary**:

  - Critical: 2
  - Warning: 2
  - Info: 0

  **Status**: FAIL (2 critical issues require resolution)
# References
# Cross-references grouped by role. Entries starting with `@` or `#` are
# quoted because both characters are special at the start of a plain YAML
# scalar.
references:
  research:
    - "@.aiwg/research/findings/REF-059-litllm-citation-processing.md"
  implementation:
    - "#247"
  related:
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/citation-verification.yaml"
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/citation-integrity.yaml"
    - "@agentic/code/agents/writing-validator.md"
  dependencies:
    - "#231 (Retrieval-first policy)"
    - "#232 (Citation whitelist)"
    - "#236 (Citation verification pipeline)"