aiwg
Version:
Cognitive architecture for AI-augmented software development with structured memory, ensemble validation, and closed-loop correction. FAIR-aligned artifacts, 84% cost reduction via human-in-the-loop, standards adopted by 100+ organizations.
484 lines (428 loc) • 14.7 kB
YAML
# Citation Integrity Framework Schema
# Based on REF-059 LitLLM Citation Processing
# Issues: #231 (Retrieval-First Policy), #232 (Whitelist Enforcement), #234 (Page Validation)
# Root of the schema: declares the JSON Schema dialect, the canonical $id,
# and the four mandatory top-level properties of a citation-integrity document.
$schema: "https://json-schema.org/draft/2020-12/schema"
$id: "https://aiwg.io/schemas/citation-integrity/v1"
title: "Citation Integrity Framework Schema"
description: |
  Comprehensive citation integrity framework implementing retrieval-first policy,
  corpus whitelist enforcement, and page number validation per REF-059 LitLLM.
type: object
# Every document must carry a version and all three policy sections.
required:
  - version
  - retrieval_first_policy
  - whitelist_enforcement
  - page_validation
properties:
  # Three dot-separated numeric components (see pattern); defaults to "1.0.0".
  version:
    type: string
    pattern: '^\d+\.\d+\.\d+$'
    default: "1.0.0"
  # Each section below is defined by the same-named entry under $defs.
  retrieval_first_policy:
    $ref: "#/$defs/RetrievalFirstPolicy"
  whitelist_enforcement:
    $ref: "#/$defs/WhitelistEnforcement"
  page_validation:
    $ref: "#/$defs/PageValidation"
$defs:
# Retrieval-first policy: citations must come from retrieved corpus documents.
# Carries the policy text, the prohibited actions, the five-step workflow with
# a validation criterion per step, and the fallback when no source is found.
RetrievalFirstPolicy:
  type: object
  description: "Formal retrieval-first policy for citation generation"
  properties:
    enabled:
      type: boolean
      default: true
    # Human-readable statement of the policy and its five-step workflow.
    policy_statement:
      type: string
      default: |
        CRITICAL: Never generate citations from memory or training data.
        Required workflow:
        1. Retrieve sources from AIWG research corpus
        2. Verify source relevance to claim
        3. Extract citation metadata from retrieved document
        4. Format citation using retrieved metadata
        5. Include page numbers or section references
    # Actions that violate the policy.
    prohibited_actions:
      type: array
      items:
        type: string
      default:
        - "Generating citations from LLM training data"
        - "Citing papers not in AIWG corpus"
        - "Fabricating DOIs, URLs, or author lists"
        - "Adding 'relevant' papers without retrieval"
        - "Guessing page numbers or sections"
    # Structured form of the workflow: each step pairs an action with the
    # condition that shows the step was carried out.
    required_workflow:
      type: array
      items:
        type: object
        properties:
          step:
            type: integer
          action:
            type: string
          validation:
            type: string
      default:
        - step: 1
          action: "Search corpus for relevant sources"
          validation: "At least one Read tool call to .aiwg/research/corpus/"
        - step: 2
          action: "Verify source relevance to claim"
          validation: "Source content supports the claim being made"
        - step: 3
          action: "Extract citation metadata"
          validation: "Use REF-XXX identifier from filename"
        - step: 4
          action: "Format citation properly"
          validation: "Include [REF-XXX] or inline reference"
        - step: 5
          action: "Add page/section reference"
          validation: "Specific location in source document"
    # What to do when the corpus has no suitable source for a claim.
    corpus_expansion_protocol:
      type: object
      properties:
        when_source_not_found:
          type: string
          default: |
            When a claim would benefit from citation but no suitable source
            exists in the AIWG research corpus:
            1. Do NOT cite from memory
            2. Inform user: "This claim would benefit from citation, but no
            suitable source exists in the corpus."
            3. Recommend addition: "Consider adding [Paper] to corpus using
            `aiwg add-research-paper`"
            4. Continue without citation or mark as [citation needed]
        # Command template recommended to the user for adding a missing paper.
        addition_command:
          type: string
          default: 'aiwg add-research-paper --title "..." --url "..." --relevance "..."'
# Corpus-as-whitelist enforcement: a citation is allowed only if its source
# exists under corpus_path. Enforcement is described at three levels (agent,
# validator, optional pre-commit hook), followed by the accepted and rejected
# citation formats.
WhitelistEnforcement:
  type: object
  description: "Corpus-as-whitelist enforcement"
  properties:
    enabled:
      type: boolean
      default: true
    corpus_path:
      type: string
      default: ".aiwg/research/corpus/"
    # Statement of the rule, including its accept/reject pseudo-code.
    whitelist_principle:
      type: string
      default: |
        The research corpus (.aiwg/research/corpus/) is the ONLY authorized
        source list. Any citation not in the corpus is FORBIDDEN.
        For any citation C:
        IF C ∉ Authorized Citations THEN
        REJECT with error "Citation not in corpus"
        ELSE
        ALLOW with metadata from corpus file
    enforcement_levels:
      type: object
      properties:
        # Level 1: rules the citing agent itself follows.
        agent_level:
          type: object
          properties:
            description:
              type: string
              default: "Agents only cite corpus sources"
            rules:
              type: array
              items:
                type: string
              default:
                - "ONLY cite sources from .aiwg/research/corpus/"
                - "Use REF-XXX identifier from filename"
                - "If needed source not in corpus, STOP and recommend addition"
                - "Never generate citations from memory"
        # Level 2: validator check and what is reported on a violation.
        validation_level:
          type: object
          properties:
            description:
              type: string
              default: "Writing-Validator checks whitelist"
            # Regex for a citation identifier: "REF-" followed by three digits.
            check_pattern:
              type: string
              default: 'REF-\d{3}'
            on_violation:
              type: object
              properties:
                severity:
                  type: string
                  default: "critical"
                message:
                  type: string
                  default: "Citation not in corpus whitelist"
                action:
                  type: string
                  default: "Remove citation or add source to corpus first"
        # Level 3: optional pre-commit hook, disabled by default.
        commit_level:
          type: object
          properties:
            description:
              type: string
              default: "Optional pre-commit hook"
            enabled:
              type: boolean
              default: false
            # The hook body is declared as a string-typed property with the
            # script as its default, like every other property in this schema;
            # a bare string is not a valid subschema.
            script:
              type: string
              default: |
                #!/bin/bash
                # Extract REF-XXX citations from lines ADDED to staged markdown.
                # Removed lines are skipped so that a commit which deletes an
                # invalid citation is not itself rejected; "--" separates the
                # pathspec from revisions.
                REFS=$(git diff --cached -U0 -- "*.md" | grep "^+" | grep -v "^+++ " | grep -oE "REF-[0-9]{3}" | sort -u)
                for REF in $REFS; do
                  if ! ls .aiwg/research/corpus/${REF}-*.md 1>/dev/null 2>&1; then
                    echo "ERROR: Citation ${REF} not in corpus"
                    exit 1
                  fi
                done
    # Citation spellings that are accepted.
    allowed_citation_formats:
      type: array
      items:
        type: string
      default:
        - "[REF-XXX]"
        - "(REF-XXX)"
        - "[REF-XXX, p.XX]"
        - "[REF-XXX, Section X]"
    # Citation spellings that are rejected because they do not use a REF-XXX
    # corpus identifier.
    forbidden_citation_formats:
      type: array
      items:
        type: string
      default:
        - "(Smith et al., 2023)"
        - "[1]"
        - "Author (Year)"
        - "Any non-corpus reference"
# Page-number validation for citations: the per-document metadata and per-quote
# records needed to check a cited page, the rules applied, and the manual and
# automated parts of the validation workflow.
PageValidation:
  type: object
  description: "Page number validation for citations"
  properties:
    enabled:
      type: boolean
      default: true
    # Metadata recorded per corpus document.
    document_metadata_schema:
      type: object
      properties:
        total_pages:
          type: integer
          # A document has at least one page; keeps the schema consistent with
          # the page_existence rule "1 <= page <= total_pages" below.
          minimum: 1
          description: "Total page count of document"
        page_range:
          type: string
          description: "e.g., '1-24'"
        sections:
          type: array
          items:
            type: object
            properties:
              name:
                type: string
              pages:
                type: string
    # Record kept for each key quote taken from a document.
    key_quote_schema:
      type: object
      properties:
        quote:
          type: string
        page:
          type: integer
          # Page numbers start at 1 (see the page_existence rule below).
          minimum: 1
        section:
          type: string
        validated:
          type: boolean
          default: false
        validation_date:
          type: string
          format: date
        validator:
          type: string
          description: "@username"
    # The three checks, each given as a description plus a rule expression.
    validation_rules:
      type: object
      properties:
        page_existence:
          type: object
          properties:
            description:
              type: string
              default: "Page number within document range"
            rule:
              type: string
              default: "1 <= page <= total_pages"
        section_consistency:
          type: object
          properties:
            description:
              type: string
              default: "Page within declared section"
            rule:
              type: string
              default: "section.start <= page <= section.end"
        quote_verification:
          type: object
          properties:
            description:
              type: string
              default: "Quote exists on cited page (if PDF available)"
            rule:
              type: string
              default: "extract_text(pdf, page).contains(quote)"
    validation_workflow:
      type: object
      properties:
        # Steps a human validator performs.
        manual_checklist:
          type: array
          items:
            type: string
          default:
            - "Record total page count"
            - "Document section page ranges"
            - "Verify each Key Quote page number"
            - "Mark each quote as validated"
            - "Sign and date validation"
        # Which checks are flagged as automated; the quote-on-page check
        # is not.
        automated_checks:
          type: array
          items:
            type: object
            properties:
              check:
                type: string
              automated:
                type: boolean
          default:
            - check: "Page within range"
              automated: true
            - check: "Section consistency"
              automated: true
            - check: "Quote on page (PDF)"
              automated: false
# Citation validation result
# Shape of a validation outcome: the reference id, an overall status, one
# boolean per check, and a list of issues with severity/message/suggestion.
citation_validation_result:
  type: object
  properties:
    ref_id:
      type: string
    status:
      type: string
      enum:
        - valid
        - invalid
        - warning
    # One flag per check performed on the citation.
    checks:
      type: object
      properties:
        in_corpus:
          type: boolean
        page_valid:
          type: boolean
        section_consistent:
          type: boolean
        quote_verified:
          type: boolean
    issues:
      type: array
      items:
        type: object
        properties:
          severity:
            type: string
          message:
            type: string
          suggestion:
            type: string
# Agent citation workflow
# The ordered steps to follow when a claim needs a citation, plus one allowed
# and one forbidden example.
agent_citation_workflow:
  type: object
  properties:
    when_citation_needed:
      type: array
      items:
        type: string
      default:
        - "1. Search corpus: @.aiwg/research/corpus/*.md"
        - "2. If no match: Recommend adding source, do NOT cite"
        - "3. If match: Extract metadata from corpus file"
        - "4. Use REF-XXX identifier from filename"
        - "5. Include page/section reference"
        - "6. Verify quote exists at cited location"
    # Accepted form: REF-XXX identifier with a page, backed by a corpus file.
    example_valid:
      type: string
      default: |
        ✅ ALLOWED:
        "Voice consistency improves quality [REF-043, p.15]"
        (corpus file .aiwg/research/corpus/REF-043-voice-consistency.md exists)
    # Rejected form: author/year citation with no corpus entry.
    example_invalid:
      type: string
      default: |
        ❌ FORBIDDEN:
        "Voice consistency improves quality (Smith et al., 2023)"
        (not in corpus, citation from memory)
# CLI commands
# Catalogue of the citation-related `aiwg` commands. Each entry gives the
# command line and a description; some also list options or output sections.
cli_commands:
  # Per-file validation, with optional strict and auto-fix modes.
  validate_citations:
    command: "aiwg validate-citations <file>"
    description: "Validate all citations in file"
    options:
      - name: "--strict"
        description: "Fail on any warning"
      - name: "--fix"
        description: "Remove invalid citations"

  corpus_check:
    command: "aiwg corpus-check"
    description: "List citations not in corpus"

  # Takes the REF id of a single paper.
  page_validate:
    command: "aiwg page-validate <ref-id>"
    description: "Validate page numbers for a paper"

  # Full audit; `output` names the sections of its report.
  citation_audit:
    command: "aiwg citation-audit"
    description: "Full citation integrity audit"
    output:
      - "Corpus coverage"
      - "Page validation status"
      - "Quote verification status"
# Agent protocol
# Three named procedures, each with a description and an ordered list of step
# identifiers. Entries that end in ":" (if_not_found, if_found, ...) introduce
# a sub-list of steps for that branch or loop.
# NOTE(review): indentation is not visible in this view, so where each
# sub-list ends (e.g. whether return_citation sits inside if_found or after it)
# cannot be determined here — confirm against the properly indented file.
agent_protocol:
# Produce a new citation: search the corpus, then branch on whether a source
# was found.
generate_citation:
description: "Generate citation with integrity checks"
steps:
- identify_claim_needing_citation
- search_corpus_for_source
# No source in corpus: do not cite; recommend adding one and mark the claim.
- if_not_found:
- do_not_cite
- recommend_corpus_addition
- mark_citation_needed
# Source found: read it, take the REF id, pick the quote, check the page.
- if_found:
- read_corpus_file
- extract_ref_id
- identify_relevant_quote
- validate_page_number
- format_citation
- return_citation
# Check one existing citation against the whitelist, then its page, section,
# and (when a PDF is available) the quoted text.
validate_citation:
description: "Validate existing citation"
steps:
- extract_ref_id_from_citation
- check_corpus_whitelist
- if_not_in_corpus:
- flag_critical_error
- if_in_corpus:
- validate_page_number
- validate_section_consistency
- if_pdf_available:
- verify_quote_on_page
- return_validation_result
# Run validate_citation over every citation in a document and report.
audit_all_citations:
description: "Audit all citations in document"
steps:
- extract_all_citations
- for_each_citation:
- validate_citation
- record_result
- generate_audit_report
- return_summary
# Storage
# Directory paths for the research corpus, the validation logs, and the audit
# reports (presumably relative to the repository root — confirm).
storage:
  corpus: ".aiwg/research/corpus/"
  validation_logs: ".aiwg/logs/citation-validation/"
  audit_reports: ".aiwg/reports/citation-audit/"
# Research targets (from REF-059 LitLLM)
# One-line statement of each integrity goal this framework enforces.
research_targets:
  retrieval_first: "Never generate citations without retrieval"
  corpus_whitelist: "Only cite sources in research corpus"
  page_validation: "Verify page numbers exist and are correct"
  quote_verification: "Confirm quotes exist at cited locations"
quote_verification: "Confirm quotes exist at cited locations"
# Example validation report
# Illustrative audit output only: a summary count, per-citation details
# (valid / warning / error), and the resulting recommendations.
example_validation_report: |
  ================================================================================
  CITATION INTEGRITY AUDIT
  ================================================================================
  Document: docs/voice-framework/technical-guide.md
  Date: 2026-01-25
  Auditor: writing-validator
  SUMMARY:
  Total Citations: 15
  Valid: 12
  Warnings: 2
  Errors: 1
  DETAILS:
  ✓ [REF-043, p.15] - Valid
  - In corpus: Yes
  - Page valid: Yes (1-24 range)
  - Section consistent: Yes (Results)
  ⚠ [REF-018, p.8] - Warning
  - In corpus: Yes
  - Page valid: Yes
  - Section consistent: No (cited as Introduction, actually Methods)
  ✗ [REF-099] - Error
  - In corpus: No
  - Action: Remove citation or add REF-099 to corpus
  ⚠ [REF-021, p.45] - Warning
  - In corpus: Yes
  - Page valid: No (document has 32 pages)
  - Action: Verify correct page number
  RECOMMENDATIONS:
  1. Remove or add REF-099 to corpus
  2. Verify REF-018 section reference
  3. Correct REF-021 page number
# References
# Pointers to the research finding behind this schema, the issues it
# implements, and related files. Values stay quoted because they begin with
# "@" or "#", which are not valid at the start of a plain YAML scalar.
references:
  research:
    - "@.aiwg/research/findings/REF-059-litllm-citation-processing.md"
  # Issue numbers listed in the header comment of this file.
  implementation:
    - "#231"
    - "#232"
    - "#234"
  related:
    - "@.aiwg/research/corpus/"
    - "@agentic/code/agents/writing-validator.md"
    - "@agentic/code/frameworks/sdlc-complete/schemas/flows/grade-evidence-quality.yaml"