UNPKG

aiwg

Version:

Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platforms.

383 lines 12.8 kB
/**
 * Corpus Builder
 *
 * Utility for creating and exporting ground truth corpora with validation.
 *
 * @module testing/corpus/corpus-builder
 */
import * as fs from 'fs/promises';
import * as path from 'path';

/**
 * CorpusBuilder - Build and export ground truth corpora
 *
 * Items are held in memory in a Map keyed by item ID, so IDs are unique and
 * iteration follows insertion order.
 *
 * @example
 * ```typescript
 * const builder = new CorpusBuilder({
 *   type: 'ai-vs-human',
 *   name: 'AI vs Human Writing Corpus',
 *   description: 'Labeled corpus for AI pattern detection validation',
 *   schema: {
 *     groundTruthType: 'boolean',
 *     formatDescription: 'true for AI-generated, false for human-written'
 *   },
 *   linkedNFRs: ['NFR-ACC-001']
 * });
 *
 * // Add items
 * builder.addItem({
 *   id: 'doc-001',
 *   content: 'This is a sample document...',
 *   groundTruth: true, // AI-generated
 *   metadata: { source: 'gpt-4' }
 * });
 *
 * // Validate and export
 * const validation = builder.validate();
 * if (validation.valid) {
 *   await builder.export({
 *     outputDir: './tests/fixtures/corpora',
 *     version: '1.0.0'
 *   });
 * }
 * ```
 */
export class CorpusBuilder {
  options;
  items = new Map();

  /**
   * @param options - Corpus options. This class reads `type`, `name`,
   *   `description`, `schema` and `linkedNFRs` from it.
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Add an item to the corpus
   *
   * @param item - Ground truth item
   * @throws {Error} If item ID already exists
   */
  addItem(item) {
    if (this.items.has(item.id)) {
      throw new Error(`Duplicate item ID: ${item.id}`);
    }
    this.items.set(item.id, item);
  }

  /**
   * Add multiple items to the corpus
   *
   * Items are added one by one; if a duplicate ID is hit, the items that
   * precede it in the array have already been added when the error is thrown.
   *
   * @param items - Array of ground truth items
   * @throws {Error} If any item ID already exists
   */
  addItems(items) {
    for (const item of items) {
      this.addItem(item);
    }
  }

  /**
   * Remove an item from the corpus
   *
   * @param itemId - Item identifier
   * @returns True if item was removed
   */
  removeItem(itemId) {
    return this.items.delete(itemId);
  }

  /**
   * Get an item by ID
   *
   * @param itemId - Item identifier
   * @returns Item or undefined
   */
  getItem(itemId) {
    return this.items.get(itemId);
  }

  /**
   * Get all items
   *
   * @returns Array of all items
   */
  getAllItems() {
    return Array.from(this.items.values());
  }

  /**
   * Get item count
   *
   * @returns Number of items currently in the corpus
   */
  getItemCount() {
    return this.items.size;
  }

  /**
   * Clear all items
   */
  clear() {
    this.items.clear();
  }

  /**
   * Validate the corpus
   *
   * Errors make the corpus invalid; warnings (small corpus, single label,
   * severe class imbalance) do not.
   *
   * @returns Validation result of the form `{ valid, errors, warnings }`
   */
  validate() {
    const errors = [];
    const warnings = [];

    // Check minimum items
    if (this.items.size === 0) {
      errors.push('Corpus is empty');
    } else if (this.items.size < 10) {
      warnings.push(`Corpus has only ${this.items.size} items - may not be statistically significant`);
    }

    // Validate each item against schema
    for (const [id, item] of this.items) {
      for (const error of this.validateItem(item)) {
        errors.push(`Item ${id}: ${error}`);
      }
    }

    // Check label distribution
    const distribution = this.calculateLabelDistribution();
    const counts = Object.values(distribution);
    if (counts.length === 1 && this.items.size > 1) {
      warnings.push('All items have the same label - corpus may not be useful for validation');
    }

    // Check for severe class imbalance
    if (counts.length > 1) {
      // Single pass instead of Math.max(...counts): spreading a very large
      // number of distinct labels can exceed the engine's argument limit.
      let max = -Infinity;
      let min = Infinity;
      for (const count of counts) {
        if (count > max) {
          max = count;
        }
        if (count < min) {
          min = count;
        }
      }
      if (max > min * 10) {
        warnings.push('Severe class imbalance detected - largest class is 10x+ larger than smallest');
      }
    }

    return {
      valid: errors.length === 0,
      errors,
      warnings
    };
  }

  /**
   * Validate a single item against schema
   *
   * @param item - Item to check
   * @returns Array of error messages (empty when the item is valid)
   */
  validateItem(item) {
    const errors = [];
    const schema = this.options.schema;
    const groundTruth = item.groundTruth;

    // Check required item fields
    if (!item.id) {
      errors.push('Missing required field: id');
    }
    if (item.content === undefined) {
      errors.push('Missing required field: content');
    }
    if (groundTruth === undefined) {
      errors.push('Missing required field: groundTruth');
    }

    // Validate ground truth type. `typeof` reports 'object' for both null and
    // arrays, so refine it: otherwise null would slip through the 'object'
    // check and the `in` operator below would throw a TypeError.
    let gtType = typeof groundTruth;
    if (groundTruth === null) {
      gtType = 'null';
    } else if (Array.isArray(groundTruth)) {
      gtType = 'array';
    }

    for (const expected of ['boolean', 'string', 'number', 'object', 'array']) {
      if (schema.groundTruthType === expected && gtType !== expected) {
        errors.push(`Ground truth should be ${expected}, got ${gtType}`);
      }
    }

    // Check required fields for object type (skipped for null, which has
    // already been reported above and cannot be used with `in`)
    if (schema.groundTruthType === 'object' &&
        schema.requiredFields &&
        groundTruth !== null &&
        typeof groundTruth === 'object') {
      for (const field of schema.requiredFields) {
        if (!(field in groundTruth)) {
          errors.push(`Ground truth missing required field: ${field}`);
        }
      }
    }

    // Check enum values for string type
    if (schema.groundTruthType === 'string' &&
        schema.enumValues &&
        typeof groundTruth === 'string') {
      if (!schema.enumValues.includes(groundTruth)) {
        errors.push(`Ground truth value '${groundTruth}' not in allowed values: ${schema.enumValues.join(', ')}`);
      }
    }

    return errors;
  }

  /**
   * Calculate label distribution
   *
   * The label of an item is `String(item.groundTruth)`; note that every plain
   * object ground truth therefore shares the label '[object Object]'.
   *
   * @returns Plain object mapping each label to its item count
   */
  calculateLabelDistribution() {
    // Count in a Map so labels such as 'constructor' or '__proto__' cannot
    // collide with members inherited from Object.prototype.
    const counts = new Map();
    for (const item of this.items.values()) {
      const label = String(item.groundTruth);
      counts.set(label, (counts.get(label) ?? 0) + 1);
    }
    return Object.fromEntries(counts);
  }

  /**
   * Export corpus to files
   *
   * Writes one or more data files under `<outputDir>/data/<type>/` and a
   * manifest under `<outputDir>/manifests/`.
   *
   * @param options - Export options (`outputDir`, `version`, optional
   *   `maxItemsPerFile`, default 1000)
   * @throws {Error} If corpus is invalid
   * @throws {RangeError} If `maxItemsPerFile` is negative or NaN
   */
  async export(options) {
    // Validate first
    const validation = this.validate();
    if (!validation.valid) {
      throw new Error(`Cannot export invalid corpus: ${validation.errors.join(', ')}`);
    }

    // `||` (not `??`) on purpose: 0 is not a usable chunk size, so it falls
    // back to the default as it always has.
    const maxItemsPerFile = options.maxItemsPerFile || 1000;
    const items = this.getAllItems();

    // Split items into files before touching the filesystem, so an invalid
    // chunk size fails without leaving directories behind.
    const chunks = this.chunkArray(items, maxItemsPerFile);

    // Create directories
    const manifestDir = path.join(options.outputDir, 'manifests');
    const dataDir = path.join(options.outputDir, 'data', this.options.type);
    await fs.mkdir(manifestDir, { recursive: true });
    await fs.mkdir(dataDir, { recursive: true });

    const baseName = `${this.options.type}-v${options.version}`;
    const dataFiles = [];
    for (let i = 0; i < chunks.length; i++) {
      const fileName = chunks.length === 1
        ? `${baseName}.json`
        : `${baseName}-part${i + 1}.json`;
      const filePath = path.join(dataDir, fileName);
      await fs.writeFile(filePath, JSON.stringify(chunks[i], null, 2), 'utf-8');
      dataFiles.push(fileName);
    }

    // Create manifest. One timestamp is used for both fields so a fresh
    // export never shows createdAt !== updatedAt.
    const timestamp = new Date().toISOString();
    const manifest = {
      name: this.options.name,
      type: this.options.type,
      version: options.version,
      description: this.options.description,
      createdAt: timestamp,
      updatedAt: timestamp,
      itemCount: items.length,
      schema: this.options.schema,
      labelDistribution: this.calculateLabelDistribution(),
      linkedNFRs: this.options.linkedNFRs,
      dataFiles
    };
    const manifestPath = path.join(manifestDir, `${baseName}.json`);
    await fs.writeFile(manifestPath, JSON.stringify(manifest, null, 2), 'utf-8');
  }

  /**
   * Split array into chunks
   *
   * @param array - Array to split
   * @param chunkSize - Maximum number of elements per chunk; must be > 0
   * @returns Array of chunks, in order (empty for an empty input array)
   * @throws {RangeError} If the array is non-empty and chunkSize is not a
   *   positive number (the loop would otherwise never terminate)
   */
  chunkArray(array, chunkSize) {
    if (array.length === 0) {
      return [];
    }
    if (!(chunkSize > 0)) {
      throw new RangeError(`Chunk size must be a positive number, got ${chunkSize}`);
    }
    const chunks = [];
    for (let i = 0; i < array.length; i += chunkSize) {
      chunks.push(array.slice(i, i + chunkSize));
    }
    return chunks;
  }

  /**
   * Import items from existing JSON file
   *
   * Accepts either a top-level array of items or an object with an `items`
   * array.
   *
   * @param filePath - Path to JSON file
   * @throws {Error} If file cannot be read or parsed, has an unexpected
   *   shape, or contains a duplicate item ID
   */
  async importFromFile(filePath) {
    const content = await fs.readFile(filePath, 'utf-8');
    const data = JSON.parse(content);
    if (Array.isArray(data)) {
      this.addItems(data);
    } else if (Array.isArray(data?.items)) {
      // `?.` so that a file containing JSON `null` reports the format error
      // below rather than a TypeError.
      this.addItems(data.items);
    } else {
      throw new Error('Invalid file format: expected array of items or object with items array');
    }
  }

  /**
   * Get corpus statistics
   *
   * @returns Object with `itemCount`, `labelDistribution` and `type`
   */
  getStatistics() {
    return {
      itemCount: this.items.size,
      labelDistribution: this.calculateLabelDistribution(),
      type: this.options.type
    };
  }
}

/**
 * Pre-configured builder factories for each corpus type
 */
export const CorpusBuilders = {
  /**
   * Create AI vs Human writing corpus builder
   */
  aiVsHuman() {
    return new CorpusBuilder({
      type: 'ai-vs-human',
      name: 'AI vs Human Writing Corpus',
      description: 'Labeled documents for AI pattern detection validation',
      schema: {
        groundTruthType: 'boolean',
        formatDescription: 'true for AI-generated, false for human-written'
      },
      linkedNFRs: ['NFR-ACC-001']
    });
  },

  /**
   * Create codebase metadata corpus builder
   */
  codebases() {
    return new CorpusBuilder({
      type: 'codebases',
      name: 'Codebase Metadata Corpus',
      description: 'Codebases with verified metadata for intake validation',
      schema: {
        groundTruthType: 'object',
        requiredFields: ['language', 'framework', 'techStack'],
        formatDescription: 'Object with language, framework, techStack fields'
      },
      linkedNFRs: ['NFR-ACC-002']
    });
  },

  /**
   * Create traceability links corpus builder
   */
  traceability() {
    return new CorpusBuilder({
      type: 'traceability',
      name: 'Requirements Traceability Corpus',
      description: 'Requirements with verified traceability links',
      schema: {
        groundTruthType: 'object',
        requiredFields: ['requirementId', 'codeFiles', 'testFiles'],
        formatDescription: 'Object with requirementId, codeFiles array, testFiles array'
      },
      linkedNFRs: ['NFR-TRACE-05', 'NFR-TRACE-06']
    });
  },

  /**
   * Create security attacks corpus builder
   */
  securityAttacks() {
    return new CorpusBuilder({
      type: 'security-attacks',
      name: 'Security Attack Patterns Corpus',
      description: 'Known attack patterns for security detection validation',
      schema: {
        groundTruthType: 'object',
        requiredFields: ['attackType', 'severity'],
        formatDescription: 'Object with attackType (sql-injection, xss, etc.) and severity (critical, high, medium, low)'
      },
      linkedNFRs: ['NFR-SEC-ACC-01']
    });
  },

  /**
   * Create template recommendations corpus builder
   */
  templateRecommendations() {
    return new CorpusBuilder({
      type: 'template-recommendations',
      name: 'Template Recommendation Corpus',
      description: 'Scenarios with expected template recommendations',
      schema: {
        groundTruthType: 'array',
        formatDescription: 'Array of recommended template IDs'
      },
      linkedNFRs: ['NFR-TMPL-07']
    });
  }
};
//# sourceMappingURL=corpus-builder.js.map