aiwg
Version:
Deployment tool and support utility for AI context. Copies agents, skills, commands, rules, and behaviors into the paths each AI platform reads (Claude Code, Codex, Copilot, Cursor, Warp, OpenClaw, and 6 more) so one source of truth works across 10 platforms.
383 lines • 12.8 kB
JavaScript
/**
* Corpus Builder
*
* Utility for creating and exporting ground truth corpora with validation.
*
* @module testing/corpus/corpus-builder
*/
import * as fs from 'fs/promises';
import * as path from 'path';
/**
* CorpusBuilder - Build and export ground truth corpora
*
* @example
* ```typescript
* const builder = new CorpusBuilder({
* type: 'ai-vs-human',
* name: 'AI vs Human Writing Corpus',
* description: 'Labeled corpus for AI pattern detection validation',
* schema: {
* groundTruthType: 'boolean',
* formatDescription: 'true for AI-generated, false for human-written'
* },
* linkedNFRs: ['NFR-ACC-001']
* });
*
* // Add items
* builder.addItem({
* id: 'doc-001',
* content: 'This is a sample document...',
* groundTruth: true, // AI-generated
* metadata: { source: 'gpt-4' }
* });
*
* // Validate and export
* const validation = builder.validate();
* if (validation.valid) {
* await builder.export({
* outputDir: './tests/fixtures/corpora',
* version: '1.0.0'
* });
* }
* ```
*/
export class CorpusBuilder {
  /** Corpus configuration passed to the constructor (type, name, description, schema, linkedNFRs). */
  options;
  /** Items keyed by their `id`; a Map keeps insertion order for export. */
  items = new Map();

  /**
   * @param options - Corpus configuration; stored as-is and read by validation and export
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Add an item to the corpus
   *
   * @param item - Ground truth item
   * @throws {Error} If item ID already exists
   */
  addItem(item) {
    if (this.items.has(item.id)) {
      throw new Error(`Duplicate item ID: ${item.id}`);
    }
    this.items.set(item.id, item);
  }

  /**
   * Add multiple items to the corpus, in order, via addItem().
   *
   * Items preceding a duplicate are already stored when the error is thrown.
   *
   * @param items - Array of ground truth items
   * @throws {Error} If any item ID already exists
   */
  addItems(items) {
    for (const item of items) {
      this.addItem(item);
    }
  }

  /**
   * Remove an item from the corpus
   *
   * @param itemId - Item identifier
   * @returns True if item was removed
   */
  removeItem(itemId) {
    return this.items.delete(itemId);
  }

  /**
   * Get an item by ID
   *
   * @param itemId - Item identifier
   * @returns Item or undefined
   */
  getItem(itemId) {
    return this.items.get(itemId);
  }

  /**
   * Get all items
   *
   * @returns New array of all items, in insertion order
   */
  getAllItems() {
    return Array.from(this.items.values());
  }

  /**
   * Get item count
   *
   * @returns Number of items currently stored
   */
  getItemCount() {
    return this.items.size;
  }

  /**
   * Clear all items
   */
  clear() {
    this.items.clear();
  }

  /**
   * Validate the corpus
   *
   * Errors: empty corpus, plus every per-item schema violation (prefixed with
   * the item ID). Warnings: fewer than 10 items, a single label shared by all
   * items, or a largest class more than 10x the smallest.
   *
   * @returns Validation result `{ valid, errors, warnings }`; `valid` is true when there are no errors
   */
  validate() {
    const errors = [];
    const warnings = [];

    // Check minimum items
    if (this.items.size === 0) {
      errors.push('Corpus is empty');
    } else if (this.items.size < 10) {
      warnings.push(`Corpus has only ${this.items.size} items - may not be statistically significant`);
    }

    // Validate each item against schema
    for (const [id, item] of this.items) {
      for (const error of this.validateItem(item)) {
        errors.push(`Item ${id}: ${error}`);
      }
    }

    // Check label distribution
    const counts = Object.values(this.calculateLabelDistribution());
    if (counts.length === 1 && this.items.size > 1) {
      warnings.push('All items have the same label - corpus may not be useful for validation');
    }

    // Check for severe class imbalance
    if (counts.length > 1) {
      // Plain loop instead of Math.max(...counts): spreading a very large
      // number of distinct labels can exceed the engine's argument limit.
      let largest = -Infinity;
      let smallest = Infinity;
      for (const count of counts) {
        if (count > largest) {
          largest = count;
        }
        if (count < smallest) {
          smallest = count;
        }
      }
      if (largest > smallest * 10) {
        warnings.push('Severe class imbalance detected - largest class is 10x+ larger than smallest');
      }
    }

    return {
      valid: errors.length === 0,
      errors,
      warnings
    };
  }

  /**
   * Validate a single item against schema
   *
   * @param item - Ground truth item to check
   * @returns Array of error messages (empty when the item conforms)
   */
  validateItem(item) {
    const errors = [];
    const schema = this.options.schema;
    const groundTruth = item.groundTruth;

    // Check required item fields
    if (!item.id) {
      errors.push('Missing required field: id');
    }
    if (item.content === undefined) {
      errors.push('Missing required field: content');
    }
    if (groundTruth === undefined) {
      errors.push('Missing required field: groundTruth');
    }

    // `typeof` reports 'object' for both null and arrays; name them explicitly
    // so messages never read "should be object, got object".
    let actualType = typeof groundTruth;
    if (groundTruth === null) {
      actualType = 'null';
    } else if (Array.isArray(groundTruth)) {
      actualType = 'array';
    }
    const isObjectLike = groundTruth !== null && typeof groundTruth === 'object';

    // Validate ground truth type
    const expectedType = schema.groundTruthType;
    let typeMatches;
    switch (expectedType) {
      case 'boolean':
      case 'string':
      case 'number':
        typeMatches = typeof groundTruth === expectedType;
        break;
      case 'object':
        typeMatches = isObjectLike && !Array.isArray(groundTruth);
        break;
      case 'array':
        typeMatches = Array.isArray(groundTruth);
        break;
      default:
        // Schema types other than the five above are not type-checked.
        typeMatches = true;
    }
    if (!typeMatches) {
      errors.push(`Ground truth should be ${expectedType}, got ${actualType}`);
    }

    // Check required fields for object type. The null guard matters: the `in`
    // operator throws a TypeError when its right-hand side is null.
    if (expectedType === 'object' && schema.requiredFields && isObjectLike) {
      for (const field of schema.requiredFields) {
        if (!(field in groundTruth)) {
          errors.push(`Ground truth missing required field: ${field}`);
        }
      }
    }

    // Check enum values for string type
    if (expectedType === 'string' && schema.enumValues && typeof groundTruth === 'string') {
      if (!schema.enumValues.includes(groundTruth)) {
        errors.push(`Ground truth value '${groundTruth}' not in allowed values: ${schema.enumValues.join(', ')}`);
      }
    }

    return errors;
  }

  /**
   * Calculate label distribution
   *
   * The label of an item is `String(item.groundTruth)`.
   * NOTE: plain-object ground truths therefore all share the label
   * "[object Object]", so for object-typed corpora this is effectively an
   * item count and validate() will report the "same label" warning.
   *
   * @returns Plain object mapping each label to its number of occurrences
   */
  calculateLabelDistribution() {
    // Count in a Map so labels such as "constructor", "toString" or
    // "__proto__" cannot collide with members inherited from Object.prototype.
    const counts = new Map();
    for (const item of this.items.values()) {
      const label = String(item.groundTruth);
      counts.set(label, (counts.get(label) ?? 0) + 1);
    }
    // Object.fromEntries defines own data properties, so every label survives.
    return Object.fromEntries(counts);
  }

  /**
   * Export corpus to files
   *
   * Writes the items as one or more JSON data files under
   * `<outputDir>/data/<type>/` and a manifest under `<outputDir>/manifests/`.
   *
   * @param options - Export options: `outputDir`, `version`, optional `maxItemsPerFile` (falsy values fall back to 1000)
   * @throws {Error} If corpus is invalid
   * @throws {RangeError} If `maxItemsPerFile` is negative
   */
  async export(options) {
    // Validate first
    const validation = this.validate();
    if (!validation.valid) {
      throw new Error(`Cannot export invalid corpus: ${validation.errors.join(', ')}`);
    }

    // `||` (not `??`) on purpose: 0 and NaN are unusable chunk sizes and fall back too.
    const maxItemsPerFile = options.maxItemsPerFile || 1000;
    const items = this.getAllItems();
    // Chunk before touching the disk so a bad chunk size leaves no partial output.
    const chunks = this.chunkArray(items, maxItemsPerFile);

    // Create directories
    const manifestDir = path.join(options.outputDir, 'manifests');
    const dataDir = path.join(options.outputDir, 'data', this.options.type);
    await fs.mkdir(manifestDir, { recursive: true });
    await fs.mkdir(dataDir, { recursive: true });

    // Split items into files; a "-partN" suffix is used only when there are several chunks
    const baseName = `${this.options.type}-v${options.version}`;
    const dataFiles = [];
    for (const [index, chunk] of chunks.entries()) {
      const fileName = chunks.length === 1
        ? `${baseName}.json`
        : `${baseName}-part${index + 1}.json`;
      await fs.writeFile(path.join(dataDir, fileName), JSON.stringify(chunk, null, 2), 'utf-8');
      dataFiles.push(fileName);
    }

    // Create manifest. One timestamp is shared so createdAt and updatedAt are
    // guaranteed equal for a fresh export.
    const timestamp = new Date().toISOString();
    const manifest = {
      name: this.options.name,
      type: this.options.type,
      version: options.version,
      description: this.options.description,
      createdAt: timestamp,
      updatedAt: timestamp,
      itemCount: items.length,
      schema: this.options.schema,
      labelDistribution: this.calculateLabelDistribution(),
      linkedNFRs: this.options.linkedNFRs,
      dataFiles
    };
    const manifestPath = path.join(manifestDir, `${baseName}.json`);
    await fs.writeFile(manifestPath, JSON.stringify(manifest, null, 2), 'utf-8');
  }

  /**
   * Split array into chunks
   *
   * @param array - Array to split
   * @param chunkSize - Maximum number of elements per chunk; must be greater than 0
   * @returns Array of consecutive slices of `array` (empty for an empty input)
   * @throws {RangeError} If `chunkSize` is not greater than 0 (including NaN)
   */
  chunkArray(array, chunkSize) {
    // A size <= 0 would never advance the loop; NaN would silently drop items.
    if (!(chunkSize > 0)) {
      throw new RangeError(`chunkSize must be greater than 0, got ${chunkSize}`);
    }
    const chunks = [];
    for (let start = 0; start < array.length; start += chunkSize) {
      chunks.push(array.slice(start, start + chunkSize));
    }
    return chunks;
  }

  /**
   * Import items from existing JSON file
   *
   * Accepts either a top-level array of items or an object with an `items` array.
   *
   * @param filePath - Path to JSON file
   * @throws {Error} If file cannot be read or parsed, has an unsupported shape, or contains a duplicate item ID
   */
  async importFromFile(filePath) {
    const content = await fs.readFile(filePath, 'utf-8');
    const data = JSON.parse(content);
    if (Array.isArray(data)) {
      this.addItems(data);
      return;
    }
    // Optional chaining: a JSON root of `null` must produce the format error
    // below, not a TypeError on property access.
    if (Array.isArray(data?.items)) {
      this.addItems(data.items);
      return;
    }
    throw new Error('Invalid file format: expected array of items or object with items array');
  }

  /**
   * Get corpus statistics
   *
   * @returns Object with `itemCount`, `labelDistribution` and the corpus `type`
   */
  getStatistics() {
    return {
      itemCount: this.items.size,
      labelDistribution: this.calculateLabelDistribution(),
      type: this.options.type
    };
  }
}
/**
* Pre-configured builder factories for each corpus type
*/
export const CorpusBuilders = {
  /**
   * Builder for the AI-vs-human writing corpus (boolean ground truth).
   */
  aiVsHuman: () => {
    const schema = {
      groundTruthType: 'boolean',
      formatDescription: 'true for AI-generated, false for human-written'
    };
    return new CorpusBuilder({
      type: 'ai-vs-human',
      name: 'AI vs Human Writing Corpus',
      description: 'Labeled documents for AI pattern detection validation',
      schema,
      linkedNFRs: ['NFR-ACC-001']
    });
  },

  /**
   * Builder for the codebase metadata corpus (object ground truth).
   */
  codebases: () => {
    const schema = {
      groundTruthType: 'object',
      requiredFields: ['language', 'framework', 'techStack'],
      formatDescription: 'Object with language, framework, techStack fields'
    };
    return new CorpusBuilder({
      type: 'codebases',
      name: 'Codebase Metadata Corpus',
      description: 'Codebases with verified metadata for intake validation',
      schema,
      linkedNFRs: ['NFR-ACC-002']
    });
  },

  /**
   * Builder for the requirements traceability corpus (object ground truth).
   */
  traceability: () => {
    const schema = {
      groundTruthType: 'object',
      requiredFields: ['requirementId', 'codeFiles', 'testFiles'],
      formatDescription: 'Object with requirementId, codeFiles array, testFiles array'
    };
    return new CorpusBuilder({
      type: 'traceability',
      name: 'Requirements Traceability Corpus',
      description: 'Requirements with verified traceability links',
      schema,
      linkedNFRs: ['NFR-TRACE-05', 'NFR-TRACE-06']
    });
  },

  /**
   * Builder for the security attack patterns corpus (object ground truth).
   */
  securityAttacks: () => {
    const schema = {
      groundTruthType: 'object',
      requiredFields: ['attackType', 'severity'],
      formatDescription: 'Object with attackType (sql-injection, xss, etc.) and severity (critical, high, medium, low)'
    };
    return new CorpusBuilder({
      type: 'security-attacks',
      name: 'Security Attack Patterns Corpus',
      description: 'Known attack patterns for security detection validation',
      schema,
      linkedNFRs: ['NFR-SEC-ACC-01']
    });
  },

  /**
   * Builder for the template recommendation corpus (array ground truth).
   */
  templateRecommendations: () => {
    const schema = {
      groundTruthType: 'array',
      formatDescription: 'Array of recommended template IDs'
    };
    return new CorpusBuilder({
      type: 'template-recommendations',
      name: 'Template Recommendation Corpus',
      description: 'Scenarios with expected template recommendations',
      schema,
      linkedNFRs: ['NFR-TMPL-07']
    });
  }
};
//# sourceMappingURL=corpus-builder.js.map