@astermind/astermind-synth

Version:

OmegaSynth - Label-Conditioned Synthetic Data Generator for AsterMind ELM/KELM Pipelines

1 lines • 181 kB

Source Map (JSON)

{"version":3,"file":"pipeline.mjs","sources":["../src/omegasynth/store/SyntheticFieldStore.ts","../src/omegasynth/generators/RetrievalGenerator.ts","../src/omegasynth/encoders/CharVocab.ts","../src/omegasynth/encoders/FixedLength.ts","../src/omegasynth/encoders/OneHot.ts","../src/omegasynth/encoders/StringEncoder.ts","../src/omegasynth/core/elm_utils.ts","../src/omegasynth/core/validation.ts","../src/omegasynth/core/PatternCorrector.ts","../src/omegasynth/core/SequenceContext.ts","../src/omegasynth/generators/ELMGenerator.ts","../src/omegasynth/generators/HybridGenerator.ts","../src/omegasynth/generators/ExactGenerator.ts","../src/omegasynth/generators/PerfectGenerator.ts","../node_modules/@astermindai/license-runtime/dist/base64url.js","../node_modules/@astermindai/license-runtime/dist/joseSignature.js","../node_modules/@astermindai/license-runtime/dist/time.js","../node_modules/@astermindai/license-runtime/dist/jwksCache.js","../node_modules/@astermindai/license-runtime/dist/verifyLktClient.js","../node_modules/@astermindai/license-runtime/dist/runtime.js","../node_modules/@astermindai/license-runtime/dist/index.js","../src/omegasynth/core/license.ts","../src/omegasynth/OmegaSynth.ts","../src/omegasynth/scripts/loadTrainingData.ts","../src/omegasynth/scripts/trainModel.ts","../src/omegasynth/evaluation/Metrics.ts","../src/omegasynth/scripts/testModel.ts","../src/omegasynth/scripts/validateModel.ts","../src/omegasynth/scripts/saveVersionedModel.ts","../src/omegasynth/scripts/pipeline.ts"],"sourcesContent":["/**\n * SyntheticFieldStore - Storage for labeled samples\n * Supports insert, get, and sample operations\n */\n\nimport { LabeledSample } from '../types';\n\nexport class SyntheticFieldStore {\n private store: Map<string, string[]> = new Map();\n\n /**\n * Insert a labeled sample into the store\n */\n insert(sample: LabeledSample): void {\n if (!this.store.has(sample.label)) {\n this.store.set(sample.label, []);\n }\n this.store.get(sample.label)!.push(sample.value);\n }\n\n /**\n * Insert multiple samples at once\n */\n insertMany(samples: LabeledSample[]): void {\n for (const sample of samples) {\n this.insert(sample);\n }\n }\n\n /**\n * Get all values for a given label\n */\n get(label: string): string[] {\n return this.store.get(label) || [];\n }\n\n /**\n * Sample k values uniformly at random for a given label\n */\n sample(label: string, k: number = 1): string[] {\n const values = this.get(label);\n if (values.length === 0) {\n return [];\n }\n\n const result: string[] = [];\n const indices = new Set<number>();\n\n // Simple uniform random sampling without replacement\n while (result.length < k && indices.size < values.length) {\n const idx = Math.floor(Math.random() * values.length);\n if (!indices.has(idx)) {\n indices.add(idx);\n result.push(values[idx]);\n }\n }\n\n return result;\n }\n\n /**\n * Check if a label exists in the store\n */\n hasLabel(label: string): boolean {\n return this.store.has(label);\n }\n\n /**\n * Get all labels in the store\n */\n getLabels(): string[] {\n return Array.from(this.store.keys());\n }\n\n /**\n * Get the count of samples for a label\n */\n count(label: string): number {\n return this.get(label).length;\n }\n\n /**\n * Clear all data\n */\n clear(): void {\n this.store.clear();\n }\n}\n\n\n\n","/**\n * RetrievalGenerator - Simple deterministic retrieval sampler\n * Uniform random sampling from stored labeled samples\n */\n\nimport { SyntheticFieldStore } from '../store/SyntheticFieldStore';\nimport { LabeledSample } from '../types';\n\n/**\n * Seeded random number generator for deterministic testing\n */\nclass SeededRNG {\n private seed: number;\n\n constructor(seed: number = Date.now()) {\n this.seed = seed;\n }\n\n next(): number {\n // Linear congruential generator\n this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32;\n return this.seed / 2 ** 32;\n }\n\n setSeed(seed: number): void {\n this.seed = seed;\n }\n}\n\nexport class RetrievalGenerator {\n private store: SyntheticFieldStore;\n private rng: SeededRNG;\n private seed?: number;\n\n constructor(seed?: number) {\n this.store = new SyntheticFieldStore();\n this.seed = seed;\n this.rng = new SeededRNG(seed);\n }\n\n /**\n * Ingest labeled samples into the store\n */\n ingest(samples: LabeledSample[]): void {\n this.store.insertMany(samples);\n }\n\n /**\n * Sample k values for a given label\n * Returns empty array if label doesn't exist or has no samples\n */\n sample(label: string, k: number = 1): string[] {\n const values = this.store.get(label);\n if (values.length === 0) {\n return [];\n }\n\n const result: string[] = [];\n const availableIndices = Array.from({ length: values.length }, (_, i) => i);\n\n // Sample k values (or all if k > available)\n const sampleCount = Math.min(k, values.length);\n for (let i = 0; i < sampleCount; i++) {\n const randomIndex = Math.floor(this.rng.next() * availableIndices.length);\n const selectedIndex = availableIndices.splice(randomIndex, 1)[0];\n result.push(values[selectedIndex]);\n }\n\n return result;\n }\n\n /**\n * Get a single sample (convenience method)\n */\n sampleOne(label: string): string | null {\n const samples = this.sample(label, 1);\n return samples.length > 0 ? samples[0] : null;\n }\n\n /**\n * Check if a label has samples\n */\n hasLabel(label: string): boolean {\n return this.store.hasLabel(label) && this.store.count(label) > 0;\n }\n\n /**\n * Get all available labels\n */\n getLabels(): string[] {\n return this.store.getLabels();\n }\n\n /**\n * Reset the generator (clears store and optionally resets seed)\n */\n reset(seed?: number): void {\n this.store.clear();\n if (seed !== undefined) {\n this.seed = seed;\n this.rng.setSeed(seed);\n }\n }\n}\n\n\n","/**\n * CharVocab - Character vocabulary builder\n * Builds a vocabulary from character sets and training data\n */\n\nexport class CharVocab {\n private charToIndex: Map<string, number> = new Map();\n private indexToChar: Map<number, string> = new Map();\n private size: number = 0;\n\n /**\n * Build vocabulary from a set of strings\n * @param samples Array of strings to build vocabulary from\n * @param charSet Optional predefined character set (e.g., alphanumeric + punctuation)\n */\n build(samples: string[], charSet?: string): void {\n const chars = new Set<string>();\n\n // Add padding character first (index 0) - use null character\n // This ensures index 0 is always padding\n chars.add('\\0');\n\n // Add predefined character set if provided\n if (charSet) {\n for (const char of charSet) {\n // Skip null character if it's in the charSet (we already added it)\n if (char !== '\\0') {\n chars.add(char);\n }\n }\n }\n\n // Add all characters from samples\n for (const sample of samples) {\n for (const char of sample) {\n // Skip null characters from samples (we use it for padding)\n if (char !== '\\0') {\n chars.add(char);\n }\n }\n }\n\n // Sort characters for consistent ordering, but keep null char at index 0\n const sortedChars = Array.from(chars).sort((a, b) => {\n // Ensure null char is always first\n if (a === '\\0') return -1;\n if (b === '\\0') return 1;\n return a.localeCompare(b);\n });\n\n // Build mappings\n this.charToIndex.clear();\n this.indexToChar.clear();\n this.size = sortedChars.length;\n\n sortedChars.forEach((char, index) => {\n this.charToIndex.set(char, index);\n this.indexToChar.set(index, char);\n });\n }\n\n /**\n * Get index for a character\n */\n getIndex(char: string): number {\n const index = this.charToIndex.get(char);\n if (index === undefined) {\n throw new Error(`Character '${char}' not in vocabulary`);\n }\n return index;\n }\n\n /**\n * Get character for an index\n */\n getChar(index: number): string {\n const char = this.indexToChar.get(index);\n if (char === undefined) {\n throw new Error(`Index ${index} not in vocabulary`);\n }\n return char;\n }\n\n /**\n * Check if character exists in vocabulary\n */\n hasChar(char: string): boolean {\n return this.charToIndex.has(char);\n }\n\n /**\n * Get vocabulary size\n */\n getSize(): number {\n return this.size;\n }\n\n /**\n * Get all characters in vocabulary\n */\n getChars(): string[] {\n return Array.from(this.charToIndex.keys()).sort();\n }\n\n /**\n * Get default character set (alphanumeric + common punctuation)\n */\n static getDefaultCharSet(): string {\n return 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' +\n ' !\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~';\n }\n}\n\n","/**\n * FixedLength - Utilities for fixed-length padding and truncation\n */\n\nexport class FixedLength {\n /**\n * Pad or truncate an array to a fixed length\n * @param arr Array to pad/truncate\n * @param length Target length\n * @param padValue Value to use for padding (default: 0)\n */\n static padOrTruncate<T>(arr: T[], length: number, padValue: T = 0 as T): T[] {\n if (arr.length === length) {\n return [...arr];\n }\n\n if (arr.length > length) {\n // Truncate\n return arr.slice(0, length);\n }\n\n // Pad\n const result = [...arr];\n while (result.length < length) {\n result.push(padValue);\n }\n return result;\n }\n\n /**\n * Pad or truncate a string to a fixed length\n * @param str String to pad/truncate\n * @param length Target length\n * @param padChar Character to use for padding (default: space)\n */\n static padOrTruncateString(str: string, length: number, padChar: string = ' '): string {\n if (str.length === length) {\n return str;\n }\n\n if (str.length > length) {\n // Truncate\n return str.slice(0, length);\n }\n\n // Pad\n return str + padChar.repeat(length - str.length);\n }\n}\n\n\n\n","/**\n * OneHot - One-hot encoding utilities\n */\n\nexport class OneHot {\n /**\n * Encode an index as a one-hot vector\n * @param index Index to encode\n * @param size Size of the one-hot vector\n */\n static encode(index: number, size: number): number[] {\n if (index < 0 || index >= size) {\n throw new Error(`Index ${index} out of range [0, ${size})`);\n }\n\n const vector = new Array(size).fill(0);\n vector[index] = 1;\n return vector;\n }\n\n /**\n * Decode a one-hot vector to an index\n * @param vector One-hot vector\n */\n static decode(vector: number[]): number {\n const index = vector.indexOf(1);\n if (index === -1) {\n throw new Error('Invalid one-hot vector: no element equals 1');\n }\n return index;\n }\n\n /**\n * Encode multiple indices as one-hot vectors\n * @param indices Array of indices\n * @param size Size of each one-hot vector\n */\n static encodeBatch(indices: number[], size: number): number[][] {\n return indices.map(idx => this.encode(idx, size));\n }\n\n /**\n * Decode multiple one-hot vectors to indices\n * @param vectors Array of one-hot vectors\n */\n static decodeBatch(vectors: number[][]): number[] {\n return vectors.map(vec => this.decode(vec));\n }\n}\n\n\n\n","/**\n * StringEncoder - Encodes strings to vectors and decodes back\n * Compatible with ELM/KELM pipelines\n */\n\nimport { CharVocab } from './CharVocab';\nimport { FixedLength } from './FixedLength';\nimport { OneHot } from './OneHot';\n\nexport interface StringEncoderConfig {\n maxLength: number;\n charSet?: string;\n useOneHot?: boolean; // If false, uses index-based encoding\n}\n\nexport class StringEncoder {\n private vocab: CharVocab;\n private config: StringEncoderConfig;\n\n constructor(config: StringEncoderConfig) {\n this.config = {\n useOneHot: false, // Default to index-based for efficiency\n ...config,\n };\n this.vocab = new CharVocab();\n }\n\n /**\n * Build vocabulary from training samples\n */\n buildVocab(samples: string[]): void {\n this.vocab.build(samples, this.config.charSet || CharVocab.getDefaultCharSet());\n }\n\n /**\n * Encode a string to a vector\n * @param str String to encode\n * @returns Encoded vector (either indices or one-hot)\n */\n encode(str: string): number[] {\n if (this.vocab.getSize() === 0) {\n throw new Error('Vocabulary not built. Call buildVocab() first.');\n }\n\n // Convert string to indices\n const indices: number[] = [];\n for (const char of str) {\n if (this.vocab.hasChar(char)) {\n indices.push(this.vocab.getIndex(char));\n } else {\n // For unknown characters, try to find a similar one or use space\n // If space is in vocab, use it; otherwise use 0 (which will be treated as padding)\n if (this.vocab.hasChar(' ')) {\n indices.push(this.vocab.getIndex(' '));\n } else {\n indices.push(0);\n }\n }\n }\n\n // Pad or truncate to fixed length\n const padded = FixedLength.padOrTruncate(\n indices,\n this.config.maxLength,\n 0\n );\n\n // Convert to one-hot if requested\n if (this.config.useOneHot) {\n const vocabSize = this.vocab.getSize();\n const oneHotVectors: number[] = [];\n for (const idx of padded) {\n oneHotVectors.push(...OneHot.encode(idx, vocabSize));\n }\n return oneHotVectors;\n }\n\n return padded;\n }\n\n /**\n * Decode a vector back to a string\n * @param vector Encoded vector\n * @returns Decoded string\n */\n decode(vector: number[]): string {\n if (this.vocab.getSize() === 0) {\n throw new Error('Vocabulary not built. Call buildVocab() first.');\n }\n\n let indices: number[];\n\n if (this.config.useOneHot) {\n // Decode one-hot vectors\n const vocabSize = this.vocab.getSize();\n indices = [];\n for (let i = 0; i < vector.length; i += vocabSize) {\n const oneHot = vector.slice(i, i + vocabSize);\n try {\n indices.push(OneHot.decode(oneHot));\n } catch {\n // If decoding fails, use argmax as fallback\n const maxIdx = oneHot.indexOf(Math.max(...oneHot));\n indices.push(maxIdx);\n }\n }\n // Truncate to maxLength\n indices = indices.slice(0, this.config.maxLength);\n } else {\n // Direct index-based decoding\n indices = vector.slice(0, this.config.maxLength);\n }\n\n // Convert indices to characters, stopping at first padding\n let result = '';\n const vocabSize = this.vocab.getSize();\n const paddingIdx = 0; // Padding is always index 0\n \n for (const idx of indices) {\n // Clamp index to valid range\n const clampedIdx = Math.max(0, Math.min(vocabSize - 1, Math.round(idx)));\n \n // Stop decoding at first padding index (0)\n if (clampedIdx === paddingIdx) {\n break;\n }\n \n // Try to get character for this index\n try {\n const char = this.vocab.getChar(clampedIdx);\n // Skip null characters and control characters (except space, tab, newline)\n if (char === '\\0' || (char.charCodeAt(0) < 32 && char !== ' ' && char !== '\\t' && char !== '\\n')) {\n break; // Stop at first invalid character\n }\n result += char;\n } catch {\n // Invalid index - stop decoding\n break;\n }\n }\n\n // Trim trailing whitespace but preserve internal spaces\n return result.trimEnd();\n }\n\n /**\n * Encode multiple strings\n */\n encodeBatch(strings: string[]): number[][] {\n return strings.map(str => this.encode(str));\n }\n\n /**\n * Decode multiple vectors\n */\n decodeBatch(vectors: number[][]): string[] {\n return vectors.map(vec => this.decode(vec));\n }\n\n /**\n * Get the output vector size\n */\n getVectorSize(): number {\n if (this.config.useOneHot) {\n return this.config.maxLength * this.vocab.getSize();\n }\n return this.config.maxLength;\n }\n\n /**\n * Get vocabulary size\n */\n getVocabSize(): number {\n return this.vocab.getSize();\n }\n\n /**\n * Get vocabulary\n */\n getVocab(): CharVocab {\n return this.vocab;\n }\n}\n\n","/**\n * ELM utilities for OmegaSynth\n * Helper functions for working with ELM models\n */\n\n/**\n * Create one-hot vector for a label index\n */\nexport function oneHotLabel(labelIndex: number, numLabels: number): number[] {\n const vector = new Array(numLabels).fill(0);\n if (labelIndex >= 0 && labelIndex < numLabels) {\n vector[labelIndex] = 1;\n }\n return vector;\n}\n\n/**\n * Generate random noise vector\n */\nexport function generateNoiseVector(size: number, seed?: number): number[] {\n const rng = seed !== undefined ? new SeededRNG(seed) : null;\n const noise: number[] = [];\n for (let i = 0; i < size; i++) {\n const value = rng ? rng.next() : Math.random();\n // Normalize to [-1, 1]\n noise.push(value * 2 - 1);\n }\n return noise;\n}\n\n/**\n * Seeded random number generator\n */\nclass SeededRNG {\n private seed: number;\n\n constructor(seed: number) {\n this.seed = seed;\n }\n\n next(): number {\n this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32;\n return this.seed / 2 ** 32;\n }\n}\n\n\n\n","/**\n * Label-specific validation and cleaning utilities\n */\n\nexport interface ValidationResult {\n isValid: boolean;\n cleaned: string;\n reason?: string;\n}\n\n/**\n * Validate and clean a generated string based on its label type\n */\nexport function validateForLabel(label: string, value: string): ValidationResult {\n if (!value || value.length === 0) {\n return { isValid: false, cleaned: '', reason: 'Empty value' };\n }\n\n // Get label-specific validator\n const validator = getValidatorForLabel(label);\n return validator(value);\n}\n\n/**\n * Get validator function for a specific label\n */\nfunction getValidatorForLabel(label: string): (value: string) => ValidationResult {\n switch (label) {\n case 'first_name':\n case 'last_name':\n return validateName;\n case 'phone_number':\n return validatePhoneNumber;\n case 'email':\n return validateEmail;\n case 'street_address':\n return validateStreetAddress;\n case 'city':\n case 'state':\n case 'country':\n return validateLocation;\n case 'company_name':\n case 'job_title':\n case 'product_name':\n return validateText;\n case 'color':\n return validateColor;\n case 'uuid':\n return validateUUID;\n case 'date':\n return validateDate;\n case 'credit_card_type':\n case 'device_type':\n return validateText;\n default:\n return validateGeneric;\n }\n}\n\n/**\n * Validate name (first_name, last_name)\n * Rules: Letters only, optional hyphens/apostrophes, no numbers\n */\nfunction validateName(value: string): ValidationResult {\n // First check for placeholder patterns in original value (before cleaning)\n const lowerOriginal = value.toLowerCase();\n // Reject \"Name\" followed by numbers (e.g., \"Name97\", \"name123\")\n if (/^name\\d+$/i.test(value)) {\n return { isValid: false, cleaned: '', reason: 'Placeholder name with numbers' };\n }\n \n // Remove all non-letter characters except hyphens and apostrophes\n let cleaned = value.replace(/[^a-zA-Z\\-\\'\\s]/g, '');\n \n // Remove numbers completely\n cleaned = cleaned.replace(/[0-9]/g, '');\n \n // Remove excessive special characters\n cleaned = cleaned.replace(/[-']{2,}/g, '-'); // Multiple hyphens/apostrophes -> single\n cleaned = cleaned.replace(/^[-']+|[-']+$/g, ''); // Remove leading/trailing\n \n // Trim and normalize whitespace\n cleaned = cleaned.trim().replace(/\\s+/g, ' ');\n \n // Must be at least 2 characters and contain at least one letter\n if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) {\n return { isValid: false, cleaned: '', reason: 'Too short or no letters' };\n }\n \n // Reject common placeholder names (case-insensitive) after cleaning\n const lowerCleaned = cleaned.toLowerCase();\n // Check for exact matches\n if (lowerCleaned === 'name' || lowerCleaned === 'firstname' || lowerCleaned === 'lastname' || \n lowerCleaned === 'surname') {\n return { isValid: false, cleaned: '', reason: 'Placeholder name' };\n }\n // Check for \"name\" followed by very short variations\n if (lowerCleaned.startsWith('name') && lowerCleaned.length <= 6) {\n return { isValid: false, cleaned: '', reason: 'Placeholder name' };\n }\n \n // Max length check\n if (cleaned.length > 30) {\n cleaned = cleaned.substring(0, 30).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate phone number\n * Rules: Digits, dashes, parentheses, dots, plus, spaces\n */\nfunction validatePhoneNumber(value: string): ValidationResult {\n // Keep only valid phone characters\n let cleaned = value.replace(/[^0-9\\-\\+\$\$\\.\\s]/g, '');\n \n // Remove excessive special characters\n cleaned = cleaned.replace(/[-\\.]{2,}/g, '-');\n cleaned = cleaned.replace(/\\s+/g, ' ');\n cleaned = cleaned.trim();\n \n // Count digits\n const digitCount = (cleaned.match(/\\d/g) || []).length;\n \n // Must have at least 7 digits (minimum phone number)\n if (digitCount < 7) {\n return { isValid: false, cleaned: '', reason: 'Too few digits' };\n }\n \n // Max length check\n if (cleaned.length > 25) {\n cleaned = cleaned.substring(0, 25).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate email\n * Rules: Must contain @, valid characters before and after\n */\nfunction validateEmail(value: string): ValidationResult {\n // Keep valid email characters\n let cleaned = value.replace(/[^a-zA-Z0-9@\\.\\-\\_]/g, '');\n \n // Must contain @\n if (!cleaned.includes('@')) {\n return { isValid: false, cleaned: '', reason: 'Missing @ symbol' };\n }\n \n const parts = cleaned.split('@');\n if (parts.length !== 2) {\n return { isValid: false, cleaned: '', reason: 'Invalid @ usage' };\n }\n \n const [local, domain] = parts;\n \n // Local part must have at least 1 character\n if (!local || local.length === 0) {\n return { isValid: false, cleaned: '', reason: 'Empty local part' };\n }\n \n // Domain must have at least 3 characters (x.y)\n if (!domain || domain.length < 3) {\n return { isValid: false, cleaned: '', reason: 'Invalid domain' };\n }\n \n // Domain must contain at least one dot\n if (!domain.includes('.')) {\n return { isValid: false, cleaned: '', reason: 'Domain missing dot' };\n }\n \n // Remove leading/trailing dots and hyphens\n const cleanLocal = local.replace(/^[\\.\\-]+|[\\.\\-]+$/g, '');\n const cleanDomain = domain.replace(/^[\\.\\-]+|[\\.\\-]+$/g, '');\n \n if (!cleanLocal || !cleanDomain) {\n return { isValid: false, cleaned: '', reason: 'Invalid format after cleaning' };\n }\n \n cleaned = `${cleanLocal}@${cleanDomain}`;\n \n // Max length check\n if (cleaned.length > 50) {\n cleaned = cleaned.substring(0, 50);\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate street address\n * Rules: Numbers, letters, spaces, common address characters\n */\nfunction validateStreetAddress(value: string): ValidationResult {\n // Keep valid address characters\n let cleaned = value.replace(/[^a-zA-Z0-9\\s\\-\\#\\.\\,]/g, '');\n cleaned = cleaned.trim().replace(/\\s+/g, ' ');\n \n // Must have at least 5 characters\n if (cleaned.length < 5) {\n return { isValid: false, cleaned: '', reason: 'Too short' };\n }\n \n // Max length check\n if (cleaned.length > 50) {\n cleaned = cleaned.substring(0, 50).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate location (city, state, country)\n * Rules: Mostly letters, optional spaces/hyphens\n */\nfunction validateLocation(value: string): ValidationResult {\n // Keep letters, spaces, hyphens, apostrophes\n let cleaned = value.replace(/[^a-zA-Z\\s\\-\\']/g, '');\n cleaned = cleaned.trim().replace(/\\s+/g, ' ');\n \n // Must have at least 2 characters and contain letters\n if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) {\n return { isValid: false, cleaned: '', reason: 'Too short or no letters' };\n }\n \n // Max length check\n if (cleaned.length > 30) {\n cleaned = cleaned.substring(0, 30).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate text (company_name, job_title, product_name)\n * Rules: Letters, numbers, spaces, common punctuation\n */\nfunction validateText(value: string): ValidationResult {\n // Keep alphanumeric and common punctuation\n let cleaned = value.replace(/[^a-zA-Z0-9\\s\\-\\'\\.\\,]/g, '');\n cleaned = cleaned.trim().replace(/\\s+/g, ' ');\n \n // Must have at least 2 characters\n if (cleaned.length < 2) {\n return { isValid: false, cleaned: '', reason: 'Too short' };\n }\n \n // Max length check\n if (cleaned.length > 50) {\n cleaned = cleaned.substring(0, 50).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate color\n * Rules: Letters only, maybe spaces\n */\nfunction validateColor(value: string): ValidationResult {\n // Keep letters and spaces only\n let cleaned = value.replace(/[^a-zA-Z\\s]/g, '');\n cleaned = cleaned.trim().replace(/\\s+/g, ' ');\n \n // Must have at least 3 characters\n if (cleaned.length < 3) {\n return { isValid: false, cleaned: '', reason: 'Too short' };\n }\n \n // Max length check\n if (cleaned.length > 20) {\n cleaned = cleaned.substring(0, 20).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate UUID\n * Rules: Should follow UUID format (8-4-4-4-12 hex digits with dashes)\n */\nfunction validateUUID(value: string): ValidationResult {\n // Keep hex characters and dashes\n let cleaned = value.replace(/[^0-9a-fA-F\\-]/g, '');\n \n // Try to format as UUID if it has enough characters\n const hexOnly = cleaned.replace(/-/g, '');\n if (hexOnly.length >= 32) {\n // Format as UUID: 8-4-4-4-12\n const formatted = [\n hexOnly.substring(0, 8),\n hexOnly.substring(8, 12),\n hexOnly.substring(12, 16),\n hexOnly.substring(16, 20),\n hexOnly.substring(20, 32)\n ].join('-');\n cleaned = formatted;\n }\n \n // Must have at least 32 hex characters\n const hexCount = cleaned.replace(/-/g, '').length;\n if (hexCount < 32) {\n return { isValid: false, cleaned: '', reason: 'Too few hex characters' };\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Validate date\n * Rules: Should follow date format (YYYY-MM-DD or similar)\n */\nfunction validateDate(value: string): ValidationResult {\n // Keep digits, dashes, slashes\n let cleaned = value.replace(/[^0-9\\-\\/]/g, '');\n \n // Must have at least 8 digits (YYYYMMDD)\n const digitCount = (cleaned.match(/\\d/g) || []).length;\n if (digitCount < 8) {\n return { isValid: false, cleaned: '', reason: 'Too few digits' };\n }\n \n // Max length check\n if (cleaned.length > 20) {\n cleaned = cleaned.substring(0, 20).trim();\n }\n \n return { isValid: true, cleaned };\n}\n\n/**\n * Generic validator for unknown labels\n */\nfunction validateGeneric(value: string): ValidationResult {\n // Remove control characters\n let cleaned = value.replace(/[\\x00-\\x1F\\x7F]/g, '');\n cleaned = cleaned.trim().replace(/\\s+/g, ' ');\n \n if (cleaned.length < 1) {\n return { isValid: false, cleaned: '', reason: 'Empty after cleaning' };\n }\n \n return { isValid: true, cleaned };\n}\n\n","/**\n * PatternCorrector - Post-processing pattern matching and correction\n * Learns patterns from training data and applies them to generated samples\n */\n\nimport { LabeledSample } from '../types';\n\nexport interface Pattern {\n label: string;\n examples: string[];\n commonPrefixes: string[];\n commonSuffixes: string[];\n charFrequency: Map<string, number>;\n lengthDistribution: number[];\n}\n\nexport class PatternCorrector {\n private patterns: Map<string, Pattern> = new Map();\n\n /**\n * Learn patterns from training data\n */\n learnPatterns(samples: LabeledSample[]): void {\n const byLabel = new Map<string, string[]>();\n \n // Group samples by label\n for (const sample of samples) {\n if (!byLabel.has(sample.label)) {\n byLabel.set(sample.label, []);\n }\n byLabel.get(sample.label)!.push(sample.value);\n }\n\n // Learn patterns for each label\n for (const [label, values] of byLabel.entries()) {\n this.learnPattern(label, values);\n }\n }\n\n /**\n * Learn pattern for a specific label\n */\n private learnPattern(label: string, examples: string[]): void {\n if (examples.length === 0) return;\n\n // Extract common prefixes (first 1-3 characters)\n const prefixCounts = new Map<string, number>();\n const suffixCounts = new Map<string, number>();\n const charFreq = new Map<string, number>();\n const lengths: number[] = [];\n\n for (const example of examples) {\n lengths.push(example.length);\n \n // Prefixes\n for (let len = 1; len <= Math.min(3, example.length); len++) {\n const prefix = example.substring(0, len);\n prefixCounts.set(prefix, (prefixCounts.get(prefix) || 0) + 1);\n }\n \n // Suffixes\n for (let len = 1; len <= Math.min(3, example.length); len++) {\n const suffix = example.substring(example.length - len);\n suffixCounts.set(suffix, (suffixCounts.get(suffix) || 0) + 1);\n }\n \n // Character frequency\n for (const char of example) {\n charFreq.set(char, (charFreq.get(char) || 0) + 1);\n }\n }\n\n // Get common prefixes (appear in >10% of examples - lowered from 20% for better pattern matching)\n const commonPrefixes = Array.from(prefixCounts.entries())\n .filter(([_, count]) => count / examples.length > 0.1)\n .sort((a, b) => b[1] - a[1])\n .slice(0, 15) // Increased from 10 to 15\n .map(([prefix]) => prefix);\n\n // Get common suffixes (appear in >10% of examples - lowered from 20% for better pattern matching)\n const commonSuffixes = Array.from(suffixCounts.entries())\n .filter(([_, count]) => count / examples.length > 0.1)\n .sort((a, b) => b[1] - a[1])\n .slice(0, 15) // Increased from 10 to 15\n .map(([suffix]) => suffix);\n\n // Normalize character frequencies\n const totalChars = Array.from(charFreq.values()).reduce((a, b) => a + b, 0);\n for (const [char, count] of charFreq.entries()) {\n charFreq.set(char, count / totalChars);\n }\n\n this.patterns.set(label, {\n label,\n examples,\n commonPrefixes,\n commonSuffixes,\n charFrequency: charFreq,\n lengthDistribution: lengths,\n });\n }\n\n /**\n * Correct a generated string using learned patterns\n */\n correct(generated: string, label: string): string {\n const pattern = this.patterns.get(label);\n if (!pattern) {\n return generated; // No pattern learned, return as-is\n }\n\n let corrected = generated;\n\n // 1. Check if it matches a known example (exact match)\n if (pattern.examples.includes(generated)) {\n return generated; // Already perfect\n }\n\n // 2. Check prefix/suffix patterns\n const hasValidPrefix = pattern.commonPrefixes.some(prefix => \n corrected.toLowerCase().startsWith(prefix.toLowerCase())\n );\n const hasValidSuffix = pattern.commonSuffixes.some(suffix => \n corrected.toLowerCase().endsWith(suffix.toLowerCase())\n );\n\n // 3. If no valid prefix, try to fix it\n if (!hasValidPrefix && pattern.commonPrefixes.length > 0) {\n const mostCommonPrefix = pattern.commonPrefixes[0];\n // Only fix if the generated string is very different\n if (corrected.length > 0 && !corrected.toLowerCase().startsWith(mostCommonPrefix[0].toLowerCase())) {\n // Don't change, but note it for scoring\n }\n }\n\n // 4. Check character frequency (remove unlikely characters)\n const charFreq = pattern.charFrequency;\n let cleaned = '';\n for (const char of corrected) {\n const freq = charFreq.get(char) || 0;\n // Keep character if it appears in >0.5% of training data (lowered from 1%), or if it's common (space, etc.)\n if (freq > 0.005 || /[a-zA-Z0-9\\s]/.test(char)) {\n cleaned += char;\n }\n }\n if (cleaned.length > 0) {\n corrected = cleaned;\n }\n\n // 5. Check length distribution\n const avgLength = pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length;\n const minLength = Math.min(...pattern.lengthDistribution);\n const maxLength = Math.max(...pattern.lengthDistribution);\n \n // Truncate if too long\n if (corrected.length > maxLength * 1.5) {\n corrected = corrected.substring(0, Math.floor(maxLength * 1.2));\n }\n\n return corrected;\n }\n\n /**\n * Score how well a generated string matches the pattern\n */\n score(generated: string, label: string): number {\n const pattern = this.patterns.get(label);\n if (!pattern) {\n return 0.5; // Unknown pattern, neutral score\n }\n\n let score = 0;\n let factors = 0;\n\n // 1. Exact match bonus\n if (pattern.examples.includes(generated)) {\n return 1.0; // Perfect match\n }\n\n // 2. Prefix match (30% weight)\n const prefixMatch = pattern.commonPrefixes.some(prefix => \n generated.toLowerCase().startsWith(prefix.toLowerCase())\n );\n score += prefixMatch ? 0.3 : 0;\n factors++;\n\n // 3. Suffix match (20% weight)\n const suffixMatch = pattern.commonSuffixes.some(suffix => \n generated.toLowerCase().endsWith(suffix.toLowerCase())\n );\n score += suffixMatch ? 0.2 : 0;\n factors++;\n\n // 4. Character frequency match (30% weight)\n const charFreq = pattern.charFrequency;\n let charScore = 0;\n let charCount = 0;\n for (const char of generated) {\n const freq = charFreq.get(char) || 0;\n charScore += freq;\n charCount++;\n }\n score += (charCount > 0 ? charScore / charCount : 0) * 0.3;\n factors++;\n\n // 5. Length match (20% weight)\n const avgLength = pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length;\n const lengthDiff = Math.abs(generated.length - avgLength) / avgLength;\n const lengthScore = Math.max(0, 1 - lengthDiff);\n score += lengthScore * 0.2;\n factors++;\n\n return factors > 0 ? score / factors : 0;\n }\n\n /**\n * Get pattern for a label\n */\n getPattern(label: string): Pattern | undefined {\n return this.patterns.get(label);\n }\n}\n\n","/**\n * SequenceContext - Add sequence context to generation\n * Uses previous characters to inform next character prediction\n */\n\nexport class SequenceContext {\n private ngramPatterns: Map<string, Map<string, number>> = new Map();\n private n: number; // n-gram size\n\n constructor(n: number = 3) {\n this.n = n;\n }\n\n /**\n * Learn n-gram patterns from training data\n */\n learnPatterns(samples: string[]): void {\n this.ngramPatterns.clear();\n \n for (const sample of samples) {\n // Extract n-grams\n for (let i = 0; i <= sample.length - this.n; i++) {\n const ngram = sample.substring(i, i + this.n - 1); // Context (n-1 chars)\n const nextChar = sample[i + this.n - 1]; // Next character\n \n if (!this.ngramPatterns.has(ngram)) {\n this.ngramPatterns.set(ngram, new Map());\n }\n \n const charMap = this.ngramPatterns.get(ngram)!;\n charMap.set(nextChar, (charMap.get(nextChar) || 0) + 1);\n }\n }\n }\n\n /**\n * Get next character probabilities given context\n */\n getNextCharProbs(context: string): Map<string, number> {\n // Use last n-1 characters as context\n const ctx = context.length >= this.n - 1 \n ? context.substring(context.length - (this.n - 1))\n : context;\n \n const charCounts = this.ngramPatterns.get(ctx);\n if (!charCounts || charCounts.size === 0) {\n return new Map();\n }\n \n // Convert counts to probabilities\n const total = Array.from(charCounts.values()).reduce((a, b) => a + b, 0);\n const probs = new Map<string, number>();\n \n for (const [char, count] of charCounts.entries()) {\n probs.set(char, count / total);\n }\n \n return probs;\n }\n\n /**\n * Suggest next character based on context\n */\n suggestNextChar(context: string): string | null {\n const probs = this.getNextCharProbs(context);\n if (probs.size === 0) {\n return null;\n }\n \n // Return most likely character\n let bestChar = '';\n let bestProb = 0;\n \n for (const [char, prob] of probs.entries()) {\n if (prob > bestProb) {\n bestProb = prob;\n bestChar = char;\n }\n }\n \n return bestChar;\n }\n\n /**\n * Score how well a character fits the context\n */\n scoreChar(context: string, char: string): number {\n const probs = this.getNextCharProbs(context);\n return probs.get(char) || 0;\n }\n}\n\n","/**\n * ELMGenerator - Label-conditioned string generator using ELM\n * Trains an ELM to generate encoded strings based on labels + noise\n */\n\nimport { ELM } from '@astermind/astermind-elm';\nimport { StringEncoder } from '../encoders/StringEncoder';\nimport { LabeledSample } from '../types';\nimport { oneHotLabel, generateNoiseVector } from '../core/elm_utils';\nimport { validateForLabel } from '../core/validation';\nimport { PatternCorrector } from '../core/PatternCorrector';\nimport { SequenceContext } from '../core/SequenceContext';\n\n// Type definitions - ELM will be available at runtime from AsterMind\n// Using any for now to allow runtime import from dist or external package\ntype ELMConfig = {\n useTokenizer: false;\n inputSize: number;\n categories: string[];\n hiddenUnits: number;\n activation?: 'tanh' | 'relu' | 'leakyrelu' | 'sigmoid' | 'linear' | 'gelu';\n ridgeLambda?: number;\n task?: 'classification' | 'regression';\n};\n\ntype ELMModel = {\n trainFromData(X: number[][], Y: number[] | number[][], options?: any): any;\n predictLogitsFromVector(vec: number[]): number[];\n predictProbaFromVector?(vec: number[]): number[]; // For classification\n};\n\nexport interface ELMGeneratorConfig {\n maxLength: number;\n hiddenUnits?: number;\n activation?: 'tanh' | 'relu' | 'leakyrelu' | 'sigmoid' | 'linear' | 'gelu';\n ridgeLambda?: number;\n noiseSize?: number; // Size of noise vector\n seed?: number;\n useOneHot?: boolean; // Use one-hot encoding (better for classification)\n useClassification?: boolean; // Use classification instead of regression\n usePatternCorrection?: boolean; // Apply pattern-based correction\n}\n\nexport class ELMGenerator {\n private encoder: StringEncoder;\n private elm: ELMModel | null = null;\n private labels: string[] = [];\n private config: ELMGeneratorConfig;\n private noiseSize: number;\n private patternCorrector: PatternCorrector | null = null;\n private sequenceContext: SequenceContext | null = null;\n private useClassification: boolean;\n\n constructor(config: ELMGeneratorConfig) {\n this.config = {\n hiddenUnits: 128,\n activation: 'relu',\n ridgeLambda: 0.01,\n noiseSize: 32,\n useOneHot: false, // Default to false for memory efficiency (can enable for better accuracy)\n useClassification: false, // Default to regression for compatibility\n usePatternCorrection: true,\n ...config,\n };\n this.noiseSize = this.config.noiseSize!;\n this.useClassification = this.config.useClassification!;\n this.encoder = new StringEncoder({\n maxLength: config.maxLength,\n useOneHot: this.config.useOneHot ?? false, // Default to false for memory efficiency\n });\n \n if (this.config.usePatternCorrection) {\n this.patternCorrector = new PatternCorrector();\n }\n \n // Always use sequence context for better generation\n this.sequenceContext = new SequenceContext(3); // 3-grams\n }\n\n /**\n * Train the ELM generator on labeled samples\n */\n train(samples: LabeledSample[]): void {\n if (samples.length === 0) {\n throw new Error('Cannot train on empty dataset');\n }\n\n // Extract unique labels\n const uniqueLabels = Array.from(new Set(samples.map(s => s.label)));\n this.labels = uniqueLabels;\n\n // Extract all values for vocabulary building\n const allValues = samples.map(s => s.value);\n this.encoder.buildVocab(allValues);\n \n // Learn patterns if pattern correction is enabled\n if (this.patternCorrector) {\n this.patternCorrector.learnPatterns(samples);\n }\n \n // Learn sequence context\n if (this.sequenceContext) {\n this.sequenceContext.learnPatterns(allValues);\n }\n\n // Build training data\n const X: number[][] = [];\n const Y: number[][] = [];\n\n for (const sample of samples) {\n const labelIndex = this.labels.indexOf(sample.label);\n if (labelIndex === -1) {\n continue;\n }\n\n // Input: concat(oneHot(label), noiseVector)\n const labelOneHot = oneHotLabel(labelIndex, this.labels.length);\n const noise = generateNoiseVector(this.noiseSize, this.config.seed);\n const inputVector = [...labelOneHot, ...noise];\n X.push(inputVector);\n\n // Target: encoded(value)\n const encodedValue = this.encoder.encode(sample.value);\n Y.push(encodedValue);\n }\n\n if (X.length === 0) {\n throw new Error('No valid training samples after processing');\n }\n\n // Create ELM config\n const inputSize = this.labels.length + this.noiseSize;\n const outputSize = this.encoder.getVectorSize();\n\n const elmConfig: ELMConfig = {\n useTokenizer: false, // Numeric mode\n inputSize: inputSize,\n categories: this.useClassification ? [] : [], // For classification, we'll handle it differently\n hiddenUnits: this.config.hiddenUnits!,\n activation: this.config.activation!,\n // Use lower regularization for better pattern learning\n ridgeLambda: this.config.ridgeLambda! * 0.1, // Reduce regularization\n task: this.useClassification ? 'classification' : 'regression',\n };\n\n // Create and train ELM - resolve constructor robustly across CJS/ESM shapes\n // Replace dynamic require with direct constructor\n this.elm = new (ELM as any)(elmConfig) as unknown as ELMModel;\n this.elm.trainFromData(X, Y);\n }\n\n /**\n * Generate a string for a given label\n * @param label Label to generate for\n * @param noiseSeed Optional seed for noise generation (for deterministic output)\n */\n generate(label: string, noiseSeed?: number): string {\n if (!this.elm) {\n throw new Error('Model not trained. Call train() first.');\n }\n\n const labelIndex = this.labels.indexOf(label);\n if (labelIndex === -1) {\n throw new Error(`Label '${label}' not found in training data`);\n }\n\n // Create input: concat(oneHot(label), noiseVector)\n const labelOneHot = oneHotLabel(labelIndex, this.labels.length);\n const noise = generateNoiseVector(\n this.noiseSize,\n noiseSeed !== undefined ? noiseSeed : this.config.seed\n );\n const inputVector = [...labelOneHot, ...noise];\n\n // Predict based on mode\n let decoded: string;\n \n if (this.useClassification && this.config.useOneHot && typeof (this.elm as any).predictProbaFromVector === 'function') {\n // Classification mode with one-hot: use probabilities\n const vocabSize = this.encoder.getVocabSize();\n const maxLength = this.config.maxLength;\n const vectorSize = vocabSize * maxLength;\n \n // Get probabilities for each position\n const probs = (this.elm as any).predictProbaFromVector(inputVector);\n \n // Reshape to [maxLength, vocabSize] and use argmax\n const indices: number[] = [];\n for (let pos = 0; pos < maxLength; pos++) {\n const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize);\n const maxIdx = posProbs.indexOf(Math.max(...posProbs));\n indices.push(maxIdx);\n }\n \n decoded = this.encoder.decode(indices);\n } else {\n // Regression mode: use logits and round\n const prediction = this.elm.predictLogitsFromVector(inputVector);\n \n // Convert logits to indices with proper quantization\n const vocabSize = this.encoder.getVocabSize();\n const indices = prediction.map(val => {\n // Clamp value to reasonable range first (prevent extreme values)\n const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val));\n // Round to nearest integer\n const rounded = Math.round(clamped);\n // Clamp to valid vocabulary range [0, vocabSize-1]\n const idx = Math.max(0, Math.min(vocabSize - 1, rounded));\n return idx;\n });\n \n decoded = this.encoder.decode(indices);\n }\n \n // Apply pattern correction if enabled\n let corrected = decoded;\n if (this.patternCorrector) {\n corrected = this.patternCorrector.correct(decoded, label);\n }\n \n // Apply sequence context refinement\n if (this.sequenceContext && corrected.length > 0) {\n corrected = this.refineWithSequenceContext(corrected, label);\n }\n \n // Validate and clean the decoded string using label-specific rules\n const validation = validateForLabel(label, corrected);\n \n // If validation fails, try to generate again with different noise (up to 3 attempts)\n if (!validation.isValid) {\n for (let attempt = 0; attempt < 3; attempt++) {\n const baseSeed = noiseSeed !== undefined ? noiseSeed : (this.config.seed ?? Date.now());\n const newNoise = generateNoiseVector(\n this.noiseSize, baseSeed + attempt + 1000\n );\n const newInputVector = [...labelOneHot, ...newNoise];\n \n let newDecoded: string;\n if (this.useClassification && this.config.useOneHot && typeof (this.elm as any).predictProbaFromVector === 'function') {\n const vocabSize = this.encoder.getVocabSize();\n const maxLength = this.config.maxLength;\n const probs = (this.elm as any).predictProbaFromVector(newInputVector);\n const newIndices: number[] = [];\n for (let pos = 0; pos < maxLength; pos++) {\n const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize);\n const maxIdx = posProbs.indexOf(Math.max(...posProbs));\n newIndices.push(maxIdx);\n }\n newDecoded = this.encoder.decode(newIndices);\n } else {\n const newPrediction = this.elm.predictLogitsFromVector(newInputVector);\n const vocabSize = this.encoder.getVocabSize();\n const newIndices = newPrediction.map(val => {\n const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val));\n const rounded = Math.round(clamped);\n return Math.max(0, Math.min(vocabSize - 1, rounded));\n });\n newDecoded = this.encoder.decode(newIndices);\n }\n \n // Apply pattern correction\n if (this.patternCorrector) {\n newDecoded = this.patternCorrector.correct(newDecoded, label);\n }\n \n const newValidation = validateForLabel(label, newDecoded);\n if (newValidation.isValid) {\n return newValidation.cleaned;\n }\n }\n // If all attempts fail, return empty string\n return '';\n }\n \n return validation.cleaned;\n }\n\n /**\n * Generate multiple strings for a label with confidence-based selection\n */\n generateBatch(label: string, count: number): string[] {\n const candidates: Array<{ value: string; score: number }> = [];\n const seen = new Set<string>();\n let attempts = 0;\n const maxAttempts = count * 10; // Allow up to 10x attempts to get valid unique samples\n \n // Generate candidates with scoring\n while (attempts < maxAttempts) {\n const seed = this.config.seed !== undefined \n ? this.config.seed + attempts \n : Date.now() + attempts;\n \n try {\n const generated = this.generate(label, seed);\n \n if (generated && generated.length > 0 && !seen.has(generated.toLowerCase())) {\n // Score the candidate\n let score = 1.0;\n \n // Pattern match score\n if (this.patternCorrector) {\n score = this.patternCorrector.score(generated, label);\n }\n \n // Validation score (valid = 1.0, invalid = 0.0)\n const validation = validateForLabel(label, generated);\n if (!validation.isValid) {\n score = 0;\n }\n \n candidates.push({ value: generated, score });\n seen.add(generated.toLowerCase());\n }\n } catch (error) {\n // Skip errors\n }\n \n attempts++;\n }\n \n // Sort by score and return top candidates\n candidates.sort((a, b) => b.score - a.score);\n return candidates.slice(0, count).map(c => c.value);\n }\n\n /**\n * Refine generated string using sequence context\n */\n private refineWithSequenceContext(generated: string, label: string): string {\n if (!this.