UNPKG

@astermind/astermind-synth

Version:

OmegaSynth - Label-Conditioned Synthetic Data Generator for AsterMind ELM/KELM Pipelines

1,396 lines (1,384 loc) 97.8 kB
import { setLicenseToken, hasFeature, getLicenseState, initLicenseRuntime, requireFeature } from '@astermindai/license-runtime'; import { ELM } from '@astermind/astermind-elm'; import * as fs from 'fs'; import * as path from 'path'; /** * SyntheticFieldStore - Storage for labeled samples * Supports insert, get, and sample operations */ class SyntheticFieldStore { constructor() { this.store = new Map(); } /** * Insert a labeled sample into the store */ insert(sample) { if (!this.store.has(sample.label)) { this.store.set(sample.label, []); } this.store.get(sample.label).push(sample.value); } /** * Insert multiple samples at once */ insertMany(samples) { for (const sample of samples) { this.insert(sample); } } /** * Get all values for a given label */ get(label) { return this.store.get(label) || []; } /** * Sample k values uniformly at random for a given label */ sample(label, k = 1) { const values = this.get(label); if (values.length === 0) { return []; } const result = []; const indices = new Set(); // Simple uniform random sampling without replacement while (result.length < k && indices.size < values.length) { const idx = Math.floor(Math.random() * values.length); if (!indices.has(idx)) { indices.add(idx); result.push(values[idx]); } } return result; } /** * Check if a label exists in the store */ hasLabel(label) { return this.store.has(label); } /** * Get all labels in the store */ getLabels() { return Array.from(this.store.keys()); } /** * Get the count of samples for a label */ count(label) { return this.get(label).length; } /** * Clear all data */ clear() { this.store.clear(); } } /** * License management for OmegaSynth * Wraps @astermindai/license-runtime with convenience functions */ let initialized = false; /** * Initialize the license runtime singleton. * Must be called before any other license functions. * Automatically loads token from ASTERMIND_LICENSE_TOKEN environment variable if present. * * PROFESSIONAL LICENSING APPROACH: * - Always requires a valid license key (strict mode) * - Trial keys are obtained from the license server and have expiration dates * - No "eval mode" bypass - all usage requires a valid key * - For testing, use a test/dev key or mock the license runtime * * This follows industry best practices where: * 1. All users (including evaluators) must obtain a trial key * 2. License validation happens server-side via JWT verification * 3. Trial keys expire after a set period (e.g., 30 days) * 4. No environment-based bypasses are possible */ function initializeLicense() { if (initialized) { return; // Already initialized } // Always use strict mode - require valid license key // Trial keys are just license keys with expiration dates initLicenseRuntime({ jwksUrl: "https://license.astermind.ai/.well-known/astermind-license-keys.json", expectedIss: "https://license.astermind.ai", expectedAud: "astermind-synth", jwksMaxAgeSeconds: 300, mode: 'strict' // Always strict - no bypasses }); // Load token from environment variable if present const token = process.env.ASTERMIND_LICENSE_TOKEN; if (token) { // Note: setLicenseToken is async, but we can't await in module initialization // The token will be set asynchronously setLicenseToken(token).catch(err => { console.warn("Failed to set license token from environment:", err); }); } initialized = true; } /** * Require a valid license before proceeding. * Throws if license is invalid, expired, or feature is missing. * * SECURITY: This function ONLY trusts payloads that have been verified by the license-runtime. * We never decode or trust unverified JWT payloads to prevent bypass attacks. * * FLEXIBLE AUDIENCE CHECK: * - If the token's audience matches "astermind-synth", it passes normally * - If the token's audience doesn't match but the token includes the "astermind-synth" feature, * it will still pass (allows tokens with audience "astermind-elm" or other products that include this feature) * - This flexibility ONLY applies when the license-runtime has VERIFIED the token signature * * All users (including evaluators) must obtain a trial or production license key. * Trial keys can be obtained from: https://license.astermind.ai/v1/trial/create */ /** * Check if the token has a valid feature for astermind-synth. * Accepts both "astermind-synth" and "astermind-elm-basic" features. * * SECURITY: Only call this with verified payloads from license-runtime state. */ function hasValidFeature(features) { return Array.isArray(features) && (features.includes("astermind-synth") || features.includes("astermind-elm-basic")); } function requireLicense() { try { requireFeature("astermind-synth"); } catch (error) { const state = getLicenseState(); // SECURITY: Only trust payloads that have been verified by license-runtime. // If state.payload exists, it means the license-runtime has successfully verified // the JWT signature. We can safely check features in verified payloads. if (state.payload && hasValidFeature(state.payload.features)) { // Feature is present in VERIFIED payload, even if audience doesn't match - allow it // This handles tokens with audience "astermind-elm" that include "astermind-synth" feature return; } // Handle "missing" state: Token not set or async setLicenseToken() hasn't completed if (state.status === 'missing') { // SECURITY: Do NOT decode unverified tokens. Wait for license-runtime verification. // If token exists in env but state is "missing", it means async verification hasn't completed. // We must wait for verification - do not trust unverified payloads. const token = process.env.ASTERMIND_LICENSE_TOKEN; if (token) { // Token exists but not verified yet - give a helpful error message throw new Error('License token is being verified. Please wait a moment and try again.\n' + 'If this error persists, verify your license token is valid.\n' + 'For trial tokens, visit: https://license.astermind.ai/v1/trial/create'); } throw new Error('License token is required. Please set ASTERMIND_LICENSE_TOKEN environment variable.\n' + 'For trial tokens, visit: https://license.astermind.ai/v1/trial/create'); } else if (state.status === 'expired') { throw new Error(`License token has expired. Please obtain a new license token.\n` + `Expired at: ${state.payload?.exp ? new Date(state.payload.exp * 1000).toISOString() : 'unknown'}`); } else if (state.status === 'invalid') { // SECURITY: Do NOT decode unverified tokens. If state is "invalid", the token failed verification. // This could be due to: // - Invalid signature (token was tampered with or is fake) // - Wrong issuer // - Other verification failures // We must reject invalid tokens - never trust unverified payloads. throw new Error(`License token is invalid: ${state.reason || 'unknown error'}\n` + 'The token failed cryptographic verification. Please verify your license token is correct.\n' + 'For trial tokens, visit: https://license.astermind.ai/v1/trial/create'); } // Re-throw original error if we can't provide better message throw error; } } /** * Check if license is valid and feature is available (non-blocking). * @returns true if astermind-synth feature is available */ function checkLicense() { return hasFeature("astermind-synth"); } /** * Get detailed license status. * @returns LicenseState object with status, reason, payload, etc. */ function getLicenseStatus() { return getLicenseState(); } /** * Set license token from a string. * Useful for dynamic token loading from backend services or user input. * @param token The license token string (JWT format) */ async function setLicenseTokenFromString(token) { await setLicenseToken(token); } /** * RetrievalGenerator - Simple deterministic retrieval sampler * Uniform random sampling from stored labeled samples */ /** * Seeded random number generator for deterministic testing */ let SeededRNG$1 = class SeededRNG { constructor(seed = Date.now()) { this.seed = seed; } next() { // Linear congruential generator this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32; return this.seed / 2 ** 32; } setSeed(seed) { this.seed = seed; } }; class RetrievalGenerator { constructor(seed) { // Initialize and require license before allowing generator use initializeLicense(); requireLicense(); this.store = new SyntheticFieldStore(); this.seed = seed; this.rng = new SeededRNG$1(seed); } /** * Ingest labeled samples into the store */ ingest(samples) { this.store.insertMany(samples); } /** * Sample k values for a given label * Returns empty array if label doesn't exist or has no samples */ sample(label, k = 1) { const values = this.store.get(label); if (values.length === 0) { return []; } const result = []; const availableIndices = Array.from({ length: values.length }, (_, i) => i); // Sample k values (or all if k > available) const sampleCount = Math.min(k, values.length); for (let i = 0; i < sampleCount; i++) { const randomIndex = Math.floor(this.rng.next() * availableIndices.length); const selectedIndex = availableIndices.splice(randomIndex, 1)[0]; result.push(values[selectedIndex]); } return result; } /** * Get a single sample (convenience method) */ sampleOne(label) { const samples = this.sample(label, 1); return samples.length > 0 ? samples[0] : null; } /** * Check if a label has samples */ hasLabel(label) { return this.store.hasLabel(label) && this.store.count(label) > 0; } /** * Get all available labels */ getLabels() { return this.store.getLabels(); } /** * Reset the generator (clears store and optionally resets seed) */ reset(seed) { this.store.clear(); if (seed !== undefined) { this.seed = seed; this.rng.setSeed(seed); } } } /** * CharVocab - Character vocabulary builder * Builds a vocabulary from character sets and training data */ class CharVocab { constructor() { this.charToIndex = new Map(); this.indexToChar = new Map(); this.size = 0; } /** * Build vocabulary from a set of strings * @param samples Array of strings to build vocabulary from * @param charSet Optional predefined character set (e.g., alphanumeric + punctuation) */ build(samples, charSet) { const chars = new Set(); // Add padding character first (index 0) - use null character // This ensures index 0 is always padding chars.add('\0'); // Add predefined character set if provided if (charSet) { for (const char of charSet) { // Skip null character if it's in the charSet (we already added it) if (char !== '\0') { chars.add(char); } } } // Add all characters from samples for (const sample of samples) { for (const char of sample) { // Skip null characters from samples (we use it for padding) if (char !== '\0') { chars.add(char); } } } // Sort characters for consistent ordering, but keep null char at index 0 const sortedChars = Array.from(chars).sort((a, b) => { // Ensure null char is always first if (a === '\0') return -1; if (b === '\0') return 1; return a.localeCompare(b); }); // Build mappings this.charToIndex.clear(); this.indexToChar.clear(); this.size = sortedChars.length; sortedChars.forEach((char, index) => { this.charToIndex.set(char, index); this.indexToChar.set(index, char); }); } /** * Get index for a character */ getIndex(char) { const index = this.charToIndex.get(char); if (index === undefined) { throw new Error(`Character '${char}' not in vocabulary`); } return index; } /** * Get character for an index */ getChar(index) { const char = this.indexToChar.get(index); if (char === undefined) { throw new Error(`Index ${index} not in vocabulary`); } return char; } /** * Check if character exists in vocabulary */ hasChar(char) { return this.charToIndex.has(char); } /** * Get vocabulary size */ getSize() { return this.size; } /** * Get all characters in vocabulary */ getChars() { return Array.from(this.charToIndex.keys()).sort(); } /** * Get default character set (alphanumeric + common punctuation) */ static getDefaultCharSet() { return 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789' + ' !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'; } } /** * FixedLength - Utilities for fixed-length padding and truncation */ class FixedLength { /** * Pad or truncate an array to a fixed length * @param arr Array to pad/truncate * @param length Target length * @param padValue Value to use for padding (default: 0) */ static padOrTruncate(arr, length, padValue = 0) { if (arr.length === length) { return [...arr]; } if (arr.length > length) { // Truncate return arr.slice(0, length); } // Pad const result = [...arr]; while (result.length < length) { result.push(padValue); } return result; } /** * Pad or truncate a string to a fixed length * @param str String to pad/truncate * @param length Target length * @param padChar Character to use for padding (default: space) */ static padOrTruncateString(str, length, padChar = ' ') { if (str.length === length) { return str; } if (str.length > length) { // Truncate return str.slice(0, length); } // Pad return str + padChar.repeat(length - str.length); } } /** * OneHot - One-hot encoding utilities */ class OneHot { /** * Encode an index as a one-hot vector * @param index Index to encode * @param size Size of the one-hot vector */ static encode(index, size) { if (index < 0 || index >= size) { throw new Error(`Index ${index} out of range [0, ${size})`); } const vector = new Array(size).fill(0); vector[index] = 1; return vector; } /** * Decode a one-hot vector to an index * @param vector One-hot vector */ static decode(vector) { const index = vector.indexOf(1); if (index === -1) { throw new Error('Invalid one-hot vector: no element equals 1'); } return index; } /** * Encode multiple indices as one-hot vectors * @param indices Array of indices * @param size Size of each one-hot vector */ static encodeBatch(indices, size) { return indices.map(idx => this.encode(idx, size)); } /** * Decode multiple one-hot vectors to indices * @param vectors Array of one-hot vectors */ static decodeBatch(vectors) { return vectors.map(vec => this.decode(vec)); } } /** * StringEncoder - Encodes strings to vectors and decodes back * Compatible with ELM/KELM pipelines */ class StringEncoder { constructor(config) { this.config = { useOneHot: false, // Default to index-based for efficiency ...config, }; this.vocab = new CharVocab(); } /** * Build vocabulary from training samples */ buildVocab(samples) { this.vocab.build(samples, this.config.charSet || CharVocab.getDefaultCharSet()); } /** * Encode a string to a vector * @param str String to encode * @returns Encoded vector (either indices or one-hot) */ encode(str) { if (this.vocab.getSize() === 0) { throw new Error('Vocabulary not built. Call buildVocab() first.'); } // Convert string to indices const indices = []; for (const char of str) { if (this.vocab.hasChar(char)) { indices.push(this.vocab.getIndex(char)); } else { // For unknown characters, try to find a similar one or use space // If space is in vocab, use it; otherwise use 0 (which will be treated as padding) if (this.vocab.hasChar(' ')) { indices.push(this.vocab.getIndex(' ')); } else { indices.push(0); } } } // Pad or truncate to fixed length const padded = FixedLength.padOrTruncate(indices, this.config.maxLength, 0); // Convert to one-hot if requested if (this.config.useOneHot) { const vocabSize = this.vocab.getSize(); const oneHotVectors = []; for (const idx of padded) { oneHotVectors.push(...OneHot.encode(idx, vocabSize)); } return oneHotVectors; } return padded; } /** * Decode a vector back to a string * @param vector Encoded vector * @returns Decoded string */ decode(vector) { if (this.vocab.getSize() === 0) { throw new Error('Vocabulary not built. Call buildVocab() first.'); } let indices; if (this.config.useOneHot) { // Decode one-hot vectors const vocabSize = this.vocab.getSize(); indices = []; for (let i = 0; i < vector.length; i += vocabSize) { const oneHot = vector.slice(i, i + vocabSize); try { indices.push(OneHot.decode(oneHot)); } catch { // If decoding fails, use argmax as fallback const maxIdx = oneHot.indexOf(Math.max(...oneHot)); indices.push(maxIdx); } } // Truncate to maxLength indices = indices.slice(0, this.config.maxLength); } else { // Direct index-based decoding indices = vector.slice(0, this.config.maxLength); } // Convert indices to characters, stopping at first padding let result = ''; const vocabSize = this.vocab.getSize(); const paddingIdx = 0; // Padding is always index 0 for (const idx of indices) { // Clamp index to valid range const clampedIdx = Math.max(0, Math.min(vocabSize - 1, Math.round(idx))); // Stop decoding at first padding index (0) if (clampedIdx === paddingIdx) { break; } // Try to get character for this index try { const char = this.vocab.getChar(clampedIdx); // Skip null characters and control characters (except space, tab, newline) if (char === '\0' || (char.charCodeAt(0) < 32 && char !== ' ' && char !== '\t' && char !== '\n')) { break; // Stop at first invalid character } result += char; } catch { // Invalid index - stop decoding break; } } // Trim trailing whitespace but preserve internal spaces return result.trimEnd(); } /** * Encode multiple strings */ encodeBatch(strings) { return strings.map(str => this.encode(str)); } /** * Decode multiple vectors */ decodeBatch(vectors) { return vectors.map(vec => this.decode(vec)); } /** * Get the output vector size */ getVectorSize() { if (this.config.useOneHot) { return this.config.maxLength * this.vocab.getSize(); } return this.config.maxLength; } /** * Get vocabulary size */ getVocabSize() { return this.vocab.getSize(); } /** * Get vocabulary */ getVocab() { return this.vocab; } } /** * ELM utilities for OmegaSynth * Helper functions for working with ELM models */ /** * Create one-hot vector for a label index */ function oneHotLabel(labelIndex, numLabels) { const vector = new Array(numLabels).fill(0); if (labelIndex >= 0 && labelIndex < numLabels) { vector[labelIndex] = 1; } return vector; } /** * Generate random noise vector */ function generateNoiseVector(size, seed) { const rng = seed !== undefined ? new SeededRNG(seed) : null; const noise = []; for (let i = 0; i < size; i++) { const value = rng ? rng.next() : Math.random(); // Normalize to [-1, 1] noise.push(value * 2 - 1); } return noise; } /** * Seeded random number generator */ class SeededRNG { constructor(seed) { this.seed = seed; } next() { this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32; return this.seed / 2 ** 32; } } /** * Label-specific validation and cleaning utilities */ /** * Validate and clean a generated string based on its label type */ function validateForLabel(label, value) { if (!value || value.length === 0) { return { isValid: false, cleaned: '', reason: 'Empty value' }; } // Get label-specific validator const validator = getValidatorForLabel(label); return validator(value); } /** * Get validator function for a specific label */ function getValidatorForLabel(label) { switch (label) { case 'first_name': case 'last_name': return validateName; case 'phone_number': return validatePhoneNumber; case 'email': return validateEmail; case 'street_address': return validateStreetAddress; case 'city': case 'state': case 'country': return validateLocation; case 'company_name': case 'job_title': case 'product_name': return validateText; case 'color': return validateColor; case 'uuid': return validateUUID; case 'date': return validateDate; case 'credit_card_type': case 'device_type': return validateText; default: return validateGeneric; } } /** * Validate name (first_name, last_name) * Rules: Letters only, optional hyphens/apostrophes, no numbers */ function validateName(value) { // First check for placeholder patterns in original value (before cleaning) value.toLowerCase(); // Reject "Name" followed by numbers (e.g., "Name97", "name123") if (/^name\d+$/i.test(value)) { return { isValid: false, cleaned: '', reason: 'Placeholder name with numbers' }; } // Remove all non-letter characters except hyphens and apostrophes let cleaned = value.replace(/[^a-zA-Z\-\'\s]/g, ''); // Remove numbers completely cleaned = cleaned.replace(/[0-9]/g, ''); // Remove excessive special characters cleaned = cleaned.replace(/[-']{2,}/g, '-'); // Multiple hyphens/apostrophes -> single cleaned = cleaned.replace(/^[-']+|[-']+$/g, ''); // Remove leading/trailing // Trim and normalize whitespace cleaned = cleaned.trim().replace(/\s+/g, ' '); // Must be at least 2 characters and contain at least one letter if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) { return { isValid: false, cleaned: '', reason: 'Too short or no letters' }; } // Reject common placeholder names (case-insensitive) after cleaning const lowerCleaned = cleaned.toLowerCase(); // Check for exact matches if (lowerCleaned === 'name' || lowerCleaned === 'firstname' || lowerCleaned === 'lastname' || lowerCleaned === 'surname') { return { isValid: false, cleaned: '', reason: 'Placeholder name' }; } // Check for "name" followed by very short variations if (lowerCleaned.startsWith('name') && lowerCleaned.length <= 6) { return { isValid: false, cleaned: '', reason: 'Placeholder name' }; } // Max length check if (cleaned.length > 30) { cleaned = cleaned.substring(0, 30).trim(); } return { isValid: true, cleaned }; } /** * Validate phone number * Rules: Digits, dashes, parentheses, dots, plus, spaces */ function validatePhoneNumber(value) { // Keep only valid phone characters let cleaned = value.replace(/[^0-9\-\+\(\)\.\s]/g, ''); // Remove excessive special characters cleaned = cleaned.replace(/[-\.]{2,}/g, '-'); cleaned = cleaned.replace(/\s+/g, ' '); cleaned = cleaned.trim(); // Count digits const digitCount = (cleaned.match(/\d/g) || []).length; // Must have at least 7 digits (minimum phone number) if (digitCount < 7) { return { isValid: false, cleaned: '', reason: 'Too few digits' }; } // Max length check if (cleaned.length > 25) { cleaned = cleaned.substring(0, 25).trim(); } return { isValid: true, cleaned }; } /** * Validate email * Rules: Must contain @, valid characters before and after */ function validateEmail(value) { // Keep valid email characters let cleaned = value.replace(/[^a-zA-Z0-9@\.\-\_]/g, ''); // Must contain @ if (!cleaned.includes('@')) { return { isValid: false, cleaned: '', reason: 'Missing @ symbol' }; } const parts = cleaned.split('@'); if (parts.length !== 2) { return { isValid: false, cleaned: '', reason: 'Invalid @ usage' }; } const [local, domain] = parts; // Local part must have at least 1 character if (!local || local.length === 0) { return { isValid: false, cleaned: '', reason: 'Empty local part' }; } // Domain must have at least 3 characters (x.y) if (!domain || domain.length < 3) { return { isValid: false, cleaned: '', reason: 'Invalid domain' }; } // Domain must contain at least one dot if (!domain.includes('.')) { return { isValid: false, cleaned: '', reason: 'Domain missing dot' }; } // Remove leading/trailing dots and hyphens const cleanLocal = local.replace(/^[\.\-]+|[\.\-]+$/g, ''); const cleanDomain = domain.replace(/^[\.\-]+|[\.\-]+$/g, ''); if (!cleanLocal || !cleanDomain) { return { isValid: false, cleaned: '', reason: 'Invalid format after cleaning' }; } cleaned = `${cleanLocal}@${cleanDomain}`; // Max length check if (cleaned.length > 50) { cleaned = cleaned.substring(0, 50); } return { isValid: true, cleaned }; } /** * Validate street address * Rules: Numbers, letters, spaces, common address characters */ function validateStreetAddress(value) { // Keep valid address characters let cleaned = value.replace(/[^a-zA-Z0-9\s\-\#\.\,]/g, ''); cleaned = cleaned.trim().replace(/\s+/g, ' '); // Must have at least 5 characters if (cleaned.length < 5) { return { isValid: false, cleaned: '', reason: 'Too short' }; } // Max length check if (cleaned.length > 50) { cleaned = cleaned.substring(0, 50).trim(); } return { isValid: true, cleaned }; } /** * Validate location (city, state, country) * Rules: Mostly letters, optional spaces/hyphens */ function validateLocation(value) { // Keep letters, spaces, hyphens, apostrophes let cleaned = value.replace(/[^a-zA-Z\s\-\']/g, ''); cleaned = cleaned.trim().replace(/\s+/g, ' '); // Must have at least 2 characters and contain letters if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) { return { isValid: false, cleaned: '', reason: 'Too short or no letters' }; } // Max length check if (cleaned.length > 30) { cleaned = cleaned.substring(0, 30).trim(); } return { isValid: true, cleaned }; } /** * Validate text (company_name, job_title, product_name) * Rules: Letters, numbers, spaces, common punctuation */ function validateText(value) { // Keep alphanumeric and common punctuation let cleaned = value.replace(/[^a-zA-Z0-9\s\-\'\.\,]/g, ''); cleaned = cleaned.trim().replace(/\s+/g, ' '); // Must have at least 2 characters if (cleaned.length < 2) { return { isValid: false, cleaned: '', reason: 'Too short' }; } // Max length check if (cleaned.length > 50) { cleaned = cleaned.substring(0, 50).trim(); } return { isValid: true, cleaned }; } /** * Validate color * Rules: Letters only, maybe spaces */ function validateColor(value) { // Keep letters and spaces only let cleaned = value.replace(/[^a-zA-Z\s]/g, ''); cleaned = cleaned.trim().replace(/\s+/g, ' '); // Must have at least 3 characters if (cleaned.length < 3) { return { isValid: false, cleaned: '', reason: 'Too short' }; } // Max length check if (cleaned.length > 20) { cleaned = cleaned.substring(0, 20).trim(); } return { isValid: true, cleaned }; } /** * Validate UUID * Rules: Should follow UUID format (8-4-4-4-12 hex digits with dashes) */ function validateUUID(value) { // Keep hex characters and dashes let cleaned = value.replace(/[^0-9a-fA-F\-]/g, ''); // Try to format as UUID if it has enough characters const hexOnly = cleaned.replace(/-/g, ''); if (hexOnly.length >= 32) { // Format as UUID: 8-4-4-4-12 const formatted = [ hexOnly.substring(0, 8), hexOnly.substring(8, 12), hexOnly.substring(12, 16), hexOnly.substring(16, 20), hexOnly.substring(20, 32) ].join('-'); cleaned = formatted; } // Must have at least 32 hex characters const hexCount = cleaned.replace(/-/g, '').length; if (hexCount < 32) { return { isValid: false, cleaned: '', reason: 'Too few hex characters' }; } return { isValid: true, cleaned }; } /** * Validate date * Rules: Should follow date format (YYYY-MM-DD or similar) */ function validateDate(value) { // Keep digits, dashes, slashes let cleaned = value.replace(/[^0-9\-\/]/g, ''); // Must have at least 8 digits (YYYYMMDD) const digitCount = (cleaned.match(/\d/g) || []).length; if (digitCount < 8) { return { isValid: false, cleaned: '', reason: 'Too few digits' }; } // Max length check if (cleaned.length > 20) { cleaned = cleaned.substring(0, 20).trim(); } return { isValid: true, cleaned }; } /** * Generic validator for unknown labels */ function validateGeneric(value) { // Remove control characters let cleaned = value.replace(/[\x00-\x1F\x7F]/g, ''); cleaned = cleaned.trim().replace(/\s+/g, ' '); if (cleaned.length < 1) { return { isValid: false, cleaned: '', reason: 'Empty after cleaning' }; } return { isValid: true, cleaned }; } /** * PatternCorrector - Post-processing pattern matching and correction * Learns patterns from training data and applies them to generated samples */ class PatternCorrector { constructor() { this.patterns = new Map(); } /** * Learn patterns from training data */ learnPatterns(samples) { const byLabel = new Map(); // Group samples by label for (const sample of samples) { if (!byLabel.has(sample.label)) { byLabel.set(sample.label, []); } byLabel.get(sample.label).push(sample.value); } // Learn patterns for each label for (const [label, values] of byLabel.entries()) { this.learnPattern(label, values); } } /** * Learn pattern for a specific label */ learnPattern(label, examples) { if (examples.length === 0) return; // Extract common prefixes (first 1-3 characters) const prefixCounts = new Map(); const suffixCounts = new Map(); const charFreq = new Map(); const lengths = []; for (const example of examples) { lengths.push(example.length); // Prefixes for (let len = 1; len <= Math.min(3, example.length); len++) { const prefix = example.substring(0, len); prefixCounts.set(prefix, (prefixCounts.get(prefix) || 0) + 1); } // Suffixes for (let len = 1; len <= Math.min(3, example.length); len++) { const suffix = example.substring(example.length - len); suffixCounts.set(suffix, (suffixCounts.get(suffix) || 0) + 1); } // Character frequency for (const char of example) { charFreq.set(char, (charFreq.get(char) || 0) + 1); } } // Get common prefixes (appear in >10% of examples - lowered from 20% for better pattern matching) const commonPrefixes = Array.from(prefixCounts.entries()) .filter(([_, count]) => count / examples.length > 0.1) .sort((a, b) => b[1] - a[1]) .slice(0, 15) // Increased from 10 to 15 .map(([prefix]) => prefix); // Get common suffixes (appear in >10% of examples - lowered from 20% for better pattern matching) const commonSuffixes = Array.from(suffixCounts.entries()) .filter(([_, count]) => count / examples.length > 0.1) .sort((a, b) => b[1] - a[1]) .slice(0, 15) // Increased from 10 to 15 .map(([suffix]) => suffix); // Normalize character frequencies const totalChars = Array.from(charFreq.values()).reduce((a, b) => a + b, 0); for (const [char, count] of charFreq.entries()) { charFreq.set(char, count / totalChars); } this.patterns.set(label, { label, examples, commonPrefixes, commonSuffixes, charFrequency: charFreq, lengthDistribution: lengths, }); } /** * Correct a generated string using learned patterns */ correct(generated, label) { const pattern = this.patterns.get(label); if (!pattern) { return generated; // No pattern learned, return as-is } let corrected = generated; // 1. Check if it matches a known example (exact match) if (pattern.examples.includes(generated)) { return generated; // Already perfect } // 2. Check prefix/suffix patterns const hasValidPrefix = pattern.commonPrefixes.some(prefix => corrected.toLowerCase().startsWith(prefix.toLowerCase())); pattern.commonSuffixes.some(suffix => corrected.toLowerCase().endsWith(suffix.toLowerCase())); // 3. If no valid prefix, try to fix it if (!hasValidPrefix && pattern.commonPrefixes.length > 0) { const mostCommonPrefix = pattern.commonPrefixes[0]; // Only fix if the generated string is very different if (corrected.length > 0 && !corrected.toLowerCase().startsWith(mostCommonPrefix[0].toLowerCase())) ; } // 4. Check character frequency (remove unlikely characters) const charFreq = pattern.charFrequency; let cleaned = ''; for (const char of corrected) { const freq = charFreq.get(char) || 0; // Keep character if it appears in >0.5% of training data (lowered from 1%), or if it's common (space, etc.) if (freq > 0.005 || /[a-zA-Z0-9\s]/.test(char)) { cleaned += char; } } if (cleaned.length > 0) { corrected = cleaned; } // 5. Check length distribution pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length; Math.min(...pattern.lengthDistribution); const maxLength = Math.max(...pattern.lengthDistribution); // Truncate if too long if (corrected.length > maxLength * 1.5) { corrected = corrected.substring(0, Math.floor(maxLength * 1.2)); } return corrected; } /** * Score how well a generated string matches the pattern */ score(generated, label) { const pattern = this.patterns.get(label); if (!pattern) { return 0.5; // Unknown pattern, neutral score } let score = 0; let factors = 0; // 1. Exact match bonus if (pattern.examples.includes(generated)) { return 1.0; // Perfect match } // 2. Prefix match (30% weight) const prefixMatch = pattern.commonPrefixes.some(prefix => generated.toLowerCase().startsWith(prefix.toLowerCase())); score += prefixMatch ? 0.3 : 0; factors++; // 3. Suffix match (20% weight) const suffixMatch = pattern.commonSuffixes.some(suffix => generated.toLowerCase().endsWith(suffix.toLowerCase())); score += suffixMatch ? 0.2 : 0; factors++; // 4. Character frequency match (30% weight) const charFreq = pattern.charFrequency; let charScore = 0; let charCount = 0; for (const char of generated) { const freq = charFreq.get(char) || 0; charScore += freq; charCount++; } score += (charCount > 0 ? charScore / charCount : 0) * 0.3; factors++; // 5. Length match (20% weight) const avgLength = pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length; const lengthDiff = Math.abs(generated.length - avgLength) / avgLength; const lengthScore = Math.max(0, 1 - lengthDiff); score += lengthScore * 0.2; factors++; return factors > 0 ? score / factors : 0; } /** * Get pattern for a label */ getPattern(label) { return this.patterns.get(label); } } /** * SequenceContext - Add sequence context to generation * Uses previous characters to inform next character prediction */ class SequenceContext { constructor(n = 3) { this.ngramPatterns = new Map(); this.n = n; } /** * Learn n-gram patterns from training data */ learnPatterns(samples) { this.ngramPatterns.clear(); for (const sample of samples) { // Extract n-grams for (let i = 0; i <= sample.length - this.n; i++) { const ngram = sample.substring(i, i + this.n - 1); // Context (n-1 chars) const nextChar = sample[i + this.n - 1]; // Next character if (!this.ngramPatterns.has(ngram)) { this.ngramPatterns.set(ngram, new Map()); } const charMap = this.ngramPatterns.get(ngram); charMap.set(nextChar, (charMap.get(nextChar) || 0) + 1); } } } /** * Get next character probabilities given context */ getNextCharProbs(context) { // Use last n-1 characters as context const ctx = context.length >= this.n - 1 ? context.substring(context.length - (this.n - 1)) : context; const charCounts = this.ngramPatterns.get(ctx); if (!charCounts || charCounts.size === 0) { return new Map(); } // Convert counts to probabilities const total = Array.from(charCounts.values()).reduce((a, b) => a + b, 0); const probs = new Map(); for (const [char, count] of charCounts.entries()) { probs.set(char, count / total); } return probs; } /** * Suggest next character based on context */ suggestNextChar(context) { const probs = this.getNextCharProbs(context); if (probs.size === 0) { return null; } // Return most likely character let bestChar = ''; let bestProb = 0; for (const [char, prob] of probs.entries()) { if (prob > bestProb) { bestProb = prob; bestChar = char; } } return bestChar; } /** * Score how well a character fits the context */ scoreChar(context, char) { const probs = this.getNextCharProbs(context); return probs.get(char) || 0; } } /** * ELMGenerator - Label-conditioned string generator using ELM * Trains an ELM to generate encoded strings based on labels + noise */ class ELMGenerator { constructor(config) { this.elm = null; this.labels = []; this.patternCorrector = null; this.sequenceContext = null; // Initialize and require license before allowing generator use initializeLicense(); requireLicense(); this.config = { hiddenUnits: 128, activation: 'relu', ridgeLambda: 0.01, noiseSize: 32, useOneHot: false, // Default to false for memory efficiency (can enable for better accuracy) useClassification: false, // Default to regression for compatibility usePatternCorrection: true, ...config, }; this.noiseSize = this.config.noiseSize; this.useClassification = this.config.useClassification; this.encoder = new StringEncoder({ maxLength: config.maxLength, useOneHot: this.config.useOneHot ?? false, // Default to false for memory efficiency }); if (this.config.usePatternCorrection) { this.patternCorrector = new PatternCorrector(); } // Always use sequence context for better generation this.sequenceContext = new SequenceContext(3); // 3-grams } /** * Train the ELM generator on labeled samples */ train(samples) { if (samples.length === 0) { throw new Error('Cannot train on empty dataset'); } // Extract unique labels const uniqueLabels = Array.from(new Set(samples.map(s => s.label))); this.labels = uniqueLabels; // Extract all values for vocabulary building const allValues = samples.map(s => s.value); this.encoder.buildVocab(allValues); // Learn patterns if pattern correction is enabled if (this.patternCorrector) { this.patternCorrector.learnPatterns(samples); } // Learn sequence context if (this.sequenceContext) { this.sequenceContext.learnPatterns(allValues); } // Build training data const X = []; const Y = []; for (const sample of samples) { const labelIndex = this.labels.indexOf(sample.label); if (labelIndex === -1) { continue; } // Input: concat(oneHot(label), noiseVector) const labelOneHot = oneHotLabel(labelIndex, this.labels.length); const noise = generateNoiseVector(this.noiseSize, this.config.seed); const inputVector = [...labelOneHot, ...noise]; X.push(inputVector); // Target: encoded(value) const encodedValue = this.encoder.encode(sample.value); Y.push(encodedValue); } if (X.length === 0) { throw new Error('No valid training samples after processing'); } // Create ELM config const inputSize = this.labels.length + this.noiseSize; this.encoder.getVectorSize(); const elmConfig = { useTokenizer: false, // Numeric mode inputSize: inputSize, categories: this.useClassification ? [] : [], // For classification, we'll handle it differently hiddenUnits: this.config.hiddenUnits, activation: this.config.activation, // Use lower regularization for better pattern learning ridgeLambda: this.config.ridgeLambda * 0.1, // Reduce regularization task: this.useClassification ? 'classification' : 'regression', }; // Create and train ELM - resolve constructor robustly across CJS/ESM shapes // Replace dynamic require with direct constructor this.elm = new ELM(elmConfig); this.elm.trainFromData(X, Y); } /** * Generate a string for a given label * @param label Label to generate for * @param noiseSeed Optional seed for noise generation (for deterministic output) */ generate(label, noiseSeed) { if (!this.elm) { throw new Error('Model not trained. Call train() first.'); } const labelIndex = this.labels.indexOf(label); if (labelIndex === -1) { throw new Error(`Label '${label}' not found in training data`); } // Create input: concat(oneHot(label), noiseVector) const labelOneHot = oneHotLabel(labelIndex, this.labels.length); const noise = generateNoiseVector(this.noiseSize, noiseSeed !== undefined ? noiseSeed : this.config.seed); const inputVector = [...labelOneHot, ...noise]; // Predict based on mode let decoded; if (this.useClassification && this.config.useOneHot && typeof this.elm.predictProbaFromVector === 'function') { // Classification mode with one-hot: use probabilities const vocabSize = this.encoder.getVocabSize(); const maxLength = this.config.maxLength; // Get probabilities for each position const probs = this.elm.predictProbaFromVector(inputVector); // Reshape to [maxLength, vocabSize] and use argmax const indices = []; for (let pos = 0; pos < maxLength; pos++) { const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize); const maxIdx = posProbs.indexOf(Math.max(...posProbs)); indices.push(maxIdx); } decoded = this.encoder.decode(indices); } else { // Regression mode: use logits and round const prediction = this.elm.predictLogitsFromVector(inputVector); // Convert logits to indices with proper quantization const vocabSize = this.encoder.getVocabSize(); const indices = prediction.map(val => { // Clamp value to reasonable range first (prevent extreme values) const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val)); // Round to nearest integer const rounded = Math.round(clamped); // Clamp to valid vocabulary range [0, vocabSize-1] const idx = Math.max(0, Math.min(vocabSize - 1, rounded)); return idx; }); decoded = this.encoder.decode(indices); } // Apply pattern correction if enabled let corrected = decoded; if (this.patternCorrector) { corrected = this.patternCorrector.correct(decoded, label); } // Apply sequence context refinement if (this.sequenceContext && corrected.length > 0) { corrected = this.refineWithSequenceContext(corrected, label); } // Validate and clean the decoded string using label-specific rules const validation = validateForLabel(label, corrected); // If validation fails, try to generate again with different noise (up to 3 attempts) if (!validation.isValid) { for (let attempt = 0; attempt < 3; attempt++) { const baseSeed = noiseSeed !== undefined ? noiseSeed : (this.config.seed ?? Date.now()); const newNoise = generateNoiseVector(this.noiseSize, baseSeed + attempt + 1000); const newInputVector = [...labelOneHot, ...newNoise]; let newDecoded; if (this.useClassification && this.config.useOneHot && typeof this.elm.predictProbaFromVector === 'function') { const vocabSize = this.encoder.getVocabSize(); const maxLength = this.config.maxLength; const probs = this.elm.predictProbaFromVector(newInputVector); const newIndices = []; for (let pos = 0; pos < maxLength; pos++) { const posProbs = probs.slice(pos * vocabSize, (pos +