// @astermind/astermind-synth
// OmegaSynth - Label-Conditioned Synthetic Data Generator for AsterMind ELM/KELM Pipelines
// Listing metadata: version not specified; 1,396 lines (1,384 loc), 97.8 kB; JavaScript
import { setLicenseToken, hasFeature, getLicenseState, initLicenseRuntime, requireFeature } from '@astermindai/license-runtime';
import { ELM } from '@astermind/astermind-elm';
import * as fs from 'fs';
import * as path from 'path';
/**
 * SyntheticFieldStore - in-memory storage for labeled samples.
 * Values are grouped per label and kept in insertion order.
 */
class SyntheticFieldStore {
  constructor() {
    // label -> array of values stored under that label
    this.store = new Map();
  }
  /**
   * Add one labeled sample to the store.
   * @param sample Object with `label` and `value` properties
   */
  insert(sample) {
    const { label, value } = sample;
    if (!this.store.has(label)) {
      this.store.set(label, []);
    }
    this.store.get(label).push(value);
  }
  /**
   * Add several labeled samples, in iteration order.
   * @param samples Iterable of `{ label, value }` objects
   */
  insertMany(samples) {
    for (const entry of samples) {
      this.insert(entry);
    }
  }
  /**
   * Look up the values stored for a label.
   * @returns The stored array, or a new empty array when the label is unknown
   */
  get(label) {
    const bucket = this.store.get(label);
    return bucket ? bucket : [];
  }
  /**
   * Draw up to k distinct entries for a label, uniformly at random (Math.random).
   * @param label Label to draw from
   * @param k Number of entries requested (default: 1)
   * @returns At most min(k, count(label)) values; empty when the label has no values
   */
  sample(label, k = 1) {
    const pool = this.get(label);
    const picked = [];
    if (pool.length === 0) {
      return picked;
    }
    // Rejection sampling: redraw whenever a position was already used
    const used = new Set();
    while (picked.length < k && used.size < pool.length) {
      const candidate = Math.floor(Math.random() * pool.length);
      if (used.has(candidate)) {
        continue;
      }
      used.add(candidate);
      picked.push(pool[candidate]);
    }
    return picked;
  }
  /**
   * Tell whether anything was ever inserted under the label.
   */
  hasLabel(label) {
    return this.store.has(label);
  }
  /**
   * List every label in the store, in first-insertion order.
   */
  getLabels() {
    return [...this.store.keys()];
  }
  /**
   * Number of values stored for a label (0 for unknown labels).
   */
  count(label) {
    const { length } = this.get(label);
    return length;
  }
  /**
   * Remove all labels and values.
   */
  clear() {
    this.store.clear();
  }
}
/**
* License management for OmegaSynth
* Wraps @astermindai/license-runtime with convenience functions
*/
// Set once initializeLicense() has configured the license runtime
let initialized = false;
/**
 * Configure the license runtime once; later calls are no-ops.
 *
 * The runtime is always configured in 'strict' mode against the
 * license.astermind.ai issuer with audience "astermind-synth".
 * When the ASTERMIND_LICENSE_TOKEN environment variable is non-empty, the token
 * is handed to the runtime without awaiting the result: a failure is only logged
 * with console.warn, and the token may still be unverified when this function returns.
 */
function initializeLicense() {
  if (!initialized) {
    initLicenseRuntime({
      jwksUrl: "https://license.astermind.ai/.well-known/astermind-license-keys.json",
      expectedIss: "https://license.astermind.ai",
      expectedAud: "astermind-synth",
      jwksMaxAgeSeconds: 300,
      mode: 'strict'
    });
    const envToken = process.env.ASTERMIND_LICENSE_TOKEN;
    if (envToken) {
      // Fire-and-forget: this function is synchronous, so the promise is not awaited
      setLicenseToken(envToken).catch((err) => {
        console.warn("Failed to set license token from environment:", err);
      });
    }
    initialized = true;
  }
}
/**
* Require a valid license before proceeding.
* Throws if license is invalid, expired, or feature is missing.
*
* SECURITY: This function ONLY trusts payloads that have been verified by the license-runtime.
* We never decode or trust unverified JWT payloads to prevent bypass attacks.
*
* FLEXIBLE AUDIENCE CHECK:
* - If the token's audience matches "astermind-synth", it passes normally
* - If the token's audience doesn't match but the token includes the "astermind-synth" feature,
* it will still pass (allows tokens with audience "astermind-elm" or other products that include this feature)
* - This flexibility ONLY applies when the license-runtime has VERIFIED the token signature
*
* All users (including evaluators) must obtain a trial or production license key.
* Trial keys can be obtained from: https://license.astermind.ai/v1/trial/create
*/
/**
 * Decide whether a feature list grants access to astermind-synth.
 * Either "astermind-synth" or "astermind-elm-basic" is sufficient.
 *
 * SECURITY: intended for feature lists taken from payloads reported by the license runtime.
 *
 * @param features Candidate feature list; anything that is not an array yields false
 * @returns true when at least one accepted feature is present
 */
function hasValidFeature(features) {
  if (!Array.isArray(features)) {
    return false;
  }
  const accepted = ["astermind-synth", "astermind-elm-basic"];
  return accepted.some((name) => features.includes(name));
}
/**
 * Require a usable astermind-synth license; returns normally on success.
 *
 * The license runtime is asked first via requireFeature("astermind-synth").
 * When that throws, the current license state decides the outcome:
 *  - a payload carrying an accepted feature (see hasValidFeature) is let through,
 *    unless the state is "expired" - an expired token must never pass, even though
 *    its payload may still be attached to the state;
 *  - "missing", "expired" and "invalid" states throw a descriptive Error;
 *  - any other state re-throws the runtime's original error.
 *
 * SECURITY: only state.payload as reported by the license runtime is consulted;
 * raw tokens are never decoded here.
 *
 * @throws {Error} When no usable license is available
 */
function requireLicense() {
  try {
    requireFeature("astermind-synth");
  }
  catch (error) {
    const state = getLicenseState();
    // The expired branch below reads state.payload, so a payload can be present for
    // an expired token. Without this guard such a token would be accepted here and
    // the expiry error would be unreachable.
    const isExpired = state.status === 'expired';
    if (!isExpired && state.payload && hasValidFeature(state.payload.features)) {
      // Feature is present in the runtime-reported payload even though
      // requireFeature() rejected it (e.g. audience mismatch) - allow it
      return;
    }
    // Handle "missing" state: token not set, or async setLicenseToken() hasn't completed
    if (state.status === 'missing') {
      const token = process.env.ASTERMIND_LICENSE_TOKEN;
      if (token) {
        // Token exists but the runtime has not reported a verified state yet
        throw new Error('License token is being verified. Please wait a moment and try again.\n' +
          'If this error persists, verify your license token is valid.\n' +
          'For trial tokens, visit: https://license.astermind.ai/v1/trial/create');
      }
      throw new Error('License token is required. Please set ASTERMIND_LICENSE_TOKEN environment variable.\n' +
        'For trial tokens, visit: https://license.astermind.ai/v1/trial/create');
    }
    else if (state.status === 'expired') {
      throw new Error(`License token has expired. Please obtain a new license token.\n` +
        `Expired at: ${state.payload?.exp ? new Date(state.payload.exp * 1000).toISOString() : 'unknown'}`);
    }
    else if (state.status === 'invalid') {
      // The runtime rejected the token; report its reason and never fall back to the payload
      throw new Error(`License token is invalid: ${state.reason || 'unknown error'}\n` +
        'The token failed cryptographic verification. Please verify your license token is correct.\n' +
        'For trial tokens, visit: https://license.astermind.ai/v1/trial/create');
    }
    // No better message available - surface the runtime's own error
    throw error;
  }
}
/**
 * Non-throwing license probe.
 * @returns The license runtime's hasFeature() answer for "astermind-synth"
 */
function checkLicense() {
  const featureAvailable = hasFeature("astermind-synth");
  return featureAvailable;
}
/**
 * Expose the license runtime's current state.
 * @returns Whatever getLicenseState() reports (status, reason, payload, ...)
 */
function getLicenseStatus() {
  const currentState = getLicenseState();
  return currentState;
}
/**
 * Hand a license token string to the license runtime and wait for it to settle.
 * Useful when the token comes from a backend service or user input rather than
 * the environment.
 * @param token The license token string (JWT format)
 * @returns Promise that resolves with no value once setLicenseToken() has completed
 */
async function setLicenseTokenFromString(token) {
  const pending = setLicenseToken(token);
  await pending;
}
/**
* RetrievalGenerator - Simple deterministic retrieval sampler
* Uniform random sampling from stored labeled samples
*/
/**
 * Seeded pseudo-random number generator (linear congruential generator)
 * used for deterministic sampling and testing.
 */
let SeededRNG$1 = class SeededRNG {
  /**
   * @param seed Initial generator state (default: current timestamp)
   */
  constructor(seed = Date.now()) {
    this.seed = seed;
  }
  /**
   * Advance the generator.
   * @returns The next pseudo-random value in [0, 1)
   */
  next() {
    const MODULUS = 2 ** 32;
    const raw = (this.seed * 1664525 + 1013904223) % MODULUS;
    // JavaScript's % keeps the sign of the dividend, so a negative seed would leave
    // a negative state and return a value below 0. Wrap it into [0, MODULUS).
    // Non-negative states are left untouched, so existing sequences are unchanged.
    this.seed = raw < 0 ? raw + MODULUS : raw;
    return this.seed / MODULUS;
  }
  /**
   * Replace the generator state.
   * @param seed New generator state
   */
  setSeed(seed) {
    this.seed = seed;
  }
};
/**
 * RetrievalGenerator - returns stored labeled samples, chosen with a seeded RNG.
 * Construction initializes the license runtime and requires a usable license.
 */
class RetrievalGenerator {
  /**
   * @param seed Optional RNG seed, forwarded to SeededRNG$1
   */
  constructor(seed) {
    // License gate: requireLicense() throws when no usable license is available
    initializeLicense();
    requireLicense();
    this.store = new SyntheticFieldStore();
    this.seed = seed;
    this.rng = new SeededRNG$1(seed);
  }
  /**
   * Add labeled samples to the underlying store.
   */
  ingest(samples) {
    this.store.insertMany(samples);
  }
  /**
   * Draw up to k distinct stored values for a label.
   * @param label Label to draw from
   * @param k Number of values requested (default: 1)
   * @returns min(k, available) values; empty when the label has no samples
   */
  sample(label, k = 1) {
    const pool = this.store.get(label);
    if (pool.length === 0) {
      return [];
    }
    // Positions not drawn yet; each draw removes one, so nothing repeats
    const remaining = Array.from(pool.keys());
    const draws = Math.min(k, pool.length);
    const picked = [];
    while (picked.length < draws) {
      const slot = Math.floor(this.rng.next() * remaining.length);
      const [chosen] = remaining.splice(slot, 1);
      picked.push(pool[chosen]);
    }
    return picked;
  }
  /**
   * Draw a single value for a label.
   * @returns The drawn value, or null when the label has no samples
   */
  sampleOne(label) {
    const drawn = this.sample(label, 1);
    if (drawn.length === 0) {
      return null;
    }
    return drawn[0];
  }
  /**
   * Tell whether the label exists and holds at least one sample.
   */
  hasLabel(label) {
    if (!this.store.hasLabel(label)) {
      return false;
    }
    return this.store.count(label) > 0;
  }
  /**
   * List all labels known to the store.
   */
  getLabels() {
    return this.store.getLabels();
  }
  /**
   * Clear the store; when a seed is supplied, also reseed the RNG.
   * Without a seed the RNG state is left as it is.
   */
  reset(seed) {
    this.store.clear();
    if (seed === undefined) {
      return;
    }
    this.seed = seed;
    this.rng.setSeed(seed);
  }
}
/**
 * CharVocab - character <-> index vocabulary.
 * Built from an optional predefined character set plus the characters seen in
 * training strings; index 0 is always the padding character '\0'.
 */
class CharVocab {
  constructor() {
    this.charToIndex = new Map();
    this.indexToChar = new Map();
    this.size = 0;
  }
  /**
   * (Re)build the vocabulary, replacing any previous mappings.
   * @param samples Array of strings to collect characters from
   * @param charSet Optional predefined character set (e.g., alphanumeric + punctuation)
   */
  build(samples, charSet) {
    const PAD = '\0';
    // Padding goes in first; null characters found in the inputs are ignored
    const seen = new Set([PAD]);
    const collect = (text) => {
      for (const ch of text) {
        if (ch !== PAD) {
          seen.add(ch);
        }
      }
    };
    if (charSet) {
      collect(charSet);
    }
    for (const sample of samples) {
      collect(sample);
    }
    // Padding is pinned to the front; everything else is ordered by localeCompare
    const ordered = [...seen].sort((left, right) => {
      if (left === PAD)
        return -1;
      if (right === PAD)
        return 1;
      return left.localeCompare(right);
    });
    this.charToIndex.clear();
    this.indexToChar.clear();
    this.size = ordered.length;
    for (const [position, ch] of ordered.entries()) {
      this.charToIndex.set(ch, position);
      this.indexToChar.set(position, ch);
    }
  }
  /**
   * Index of a character.
   * @throws {Error} When the character is not in the vocabulary
   */
  getIndex(char) {
    if (!this.charToIndex.has(char)) {
      throw new Error(`Character '${char}' not in vocabulary`);
    }
    return this.charToIndex.get(char);
  }
  /**
   * Character stored at an index.
   * @throws {Error} When the index is not in the vocabulary
   */
  getChar(index) {
    const found = this.indexToChar.get(index);
    if (found === undefined) {
      throw new Error(`Index ${index} not in vocabulary`);
    }
    return found;
  }
  /**
   * Tell whether a character is part of the vocabulary.
   */
  hasChar(char) {
    return this.charToIndex.has(char);
  }
  /**
   * Number of entries, padding included.
   */
  getSize() {
    return this.size;
  }
  /**
   * All vocabulary characters, sorted with the default Array sort
   * (this can differ from the index order used by build()).
   */
  getChars() {
    return [...this.charToIndex.keys()].sort();
  }
  /**
   * Default character set: ASCII letters, digits, space and common punctuation.
   */
  static getDefaultCharSet() {
    const lower = 'abcdefghijklmnopqrstuvwxyz';
    const upper = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
    const digits = '0123456789';
    const punctuation = ' !"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~';
    return lower + upper + digits + punctuation;
  }
}
/**
 * FixedLength - helpers that force arrays and strings to a target length
 * by truncating or padding at the end.
 */
class FixedLength {
  /**
   * Return a new array of the target length.
   * @param arr Array to pad/truncate (never modified)
   * @param length Target length
   * @param padValue Value appended while the array is too short (default: 0)
   */
  static padOrTruncate(arr, length, padValue = 0) {
    if (arr.length > length) {
      // Too long: keep the leading elements
      return arr.slice(0, length);
    }
    // Exact or too short: copy, then append padding until long enough
    const out = Array.from(arr);
    for (let filled = out.length; filled < length; filled++) {
      out.push(padValue);
    }
    return out;
  }
  /**
   * Return a string of the target length.
   * @param str String to pad/truncate
   * @param length Target length
   * @param padChar Character repeated once per missing position (default: space)
   */
  static padOrTruncateString(str, length, padChar = ' ') {
    const deficit = length - str.length;
    if (deficit > 0) {
      return str + padChar.repeat(deficit);
    }
    if (deficit < 0) {
      return str.slice(0, length);
    }
    return str;
  }
}
/**
 * OneHot - conversions between indices and one-hot vectors.
 */
class OneHot {
  /**
   * Build a vector of zeros with a single 1 at the given position.
   * @param index Position of the 1
   * @param size Length of the vector
   * @throws {Error} When index is below 0 or not below size
   */
  static encode(index, size) {
    const outOfRange = index < 0 || index >= size;
    if (outOfRange) {
      throw new Error(`Index ${index} out of range [0, ${size})`);
    }
    const encoded = new Array(size).fill(0);
    encoded[index] = 1;
    return encoded;
  }
  /**
   * Find the position of the first element equal to 1.
   * @param vector One-hot vector
   * @throws {Error} When no element equals 1
   */
  static decode(vector) {
    const position = vector.indexOf(1);
    if (position < 0) {
      throw new Error('Invalid one-hot vector: no element equals 1');
    }
    return position;
  }
  /**
   * Encode each index as its own one-hot vector.
   * @param indices Array of indices
   * @param size Length of every vector
   */
  static encodeBatch(indices, size) {
    return indices.map((position) => this.encode(position, size));
  }
  /**
   * Decode each one-hot vector to its index.
   * @param vectors Array of one-hot vectors
   */
  static decodeBatch(vectors) {
    return vectors.map((candidate) => this.decode(candidate));
  }
}
/**
 * StringEncoder - turns strings into fixed-length numeric vectors and back.
 * Vectors hold either vocabulary indices or concatenated one-hot rows,
 * depending on config.useOneHot.
 */
class StringEncoder {
  /**
   * @param config Encoder options; this class reads `maxLength`, `useOneHot`
   *               (default: false, i.e. index vectors) and `charSet`
   */
  constructor(config) {
    this.config = Object.assign({ useOneHot: false }, config);
    this.vocab = new CharVocab();
  }
  /**
   * Build the vocabulary from training strings plus config.charSet
   * (or the default character set when none is configured).
   */
  buildVocab(samples) {
    const charSet = this.config.charSet || CharVocab.getDefaultCharSet();
    this.vocab.build(samples, charSet);
  }
  /**
   * Encode a string as a vector.
   * @param str String to encode
   * @returns maxLength indices, or their one-hot rows concatenated when useOneHot is set
   * @throws {Error} When the vocabulary has not been built
   */
  encode(str) {
    if (this.vocab.getSize() === 0) {
      throw new Error('Vocabulary not built. Call buildVocab() first.');
    }
    // Characters outside the vocabulary map to space when it is available,
    // otherwise to 0 (which decode() treats as padding)
    const fallback = this.vocab.hasChar(' ') ? this.vocab.getIndex(' ') : 0;
    const indices = [];
    for (const ch of str) {
      indices.push(this.vocab.hasChar(ch) ? this.vocab.getIndex(ch) : fallback);
    }
    const fixed = FixedLength.padOrTruncate(indices, this.config.maxLength, 0);
    if (!this.config.useOneHot) {
      return fixed;
    }
    const width = this.vocab.getSize();
    return fixed.flatMap((idx) => OneHot.encode(idx, width));
  }
  /**
   * Decode a vector back into a string.
   * Decoding stops at the first padding index, disallowed control character or
   * unknown index; trailing whitespace is removed from the result.
   * @param vector Encoded vector (indices, or concatenated one-hot rows)
   * @returns Decoded string
   * @throws {Error} When the vocabulary has not been built
   */
  decode(vector) {
    if (this.vocab.getSize() === 0) {
      throw new Error('Vocabulary not built. Call buildVocab() first.');
    }
    const vocabSize = this.vocab.getSize();
    const { maxLength, useOneHot } = this.config;
    let indices;
    if (useOneHot) {
      indices = [];
      for (let start = 0; start < vector.length; start += vocabSize) {
        const row = vector.slice(start, start + vocabSize);
        let decoded;
        try {
          decoded = OneHot.decode(row);
        }
        catch {
          // No exact 1 in this row: fall back to the position of the largest value
          decoded = row.indexOf(Math.max(...row));
        }
        indices.push(decoded);
      }
      indices = indices.slice(0, maxLength);
    }
    else {
      indices = vector.slice(0, maxLength);
    }
    let text = '';
    for (const raw of indices) {
      // Round and clamp so non-integer or out-of-range outputs still map to a character
      const idx = Math.max(0, Math.min(vocabSize - 1, Math.round(raw)));
      if (idx === 0) {
        break; // Index 0 is padding: end of string
      }
      try {
        const ch = this.vocab.getChar(idx);
        const isDisallowedControl = ch.charCodeAt(0) < 32 && ch !== ' ' && ch !== '\t' && ch !== '\n';
        if (ch === '\0' || isDisallowedControl) {
          break;
        }
        text += ch;
      }
      catch {
        // Index not in the vocabulary - stop decoding
        break;
      }
    }
    return text.trimEnd();
  }
  /**
   * Encode every string in an array.
   */
  encodeBatch(strings) {
    return strings.map((item) => this.encode(item));
  }
  /**
   * Decode every vector in an array.
   */
  decodeBatch(vectors) {
    return vectors.map((item) => this.decode(item));
  }
  /**
   * Length of the vectors produced by encode().
   */
  getVectorSize() {
    const { maxLength, useOneHot } = this.config;
    return useOneHot ? maxLength * this.vocab.getSize() : maxLength;
  }
  /**
   * Number of entries in the vocabulary.
   */
  getVocabSize() {
    return this.vocab.getSize();
  }
  /**
   * The underlying CharVocab instance.
   */
  getVocab() {
    return this.vocab;
  }
}
/**
* ELM utilities for OmegaSynth
* Helper functions for working with ELM models
*/
/**
 * Build a one-hot vector for a label index.
 * An index outside [0, numLabels) yields an all-zero vector instead of an error.
 * @param labelIndex Position to set to 1
 * @param numLabels Length of the vector
 */
function oneHotLabel(labelIndex, numLabels) {
  const encoded = new Array(numLabels).fill(0);
  const inRange = labelIndex >= 0 && labelIndex < numLabels;
  if (inRange) {
    encoded[labelIndex] = 1;
  }
  return encoded;
}
/**
 * Generate a noise vector with components in [-1, 1).
 * @param size Number of components
 * @param seed Optional seed; when given (0 included) a SeededRNG is used so the
 *             vector is reproducible, otherwise Math.random() supplies the values
 */
function generateNoiseVector(size, seed) {
  const seeded = seed === undefined ? null : new SeededRNG(seed);
  const nextUnit = () => (seeded === null ? Math.random() : seeded.next());
  const noise = [];
  for (let filled = 0; filled < size; filled++) {
    // Rescale a [0, 1) draw to [-1, 1)
    noise.push(nextUnit() * 2 - 1);
  }
  return noise;
}
/**
 * Seeded pseudo-random number generator (linear congruential generator).
 */
class SeededRNG {
  /**
   * @param seed Initial generator state
   */
  constructor(seed) {
    this.seed = seed;
  }
  /**
   * Advance the generator.
   * @returns The next pseudo-random value in [0, 1)
   */
  next() {
    const MODULUS = 2 ** 32;
    const raw = (this.seed * 1664525 + 1013904223) % MODULUS;
    // JavaScript's % keeps the sign of the dividend, so a negative seed would leave
    // a negative state and return a value below 0. Wrap it into [0, MODULUS).
    // Non-negative states are left untouched, so existing sequences are unchanged.
    this.seed = raw < 0 ? raw + MODULUS : raw;
    return this.seed / MODULUS;
  }
}
/**
* Label-specific validation and cleaning utilities
*/
/**
 * Validate and clean a generated value using the validator registered for its label.
 * @param label Label that selects the validator (see getValidatorForLabel)
 * @param value Generated string
 * @returns `{ isValid, cleaned, reason? }`; empty or missing values are rejected
 *          with reason 'Empty value' before any validator runs
 */
function validateForLabel(label, value) {
  const isEmpty = !value || value.length === 0;
  if (isEmpty) {
    return { isValid: false, cleaned: '', reason: 'Empty value' };
  }
  return getValidatorForLabel(label)(value);
}
/**
 * Pick the validator function for a label.
 * Labels are matched exactly; anything unrecognized gets the generic validator.
 */
function getValidatorForLabel(label) {
  if (label === 'first_name' || label === 'last_name') {
    return validateName;
  }
  if (label === 'phone_number') {
    return validatePhoneNumber;
  }
  if (label === 'email') {
    return validateEmail;
  }
  if (label === 'street_address') {
    return validateStreetAddress;
  }
  if (['city', 'state', 'country'].includes(label)) {
    return validateLocation;
  }
  // Free-text style labels share one validator
  const textLabels = ['company_name', 'job_title', 'product_name', 'credit_card_type', 'device_type'];
  if (textLabels.includes(label)) {
    return validateText;
  }
  if (label === 'color') {
    return validateColor;
  }
  if (label === 'uuid') {
    return validateUUID;
  }
  if (label === 'date') {
    return validateDate;
  }
  return validateGeneric;
}
/**
 * Validate name (first_name, last_name)
 * Rules: ASCII letters, optional internal hyphens/apostrophes/spaces, no digits,
 * at most 30 characters; obvious placeholders such as "Name97" are rejected.
 * @param value Generated name
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateName(value) {
  // Reject "Name" followed by numbers (e.g., "Name97", "name123") before cleaning
  if (/^name\d+$/i.test(value)) {
    return { isValid: false, cleaned: '', reason: 'Placeholder name with numbers' };
  }
  // Keep only letters, hyphens, apostrophes and whitespace (this also drops digits)
  let cleaned = value.replace(/[^a-zA-Z\-\'\s]/g, '');
  // Collapse runs of hyphens/apostrophes into a single hyphen
  cleaned = cleaned.replace(/[-']{2,}/g, '-');
  // Strip whitespace, hyphens and apostrophes from both ends in one pass, so edge
  // punctuation is removed even when it is surrounded by spaces (" -Anna- " -> "Anna")
  cleaned = cleaned.replace(/^[\s\-']+|[\s\-']+$/g, '');
  // Normalize internal whitespace
  cleaned = cleaned.replace(/\s+/g, ' ');
  // Must be at least 2 characters and contain at least one letter
  if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) {
    return { isValid: false, cleaned: '', reason: 'Too short or no letters' };
  }
  // Reject common placeholder names (case-insensitive) after cleaning
  const lowerCleaned = cleaned.toLowerCase();
  if (lowerCleaned === 'name' || lowerCleaned === 'firstname' || lowerCleaned === 'lastname' ||
    lowerCleaned === 'surname') {
    return { isValid: false, cleaned: '', reason: 'Placeholder name' };
  }
  // "name" followed by very short variations
  if (lowerCleaned.startsWith('name') && lowerCleaned.length <= 6) {
    return { isValid: false, cleaned: '', reason: 'Placeholder name' };
  }
  // Max length: cut, then drop any separator left dangling at the end
  if (cleaned.length > 30) {
    cleaned = cleaned.substring(0, 30).replace(/[\s\-']+$/, '');
  }
  return { isValid: true, cleaned };
}
/**
 * Validate phone number
 * Rules: only digits, dashes, parentheses, dots, plus signs and spaces are kept;
 * at least 7 digits are required; the result is cut to 25 characters.
 * @param value Generated phone number
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validatePhoneNumber(value) {
  const normalized = value
    .replace(/[^0-9\-\+\(\)\.\s]/g, '') // drop everything that is not a phone character
    .replace(/[-\.]{2,}/g, '-') // runs of dashes/dots become one dash
    .replace(/\s+/g, ' ')
    .trim();
  const digits = normalized.match(/\d/g);
  if (digits === null || digits.length < 7) {
    return { isValid: false, cleaned: '', reason: 'Too few digits' };
  }
  const cleaned = normalized.length > 25 ? normalized.substring(0, 25).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate email
 * Rules: exactly one @, a non-empty local part, and a domain of at least 3
 * characters containing a dot. Leading/trailing dots and hyphens are stripped from
 * both parts and the domain rules are re-checked afterwards. Results longer than 50
 * characters are shortened by trimming the local part, so the domain is never cut.
 * @param value Generated email address
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateEmail(value) {
  const MAX_LENGTH = 50;
  const invalid = (reason) => ({ isValid: false, cleaned: '', reason });
  // Keep valid email characters
  const stripped = value.replace(/[^a-zA-Z0-9@\.\-\_]/g, '');
  // Must contain @
  if (!stripped.includes('@')) {
    return invalid('Missing @ symbol');
  }
  const parts = stripped.split('@');
  if (parts.length !== 2) {
    return invalid('Invalid @ usage');
  }
  const [local, domain] = parts;
  // Local part must have at least 1 character
  if (!local || local.length === 0) {
    return invalid('Empty local part');
  }
  // Domain must have at least 3 characters (x.y)
  if (!domain || domain.length < 3) {
    return invalid('Invalid domain');
  }
  // Domain must contain at least one dot
  if (!domain.includes('.')) {
    return invalid('Domain missing dot');
  }
  // Remove leading/trailing dots and hyphens
  let cleanLocal = local.replace(/^[\.\-]+|[\.\-]+$/g, '');
  const cleanDomain = domain.replace(/^[\.\-]+|[\.\-]+$/g, '');
  if (!cleanLocal || !cleanDomain) {
    return invalid('Invalid format after cleaning');
  }
  // Stripping edge characters can invalidate the domain (e.g. "com." -> "com"),
  // so the domain rules are applied again to the cleaned value
  if (cleanDomain.length < 3) {
    return invalid('Invalid domain');
  }
  if (!cleanDomain.includes('.')) {
    return invalid('Domain missing dot');
  }
  // Max length: shorten the local part rather than cutting through "@domain"
  const room = MAX_LENGTH - cleanDomain.length - 1;
  if (room < 1) {
    return invalid('Too long');
  }
  if (cleanLocal.length > room) {
    // cleanLocal starts with a non-dot/hyphen character, so this never empties it
    cleanLocal = cleanLocal.substring(0, room).replace(/[\.\-]+$/, '');
  }
  return { isValid: true, cleaned: `${cleanLocal}@${cleanDomain}` };
}
/**
 * Validate street address
 * Rules: letters, digits, whitespace and the characters - # . , are kept;
 * at least 5 characters are required; the result is cut to 50 characters.
 * @param value Generated street address
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateStreetAddress(value) {
  const normalized = value
    .replace(/[^a-zA-Z0-9\s\-\#\.\,]/g, '')
    .trim()
    .replace(/\s+/g, ' ');
  if (normalized.length < 5) {
    return { isValid: false, cleaned: '', reason: 'Too short' };
  }
  const cleaned = normalized.length > 50 ? normalized.substring(0, 50).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate location (city, state, country)
 * Rules: letters, whitespace, hyphens and apostrophes are kept; at least 2
 * characters including a letter are required; the result is cut to 30 characters.
 * @param value Generated location name
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateLocation(value) {
  const normalized = value
    .replace(/[^a-zA-Z\s\-\']/g, '')
    .trim()
    .replace(/\s+/g, ' ');
  const hasLetter = /[a-zA-Z]/.test(normalized);
  if (normalized.length < 2 || !hasLetter) {
    return { isValid: false, cleaned: '', reason: 'Too short or no letters' };
  }
  const cleaned = normalized.length > 30 ? normalized.substring(0, 30).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate text (company_name, job_title, product_name and similar labels)
 * Rules: letters, digits, whitespace and the characters - ' . , are kept;
 * at least 2 characters are required; the result is cut to 50 characters.
 * @param value Generated text
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateText(value) {
  const normalized = value
    .replace(/[^a-zA-Z0-9\s\-\'\.\,]/g, '')
    .trim()
    .replace(/\s+/g, ' ');
  if (normalized.length < 2) {
    return { isValid: false, cleaned: '', reason: 'Too short' };
  }
  const cleaned = normalized.length > 50 ? normalized.substring(0, 50).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate color
 * Rules: only letters and whitespace are kept; at least 3 characters are
 * required; the result is cut to 20 characters.
 * @param value Generated color name
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateColor(value) {
  const normalized = value
    .replace(/[^a-zA-Z\s]/g, '')
    .trim()
    .replace(/\s+/g, ' ');
  if (normalized.length < 3) {
    return { isValid: false, cleaned: '', reason: 'Too short' };
  }
  const cleaned = normalized.length > 20 ? normalized.substring(0, 20).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate UUID
 * Rules: at least 32 hex digits must remain after removing every other character;
 * the first 32 are regrouped as 8-4-4-4-12 and any further digits are dropped.
 * @param value Generated UUID
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateUUID(value) {
  // Keep hex characters and dashes, then set the dashes aside
  const hexDigits = value.replace(/[^0-9a-fA-F\-]/g, '').replace(/-/g, '');
  if (hexDigits.length < 32) {
    return { isValid: false, cleaned: '', reason: 'Too few hex characters' };
  }
  const groupBounds = [[0, 8], [8, 12], [12, 16], [16, 20], [20, 32]];
  const cleaned = groupBounds
    .map(([from, to]) => hexDigits.substring(from, to))
    .join('-');
  return { isValid: true, cleaned };
}
/**
 * Validate date
 * Rules: only digits, dashes and slashes are kept; at least 8 digits are
 * required (e.g. YYYYMMDD); the result is cut to 20 characters.
 * No calendar validation is performed.
 * @param value Generated date string
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateDate(value) {
  const stripped = value.replace(/[^0-9\-\/]/g, '');
  const digits = stripped.match(/\d/g);
  if (digits === null || digits.length < 8) {
    return { isValid: false, cleaned: '', reason: 'Too few digits' };
  }
  const cleaned = stripped.length > 20 ? stripped.substring(0, 20).trim() : stripped;
  return { isValid: true, cleaned };
}
/**
 * Generic validator for labels without a dedicated rule set.
 * Removes ASCII control characters, trims, and collapses whitespace runs;
 * the value is valid as long as something is left.
 * @param value Generated string
 * @returns `{ isValid: true, cleaned }` or `{ isValid: false, cleaned: '', reason }`
 */
function validateGeneric(value) {
  const cleaned = value
    .replace(/[\x00-\x1F\x7F]/g, '')
    .trim()
    .replace(/\s+/g, ' ');
  if (cleaned.length >= 1) {
    return { isValid: true, cleaned };
  }
  return { isValid: false, cleaned: '', reason: 'Empty after cleaning' };
}
/**
 * PatternCorrector - post-processing based on patterns learned from training data.
 * For every label it records the examples, their common prefixes and suffixes,
 * relative character frequencies and lengths, then uses them to clean up and
 * score generated strings.
 */
class PatternCorrector {
  constructor() {
    // label -> learned pattern (see learnPattern for its shape)
    this.patterns = new Map();
  }
  /**
   * Learn patterns from training data
   * @param samples Iterable of `{ label, value }` objects; values are grouped per label
   */
  learnPatterns(samples) {
    const byLabel = new Map();
    for (const sample of samples) {
      if (!byLabel.has(sample.label)) {
        byLabel.set(sample.label, []);
      }
      byLabel.get(sample.label).push(sample.value);
    }
    for (const [label, values] of byLabel.entries()) {
      this.learnPattern(label, values);
    }
  }
  /**
   * Learn pattern for a specific label, replacing any earlier pattern for it.
   * Does nothing when there are no examples.
   * @param label Label the examples belong to
   * @param examples Array of example strings
   */
  learnPattern(label, examples) {
    if (examples.length === 0)
      return;
    const prefixCounts = new Map();
    const suffixCounts = new Map();
    const charFreq = new Map();
    const lengths = [];
    const bump = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);
    for (const example of examples) {
      lengths.push(example.length);
      // Prefixes and suffixes of 1-3 characters
      const maxAffix = Math.min(3, example.length);
      for (let len = 1; len <= maxAffix; len++) {
        bump(prefixCounts, example.substring(0, len));
        bump(suffixCounts, example.substring(example.length - len));
      }
      for (const char of example) {
        bump(charFreq, char);
      }
    }
    // An affix is "common" when it appears in more than 10% of the examples;
    // the 15 most frequent are kept, most frequent first
    const pickCommon = (counts) => Array.from(counts.entries())
      .filter(([, count]) => count / examples.length > 0.1)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 15)
      .map(([affix]) => affix);
    const commonPrefixes = pickCommon(prefixCounts);
    const commonSuffixes = pickCommon(suffixCounts);
    // Turn character counts into relative frequencies
    const totalChars = Array.from(charFreq.values()).reduce((a, b) => a + b, 0);
    for (const [char, count] of charFreq.entries()) {
      charFreq.set(char, count / totalChars);
    }
    this.patterns.set(label, {
      label,
      examples,
      commonPrefixes,
      commonSuffixes,
      charFrequency: charFreq,
      lengthDistribution: lengths,
    });
  }
  /**
   * Correct a generated string using learned patterns.
   * Known examples and labels without a pattern are returned unchanged. Otherwise
   * unlikely characters are removed and over-long strings are shortened.
   * @param generated Generated string
   * @param label Label whose pattern should be applied
   * @returns The corrected string
   */
  correct(generated, label) {
    const pattern = this.patterns.get(label);
    if (!pattern) {
      return generated; // No pattern learned, return as-is
    }
    if (pattern.examples.includes(generated)) {
      return generated; // Exact match with a training example
    }
    // Keep a character when it makes up more than 0.5% of the training characters,
    // or when it is alphanumeric/whitespace
    let cleaned = '';
    for (const char of generated) {
      const freq = pattern.charFrequency.get(char) || 0;
      if (freq > 0.005 || /[a-zA-Z0-9\s]/.test(char)) {
        cleaned += char;
      }
    }
    // If filtering removed everything, fall back to the unfiltered string
    let corrected = cleaned.length > 0 ? cleaned : generated;
    // Longest training example. Folded instead of Math.max(...lengths): spreading
    // a very large array into call arguments throws a RangeError.
    const maxLength = pattern.lengthDistribution.reduce((longest, len) => Math.max(longest, len), -Infinity);
    // Truncate if too long
    if (corrected.length > maxLength * 1.5) {
      corrected = corrected.substring(0, Math.floor(maxLength * 1.2));
    }
    return corrected;
  }
  /**
   * Score how well a generated string matches the pattern
   * @param generated Generated string
   * @param label Label whose pattern should be used
   * @returns 1.0 for an exact training example, 0.5 when no pattern is known for
   *          the label, otherwise the weighted sum described below divided by 4
   */
  score(generated, label) {
    const pattern = this.patterns.get(label);
    if (!pattern) {
      return 0.5; // Unknown pattern, neutral score
    }
    if (pattern.examples.includes(generated)) {
      return 1.0; // Perfect match
    }
    // NOTE(review): the four weights below already sum to 1.0, yet the total is
    // also divided by the number of factors, so non-exact scores cannot exceed 0.25.
    // Kept as-is because callers may rely on the current scale.
    const FACTOR_COUNT = 4;
    const lowered = generated.toLowerCase();
    let score = 0;
    // Prefix match (weight 0.3)
    const prefixMatch = pattern.commonPrefixes.some((prefix) => lowered.startsWith(prefix.toLowerCase()));
    score += prefixMatch ? 0.3 : 0;
    // Suffix match (weight 0.2)
    const suffixMatch = pattern.commonSuffixes.some((suffix) => lowered.endsWith(suffix.toLowerCase()));
    score += suffixMatch ? 0.2 : 0;
    // Mean training frequency of the generated characters (weight 0.3)
    let charScore = 0;
    let charCount = 0;
    for (const char of generated) {
      charScore += pattern.charFrequency.get(char) || 0;
      charCount++;
    }
    score += (charCount > 0 ? charScore / charCount : 0) * 0.3;
    // Closeness to the average training length (weight 0.2)
    const avgLength = pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length;
    const lengthDiff = Math.abs(generated.length - avgLength) / avgLength;
    score += Math.max(0, 1 - lengthDiff) * 0.2;
    return score / FACTOR_COUNT;
  }
  /**
   * Get pattern for a label
   * @returns The learned pattern, or undefined when nothing was learned for the label
   */
  getPattern(label) {
    return this.patterns.get(label);
  }
}
/**
* SequenceContext - Add sequence context to generation
* Uses previous characters to inform next character prediction
*/
class SequenceContext {
    /**
     * @param n Size of the n-grams; the first n-1 characters form the context
     *          and the n-th character is the one being predicted.
     */
    constructor(n = 3) {
        this.ngramPatterns = new Map();
        this.n = n;
    }
    /**
     * Rebuild the context -> (next character -> count) table from scratch.
     * Samples shorter than n contribute nothing.
     */
    learnPatterns(samples) {
        this.ngramPatterns.clear();
        const contextLength = this.n - 1;
        for (const text of samples) {
            const lastStart = text.length - this.n;
            for (let start = 0; start <= lastStart; start++) {
                const end = start + contextLength;
                const key = text.substring(start, end); // Context (n-1 chars)
                const follower = text[end]; // Character observed after the context
                let followers = this.ngramPatterns.get(key);
                if (followers === undefined) {
                    followers = new Map();
                    this.ngramPatterns.set(key, followers);
                }
                followers.set(follower, (followers.get(follower) ?? 0) + 1);
            }
        }
    }
    /**
     * Probability of each possible next character after `context`.
     * Only the trailing n-1 characters of `context` are considered; a shorter
     * context is looked up as-is. Returns an empty Map for an unseen context.
     */
    getNextCharProbs(context) {
        const contextLength = this.n - 1;
        let key = context;
        if (context.length >= contextLength) {
            key = context.substring(context.length - contextLength);
        }
        const counts = this.ngramPatterns.get(key);
        const distribution = new Map();
        if (!counts || counts.size === 0) {
            return distribution;
        }
        // Normalise the raw counts so they sum to 1
        let total = 0;
        for (const count of counts.values()) {
            total += count;
        }
        counts.forEach((count, char) => {
            distribution.set(char, count / total);
        });
        return distribution;
    }
    /**
     * Most probable next character for `context`, or null when the context
     * was never observed. Ties go to the character that was learned first.
     */
    suggestNextChar(context) {
        const distribution = this.getNextCharProbs(context);
        if (distribution.size === 0) {
            return null;
        }
        let winner = '';
        let winnerProb = 0;
        distribution.forEach((prob, char) => {
            if (prob > winnerProb) {
                winnerProb = prob;
                winner = char;
            }
        });
        return winner;
    }
    /**
     * Probability of `char` following `context`; 0 when the pair was never seen.
     */
    scoreChar(context, char) {
        return this.getNextCharProbs(context).get(char) ?? 0;
    }
}
/**
* ELMGenerator - Label-conditioned string generator using ELM
* Trains an ELM to generate encoded strings based on labels + noise
*/
class ELMGenerator {
constructor(config) {
this.elm = null;
this.labels = [];
this.patternCorrector = null;
this.sequenceContext = null;
// Initialize and require license before allowing generator use
initializeLicense();
requireLicense();
this.config = {
hiddenUnits: 128,
activation: 'relu',
ridgeLambda: 0.01,
noiseSize: 32,
useOneHot: false, // Default to false for memory efficiency (can enable for better accuracy)
useClassification: false, // Default to regression for compatibility
usePatternCorrection: true,
...config,
};
this.noiseSize = this.config.noiseSize;
this.useClassification = this.config.useClassification;
this.encoder = new StringEncoder({
maxLength: config.maxLength,
useOneHot: this.config.useOneHot ?? false, // Default to false for memory efficiency
});
if (this.config.usePatternCorrection) {
this.patternCorrector = new PatternCorrector();
}
// Always use sequence context for better generation
this.sequenceContext = new SequenceContext(3); // 3-grams
}
/**
* Train the ELM generator on labeled samples
*/
train(samples) {
if (samples.length === 0) {
throw new Error('Cannot train on empty dataset');
}
// Extract unique labels
const uniqueLabels = Array.from(new Set(samples.map(s => s.label)));
this.labels = uniqueLabels;
// Extract all values for vocabulary building
const allValues = samples.map(s => s.value);
this.encoder.buildVocab(allValues);
// Learn patterns if pattern correction is enabled
if (this.patternCorrector) {
this.patternCorrector.learnPatterns(samples);
}
// Learn sequence context
if (this.sequenceContext) {
this.sequenceContext.learnPatterns(allValues);
}
// Build training data
const X = [];
const Y = [];
for (const sample of samples) {
const labelIndex = this.labels.indexOf(sample.label);
if (labelIndex === -1) {
continue;
}
// Input: concat(oneHot(label), noiseVector)
const labelOneHot = oneHotLabel(labelIndex, this.labels.length);
const noise = generateNoiseVector(this.noiseSize, this.config.seed);
const inputVector = [...labelOneHot, ...noise];
X.push(inputVector);
// Target: encoded(value)
const encodedValue = this.encoder.encode(sample.value);
Y.push(encodedValue);
}
if (X.length === 0) {
throw new Error('No valid training samples after processing');
}
// Create ELM config
const inputSize = this.labels.length + this.noiseSize;
this.encoder.getVectorSize();
const elmConfig = {
useTokenizer: false, // Numeric mode
inputSize: inputSize,
categories: this.useClassification ? [] : [], // For classification, we'll handle it differently
hiddenUnits: this.config.hiddenUnits,
activation: this.config.activation,
// Use lower regularization for better pattern learning
ridgeLambda: this.config.ridgeLambda * 0.1, // Reduce regularization
task: this.useClassification ? 'classification' : 'regression',
};
// Create and train ELM - resolve constructor robustly across CJS/ESM shapes
// Replace dynamic require with direct constructor
this.elm = new ELM(elmConfig);
this.elm.trainFromData(X, Y);
}
/**
* Generate a string for a given label
* @param label Label to generate for
* @param noiseSeed Optional seed for noise generation (for deterministic output)
*/
generate(label, noiseSeed) {
if (!this.elm) {
throw new Error('Model not trained. Call train() first.');
}
const labelIndex = this.labels.indexOf(label);
if (labelIndex === -1) {
throw new Error(`Label '${label}' not found in training data`);
}
// Create input: concat(oneHot(label), noiseVector)
const labelOneHot = oneHotLabel(labelIndex, this.labels.length);
const noise = generateNoiseVector(this.noiseSize, noiseSeed !== undefined ? noiseSeed : this.config.seed);
const inputVector = [...labelOneHot, ...noise];
// Predict based on mode
let decoded;
if (this.useClassification && this.config.useOneHot && typeof this.elm.predictProbaFromVector === 'function') {
// Classification mode with one-hot: use probabilities
const vocabSize = this.encoder.getVocabSize();
const maxLength = this.config.maxLength;
// Get probabilities for each position
const probs = this.elm.predictProbaFromVector(inputVector);
// Reshape to [maxLength, vocabSize] and use argmax
const indices = [];
for (let pos = 0; pos < maxLength; pos++) {
const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize);
const maxIdx = posProbs.indexOf(Math.max(...posProbs));
indices.push(maxIdx);
}
decoded = this.encoder.decode(indices);
}
else {
// Regression mode: use logits and round
const prediction = this.elm.predictLogitsFromVector(inputVector);
// Convert logits to indices with proper quantization
const vocabSize = this.encoder.getVocabSize();
const indices = prediction.map(val => {
// Clamp value to reasonable range first (prevent extreme values)
const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val));
// Round to nearest integer
const rounded = Math.round(clamped);
// Clamp to valid vocabulary range [0, vocabSize-1]
const idx = Math.max(0, Math.min(vocabSize - 1, rounded));
return idx;
});
decoded = this.encoder.decode(indices);
}
// Apply pattern correction if enabled
let corrected = decoded;
if (this.patternCorrector) {
corrected = this.patternCorrector.correct(decoded, label);
}
// Apply sequence context refinement
if (this.sequenceContext && corrected.length > 0) {
corrected = this.refineWithSequenceContext(corrected, label);
}
// Validate and clean the decoded string using label-specific rules
const validation = validateForLabel(label, corrected);
// If validation fails, try to generate again with different noise (up to 3 attempts)
if (!validation.isValid) {
for (let attempt = 0; attempt < 3; attempt++) {
const baseSeed = noiseSeed !== undefined ? noiseSeed : (this.config.seed ?? Date.now());
const newNoise = generateNoiseVector(this.noiseSize, baseSeed + attempt + 1000);
const newInputVector = [...labelOneHot, ...newNoise];
let newDecoded;
if (this.useClassification && this.config.useOneHot && typeof this.elm.predictProbaFromVector === 'function') {
const vocabSize = this.encoder.getVocabSize();
const maxLength = this.config.maxLength;
const probs = this.elm.predictProbaFromVector(newInputVector);
const newIndices = [];
for (let pos = 0; pos < maxLength; pos++) {
const posProbs = probs.slice(pos * vocabSize, (pos +