UNPKG

@astermind/astermind-synthetic-data

Version:

OmegaSynth - Label-Conditioned Synthetic Data Generator for AsterMind ELM/KELM Pipelines

1,635 lines (1,634 loc) 82.1 kB
import * as path from "path"; import * as fs from "fs"; import { fileURLToPath } from "node:url"; import { ELM } from "@astermind/astermind-elm"; class SyntheticFieldStore { constructor() { this.store = /* @__PURE__ */ new Map(); } /** * Insert a labeled sample into the store */ insert(sample) { if (!this.store.has(sample.label)) { this.store.set(sample.label, []); } this.store.get(sample.label).push(sample.value); } /** * Insert multiple samples at once */ insertMany(samples) { for (const sample of samples) { this.insert(sample); } } /** * Get all values for a given label */ get(label) { return this.store.get(label) || []; } /** * Sample k values uniformly at random for a given label */ sample(label, k = 1) { const values = this.get(label); if (values.length === 0) { return []; } const result = []; const indices = /* @__PURE__ */ new Set(); while (result.length < k && indices.size < values.length) { const idx = Math.floor(Math.random() * values.length); if (!indices.has(idx)) { indices.add(idx); result.push(values[idx]); } } return result; } /** * Check if a label exists in the store */ hasLabel(label) { return this.store.has(label); } /** * Get all labels in the store */ getLabels() { return Array.from(this.store.keys()); } /** * Get the count of samples for a label */ count(label) { return this.get(label).length; } /** * Clear all data */ clear() { this.store.clear(); } } let SeededRNG$1 = class SeededRNG { constructor(seed = Date.now()) { this.seed = seed; } next() { this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32; return this.seed / 2 ** 32; } setSeed(seed) { this.seed = seed; } }; class RetrievalGenerator { constructor(seed) { this.store = new SyntheticFieldStore(); this.seed = seed; this.rng = new SeededRNG$1(seed); } /** * Ingest labeled samples into the store */ ingest(samples) { this.store.insertMany(samples); } /** * Sample k values for a given label * Returns empty array if label doesn't exist or has no samples */ sample(label, k = 1) { const 
values = this.store.get(label); if (values.length === 0) { return []; } const result = []; const availableIndices = Array.from({ length: values.length }, (_, i) => i); const sampleCount = Math.min(k, values.length); for (let i = 0; i < sampleCount; i++) { const randomIndex = Math.floor(this.rng.next() * availableIndices.length); const selectedIndex = availableIndices.splice(randomIndex, 1)[0]; result.push(values[selectedIndex]); } return result; } /** * Get a single sample (convenience method) */ sampleOne(label) { const samples = this.sample(label, 1); return samples.length > 0 ? samples[0] : null; } /** * Check if a label has samples */ hasLabel(label) { return this.store.hasLabel(label) && this.store.count(label) > 0; } /** * Get all available labels */ getLabels() { return this.store.getLabels(); } /** * Reset the generator (clears store and optionally resets seed) */ reset(seed) { this.store.clear(); if (seed !== void 0) { this.seed = seed; this.rng.setSeed(seed); } } } class CharVocab { constructor() { this.charToIndex = /* @__PURE__ */ new Map(); this.indexToChar = /* @__PURE__ */ new Map(); this.size = 0; } /** * Build vocabulary from a set of strings * @param samples Array of strings to build vocabulary from * @param charSet Optional predefined character set (e.g., alphanumeric + punctuation) */ build(samples, charSet) { const chars = /* @__PURE__ */ new Set(); chars.add("\0"); if (charSet) { for (const char of charSet) { if (char !== "\0") { chars.add(char); } } } for (const sample of samples) { for (const char of sample) { if (char !== "\0") { chars.add(char); } } } const sortedChars = Array.from(chars).sort((a, b) => { if (a === "\0") return -1; if (b === "\0") return 1; return a.localeCompare(b); }); this.charToIndex.clear(); this.indexToChar.clear(); this.size = sortedChars.length; sortedChars.forEach((char, index) => { this.charToIndex.set(char, index); this.indexToChar.set(index, char); }); } /** * Get index for a character */ getIndex(char) { const 
index = this.charToIndex.get(char); if (index === void 0) { throw new Error(`Character '${char}' not in vocabulary`); } return index; } /** * Get character for an index */ getChar(index) { const char = this.indexToChar.get(index); if (char === void 0) { throw new Error(`Index ${index} not in vocabulary`); } return char; } /** * Check if character exists in vocabulary */ hasChar(char) { return this.charToIndex.has(char); } /** * Get vocabulary size */ getSize() { return this.size; } /** * Get all characters in vocabulary */ getChars() { return Array.from(this.charToIndex.keys()).sort(); } /** * Get default character set (alphanumeric + common punctuation) */ static getDefaultCharSet() { return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"; } } class FixedLength { /** * Pad or truncate an array to a fixed length * @param arr Array to pad/truncate * @param length Target length * @param padValue Value to use for padding (default: 0) */ static padOrTruncate(arr, length, padValue = 0) { if (arr.length === length) { return [...arr]; } if (arr.length > length) { return arr.slice(0, length); } const result = [...arr]; while (result.length < length) { result.push(padValue); } return result; } /** * Pad or truncate a string to a fixed length * @param str String to pad/truncate * @param length Target length * @param padChar Character to use for padding (default: space) */ static padOrTruncateString(str, length, padChar = " ") { if (str.length === length) { return str; } if (str.length > length) { return str.slice(0, length); } return str + padChar.repeat(length - str.length); } } class OneHot { /** * Encode an index as a one-hot vector * @param index Index to encode * @param size Size of the one-hot vector */ static encode(index, size) { if (index < 0 || index >= size) { throw new Error(`Index ${index} out of range [0, ${size})`); } const vector = new Array(size).fill(0); vector[index] = 1; return vector; } /** * Decode a 
one-hot vector to an index * @param vector One-hot vector */ static decode(vector) { const index = vector.indexOf(1); if (index === -1) { throw new Error("Invalid one-hot vector: no element equals 1"); } return index; } /** * Encode multiple indices as one-hot vectors * @param indices Array of indices * @param size Size of each one-hot vector */ static encodeBatch(indices, size) { return indices.map((idx) => this.encode(idx, size)); } /** * Decode multiple one-hot vectors to indices * @param vectors Array of one-hot vectors */ static decodeBatch(vectors) { return vectors.map((vec) => this.decode(vec)); } } class StringEncoder { constructor(config) { this.config = { useOneHot: false, // Default to index-based for efficiency ...config }; this.vocab = new CharVocab(); } /** * Build vocabulary from training samples */ buildVocab(samples) { this.vocab.build(samples, this.config.charSet || CharVocab.getDefaultCharSet()); } /** * Encode a string to a vector * @param str String to encode * @returns Encoded vector (either indices or one-hot) */ encode(str) { if (this.vocab.getSize() === 0) { throw new Error("Vocabulary not built. Call buildVocab() first."); } const indices = []; for (const char of str) { if (this.vocab.hasChar(char)) { indices.push(this.vocab.getIndex(char)); } else { if (this.vocab.hasChar(" ")) { indices.push(this.vocab.getIndex(" ")); } else { indices.push(0); } } } const padded = FixedLength.padOrTruncate( indices, this.config.maxLength, 0 ); if (this.config.useOneHot) { const vocabSize = this.vocab.getSize(); const oneHotVectors = []; for (const idx of padded) { oneHotVectors.push(...OneHot.encode(idx, vocabSize)); } return oneHotVectors; } return padded; } /** * Decode a vector back to a string * @param vector Encoded vector * @returns Decoded string */ decode(vector) { if (this.vocab.getSize() === 0) { throw new Error("Vocabulary not built. 
Call buildVocab() first."); } let indices; if (this.config.useOneHot) { const vocabSize2 = this.vocab.getSize(); indices = []; for (let i = 0; i < vector.length; i += vocabSize2) { const oneHot = vector.slice(i, i + vocabSize2); try { indices.push(OneHot.decode(oneHot)); } catch { const maxIdx = oneHot.indexOf(Math.max(...oneHot)); indices.push(maxIdx); } } indices = indices.slice(0, this.config.maxLength); } else { indices = vector.slice(0, this.config.maxLength); } let result = ""; const vocabSize = this.vocab.getSize(); const paddingIdx = 0; for (const idx of indices) { const clampedIdx = Math.max(0, Math.min(vocabSize - 1, Math.round(idx))); if (clampedIdx === paddingIdx) { break; } try { const char = this.vocab.getChar(clampedIdx); if (char === "\0" || char.charCodeAt(0) < 32 && char !== " " && char !== " " && char !== "\n") { break; } result += char; } catch { break; } } return result.trimEnd(); } /** * Encode multiple strings */ encodeBatch(strings) { return strings.map((str) => this.encode(str)); } /** * Decode multiple vectors */ decodeBatch(vectors) { return vectors.map((vec) => this.decode(vec)); } /** * Get the output vector size */ getVectorSize() { if (this.config.useOneHot) { return this.config.maxLength * this.vocab.getSize(); } return this.config.maxLength; } /** * Get vocabulary size */ getVocabSize() { return this.vocab.getSize(); } /** * Get vocabulary */ getVocab() { return this.vocab; } } function oneHotLabel(labelIndex, numLabels) { const vector = new Array(numLabels).fill(0); if (labelIndex >= 0 && labelIndex < numLabels) { vector[labelIndex] = 1; } return vector; } function generateNoiseVector(size, seed) { const rng = seed !== void 0 ? new SeededRNG2(seed) : null; const noise = []; for (let i = 0; i < size; i++) { const value = rng ? 
rng.next() : Math.random(); noise.push(value * 2 - 1); } return noise; } class SeededRNG2 { constructor(seed) { this.seed = seed; } next() { this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32; return this.seed / 2 ** 32; } } function validateForLabel(label, value) { if (!value || value.length === 0) { return { isValid: false, cleaned: "", reason: "Empty value" }; } const validator = getValidatorForLabel(label); return validator(value); } function getValidatorForLabel(label) { switch (label) { case "first_name": case "last_name": return validateName; case "phone_number": return validatePhoneNumber; case "email": return validateEmail; case "street_address": return validateStreetAddress; case "city": case "state": case "country": return validateLocation; case "company_name": case "job_title": case "product_name": return validateText; case "color": return validateColor; case "uuid": return validateUUID; case "date": return validateDate; case "credit_card_type": case "device_type": return validateText; default: return validateGeneric; } } function validateName(value) { value.toLowerCase(); if (/^name\d+$/i.test(value)) { return { isValid: false, cleaned: "", reason: "Placeholder name with numbers" }; } let cleaned = value.replace(/[^a-zA-Z\-\'\s]/g, ""); cleaned = cleaned.replace(/[0-9]/g, ""); cleaned = cleaned.replace(/[-']{2,}/g, "-"); cleaned = cleaned.replace(/^[-']+|[-']+$/g, ""); cleaned = cleaned.trim().replace(/\s+/g, " "); if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) { return { isValid: false, cleaned: "", reason: "Too short or no letters" }; } const lowerCleaned = cleaned.toLowerCase(); if (lowerCleaned === "name" || lowerCleaned === "firstname" || lowerCleaned === "lastname" || lowerCleaned === "surname") { return { isValid: false, cleaned: "", reason: "Placeholder name" }; } if (lowerCleaned.startsWith("name") && lowerCleaned.length <= 6) { return { isValid: false, cleaned: "", reason: "Placeholder name" }; } if (cleaned.length > 30) { cleaned 
= cleaned.substring(0, 30).trim(); } return { isValid: true, cleaned }; } function validatePhoneNumber(value) { let cleaned = value.replace(/[^0-9\-\+\(\)\.\s]/g, ""); cleaned = cleaned.replace(/[-\.]{2,}/g, "-"); cleaned = cleaned.replace(/\s+/g, " "); cleaned = cleaned.trim(); const digitCount = (cleaned.match(/\d/g) || []).length; if (digitCount < 7) { return { isValid: false, cleaned: "", reason: "Too few digits" }; } if (cleaned.length > 25) { cleaned = cleaned.substring(0, 25).trim(); } return { isValid: true, cleaned }; } function validateEmail(value) { let cleaned = value.replace(/[^a-zA-Z0-9@\.\-\_]/g, ""); if (!cleaned.includes("@")) { return { isValid: false, cleaned: "", reason: "Missing @ symbol" }; } const parts = cleaned.split("@"); if (parts.length !== 2) { return { isValid: false, cleaned: "", reason: "Invalid @ usage" }; } const [local, domain] = parts; if (!local || local.length === 0) { return { isValid: false, cleaned: "", reason: "Empty local part" }; } if (!domain || domain.length < 3) { return { isValid: false, cleaned: "", reason: "Invalid domain" }; } if (!domain.includes(".")) { return { isValid: false, cleaned: "", reason: "Domain missing dot" }; } const cleanLocal = local.replace(/^[\.\-]+|[\.\-]+$/g, ""); const cleanDomain = domain.replace(/^[\.\-]+|[\.\-]+$/g, ""); if (!cleanLocal || !cleanDomain) { return { isValid: false, cleaned: "", reason: "Invalid format after cleaning" }; } cleaned = `${cleanLocal}@${cleanDomain}`; if (cleaned.length > 50) { cleaned = cleaned.substring(0, 50); } return { isValid: true, cleaned }; } function validateStreetAddress(value) { let cleaned = value.replace(/[^a-zA-Z0-9\s\-\#\.\,]/g, ""); cleaned = cleaned.trim().replace(/\s+/g, " "); if (cleaned.length < 5) { return { isValid: false, cleaned: "", reason: "Too short" }; } if (cleaned.length > 50) { cleaned = cleaned.substring(0, 50).trim(); } return { isValid: true, cleaned }; } function validateLocation(value) { let cleaned = 
value.replace(/[^a-zA-Z\s\-\']/g, ""); cleaned = cleaned.trim().replace(/\s+/g, " "); if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) { return { isValid: false, cleaned: "", reason: "Too short or no letters" }; } if (cleaned.length > 30) { cleaned = cleaned.substring(0, 30).trim(); } return { isValid: true, cleaned }; } function validateText(value) { let cleaned = value.replace(/[^a-zA-Z0-9\s\-\'\.\,]/g, ""); cleaned = cleaned.trim().replace(/\s+/g, " "); if (cleaned.length < 2) { return { isValid: false, cleaned: "", reason: "Too short" }; } if (cleaned.length > 50) { cleaned = cleaned.substring(0, 50).trim(); } return { isValid: true, cleaned }; } function validateColor(value) { let cleaned = value.replace(/[^a-zA-Z\s]/g, ""); cleaned = cleaned.trim().replace(/\s+/g, " "); if (cleaned.length < 3) { return { isValid: false, cleaned: "", reason: "Too short" }; } if (cleaned.length > 20) { cleaned = cleaned.substring(0, 20).trim(); } return { isValid: true, cleaned }; } function validateUUID(value) { let cleaned = value.replace(/[^0-9a-fA-F\-]/g, ""); const hexOnly = cleaned.replace(/-/g, ""); if (hexOnly.length >= 32) { const formatted = [ hexOnly.substring(0, 8), hexOnly.substring(8, 12), hexOnly.substring(12, 16), hexOnly.substring(16, 20), hexOnly.substring(20, 32) ].join("-"); cleaned = formatted; } const hexCount = cleaned.replace(/-/g, "").length; if (hexCount < 32) { return { isValid: false, cleaned: "", reason: "Too few hex characters" }; } return { isValid: true, cleaned }; } function validateDate(value) { let cleaned = value.replace(/[^0-9\-\/]/g, ""); const digitCount = (cleaned.match(/\d/g) || []).length; if (digitCount < 8) { return { isValid: false, cleaned: "", reason: "Too few digits" }; } if (cleaned.length > 20) { cleaned = cleaned.substring(0, 20).trim(); } return { isValid: true, cleaned }; } function validateGeneric(value) { let cleaned = value.replace(/[\x00-\x1F\x7F]/g, ""); cleaned = cleaned.trim().replace(/\s+/g, " "); if 
(cleaned.length < 1) { return { isValid: false, cleaned: "", reason: "Empty after cleaning" }; } return { isValid: true, cleaned }; } class PatternCorrector { constructor() { this.patterns = /* @__PURE__ */ new Map(); } /** * Learn patterns from training data */ learnPatterns(samples) { const byLabel = /* @__PURE__ */ new Map(); for (const sample of samples) { if (!byLabel.has(sample.label)) { byLabel.set(sample.label, []); } byLabel.get(sample.label).push(sample.value); } for (const [label, values] of byLabel.entries()) { this.learnPattern(label, values); } } /** * Learn pattern for a specific label */ learnPattern(label, examples) { if (examples.length === 0) return; const prefixCounts = /* @__PURE__ */ new Map(); const suffixCounts = /* @__PURE__ */ new Map(); const charFreq = /* @__PURE__ */ new Map(); const lengths = []; for (const example of examples) { lengths.push(example.length); for (let len = 1; len <= Math.min(3, example.length); len++) { const prefix = example.substring(0, len); prefixCounts.set(prefix, (prefixCounts.get(prefix) || 0) + 1); } for (let len = 1; len <= Math.min(3, example.length); len++) { const suffix = example.substring(example.length - len); suffixCounts.set(suffix, (suffixCounts.get(suffix) || 0) + 1); } for (const char of example) { charFreq.set(char, (charFreq.get(char) || 0) + 1); } } const commonPrefixes = Array.from(prefixCounts.entries()).filter(([_, count]) => count / examples.length > 0.1).sort((a, b) => b[1] - a[1]).slice(0, 15).map(([prefix]) => prefix); const commonSuffixes = Array.from(suffixCounts.entries()).filter(([_, count]) => count / examples.length > 0.1).sort((a, b) => b[1] - a[1]).slice(0, 15).map(([suffix]) => suffix); const totalChars = Array.from(charFreq.values()).reduce((a, b) => a + b, 0); for (const [char, count] of charFreq.entries()) { charFreq.set(char, count / totalChars); } this.patterns.set(label, { label, examples, commonPrefixes, commonSuffixes, charFrequency: charFreq, lengthDistribution: lengths 
}); } /** * Correct a generated string using learned patterns */ correct(generated, label) { const pattern = this.patterns.get(label); if (!pattern) { return generated; } let corrected = generated; if (pattern.examples.includes(generated)) { return generated; } const hasValidPrefix = pattern.commonPrefixes.some( (prefix) => corrected.toLowerCase().startsWith(prefix.toLowerCase()) ); pattern.commonSuffixes.some( (suffix) => corrected.toLowerCase().endsWith(suffix.toLowerCase()) ); if (!hasValidPrefix && pattern.commonPrefixes.length > 0) { const mostCommonPrefix = pattern.commonPrefixes[0]; if (corrected.length > 0 && !corrected.toLowerCase().startsWith(mostCommonPrefix[0].toLowerCase())) ; } const charFreq = pattern.charFrequency; let cleaned = ""; for (const char of corrected) { const freq = charFreq.get(char) || 0; if (freq > 5e-3 || /[a-zA-Z0-9\s]/.test(char)) { cleaned += char; } } if (cleaned.length > 0) { corrected = cleaned; } pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length; Math.min(...pattern.lengthDistribution); const maxLength = Math.max(...pattern.lengthDistribution); if (corrected.length > maxLength * 1.5) { corrected = corrected.substring(0, Math.floor(maxLength * 1.2)); } return corrected; } /** * Score how well a generated string matches the pattern */ score(generated, label) { const pattern = this.patterns.get(label); if (!pattern) { return 0.5; } let score = 0; let factors = 0; if (pattern.examples.includes(generated)) { return 1; } const prefixMatch = pattern.commonPrefixes.some( (prefix) => generated.toLowerCase().startsWith(prefix.toLowerCase()) ); score += prefixMatch ? 0.3 : 0; factors++; const suffixMatch = pattern.commonSuffixes.some( (suffix) => generated.toLowerCase().endsWith(suffix.toLowerCase()) ); score += suffixMatch ? 
0.2 : 0; factors++; const charFreq = pattern.charFrequency; let charScore = 0; let charCount = 0; for (const char of generated) { const freq = charFreq.get(char) || 0; charScore += freq; charCount++; } score += (charCount > 0 ? charScore / charCount : 0) * 0.3; factors++; const avgLength = pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length; const lengthDiff = Math.abs(generated.length - avgLength) / avgLength; const lengthScore = Math.max(0, 1 - lengthDiff); score += lengthScore * 0.2; factors++; return factors > 0 ? score / factors : 0; } /** * Get pattern for a label */ getPattern(label) { return this.patterns.get(label); } } class SequenceContext { // n-gram size constructor(n = 3) { this.ngramPatterns = /* @__PURE__ */ new Map(); this.n = n; } /** * Learn n-gram patterns from training data */ learnPatterns(samples) { this.ngramPatterns.clear(); for (const sample of samples) { for (let i = 0; i <= sample.length - this.n; i++) { const ngram = sample.substring(i, i + this.n - 1); const nextChar = sample[i + this.n - 1]; if (!this.ngramPatterns.has(ngram)) { this.ngramPatterns.set(ngram, /* @__PURE__ */ new Map()); } const charMap = this.ngramPatterns.get(ngram); charMap.set(nextChar, (charMap.get(nextChar) || 0) + 1); } } } /** * Get next character probabilities given context */ getNextCharProbs(context) { const ctx = context.length >= this.n - 1 ? 
context.substring(context.length - (this.n - 1)) : context; const charCounts = this.ngramPatterns.get(ctx); if (!charCounts || charCounts.size === 0) { return /* @__PURE__ */ new Map(); } const total = Array.from(charCounts.values()).reduce((a, b) => a + b, 0); const probs = /* @__PURE__ */ new Map(); for (const [char, count] of charCounts.entries()) { probs.set(char, count / total); } return probs; } /** * Suggest next character based on context */ suggestNextChar(context) { const probs = this.getNextCharProbs(context); if (probs.size === 0) { return null; } let bestChar = ""; let bestProb = 0; for (const [char, prob] of probs.entries()) { if (prob > bestProb) { bestProb = prob; bestChar = char; } } return bestChar; } /** * Score how well a character fits the context */ scoreChar(context, char) { const probs = this.getNextCharProbs(context); return probs.get(char) || 0; } } class ELMGenerator { constructor(config) { this.elm = null; this.labels = []; this.patternCorrector = null; this.sequenceContext = null; this.config = { hiddenUnits: 128, activation: "relu", ridgeLambda: 0.01, noiseSize: 32, useOneHot: false, // Default to false for memory efficiency (can enable for better accuracy) useClassification: false, // Default to regression for compatibility usePatternCorrection: true, ...config }; this.noiseSize = this.config.noiseSize; this.useClassification = this.config.useClassification; this.encoder = new StringEncoder({ maxLength: config.maxLength, useOneHot: this.config.useOneHot ?? 
false // Default to false for memory efficiency }); if (this.config.usePatternCorrection) { this.patternCorrector = new PatternCorrector(); } this.sequenceContext = new SequenceContext(3); } /** * Train the ELM generator on labeled samples */ train(samples) { if (samples.length === 0) { throw new Error("Cannot train on empty dataset"); } const uniqueLabels = Array.from(new Set(samples.map((s) => s.label))); this.labels = uniqueLabels; const allValues = samples.map((s) => s.value); this.encoder.buildVocab(allValues); if (this.patternCorrector) { this.patternCorrector.learnPatterns(samples); } if (this.sequenceContext) { this.sequenceContext.learnPatterns(allValues); } const X = []; const Y = []; for (const sample of samples) { const labelIndex = this.labels.indexOf(sample.label); if (labelIndex === -1) { continue; } const labelOneHot = oneHotLabel(labelIndex, this.labels.length); const noise = generateNoiseVector(this.noiseSize, this.config.seed); const inputVector = [...labelOneHot, ...noise]; X.push(inputVector); const encodedValue = this.encoder.encode(sample.value); Y.push(encodedValue); } if (X.length === 0) { throw new Error("No valid training samples after processing"); } const inputSize = this.labels.length + this.noiseSize; this.encoder.getVectorSize(); const elmConfig = { useTokenizer: false, // Numeric mode inputSize, categories: this.useClassification ? [] : [], // For classification, we'll handle it differently hiddenUnits: this.config.hiddenUnits, activation: this.config.activation, // Use lower regularization for better pattern learning ridgeLambda: this.config.ridgeLambda * 0.1, // Reduce regularization task: this.useClassification ? 
"classification" : "regression" }; this.elm = new ELM(elmConfig); this.elm.trainFromData(X, Y); } /** * Generate a string for a given label * @param label Label to generate for * @param noiseSeed Optional seed for noise generation (for deterministic output) */ generate(label, noiseSeed) { if (!this.elm) { throw new Error("Model not trained. Call train() first."); } const labelIndex = this.labels.indexOf(label); if (labelIndex === -1) { throw new Error(`Label '${label}' not found in training data`); } const labelOneHot = oneHotLabel(labelIndex, this.labels.length); const noise = generateNoiseVector( this.noiseSize, noiseSeed !== void 0 ? noiseSeed : this.config.seed ); const inputVector = [...labelOneHot, ...noise]; let decoded; if (this.useClassification && this.config.useOneHot && typeof this.elm.predictProbaFromVector === "function") { const vocabSize = this.encoder.getVocabSize(); const maxLength = this.config.maxLength; const probs = this.elm.predictProbaFromVector(inputVector); const indices = []; for (let pos = 0; pos < maxLength; pos++) { const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize); const maxIdx = posProbs.indexOf(Math.max(...posProbs)); indices.push(maxIdx); } decoded = this.encoder.decode(indices); } else { const prediction = this.elm.predictLogitsFromVector(inputVector); const vocabSize = this.encoder.getVocabSize(); const indices = prediction.map((val) => { const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val)); const rounded = Math.round(clamped); const idx = Math.max(0, Math.min(vocabSize - 1, rounded)); return idx; }); decoded = this.encoder.decode(indices); } let corrected = decoded; if (this.patternCorrector) { corrected = this.patternCorrector.correct(decoded, label); } if (this.sequenceContext && corrected.length > 0) { corrected = this.refineWithSequenceContext(corrected, label); } const validation = validateForLabel(label, corrected); if (!validation.isValid) { for (let attempt = 0; attempt < 3; attempt++) { 
const baseSeed = noiseSeed !== void 0 ? noiseSeed : this.config.seed ?? Date.now(); const newNoise = generateNoiseVector( this.noiseSize, baseSeed + attempt + 1e3 ); const newInputVector = [...labelOneHot, ...newNoise]; let newDecoded; if (this.useClassification && this.config.useOneHot && typeof this.elm.predictProbaFromVector === "function") { const vocabSize = this.encoder.getVocabSize(); const maxLength = this.config.maxLength; const probs = this.elm.predictProbaFromVector(newInputVector); const newIndices = []; for (let pos = 0; pos < maxLength; pos++) { const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize); const maxIdx = posProbs.indexOf(Math.max(...posProbs)); newIndices.push(maxIdx); } newDecoded = this.encoder.decode(newIndices); } else { const newPrediction = this.elm.predictLogitsFromVector(newInputVector); const vocabSize = this.encoder.getVocabSize(); const newIndices = newPrediction.map((val) => { const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val)); const rounded = Math.round(clamped); return Math.max(0, Math.min(vocabSize - 1, rounded)); }); newDecoded = this.encoder.decode(newIndices); } if (this.patternCorrector) { newDecoded = this.patternCorrector.correct(newDecoded, label); } const newValidation = validateForLabel(label, newDecoded); if (newValidation.isValid) { return newValidation.cleaned; } } return ""; } return validation.cleaned; } /** * Generate multiple strings for a label with confidence-based selection */ generateBatch(label, count) { const candidates = []; const seen = /* @__PURE__ */ new Set(); let attempts = 0; const maxAttempts = count * 10; while (attempts < maxAttempts) { const seed = this.config.seed !== void 0 ? 
this.config.seed + attempts : Date.now() + attempts; try { const generated = this.generate(label, seed); if (generated && generated.length > 0 && !seen.has(generated.toLowerCase())) { let score = 1; if (this.patternCorrector) { score = this.patternCorrector.score(generated, label); } const validation = validateForLabel(label, generated); if (!validation.isValid) { score = 0; } candidates.push({ value: generated, score }); seen.add(generated.toLowerCase()); } } catch (error) { } attempts++; } candidates.sort((a, b) => b.score - a.score); return candidates.slice(0, count).map((c) => c.value); } /** * Refine generated string using sequence context */ refineWithSequenceContext(generated, label) { if (!this.sequenceContext || generated.length === 0) { return generated; } let refined = ""; for (let i = 0; i < generated.length; i++) { const context = refined; const currentChar = generated[i]; const contextScore = this.sequenceContext.scoreChar(context, currentChar); if (contextScore < 0.1 && context.length > 0) { const suggested = this.sequenceContext.suggestNextChar(context); if (suggested && suggested !== currentChar) { refined += suggested; } else { refined += currentChar; } } else { refined += currentChar; } if (currentChar === "\0" || currentChar.charCodeAt(0) === 0) { break; } } return refined; } /** * Get all trained labels */ getLabels() { return [...this.labels]; } /** * Check if model is trained */ isTrained() { return this.elm !== null; } } class HybridGenerator { constructor(config) { this.patternCorrector = null; this.config = { elmHiddenUnits: 128, elmActivation: "relu", elmRidgeLambda: 0.01, noiseSize: 32, jitterStrength: 0.05, // 5% jitter by default (reduced for better realism) exactMode: false, useOneHot: false, // Default to false for memory efficiency useClassification: false, usePatternCorrection: true, ...config }; if (this.config.exactMode) { this.jitterStrength = 0; } else { this.jitterStrength = this.config.jitterStrength; } this.retrieval = new 
RetrievalGenerator(config.seed); this.elm = new ELMGenerator({ maxLength: config.maxLength, hiddenUnits: this.config.elmHiddenUnits, activation: this.config.elmActivation, ridgeLambda: this.config.elmRidgeLambda, noiseSize: this.config.noiseSize, useOneHot: this.config.useOneHot, useClassification: this.config.useClassification, usePatternCorrection: this.config.usePatternCorrection, seed: config.seed }); this.encoder = new StringEncoder({ maxLength: config.maxLength, useOneHot: this.config.useOneHot ?? false // Default to false for memory efficiency }); if (this.config.usePatternCorrection) { this.patternCorrector = new PatternCorrector(); } } /** * Train the hybrid generator on labeled samples */ train(samples) { this.retrieval.ingest(samples); const allValues = samples.map((s) => s.value); this.encoder.buildVocab(allValues); this.elm.train(samples); if (this.patternCorrector) { this.patternCorrector.learnPatterns(samples); } } /** * Generate a hybrid sample (retrieval + jitter) * @param label Label to generate for * @param noiseSeed Optional seed for deterministic output */ generate(label, noiseSeed) { const retrieved = this.retrieval.sampleOne(label); if (!retrieved) { return this.elm.generate(label, noiseSeed); } const encoded = this.encoder.encode(retrieved); const jittered = this.applyJitter(encoded, label, noiseSeed); const decoded = this.encoder.decode(jittered); let corrected = decoded; if (this.patternCorrector) { corrected = this.patternCorrector.correct(decoded, label); } const validation = validateForLabel(label, corrected); if (!validation.isValid) { for (let attempt = 0; attempt < 2; attempt++) { const newSeed = noiseSeed !== void 0 ? 
noiseSeed + attempt + 1e3 : void 0; const newJittered = this.applyJitter(encoded, label, newSeed); const newDecoded = this.encoder.decode(newJittered); let newCorrected = newDecoded; if (this.patternCorrector) { newCorrected = this.patternCorrector.correct(newDecoded, label); } const newValidation = validateForLabel(label, newCorrected); if (newValidation.isValid) { return newValidation.cleaned; } } return retrieved; } return validation.cleaned; } /** * Apply jitter to an encoded vector */ applyJitter(encoded, label, noiseSeed) { const elmOutput = this.generateELMVector(label, noiseSeed); if (!elmOutput || elmOutput.length === 0 || elmOutput.every((v) => v === 0)) { return encoded; } const effectiveJitter = Math.min(this.jitterStrength, 0.05); const jittered = encoded.map((val, idx) => { const elmVal = elmOutput[idx] || 0; return (1 - effectiveJitter) * val + effectiveJitter * elmVal; }); const vocabSize = this.encoder.getVocabSize(); const indices = jittered.map((val) => { const clamped = Math.max(0, Math.min(vocabSize - 1, val)); const idx = Math.round(clamped); return Math.max(0, Math.min(vocabSize - 1, idx)); }); return indices; } /** * Generate an ELM vector for jittering */ generateELMVector(label, noiseSeed) { try { const elmGenerated = this.elm.generate(label, noiseSeed); if (elmGenerated && elmGenerated.length > 0) { return this.encoder.encode(elmGenerated); } return new Array(this.encoder.getVectorSize()).fill(0); } catch { return new Array(this.encoder.getVectorSize()).fill(0); } } /** * Generate multiple hybrid samples */ generateBatch(label, count) { const results = []; const seen = /* @__PURE__ */ new Set(); let attempts = 0; const maxAttempts = count * 5; while (results.length < count && attempts < maxAttempts) { const seed = this.config.seed !== void 0 ? 
/**
 * Generator that only emits values taken from (or spliced together from)
 * the training data.
 */
class ExactGenerator {
  /**
   * @param config Optional settings: `seed`, `usePatternMatching`
   *               (default true) and `maxVariations` (default 10)
   */
  constructor(config = {}) {
    this.trainingSamples = [];
    this.config = {
      usePatternMatching: true,
      maxVariations: 10,
      ...config
    };
    this.retrieval = new RetrievalGenerator(config.seed);
    this.patternCorrector = new PatternCorrector();
  }
  /**
   * Map an arbitrary numeric seed onto a valid index in [0, length).
   * Non-negative integer seeds map to `seed % length`, as before; negative
   * and fractional seeds are wrapped/floored instead of yielding an invalid
   * index, and non-finite seeds map to 0.
   */
  seedToIndex(seed, length) {
    const whole = Math.floor(seed);
    if (!Number.isFinite(whole)) {
      return 0;
    }
    return (whole % length + length) % length;
  }
  /**
   * Train the exact generator
   * @param samples Labeled samples ({ label, value }) to remember
   */
  train(samples) {
    this.trainingSamples = samples;
    this.retrieval.ingest(samples);
    if (this.config.usePatternMatching) {
      this.patternCorrector.learnPatterns(samples);
    }
  }
  /**
   * Generate an exact sample (100% realistic)
   *
   * Prefers a value from the retrieval store; if none is available and
   * pattern matching is enabled, falls back to one of the learned pattern
   * examples, selected by `seed` when given or at random otherwise.
   *
   * @param label Label to generate for
   * @param seed Optional numeric seed for the pattern-example fallback
   * @throws {Error} If no value can be found for the label
   */
  generate(label, seed) {
    const exact = this.retrieval.sampleOne(label);
    if (exact) {
      return exact;
    }
    if (this.config.usePatternMatching) {
      const pattern = this.patternCorrector.getPattern(label);
      if (pattern && pattern.examples.length > 0) {
        const total = pattern.examples.length;
        const pick = seed !== void 0
          ? this.seedToIndex(seed, total)
          : Math.floor(Math.random() * total);
        return pattern.examples[pick];
      }
    }
    throw new Error(`No samples found for label: ${label}`);
  }
  /**
   * Generate with pattern-based variations
   *
   * Splices the first half of one learned example onto the tail of another
   * of similar length (at most 2 characters difference). The splice is
   * returned only if it validates for the label and scores above 0.6;
   * otherwise the plain `generate` result is returned.
   *
   * @param label Label to generate for
   * @param seed Optional numeric seed selecting the two examples
   */
  generateWithVariation(label, seed) {
    const base = this.generate(label, seed);
    if (!this.config.usePatternMatching) {
      return base;
    }
    const pattern = this.patternCorrector.getPattern(label);
    if (!pattern || pattern.examples.length < 2) {
      return base;
    }
    const total = pattern.examples.length;
    const firstSeed = seed !== void 0 ? seed : Date.now();
    const firstIdx = this.seedToIndex(firstSeed, total);
    const secondIdx = this.seedToIndex(firstSeed + 1e3, total);
    if (firstIdx === secondIdx) {
      return base;
    }
    const head = pattern.examples[firstIdx];
    const tail = pattern.examples[secondIdx];
    if (Math.abs(head.length - tail.length) > 2) {
      return base;
    }
    const mid = Math.floor(head.length / 2);
    const variation = head.substring(0, mid) + tail.substring(mid);
    const validation = validateForLabel(label, variation);
    if (validation.isValid && this.patternCorrector.score(variation, label) > 0.6) {
      return validation.cleaned;
    }
    return base;
  }
  /**
   * Generate multiple exact samples
   *
   * Makes up to `count * 2` attempts and collects case-insensitively unique
   * results, so fewer than `count` values may be returned.
   *
   * @param label Label to generate for
   * @param count Number of distinct samples wanted
   */
  generateBatch(label, count) {
    const results = [];
    const seen = /* @__PURE__ */ new Set();
    for (let i = 0; i < count * 2 && results.length < count; i++) {
      const seed = this.config.seed !== void 0 ? this.config.seed + i : Date.now() + i;
      // First `count` attempts use plain samples; the rest try variations.
      const usePlain = i < count && this.config.usePatternMatching;
      const generated = usePlain
        ? this.generate(label, seed)
        : this.generateWithVariation(label, seed);
      if (generated && !seen.has(generated.toLowerCase())) {
        results.push(generated);
        seen.add(generated.toLowerCase());
      }
    }
    return results;
  }
  /**
   * Get all available labels
   */
  getLabels() {
    return this.retrieval.getLabels();
  }
  /**
   * Check if generator is trained
   */
  isTrained() {
    return this.retrieval.getLabels().length > 0;
  }
}
for variation jitterStrength: 0.02, // Very low jitter (2%) useOneHot: false, // Disable one-hot to reduce memory (was: this.config.useImprovedELM) useClassification: false, // Disable classification to reduce memory (was: this.config.useImprovedELM) usePatternCorrection: true, elmHiddenUnits: this.config.elmHiddenUnits, // Now uses reduced 128 instead of 256 elmActivation: this.config.elmActivation, elmRidgeLambda: this.config.elmRidgeLambda, noiseSize: this.config.noiseSize }); if (this.config.useImprovedELM && config.useImprovedELM === true) { this.elm = new ELMGenerator({ maxLength: config.maxLength, seed: config.seed, hiddenUnits: this.config.elmHiddenUnits, activation: this.config.elmActivation, ridgeLambda: this.config.elmRidgeLambda, noiseSize: this.config.noiseSize, useOneHot: false, // Disable one-hot to reduce memory useClassification: false, // Disable classification to reduce memory usePatternCorrection: true }); } this.patternCorrector = new PatternCorrector(); } /** * Train the perfect generator */ train(samples) { this.trainingSamples = samples; this.exact.train(samples); this.patternCorrector.learnPatterns(samples); } /** * Lazy train hybrid generator */ ensureHybridTrained() { if (!this.hybrid.isTrained() && this.trainingSamples.length > 0) { this.hybrid.train(this.trainingSamples); } } /** * Lazy train ELM generator */ ensureELMTrained() { if (this.elm && !this.elm.isTrained() && this.trainingSamples.length > 0) { this.elm.train(this.trainingSamples); } } /** * Generate with best strategy */ generate(label, seed) { const candidates = []; try { const exact = this.exact.generate(label, seed); if (exact) { candidates.push({ value: exact, score: 1, source: "exact" }); } } catch (error) { } try { const exactVar = this.exact.generateWithVariation(label, seed); if (exactVar && exactVar !== candidates[0]?.value) { const score = this.patternCorrector.score(exactVar, label); candidates.push({ value: exactVar, score: score * 0.95, source: "exact-variation" 
}); } } catch (error) { } try { this.ensureHybridTrained(); const hybrid = this.hybrid.generate(label, seed); if (hybrid && !candidates.some((c) => c.value === hybrid)) { const score = this.patternCorrector.score(hybrid, label); const validation = validateForLabel(label, hybrid); const finalScore = validation.isValid ? score * 0.85 : score * 0.5; candidates.push({ value: hybrid, score: finalScore, source: "hybrid" }); } } catch (error) { } if (this.elm) { try { this.ensureELMTrained(); const elmGen = this.elm.generate(label, seed); if (elmGen && !candidates.some((c) => c.value === elmGen)) { const score = this.patternCorrector.score(elmGen, label); const validation = validateForLabel(label, elmGen); const finalScore = validation.isValid ? score * 0.8 : score * 0.4; candidates.push({ value: elmGen, score: finalScore, source: "elm" }); } } catch (error) { } } if (candidates.length === 0) { throw new Error(`No samples found for label: ${label}`); } candidates.sort((a, b) => b.score - a.score); if (this.config.preferExact) { const exactCandidate = candidates.find((c) => c.source === "exact"); if (exactCandidate && exactCandidate.score >= 0.9) { return exactCandidate.value; } } return candidates[0].value; } /** * Generate multiple samples with best strategy */ generateBatch(label, count) { const results = []; const seen = /* @__PURE__ */ new Set(); let attempts = 0; const maxAttempts = count * 5; while (results.length < count && attempts < maxAttempts) { const seed = this.config.seed !== void 0 ? 
/**
 * Decode a byte array into a string using TextDecoder's default (UTF-8).
 */
function fromUint8ArrayToString(bytes) {
  return new TextDecoder().decode(bytes);
}

/**
 * Decode a base64url string (URL-safe alphabet, padding optional) to bytes.
 *
 * Uses the global `atob` when present and otherwise the global `Buffer`.
 * The previous fallback called `require("buffer")`, which is not defined in
 * an ES module, so it could only ever fail with a ReferenceError.
 *
 * @param str base64url-encoded text
 * @returns {Uint8Array} The decoded bytes
 * @throws {Error} If the runtime offers neither `atob` nor `Buffer`
 */
function base64urlDecode(str) {
  let base64 = str.replace(/-/g, "+").replace(/_/g, "/");
  while (base64.length % 4 !== 0) {
    base64 += "=";
  }
  if (typeof atob === "function") {
    const binary = atob(base64);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  }
  if (typeof Buffer !== "undefined") {
    // Copy into a plain Uint8Array so callers never see a Buffer subclass.
    return Uint8Array.from(Buffer.from(base64, "base64"));
  }
  throw new Error("No base64 decoder available (neither atob nor Buffer is defined)");
}

/**
 * Decode a base64url string and parse the result as JSON.
 */
function base64urlDecodeJson(str) {
  const bytes = base64urlDecode(str);
  const json = fromUint8ArrayToString(bytes);
  return JSON.parse(json);
}

/**
 * Convert a JOSE-style ECDSA signature (fixed-width r || s) into a DER
 * encoded `SEQUENCE { INTEGER r, INTEGER s }`.
 *
 * Lengths of 128 bytes or more are written in DER long form. The previous
 * code always wrote a single length byte, which produced an invalid encoding
 * once the sequence content exceeded 127 bytes (e.g. 66-byte r and s values).
 *
 * @param joseSig Array-like of bytes with even, non-zero length
 * @returns {Uint8Array} DER encoding of the signature
 * @throws {Error} If the signature is empty or has odd length
 */
function joseToDer(joseSig) {
  const len = joseSig.length;
  if (len === 0 || len % 2 !== 0) throw new Error("Invalid JOSE signature length");
  const size = len / 2;
  const raw = Uint8Array.from(joseSig);

  // DER definite length: short form below 128, otherwise 0x80|n followed by
  // the length in n big-endian bytes.
  const encodeLength = (length) => {
    if (length < 128) return [length];
    const octets = [];
    for (let rest = length; rest > 0; rest = Math.floor(rest / 256)) {
      octets.unshift(rest % 256);
    }
    return [128 | octets.length, ...octets];
  };

  // DER INTEGER: minimal big-endian magnitude, with a 0x00 prefix when the
  // top bit is set so the value is not read as negative.
  const encodeInteger = (magnitude) => {
    const trimmed = trimLeadingZeros(magnitude);
    const pad = (trimmed[0] & 128) !== 0 ? [0] : [];
    return [2, ...encodeLength(trimmed.length + pad.length), ...pad, ...trimmed];
  };

  const content = [
    ...encodeInteger(raw.subarray(0, size)),
    ...encodeInteger(raw.subarray(size))
  ];
  return Uint8Array.from([48, ...encodeLength(content.length), ...content]);
}

/**
 * Return a copy of `bytes` without leading zero bytes, always keeping at
 * least the final byte (so an all-zero input yields a single 0).
 */
function trimLeadingZeros(bytes) {
  let start = 0;
  while (start < bytes.length - 1 && bytes[start] === 0) start++;
  const result = new Uint8Array(bytes.length - start);
  for (let j = 0; j < result.length; j++) {
    result[j] = bytes[start + j];
  }
  return result;
}

/**
 * Current time as whole seconds since the Unix epoch.
 */
function nowEpochSeconds() {
  return Math.floor(Date.now() / 1e3);
}