@astermind/astermind-synthetic-data
Version:
OmegaSynth - Label-Conditioned Synthetic Data Generator for AsterMind ELM/KELM Pipelines
1,635 lines (1,634 loc) • 82.1 kB
JavaScript
import * as path from "path";
import * as fs from "fs";
import { fileURLToPath } from "node:url";
import { ELM } from "@astermind/astermind-elm";
/**
 * In-memory store that groups sample values under their label.
 */
class SyntheticFieldStore {
  constructor() {
    this.store = new Map();
  }

  /**
   * Add one `{ label, value }` sample, creating the label's bucket on first use.
   */
  insert(sample) {
    const bucket = this.store.get(sample.label);
    if (bucket === undefined) {
      this.store.set(sample.label, [sample.value]);
      return;
    }
    bucket.push(sample.value);
  }

  /**
   * Add every sample of an iterable, in order.
   */
  insertMany(samples) {
    for (const entry of samples) {
      this.insert(entry);
    }
  }

  /**
   * Values stored for `label`; a fresh empty array when the label is unknown.
   */
  get(label) {
    const bucket = this.store.get(label);
    return bucket ? bucket : [];
  }

  /**
   * Draw up to `k` values for `label` without replacement, using Math.random.
   * Returns fewer than `k` values when the label holds fewer.
   */
  sample(label, k = 1) {
    const pool = this.get(label);
    const picked = [];
    if (pool.length === 0) {
      return picked;
    }
    const usedIndices = new Set();
    // Rejection sampling: redraw whenever an already-used index comes up.
    while (picked.length < k && usedIndices.size < pool.length) {
      const candidate = Math.floor(Math.random() * pool.length);
      if (usedIndices.has(candidate)) {
        continue;
      }
      usedIndices.add(candidate);
      picked.push(pool[candidate]);
    }
    return picked;
  }

  /**
   * Whether any sample was ever inserted under `label`.
   */
  hasLabel(label) {
    return this.store.has(label);
  }

  /**
   * All labels, in insertion order.
   */
  getLabels() {
    return [...this.store.keys()];
  }

  /**
   * Number of values stored for `label` (0 when unknown).
   */
  count(label) {
    return this.get(label).length;
  }

  /**
   * Remove every label and value.
   */
  clear() {
    this.store.clear();
  }
}
/**
 * Deterministic linear congruential generator
 * (state = (state * 1664525 + 1013904223) mod 2^32).
 *
 * Seeds are normalized to an unsigned 32-bit integer. Without that, a negative
 * seed makes `next()` return values outside [0, 1), and a very large seed such
 * as `Date.now()` overflows exact double arithmetic on the first step.
 * Sequences for seeds that already are uint32 integers are unchanged.
 */
let SeededRNG$1 = class SeededRNG {
  /**
   * @param {number} [seed=Date.now()] Initial state; coerced to uint32.
   */
  constructor(seed = Date.now()) {
    this.seed = SeededRNG.normalizeSeed(seed);
  }

  /**
   * Coerce any numeric seed to an unsigned 32-bit integer (NaN becomes 0).
   * @param {number} seed
   * @returns {number}
   */
  static normalizeSeed(seed) {
    return seed >>> 0;
  }

  /**
   * Advance the generator.
   * @returns {number} A value in [0, 1).
   */
  next() {
    // state < 2^32, so state * 1664525 + 1013904223 stays below 2^53 and is exact.
    this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32;
    return this.seed / 2 ** 32;
  }

  /**
   * Restart the sequence from a new seed.
   * @param {number} seed
   */
  setSeed(seed) {
    this.seed = SeededRNG.normalizeSeed(seed);
  }
};
/**
 * Generator that returns stored training values for a label, drawn with a
 * seeded random number generator.
 */
class RetrievalGenerator {
  constructor(seed) {
    this.store = new SyntheticFieldStore();
    this.seed = seed;
    this.rng = new SeededRNG$1(seed);
  }

  /**
   * Add labeled samples to the backing store.
   */
  ingest(samples) {
    this.store.insertMany(samples);
  }

  /**
   * Draw up to `k` distinct stored values for `label`.
   * Yields an empty array when the label has no values.
   */
  sample(label, k = 1) {
    const pool = this.store.get(label);
    if (pool.length === 0) {
      return [];
    }
    // Positions not yet drawn; each pick removes one entry.
    const remaining = Array.from(pool.keys());
    const target = Math.min(k, pool.length);
    const picks = [];
    while (picks.length < target) {
      const slot = Math.floor(this.rng.next() * remaining.length);
      const [chosen] = remaining.splice(slot, 1);
      picks.push(pool[chosen]);
    }
    return picks;
  }

  /**
   * Draw a single value, or `null` when none is available.
   */
  sampleOne(label) {
    const drawn = this.sample(label, 1);
    if (drawn.length === 0) {
      return null;
    }
    return drawn[0];
  }

  /**
   * True when the label exists and holds at least one value.
   */
  hasLabel(label) {
    if (!this.store.hasLabel(label)) {
      return false;
    }
    return this.store.count(label) > 0;
  }

  /**
   * Labels known to the backing store.
   */
  getLabels() {
    return this.store.getLabels();
  }

  /**
   * Empty the store; when a seed is supplied, also reseed the generator.
   */
  reset(seed) {
    this.store.clear();
    if (seed === undefined) {
      return;
    }
    this.seed = seed;
    this.rng.setSeed(seed);
  }
}
/**
 * Bidirectional character <-> index vocabulary. Index 0 is always the
 * padding character "\0".
 */
class CharVocab {
  constructor() {
    this.charToIndex = new Map();
    this.indexToChar = new Map();
    this.size = 0;
  }

  /**
   * Rebuild the vocabulary from sample strings plus an optional character set.
   * @param samples Strings whose characters are added
   * @param charSet Optional extra characters that are always included
   */
  build(samples, charSet) {
    const PAD = "\0";
    const seen = new Set([PAD]);
    const collect = (text) => {
      for (const ch of text) {
        if (ch !== PAD) {
          seen.add(ch);
        }
      }
    };
    if (charSet) {
      collect(charSet);
    }
    for (const sample of samples) {
      collect(sample);
    }
    // Padding sorts first; everything else follows locale order.
    const ordered = [...seen].sort((left, right) => {
      if (left === PAD) return -1;
      if (right === PAD) return 1;
      return left.localeCompare(right);
    });
    this.charToIndex.clear();
    this.indexToChar.clear();
    this.size = ordered.length;
    for (const [position, ch] of ordered.entries()) {
      this.charToIndex.set(ch, position);
      this.indexToChar.set(position, ch);
    }
  }

  /**
   * Index of a character; throws when the character is unknown.
   */
  getIndex(char) {
    const position = this.charToIndex.get(char);
    if (position !== undefined) {
      return position;
    }
    throw new Error(`Character '${char}' not in vocabulary`);
  }

  /**
   * Character at an index; throws when the index is unknown.
   */
  getChar(index) {
    const ch = this.indexToChar.get(index);
    if (ch !== undefined) {
      return ch;
    }
    throw new Error(`Index ${index} not in vocabulary`);
  }

  /**
   * Whether the character is part of the vocabulary.
   */
  hasChar(char) {
    return this.charToIndex.has(char);
  }

  /**
   * Number of characters, padding included.
   */
  getSize() {
    return this.size;
  }

  /**
   * All characters, sorted with the default (code unit) ordering.
   */
  getChars() {
    return [...this.charToIndex.keys()].sort();
  }

  /**
   * Default character set: letters, digits, space and common punctuation.
   */
  static getDefaultCharSet() {
    return "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 !\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";
  }
}
/**
 * Helpers that force arrays and strings to an exact length.
 */
class FixedLength {
  /**
   * Return a new array of exactly `length` items: truncated when too long,
   * right-padded with `padValue` when too short.
   * @param arr Source array (never mutated)
   * @param length Target length
   * @param padValue Filler for missing positions (default: 0)
   */
  static padOrTruncate(arr, length, padValue = 0) {
    if (arr.length > length) {
      return arr.slice(0, length);
    }
    const out = [...arr];
    for (let filled = out.length; filled < length; filled++) {
      out.push(padValue);
    }
    return out;
  }

  /**
   * Return `str` cut or right-padded to `length`.
   * @param str Source string
   * @param length Target length
   * @param padChar Filler appended once per missing position (default: space)
   */
  static padOrTruncateString(str, length, padChar = " ") {
    if (str.length > length) {
      return str.slice(0, length);
    }
    if (str.length === length) {
      return str;
    }
    const missing = length - str.length;
    return str + padChar.repeat(missing);
  }
}
/**
 * One-hot encoding helpers.
 */
class OneHot {
  /**
   * Encode an index as a one-hot vector.
   * @param index Index to encode
   * @param size Size of the one-hot vector
   * @returns Array of `size` zeros with a single 1 at `index`
   * @throws {Error} When a numeric index is NaN or fractional, or when the
   *   index lies outside [0, size)
   */
  static encode(index, size) {
    // NaN and fractional numbers pass a plain range check (every comparison
    // with NaN is false), which used to yield an all-zero "one-hot" vector.
    if (typeof index === "number" && !Number.isInteger(index)) {
      throw new Error(`Index ${index} is not an integer`);
    }
    if (index < 0 || index >= size) {
      throw new Error(`Index ${index} out of range [0, ${size})`);
    }
    const vector = new Array(size).fill(0);
    vector[index] = 1;
    return vector;
  }

  /**
   * Decode a one-hot vector to the position of its first 1.
   * @param vector One-hot vector
   * @throws {Error} When no element equals 1
   */
  static decode(vector) {
    const index = vector.indexOf(1);
    if (index === -1) {
      throw new Error("Invalid one-hot vector: no element equals 1");
    }
    return index;
  }

  /**
   * Encode multiple indices as one-hot vectors.
   * @param indices Array of indices
   * @param size Size of each one-hot vector
   */
  static encodeBatch(indices, size) {
    return indices.map((idx) => this.encode(idx, size));
  }

  /**
   * Decode multiple one-hot vectors to indices.
   * @param vectors Array of one-hot vectors
   */
  static decodeBatch(vectors) {
    return vectors.map((vec) => this.decode(vec));
  }
}
/**
 * Converts strings to fixed-length numeric vectors (vocabulary indices or
 * flattened one-hot vectors) and back.
 */
class StringEncoder {
  /**
   * @param config Options: `maxLength` (target sequence length), `useOneHot`
   *   (default false), optional `charSet` used when building the vocabulary.
   */
  constructor(config) {
    this.config = {
      useOneHot: false,
      // Default to index-based for efficiency
      ...config
    };
    this.vocab = new CharVocab();
  }

  /**
   * Build vocabulary from training samples plus the configured (or default)
   * character set.
   */
  buildVocab(samples) {
    this.vocab.build(samples, this.config.charSet || CharVocab.getDefaultCharSet());
  }

  /**
   * Encode a string to a vector.
   * @param str String to encode
   * @returns Encoded vector (indices, or flattened one-hot when `useOneHot`)
   * @throws {Error} When the vocabulary has not been built
   */
  encode(str) {
    if (this.vocab.getSize() === 0) {
      throw new Error("Vocabulary not built. Call buildVocab() first.");
    }
    const indices = [];
    for (const char of str) {
      if (this.vocab.hasChar(char)) {
        indices.push(this.vocab.getIndex(char));
      } else if (this.vocab.hasChar(" ")) {
        // Unknown characters degrade to a space when one is available...
        indices.push(this.vocab.getIndex(" "));
      } else {
        // ...otherwise to the padding index.
        indices.push(0);
      }
    }
    const padded = FixedLength.padOrTruncate(
      indices,
      this.config.maxLength,
      0
    );
    if (this.config.useOneHot) {
      const vocabSize = this.vocab.getSize();
      const oneHotVectors = [];
      for (const idx of padded) {
        oneHotVectors.push(...OneHot.encode(idx, vocabSize));
      }
      return oneHotVectors;
    }
    return padded;
  }

  /**
   * Decode a vector back to a string. Decoding stops at the first padding
   * index or at a control character other than tab or newline; trailing
   * whitespace is trimmed.
   * @param vector Encoded vector
   * @returns Decoded string
   * @throws {Error} When the vocabulary has not been built
   */
  decode(vector) {
    if (this.vocab.getSize() === 0) {
      throw new Error("Vocabulary not built. Call buildVocab() first.");
    }
    const vocabSize = this.vocab.getSize();
    let indices;
    if (this.config.useOneHot) {
      indices = [];
      for (let i = 0; i < vector.length; i += vocabSize) {
        const oneHot = vector.slice(i, i + vocabSize);
        try {
          indices.push(OneHot.decode(oneHot));
        } catch {
          // No exact 1 in this chunk (e.g. soft model output): fall back to argmax.
          const maxIdx = oneHot.indexOf(Math.max(...oneHot));
          indices.push(maxIdx);
        }
      }
      indices = indices.slice(0, this.config.maxLength);
    } else {
      indices = vector.slice(0, this.config.maxLength);
    }
    let result = "";
    const paddingIdx = 0;
    for (const idx of indices) {
      // Model outputs may be fractional or out of range; snap to a valid index.
      const clampedIdx = Math.max(0, Math.min(vocabSize - 1, Math.round(idx)));
      if (clampedIdx === paddingIdx) {
        break;
      }
      try {
        const char = this.vocab.getChar(clampedIdx);
        // Tab and newline are the only control characters allowed through.
        // The previous guard compared against " " twice, so a tab stopped
        // decoding.
        const isControl = char.charCodeAt(0) < 32;
        if (char === "\0" || (isControl && char !== "\t" && char !== "\n")) {
          break;
        }
        result += char;
      } catch {
        // Index missing from the vocabulary (e.g. NaN): stop decoding here.
        break;
      }
    }
    return result.trimEnd();
  }

  /**
   * Encode multiple strings.
   */
  encodeBatch(strings) {
    return strings.map((str) => this.encode(str));
  }

  /**
   * Decode multiple vectors.
   */
  decodeBatch(vectors) {
    return vectors.map((vec) => this.decode(vec));
  }

  /**
   * Length of the vectors produced by `encode`.
   */
  getVectorSize() {
    if (this.config.useOneHot) {
      return this.config.maxLength * this.vocab.getSize();
    }
    return this.config.maxLength;
  }

  /**
   * Get vocabulary size.
   */
  getVocabSize() {
    return this.vocab.getSize();
  }

  /**
   * Get the underlying vocabulary object.
   */
  getVocab() {
    return this.vocab;
  }
}
/**
 * Build a one-hot vector of `numLabels` entries for `labelIndex`.
 * An index outside [0, numLabels) yields an all-zero vector.
 */
function oneHotLabel(labelIndex, numLabels) {
  const encoded = new Array(numLabels).fill(0);
  const inRange = labelIndex >= 0 && labelIndex < numLabels;
  if (!inRange) {
    return encoded;
  }
  encoded[labelIndex] = 1;
  return encoded;
}
/**
 * Produce `size` noise values, each a random draw scaled from [0, 1) to [-1, 1).
 * With a seed the draws come from a fresh SeededRNG2; without one they come
 * from Math.random.
 */
function generateNoiseVector(size, seed) {
  const rng = seed === undefined ? null : new SeededRNG2(seed);
  const draw = rng === null ? () => Math.random() : () => rng.next();
  const noise = [];
  while (noise.length < size) {
    noise.push(draw() * 2 - 1);
  }
  return noise;
}
/**
 * Deterministic linear congruential generator
 * (state = (state * 1664525 + 1013904223) mod 2^32).
 *
 * The seed is normalized to an unsigned 32-bit integer. Without that, a
 * negative seed makes `next()` return values outside [0, 1), an undefined seed
 * yields NaN forever, and a very large seed (e.g. one derived from
 * `Date.now()`) overflows exact double arithmetic on the first step.
 * Sequences for seeds that already are uint32 integers are unchanged.
 */
class SeededRNG2 {
  /**
   * @param {number} seed Initial state; coerced to uint32 (NaN/undefined become 0).
   */
  constructor(seed) {
    this.seed = seed >>> 0;
  }

  /**
   * Advance the generator.
   * @returns {number} A value in [0, 1).
   */
  next() {
    // state < 2^32, so state * 1664525 + 1013904223 stays below 2^53 and is exact.
    this.seed = (this.seed * 1664525 + 1013904223) % 2 ** 32;
    return this.seed / 2 ** 32;
  }
}
/**
 * Validate and clean `value` with the validator registered for `label`.
 * Empty input is rejected before any validator is consulted.
 */
function validateForLabel(label, value) {
  const isEmpty = !value || value.length === 0;
  if (isEmpty) {
    return { isValid: false, cleaned: "", reason: "Empty value" };
  }
  const check = getValidatorForLabel(label);
  return check(value);
}
/**
 * Look up the validator function for a label; unknown labels get the
 * generic validator.
 */
function getValidatorForLabel(label) {
  // Each row pairs one validator with every label it serves.
  const routes = [
    [validateName, ["first_name", "last_name"]],
    [validatePhoneNumber, ["phone_number"]],
    [validateEmail, ["email"]],
    [validateStreetAddress, ["street_address"]],
    [validateLocation, ["city", "state", "country"]],
    [validateText, ["company_name", "job_title", "product_name", "credit_card_type", "device_type"]],
    [validateColor, ["color"]],
    [validateUUID, ["uuid"]],
    [validateDate, ["date"]]
  ];
  for (const [validator, labels] of routes) {
    if (labels.includes(label)) {
      return validator;
    }
  }
  return validateGeneric;
}
/**
 * Validate and clean a personal name.
 *
 * Keeps letters, hyphens, apostrophes and whitespace; collapses runs of
 * hyphens/apostrophes to a single hyphen; removes hyphens, apostrophes and
 * whitespace from both ends; collapses inner whitespace; rejects placeholder
 * values; caps the result at 30 characters.
 *
 * @param {string} value Raw candidate name
 * @returns {{isValid: boolean, cleaned: string, reason?: string}}
 */
function validateName(value) {
  const reject = (reason) => ({ isValid: false, cleaned: "", reason });
  // Whitespace is stripped together with the punctuation so that " -John- "
  // cleans to "John" (previously the surrounding spaces shielded the hyphens).
  const stripEdges = (text) => text.replace(/^[-'\s]+|[-'\s]+$/g, "");

  if (/^name\d+$/i.test(value)) {
    return reject("Placeholder name with numbers");
  }
  let cleaned = value.replace(/[^a-zA-Z\-'\s]/g, "");
  cleaned = cleaned.replace(/[-']{2,}/g, "-");
  cleaned = stripEdges(cleaned).replace(/\s+/g, " ");
  if (cleaned.length < 2 || !/[a-zA-Z]/.test(cleaned)) {
    return reject("Too short or no letters");
  }
  const lowered = cleaned.toLowerCase();
  const placeholders = ["name", "firstname", "lastname", "surname"];
  if (placeholders.includes(lowered)) {
    return reject("Placeholder name");
  }
  if (lowered.startsWith("name") && lowered.length <= 6) {
    return reject("Placeholder name");
  }
  if (cleaned.length > 30) {
    // Re-strip after cutting: the cut may land right after a hyphen or space.
    cleaned = stripEdges(cleaned.substring(0, 30));
  }
  return { isValid: true, cleaned };
}
/**
 * Validate and clean a phone number: keep digits, + - ( ) . and whitespace,
 * collapse separator runs, require at least 7 digits, cap at 25 characters.
 */
function validatePhoneNumber(value) {
  const normalized = value
    .replace(/[^0-9\-\+\(\)\.\s]/g, "")
    .replace(/[-\.]{2,}/g, "-")
    .replace(/\s+/g, " ")
    .trim();
  const digitMatches = normalized.match(/\d/g);
  const digitTotal = digitMatches === null ? 0 : digitMatches.length;
  if (digitTotal < 7) {
    return { isValid: false, cleaned: "", reason: "Too few digits" };
  }
  const cleaned = normalized.length > 25 ? normalized.substring(0, 25).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate and clean an e-mail address.
 *
 * Keeps letters, digits and `@ . - _`; requires exactly one `@`, a non-empty
 * local part and a domain of at least 3 characters containing a dot; removes
 * leading/trailing dots and hyphens from both parts; caps the result at 50
 * characters by shortening the local part only.
 *
 * @param {string} value Raw candidate address
 * @returns {{isValid: boolean, cleaned: string, reason?: string}}
 */
function validateEmail(value) {
  const MAX_LENGTH = 50;
  const reject = (reason) => ({ isValid: false, cleaned: "", reason });
  const trimEdges = (part) => part.replace(/^[\.\-]+|[\.\-]+$/g, "");

  const stripped = value.replace(/[^a-zA-Z0-9@\.\-\_]/g, "");
  if (!stripped.includes("@")) {
    return reject("Missing @ symbol");
  }
  const parts = stripped.split("@");
  if (parts.length !== 2) {
    return reject("Invalid @ usage");
  }
  const [local, domain] = parts;
  if (!local) {
    return reject("Empty local part");
  }
  if (!domain || domain.length < 3) {
    return reject("Invalid domain");
  }
  if (!domain.includes(".")) {
    return reject("Domain missing dot");
  }
  let cleanLocal = trimEdges(local);
  const cleanDomain = trimEdges(domain);
  if (!cleanLocal || !cleanDomain) {
    return reject("Invalid format after cleaning");
  }
  // Re-check the domain after trimming: ".com" passes the checks above but
  // trims to "com", which is not a usable domain.
  if (cleanDomain.length < 3) {
    return reject("Invalid domain");
  }
  if (!cleanDomain.includes(".")) {
    return reject("Domain missing dot");
  }
  // Enforce the length cap on the local part so the domain is never cut.
  const localRoom = MAX_LENGTH - cleanDomain.length - 1;
  if (cleanLocal.length > localRoom) {
    cleanLocal = localRoom > 0 ? trimEdges(cleanLocal.substring(0, localRoom)) : "";
    if (!cleanLocal) {
      return reject("Too long");
    }
  }
  return { isValid: true, cleaned: `${cleanLocal}@${cleanDomain}` };
}
/**
 * Validate and clean a street address: keep letters, digits, whitespace and
 * `- # . ,`; require at least 5 characters; cap at 50.
 */
function validateStreetAddress(value) {
  const normalized = value
    .replace(/[^a-zA-Z0-9\s\-\#\.\,]/g, "")
    .trim()
    .replace(/\s+/g, " ");
  if (normalized.length < 5) {
    return { isValid: false, cleaned: "", reason: "Too short" };
  }
  const cleaned = normalized.length > 50 ? normalized.substring(0, 50).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate and clean a city/state/country name: keep letters, whitespace,
 * hyphens and apostrophes; require at least 2 characters including a letter;
 * cap at 30.
 */
function validateLocation(value) {
  const normalized = value
    .replace(/[^a-zA-Z\s\-\']/g, "")
    .trim()
    .replace(/\s+/g, " ");
  const hasLetter = /[a-zA-Z]/.test(normalized);
  if (normalized.length < 2 || !hasLetter) {
    return { isValid: false, cleaned: "", reason: "Too short or no letters" };
  }
  const cleaned = normalized.length > 30 ? normalized.substring(0, 30).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate and clean free text: keep letters, digits, whitespace and
 * `- ' . ,`; require at least 2 characters; cap at 50.
 */
function validateText(value) {
  const normalized = value
    .replace(/[^a-zA-Z0-9\s\-\'\.\,]/g, "")
    .trim()
    .replace(/\s+/g, " ");
  if (normalized.length < 2) {
    return { isValid: false, cleaned: "", reason: "Too short" };
  }
  const cleaned = normalized.length > 50 ? normalized.substring(0, 50).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate and clean a color name: keep letters and whitespace; require at
 * least 3 characters; cap at 20.
 */
function validateColor(value) {
  const normalized = value
    .replace(/[^a-zA-Z\s]/g, "")
    .trim()
    .replace(/\s+/g, " ");
  if (normalized.length < 3) {
    return { isValid: false, cleaned: "", reason: "Too short" };
  }
  const cleaned = normalized.length > 20 ? normalized.substring(0, 20).trim() : normalized;
  return { isValid: true, cleaned };
}
/**
 * Validate and clean a UUID: needs at least 32 hexadecimal digits; the first
 * 32 are regrouped as 8-4-4-4-12.
 */
function validateUUID(value) {
  const hex = value.replace(/[^0-9a-fA-F\-]/g, "").replace(/-/g, "");
  if (hex.length < 32) {
    return { isValid: false, cleaned: "", reason: "Too few hex characters" };
  }
  // [start, end) offsets of the five dash-separated groups.
  const groups = [[0, 8], [8, 12], [12, 16], [16, 20], [20, 32]];
  const cleaned = groups.map(([from, to]) => hex.substring(from, to)).join("-");
  return { isValid: true, cleaned };
}
/**
 * Validate and clean a date string: keep digits, `-` and `/`; require at
 * least 8 digits; cap at 20 characters.
 */
function validateDate(value) {
  const stripped = value.replace(/[^0-9\-\/]/g, "");
  const digitMatches = stripped.match(/\d/g);
  const digitTotal = digitMatches === null ? 0 : digitMatches.length;
  if (digitTotal < 8) {
    return { isValid: false, cleaned: "", reason: "Too few digits" };
  }
  const cleaned = stripped.length > 20 ? stripped.substring(0, 20).trim() : stripped;
  return { isValid: true, cleaned };
}
/**
 * Fallback validator: drop ASCII control characters, trim, collapse
 * whitespace; reject when nothing is left.
 */
function validateGeneric(value) {
  const cleaned = value
    .replace(/[\x00-\x1F\x7F]/g, "")
    .trim()
    .replace(/\s+/g, " ");
  if (cleaned.length < 1) {
    return { isValid: false, cleaned: "", reason: "Empty after cleaning" };
  }
  return { isValid: true, cleaned };
}
/**
 * Learns simple per-label statistics (common prefixes/suffixes, character
 * frequencies, lengths) from training values and uses them to clean up and
 * score generated strings.
 */
class PatternCorrector {
  constructor() {
    this.patterns = new Map();
  }

  /**
   * Learn patterns from training data.
   * @param samples Iterable of `{ label, value }` objects
   */
  learnPatterns(samples) {
    const byLabel = new Map();
    for (const sample of samples) {
      if (!byLabel.has(sample.label)) {
        byLabel.set(sample.label, []);
      }
      byLabel.get(sample.label).push(sample.value);
    }
    for (const [label, values] of byLabel.entries()) {
      this.learnPattern(label, values);
    }
  }

  /**
   * Learn the pattern for a specific label. Does nothing for an empty list.
   * @param label Label the examples belong to
   * @param examples Training strings for that label
   */
  learnPattern(label, examples) {
    if (examples.length === 0) return;
    const prefixCounts = new Map();
    const suffixCounts = new Map();
    const charFreq = new Map();
    const lengths = [];
    const bump = (counts, key) => counts.set(key, (counts.get(key) || 0) + 1);
    for (const example of examples) {
      lengths.push(example.length);
      // Count prefixes and suffixes of length 1..3.
      const maxAffix = Math.min(3, example.length);
      for (let len = 1; len <= maxAffix; len++) {
        bump(prefixCounts, example.substring(0, len));
        bump(suffixCounts, example.substring(example.length - len));
      }
      for (const char of example) {
        bump(charFreq, char);
      }
    }
    // Keep affixes seen in more than 10% of examples, most frequent first, max 15.
    const mostCommon = (counts) => Array.from(counts.entries())
      .filter(([, count]) => count / examples.length > 0.1)
      .sort((a, b) => b[1] - a[1])
      .slice(0, 15)
      .map(([affix]) => affix);
    const commonPrefixes = mostCommon(prefixCounts);
    const commonSuffixes = mostCommon(suffixCounts);
    // Turn raw character counts into relative frequencies.
    const totalChars = Array.from(charFreq.values()).reduce((a, b) => a + b, 0);
    for (const [char, count] of charFreq.entries()) {
      charFreq.set(char, count / totalChars);
    }
    this.patterns.set(label, {
      label,
      examples,
      commonPrefixes,
      commonSuffixes,
      charFrequency: charFreq,
      lengthDistribution: lengths
    });
  }

  /**
   * Correct a generated string using learned patterns.
   *
   * Strings that equal a training example, or whose label has no pattern, are
   * returned unchanged. Otherwise characters that are neither alphanumeric or
   * whitespace nor reasonably frequent in the training data are dropped, and
   * strings much longer than the longest example are cut.
   *
   * @param generated Candidate string
   * @param label Label the candidate was generated for
   * @returns Corrected string
   */
  correct(generated, label) {
    const pattern = this.patterns.get(label);
    if (!pattern) {
      return generated;
    }
    if (pattern.examples.includes(generated)) {
      return generated;
    }
    let corrected = generated;
    const charFreq = pattern.charFrequency;
    let cleaned = "";
    for (const char of corrected) {
      const freq = charFreq.get(char) || 0;
      if (freq > 5e-3 || /[a-zA-Z0-9\s]/.test(char)) {
        cleaned += char;
      }
    }
    // Only adopt the filtered text when something survived the filter.
    if (cleaned.length > 0) {
      corrected = cleaned;
    }
    // A plain loop instead of Math.max(...lengths): spreading one argument per
    // training example can exceed the engine's argument limit on large datasets.
    let maxLength = -Infinity;
    for (const length of pattern.lengthDistribution) {
      if (length > maxLength) {
        maxLength = length;
      }
    }
    if (corrected.length > maxLength * 1.5) {
      corrected = corrected.substring(0, Math.floor(maxLength * 1.2));
    }
    return corrected;
  }

  /**
   * Score how well a generated string matches the pattern.
   * @param generated Candidate string
   * @param label Label the candidate was generated for
   * @returns 1 for an exact training example, 0.5 when the label has no
   *   pattern, otherwise the weighted sum of prefix (0.3), suffix (0.2),
   *   character-frequency (0.3) and length (0.2) components divided by the
   *   number of components
   */
  score(generated, label) {
    const pattern = this.patterns.get(label);
    if (!pattern) {
      return 0.5;
    }
    if (pattern.examples.includes(generated)) {
      return 1;
    }
    let score = 0;
    let factors = 0;
    const lowered = generated.toLowerCase();
    const prefixMatch = pattern.commonPrefixes.some(
      (prefix) => lowered.startsWith(prefix.toLowerCase())
    );
    score += prefixMatch ? 0.3 : 0;
    factors++;
    const suffixMatch = pattern.commonSuffixes.some(
      (suffix) => lowered.endsWith(suffix.toLowerCase())
    );
    score += suffixMatch ? 0.2 : 0;
    factors++;
    const charFreq = pattern.charFrequency;
    let charScore = 0;
    let charCount = 0;
    for (const char of generated) {
      charScore += charFreq.get(char) || 0;
      charCount++;
    }
    score += (charCount > 0 ? charScore / charCount : 0) * 0.3;
    factors++;
    const avgLength = pattern.lengthDistribution.reduce((a, b) => a + b, 0) / pattern.lengthDistribution.length;
    const lengthDiff = Math.abs(generated.length - avgLength) / avgLength;
    const lengthScore = Math.max(0, 1 - lengthDiff);
    score += lengthScore * 0.2;
    factors++;
    return factors > 0 ? score / factors : 0;
  }

  /**
   * Get the learned pattern for a label (undefined when none was learned).
   */
  getPattern(label) {
    return this.patterns.get(label);
  }
}
/**
 * Character n-gram model: maps each (n - 1)-character context to counts of
 * the character that followed it in the training strings.
 */
class SequenceContext {
  // n is the n-gram size
  constructor(n = 3) {
    this.ngramPatterns = new Map();
    this.n = n;
  }

  /**
   * Replace all learned n-grams with those found in `samples`.
   */
  learnPatterns(samples) {
    this.ngramPatterns.clear();
    const contextLength = this.n - 1;
    for (const sample of samples) {
      const lastStart = sample.length - this.n;
      for (let start = 0; start <= lastStart; start++) {
        const end = start + contextLength;
        const prefix = sample.substring(start, end);
        const follower = sample[end];
        let followers = this.ngramPatterns.get(prefix);
        if (followers === undefined) {
          followers = new Map();
          this.ngramPatterns.set(prefix, followers);
        }
        followers.set(follower, (followers.get(follower) || 0) + 1);
      }
    }
  }

  /**
   * Probability of each next character given the tail of `context`.
   * Returns an empty Map when the context was never seen.
   */
  getNextCharProbs(context) {
    const span = this.n - 1;
    const key = context.length >= span ? context.substring(context.length - span) : context;
    const counts = this.ngramPatterns.get(key);
    const probs = new Map();
    if (!counts || counts.size === 0) {
      return probs;
    }
    let total = 0;
    for (const count of counts.values()) {
      total += count;
    }
    for (const [ch, count] of counts) {
      probs.set(ch, count / total);
    }
    return probs;
  }

  /**
   * Most probable next character for `context` (first one wins ties), or
   * `null` when the context is unknown.
   */
  suggestNextChar(context) {
    const probs = this.getNextCharProbs(context);
    if (probs.size === 0) {
      return null;
    }
    let winner = "";
    let top = 0;
    probs.forEach((prob, ch) => {
      if (prob > top) {
        top = prob;
        winner = ch;
      }
    });
    return winner;
  }

  /**
   * Probability that `char` follows `context` (0 when unseen).
   */
  scoreChar(context, char) {
    return this.getNextCharProbs(context).get(char) || 0;
  }
}
/**
 * Label-conditioned string generator backed by an ELM model.
 *
 * The model input is a one-hot label vector concatenated with a noise vector;
 * the target is the encoded training string. Generated strings are
 * post-processed by the optional pattern corrector and n-gram sequence
 * context, then validated for the label.
 */
class ELMGenerator {
  /**
   * @param config Options. `maxLength` is read directly; every other key
   *   overrides the defaults listed below (`seed` is optional).
   */
  constructor(config) {
    this.elm = null;
    this.labels = [];
    this.patternCorrector = null;
    this.sequenceContext = null;
    this.config = {
      hiddenUnits: 128,
      activation: "relu",
      ridgeLambda: 0.01,
      noiseSize: 32,
      useOneHot: false,
      // Default to false for memory efficiency (can enable for better accuracy)
      useClassification: false,
      // Default to regression for compatibility
      usePatternCorrection: true,
      ...config
    };
    this.noiseSize = this.config.noiseSize;
    this.useClassification = this.config.useClassification;
    this.encoder = new StringEncoder({
      maxLength: config.maxLength,
      useOneHot: this.config.useOneHot ?? false
    });
    if (this.config.usePatternCorrection) {
      this.patternCorrector = new PatternCorrector();
    }
    this.sequenceContext = new SequenceContext(3);
  }

  /**
   * Train the ELM generator on labeled samples.
   * @param samples Array of `{ label, value }` objects
   * @throws {Error} When `samples` is empty or no sample could be processed
   */
  train(samples) {
    if (samples.length === 0) {
      throw new Error("Cannot train on empty dataset");
    }
    this.labels = Array.from(new Set(samples.map((s) => s.label)));
    const allValues = samples.map((s) => s.value);
    this.encoder.buildVocab(allValues);
    if (this.patternCorrector) {
      this.patternCorrector.learnPatterns(samples);
    }
    if (this.sequenceContext) {
      this.sequenceContext.learnPatterns(allValues);
    }
    const X = [];
    const Y = [];
    for (const sample of samples) {
      const labelIndex = this.labels.indexOf(sample.label);
      if (labelIndex === -1) {
        continue;
      }
      const labelOneHot = oneHotLabel(labelIndex, this.labels.length);
      // NOTE(review): when config.seed is set, every sample is given the same
      // seed here, so the noise part of each training input is identical.
      // Confirm this is intended.
      const noise = generateNoiseVector(this.noiseSize, this.config.seed);
      X.push([...labelOneHot, ...noise]);
      Y.push(this.encoder.encode(sample.value));
    }
    if (X.length === 0) {
      throw new Error("No valid training samples after processing");
    }
    this.elm = new ELM({
      useTokenizer: false,
      // Numeric mode
      inputSize: this.labels.length + this.noiseSize,
      categories: [],
      hiddenUnits: this.config.hiddenUnits,
      activation: this.config.activation,
      // Use lower regularization for better pattern learning
      ridgeLambda: this.config.ridgeLambda * 0.1,
      task: this.useClassification ? "classification" : "regression"
    });
    this.elm.trainFromData(X, Y);
  }

  /**
   * Run the trained model on one input vector and decode its output to a
   * string. Shared by the first attempt and the retries in `generate`.
   * @param inputVector Label one-hot concatenated with noise
   * @returns Decoded (uncorrected, unvalidated) string
   */
  decodeFromInput(inputVector) {
    const vocabSize = this.encoder.getVocabSize();
    const useProbabilities = this.useClassification
      && this.config.useOneHot
      && typeof this.elm.predictProbaFromVector === "function";
    if (useProbabilities) {
      const maxLength = this.config.maxLength;
      const probs = this.elm.predictProbaFromVector(inputVector);
      // This branch only runs with useOneHot enabled, so the encoder decodes
      // flattened one-hot vectors. Build one from the per-position argmax;
      // passing the bare indices (as before) made the encoder read them as
      // one-hot chunks and decode garbage.
      const flatOneHot = new Array(maxLength * vocabSize).fill(0);
      for (let pos = 0; pos < maxLength; pos++) {
        const posProbs = probs.slice(pos * vocabSize, (pos + 1) * vocabSize);
        const maxIdx = posProbs.indexOf(Math.max(...posProbs));
        if (maxIdx >= 0) {
          flatOneHot[pos * vocabSize + maxIdx] = 1;
        }
      }
      return this.encoder.decode(flatOneHot);
    }
    const prediction = this.elm.predictLogitsFromVector(inputVector);
    const indices = prediction.map((val) => {
      const clamped = Math.max(-vocabSize, Math.min(vocabSize * 2, val));
      return Math.max(0, Math.min(vocabSize - 1, Math.round(clamped)));
    });
    return this.encoder.decode(indices);
  }

  /**
   * Generate a string for a given label.
   * @param label Label to generate for
   * @param noiseSeed Optional seed for noise generation (for deterministic output)
   * @returns The validated, cleaned string, or "" when the first attempt and
   *   three retries all fail validation
   * @throws {Error} When the model is untrained or the label is unknown
   */
  generate(label, noiseSeed) {
    if (!this.elm) {
      throw new Error("Model not trained. Call train() first.");
    }
    const labelIndex = this.labels.indexOf(label);
    if (labelIndex === -1) {
      throw new Error(`Label '${label}' not found in training data`);
    }
    const labelOneHot = oneHotLabel(labelIndex, this.labels.length);
    const noise = generateNoiseVector(
      this.noiseSize,
      noiseSeed !== void 0 ? noiseSeed : this.config.seed
    );
    const decoded = this.decodeFromInput([...labelOneHot, ...noise]);
    let corrected = decoded;
    if (this.patternCorrector) {
      corrected = this.patternCorrector.correct(decoded, label);
    }
    if (this.sequenceContext && corrected.length > 0) {
      corrected = this.refineWithSequenceContext(corrected, label);
    }
    const validation = validateForLabel(label, corrected);
    if (validation.isValid) {
      return validation.cleaned;
    }
    // Retry with different noise. As before, retries apply pattern correction
    // but not the sequence-context refinement.
    for (let attempt = 0; attempt < 3; attempt++) {
      const baseSeed = noiseSeed !== void 0 ? noiseSeed : (this.config.seed ?? Date.now());
      const newNoise = generateNoiseVector(
        this.noiseSize,
        baseSeed + attempt + 1e3
      );
      let newDecoded = this.decodeFromInput([...labelOneHot, ...newNoise]);
      if (this.patternCorrector) {
        newDecoded = this.patternCorrector.correct(newDecoded, label);
      }
      const newValidation = validateForLabel(label, newDecoded);
      if (newValidation.isValid) {
        return newValidation.cleaned;
      }
    }
    return "";
  }

  /**
   * Generate multiple distinct strings for a label and keep the best-scoring
   * ones. Always runs `count * 10` attempts so there are candidates to rank.
   * @param label Label to generate for
   * @param count Maximum number of strings to return
   * @returns Up to `count` strings (distinct ignoring case), best score first
   */
  generateBatch(label, count) {
    const candidates = [];
    const seen = new Set();
    const maxAttempts = count * 10;
    for (let attempts = 0; attempts < maxAttempts; attempts++) {
      const seed = this.config.seed !== void 0 ? this.config.seed + attempts : Date.now() + attempts;
      try {
        const generated = this.generate(label, seed);
        if (generated && generated.length > 0 && !seen.has(generated.toLowerCase())) {
          let score = 1;
          if (this.patternCorrector) {
            score = this.patternCorrector.score(generated, label);
          }
          const validation = validateForLabel(label, generated);
          if (!validation.isValid) {
            score = 0;
          }
          candidates.push({ value: generated, score });
          seen.add(generated.toLowerCase());
        }
      } catch {
        // Best effort: a failed attempt is skipped and the loop moves on.
      }
    }
    candidates.sort((a, b) => b.score - a.score);
    return candidates.slice(0, count).map((c) => c.value);
  }

  /**
   * Refine a generated string using the n-gram sequence context: a character
   * that is unlikely after the text built so far (probability below 0.1) is
   * replaced by the model's most likely next character, when one exists.
   * @param generated String to refine
   * @param label Unused; kept for interface compatibility
   */
  refineWithSequenceContext(generated, label) {
    if (!this.sequenceContext || generated.length === 0) {
      return generated;
    }
    let refined = "";
    for (let i = 0; i < generated.length; i++) {
      const context = refined;
      const currentChar = generated[i];
      const contextScore = this.sequenceContext.scoreChar(context, currentChar);
      let nextChar = currentChar;
      if (contextScore < 0.1 && context.length > 0) {
        const suggested = this.sequenceContext.suggestNextChar(context);
        if (suggested && suggested !== currentChar) {
          nextChar = suggested;
        }
      }
      refined += nextChar;
      // Stop once the padding character has been consumed.
      if (currentChar === "\0") {
        break;
      }
    }
    return refined;
  }

  /**
   * Get a copy of all trained labels.
   */
  getLabels() {
    return [...this.labels];
  }

  /**
   * Check if the model is trained.
   */
  isTrained() {
    return this.elm !== null;
  }
}
class HybridGenerator {
constructor(config) {
this.patternCorrector = null;
this.config = {
elmHiddenUnits: 128,
elmActivation: "relu",
elmRidgeLambda: 0.01,
noiseSize: 32,
jitterStrength: 0.05,
// 5% jitter by default (reduced for better realism)
exactMode: false,
useOneHot: false,
// Default to false for memory efficiency
useClassification: false,
usePatternCorrection: true,
...config
};
if (this.config.exactMode) {
this.jitterStrength = 0;
} else {
this.jitterStrength = this.config.jitterStrength;
}
this.retrieval = new RetrievalGenerator(config.seed);
this.elm = new ELMGenerator({
maxLength: config.maxLength,
hiddenUnits: this.config.elmHiddenUnits,
activation: this.config.elmActivation,
ridgeLambda: this.config.elmRidgeLambda,
noiseSize: this.config.noiseSize,
useOneHot: this.config.useOneHot,
useClassification: this.config.useClassification,
usePatternCorrection: this.config.usePatternCorrection,
seed: config.seed
});
this.encoder = new StringEncoder({
maxLength: config.maxLength,
useOneHot: this.config.useOneHot ?? false
// Default to false for memory efficiency
});
if (this.config.usePatternCorrection) {
this.patternCorrector = new PatternCorrector();
}
}
/**
* Train the hybrid generator on labeled samples
*/
train(samples) {
this.retrieval.ingest(samples);
const allValues = samples.map((s) => s.value);
this.encoder.buildVocab(allValues);
this.elm.train(samples);
if (this.patternCorrector) {
this.patternCorrector.learnPatterns(samples);
}
}
/**
* Generate a hybrid sample (retrieval + jitter)
* @param label Label to generate for
* @param noiseSeed Optional seed for deterministic output
*/
generate(label, noiseSeed) {
const retrieved = this.retrieval.sampleOne(label);
if (!retrieved) {
return this.elm.generate(label, noiseSeed);
}
const encoded = this.encoder.encode(retrieved);
const jittered = this.applyJitter(encoded, label, noiseSeed);
const decoded = this.encoder.decode(jittered);
let corrected = decoded;
if (this.patternCorrector) {
corrected = this.patternCorrector.correct(decoded, label);
}
const validation = validateForLabel(label, corrected);
if (!validation.isValid) {
for (let attempt = 0; attempt < 2; attempt++) {
const newSeed = noiseSeed !== void 0 ? noiseSeed + attempt + 1e3 : void 0;
const newJittered = this.applyJitter(encoded, label, newSeed);
const newDecoded = this.encoder.decode(newJittered);
let newCorrected = newDecoded;
if (this.patternCorrector) {
newCorrected = this.patternCorrector.correct(newDecoded, label);
}
const newValidation = validateForLabel(label, newCorrected);
if (newValidation.isValid) {
return newValidation.cleaned;
}
}
return retrieved;
}
return validation.cleaned;
}
/**
* Apply jitter to an encoded vector
*/
applyJitter(encoded, label, noiseSeed) {
const elmOutput = this.generateELMVector(label, noiseSeed);
if (!elmOutput || elmOutput.length === 0 || elmOutput.every((v) => v === 0)) {
return encoded;
}
const effectiveJitter = Math.min(this.jitterStrength, 0.05);
const jittered = encoded.map((val, idx) => {
const elmVal = elmOutput[idx] || 0;
return (1 - effectiveJitter) * val + effectiveJitter * elmVal;
});
const vocabSize = this.encoder.getVocabSize();
const indices = jittered.map((val) => {
const clamped = Math.max(0, Math.min(vocabSize - 1, val));
const idx = Math.round(clamped);
return Math.max(0, Math.min(vocabSize - 1, idx));
});
return indices;
}
/**
* Generate an ELM vector for jittering
*/
generateELMVector(label, noiseSeed) {
try {
const elmGenerated = this.elm.generate(label, noiseSeed);
if (elmGenerated && elmGenerated.length > 0) {
return this.encoder.encode(elmGenerated);
}
return new Array(this.encoder.getVectorSize()).fill(0);
} catch {
return new Array(this.encoder.getVectorSize()).fill(0);
}
}
/**
* Generate multiple hybrid samples
*/
generateBatch(label, count) {
const results = [];
const seen = /* @__PURE__ */ new Set();
let attempts = 0;
const maxAttempts = count * 5;
while (results.length < count && attempts < maxAttempts) {
const seed = this.config.seed !== void 0 ? this.config.seed + attempts : Date.now() + attempts;
const generated = this.generate(label, seed);
if (generated && generated.length > 0 && !seen.has(generated.toLowerCase())) {
results.push(generated);
seen.add(generated.toLowerCase());
}
attempts++;
}
return results;
}
/**
* Get all available labels
*/
getLabels() {
return this.retrieval.getLabels();
}
/**
* Check if generator is trained
*/
isTrained() {
return this.retrieval.hasLabel(this.getLabels()[0] || "") && this.elm.isTrained();
}
}
/**
 * Generator that replays retrieved training values and can optionally splice
 * two learned examples together to produce a variation.
 */
class ExactGenerator {
  /**
   * @param {object} [config]
   * @param {number} [config.seed] Seed passed to the retrieval generator and
   *   used as the base seed in generateBatch.
   * @param {boolean} [config.usePatternMatching=true] Enables pattern learning
   *   and the pattern-based paths.
   * @param {number} [config.maxVariations=10] Stored in the config; not read
   *   by this class.
   */
  constructor(config = {}) {
    this.trainingSamples = [];
    this.config = {
      usePatternMatching: true,
      maxVariations: 10,
      ...config
    };
    this.retrieval = new RetrievalGenerator(config.seed);
    this.patternCorrector = new PatternCorrector();
  }
  /**
   * Train the exact generator
   * @param {Array<{label: string, value: string}>} samples Labeled samples
   */
  train(samples) {
    this.trainingSamples = samples;
    this.retrieval.ingest(samples);
    if (this.config.usePatternMatching) {
      this.patternCorrector.learnPatterns(samples);
    }
  }
  /**
   * Map an arbitrary seed onto a valid index in [0, length).
   * A plain `seed % length` is negative for negative seeds and fractional for
   * non-integer seeds, neither of which is a usable array index. Non-negative
   * integer seeds map exactly as `seed % length` does.
   * @param {number} seed Seed value (any number)
   * @param {number} length Size of the list being indexed (> 0)
   * @returns {number} Integer index; 0 when the seed is not a finite number
   */
  seededIndex(seed, length) {
    const whole = Math.floor(Number(seed));
    if (!Number.isFinite(whole)) {
      return 0;
    }
    const remainder = whole % length;
    return remainder < 0 ? remainder + length : remainder;
  }
  /**
   * Generate an exact sample: a retrieved value when one exists, otherwise a
   * learned pattern example.
   * @param {string} label Label to generate for
   * @param {number} [seed] Selects the pattern example deterministically
   * @returns {string} The sample
   * @throws {Error} When neither retrieval nor patterns know the label
   */
  generate(label, seed) {
    const exact = this.retrieval.sampleOne(label);
    if (exact) {
      return exact;
    }
    if (this.config.usePatternMatching) {
      const pattern = this.patternCorrector.getPattern(label);
      if (pattern && pattern.examples.length > 0) {
        const total = pattern.examples.length;
        const index = seed !== void 0 ? this.seededIndex(seed, total) : Math.floor(Math.random() * total);
        return pattern.examples[index];
      }
    }
    throw new Error(`No samples found for label: ${label}`);
  }
  /**
   * Generate with pattern-based variations: splice the first half of one
   * example onto the second half of another, and keep the result only when it
   * validates and scores above 0.6. Otherwise the plain generate() result is
   * returned.
   * @param {string} label Label to generate for
   * @param {number} [seed] Seed selecting the two examples (Date.now() if omitted)
   * @returns {string} The variation, or the base sample
   */
  generateWithVariation(label, seed) {
    const base = this.generate(label, seed);
    if (!this.config.usePatternMatching) {
      return base;
    }
    const pattern = this.patternCorrector.getPattern(label);
    if (!pattern) {
      return base;
    }
    const examples = pattern.examples;
    if (examples.length >= 2) {
      const seed1 = seed !== void 0 ? seed : Date.now();
      const seed2 = seed1 + 1e3;
      const idx1 = this.seededIndex(seed1, examples.length);
      const idx2 = this.seededIndex(seed2, examples.length);
      if (idx1 !== idx2) {
        const ex1 = examples[idx1];
        const ex2 = examples[idx2];
        // Only splice examples of nearly equal length (difference of at most 2).
        if (Math.abs(ex1.length - ex2.length) <= 2) {
          const mid = Math.floor(ex1.length / 2);
          const variation = ex1.substring(0, mid) + ex2.substring(mid);
          const validation = validateForLabel(label, variation);
          if (validation.isValid) {
            const score = this.patternCorrector.score(variation, label);
            if (score > 0.6) {
              return validation.cleaned;
            }
          }
        }
      }
    }
    return base;
  }
  /**
   * Generate multiple exact samples, de-duplicated case-insensitively.
   * Makes at most `count * 2` attempts, so fewer than `count` values may be
   * returned.
   * @param {string} label Label to generate for
   * @param {number} count Number of samples wanted
   * @returns {string[]} Unique samples
   */
  generateBatch(label, count) {
    const results = [];
    const seen = /* @__PURE__ */ new Set();
    for (let i = 0; i < count * 2 && results.length < count; i++) {
      const seed = this.config.seed !== void 0 ? this.config.seed + i : Date.now() + i;
      let generated;
      // The first `count` attempts use plain generation; later ones try variations.
      if (i < count && this.config.usePatternMatching) {
        generated = this.generate(label, seed);
      } else {
        generated = this.generateWithVariation(label, seed);
      }
      if (generated && !seen.has(generated.toLowerCase())) {
        results.push(generated);
        seen.add(generated.toLowerCase());
      }
    }
    return results;
  }
  /**
   * Get all available labels
   */
  getLabels() {
    return this.retrieval.getLabels();
  }
  /**
   * Check if generator is trained (the retrieval generator reports at least one label)
   */
  isTrained() {
    return this.retrieval.getLabels().length > 0;
  }
}
/**
 * Combines an exact generator, a low-jitter hybrid generator and (optionally)
 * a dedicated ELM generator, then returns the best-scoring candidate.
 */
class PerfectGenerator {
  /**
   * @param config Generator options; `maxLength` and `seed` are forwarded to
   *   the underlying generators, the rest override the defaults below.
   */
  constructor(config) {
    this.elm = null;
    this.trainingSamples = [];
    const defaults = {
      preferExact: true,
      usePatternMatching: true,
      // Off by default: enabling it creates a second ELM (memory cost).
      useImprovedELM: false,
      // 128 rather than 256 hidden units, for memory efficiency.
      elmHiddenUnits: 128,
      elmActivation: "relu",
      // Low regularization.
      elmRidgeLambda: 1e-3,
      noiseSize: 32
    };
    this.config = { ...defaults, ...config };
    this.exact = new ExactGenerator({
      seed: config.seed,
      usePatternMatching: this.config.usePatternMatching
    });
    this.hybrid = new HybridGenerator({
      maxLength: config.maxLength,
      seed: config.seed,
      // Jitter stays enabled, but at a very low strength (2%).
      exactMode: false,
      jitterStrength: 0.02,
      // One-hot and classification are disabled to reduce memory.
      useOneHot: false,
      useClassification: false,
      usePatternCorrection: true,
      elmHiddenUnits: this.config.elmHiddenUnits,
      elmActivation: this.config.elmActivation,
      elmRidgeLambda: this.config.elmRidgeLambda,
      noiseSize: this.config.noiseSize
    });
    // The dedicated ELM is only built when the caller explicitly passed `true`.
    if (this.config.useImprovedELM && config.useImprovedELM === true) {
      this.elm = new ELMGenerator({
        maxLength: config.maxLength,
        seed: config.seed,
        hiddenUnits: this.config.elmHiddenUnits,
        activation: this.config.elmActivation,
        ridgeLambda: this.config.elmRidgeLambda,
        noiseSize: this.config.noiseSize,
        // One-hot and classification are disabled to reduce memory.
        useOneHot: false,
        useClassification: false,
        usePatternCorrection: true
      });
    }
    this.patternCorrector = new PatternCorrector();
  }
  /**
   * Train the exact generator and the pattern corrector immediately; the
   * hybrid and ELM generators are trained lazily on first use.
   */
  train(samples) {
    this.trainingSamples = samples;
    this.exact.train(samples);
    this.patternCorrector.learnPatterns(samples);
  }
  /**
   * Train the hybrid generator on first use, if there is anything to train on
   */
  ensureHybridTrained() {
    if (this.hybrid.isTrained() || !(this.trainingSamples.length > 0)) {
      return;
    }
    this.hybrid.train(this.trainingSamples);
  }
  /**
   * Train the dedicated ELM generator on first use, when one was created
   */
  ensureELMTrained() {
    if (!this.elm || this.elm.isTrained() || !(this.trainingSamples.length > 0)) {
      return;
    }
    this.elm.train(this.trainingSamples);
  }
  /**
   * Collect candidates from every available strategy and return the best one.
   * A strategy that throws is skipped. Throws when no strategy produced a value.
   */
  generate(label, seed) {
    const candidates = [];
    const alreadyProposed = (value) => candidates.some((entry) => entry.value === value);
    // Pattern score scaled by one factor when the value validates, another when it does not.
    const weightedScore = (value, validFactor, invalidFactor) => {
      const patternScore = this.patternCorrector.score(value, label);
      const { isValid } = validateForLabel(label, value);
      return patternScore * (isValid ? validFactor : invalidFactor);
    };
    try {
      const exactValue = this.exact.generate(label, seed);
      if (exactValue) {
        candidates.push({ value: exactValue, score: 1, source: "exact" });
      }
    } catch {
      // Best effort: this strategy contributes no candidate.
    }
    try {
      const variedValue = this.exact.generateWithVariation(label, seed);
      if (variedValue && variedValue !== candidates[0]?.value) {
        const patternScore = this.patternCorrector.score(variedValue, label);
        candidates.push({ value: variedValue, score: patternScore * 0.95, source: "exact-variation" });
      }
    } catch {
      // Best effort: this strategy contributes no candidate.
    }
    try {
      this.ensureHybridTrained();
      const hybridValue = this.hybrid.generate(label, seed);
      if (hybridValue && !alreadyProposed(hybridValue)) {
        candidates.push({ value: hybridValue, score: weightedScore(hybridValue, 0.85, 0.5), source: "hybrid" });
      }
    } catch {
      // Best effort: this strategy contributes no candidate.
    }
    if (this.elm) {
      try {
        this.ensureELMTrained();
        const elmValue = this.elm.generate(label, seed);
        if (elmValue && !alreadyProposed(elmValue)) {
          candidates.push({ value: elmValue, score: weightedScore(elmValue, 0.8, 0.4), source: "elm" });
        }
      } catch {
        // Best effort: this strategy contributes no candidate.
      }
    }
    if (candidates.length === 0) {
      throw new Error(`No samples found for label: ${label}`);
    }
    // Highest score first.
    candidates.sort((left, right) => right.score - left.score);
    if (this.config.preferExact) {
      const exactEntry = candidates.find((entry) => entry.source === "exact");
      if (exactEntry && exactEntry.score >= 0.9) {
        return exactEntry.value;
      }
    }
    const [best] = candidates;
    return best.value;
  }
  /**
   * Generate up to `count` unique samples (case-insensitive), making at most
   * five attempts per requested sample. Attempts that throw are skipped.
   */
  generateBatch(label, count) {
    const accepted = [];
    const seenKeys = /* @__PURE__ */ new Set();
    const attemptLimit = count * 5;
    for (let attempt = 0; accepted.length < count && attempt < attemptLimit; attempt++) {
      const baseSeed = this.config.seed !== void 0 ? this.config.seed : Date.now();
      try {
        const text = this.generate(label, baseSeed + attempt);
        if (text && text.length > 0) {
          const key = text.toLowerCase();
          if (!seenKeys.has(key)) {
            accepted.push(text);
            seenKeys.add(key);
          }
        }
      } catch {
        // A failed attempt still counts toward the limit.
      }
    }
    return accepted;
  }
  /**
   * Get all available labels (as reported by the exact generator)
   */
  getLabels() {
    const labels = this.exact.getLabels();
    return labels;
  }
  /**
   * Check if generator is trained (as reported by the exact generator)
   */
  isTrained() {
    const trained = this.exact.isTrained();
    return trained;
  }
}
/**
 * Decode a byte array into a string using a default-constructed TextDecoder.
 * @param {Uint8Array} bytes Bytes to decode
 * @returns {string} Decoded text
 */
function fromUint8ArrayToString(bytes) {
  return new TextDecoder().decode(bytes);
}
/**
 * Decode a base64url string (URL-safe alphabet, padding optional) into bytes.
 *
 * Uses `atob` when it exists and otherwise falls back to the global `Buffer`.
 * The fallback deliberately avoids `require`, which is not defined in an ES
 * module and would make the fallback itself throw.
 *
 * @param {string} str base64url-encoded input
 * @returns {Uint8Array} Decoded bytes
 * @throws {Error} When neither `atob` nor `Buffer` is available; `atob` may
 *   also throw on malformed input
 */
function base64urlDecode(str) {
  // Map the URL-safe alphabet back to standard base64.
  let base64 = str.replace(/-/g, "+").replace(/_/g, "/");
  // Restore the padding that base64url omits.
  while (base64.length % 4 !== 0) {
    base64 += "=";
  }
  if (typeof atob !== "undefined") {
    // atob yields one character per byte.
    const binary = atob(base64);
    const bytes = new Uint8Array(binary.length);
    for (let i = 0; i < binary.length; i++) {
      bytes[i] = binary.charCodeAt(i);
    }
    return bytes;
  }
  if (typeof Buffer !== "undefined") {
    // Copy into a plain Uint8Array so both paths return the same type.
    return new Uint8Array(Buffer.from(base64, "base64"));
  }
  throw new Error("No base64 decoder available: neither atob nor Buffer is defined");
}
/**
 * Decode a base64url string and parse the resulting text as JSON.
 * @param {string} str base64url-encoded JSON document
 * @returns {*} The parsed value
 */
function base64urlDecodeJson(str) {
  // bytes -> text -> parsed value
  return JSON.parse(fromUint8ArrayToString(base64urlDecode(str)));
}
/**
 * Convert a JOSE-style signature (two equal-length halves, r followed by s)
 * into a DER SEQUENCE of two INTEGERs.
 *
 * Lengths of 128 bytes or more are written in DER long form. A single short
 * length byte would be invalid for them, which happens e.g. with a 132-byte
 * signature.
 *
 * @param {Uint8Array|number[]} joseSig Concatenated r and s bytes
 * @returns {Uint8Array} DER-encoded signature
 * @throws {Error} When the input is empty or has an odd length
 */
function joseToDer(joseSig) {
  const len = joseSig.length;
  if (len === 0 || len % 2 !== 0)
    throw new Error("Invalid JOSE signature length");
  const size = len / 2;
  // DER definite length: short form below 128, otherwise 0x81/0x82 + big-endian length.
  const derLength = (n) => {
    if (n < 128)
      return [n];
    if (n < 256)
      return [129, n];
    if (n < 65536)
      return [130, n >> 8, n & 255];
    throw new Error("Invalid JOSE signature length");
  };
  // Encode one half as INTEGER: minimal bytes, plus a 0x00 pad when the high
  // bit is set so the value is not read as negative.
  const derInteger = (half) => {
    const value = trimLeadingZeros(half);
    const needsPad = (value[0] & 128) !== 0;
    const contentLen = value.length + (needsPad ? 1 : 0);
    const header = [2, ...derLength(contentLen)];
    const out = new Uint8Array(header.length + contentLen);
    out.set(header, 0);
    // The pad byte, when present, is already 0 from allocation.
    out.set(value, out.length - value.length);
    return out;
  };
  const r = derInteger(Uint8Array.from({ length: size }, (_, i) => joseSig[i]));
  const s = derInteger(Uint8Array.from({ length: size }, (_, i) => joseSig[i + size]));
  const bodyLen = r.length + s.length;
  const seqHeader = [48, ...derLength(bodyLen)];
  const der = new Uint8Array(seqHeader.length + bodyLen);
  der.set(seqHeader, 0);
  der.set(r, seqHeader.length);
  der.set(s, seqHeader.length + r.length);
  return der;
}
/**
 * Return a copy of `bytes` without its leading zero bytes. The last byte is
 * always kept, so an all-zero input yields a single zero byte.
 * @param {Uint8Array|number[]} bytes Input bytes
 * @returns {Uint8Array} Fresh array holding the trimmed bytes
 */
function trimLeadingZeros(bytes) {
  const lastIndex = bytes.length - 1;
  let start = 0;
  for (; start < lastIndex && bytes[start] === 0; start++) {
    // Skip over leading zeros, never advancing past the final byte.
  }
  return Uint8Array.from({ length: bytes.length - start }, (_, offset) => bytes[start + offset]);
}
/**
 * Current time as whole seconds, derived from Date.now() and rounded down.
 * @returns {number} Integer seconds
 */
function nowEpochSeconds() {
  const millis = Date.now();
  return Math.floor(millis / 1e3);
}
// Module-level JWKS cache indexed by JWKS URL. fetchJwks looks entries up by
// URL and compares each entry's `fetchedAt` against its maxAgeSeconds argument.
const cache = {};
async function fetchJwks(jwksUrl, maxAgeSeconds = 300) {
const now = Math.floor(Date.now() / 1e3);
const entry = cache[jwksUrl];
if (entry && now - entry.fetchedAt < maxAgeSeconds) {