@synthart/synthlite
Version:
A fast, lightweight Gen AI powered synthetic data generator written in TypeScript. 🌞
298 lines (291 loc) • 15.9 kB
JavaScript
;
/**
*
* @file synthlite-dataset.ts
* @description Handles the generation of synthetic datasets based on a given JSON schema.
* @date January 2024
* @version 1.0.0
* @license Affero General Public License v3.0
* ✨ "We write to change." — Anonymous
*
*/
/**
 * Compiler-emitted helper that drives a generator function as an async function.
 * Each yielded value is adopted into a promise; its fulfilment is sent back into the
 * generator with `next`, its rejection with `throw`. The returned promise settles with
 * the generator's return value or with the first uncaught error.
 * An already-registered `this.__awaiter` takes precedence when present.
 */
var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
    // Default to the native Promise when no promise constructor is supplied.
    if (!P) {
        P = Promise;
    }
    // Wrap plain values so that every yielded value can be chained with `.then`.
    function adopt(value) {
        if (value instanceof P) {
            return value;
        }
        return new P(function (resolve) { resolve(value); });
    }
    return new P(function (resolve, reject) {
        // Builds the continuation that resumes the generator via `next` or `throw`.
        function resume(method) {
            return function (value) {
                try {
                    step(generator[method](value));
                }
                catch (e) {
                    reject(e);
                }
            };
        }
        // Settles the outer promise when the generator is done, otherwise waits for the yielded value.
        function step(result) {
            if (result.done) {
                resolve(result.value);
            }
            else {
                adopt(result.value).then(resume("next"), resume("throw"));
            }
        }
        generator = generator.apply(thisArg, _arguments || []);
        step(generator.next());
    });
};
/**
 * Compiler-emitted helper implementing object rest (`const { a, ...rest } = s`).
 * Returns a new object holding every own enumerable property of `s` (string keys and
 * symbols) whose key is not listed in `e`. A null/undefined `s` yields an empty object.
 * An already-registered `this.__rest` takes precedence when present.
 */
var __rest = (this && this.__rest) || function (s, e) {
    var remainder = {};
    var key;
    for (key in s) {
        // for...in also walks inherited keys; keep own ones only.
        if (!Object.prototype.hasOwnProperty.call(s, key)) {
            continue;
        }
        if (e.indexOf(key) >= 0) {
            continue;
        }
        remainder[key] = s[key];
    }
    if (s == null || typeof Object.getOwnPropertySymbols !== "function") {
        return remainder;
    }
    var symbols = Object.getOwnPropertySymbols(s);
    for (var i = 0; i < symbols.length; i++) {
        var symbol = symbols[i];
        var excluded = e.indexOf(symbol) >= 0;
        if (!excluded && Object.prototype.propertyIsEnumerable.call(s, symbol)) {
            remainder[symbol] = s[symbol];
        }
    }
    return remainder;
};
/**
 * Compiler-emitted interop helper for default imports.
 * Modules flagged with `__esModule` are returned unchanged; anything else is wrapped so
 * that the value itself is reachable as `.default`.
 * An already-registered `this.__importDefault` takes precedence when present.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.SynthliteDataset = void 0;
const ai_1 = require("ai");
const json_schema_to_zod_1 = __importDefault(require("json-schema-to-zod"));
const zod_1 = require("zod");
const ai_2 = require("../common/ai");
const constants_1 = require("../common/constants");
const generated_dataset_1 = require("./generated-dataset");
const printer_1 = require("../common/printer");
const synthlite_emitter_1 = require("./synthlite-emitter");
const promises_1 = __importDefault(require("fs/promises"));
const uuid_1 = require("uuid");
/**
 * SynthliteDataset: Handles the generation of synthetic datasets based on a given JSON schema.
 *
 * Rows are requested from the configured AI model in batches. Every accepted row is
 * compared (ignoring its `id`) against all rows accepted so far, gets an `id`, and is
 * appended to the dataset; duplicates are sent back to the model once for mutation.
 */
class SynthliteDataset {
    /**
     * Constructs a SynthliteDataset instance.
     * @param jsonSchema - The JSON schema to use for generating synthetic data.
     */
    constructor(jsonSchema) {
        const schemaJS = (0, json_schema_to_zod_1.default)(jsonSchema);
        // SECURITY NOTE(review): the Zod source produced from the JSON schema is executed
        // with eval(). Only load schemas from trusted sources; a crafted schema could
        // otherwise run arbitrary code in this process.
        this.schema = eval(`const z = require('zod');\n${schemaJS}`);
        this.emitter = synthlite_emitter_1.SynthliteEmitter.getInstance();
    }
    /**
     * Creates a SynthliteDataset instance from a schema file.
     * @param schemaPath - The path to the schema file.
     * @returns A promise that resolves to a new SynthliteDataset instance.
     *          Rejects when the file cannot be read or does not contain valid JSON.
     */
    static fromSchemaFile(schemaPath) {
        return __awaiter(this, void 0, void 0, function* () {
            const schema = yield promises_1.default.readFile(schemaPath, "utf-8");
            return new SynthliteDataset(JSON.parse(schema));
        });
    }
    /**
     * Returns a shallow copy of a row without its own `id` key.
     * @param row - The row to copy.
     * @returns The copy, with the remaining keys in their original order.
     */
    static omitId(row) {
        const copy = {};
        for (const key of Object.keys(row)) {
            if (key !== "id") {
                copy[key] = row[key];
            }
        }
        return copy;
    }
    /**
     * Returns a shallow copy of a row that is guaranteed to carry an `id`.
     * @param row - The row to copy.
     * @returns The copy; an existing truthy `id` is kept, otherwise a UUID v4 is assigned.
     */
    static withId(row) {
        return Object.assign({}, row, { id: row.id || (0, uuid_1.v4)() });
    }
    /**
     * Picks up to `limit` random items without modifying the input array.
     * @param items - The array to sample from.
     * @param limit - The maximum number of items to return.
     * @returns A new array with the sampled items.
     */
    static sample(items, limit) {
        const pool = items.slice();
        const size = Math.min(limit, pool.length);
        // Partial Fisher-Yates shuffle: only the first `size` slots need to be randomised.
        for (let i = 0; i < size; i++) {
            const j = i + Math.floor(Math.random() * (pool.length - i));
            const picked = pool[j];
            pool[j] = pool[i];
            pool[i] = picked;
        }
        return pool.slice(0, size);
    }
    /**
     * Returns the guidance text shared by the generation and mutation prompts.
     * @returns The guidance lines joined with newlines (no trailing newline).
     */
    static uniquenessHints() {
        return [
            "If you are finding it difficult to generate unique data points, try to make minor adjustments to the existing data points to make them unique.",
            "- For instance, for string fields, change some characters or add adjectives.",
            "- For numbers, tweak the value up or down accordingly.",
            "- For booleans, consider flipping them.",
            "- For arrays, add or remove elements.",
            "- For objects, change the values of the keys.",
            "- Scramble, change, combine and mutate as many keys as possible, just make sure the data is meaningful.",
            "- For names and addresses, consider changing the first or last name, or the street name or number.",
            "- For dates, consider changing the year, month or day.",
            "- For emails, consider changing the domain or the username.",
            "- For phone numbers, consider changing the area code or the country code.",
            "- For URLs, consider changing the domain or the path.",
            "- For any other field, consider changing the value slightly.",
            "- Make sure the data is coherent and makes sense.",
            "- For numeric values like age, consider changing the value slightly.",
            "- For categorical values consider changing the category.",
            "- For text fields, consider changing the text slightly, be creative if needed.",
            "- For numeric values like width, height, sensor data, temperature, etc consider a delta of +/- 10% as long as its within range.",
        ].join("\n");
    }
    /**
     * Generates synthetic data based on the provided schema.
     *
     * Batches are requested until `count` rows have been accepted or until
     * `Constants.CONSECUTIVE_NOOP_THRESHOLD` consecutive batches add no row (so fewer than
     * `count` rows may be returned). A "synthlite:write_data" event carrying the rows
     * accepted so far is emitted after every batch.
     * @param count - The number of synthetic data rows to generate.
     * @returns A promise that resolves to a GeneratedDataset instance.
     */
    generate(count) {
        return __awaiter(this, void 0, void 0, function* () {
            const data = [];
            // One entry per stored row that the model has repeated: { id, count }.
            const duplicateCountTable = [];
            const batchSize = constants_1.Constants.DEFAULT_BATCH_SIZE;
            const configuredLimit = constants_1.Constants.CONSECUTIVE_NOOP_THRESHOLD;
            // Fall back to 3 so that a missing/invalid constant can never disable the loop guard.
            const noopLimit = Number.isInteger(configuredLimit) && configuredLimit > 0 ? configuredLimit : 3;
            let batchCount = 0;
            let noopBatches = 0;
            while (data.length < count) {
                printer_1.printer.debug(`Duplicate count table: ${JSON.stringify(duplicateCountTable, null, 2)}`);
                const batchStartTime = performance.now();
                const lengthAtStart = data.length;
                // Never ask for more rows than are still missing.
                const remaining = count - data.length;
                const requestSize = batchSize > remaining ? remaining : batchSize;
                printer_1.printer.info(`[batchCount: ${batchCount}] Generating batch of ${requestSize} synthetic data.`);
                printer_1.printer.info(`[batchCount: ${batchCount}] Total unique rows generated so far: ${data.length} / ${count}.`);
                // Tell the model which rows to avoid: the rows it has already repeated when
                // there are any, otherwise a small random sample. Ids are internal and omitted.
                let avoidDuplicatesPromptInput = "";
                if (duplicateCountTable.length > 0) {
                    avoidDuplicatesPromptInput = duplicateCountTable
                        .slice()
                        .sort((a, b) => a.count - b.count)
                        .map((entry) => data.find((stored) => stored.id === entry.id))
                        .filter((stored) => stored !== undefined)
                        .map((stored) => JSON.stringify(SynthliteDataset.omitId(stored)))
                        .join("\n");
                }
                else {
                    avoidDuplicatesPromptInput = SynthliteDataset.sample(data, 10)
                        .map((stored) => JSON.stringify(SynthliteDataset.omitId(stored)))
                        .join("\n");
                }
                // NOTE(review): the system prompt hard-codes "10 rows" while the user prompt
                // carries the real batch size; the prompt text is left untouched here.
                const rows = yield (0, ai_1.generateObject)({
                    model: ai_2.AI.getInstance().getModel(),
                    system: [
                        "You are synthlite, an advanced syntehic data generator AI agent. Given a schema, generate 10 rows of synthetic data. Try to be unique and creative in your outputs.",
                        "Make sure you return all unique data points in the batch and no items are duplicated.",
                        "",
                    ].join("\n"),
                    prompt: [
                        `Generate ${requestSize} synthetic data based on the schema.`,
                        `Avoid repeating these entries: """\n${avoidDuplicatesPromptInput}\n"""`,
                        SynthliteDataset.uniquenessHints(),
                        "",
                    ].join("\n"),
                    schema: zod_1.z.object({
                        rows: zod_1.z.array(this.schema),
                    }),
                })
                    .then((res) => res.object.rows)
                    .catch((err) => {
                    // Best effort: a failed batch counts as an empty batch.
                    const reason = err != null && err.message !== undefined ? err.message : String(err);
                    printer_1.printer.error(`Error generating synthetic data: ${reason}`);
                    return [];
                });
                const duplicates = [];
                let uniqueCount = 0;
                for (const row of Array.isArray(rows) ? rows : []) {
                    // Accepted rows go into `data` immediately so that later rows of the
                    // same batch are compared against them as well.
                    const existing = this.getDuplicateRow(row, data);
                    if (existing === undefined) {
                        data.push(SynthliteDataset.withId(row));
                        uniqueCount++;
                        continue;
                    }
                    const entry = duplicateCountTable.find((x) => x.id === existing.id);
                    if (entry) {
                        entry.count += 1;
                    }
                    else {
                        duplicateCountTable.push({ id: existing.id, count: 1 });
                    }
                    duplicates.push(row);
                }
                printer_1.printer.info(`Found ${duplicates.length} duplicates and ${uniqueCount} unique rows in the batch.`);
                // Failed mutations come back as null and must never reach the dataset.
                const mutatedDuplicates = (yield this.mutateDuplicates(duplicates, data)).filter((row) => Boolean(row));
                if (mutatedDuplicates.length > 0) {
                    printer_1.printer.info(`Found ${mutatedDuplicates.length} mutated duplicates in the batch.`);
                    for (const row of mutatedDuplicates) {
                        data.push(SynthliteDataset.withId(row));
                    }
                }
                batchCount++;
                // Emit event after processing each batch
                this.emitter.emit("synthlite:write_data", data);
                const batchEndTime = performance.now();
                printer_1.printer.info(`[batchCount: ${batchCount}] Batch generated in ${(batchEndTime - batchStartTime) / 1000} seconds.`);
                // Stop once several batches in a row (this one included) added nothing.
                noopBatches = data.length === lengthAtStart ? noopBatches + 1 : 0;
                if (noopBatches >= noopLimit) {
                    printer_1.printer.error(`No new data points generated in last ${noopLimit} batches. Exiting to avoid infinite loop.`);
                    break;
                }
            }
            printer_1.printer.info(`Total unique rows generated so far: ${data.length} / ${count}.`);
            return new generated_dataset_1.GeneratedDataset(data);
        });
    }
    /**
     * Mutates duplicate rows to make them unique.
     * @param duplicates - The array of duplicate rows.
     * @param data - The existing data array (not modified).
     * @returns A promise that resolves to an array aligned with `duplicates`: the mutated
     *          row, or null where the mutation failed or was still a duplicate.
     */
    mutateDuplicates(duplicates, data) {
        return __awaiter(this, void 0, void 0, function* () {
            // Track rows accepted in this call too, so that two duplicates cannot be
            // mutated into the same value.
            const knownRows = data.slice();
            const mutatedDuplicates = [];
            for (const duplicate of duplicates) {
                const mutatedDuplicate = yield this.mutateDuplicateByKeys(duplicate, knownRows);
                printer_1.printer.debug(`Mutated duplicate: ${JSON.stringify(mutatedDuplicate)}`);
                mutatedDuplicates.push(mutatedDuplicate);
                if (mutatedDuplicate) {
                    knownRows.push(mutatedDuplicate);
                }
            }
            return mutatedDuplicates;
        });
    }
    /**
     * Mutates specific keys of a duplicate row to make it unique.
     * Up to three randomly chosen keys are named in the instructions sent to the model.
     * @param duplicate - The duplicate row to mutate.
     * @param data - The existing data array.
     * @returns A promise that resolves to the mutated row, or null when the model call
     *          failed or the result is still a duplicate of a row in `data`.
     */
    mutateDuplicateByKeys(duplicate, data) {
        return __awaiter(this, void 0, void 0, function* () {
            const randomKeys = SynthliteDataset.sample(Object.keys(duplicate), 3);
            const mutatedDuplicate = yield (0, ai_1.generateObject)({
                model: ai_2.AI.getInstance().getModel(),
                system: [
                    "You are synthlite, an advanced synthetic data generator AI agent.",
                    "You will be given a data point in the sample that already exists i.e. was generated previously.",
                    `Specifically make minor adjustments to ${randomKeys.join()} so that the resulting data point is unique.`,
                    "The resulting data point should be different from the provided data point",
                ].join("\n"),
                prompt: [
                    "",
                    "Mutate this duplicate as per instructions and provide the mutated unique value.",
                    SynthliteDataset.uniquenessHints(),
                    `Duplicate to Mutate: ${JSON.stringify(duplicate)}`,
                    "",
                ].join("\n"),
                schema: this.schema,
            })
                .then((res) => res.object)
                .catch((err) => {
                // Best effort: a failed mutation simply drops this duplicate.
                const reason = err != null && err.message !== undefined ? err.message : String(err);
                printer_1.printer.error(`Error mutating duplicate: ${reason}`);
                return null;
            });
            if (mutatedDuplicate && this.hasDuplicate(mutatedDuplicate, data)) {
                printer_1.printer.error(`[!] Mutated duplicate is still a duplicate: ${JSON.stringify(mutatedDuplicate)}`);
                printer_1.printer.warn(`→ Skipping the duplicate for now.`);
                return null;
            }
            return mutatedDuplicate;
        });
    }
    /**
     * Checks if a row is a duplicate in the existing data.
     * @param row - The row to check for duplicates.
     * @param data - The existing data array.
     * @returns True if the row is a duplicate, false otherwise.
     */
    hasDuplicate(row, data) {
        return this.getDuplicateRow(row, data) !== undefined;
    }
    /**
     * Retrieves the duplicate row from the existing data.
     * Stored rows are compared without their `id`, by JSON serialisation, against `row`
     * as given; the comparison is therefore sensitive to key order.
     * @param row - The row to find the duplicate for.
     * @param data - The existing data array; null/undefined entries are skipped.
     * @returns The duplicate row if found, undefined otherwise.
     */
    getDuplicateRow(row, data) {
        // Serialise the candidate once instead of once per stored row.
        const serializedRow = JSON.stringify(row);
        return data.find((stored) => stored != null &&
            JSON.stringify(SynthliteDataset.omitId(stored)) === serializedRow);
    }
}
exports.SynthliteDataset = SynthliteDataset;