/*
 * semantic-ds-toolkit
 * Version: (unspecified)
 * Performance-first semantic layer for modern data stacks - Stable Column Anchors & intelligent inference
 * 616 lines (606 loc) • 22 kB
 * JavaScript
 */
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.optimizedBatchProcessor = exports.OptimizedBatchProcessorV2 = exports.WorkerPool = exports.SIMDOperations = exports.RingBuffer = exports.ObjectPool = void 0;
const performance_profiler_1 = require("./performance-profiler");
const worker_threads_1 = require("worker_threads");
const os_1 = require("os");
const perf_hooks_1 = require("perf_hooks");
/**
 * Bounded free-list of reusable values.
 *
 * `acquire()` hands back a previously released value when one is stored and
 * otherwise creates a new one with `factory`. `release()` runs `reset` on the
 * value and stores it, unless the pool already holds `maxSize` values, in
 * which case the value is simply dropped.
 */
class ObjectPool {
    pool = [];
    factory;
    reset;
    maxSize;

    /**
     * @param {Function} factory - Called with no arguments to build a value when the pool is empty.
     * @param {Function} reset - Called with a released value before it is stored for reuse.
     * @param {number} [maxSize=1000] - Maximum number of idle values kept in the pool.
     */
    constructor(factory, reset, maxSize = 1000) {
        this.factory = factory;
        this.reset = reset;
        this.maxSize = maxSize;
    }

    /**
     * Take a value from the pool, or create one if the pool is empty.
     * @returns {*} A pooled value, or a fresh `factory()` result.
     */
    acquire() {
        // Check the pool length rather than the popped value: a pooled value
        // may legitimately be falsy (0, '', false, null) and must still be reused.
        if (this.pool.length > 0) {
            return this.pool.pop();
        }
        return this.factory();
    }

    /**
     * Reset a value and return it to the pool if there is room.
     * @param {*} obj - Value to recycle; ignored (not reset) when the pool is full.
     */
    release(obj) {
        if (this.pool.length < this.maxSize) {
            this.reset(obj);
            this.pool.push(obj);
        }
    }

    /**
     * @returns {number} Number of idle values currently stored.
     */
    getSize() {
        return this.pool.length;
    }
}
exports.ObjectPool = ObjectPool;
/**
 * Byte-oriented circular FIFO whose storage is a SharedArrayBuffer.
 *
 * The read and write cursors wrap modulo `size`. One byte is always kept
 * free (see `available()`), so equal cursors mean "empty" and the usable
 * capacity is `size - 1` bytes.
 */
class RingBuffer {
    buffer;
    writePtr = 0;
    readPtr = 0;
    size;
    view;

    /**
     * @param {number} size - Byte length of the backing SharedArrayBuffer.
     */
    constructor(size) {
        this.size = size;
        this.buffer = new SharedArrayBuffer(size);
        this.view = new Uint8Array(this.buffer);
    }

    /**
     * Append bytes at the write cursor, wrapping to the start when needed.
     * @param {Uint8Array} data - Bytes to append (`subarray` is used when the write wraps).
     * @returns {boolean} `false` when the bytes do not fit, otherwise `true`.
     */
    write(data) {
        const byteCount = data.length;
        if (byteCount > this.available()) {
            return false;
        }
        const start = this.writePtr;
        const end = start + byteCount;
        if (end <= this.size) {
            // Fits without wrapping.
            this.view.set(data, start);
        }
        else {
            // Split: the tail of the buffer first, the rest from index 0.
            const tailRoom = this.size - start;
            const head = data.subarray(0, tailRoom);
            const rest = data.subarray(tailRoom);
            this.view.set(head, start);
            this.view.set(rest, 0);
        }
        this.writePtr = end % this.size;
        return true;
    }

    /**
     * Remove and return `length` bytes from the read cursor.
     * @param {number} length - Number of bytes to read.
     * @returns {Uint8Array|null} A copy of the bytes, or `null` if fewer are stored.
     */
    read(length) {
        if (length > this.used()) {
            return null;
        }
        const out = new Uint8Array(length);
        const start = this.readPtr;
        const end = start + length;
        if (end <= this.size) {
            out.set(this.view.subarray(start, end));
        }
        else {
            // Wrapped read: bytes up to the end of the buffer, then from index 0.
            const tailLength = this.size - start;
            const wrappedEnd = end % this.size;
            out.set(this.view.subarray(start, this.size));
            out.set(this.view.subarray(0, wrappedEnd), tailLength);
        }
        this.readPtr = end % this.size;
        return out;
    }

    /**
     * @returns {number} Bytes that can still be written (one byte is reserved).
     */
    available() {
        const occupied = this.used();
        return this.size - occupied - 1;
    }

    /**
     * @returns {number} Bytes currently stored and not yet read.
     */
    used() {
        const capacity = this.size;
        return (this.writePtr - this.readPtr + capacity) % capacity;
    }

    /**
     * @returns {SharedArrayBuffer} The backing buffer.
     */
    getBuffer() {
        return this.buffer;
    }
}
exports.RingBuffer = RingBuffer;
/**
 * Static numeric helpers written as 4-wide manually unrolled loops.
 *
 * Each method handles the largest multiple-of-four prefix in groups of four
 * and then finishes the remaining 0-3 elements one at a time.
 */
class SIMDOperations {
    /**
     * Hash every element of `data` with `xxHash32` into the same index of `results`.
     * @param {ArrayLike<number>} data - Input values.
     * @param {object} results - Indexable destination, written in place.
     */
    static processColumnsSIMD(data, results) {
        const count = data.length;
        const bulkEnd = count - (count % 4);
        let i = 0;
        while (i < bulkEnd) {
            results[i] = this.xxHash32(data[i]);
            results[i + 1] = this.xxHash32(data[i + 1]);
            results[i + 2] = this.xxHash32(data[i + 2]);
            results[i + 3] = this.xxHash32(data[i + 3]);
            i += 4;
        }
        while (i < count) {
            results[i] = this.xxHash32(data[i]);
            i += 1;
        }
    }

    /**
     * Mix a single number into an unsigned 32-bit value using the xxHash32
     * prime constants. Multiplications are ordinary Number multiplications
     * and the bitwise steps truncate to 32 bits.
     * @param {number} input - Value to hash.
     * @returns {number} Integer in the range [0, 2^32).
     */
    static xxHash32(input) {
        // PRIME32_1 is declared alongside the others but not used by this mixing routine.
        const PRIME32_1 = 0x9E3779B1;
        const PRIME32_2 = 0x85EBCA77;
        const PRIME32_3 = 0xC2B2AE3D;
        const PRIME32_4 = 0x27D4EB2F;
        const PRIME32_5 = 0x165667B1;
        let acc = PRIME32_5 + 4;
        acc = acc + input * PRIME32_3;
        const rotated = (acc << 17) | (acc >>> 15);
        acc = rotated * PRIME32_4;
        acc = acc ^ (acc >>> 15);
        acc = acc * PRIME32_2;
        acc = acc ^ (acc >>> 13);
        acc = acc * PRIME32_3;
        acc = acc ^ (acc >>> 16);
        return acc >>> 0;
    }

    /**
     * Sum all elements. Each group of four is added together first and the
     * group total is then added to the running sum.
     * @param {ArrayLike<number>} values - Values to add.
     * @returns {number} The sum (0 for an empty input).
     */
    static vectorSum(values) {
        const count = values.length;
        const bulkEnd = count - (count % 4);
        let total = 0;
        let i = 0;
        while (i < bulkEnd) {
            const groupSum = values[i] + values[i + 1] + values[i + 2] + values[i + 3];
            total += groupSum;
            i += 4;
        }
        while (i < count) {
            total += values[i];
            i += 1;
        }
        return total;
    }

    /**
     * Dot product of two equal-length vectors.
     * @param {ArrayLike<number>} a - First vector.
     * @param {ArrayLike<number>} b - Second vector.
     * @returns {number} Sum of element-wise products.
     * @throws {Error} When the lengths differ.
     */
    static vectorDot(a, b) {
        if (a.length !== b.length) {
            throw new Error('Vector dimensions must match');
        }
        const count = a.length;
        const bulkEnd = count - (count % 4);
        let product = 0;
        let i = 0;
        while (i < bulkEnd) {
            const groupDot = a[i] * b[i] + a[i + 1] * b[i + 1] + a[i + 2] * b[i + 2] + a[i + 3] * b[i + 3];
            product += groupDot;
            i += 4;
        }
        while (i < count) {
            product += a[i] * b[i];
            i += 1;
        }
        return product;
    }

    /**
     * Euclidean distance between two equal-length vectors.
     * @param {ArrayLike<number>} a - First vector.
     * @param {ArrayLike<number>} b - Second vector.
     * @returns {number} Square root of the summed squared differences.
     * @throws {Error} When the lengths differ.
     */
    static vectorDistance(a, b) {
        if (a.length !== b.length) {
            throw new Error('Vector dimensions must match');
        }
        const count = a.length;
        const bulkEnd = count - (count % 4);
        let squaredTotal = 0;
        let i = 0;
        while (i < bulkEnd) {
            const delta0 = a[i] - b[i];
            const delta1 = a[i + 1] - b[i + 1];
            const delta2 = a[i + 2] - b[i + 2];
            const delta3 = a[i + 3] - b[i + 3];
            const groupSquares = delta0 * delta0 + delta1 * delta1 + delta2 * delta2 + delta3 * delta3;
            squaredTotal += groupSquares;
            i += 4;
        }
        while (i < count) {
            const delta = a[i] - b[i];
            squaredTotal += delta * delta;
            i += 1;
        }
        return Math.sqrt(squaredTotal);
    }
}
exports.SIMDOperations = SIMDOperations;
/**
 * Fixed-size pool of worker threads that hash numeric data held in a
 * SharedArrayBuffer.
 *
 * Jobs submitted through `process()` are queued and handed to the next idle
 * worker. Each worker runs the inline script from `generateWorkerScript()`
 * and replies with one message per job. The job running on each worker is
 * tracked so that its promise is always settled: on a reply, on a worker
 * failure, or when the pool is destroyed.
 */
class WorkerPool {
    workers = [];
    availableWorkers = [];
    jobQueue = [];
    workerScript;
    // Job currently running on each busy worker (Worker -> job).
    activeJobs = new Map();
    // Set by destroy(); stops new jobs being accepted and dead workers being replaced.
    destroyed = false;

    /**
     * @param {number} [size] - Number of workers to start; defaults to the CPU count.
     */
    constructor(size = (0, os_1.cpus)().length) {
        this.workerScript = this.generateWorkerScript();
        this.initializeWorkers(size);
    }

    /**
     * Source code run inside every worker (passed to `Worker` with `eval: true`).
     *
     * For each `{ shared, metadata }` message the script builds a typed-array
     * view over `shared` chosen by `metadata.type`, and replies with
     * `{ success: true, results, processedLength }`. `results` holds hashes only
     * when `metadata.type === 'Float32Array'`; for other types it stays zero-filled.
     * Exceptions are reported as `{ success: false, error }`.
     * @returns {string} The worker script.
     */
    generateWorkerScript() {
        return `
const { parentPort } = require('worker_threads');
class SIMDOperations {
static processColumnsSIMD(data, results) {
const len = data.length;
const remainder = len % 4;
const limit = len - remainder;
for (let i = 0; i < limit; i += 4) {
results[i] = this.xxHash32(data[i]);
results[i + 1] = this.xxHash32(data[i + 1]);
results[i + 2] = this.xxHash32(data[i + 2]);
results[i + 3] = this.xxHash32(data[i + 3]);
}
for (let i = limit; i < len; i++) {
results[i] = this.xxHash32(data[i]);
}
}
static xxHash32(input) {
const PRIME32_1 = 0x9E3779B1;
const PRIME32_2 = 0x85EBCA77;
const PRIME32_3 = 0xC2B2AE3D;
const PRIME32_4 = 0x27D4EB2F;
const PRIME32_5 = 0x165667B1;
let h32 = PRIME32_5 + 4;
h32 += input * PRIME32_3;
h32 = ((h32 << 17) | (h32 >>> 15)) * PRIME32_4;
h32 ^= h32 >>> 15;
h32 *= PRIME32_2;
h32 ^= h32 >>> 13;
h32 *= PRIME32_3;
h32 ^= h32 >>> 16;
return h32 >>> 0;
}
}
parentPort.on('message', ({ shared, metadata }) => {
try {
const typedArray = metadata.type === 'Float32Array' ?
new Float32Array(shared, metadata.offset, metadata.length) :
metadata.type === 'Float64Array' ?
new Float64Array(shared, metadata.offset, metadata.length) :
new Uint8Array(shared, metadata.offset, metadata.length);
const results = new Float32Array(metadata.length);
if (metadata.type === 'Float32Array') {
SIMDOperations.processColumnsSIMD(typedArray, results);
}
parentPort.postMessage({
success: true,
results: Array.from(results),
processedLength: metadata.length
});
} catch (error) {
parentPort.postMessage({
success: false,
error: error.message
});
}
});
`;
    }

    /**
     * Start `size` workers and mark them all idle.
     * @param {number} size - Number of workers to create.
     */
    initializeWorkers(size) {
        for (let i = 0; i < size; i++) {
            this.spawnWorker();
        }
    }

    /**
     * Create one worker, wire its event handlers and register it as idle.
     * @returns {object} The new worker.
     */
    spawnWorker() {
        const worker = new worker_threads_1.Worker(this.workerScript, { eval: true });
        worker.on('message', this.handleWorkerMessage.bind(this, worker));
        worker.on('error', this.handleWorkerError.bind(this, worker));
        this.workers.push(worker);
        this.availableWorkers.push(worker);
        return worker;
    }

    /**
     * Settle the job that `worker` was running, then reuse the worker.
     * @param {object} worker - Worker that replied.
     * @param {object} message - Reply: `{ success, results }` or `{ success: false, error }`.
     */
    handleWorkerMessage(worker, message) {
        const job = this.activeJobs.get(worker);
        if (!job) {
            // No job is recorded for this worker (e.g. the pool was destroyed); nothing to settle.
            return;
        }
        this.activeJobs.delete(worker);
        if (message.success) {
            job.resolve(message.results);
        }
        else {
            job.reject(new Error(message.error));
        }
        if (this.destroyed) {
            return;
        }
        this.availableWorkers.push(worker);
        this.processNextJob();
    }

    /**
     * Handle a worker `'error'` event. Node terminates a worker after this
     * event, so the worker's job is rejected, the worker is removed from the
     * pool and a replacement is started.
     * @param {object} worker - Worker that failed.
     * @param {*} error - Value emitted with the `'error'` event.
     */
    handleWorkerError(worker, error) {
        console.error('Worker error:', error);
        const job = this.activeJobs.get(worker);
        this.activeJobs.delete(worker);
        if (job) {
            job.reject(error instanceof Error ? error : new Error(String(error)));
        }
        // The failed worker can no longer accept messages; never hand it another job.
        this.workers = this.workers.filter((candidate) => candidate !== worker);
        this.availableWorkers = this.availableWorkers.filter((candidate) => candidate !== worker);
        if (this.destroyed) {
            return;
        }
        try {
            this.spawnWorker();
        }
        catch (spawnError) {
            console.error('Worker respawn failed:', spawnError);
            if (this.workers.length === 0) {
                // Nothing is left to run the queue; fail the jobs instead of hanging them.
                this.rejectAllJobs(spawnError);
            }
            return;
        }
        this.processNextJob();
    }

    /**
     * Queue a job for the next idle worker.
     * @param {SharedArrayBuffer|ArrayBuffer|ArrayBufferView} data - Input bytes. A
     *   SharedArrayBuffer is used as is; anything else is copied byte-for-byte into a new one.
     * @param {object} metadata - `{ type, offset, length }` as read by the worker script.
     * @returns {Promise<number[]>} Resolves with the worker's `results` array; rejects when the
     *   worker reports an error, fails, or the pool is destroyed.
     */
    async process(data, metadata) {
        if (this.destroyed) {
            throw new Error('WorkerPool has been destroyed');
        }
        let shared;
        if (data instanceof SharedArrayBuffer) {
            shared = data;
        }
        else {
            // `new Uint8Array(typedArray)` converts element values rather than copying
            // bytes, so view the underlying buffer when given a typed array or DataView.
            const bytes = ArrayBuffer.isView(data)
                ? new Uint8Array(data.buffer, data.byteOffset, data.byteLength)
                : new Uint8Array(data);
            shared = new SharedArrayBuffer(bytes.byteLength);
            new Uint8Array(shared).set(bytes);
        }
        return new Promise((resolve, reject) => {
            this.jobQueue.push({ data: shared, metadata, resolve, reject });
            this.processNextJob();
        });
    }

    /**
     * Hand queued jobs to idle workers until either list runs out.
     */
    processNextJob() {
        while (this.jobQueue.length > 0 && this.availableWorkers.length > 0) {
            const job = this.jobQueue.shift();
            const worker = this.availableWorkers.shift();
            this.activeJobs.set(worker, job);
            try {
                worker.postMessage({ shared: job.data, metadata: job.metadata });
            }
            catch (error) {
                // The message could not be sent (e.g. metadata is not cloneable):
                // fail this job only and keep the worker usable.
                this.activeJobs.delete(worker);
                this.availableWorkers.push(worker);
                job.reject(error);
            }
        }
    }

    /**
     * Reject every running and queued job with `reason` and forget them.
     * @param {*} reason - Rejection value.
     */
    rejectAllJobs(reason) {
        const pending = [...this.activeJobs.values(), ...this.jobQueue];
        this.activeJobs.clear();
        this.jobQueue = [];
        for (const job of pending) {
            job.reject(reason);
        }
    }

    /**
     * Terminate all workers and reject every job that has not completed.
     * After this call `process()` rejects immediately.
     */
    destroy() {
        this.destroyed = true;
        this.workers.forEach(worker => worker.terminate());
        this.workers = [];
        this.availableWorkers = [];
        this.rejectAllJobs(new Error('WorkerPool destroyed before job completed'));
    }
}
exports.WorkerPool = WorkerPool;
/**
 * Runs a caller-supplied `processor` over an array of column objects in
 * batches and reports throughput, latency, memory and CPU figures.
 *
 * Options read by this class: `batchSize`, `maxWorkers`, `useSharedMemory`,
 * `enableSIMD`, `objectPooling`, `streamingMode`, `memoryLimit`.
 * Columns are expected to expose `name`, `values` (array) and `data_type`.
 */
class OptimizedBatchProcessorV2 {
    options;
    workerPool;
    columnPool;
    resultPool;
    ringBuffer;
    processingStats = {
        totalRows: 0,
        totalTime: 0,
        peakThroughput: 0,
        averageThroughput: 0,
        memoryUsage: 0
    };

    /**
     * @param {object} options - Processor configuration (see class description).
     *   Constructing the processor starts the worker pool and allocates a
     *   ring buffer of `options.memoryLimit` bytes.
     */
    constructor(options) {
        this.options = options;
        this.workerPool = new WorkerPool(options.maxWorkers);
        this.ringBuffer = new RingBuffer(options.memoryLimit);
        this.columnPool = new ObjectPool(() => ({ name: '', values: [], data_type: 'unknown' }), (col) => { col.values = []; col.name = ''; }, 1000);
        this.resultPool = new ObjectPool(() => [], (arr) => { arr.length = 0; }, 100);
    }

    /**
     * Process all columns and measure the run.
     *
     * Uses the streaming path when `streamingMode` is on and there are more than
     * 50000 columns, otherwise the standard batch path.
     * @param {Array<object>} columns - Columns to process.
     * @param {Function} processor - Called with one column, returns its result.
     * @returns {Promise<object>} `{ data, throughput, latency, memoryEfficiency, cpuUtilization }`:
     *   columns per second, milliseconds per column, heap-bytes delta per column, and
     *   CPU time divided by (wall time x CPU count) capped at 1. `latency` and
     *   `memoryEfficiency` are 0 for an empty input.
     */
    async processColumns(columns, processor) {
        const startTime = perf_hooks_1.performance.now();
        const cpuStart = process.cpuUsage();
        const initialMemory = process.memoryUsage().heapUsed;
        let results;
        if (this.options.streamingMode && columns.length > 50000) {
            results = await this.processStreamingBatch(columns, processor);
        }
        else {
            results = await this.processStandardBatch(columns, processor);
        }
        const endTime = perf_hooks_1.performance.now();
        const duration = (endTime - startTime) / 1000;
        const columnCount = columns.length;
        const throughput = columnCount / duration;
        // Guard the per-column figures so an empty input reports 0 instead of NaN.
        const latency = columnCount > 0 ? duration / columnCount * 1000 : 0;
        const finalMemory = process.memoryUsage().heapUsed;
        const memoryDelta = finalMemory - initialMemory;
        const cpuEnd = process.cpuUsage(cpuStart);
        const cpuMicros = cpuEnd.user + cpuEnd.system; // microseconds across threads
        const cpuUtilization = Math.min(1, (cpuMicros / 1_000_000) / (duration * (0, os_1.cpus)().length));
        this.updateStats(columnCount, duration, throughput, memoryDelta);
        return {
            data: results,
            throughput,
            latency,
            memoryEfficiency: columnCount > 0 ? memoryDelta / columnCount : 0,
            cpuUtilization
        };
    }

    /**
     * Number of batches allowed in flight at once: at most `options.maxWorkers`
     * (the CPU count when that option is missing or below 1), at most the number
     * of batches, and never less than 1 so the semaphore cannot deadlock.
     * @param {number} batchCount - Number of batches to run.
     * @returns {number} Concurrency limit.
     */
    resolveConcurrency(batchCount) {
        const configured = this.options.maxWorkers;
        const workerLimit = Number.isFinite(configured) && configured >= 1
            ? configured
            : (0, os_1.cpus)().length;
        return Math.max(1, Math.min(workerLimit, batchCount));
    }

    /**
     * Split the columns into batches and process them with bounded concurrency.
     * Delegates to `processWithWorkers` when both `useSharedMemory` and
     * `enableSIMD` are set.
     * @param {Array<object>} columns - Columns to process.
     * @param {Function} processor - Per-column callback.
     * @returns {Promise<Array>} Flattened results in batch order.
     */
    async processStandardBatch(columns, processor) {
        const batchSize = this.options.batchSize;
        const results = [];
        const batches = this.createBatches(columns, batchSize);
        if (this.options.useSharedMemory && this.options.enableSIMD) {
            return this.processWithWorkers(batches, processor);
        }
        const concurrencyLimit = this.resolveConcurrency(batches.length);
        const semaphore = new Semaphore(concurrencyLimit);
        const batchPromises = batches.map(async (batch, index) => {
            await semaphore.acquire();
            try {
                return await this.processBatchChunk(batch, processor, index);
            }
            finally {
                semaphore.release();
            }
        });
        const batchResults = await Promise.all(batchPromises);
        // Flatten and release pooled arrays if applicable
        for (const batchArr of batchResults) {
            results.push(...batchArr);
            if (this.options.objectPooling) {
                this.resultPool.release(batchArr);
            }
        }
        return results;
    }

    /**
     * Send each batch to the worker pool through a reusable SharedArrayBuffer.
     *
     * For every column the buffer receives `values.length`; the worker returns
     * one hash per entry. A failed batch is logged and contributes no results.
     *
     * NOTE(review): `processor` is not called on this path. The returned array
     * holds the worker hash values, not processor output. Confirm this is intended.
     * @param {Array<Array<object>>} batches - Batches from `createBatches`.
     * @param {Function} processor - Unused here (see note).
     * @returns {Promise<number[]>} Flattened worker results in batch order.
     */
    async processWithWorkers(batches, processor) {
        const results = [];
        // Concurrency-limited dispatch to worker pool
        const concurrency = this.resolveConcurrency(batches.length);
        const semaphore = new Semaphore(concurrency);
        // Pre-allocate a small pool of SharedArrayBuffers and views to avoid per-batch allocation
        const capacity = this.options.batchSize; // floats per batch
        const sabPool = [];
        for (let i = 0; i < concurrency; i++) {
            const sab = new SharedArrayBuffer(capacity * 4);
            const view = new Float32Array(sab);
            sabPool.push({ sab, view });
        }
        // The semaphore admits at most `concurrency` tasks, so a buffer is always available here.
        const acquireSAB = () => sabPool.pop();
        const releaseSAB = (item) => sabPool.push(item);
        const tasks = batches.map(async (batch, index) => {
            await semaphore.acquire();
            try {
                // Acquire a shared buffer and write data directly (no intermediate copy)
                const buf = acquireSAB();
                const view = buf.view;
                for (let i = 0; i < batch.length; i++) {
                    view[i] = batch[i].values.length;
                }
                const metadata = {
                    length: batch.length,
                    type: 'Float32Array',
                    offset: 0,
                    byteLength: batch.length * 4
                };
                let workerResults = [];
                try {
                    // Run SIMD pre-processing in worker on shared buffer
                    const hashes = await this.workerPool.process(buf.sab, metadata);
                    workerResults = hashes;
                }
                catch (error) {
                    // Log and continue; worker pre-processing is an optimization
                    console.error('Worker processing failed:', error);
                }
                finally {
                    releaseSAB(buf);
                }
                // Return raw worker hashes for minimal allocation
                return workerResults;
            }
            finally {
                semaphore.release();
            }
        });
        const batchResults = await Promise.all(tasks);
        for (const batchArr of batchResults) {
            results.push(...batchArr);
            if (this.options.objectPooling) {
                this.resultPool.release(batchArr);
            }
        }
        return results;
    }

    /**
     * Process the columns one batch at a time from `createDataStream`.
     * Every batch is processed, including the final one flagged `isComplete`.
     * @param {Array<object>} columns - Columns to process.
     * @param {Function} processor - Per-column callback.
     * @returns {Promise<Array>} Results for all columns in input order.
     */
    async processStreamingBatch(columns, processor) {
        const results = [];
        const batchSize = this.options.batchSize;
        let processedCount = 0;
        for await (const batch of this.createDataStream(columns, batchSize)) {
            const batchIndex = Math.floor(processedCount / batchSize);
            const batchResults = await this.processBatchChunk(batch.data, processor, batchIndex);
            results.push(...batchResults);
            // Results were copied above, so the pooled array can be recycled
            // (same as the standard path).
            if (this.options.objectPooling) {
                this.resultPool.release(batchResults);
            }
            processedCount += batch.data.length;
            // `global.gc` only exists when Node is started with --expose-gc.
            if (processedCount % (batchSize * 10) === 0 && global.gc) {
                global.gc();
            }
            // `isComplete` marks the last batch; stop only after it has been processed.
            if (batch.isComplete) {
                break;
            }
        }
        return results;
    }

    /**
     * Run `processor` over one batch, bracketed by profiler start/end calls
     * keyed `batch_chunk_<batchIndex>`.
     * @param {Array<object>} batch - Columns in this batch.
     * @param {Function} processor - Per-column callback.
     * @param {number} batchIndex - Index used in the profiler key.
     * @returns {Promise<Array>} One result per column; a pooled array when `objectPooling` is on.
     * @throws Rethrows any error from `processor` after closing the profiler operation with count 0.
     */
    async processBatchChunk(batch, processor, batchIndex) {
        const profilerKey = `batch_chunk_${batchIndex}`;
        performance_profiler_1.globalProfiler.startOperation(profilerKey);
        try {
            const results = this.options.objectPooling ?
                this.resultPool.acquire() : [];
            if (this.options.objectPooling) {
                results.__pooled = true;
            }
            if (this.options.enableSIMD && batch.length >= 4) {
                await this.processBatchSIMD(batch, processor, results);
            }
            else {
                for (const column of batch) {
                    const processedColumn = this.options.objectPooling ?
                        this.optimizeColumnForProcessing(column) : column;
                    results.push(processor(processedColumn));
                    if (this.options.objectPooling && processedColumn !== column) {
                        this.columnPool.release(processedColumn);
                    }
                }
            }
            performance_profiler_1.globalProfiler.endOperation(profilerKey, batch.length);
            return results;
        }
        catch (error) {
            performance_profiler_1.globalProfiler.endOperation(profilerKey, 0);
            throw error;
        }
    }

    /**
     * Hash each column's value count, then run `processor` on every column and
     * append its result to `results`. The hashes in `outputData` are not used
     * further by this method.
     * @param {Array<object>} batch - Columns in this batch.
     * @param {Function} processor - Per-column callback.
     * @param {Array} results - Destination array, appended in place.
     */
    async processBatchSIMD(batch, processor, results) {
        const inputData = new Float32Array(batch.length);
        const outputData = new Float32Array(batch.length);
        for (let i = 0; i < batch.length; i++) {
            inputData[i] = batch[i].values.length;
        }
        SIMDOperations.processColumnsSIMD(inputData, outputData);
        for (let i = 0; i < batch.length; i++) {
            const optimizedColumn = this.optimizeColumnForProcessing(batch[i]);
            results.push(processor(optimizedColumn));
            if (this.options.objectPooling) {
                this.columnPool.release(optimizedColumn);
            }
        }
    }

    /**
     * With `objectPooling` on, return a pooled copy of `column` whose `values`
     * keep every `stride`-th element, where `stride = floor(length / min(1000, length))`
     * (at least 1). Because the stride is rounded down, more than 1000 values can
     * remain (e.g. 1999 values give stride 1). Without pooling the column is
     * returned unchanged.
     * @param {object} column - Source column.
     * @returns {object} The sampled pooled column, or `column` itself.
     */
    optimizeColumnForProcessing(column) {
        if (!this.options.objectPooling) {
            return column;
        }
        const optimized = this.columnPool.acquire();
        optimized.name = column.name;
        optimized.data_type = column.data_type;
        const sampleSize = Math.min(1000, column.values.length);
        const stride = Math.max(1, Math.floor(column.values.length / sampleSize));
        optimized.values = column.values.filter((_, index) => index % stride === 0);
        return optimized;
    }

    /**
     * Slice `data` into consecutive chunks of `batchSize` items.
     * @param {Array} data - Items to split.
     * @param {number} batchSize - Items per batch (assumed to be a positive number).
     * @returns {Array<Array>} The batches; the last may be shorter.
     */
    createBatches(data, batchSize) {
        const batches = [];
        for (let i = 0; i < data.length; i += batchSize) {
            batches.push(data.slice(i, i + batchSize));
        }
        return batches;
    }

    /**
     * Async generator yielding `{ data, isComplete }` per batch, where
     * `isComplete` is true on the final batch. Yields to the event loop with
     * `setImmediate` between batches.
     * @param {Array} data - Items to split.
     * @param {number} batchSize - Items per batch (assumed to be a positive number).
     */
    async *createDataStream(data, batchSize) {
        for (let i = 0; i < data.length; i += batchSize) {
            const batch = data.slice(i, i + batchSize);
            yield {
                data: batch,
                isComplete: i + batchSize >= data.length
            };
            await new Promise(resolve => setImmediate(resolve));
        }
    }

    /**
     * Fold one run into the cumulative statistics.
     * @param {number} rows - Columns processed in the run.
     * @param {number} time - Run duration in seconds.
     * @param {number} throughput - Columns per second for the run.
     * @param {number} memory - Heap delta in bytes for the run (overwrites the stored value).
     */
    updateStats(rows, time, throughput, memory) {
        this.processingStats.totalRows += rows;
        this.processingStats.totalTime += time;
        this.processingStats.memoryUsage = memory;
        if (throughput > this.processingStats.peakThroughput) {
            this.processingStats.peakThroughput = throughput;
        }
        this.processingStats.averageThroughput =
            this.processingStats.totalRows / this.processingStats.totalTime;
    }

    /**
     * 32-bit string hash (`hash * 31 + charCode` per UTF-16 unit), returned unsigned.
     * @param {string} str - Text to hash.
     * @returns {number} Integer in the range [0, 2^32).
     */
    simpleHash(str) {
        let hash = 0;
        for (let i = 0; i < str.length; i++) {
            const c = str.charCodeAt(i);
            hash = ((hash << 5) - hash) + c;
            hash |= 0;
        }
        return hash >>> 0;
    }

    /**
     * Run `processColumns` repeatedly and summarise the measurements.
     * Waits 100 ms (and triggers `global.gc` when exposed) between runs.
     * @param {Array<object>} testData - Columns to process on every iteration.
     * @param {Function} processor - Per-column callback.
     * @param {number} [iterations=5] - Number of runs; assumed to be at least 1.
     * @returns {Promise<object>} `{ averageThroughput, peakThroughput, averageLatency,
     *   memoryEfficiency, p50Latency, p99Latency }`.
     */
    async benchmark(testData, processor, iterations = 5) {
        const results = [];
        for (let i = 0; i < iterations; i++) {
            const result = await this.processColumns(testData, processor);
            results.push(result);
            if (global.gc)
                global.gc();
            await new Promise(resolve => setTimeout(resolve, 100));
        }
        const latencies = results.map(r => r.latency).sort((a, b) => a - b);
        const p50Index = Math.floor(latencies.length * 0.5);
        const p99Index = Math.floor(latencies.length * 0.99);
        return {
            averageThroughput: results.reduce((sum, r) => sum + r.throughput, 0) / results.length,
            peakThroughput: Math.max(...results.map(r => r.throughput)),
            averageLatency: results.reduce((sum, r) => sum + r.latency, 0) / results.length,
            memoryEfficiency: Math.min(...results.map(r => r.memoryEfficiency)),
            p50Latency: latencies[p50Index],
            p99Latency: latencies[p99Index]
        };
    }

    /**
     * @returns {object} A shallow copy of the cumulative statistics.
     */
    getStats() {
        return { ...this.processingStats };
    }

    /**
     * Shut down the worker pool.
     */
    cleanup() {
        this.workerPool.destroy();
    }
}
exports.OptimizedBatchProcessorV2 = OptimizedBatchProcessorV2;
/**
 * Counting semaphore for async code.
 *
 * `acquire()` takes a permit immediately when one is free; otherwise the
 * caller waits in FIFO order until `release()` hands a permit over.
 */
class Semaphore {
    permits;
    waitQueue = [];

    /**
     * @param {number} permits - Number of permits initially available.
     */
    constructor(permits) {
        this.permits = permits;
    }

    /**
     * Take one permit, waiting if none is free.
     * @returns {Promise<void>} Settles once the permit is held.
     */
    async acquire() {
        const permitIsFree = this.permits > 0;
        if (permitIsFree) {
            this.permits -= 1;
            return;
        }
        // No permit free: park the resolver until release() wakes it.
        return new Promise((wake) => {
            this.waitQueue.push(wake);
        });
    }

    /**
     * Give one permit back; if a caller is waiting, the permit goes straight
     * to the longest-waiting one.
     */
    release() {
        this.permits += 1;
        const wakeNext = this.waitQueue.shift();
        if (!wakeNext) {
            return;
        }
        this.permits -= 1;
        wakeNext();
    }
}
exports.optimizedBatchProcessor = new OptimizedBatchProcessorV2({
batchSize: 2000,
maxWorkers: Math.max(1, (0, os_1.cpus)().length - 1),
useSharedMemory: true,
enableSIMD: true,
objectPooling: true,
streamingMode: true,
memoryLimit: 512 * 1024 * 1024
});
//# sourceMappingURL=batch-processor-v2.js.map