woolball-client
Version: (not specified)
Client-side library for Woolball enabling secure browser resource sharing for distributed AI task processing
161 lines (160 loc) • 7.06 kB
JavaScript
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.calculateTPS = calculateTPS;
const transformers_1 = require("@huggingface/transformers");
/**
 * Measures AI inference throughput in tokens per second.
 * Delegates to runBenchmark(), which selects a backend appropriate to the
 * current environment (WebGPU / WASM / Node.js CPU) using the
 * onnx-community/Qwen3-0.6B-ONNX model, then rounds the median TPS.
 * @returns {Promise<number>} Tokens per second, rounded to the nearest integer.
 */
async function calculateTPS() {
    const { tps } = await runBenchmark();
    return Math.round(tps);
}
/**
 * Runs the TPS benchmark using actual ONNX model inference.
 * Detects the runtime environment and picks a backend:
 *   - Node.js              -> delegated to runNodeBenchmark() (CPU)
 *   - Browser with WebGPU  -> 'webgpu' device, falling back to WASM on failure
 *   - Browser otherwise    -> 'wasm' device
 * Model download and warm-up are excluded from all timings.
 * @returns {Promise<{tps: number, duration: number, tokensProcessed: number}>}
 *          Median benchmark result across the timed passes.
 */
async function runBenchmark() {
    // Disable the WASM proxy worker so timings reflect inference directly.
    if (transformers_1.env.backends.onnx?.wasm) {
        transformers_1.env.backends.onnx.wasm.proxy = false;
    }
    // Detect environment.
    const isNode = typeof process !== 'undefined' && process.versions && process.versions.node;
    const isBrowser = typeof window !== 'undefined' && typeof navigator !== 'undefined';
    // For Node.js, use the dedicated CPU benchmark.
    if (isNode) {
        return await runNodeBenchmark();
    }
    /**
     * Shared measurement loop: runs `passes` timed generations against an
     * already warmed-up generator and returns the median result by TPS.
     * Note: temperature is deliberately not passed — it has no effect with
     * greedy decoding (do_sample: false).
     */
    const measure = async (generator, testPrompt, maxTokens, passes = 3) => {
        const results = [];
        for (let i = 0; i < passes; i++) {
            const startTime = performance.now();
            const result = await generator(testPrompt, {
                max_new_tokens: maxTokens,
                do_sample: false,
            });
            const duration = performance.now() - startTime;
            const generatedText = Array.isArray(result) && result.length > 0
                ? result[0]?.generated_text || ''
                : typeof result === 'string' ? result : '';
            // NOTE(review): assumes generated_text echoes the prompt, so slicing
            // strips it and leaves only newly generated text — confirm with the
            // pipeline's output format.
            const tokensProcessed = estimateTokenCount(generatedText.slice(testPrompt.length));
            results.push({
                tps: (tokensProcessed / duration) * 1000,
                duration,
                tokensProcessed
            });
        }
        // Median by TPS; correct for any pass count, not just 3.
        results.sort((a, b) => a.tps - b.tps);
        return results[Math.floor((results.length - 1) / 2)];
    };
    // For browser environments, try WebGPU first.
    const hasWebGPU = isBrowser && 'gpu' in navigator;
    if (hasWebGPU) {
        try {
            // Load the Qwen3-0.6B-ONNX model with WebGPU (download time excluded).
            console.log('Loading model...');
            const generator = await (0, transformers_1.pipeline)('text-generation', 'onnx-community/Qwen3-0.6B-ONNX', {
                device: 'webgpu',
                dtype: 'q4f16',
            });
            console.log('Model loaded, starting benchmark...');
            // Warm up the model (excluded from timing).
            await generator("Hello", { max_new_tokens: 5, do_sample: false });
            // Longer prompt and more tokens for a better average on GPU.
            return await measure(generator, "The quick brown fox jumps over the lazy dog. This is a test sentence for measuring", 500);
        }
        catch (error) {
            console.warn('WebGPU failed, falling back to CPU:', error);
        }
    }
    // WASM/CPU fallback path (no WebGPU, or WebGPU failed above).
    console.log('Loading model for CPU...');
    const generator = await (0, transformers_1.pipeline)('text-generation', 'onnx-community/Qwen3-0.6B-ONNX', {
        device: 'wasm',
        dtype: 'q4f16',
    });
    console.log('Model loaded, starting CPU benchmark...');
    // Warm up the model (excluded from timing).
    await generator("Hello", { max_new_tokens: 5, do_sample: false });
    // Shorter prompt and fewer tokens to keep the CPU run tractable.
    return await measure(generator, "The quick brown fox", 300);
}
/**
 * Runs the TPS benchmark specifically for the Node.js environment using the
 * CPU backend. Model download and warm-up are excluded from timing.
 * @returns {Promise<{tps: number, duration: number, tokensProcessed: number}>}
 *          Median benchmark result across the timed passes.
 */
async function runNodeBenchmark() {
    console.log('Loading model for Node.js CPU...');
    // Load the Qwen3-0.6B-ONNX model with the Node.js CPU backend.
    const generator = await (0, transformers_1.pipeline)('text-generation', 'onnx-community/Qwen3-0.6B-ONNX', {
        device: 'cpu',
        dtype: 'q4f16',
    });
    console.log('Model loaded, starting Node.js CPU benchmark...');
    // Warm up the model (excluded from timing).
    await generator("Hello", { max_new_tokens: 5, do_sample: false });
    // Run multiple inference passes for a consistent measurement.
    const results = [];
    const testPrompt = "The quick brown fox jumps over the lazy dog. This is a test sentence for measuring";
    const maxTokens = 150; // Moderate token count suited to CPU inference speed.
    for (let i = 0; i < 3; i++) {
        const startTime = performance.now();
        // temperature is deliberately not passed — it has no effect with
        // greedy decoding (do_sample: false).
        const result = await generator(testPrompt, {
            max_new_tokens: maxTokens,
            do_sample: false,
        });
        const duration = performance.now() - startTime;
        const generatedText = Array.isArray(result) && result.length > 0
            ? result[0]?.generated_text || ''
            : typeof result === 'string' ? result : '';
        // NOTE(review): assumes generated_text echoes the prompt, so slicing
        // strips it and leaves only newly generated text — confirm with the
        // pipeline's output format.
        const tokensProcessed = estimateTokenCount(generatedText.slice(testPrompt.length));
        results.push({
            tps: (tokensProcessed / duration) * 1000,
            duration,
            tokensProcessed
        });
    }
    // Median by TPS; correct for any pass count, not just 3.
    results.sort((a, b) => a.tps - b.tps);
    return results[Math.floor((results.length - 1) / 2)];
}
/**
 * Estimates how many tokens a piece of generated text contains.
 * Uses a rough heuristic of ~4 characters per token for English text;
 * the result is never less than 1.
 * @param {string} text - Text to estimate the token count for.
 * @returns {number} Estimated token count (integer, >= 1).
 */
function estimateTokenCount(text) {
    const approx = Math.round(text.length / 4);
    return approx < 1 ? 1 : approx;
}