UNPKG

woolball-client

Version:

Client-side library for Woolball enabling secure browser resource sharing for distributed AI task processing

161 lines (160 loc) 7.06 kB
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.calculateTPS = calculateTPS;
const transformers_1 = require("@huggingface/transformers");

/**
 * @typedef {Object} BenchmarkResult
 * @property {number} tps - Estimated tokens generated per second for the pass.
 * @property {number} duration - Wall-clock time of the pass in milliseconds (performance.now() delta).
 * @property {number} tokensProcessed - Estimated number of generated tokens (see estimateTokenCount).
 */

/** Model used for every benchmark backend. */
const MODEL_ID = 'onnx-community/Qwen3-0.6B-ONNX';

/** Number of timed generation passes; the median pass is reported. */
const PASS_COUNT = 3;

/** Prompt shared by the WebGPU and Node.js CPU profiles. */
const LONG_PROMPT = "The quick brown fox jumps over the lazy dog. This is a test sentence for measuring";

/**
 * Per-backend benchmark settings. Each profile carries the device passed to
 * pipeline(), the console messages emitted around model loading, the prompt,
 * and the generation options used for the timed passes.
 */
const BENCHMARK_PROFILES = {
    webgpu: {
        device: 'webgpu',
        loadingMessage: 'Loading model...',
        readyMessage: 'Model loaded, starting benchmark...',
        prompt: LONG_PROMPT,
        // More tokens for a better average
        generation: { max_new_tokens: 500, do_sample: false, temperature: 0.1 },
    },
    wasm: {
        device: 'wasm',
        loadingMessage: 'Loading model for CPU...',
        readyMessage: 'Model loaded, starting CPU benchmark...',
        prompt: "The quick brown fox",
        // More tokens for a better average on CPU
        generation: { max_new_tokens: 300, do_sample: false },
    },
    node: {
        device: 'cpu',
        loadingMessage: 'Loading model for Node.js CPU...',
        readyMessage: 'Model loaded, starting Node.js CPU benchmark...',
        prompt: LONG_PROMPT,
        // Moderate token count for CPU
        generation: { max_new_tokens: 150, do_sample: false, temperature: 0.1 },
    },
};

/**
 * Calculates tokens per second performance using Transformers.js.
 * Runs the onnx-community/Qwen3-0.6B-ONNX model and picks a backend based on
 * the detected environment (WebGPU / WASM in the browser, CPU in Node.js).
 *
 * @returns {Promise<number>} The median tokens-per-second figure, rounded with Math.round.
 */
async function calculateTPS() {
    const benchmark = await runBenchmark();
    return Math.round(benchmark.tps);
}

/**
 * Runs the TPS benchmark using actual ONNX model inference.
 *
 * Backend selection:
 *  1. Node.js (process.versions.node present) -> CPU backend.
 *  2. Browser exposing navigator.gpu -> WebGPU; any failure while loading or
 *     running is logged and falls through to step 3.
 *  3. Otherwise -> WASM backend.
 *
 * @returns {Promise<BenchmarkResult>} The median pass of the selected backend.
 */
async function runBenchmark() {
    // Disable the ONNX WASM proxy when that backend configuration is present.
    if (transformers_1.env.backends.onnx?.wasm) {
        transformers_1.env.backends.onnx.wasm.proxy = false;
    }

    // Detect environment
    const isNode = typeof process !== 'undefined' && process.versions && process.versions.node;
    const isBrowser = typeof window !== 'undefined' && typeof navigator !== 'undefined';

    // For Node.js environment, use CPU backend
    if (isNode) {
        return await runNodeBenchmark();
    }

    // For browser environment, check for WebGPU support first
    const hasWebGPU = isBrowser && 'gpu' in navigator;
    if (hasWebGPU) {
        try {
            return await runProfileBenchmark(BENCHMARK_PROFILES.webgpu);
        }
        catch (error) {
            console.warn('WebGPU failed, falling back to CPU:', error);
        }
    }

    // Fallback to CPU-based (WASM) benchmark
    return await runProfileBenchmark(BENCHMARK_PROFILES.wasm);
}

/**
 * Runs the TPS benchmark specifically for the Node.js environment using the CPU backend.
 *
 * @returns {Promise<BenchmarkResult>} The median pass of the Node.js CPU run.
 */
async function runNodeBenchmark() {
    return await runProfileBenchmark(BENCHMARK_PROFILES.node);
}

/**
 * Loads the model for one profile, warms it up, runs PASS_COUNT timed
 * generation passes and returns the median pass by tokens per second.
 * Model loading and warm-up happen before the timers start, so they are
 * excluded from the measurement. The pipeline is released afterwards whether
 * the run succeeded or threw.
 *
 * @param {{device: string, loadingMessage: string, readyMessage: string, prompt: string, generation: Object}} profile
 *   One entry of BENCHMARK_PROFILES.
 * @returns {Promise<BenchmarkResult>} The median pass.
 */
async function runProfileBenchmark(profile) {
    console.log(profile.loadingMessage);
    const generator = await (0, transformers_1.pipeline)('text-generation', MODEL_ID, {
        device: profile.device,
        dtype: 'q4f16',
    });
    try {
        console.log(profile.readyMessage);

        // Warm up the model (excluded from timing)
        await generator("Hello", { max_new_tokens: 5, do_sample: false });

        // Run multiple inference passes for consistent measurement
        const results = [];
        for (let pass = 0; pass < PASS_COUNT; pass++) {
            const startTime = performance.now();
            // Pass a copy so the shared profile object cannot be modified by the callee.
            const output = await generator(profile.prompt, { ...profile.generation });
            const duration = performance.now() - startTime;

            // Count generated tokens (approximation based on output length);
            // the leading prompt-length slice drops the echoed prompt.
            const generatedText = extractGeneratedText(output);
            const tokensProcessed = estimateTokenCount(generatedText.slice(profile.prompt.length));
            results.push({
                tps: (tokensProcessed / duration) * 1000,
                duration,
                tokensProcessed,
            });
        }

        // Return the median result for consistency
        results.sort((a, b) => a.tps - b.tps);
        return results[Math.floor(results.length / 2)];
    }
    finally {
        await releaseGenerator(generator);
    }
}

/**
 * Pulls the generated text out of a text-generation pipeline result.
 *
 * @param {*} output - Value returned by the pipeline call.
 * @returns {*} `output[0].generated_text` (or '' when falsy) for a non-empty
 *   array, the value itself when it is a string, otherwise ''.
 */
function extractGeneratedText(output) {
    if (Array.isArray(output) && output.length > 0) {
        return output[0]?.generated_text || '';
    }
    return typeof output === 'string' ? output : '';
}

/**
 * Best-effort release of a pipeline's resources. A failure here is only
 * logged so it can neither discard a finished benchmark result nor mask the
 * error that aborted the run.
 *
 * @param {*} generator - Pipeline instance returned by pipeline().
 * @returns {Promise<void>}
 */
async function releaseGenerator(generator) {
    try {
        await generator.dispose?.();
    }
    catch (error) {
        console.warn('Failed to release benchmark model:', error);
    }
}

/**
 * Estimates token count from generated text.
 * Rough approximation: ~4 characters per token for English text.
 *
 * @param {string} text - Generated text (prompt already removed).
 * @returns {number} Estimated token count, never less than 1.
 */
function estimateTokenCount(text) {
    return Math.max(1, Math.round(text.length / 4));
}