UNPKG

parakeet.js

Version:

NVIDIA Parakeet speech recognition for the browser (WebGPU/WASM) powered by ONNX Runtime Web.

326 lines (296 loc) 12.1 kB
import React, { useState, useRef, useEffect } from 'react'; import { ParakeetModel, getParakeetModel } from 'parakeet.js'; import './App.css'; export default function App() { const repoId = 'istupakov/parakeet-tdt-0.6b-v2-onnx'; const [backend, setBackend] = useState('webgpu-hybrid'); const [encoderQuant, setEncoderQuant] = useState('fp32'); const [decoderQuant, setDecoderQuant] = useState('int8'); const [preprocessor, setPreprocessor] = useState('nemo128'); const [status, setStatus] = useState('Idle'); const [progress, setProgress] = useState(''); const [progressText, setProgressText] = useState(''); const [progressPct, setProgressPct] = useState(null); const [text, setText] = useState(''); const [latestMetrics, setLatestMetrics] = useState(null); const [transcriptions, setTranscriptions] = useState([]); const [isTranscribing, setIsTranscribing] = useState(false); const [verboseLog, setVerboseLog] = useState(false); const [frameStride, setFrameStride] = useState(1); const [dumpDetail, setDumpDetail] = useState(false); const maxCores = navigator.hardwareConcurrency || 8; const [cpuThreads, setCpuThreads] = useState(Math.max(1, maxCores - 2)); const modelRef = useRef(null); const fileInputRef = useRef(null); // Auto-adjust quant presets when backend changes useEffect(() => { if (backend.startsWith('webgpu')) { setEncoderQuant('fp32'); setDecoderQuant('int8'); } else { setEncoderQuant('int8'); setDecoderQuant('int8'); } }, [backend]); async function loadModel() { setStatus('Loading model…'); setProgress(''); setProgressText(''); setProgressPct(0); console.time('LoadModel'); try { const progressCallback = ({ loaded, total, file }) => { const pct = total > 0 ? Math.round((loaded / total) * 100) : 0; setProgressText(`${file}: ${pct}%`); setProgressPct(pct); }; // 1. Download all model files from HuggingFace Hub const modelUrls = await getParakeetModel(repoId, { encoderQuant, decoderQuant, preprocessor, backend, progress: progressCallback }); // Show compiling sessions stage setStatus('Creating sessions…'); setProgressText('Compiling model (this may take ~10 s)…'); setProgressPct(null); // 2. Create the model instance with all file URLs modelRef.current = await ParakeetModel.fromUrls({ ...modelUrls.urls, filenames: modelUrls.filenames, backend, verbose: verboseLog, cpuThreads, }); // 3. Warm-up and verify setStatus('Warming up & verifying…'); setProgressText('Running a test transcription…'); const expectedText = 'it is not life as we know or understand it'; try { const audioRes = await fetch('/assets/life_Jim.wav'); const buf = await audioRes.arrayBuffer(); const audioCtx = new AudioContext({ sampleRate: 16000 }); const decoded = await audioCtx.decodeAudioData(buf); const pcm = decoded.getChannelData(0); const { utterance_text } = await modelRef.current.transcribe(pcm, 16000); // Normalize both texts: lowercase and remove punctuation const normalize = (str) => str.toLowerCase().replace(/[.,\/#!$%\^&\*;:{}=\-_`~()]/g,""); if (normalize(utterance_text).includes(normalize(expectedText))) { console.log('[App] Model verification successful.'); setStatus('Model ready ✔'); } else { console.error(`[App] Model verification failed! Expected: "${expectedText}", Got: "${utterance_text}"`); setStatus('Model verification failed!'); } } catch (err) { console.error('[App] Warm-up transcription failed', err); setStatus('Warm-up failed!'); } console.timeEnd('LoadModel'); // setStatus('Model ready ✔'); // Status is now set by verification setProgressText(''); setProgressPct(null); } catch (e) { console.error(e); setStatus(`Failed: ${e.message}`); setProgress(''); } } async function transcribeFile(e) { if (!modelRef.current) return alert('Load model first'); const file = e.target.files?.[0]; if (!file) return; setIsTranscribing(true); setStatus(`Transcribing "${file.name}"…`); try { const buf = await file.arrayBuffer(); const audioCtx = new AudioContext({ sampleRate: 16000 }); const decoded = await audioCtx.decodeAudioData(buf); const pcm = decoded.getChannelData(0); console.time(`Transcribe-${file.name}`); const res = await modelRef.current.transcribe(pcm, 16_000, { returnTimestamps: true, returnConfidences: true , frameStride }); console.timeEnd(`Transcribe-${file.name}`); if (dumpDetail) { console.log('[Parakeet] Detailed transcription output', res); } setLatestMetrics(res.metrics); // Add to transcriptions list const newTranscription = { id: Date.now(), filename: file.name, text: res.utterance_text, timestamp: new Date().toLocaleTimeString(), duration: pcm.length / 16000, // duration in seconds wordCount: res.words?.length || 0, confidence: res.confidence_scores?.token_avg ?? res.confidence_scores?.word_avg ?? null, metrics: res.metrics }; setTranscriptions(prev => [newTranscription, ...prev]); setText(res.utterance_text); // Show latest transcription setStatus('Model ready ✔'); // Ready for next file } catch (error) { console.error('Transcription failed:', error); setStatus('Transcription failed'); alert(`Failed to transcribe "${file.name}": ${error.message}`); } finally { setIsTranscribing(false); // Clear the file input so the same file can be selected again if (fileInputRef.current) { fileInputRef.current.value = ''; } } } function clearTranscriptions() { setTranscriptions([]); setText(''); } return ( <div className="app"> <h2>Parakeet JS React Demo - Using npm package</h2> <div className="controls"> <p> <strong>Model:</strong> {repoId} </p> </div> <div className="controls"> <label> Backend: <select value={backend} onChange={e=>setBackend(e.target.value)}> <option value="webgpu-hybrid">WebGPU</option> <option value="wasm">WASM (CPU)</option> </select> </label> {' '} <label> Encoder Quant: <select value={encoderQuant} onChange={e=>setEncoderQuant(e.target.value)}> <option value="int8">int8 (faster)</option> <option value="fp32">fp32 (higher quality)</option> </select> </label> {' '} <label> Decoder Quant: <select value={decoderQuant} onChange={e=>setDecoderQuant(e.target.value)}> <option value="int8">int8 (faster)</option> <option value="fp32">fp32 (higher quality)</option> </select> </label> {' '} <label> Preprocessor: <select value={preprocessor} onChange={e=>setPreprocessor(e.target.value)}> <option value="nemo128">nemo128 (default)</option> </select> </label> {' '} <label> Stride: <select value={frameStride} onChange={e=>setFrameStride(Number(e.target.value))}> <option value={1}>1</option> <option value={2}>2</option> <option value={4}>4</option> </select> </label> {' '} <label> <input type="checkbox" checked={verboseLog} onChange={e => setVerboseLog(e.target.checked)} /> Verbose Log </label> {' '} <label style={{fontSize:'0.9em'}}> <input type="checkbox" checked={dumpDetail} onChange={e=>setDumpDetail(e.target.checked)} /> Dump result to console </label> {(backend === 'wasm' || backend.startsWith('webgpu')) && ( <label style={{fontSize:'0.9em'}}> Threads: <input type="number" min="1" max={maxCores} value={cpuThreads} onChange={e=>setCpuThreads(Number(e.target.value))} style={{width:'4rem'}} /> </label> )} <button onClick={loadModel} disabled={!status.toLowerCase().includes('fail') && status !== 'Idle'} className="primary" > {status === 'Model ready ✔' ? 'Model Loaded' : 'Load Model'} </button> </div> {typeof SharedArrayBuffer === 'undefined' && backend === 'wasm' && ( <div style={{ marginBottom: '1rem', padding: '0.5rem', backgroundColor: '#fff3cd', border: '1px solid #ffeaa7', borderRadius: '4px', fontSize: '0.9em' }}> ⚠️ <strong>Performance Note:</strong> SharedArrayBuffer is not available. WASM will run single-threaded. For better performance, serve over HTTPS with proper headers or use WebGPU. </div> )} <div className="controls"> <input ref={fileInputRef} type="file" accept="audio/*" onChange={transcribeFile} disabled={status !== 'Model ready ✔' || isTranscribing} /> {transcriptions.length > 0 && ( <button onClick={clearTranscriptions} style={{ marginLeft: '1rem', padding: '0.25rem 0.5rem' }} > Clear History </button> )} </div> <p>Status: {status}</p> {progressPct!==null && ( <div className="progress-wrapper"> <div className="progress-bar"><div style={{ width: `${progressPct}%` }} /></div> <p className="progress-text">{progressText}</p> </div> )} {/* Latest transcription */} <div className="controls"> <h3>Latest Transcription:</h3> <textarea value={text} readOnly className="textarea" placeholder="Transcribed text will appear here..." /> </div> {/* Latest transcription performace info */} {latestMetrics && ( <div className="performance"> <strong>RTF:</strong> {latestMetrics.rtf?.toFixed(2)}x &nbsp;|&nbsp; Total: {latestMetrics.total_ms} ms<br/> Preprocess {latestMetrics.preprocess_ms} ms · Encode {latestMetrics.encode_ms} ms · Decode {latestMetrics.decode_ms} ms · Tokenize {latestMetrics.tokenize_ms} ms </div> )} {/* Transcription history */} {transcriptions.length > 0 && ( <div className="history"> <h3>Transcription History ({transcriptions.length} files):</h3> <div style={{ maxHeight: '400px', overflowY: 'auto', border: '1px solid #ddd', borderRadius: '4px' }}> {transcriptions.map((trans) => ( <div className="history-item" key={trans.id}> <div className="history-meta"><strong>{trans.filename}</strong><span>{trans.timestamp}</span></div> <div className="history-stats">Duration: {trans.duration.toFixed(1)}s | Words: {trans.wordCount}{trans.confidence && ` | Confidence: ${trans.confidence.toFixed(2)}`}{trans.metrics && ` | RTF: ${trans.metrics.rtf?.toFixed(2)}x`}</div> <div className="history-text">{trans.text}</div> </div> ))} </div> </div> )} </div> ); }