// whisper.rn
// Version:
// React Native binding of whisper.cpp
// 237 lines • 7.56 kB
// TypeScript
import type { TranscribeOptions, TranscribeResult, VadOptions } from '../index';
import type { WavFileWriterFs } from '../utils/WavFileWriter';
/**
 * One chunk of audio handed to `AudioStreamInterface.onData` listeners,
 * bundled with the format metadata that accompanies it.
 */
export interface AudioStreamData {
  /** Audio payload as a byte buffer (sample encoding is not specified by this type). */
  data: Uint8Array;
  /** Sample rate of `data` — presumably in Hz; confirm against the producing stream. */
  sampleRate: number;
  /** Number of audio channels carried in `data`. */
  channels: number;
  /** Time associated with this chunk; unit and epoch are not specified by this type. */
  timestamp: number;
}
/**
 * Optional capture settings passed to `AudioStreamInterface.initialize`.
 * Every field may be omitted; defaults are chosen by the implementation
 * and are not visible in this declaration.
 */
export interface AudioStreamConfig {
  /** Requested sample rate — presumably in Hz; confirm against the implementation. */
  sampleRate?: number;
  /** Requested number of audio channels. */
  channels?: number;
  /** Requested bit depth of each sample. */
  bitsPerSample?: number;
  /** Requested buffer size; the unit (bytes vs. samples) is not specified here. */
  bufferSize?: number;
  /** Numeric audio source selector — presumably platform-specific; confirm before use. */
  audioSource?: number;
}
/**
 * Contract for an audio capture source that can be plugged into the
 * realtime transcriber (see `RealtimeTranscriberDependencies.audioStream`).
 */
export interface AudioStreamInterface {
  /** Prepares the stream with the given configuration. */
  initialize(config: AudioStreamConfig): Promise<void>;

  /** Starts the stream; resolves when the start request has completed. */
  start(): Promise<void>;

  /** Stops the stream; resolves when the stop request has completed. */
  stop(): Promise<void>;

  /** Synchronously reports whether the stream is currently recording. */
  isRecording(): boolean;

  /** Registers the listener that receives audio chunks. */
  onData(callback: (data: AudioStreamData) => void): void;

  /** Registers the listener that receives error messages as strings. */
  onError(callback: (error: string) => void): void;

  /** Registers the listener that receives the recording flag when it changes. */
  onStatusChange(callback: (isRecording: boolean) => void): void;

  /**
   * Optional: registers a listener with no arguments — presumably fired when
   * the stream ends on its own; implementations may omit this method.
   */
  onEnd?(callback: () => void): void;

  /** Releases the stream; resolves when the release has completed. */
  release(): Promise<void>;
}
/**
 * VAD Presets Overview:
 *
 *                         VAD Presets
 *                       /      |      \
 *            Conservative   Default   Sensitive
 *              /      |                 |      \
 *   conservative  very-conservative  sensitive  very-sensitive
 *   (0.7 thresh)  (0.8 thresh)       (0.3 thresh) (0.2 thresh)
 *   500ms min     750ms min          100ms min    100ms min
 *   Clear speech  Very clear         Quiet env    Catches whispers
 *
 *                     Specialized Presets
 *                     /        |        \
 *             continuous    meeting     noisy
 *             (60s max)     (45s max)   (0.75 thresh)
 *             Lectures      Multi-spk   Strict for noise
 *
 * Key Parameters:
 * - threshold: 0.0-1.0 (lower = more sensitive)
 * - minSpeechDurationMs: Min duration to consider speech
 * - minSilenceDurationMs: Min silence before ending speech
 * - maxSpeechDurationS: Max continuous speech duration
 * - speechPadMs: Padding around detected speech
 * - samplesOverlap: Analysis window overlap (0.0-1.0)
 *
 * Every preset name maps to the same six-field numeric shape; the concrete
 * values live in the implementation, not in this declaration.
 */
export declare const VAD_PRESETS: Record<
  | 'default'
  | 'sensitive'
  | 'very-sensitive'
  | 'conservative'
  | 'very-conservative'
  | 'continuous'
  | 'meeting'
  | 'noisy',
  {
    threshold: number;
    minSpeechDurationMs: number;
    minSilenceDurationMs: number;
    maxSpeechDurationS: number;
    speechPadMs: number;
    samplesOverlap: number;
  }
>;
/**
 * Voice-activity event delivered to `RealtimeTranscriberCallbacks.onVad` and
 * optionally attached to transcription events.
 */
export interface RealtimeVadEvent {
  /** Which voice-activity transition or state this event reports. */
  type: 'speech_start' | 'speech_end' | 'speech_continue' | 'silence';
  /** Time of the event; unit and epoch are not specified by this type. */
  timestamp: number;
  /** Time at which speech was last detected; same caveat on units as `timestamp`. */
  lastSpeechDetectedTime: number;
  /** Detection confidence — presumably in the 0.0-1.0 range; confirm against the emitter. */
  confidence: number;
  /** Duration associated with the event; unit is not specified by this type. */
  duration: number;
  /** Index of the audio slice this event refers to. */
  sliceIndex: number;
  /** Optional signal measurements accompanying the event. */
  analysis?: {
    averageAmplitude: number;
    peakAmplitude: number;
    spectralCentroid?: number;
    zeroCrossingRate?: number;
  };
  /** Optional threshold value in effect when the event was produced. */
  currentThreshold?: number;
  /** Optional environment-noise measurement; scale is not specified by this type. */
  environmentNoise?: number;
}
/**
 * Event delivered to `RealtimeTranscriberCallbacks.onTranscribe` describing
 * the lifecycle or result of transcribing one audio slice.
 */
export interface RealtimeTranscribeEvent {
  /** Lifecycle stage this event reports. */
  type: 'start' | 'transcribe' | 'end' | 'error';
  /** Index of the audio slice this event refers to. */
  sliceIndex: number;
  /** Transcription result, when one is available for this event. */
  data?: TranscribeResult;
  /** Whether audio is still being captured when the event is emitted. */
  isCapturing: boolean;
  /** Processing time for this event; unit is not specified by this type. */
  processTime: number;
  /** Recording time for this event; unit is not specified by this type. */
  recordingTime: number;
  /** Optional memory snapshot; its fields mirror those of `MemoryUsage`. */
  memoryUsage?: {
    slicesInMemory: number;
    totalSamples: number;
    estimatedMB: number;
  };
  /** Optional voice-activity event associated with this transcription event. */
  vadEvent?: RealtimeVadEvent;
}
/**
 * Tuning options for realtime transcription. All fields are optional;
 * defaults are chosen by the implementation and are not visible here.
 */
export interface RealtimeOptions {
  /** Length of each audio slice, in seconds (per the `Sec` suffix). */
  audioSliceSec?: number;
  /** Minimum audio length, in seconds (per the `Sec` suffix). */
  audioMinSec?: number;
  /** Upper bound on the number of slices kept in memory. */
  maxSlicesInMemory?: number;
  /** Options forwarded to transcription. */
  transcribeOptions?: TranscribeOptions;
  /** Prompt text supplied up front. */
  initialPrompt?: string;
  /** Presumably: feed earlier slice output back in as prompt — confirm against the implementation. */
  promptPreviousSlices?: boolean;
  /** Path for audio output; whether/when a file is written is not shown here. */
  audioOutputPath?: string;
  /** Configuration passed on to the audio stream. */
  audioStreamConfig?: AudioStreamConfig;
  /** Sink for log messages. */
  logger?: (message: string) => void;
  /** Pause between realtime processing steps, in milliseconds (per the `Ms` suffix). */
  realtimeProcessingPauseMs?: number;
  /** Delay before realtime processing is initialised, in milliseconds (per the `Ms` suffix). */
  initRealtimeAfterMs?: number;
}
/**
 * One bounded segment of captured audio together with its bookkeeping flags.
 */
export interface AudioSlice {
  /** Position of this slice in the sequence of slices. */
  index: number;
  /** Audio payload as a byte buffer (sample encoding is not specified by this type). */
  data: Uint8Array;
  /** Number of samples the slice holds. */
  sampleCount: number;
  /** Start time of the slice; unit and epoch are not specified by this type. */
  startTime: number;
  /** End time of the slice; same caveat on units as `startTime`. */
  endTime: number;
  /** Whether the slice has already been processed. */
  isProcessed: boolean;
  /** Whether the slice has been released. */
  isReleased: boolean;
}
/**
 * `AudioSlice` without its `data` buffer: the slice's metadata only.
 */
export interface AudioSliceNoData extends Omit<AudioSlice, 'data'> {}
/**
 * Snapshot of how much audio is currently buffered.
 */
export interface MemoryUsage {
  /** Number of slices currently held in memory. */
  slicesInMemory: number;
  /** Total number of samples across the held slices. */
  totalSamples: number;
  /** Estimated memory footprint in megabytes (per the `MB` suffix). */
  estimatedMB: number;
}
/**
 * Statistics event delivered to `RealtimeTranscriberCallbacks.onStatsUpdate`.
 */
export interface RealtimeStatsEvent {
  /** Time of the event; unit and epoch are not specified by this type. */
  timestamp: number;
  /** What kind of change triggered this stats event. */
  type: 'slice_processed' | 'vad_change' | 'memory_change' | 'status_change';
  /** State flags plus untyped per-subsystem statistics. */
  data: {
    isActive: boolean;
    isTranscribing: boolean;
    vadEnabled: boolean;
    // The three stats payloads are deliberately left as `any` in the
    // published declaration; their shapes are not described here.
    audioStats: any;
    vadStats: any;
    sliceStats: any;
  };
}
/**
 * Optional hooks a caller can supply to observe or steer realtime
 * transcription. Every callback may be omitted.
 */
export interface RealtimeTranscriberCallbacks {
  /**
   * Invoked with a slice's audio and metadata before transcription.
   * NOTE(review): presumably the resolved boolean decides whether the slice
   * is transcribed — confirm against the implementation.
   */
  onBeginTranscribe?: (sliceInfo: {
    audioData: Uint8Array;
    sliceIndex: number;
    duration: number;
    vadEvent?: RealtimeVadEvent;
  }) => Promise<boolean>;

  /** Receives transcription lifecycle/result events. */
  onTranscribe?: (event: RealtimeTranscribeEvent) => void;

  /**
   * Invoked with a slice's audio and metadata before voice-activity detection.
   * NOTE(review): presumably the resolved boolean decides whether detection
   * runs — confirm against the implementation.
   */
  onBeginVad?: (sliceInfo: {
    audioData: Uint8Array;
    sliceIndex: number;
    duration: number;
  }) => Promise<boolean>;

  /** Receives voice-activity events. */
  onVad?: (event: RealtimeVadEvent) => void;

  /** Receives error messages as strings. */
  onError?: (error: string) => void;

  /** Receives the active flag when it changes. */
  onStatusChange?: (isActive: boolean) => void;

  /** Receives statistics events. */
  onStatsUpdate?: (event: RealtimeStatsEvent) => void;

  /** Receives text once a slice's transcription is considered stabilized. */
  onSliceTranscriptionStabilized?: (text: string) => void;
}
/**
 * Minimal structural shape of a Whisper context: anything exposing a
 * compatible `transcribeData` can be used.
 */
export type WhisperContextLike = {
  /**
   * Begins transcribing the given audio buffer and returns a handle with
   * `promise` (settles with the result) and `stop` (requests a stop).
   */
  transcribeData: (
    data: ArrayBuffer,
    options: TranscribeOptions,
  ) => {
    stop: () => Promise<void>;
    promise: Promise<TranscribeResult>;
  };
};
/**
 * Minimal structural shape of a Whisper VAD context: anything exposing a
 * compatible `detectSpeechData` can be used.
 */
export type WhisperVadContextLike = {
  /**
   * Runs speech detection over the given audio buffer and resolves with a
   * list of `{ t0, t1 }` pairs — presumably segment start/end times; the
   * unit is not specified by this type.
   */
  detectSpeechData: (
    data: ArrayBuffer,
    options: VadOptions,
  ) => Promise<{ t0: number; t1: number }[]>;
};
/**
 * Structural shape of a streaming voice-activity detector: audio is pushed
 * in through `processAudio`, and results come back through the registered
 * listeners.
 */
export interface RealtimeVadContextLike {
  /** Feeds a buffer of audio bytes into the detector. */
  processAudio(data: Uint8Array): void;

  /** Registers the listener called with a confidence value and audio bytes on speech start. */
  onSpeechStart: (callback: (confidence: number, data: Uint8Array) => void) => void;

  /** Registers the listener called with a confidence value and audio bytes while speech continues. */
  onSpeechContinue: (callback: (confidence: number, data: Uint8Array) => void) => void;

  /** Registers the listener called with a confidence value on speech end. */
  onSpeechEnd: (callback: (confidence: number) => void) => void;

  /** Registers the listener that receives error messages as strings. */
  onError: (callback: (error: string) => void) => void;

  /** Applies a partial set of VAD options. */
  updateOptions(options: Partial<VadOptions>): void;

  /** Flushes the detector; what is flushed is implementation-defined. */
  flush(): Promise<void>;

  /** Resets the detector; what state is cleared is implementation-defined. */
  reset(): Promise<void>;
}
/**
 * Collaborators supplied to the realtime transcriber. `whisperContext` and
 * `audioStream` are required; `vadContext` and `fs` are optional.
 */
export interface RealtimeTranscriberDependencies {
  /** Context used to transcribe audio. */
  whisperContext: WhisperContextLike;
  /** Optional streaming voice-activity detector. */
  vadContext?: RealtimeVadContextLike;
  /** Source of captured audio. */
  audioStream: AudioStreamInterface;
  /** Optional filesystem adapter typed for the WAV file writer. */
  fs?: WavFileWriterFs;
}
//# sourceMappingURL=types.d.ts.map