@siteed/expo-audio-studio

Version:

Comprehensive audio processing library for React Native and Expo with recording, analysis, visualization, and streaming capabilities across iOS, Android, and web

github.com/deeeed/expo-audio-stream/blob/main/packages/expo-audio-studio/README.md

deeeed/expo-audio-stream

225 lines (211 loc) • 8.47 kB

text/typescript

// packages/expo-audio-stream/src/AudioAnalysis/AudioAnalysis.types.ts import { BitDepth, ConsoleLike } from '../ExpoAudioStream.types' /** * Represents the configuration for decoding audio data. */ export interface DecodingConfig { /** Target sample rate for decoded audio (Android and Web) */ targetSampleRate?: number /** Target number of channels (Android and Web) */ targetChannels?: number /** Target bit depth (Android and Web) */ targetBitDepth?: BitDepth /** Whether to normalize audio levels (Android and Web) */ normalizeAudio?: boolean } /** * Represents speech-related features extracted from audio. */ export interface SpeechFeatures { isActive: boolean // Whether speech is detected in this segment speakerId?: number // Optional speaker identification // Could add more speech-related features here like: // confidence: number // language?: string // sentiment?: number // etc. } /** * Represents various audio features extracted from an audio signal. */ export interface AudioFeatures { energy?: number // The infinite integral of the squared signal, representing the overall energy of the audio. mfcc?: number[] // Mel-frequency cepstral coefficients, describing the short-term power spectrum of a sound. rms?: number // Root mean square value, indicating the amplitude of the audio signal. minAmplitude?: number // Minimum amplitude value in the audio signal. maxAmplitude?: number // Maximum amplitude value in the audio signal. zcr?: number // Zero-crossing rate, indicating the rate at which the signal changes sign. spectralCentroid?: number // The center of mass of the spectrum, indicating the brightness of the sound. spectralFlatness?: number // Measure of the flatness of the spectrum, indicating how noise-like the signal is. spectralRolloff?: number // The frequency below which a specified percentage (usually 85%) of the total spectral energy lies. spectralBandwidth?: number // The width of the spectrum, indicating the range of frequencies present. chromagram?: number[] // Chromagram, representing the 12 different pitch classes of the audio. tempo?: number // Estimated tempo of the audio signal, measured in beats per minute (BPM). hnr?: number // Harmonics-to-noise ratio, indicating the proportion of harmonics to noise in the audio signal. melSpectrogram?: number[] // Mel-scaled spectrogram representation of the audio. spectralContrast?: number[] // Spectral contrast features representing the difference between peaks and valleys. tonnetz?: number[] // Tonal network features representing harmonic relationships. pitch?: number // Pitch of the audio signal, measured in Hertz (Hz). crc32?: number // crc32 checksum of the audio signal, used to verify the integrity of the audio. } /** * Options to specify which audio features to extract. * Note: Advanced features (spectral features, chromagram, pitch, etc.) are experimental, * especially during live recording, due to high processing requirements. */ export interface AudioFeaturesOptions { // Basic features - well optimized energy?: boolean rms?: boolean zcr?: boolean // Advanced features - experimental, may impact performance in live recording mfcc?: boolean spectralCentroid?: boolean spectralFlatness?: boolean spectralRolloff?: boolean spectralBandwidth?: boolean chromagram?: boolean tempo?: boolean hnr?: boolean melSpectrogram?: boolean spectralContrast?: boolean tonnetz?: boolean pitch?: boolean // Utility crc32?: boolean } /** * Represents a single data point in the audio analysis. */ export interface DataPoint { id: number amplitude: number // Peak amplitude for the segment rms: number // Root mean square value dB: number // dBFS (decibels relative to full scale) computed from RMS value silent: boolean // Always computed features?: AudioFeatures speech?: SpeechFeatures startTime?: number endTime?: number // start / end position in bytes startPosition?: number endPosition?: number // number of audio samples for this point (samples size depends on bit depth) samples?: number } /** * Represents the complete data from the audio analysis. */ export interface AudioAnalysis { segmentDurationMs: number // Duration of each segment in milliseconds durationMs: number // Duration of the audio in milliseconds /** * Bit depth used for audio analysis processing. * * **Important**: This represents the internal processing bit depth, which may differ * from the recording bit depth. Audio is typically converted to 32-bit float for * analysis to ensure precision in calculations, regardless of the original recording format. * * Platform behavior: * - iOS: Always 32 (float processing) * - Android: Always 32 (float processing) * - Web: Always 32 (Web Audio API standard) * * The actual recorded file will maintain the requested bit depth (8, 16, or 32). */ bitDepth: number samples: number // Size of the audio in bytes numberOfChannels: number // Number of audio channels sampleRate: number // Sample rate of the audio dataPoints: DataPoint[] // Array of data points from the analysis. amplitudeRange: { min: number max: number } rmsRange: { min: number max: number } extractionTimeMs: number // Time taken to extract/process the analysis in milliseconds // TODO: speaker changes into a broader speech analysis section speechAnalysis?: { speakerChanges: { timestamp: number speakerId: number }[] // Could add more speech analysis data here like: // dominantSpeaker?: number // totalSpeechDuration?: number // speakerStats?: { [speakerId: number]: { duration: number, segments: number } } } } /** * Options for specifying a time range within an audio file. */ export interface AudioRangeOptions { /** Start time in milliseconds */ startTimeMs?: number /** End time in milliseconds */ endTimeMs?: number } /** * Options for generating a quick preview of audio waveform. * This is optimized for UI rendering with a specified number of points. */ export interface PreviewOptions extends AudioRangeOptions { /** URI of the audio file to analyze */ fileUri: string /** * Total number of points to generate for the preview. * @default 100 */ numberOfPoints?: number /** * Optional logger for debugging. */ logger?: ConsoleLike /** * Optional configuration for decoding the audio file. * Defaults to: * - targetSampleRate: undefined (keep original) * - targetChannels: undefined (keep original) * - targetBitDepth: 16 * - normalizeAudio: false */ decodingOptions?: DecodingConfig } /** * Options for mel-spectrogram extraction * * @experimental This feature is experimental and currently only available on Android. * The API may change in future versions. */ export interface ExtractMelSpectrogramOptions { fileUri?: string // Path to audio file arrayBuffer?: ArrayBuffer // Raw audio buffer windowSizeMs: number // Window size in ms (e.g., 25) hopLengthMs: number // Hop length in ms (e.g., 10) nMels: number // Number of mel filters (e.g., 60) fMin?: number // Min frequency (default: 0) fMax?: number // Max frequency (default: sampleRate / 2) windowType?: 'hann' | 'hamming' // Window function (default: 'hann') normalize?: boolean // Mean normalization (default: false) logScale?: boolean // Log scaling of mel energies (default: true) decodingOptions?: DecodingConfig // Audio decoding settings startTimeMs?: number // Optional start time endTimeMs?: number // Optional end time logger?: ConsoleLike } /** * Return type for mel spectrogram extraction * * @experimental This feature is experimental and currently only available on Android. * The API may change in future versions. */ export interface MelSpectrogram { spectrogram: number[][] // 2D array [time][mel] sampleRate: number // Audio sample rate nMels: number // Number of mel filters timeSteps: number // Number of time frames durationMs: number // Audio duration in ms }