
@huggingface/transformers

State-of-the-art Machine Learning for the web. Run 🤗 Transformers directly in your browser, with no need for a server!

import { FeatureExtractor, validate_audio_inputs } from '../../base/feature_extraction_utils.js';
import { Tensor } from '../../utils/tensor.js';
import { mel_filter_bank, spectrogram, window_function } from '../../utils/audio.js';

const EPSILON = 1e-5;

export class ParakeetFeatureExtractor extends FeatureExtractor {
    constructor(config) {
        super(config);

        // Prefer given `mel_filters` from preprocessor_config.json, or calculate them if they don't exist.
        this.config.mel_filters ??= mel_filter_bank(
            Math.floor(1 + this.config.n_fft / 2), // num_frequency_bins
            this.config.feature_size, // num_mel_filters
            0.0, // min_frequency
            this.config.sampling_rate / 2, // max_frequency
            this.config.sampling_rate, // sampling_rate
            "slaney", // norm
            "slaney", // mel_scale
        );

        const window = window_function(this.config.win_length, 'hann', {
            periodic: false,
        });
        this.window = new Float64Array(this.config.n_fft);
        const offset = Math.floor((this.config.n_fft - this.config.win_length) / 2);
        this.window.set(window, offset);
    }

    /**
     * Computes the log-Mel spectrogram of the provided audio waveform.
     * @param {Float32Array|Float64Array} waveform The audio waveform to process.
     * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
     */
    async _extract_fbank_features(waveform) {
        // Parakeet uses a custom preemphasis strategy: Apply preemphasis to entire waveform at once
        const preemphasis = this.config.preemphasis;
        waveform = new Float64Array(waveform); // Clone to avoid destructive changes
        for (let j = waveform.length - 1; j >= 1; --j) {
            waveform[j] -= preemphasis * waveform[j - 1];
        }

        const features = await spectrogram(
            waveform,
            this.window, // window
            this.window.length, // frame_length
            this.config.hop_length, // hop_length
            {
                fft_length: this.config.n_fft,
                power: 2.0,
                mel_filters: this.config.mel_filters,
                log_mel: 'log',
                mel_floor: -Infinity,
                pad_mode: 'constant',
                center: true,

                // Custom
                transpose: true,
                mel_offset: 2 ** -24,
            },
        );
        return features;
    }

    /**
     * Asynchronously extracts features from a given audio using the provided configuration.
     * @param {Float32Array|Float64Array} audio The audio data as a Float32Array/Float64Array.
     * @returns {Promise<{ input_features: Tensor; attention_mask: Tensor; }>} A Promise resolving to an object containing the extracted input features as a Tensor.
     */
    async _call(audio) {
        validate_audio_inputs(audio, 'ParakeetFeatureExtractor');

        const features = await this._extract_fbank_features(audio);
        const features_length = Math.floor(
            (audio.length + Math.floor(this.config.n_fft / 2) * 2 - this.config.n_fft) / this.config.hop_length
        );
        const features_data = /** @type {Float32Array} */ (features.data);
        features_data.fill(0, features_length * features.dims[1]);

        // normalize mel features, ignoring padding
        const [num_frames, num_features] = features.dims;
        const sum = new Float64Array(num_features);
        const sum_sq = new Float64Array(num_features);
        for (let i = 0; i < features_length; ++i) {
            const offset = i * num_features;
            for (let j = 0; j < num_features; ++j) {
                const val = features_data[offset + j];
                sum[j] += val;
                sum_sq[j] += val * val;
            }
        }

        // Calculate mean and standard deviation, then normalize
        const divisor = features_length > 1 ? features_length - 1 : 1;
        for (let j = 0; j < num_features; ++j) {
            const mean = sum[j] / features_length;
            const variance = (sum_sq[j] - features_length * mean * mean) / divisor;
            const std = Math.sqrt(variance) + EPSILON;
            const inv_std = 1 / std;
            for (let i = 0; i < features_length; ++i) {
                const index = i * num_features + j;
                features_data[index] = (features_data[index] - mean) * inv_std;
            }
        }

        const mask_data = new BigInt64Array(num_frames);
        mask_data.fill(1n, 0, features_length);

        return {
            input_features: features.unsqueeze_(0),
            attention_mask: new Tensor('int64', mask_data, [1, num_frames]),
        };
    }
}
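
For reference, a minimal usage sketch (not part of the file above), assuming the extractor is loaded through the package's AutoFeatureExtractor and that read_audio is used to decode and resample the input; the repository id and audio URL are placeholders, not taken from this source.

// Usage sketch (assumed, for illustration only; not part of the file above).
import { AutoFeatureExtractor, read_audio } from '@huggingface/transformers';

const feature_extractor = await AutoFeatureExtractor.from_pretrained(
    'onnx-community/parakeet-example', // placeholder repo id (assumed), must declare this extractor in preprocessor_config.json
);

// Decode and resample to the extractor's expected sampling rate (typically 16 kHz).
const audio = await read_audio('https://example.com/sample.wav', feature_extractor.config.sampling_rate);

// `input_features` is the normalized log-Mel spectrogram of shape [1, num_frames, feature_size];
// `attention_mask` marks which frames are real (1) versus padding (0).
const { input_features, attention_mask } = await feature_extractor(audio);
console.log(input_features.dims, attention_mask.dims);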