UNPKG

react-speech-to-text-gk

Version:

Advanced React speech-to-text library with real-time audio analysis and comprehensive speech metrics

github.com/germankuber/react-speech-to-text

germankuber/react-speech-to-text

1,344 lines (1,224 loc) • 87.3 kB

JavaScript

import { useRef, useMemo, useCallback, useState, useEffect } from 'react';

/******************************************************************************
Copyright (c) Microsoft Corporation.

Permission to use, copy, modify, and/or distribute this software for any
purpose with or without fee is hereby granted.

THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT,
INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM
LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR
OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
***************************************************************************** */
/* global Reflect, Promise, SuppressedError, Symbol, Iterator */

/**
 * Object-spread helper emitted by the compiler.
 *
 * On first call it replaces itself with `Object.assign` when available, or
 * with a fallback that copies every own enumerable property of each source
 * argument onto the first argument, and then forwards the call.
 */
var __assign = function() {
    __assign = Object.assign || function __assign(t) {
        for (var s, i = 1, n = arguments.length; i < n; i++) {
            s = arguments[i];
            for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p)) t[p] = s[p];
        }
        return t;
    };
    return __assign.apply(this, arguments);
};

/**
 * Async-function helper emitted by the compiler.
 *
 * Runs `generator` (invoked with `thisArg` / `_arguments`) to completion and
 * returns a promise for its final value. Every yielded value is adopted into
 * a `P` promise (defaulting to the global `Promise`); fulfilment resumes the
 * generator via `next`, rejection resumes it via `throw`.
 */
function __awaiter(thisArg, _arguments, P, generator) {
    // Wrap non-P values so that `.then` is always available.
    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
    return new (P || (P = Promise))(function (resolve, reject) {
        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
        step((generator = generator.apply(thisArg, _arguments || [])).next());
    });
}

/**
 * Generator helper emitted by the compiler.
 *
 * Returns an iterator-like object (`next` / `throw` / `return`, plus
 * `Symbol.iterator` when available) that repeatedly calls `body` with a
 * shared state object `_` (`label`, `sent`, `trys`, `ops`). `body` returns
 * `[opcode, value]` instructions that `step` interprets.
 *
 * NOTE(review): the opcode meanings (0/1 next/throw, 2 return, 3 break,
 * 4 yield, 5 yield*, 6 catch, 7 endfinally) follow the upstream tslib
 * convention and are not spelled out in this file — confirm against tslib
 * before relying on them.
 */
function __generator(thisArg, body) {
    var _ = { label: 0, sent: function() { if (t[0] & 1) throw t[1]; return t[1]; }, trys: [], ops: [] }, f, y, t, g = Object.create((typeof Iterator === "function" ? Iterator : Object).prototype);
    return g.next = verb(0), g["throw"] = verb(1), g["return"] = verb(2), typeof Symbol === "function" && (g[Symbol.iterator] = function() { return this; }), g;
    function verb(n) { return function (v) { return step([n, v]); }; }
    function step(op) {
        // `f` is the re-entrancy flag: it is set while the loop below runs.
        if (f) throw new TypeError("Generator is already executing.");
        while (g && (g = 0, op[0] && (_ = 0)), _) try {
            // `y` is a delegated iterator; when set, forward the operation to it first.
            if (f = 1, y && (t = op[0] & 2 ? y["return"] : op[0] ? y["throw"] || ((t = y["return"]) && t.call(y), 0) : y.next) && !(t = t.call(y, op[1])).done) return t;
            if (y = 0, t) op = [op[0] & 2, t.value];
            switch (op[0]) {
                case 0: case 1: t = op; break;
                case 4: _.label++; return { value: op[1], done: false };
                case 5: _.label++; y = op[1]; op = [0]; continue;
                case 7: op = _.ops.pop(); _.trys.pop(); continue;
                default:
                    if (!(t = _.trys, t = t.length > 0 && t[t.length - 1]) && (op[0] === 6 || op[0] === 2)) { _ = 0; continue; }
                    if (op[0] === 3 && (!t || (op[1] > t[0] && op[1] < t[3]))) { _.label = op[1]; break; }
                    if (op[0] === 6 && _.label < t[1]) { _.label = t[1]; t = op; break; }
                    if (t && _.label < t[2]) { _.label = t[2]; _.ops.push(op); break; }
                    if (t[2]) _.ops.pop();
                    _.trys.pop(); continue;
            }
            op = body.call(thisArg, _);
        } catch (e) { op = [6, e]; y = 0; } finally { f = t = 0; }
        if (op[0] & 5) throw op[1]; return { value: op[0] ? op[1] : void 0, done: true };
    }
}

// Leftover of the bundled `SuppressedError` fallback: the expression selects
// the native constructor or a minimal stand-in, but its value is not stored
// anywhere in this file, so the statement has no effect at runtime.
typeof SuppressedError === "function" ? SuppressedError : function (error, suppressed, message) {
    var e = new Error(message);
    return e.name = "SuppressedError", e.error = error, e.suppressed = suppressed, e;
};
/**
 * Performance modes for audio analysis optimization
 * - SPEED: Low latency, minimal CPU usage (recommended for real-time speech recognition)
 * - BALANCED: Optimal balance between performance and quality (default)
 * - QUALITY: Maximum accuracy for detailed audio analysis
 */
const PerformanceMode = {
  SPEED: "speed",
  BALANCED: "balanced",
  QUALITY: "quality",
};

/**
 * Computes min / max / average / sum of a numeric array in a single pass.
 *
 * @param {number[]} values - Samples to summarise.
 * @returns {{min: number, max: number, average: number, sum: number}}
 *   `min`, `max` and `average` are rounded to two decimals; `sum` is exact.
 *   All four are `0` for an empty array.
 */
const calculateArrayStats = (values) => {
  if (values.length === 0) {
    return { min: 0, max: 0, average: 0, sum: 0 };
  }

  let min = values[0];
  let max = values[0];
  let sum = 0;
  for (const value of values) {
    sum += value;
    if (value < min) min = value;
    if (value > max) max = value;
  }

  return {
    min: parseFloat(min.toFixed(2)),
    max: parseFloat(max.toFixed(2)),
    average: parseFloat((sum / values.length).toFixed(2)),
    sum,
  };
};

/**
 * Summarises the pitch samples that fall inside one word's time window.
 *
 * Samples with `pitch <= 0` are treated as "no pitch detected" and ignored.
 * When the window has no usable sample, values are synthesised around
 * `sessionPitchAverage` using `Math.random()` (so that path is not
 * deterministic); when that average is falsy as well, all metrics are `0`.
 *
 * @param {{time: number, pitch: number}[]} pitchData - Session pitch samples.
 * @param {number} wordStartTime - Inclusive window start.
 * @param {number} wordEndTime - Inclusive window end.
 * @param {number} sessionPitchAverage - Fallback centre value.
 * @returns {{startPitch: number, endPitch: number, peakPitch: number,
 *   minPitch: number, averagePitch: number}}
 */
const processPitchData = (pitchData, wordStartTime, wordEndTime, sessionPitchAverage) => {
  const voiced = pitchData.filter(
    ({ time, pitch }) => time >= wordStartTime && time <= wordEndTime && pitch > 0,
  );

  if (voiced.length > 0) {
    const stats = calculateArrayStats(voiced.map(({ pitch }) => pitch));
    return {
      startPitch: parseFloat(voiced[0].pitch.toFixed(1)),
      endPitch: parseFloat(voiced[voiced.length - 1].pitch.toFixed(1)),
      peakPitch: stats.max,
      minPitch: stats.min,
      averagePitch: stats.average,
    };
  }

  // Fallback to session average with variation
  if (sessionPitchAverage) {
    const variation = 10 + Math.random() * 20;
    return {
      startPitch: parseFloat(Math.max(60, sessionPitchAverage - variation).toFixed(1)),
      endPitch: parseFloat(Math.max(60, sessionPitchAverage + (Math.random() - 0.5) * 10).toFixed(1)),
      peakPitch: parseFloat(Math.min(800, sessionPitchAverage + variation).toFixed(1)),
      minPitch: parseFloat(Math.max(60, sessionPitchAverage - variation).toFixed(1)),
      averagePitch: parseFloat(sessionPitchAverage.toFixed(1)),
    };
  }

  return { startPitch: 0, endPitch: 0, peakPitch: 0, minPitch: 0, averagePitch: 0 };
};

/**
 * Builds per-word volume, pitch and spectral-centroid metrics.
 *
 * The recognised words carry no timestamps, so the session duration is split
 * evenly between them and every sample whose `time` lies in a word's
 * (inclusive) window is attributed to that word. Three cases per word:
 *  1. the window holds volume samples: metrics are measured from the window;
 *  2. the window is empty but the session has volume samples: session-wide
 *     averages / extremes are used instead;
 *  3. the session has no volume samples at all: plausible-looking random
 *     values are generated.
 *
 * @param {string[]} words - Recognised words, in spoken order.
 * @param {number} actualDuration - Session duration, in the samples' time unit.
 * @param {{time: number, volume: number}[]} volumeData
 * @param {{time: number, pitch: number}[]} pitchData
 * @param {{time: number, spectralCentroid: number}[]} spectralCentroidData
 * @returns {object[]} One metadata record per word (empty for no words).
 */
const generateWordMetadata = (words, actualDuration, volumeData, pitchData, spectralCentroidData) => {
  if (!words.length) return [];

  const timePerWord = actualDuration / words.length;

  // Mean accumulated as a sum of `value / n` terms; yields 0 for an empty list.
  const meanOf = (values) => values.reduce((acc, value) => acc + value / values.length, 0);

  // Session-wide figures used by the fallbacks. They do not depend on the
  // word, so they are computed once here instead of once per word.
  const sessionVolumes = volumeData.map(({ volume }) => volume);
  const sessionVolumeAverage = meanOf(sessionVolumes);
  const sessionVolumeStats = calculateArrayStats(sessionVolumes);
  const sessionPitchAverage = meanOf(
    pitchData.map(({ pitch }) => pitch).filter((pitch) => pitch > 0),
  );
  const sessionSpectralCentroidAverage = meanOf(
    spectralCentroidData
      .map(({ spectralCentroid }) => spectralCentroid)
      .filter((centroid) => centroid > 0),
  );
  const hasSessionData = volumeData.length > 0;

  return words.map((word, index) => {
    const startTime = Math.round(timePerWord * index);
    const endTime = Math.round(timePerWord * (index + 1));
    const inWindow = ({ time }) => time >= startTime && time <= endTime;

    const windowVolumes = volumeData.filter(inWindow).map(({ volume }) => volume);

    if (windowVolumes.length > 0) {
      // Case 1: measured data for this word.
      const volumeStats = calculateArrayStats(windowVolumes);
      const pitchMetrics = processPitchData(pitchData, startTime, endTime, sessionPitchAverage);

      // Centroids <= 0 mean "no reading"; with none left every metric is 0
      // (calculateArrayStats returns zeros for an empty list).
      const centroids = spectralCentroidData
        .filter(inWindow)
        .map(({ spectralCentroid }) => spectralCentroid)
        .filter((centroid) => centroid > 0);
      const centroidStats = calculateArrayStats(centroids);
      const hasCentroids = centroids.length > 0;

      return {
        word,
        startTime,
        endTime,
        startVolume: parseFloat(windowVolumes[0].toFixed(2)),
        endVolume: parseFloat(windowVolumes[windowVolumes.length - 1].toFixed(2)),
        peakVolume: volumeStats.max,
        minVolume: volumeStats.min,
        averageVolume: volumeStats.average,
        startPitch: pitchMetrics.startPitch,
        endPitch: pitchMetrics.endPitch,
        peakPitch: pitchMetrics.peakPitch,
        minPitch: pitchMetrics.minPitch,
        averagePitch: pitchMetrics.averagePitch,
        startSpectralCentroid: hasCentroids ? parseFloat(centroids[0].toFixed(1)) : 0,
        endSpectralCentroid: hasCentroids
          ? parseFloat(centroids[centroids.length - 1].toFixed(1))
          : 0,
        peakSpectralCentroid: centroidStats.max,
        minSpectralCentroid: centroidStats.min,
        averageSpectralCentroid: centroidStats.average,
      };
    }

    if (hasSessionData) {
      // Case 2: no samples in this window; fall back to session-wide values.
      const pitchMetrics = processPitchData(pitchData, startTime, endTime, sessionPitchAverage);
      return {
        word,
        startTime,
        endTime,
        startVolume: sessionVolumeAverage,
        endVolume: sessionVolumeAverage,
        peakVolume: sessionVolumeStats.max,
        minVolume: sessionVolumeStats.min,
        averageVolume: sessionVolumeAverage,
        startPitch: pitchMetrics.startPitch,
        endPitch: pitchMetrics.endPitch,
        peakPitch: pitchMetrics.peakPitch,
        minPitch: pitchMetrics.minPitch,
        averagePitch: pitchMetrics.averagePitch,
        startSpectralCentroid: sessionSpectralCentroidAverage,
        endSpectralCentroid: sessionSpectralCentroidAverage,
        peakSpectralCentroid: sessionSpectralCentroidAverage * 1.2,
        minSpectralCentroid: sessionSpectralCentroidAverage * 0.8,
        averageSpectralCentroid: sessionSpectralCentroidAverage,
      };
    }

    // Case 3: nothing was recorded; generate realistic synthetic data.
    const baseVolume = 15 + Math.random() * 25;
    const basePitch = 120 + Math.random() * 150;
    const baseSpectralCentroid = 1000 + Math.random() * 1500;
    return {
      word,
      startTime,
      endTime,
      startVolume: parseFloat((baseVolume * 0.9).toFixed(2)),
      endVolume: parseFloat((baseVolume * 1.1).toFixed(2)),
      peakVolume: parseFloat((baseVolume * 1.3).toFixed(2)),
      minVolume: parseFloat((baseVolume * 0.7).toFixed(2)),
      averageVolume: parseFloat(baseVolume.toFixed(2)),
      startPitch: parseFloat((basePitch * 0.95).toFixed(1)),
      endPitch: parseFloat((basePitch * 1.05).toFixed(1)),
      peakPitch: parseFloat((basePitch * 1.2).toFixed(1)),
      minPitch: parseFloat(Math.max(80, basePitch * 0.8).toFixed(1)),
      averagePitch: parseFloat(basePitch.toFixed(1)),
      startSpectralCentroid: parseFloat((baseSpectralCentroid * 0.9).toFixed(1)),
      endSpectralCentroid: parseFloat((baseSpectralCentroid * 1.1).toFixed(1)),
      peakSpectralCentroid: parseFloat((baseSpectralCentroid * 1.3).toFixed(1)),
      minSpectralCentroid: parseFloat(Math.max(500, baseSpectralCentroid * 0.7).toFixed(1)),
      averageSpectralCentroid: parseFloat(baseSpectralCentroid.toFixed(1)),
    };
  });
};

/**
 * Builds the summary record for a finished recognition session.
 *
 * The end timestamp is read exactly once, so that
 * `sessionEndTime - sessionStartTime === totalDuration` always holds.
 *
 * @param {string[]} words - Recognised words, in spoken order.
 * @param {number} sessionStartTime - Epoch milliseconds at session start.
 * @param {{time: number, volume: number}[]} volumeData
 * @param {{time: number, pitch: number}[]} pitchData
 * @param {{time: number, spectralCentroid: number}[]} spectralCentroidData
 * @returns {object} Session timings, per-word metadata and overall statistics.
 */
const generateSessionMetadata = (words, sessionStartTime, volumeData, pitchData, spectralCentroidData) => {
  const sessionEndTime = Date.now();
  const totalDuration = sessionEndTime - sessionStartTime;

  const wordMetadata = generateWordMetadata(
    words,
    totalDuration,
    volumeData,
    pitchData,
    spectralCentroidData,
  );

  const volumeStats = calculateArrayStats(volumeData.map(({ volume }) => volume));
  const pitchStats = calculateArrayStats(
    pitchData.map(({ pitch }) => pitch).filter((pitch) => pitch > 0),
  );
  const spectralCentroidStats = calculateArrayStats(
    spectralCentroidData
      .map(({ spectralCentroid }) => spectralCentroid)
      .filter((centroid) => centroid > 0),
  );

  return {
    sessionStartTime,
    sessionEndTime,
    totalDuration,
    words: wordMetadata,
    overallAverageVolume: volumeStats.average,
    overallPeakVolume: volumeStats.max,
    overallMinVolume: volumeStats.min,
    overallAveragePitch: parseFloat(pitchStats.average.toFixed(1)),
    overallPeakPitch: parseFloat(pitchStats.max.toFixed(1)),
    overallMinPitch: parseFloat(pitchStats.min.toFixed(1)),
    overallAverageSpectralCentroid: parseFloat(spectralCentroidStats.average.toFixed(1)),
    overallPeakSpectralCentroid: parseFloat(spectralCentroidStats.max.toFixed(1)),
    overallMinSpectralCentroid: parseFloat(spectralCentroidStats.min.toFixed(1)),
  };
};
/**
 * Picks the right-hand limit of the chart's time axis.
 *
 * When the session has word metadata, the chart stops at the last word's
 * `endTime` (or `0` when that is missing); otherwise it stops at the largest
 * sample time. The maximum is found with a loop rather than
 * `Math.max.apply(...)`, which throws a RangeError once the sample array
 * exceeds the engine's argument-count limit (long recordings).
 *
 * @param {{time: number}[]} samples - Non-empty list of samples.
 * @param {{words: {endTime: number}[]}} [sessionMetadata]
 * @returns {number}
 */
const resolveChartMaxTime = (samples, sessionMetadata) => {
  if (sessionMetadata?.words.length) {
    return sessionMetadata.words[sessionMetadata.words.length - 1]?.endTime || 0;
  }
  let latest = -Infinity;
  for (const { time } of samples) {
    if (time > latest) latest = time;
  }
  return latest;
};

/**
 * Averages `sample[valueKey]` over fixed-width time buckets.
 *
 * @param {object[]} samples - Samples carrying a numeric `time`.
 * @param {string} valueKey - Property holding the value to average.
 * @param {number} maxTime - Buckets starting after this time are dropped.
 * @param {number} intervalMs - Bucket width.
 * @param {(value: number) => boolean} [isValid] - Optional per-value filter.
 * @returns {{time: number, average: number}[]} Bucket midpoints with their
 *   unrounded averages, sorted by time.
 */
const averageSamplesByTimeBucket = (samples, valueKey, maxTime, intervalMs, isValid = () => true) => {
  const totals = new Map(); // bucket start -> { sum, count }

  for (const sample of samples) {
    const value = sample[valueKey];
    if (!isValid(value)) continue;

    const bucketStart = Math.floor(sample.time / intervalMs) * intervalMs;
    // Written as a negated `<=` so that a NaN limit rejects every bucket.
    if (!(bucketStart <= maxTime)) continue;

    const total = totals.get(bucketStart);
    if (total) {
      total.sum += value;
      total.count += 1;
    } else {
      totals.set(bucketStart, { sum: 0 + value, count: 1 });
    }
  }

  return Array.from(totals, ([bucketStart, { sum, count }]) => ({
    time: bucketStart + intervalMs / 2,
    average: sum / count,
  })).sort((a, b) => a.time - b.time);
};

/**
 * Down-samples the raw volume samples to one averaged point per 100 ms.
 *
 * @param {{time: number, volume: number}[]} volumeData
 * @param {{words: {endTime: number}[]}} [sessionMetadata] - When it has
 *   words, samples after the last word's end are left out.
 * @returns {{time: number, averageVolume: number}[]} Points at bucket
 *   midpoints, sorted by time, averages rounded to two decimals.
 */
const getAverageVolumeData = (volumeData, sessionMetadata) => {
  if (volumeData.length === 0) return [];

  const intervalMs = 100;
  const maxTime = resolveChartMaxTime(volumeData, sessionMetadata);

  return averageSamplesByTimeBucket(volumeData, "volume", maxTime, intervalMs).map(
    ({ time, average }) => ({ time, averageVolume: parseFloat(average.toFixed(2)) }),
  );
};

/**
 * Computes words-per-minute over a 2 s window that slides in 500 ms steps.
 *
 * A word counts towards a window when its `endTime` lies in
 * `(time - 2000, time]`; windows without words produce no point. When no
 * window produced a point, the session-wide average rate is reported at the
 * middle and at the end of the session instead. A session whose last word
 * ends at time 0 has no measurable rate and yields an empty list (the
 * average would otherwise be a division by zero).
 *
 * @param {{words: {endTime: number}[]}} [sessionMetadata]
 * @returns {{time: number, wpm: number}[]}
 */
const getSpeechRateData = (sessionMetadata) => {
  if (!sessionMetadata?.words.length) return [];

  const { words } = sessionMetadata;
  const intervalMs = 2000;
  const stepMs = 500;
  const maxTime = words[words.length - 1]?.endTime || 0;
  const windowMinutes = intervalMs / (1000 * 60);
  const speechRateData = [];

  for (let time = intervalMs; time <= maxTime; time += stepMs) {
    const windowStart = time - intervalMs;
    const wordCount = words.filter(
      ({ endTime }) => endTime > windowStart && endTime <= time,
    ).length;
    if (wordCount > 0) {
      speechRateData.push({ time, wpm: parseFloat((wordCount / windowMinutes).toFixed(1)) });
    }
  }

  // Fallback for sparse data
  if (speechRateData.length === 0) {
    if (!(maxTime > 0)) return [];
    const wpm = parseFloat((words.length / (maxTime / (1000 * 60))).toFixed(1));
    speechRateData.push({ time: maxTime / 2, wpm }, { time: maxTime, wpm });
  }

  return speechRateData;
};

/**
 * Down-samples the raw pitch samples to one averaged point per 100 ms.
 *
 * Samples with `pitch <= 0` (no pitch detected) are ignored, so buckets
 * without any detected pitch produce no point.
 *
 * @param {{time: number, pitch: number}[]} pitchData
 * @param {{words: {endTime: number}[]}} [sessionMetadata] - When it has
 *   words, samples after the last word's end are left out.
 * @returns {{time: number, averagePitch: number}[]} Points at bucket
 *   midpoints, sorted by time, averages rounded to one decimal.
 */
const getAveragePitchData = (pitchData, sessionMetadata) => {
  if (pitchData.length === 0) return [];

  const intervalMs = 100;
  const maxTime = resolveChartMaxTime(pitchData, sessionMetadata);

  return averageSamplesByTimeBucket(
    pitchData,
    "pitch",
    maxTime,
    intervalMs,
    (pitch) => pitch > 0,
  ).map(({ time, average }) => ({ time, averagePitch: parseFloat(average.toFixed(1)) }));
};

/**
 * Bundles the three chart series (volume, pitch, speech rate) for a session.
 *
 * @param {{time: number, volume: number}[]} volumeData
 * @param {{time: number, pitch: number}[]} pitchData
 * @param {{words: {endTime: number}[]}} [sessionMetadata] - Without it the
 *   speech-rate series is empty.
 * @returns {{volumeData: object[], pitchData: object[], speechRateData: object[]}}
 */
const generateChartData = (volumeData, pitchData, sessionMetadata) => ({
  volumeData: getAverageVolumeData(volumeData, sessionMetadata),
  pitchData: getAveragePitchData(pitchData, sessionMetadata),
  speechRateData: sessionMetadata ? getSpeechRateData(sessionMetadata) : [],
});
/**
 * CommonJS interop helper: returns `x.default` when `x` is flagged as a
 * transpiled ES module (`__esModule`) that owns a `default` property,
 * otherwise returns `x` itself.
 */
function getDefaultExportFromCjs (x) {
  return x && x.__esModule && Object.prototype.hasOwnProperty.call(x, 'default') ? x['default'] : x;
}

/**
 * Radix-4 FFT over interleaved complex arrays (`[re0, im0, re1, im1, ...]`).
 *
 * The constructor precomputes, for the given size:
 *  - `table`: interleaved `cos(angle)` / `-sin(angle)` values,
 *  - `_width` / `_bitrev`: the permutation used by the initial step.
 *
 * @param {number} size - Number of complex points; must be a power of two
 *   greater than 1.
 * @throws {Error} When `size` is not a power of two greater than 1.
 */
function FFT(size) {
  this.size = size | 0;
  if (this.size <= 1 || (this.size & (this.size - 1)) !== 0)
    throw new Error('FFT size must be a power of two and bigger than 1');

  // Length of an interleaved complex array holding `size` points.
  this._csize = size << 1;

  // NOTE: Use of `var` is intentional for old V8 versions
  var table = new Array(this.size * 2);
  for (var i = 0; i < table.length; i += 2) {
    const angle = Math.PI * i / this.size;
    table[i] = Math.cos(angle);
    table[i + 1] = -Math.sin(angle);
  }
  this.table = table;

  // Find size's power of two
  var power = 0;
  for (var t = 1; this.size > t; t <<= 1)
    power++;

  // Calculate initial step's width:
  //   * If we are full radix-4 - it is 2x smaller to give initial len=8
  //   * Otherwise it is the same as `power` to give len=4
  this._width = power % 2 === 0 ? power - 1 : power;

  // Pre-compute bit-reversal patterns
  this._bitrev = new Array(1 << this._width);
  for (var j = 0; j < this._bitrev.length; j++) {
    this._bitrev[j] = 0;
    for (var shift = 0; shift < this._width; shift += 2) {
      var revShift = this._width - shift - 2;
      this._bitrev[j] |= ((j >>> shift) & 3) << revShift;
    }
  }

  // Per-call state, set by the public transform methods and cleared afterwards.
  this._out = null;
  this._data = null;
  this._inv = 0;
}
var fft = FFT;

/**
 * Extracts the real parts (even indices) of an interleaved complex array.
 * Writes into `storage` when given, otherwise into a new array.
 */
FFT.prototype.fromComplexArray = function fromComplexArray(complex, storage) {
  var res = storage || new Array(complex.length >>> 1);
  for (var i = 0; i < complex.length; i += 2)
    res[i >>> 1] = complex[i];
  return res;
};

/** Creates a zero-filled interleaved complex array of length `2 * size`. */
FFT.prototype.createComplexArray = function createComplexArray() {
  const res = new Array(this._csize);
  for (var i = 0; i < res.length; i++)
    res[i] = 0;
  return res;
};

/**
 * Converts a real array into interleaved complex form (imaginary parts 0).
 * Writes into `storage` when given, otherwise into a new complex array.
 */
FFT.prototype.toComplexArray = function toComplexArray(input, storage) {
  var res = storage || this.createComplexArray();
  for (var i = 0; i < res.length; i += 2) {
    res[i] = input[i >>> 1];
    res[i + 1] = 0;
  }
  return res;
};

/**
 * Fills the upper half of `spectrum` in place with the complex conjugates of
 * the mirrored lower-half entries.
 */
FFT.prototype.completeSpectrum = function completeSpectrum(spectrum) {
  var size = this._csize;
  var half = size >>> 1;
  for (var i = 2; i < half; i += 2) {
    spectrum[size - i] = spectrum[i];
    spectrum[size - i + 1] = -spectrum[i + 1];
  }
};

/**
 * Forward transform of interleaved complex `data` into `out`.
 * @throws {Error} When `out` and `data` are the same buffer.
 */
FFT.prototype.transform = function transform(out, data) {
  if (out === data)
    throw new Error('Input and output buffers must be different');

  this._out = out;
  this._data = data;
  this._inv = 0;
  this._transform4();
  this._out = null;
  this._data = null;
};

/**
 * Forward transform of real-valued `data` into interleaved complex `out`.
 * @throws {Error} When `out` and `data` are the same buffer.
 */
FFT.prototype.realTransform = function realTransform(out, data) {
  if (out === data)
    throw new Error('Input and output buffers must be different');

  this._out = out;
  this._data = data;
  this._inv = 0;
  this._realTransform4();
  this._out = null;
  this._data = null;
};

/**
 * Inverse transform of interleaved complex `data` into `out`; every output
 * entry is divided by `size` after the transform.
 * @throws {Error} When `out` and `data` are the same buffer.
 */
FFT.prototype.inverseTransform = function inverseTransform(out, data) {
  if (out === data)
    throw new Error('Input and output buffers must be different');

  this._out = out;
  this._data = data;
  this._inv = 1;
  this._transform4();
  for (var i = 0; i < out.length; i++)
    out[i] /= this.size;
  this._out = null;
  this._data = null;
};

// radix-4 implementation
//
// NOTE: Uses of `var` are intentional for older V8 version that do not
// support both `let compound assignments` and `const phi`
FFT.prototype._transform4 = function _transform4() {
  var out = this._out;
  var size = this._csize;

  // Initial step (permute and transform)
  var width = this._width;
  var step = 1 << width;
  var len = (size / step) << 1;

  var outOff;
  var t;
  var bitrev = this._bitrev;
  if (len === 4) {
    for (outOff = 0, t = 0; outOff < size; outOff += len, t++) {
      const off = bitrev[t];
      this._singleTransform2(outOff, off, step);
    }
  } else {
    // len === 8
    for (outOff = 0, t = 0; outOff < size; outOff += len, t++) {
      const off = bitrev[t];
      this._singleTransform4(outOff, off, step);
    }
  }

  // Loop through steps in decreasing order
  var inv = this._inv ? -1 : 1;
  var table = this.table;
  for (step >>= 2; step >= 2; step >>= 2) {
    len = (size / step) << 1;
    var quarterLen = len >>> 2;

    // Loop through offsets in the data
    for (outOff = 0; outOff < size; outOff += len) {
      // Full case
      var limit = outOff + quarterLen;
      for (var i = outOff, k = 0; i < limit; i += 2, k += step) {
        const A = i;
        const B = A + quarterLen;
        const C = B + quarterLen;
        const D = C + quarterLen;

        // Original values
        const Ar = out[A];
        const Ai = out[A + 1];
        const Br = out[B];
        const Bi = out[B + 1];
        const Cr = out[C];
        const Ci = out[C + 1];
        const Dr = out[D];
        const Di = out[D + 1];

        // Middle values
        const MAr = Ar;
        const MAi = Ai;

        const tableBr = table[k];
        const tableBi = inv * table[k + 1];
        const MBr = Br * tableBr - Bi * tableBi;
        const MBi = Br * tableBi + Bi * tableBr;

        const tableCr = table[2 * k];
        const tableCi = inv * table[2 * k + 1];
        const MCr = Cr * tableCr - Ci * tableCi;
        const MCi = Cr * tableCi + Ci * tableCr;

        const tableDr = table[3 * k];
        const tableDi = inv * table[3 * k + 1];
        const MDr = Dr * tableDr - Di * tableDi;
        const MDi = Dr * tableDi + Di * tableDr;

        // Pre-Final values
        const T0r = MAr + MCr;
        const T0i = MAi + MCi;
        const T1r = MAr - MCr;
        const T1i = MAi - MCi;
        const T2r = MBr + MDr;
        const T2i = MBi + MDi;
        const T3r = inv * (MBr - MDr);
        const T3i = inv * (MBi - MDi);

        // Final values
        const FAr = T0r + T2r;
        const FAi = T0i + T2i;

        const FCr = T0r - T2r;
        const FCi = T0i - T2i;

        const FBr = T1r + T3i;
        const FBi = T1i - T3r;

        const FDr = T1r - T3i;
        const FDi = T1i + T3r;

        out[A] = FAr;
        out[A + 1] = FAi;
        out[B] = FBr;
        out[B + 1] = FBi;
        out[C] = FCr;
        out[C + 1] = FCi;
        out[D] = FDr;
        out[D + 1] = FDi;
      }
    }
  }
};

// radix-2 implementation
//
// NOTE: Only called for len=4
FFT.prototype._singleTransform2 = function _singleTransform2(outOff, off, step) {
  const out = this._out;
  const data = this._data;

  const evenR = data[off];
  const evenI = data[off + 1];
  const oddR = data[off + step];
  const oddI = data[off + step + 1];

  const leftR = evenR + oddR;
  const leftI = evenI + oddI;
  const rightR = evenR - oddR;
  const rightI = evenI - oddI;

  out[outOff] = leftR;
  out[outOff + 1] = leftI;
  out[outOff + 2] = rightR;
  out[outOff + 3] = rightI;
};

// radix-4
//
// NOTE: Only called for len=8
FFT.prototype._singleTransform4 = function _singleTransform4(outOff, off, step) {
  const out = this._out;
  const data = this._data;
  const inv = this._inv ? -1 : 1;
  const step2 = step * 2;
  const step3 = step * 3;

  // Original values
  const Ar = data[off];
  const Ai = data[off + 1];
  const Br = data[off + step];
  const Bi = data[off + step + 1];
  const Cr = data[off + step2];
  const Ci = data[off + step2 + 1];
  const Dr = data[off + step3];
  const Di = data[off + step3 + 1];

  // Pre-Final values
  const T0r = Ar + Cr;
  const T0i = Ai + Ci;
  const T1r = Ar - Cr;
  const T1i = Ai - Ci;
  const T2r = Br + Dr;
  const T2i = Bi + Di;
  const T3r = inv * (Br - Dr);
  const T3i = inv * (Bi - Di);

  // Final values
  const FAr = T0r + T2r;
  const FAi = T0i + T2i;

  const FBr = T1r + T3i;
  const FBi = T1i - T3r;

  const FCr = T0r - T2r;
  const FCi = T0i - T2i;

  const FDr = T1r - T3i;
  const FDi = T1i + T3r;

  out[outOff] = FAr;
  out[outOff + 1] = FAi;
  out[outOff + 2] = FBr;
  out[outOff + 3] = FBi;
  out[outOff + 4] = FCr;
  out[outOff + 5] = FCi;
  out[outOff + 6] = FDr;
  out[outOff + 7] = FDi;
};

// Real input radix-4 implementation
FFT.prototype._realTransform4 = function _realTransform4() {
  var out = this._out;
  var size = this._csize;

  // Initial step (permute and transform)
  var width = this._width;
  var step = 1 << width;
  var len = (size / step) << 1;

  var outOff;
  var t;
  var bitrev = this._bitrev;
  if (len === 4) {
    for (outOff = 0, t = 0; outOff < size; outOff += len, t++) {
      const off = bitrev[t];
      // Offsets and steps are halved because the input is real, not interleaved.
      this._singleRealTransform2(outOff, off >>> 1, step >>> 1);
    }
  } else {
    // len === 8
    for (outOff = 0, t = 0; outOff < size; outOff += len, t++) {
      const off = bitrev[t];
      this._singleRealTransform4(outOff, off >>> 1, step >>> 1);
    }
  }

  // Loop through steps in decreasing order
  var inv = this._inv ? -1 : 1;
  var table = this.table;
  for (step >>= 2; step >= 2; step >>= 2) {
    len = (size / step) << 1;
    var halfLen = len >>> 1;
    var quarterLen = halfLen >>> 1;
    var hquarterLen = quarterLen >>> 1;

    // Loop through offsets in the data
    for (outOff = 0; outOff < size; outOff += len) {
      for (var i = 0, k = 0; i <= hquarterLen; i += 2, k += step) {
        var A = outOff + i;
        var B = A + quarterLen;
        var C = B + quarterLen;
        var D = C + quarterLen;

        // Original values
        var Ar = out[A];
        var Ai = out[A + 1];
        var Br = out[B];
        var Bi = out[B + 1];
        var Cr = out[C];
        var Ci = out[C + 1];
        var Dr = out[D];
        var Di = out[D + 1];

        // Middle values
        var MAr = Ar;
        var MAi = Ai;

        var tableBr = table[k];
        var tableBi = inv * table[k + 1];
        var MBr = Br * tableBr - Bi * tableBi;
        var MBi = Br * tableBi + Bi * tableBr;

        var tableCr = table[2 * k];
        var tableCi = inv * table[2 * k + 1];
        var MCr = Cr * tableCr - Ci * tableCi;
        var MCi = Cr * tableCi + Ci * tableCr;

        var tableDr = table[3 * k];
        var tableDi = inv * table[3 * k + 1];
        var MDr = Dr * tableDr - Di * tableDi;
        var MDi = Dr * tableDi + Di * tableDr;

        // Pre-Final values
        var T0r = MAr + MCr;
        var T0i = MAi + MCi;
        var T1r = MAr - MCr;
        var T1i = MAi - MCi;
        var T2r = MBr + MDr;
        var T2i = MBi + MDi;
        var T3r = inv * (MBr - MDr);
        var T3i = inv * (MBi - MDi);

        // Final values
        var FAr = T0r + T2r;
        var FAi = T0i + T2i;

        var FBr = T1r + T3i;
        var FBi = T1i - T3r;

        out[A] = FAr;
        out[A + 1] = FAi;
        out[B] = FBr;
        out[B + 1] = FBi;

        // Output final middle point
        if (i === 0) {
          var FCr = T0r - T2r;
          var FCi = T0i - T2i;
          out[C] = FCr;
          out[C + 1] = FCi;
          continue;
        }

        // Do not overwrite ourselves
        if (i === hquarterLen)
          continue;

        // In the flipped case:
        // MAi = -MAi
        // MBr=-MBi, MBi=-MBr
        // MCr=-MCr
        // MDr=MDi, MDi=MDr
        var ST0r = T1r;
        var ST0i = -T1i;
        var ST1r = T0r;
        var ST1i = -T0i;
        var ST2r = -inv * T3i;
        var ST2i = -inv * T3r;
        var ST3r = -inv * T2i;
        var ST3i = -inv * T2r;

        var SFAr = ST0r + ST2r;
        var SFAi = ST0i + ST2i;

        var SFBr = ST1r + ST3i;
        var SFBi = ST1i - ST3r;

        var SA = outOff + quarterLen - i;
        var SB = outOff + halfLen - i;

        out[SA] = SFAr;
        out[SA + 1] = SFAi;
        out[SB] = SFBr;
        out[SB + 1] = SFBi;
      }
    }
  }
};

// radix-2 implementation
//
// NOTE: Only called for len=4
FFT.prototype._singleRealTransform2 = function _singleRealTransform2(outOff, off, step) {
  const out = this._out;
  const data = this._data;

  const evenR = data[off];
  const oddR = data[off + step];

  const leftR = evenR + oddR;
  const rightR = evenR - oddR;

  out[outOff] = leftR;
  out[outOff + 1] = 0;
  out[outOff + 2] = rightR;
  out[outOff + 3] = 0;
};

// radix-4
//
// NOTE: Only called for len=8
FFT.prototype._singleRealTransform4 = function _singleRealTransform4(outOff, off, step) {
  const out = this._out;
  const data = this._data;
  const inv = this._inv ? -1 : 1;
  const step2 = step * 2;
  const step3 = step * 3;

  // Original values
  const Ar = data[off];
  const Br = data[off + step];
  const Cr = data[off + step2];
  const Dr = data[off + step3];

  // Pre-Final values
  const T0r = Ar + Cr;
  const T1r = Ar - Cr;
  const T2r = Br + Dr;
  const T3r = inv * (Br - Dr);

  // Final values
  const FAr = T0r + T2r;

  const FBr = T1r;
  const FBi = -T3r;

  const FCr = T0r - T2r;

  const FDr = T1r;
  const FDi = T3r;

  out[outOff] = FAr;
  out[outOff + 1] = 0;
  out[outOff + 2] = FBr;
  out[outOff + 3] = FBi;
  out[outOff + 4] = FCr;
  out[outOff + 5] = 0;
  out[outOff + 6] = FDr;
  out[outOff + 7] = FDi;
};

// Binding under which the rest of the bundle refers to the FFT constructor.
var FFT$1 = /*@__PURE__*/getDefaultExportFromCjs(fft);
/**
 * @typedef {Float32Array | Float64Array | number[]} Buffer One of the supported
 * buffer types. Other numeric array types may not work correctly.
 */

/**
 * Performs autocorrelation on input arrays of one fixed length.
 *
 * All working buffers are allocated once in the constructor, so repeated
 * calls need no further allocations (apart from the output buffer when the
 * caller does not supply one).
 *
 * @template {Buffer} T the buffer type to use. Inputs may be any array-like
 * type, but the output buffer (whether supplied by the caller or freshly
 * created) is always of this type.
 */
class Autocorrelator {
  /** @private @readonly @type {number} */
  _inputLength;
  /** @private @type {FFT} */
  _fft;
  /** @private @type {(size: number) => T} */
  _bufferSupplier;
  /** @private @type {T} */
  _paddedInputBuffer;
  /** @private @type {T} */
  _transformBuffer;
  /** @private @type {T} */
  _inverseBuffer;

  /**
   * Creates an {@link Autocorrelator} backed by {@link Float32Array} buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {Autocorrelator<Float32Array>}
   */
  static forFloat32Array(inputLength) {
    const makeBuffer = (length) => new Float32Array(length);
    return new Autocorrelator(inputLength, makeBuffer);
  }

  /**
   * Creates an {@link Autocorrelator} backed by {@link Float64Array} buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {Autocorrelator<Float64Array>}
   */
  static forFloat64Array(inputLength) {
    const makeBuffer = (length) => new Float64Array(length);
    return new Autocorrelator(inputLength, makeBuffer);
  }

  /**
   * Creates an {@link Autocorrelator} backed by plain `number[]` buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {Autocorrelator<number[]>}
   */
  static forNumberArray(inputLength) {
    const makeBuffer = (length) => Array(length);
    return new Autocorrelator(inputLength, makeBuffer);
  }

  /**
   * Builds an {@link Autocorrelator} for input arrays of exactly
   * `inputLength` entries.
   *
   * @param inputLength {number} the only input length this instance accepts
   * @param bufferSupplier {(length: number) => T} factory for the working
   * buffers; it receives the required length and returns a buffer of that
   * length, whose initial contents do not matter
   */
  constructor(inputLength, bufferSupplier) {
    if (inputLength < 1) {
      throw new Error(`Input length must be at least one`);
    }
    this._inputLength = inputLength;

    // The transform works on the input zero-padded to twice its length, and
    // the FFT implementation only accepts power-of-two sizes.
    const paddedSize = ceilPow2(2 * inputLength);
    this._fft = new FFT$1(paddedSize);
    this._bufferSupplier = bufferSupplier;

    // One real-valued buffer plus two interleaved complex buffers.
    this._paddedInputBuffer = this._bufferSupplier(this._fft.size);
    this._transformBuffer = this._bufferSupplier(2 * this._fft.size);
    this._inverseBuffer = this._bufferSupplier(2 * this._fft.size);
  }

  /**
   * The input length this instance was built for.
   *
   * @returns {number} the supported input length
   */
  get inputLength() {
    return this._inputLength;
  }

  /**
   * Computes the autocorrelation of `input`.
   *
   * @param input {ArrayLike<number>} the data to autocorrelate; its length
   * must equal {@link inputLength}
   * @param output {T} buffer receiving the result; a new one is created when
   * omitted
   * @returns {T} `output`
   */
  autocorrelate(input, output = this._bufferSupplier(input.length)) {
    const sampleCount = input.length;
    if (sampleCount !== this._inputLength) {
      throw new Error(
        `Input must have length ${this._inputLength} but had length ${input.length}`,
      );
    }

    const {
      _fft: fft,
      _paddedInputBuffer: padded,
      _transformBuffer: spectrum,
      _inverseBuffer: correlation,
    } = this;

    // Copy the samples and zero the remainder of the padded buffer.
    for (let i = 0; i < padded.length; i++) {
      padded[i] = i < sampleCount ? input[i] : 0;
    }

    // Forward transform; the real transform only fills half of the spectrum,
    // so the other half is completed explicitly.
    fft.realTransform(spectrum, padded);
    fft.completeSpectrum(spectrum);

    // Replace each bin by itself times its conjugate (squared magnitude).
    for (let re = 0; re < spectrum.length; re += 2) {
      const im = re + 1;
      spectrum[re] = spectrum[re] * spectrum[re] + spectrum[im] * spectrum[im];
      spectrum[im] = 0;
    }

    // The inverse transform of that product is the autocorrelation; it is
    // purely real, so only the real parts are copied out.
    fft.inverseTransform(correlation, spectrum);
    for (let i = 0; i < sampleCount; i++) {
      output[i] = correlation[2 * i];
    }
    return output;
  }
}
/**
 * Collects the positions of all key maxima in `input`.
 *
 * Following McLeod's paper, a key maximum is the highest value found between
 * a positively sloped zero crossing and the next negatively sloped one. Only
 * indices `1 .. input.length - 2` are visited, and a maximum is reported only
 * once the closing (negative) crossing has actually been seen.
 *
 * A fresh result array is allocated on every call.
 *
 * @param input {ArrayLike<number>}
 * @returns {number[]}
 */
function getKeyMaximumIndices(input) {
  /** @type {number[]} */
  const peaks = [];
  // True while we are between a rising crossing and its falling counterpart.
  let insidePositiveLobe = false;
  // Best candidate seen since the most recent rising crossing.
  let bestValue = -Infinity;
  let bestIndex = -1;

  const lastVisited = input.length - 2;
  for (let pos = 1; pos <= lastVisited; pos++) {
    const previous = input[pos - 1];
    const current = input[pos];

    const risesThroughZero = previous <= 0 && current > 0;
    if (risesThroughZero) {
      // A new lobe starts here; its first sample is the initial candidate.
      insidePositiveLobe = true;
      bestIndex = pos;
      bestValue = current;
      continue;
    }

    const fallsThroughZero = previous > 0 && current <= 0;
    if (fallsThroughZero) {
      insidePositiveLobe = false;
      // A falling crossing with no preceding rising one has no candidate.
      if (bestIndex !== -1) {
        peaks.push(bestIndex);
      }
      continue;
    }

    if (insidePositiveLobe && current > bestValue) {
      bestValue = current;
      bestIndex = pos;
    }
  }

  return peaks;
}
/**
 * Refines a chosen key maximum index by fitting a parabola through the sample
 * at `index` and its two neighbours and locating that parabola's vertex.
 *
 * This is the refinement step described in section 5 of the MPM paper.
 *
 * If the three samples are collinear (the quadratic coefficient is exactly
 * zero) there is no vertex to find; in that case the unrefined position and
 * sample value are returned instead of dividing by zero and producing
 * `NaN`/`Infinity`.
 *
 * @param index {number} the chosen key maximum index. This must be between `1`
 * and `data.length - 2`, inclusive, since it and its two neighbours need to be
 * valid indexes of `data`.
 * @param data {ArrayLike<number>} the input array from which `index` was chosen
 * @returns {[number, number]} a pair consisting of the refined key maximum
 * index and the interpolated value of `data` at that index (the latter of
 * which is used as a measure of clarity)
 */
function refineResultIndex(index, data) {
  const [x0, x1, x2] = [index - 1, index, index + 1];
  const [y0, y1, y2] = [data[x0], data[x1], data[x2]];

  // The parabola going through the three data points can be written as
  // y = y0(x - x1)(x - x2)/(x0 - x1)(x0 - x2)
  //   + y1(x - x0)(x - x2)/(x1 - x0)(x1 - x2)
  //   + y2(x - x0)(x - x1)/(x2 - x0)(x2 - x1)
  // Because the x values are consecutive integers, the denominators reduce to
  // 2, -1 and 2, which gives the standard-form coefficients below.
  const a = y0 / 2 - y1 + y2 / 2;

  if (a === 0) {
    // Degenerate fit: the points lie on a straight line, so -b / 2a is
    // undefined. Fall back to the sample itself.
    return [index, y1];
  }

  const b = -(y0 / 2) * (x1 + x2) + y1 * (x0 + x2) - (y2 / 2) * (x0 + x1);
  const c = (y0 * x1 * x2) / 2 - y1 * x0 * x2 + (y2 * x0 * x1) / 2;

  // The vertex is where the derivative 2ax + b vanishes.
  const xMax = -b / (2 * a);
  const yMax = a * xMax * xMax + b * xMax + c;
  return [xMax, yMax];
}
*/ class PitchDetector { /** @private @type {Autocorrelator<T>} */ _autocorrelator; /** @private @type {T} */ _nsdfBuffer; /** @private @type {number} */ _clarityThreshold = 0.9; /** @private @type {number} */ _minVolumeAbsolute = 0.0; /** @private @type {number} */ _maxInputAmplitude = 1.0; /** * A helper method to create an {@link PitchDetector} using {@link Float32Array} buffers. * * @param inputLength {number} the input array length to support * @returns {PitchDetector<Float32Array>} */ static forFloat32Array(inputLength) { return new PitchDetector(inputLength, (length) => new Float32Array(length)); } /** * A helper method to create an {@link PitchDetector} using {@link Float64Array} buffers. * * @param inputLength {number} the input array length to support * @returns {PitchDetector<Float64Array>} */ static forFloat64Array(inputLength) { return new PitchDetector(inputLength, (length) => new Float64Array(length)); } /** * A helper method to create an {@link PitchDetector} using `number[]` buffers. * * @param inputLength {number} the input array length to support * @returns {PitchDetector<number[]>} */ static forNumberArray(inputLength) { return new PitchDetector(inputLength, (length) => Array(length)); } /** * Constructs a new {@link PitchDetector} able to handle input arrays of the * given length. * * @param inputLength {number} the input array length to support. This * `PitchDetector` will only support operation on arrays of this length. * @param bufferSupplier {(inputLength: number) => T} the function to use for * creating buffers, accepting the length of the buffer to create and * returning a new buffer of that length. The values of the returned buffer * need not be initialized in any particular way. */ constructor(inputLength, bufferSupplier) { this._autocorrelator = new Autocorrelator(inputLength, bufferSupplier); this._nsdfBuffer = bufferSupplier(inputLength); } /** * Returns the supported input length. 
* * @returns {number} the supported input length */ get inputLength() { return this._autocorrelator.inputLength; } /** * Sets the clarity threshold used when identifying the correct pitch (the constant * `k` from the MPM paper). The value must be between 0 (exclusive) and 1 * (inclusive), with the most suitable range being between 0.8 and 1. * * @param threshold {number} the clarity threshold */ set clarityThreshold(threshold) { if (!Number.isFinite(threshold) || threshold <= 0 || threshold > 1) { throw new Error("clarityThreshold must be a number in the range (0, 1]"); } this._clarityThreshold = threshold; } /** * Sets the minimum detectable volume, as an absolute number between 0 and * `maxInputAmplitude`, inclusive, to consider in a sample when detecting the * pitch. If a sample fails to meet this minimum volume, `findPitch` will * return a clarity of 0. * * Volume is calculated as the RMS (root mean square) of the input samples. * * @param volume {number} the minimum volume as an absolute amplitude value */ set minVolumeAbsolute(volume) { if ( !Number.isFinite(volume) || volume < 0 || volume > this._maxInputAmplitude ) { throw new Error( `minVolumeAbsolute must be a number in the range [0, ${this._maxInputAmplitude}]`, ); } this._minVolumeAbsolute = volume; } /** * Sets the minimum volume using a decibel measurement. Must be less than or * equal to 0: 0 indicates the loudest possible sound (see * `maxInputAmplitude`), -10 is a sound with a tenth of the volume of the * loudest possible sound, etc. * * Volume is calculated as the RMS (root mean square) of the input samples. * * @param db {number} the minimum volume in decibels, with 0 being the loudest * sound */ set minVolumeDecibels(db) { if (!Number.isFinite(db) || db > 0) { throw new Error("minVolumeDecibels must be a number <= 0"); } this._minVolumeAbsolute = this._maxInputAmplitude * 10 ** (db / 10); } /** * Sets the maximum amplitude of an input reading. Must be greater than 0. 
* * @param amplitude {number} the maximum amplitude (absolute value) of an input reading */ set maxInputAmplitude(amplitude) { if (!Number.isFinite(amplitude) || amplitude <= 0) { throw new Error("maxInputAmplitude must be a number > 0"); } this._maxInputAmplitude = amplitude; } /** * Returns the pitch detected using McLeod Pitch Method (MPM) along with a * measure of its clarity. * * The clarity is a value between 0 and 1 (potentially inclusive) that * represents how "clear" the pitch was. A clarity value of 1 indicates that * the pitch was very distinct, while lower clarity values indicate less * definite pitches. * * @param input {ArrayLike<number>} the time-domain input data * @param sampleRate {number} the sample rate at which the input data was * collected * @returns {[number, number]} the detected pitch, in Hz, followed by the * clarity. If a pitch cannot be determined from the input, such as if the * volume is too low (see `minVolumeAbsolute` and `minVolumeDecibels`), this * will be `[0, 0]`. */ findPitch(input, sampleRate) { // If the highest key maximum is less than the minimum volume, we don't need // to bother detecting the pitch, as the sample is too quiet. if (this._belowMinimumVolume(input)) return [0, 0]; this._nsdf(input); const keyMaximumIndices = getKeyMaximumIndices(this._nsdfBuffer); if (keyMaximumIndices.length === 0) { // No key maxima means that we either don't have enough data to analyze or // that the data was flawed (such as an input array of zeroes) return [0, 0]; } // The highest key maximum const nMax = Math.max(...keyMaximumIndices.map((i) => this._nsdfBuffer[i])); // Following the paper, we return the pitch corresponding to the first key // maximum higher than K * nMax. This is guaranteed not to be undefined, since // we know of at least one key maximum satisfying this condition (whichever // key maximum gave us nMax). 
const resultIndex = keyMaximumIndices.find( (i) => this._nsdfBuffer[i] >= this._clarityThreshold * nMax, ); const [refinedResultIndex, clarity] = refineResultIndex( // @ts-expect-error resultIndex is guaranteed to be defined resultIndex, this._nsdfBuffer, ); // Due to floating point errors, the clarity may occasionally come out to be // slightly over 1.0. We can avoid incorrect results by clamping the value. return [sampleRate / refinedResultIndex, Math.min(clarity, 1.0)]; } /** * Returns whether the input audio data is below the minimum volume allowed by * the pitch detector. * * @private * @param input {ArrayLike<number>} * @returns {boolean} */ _belowMinimumVolume(input) { if (this._minVolumeAbsolute === 0) return false; let squareSum = 0; for (let i = 0; i < input.length; i++) { squareSum += input[i] ** 2; } return Math.sq