UNPKG

pitchy

Version:

A simple pitch detection library.

507 lines (474 loc) 18.4 kB
import FFT from "fft.js"; /** * @typedef {Float32Array | Float64Array | number[]} Buffer One of the supported * buffer types. Other numeric array types may not work correctly. */ /** * A class that can perform autocorrelation on input arrays of a given size. * * The class holds internal buffers so that no additional allocations are * necessary while performing the operation. * * @template {Buffer} T the buffer type to use. While inputs to the * autocorrelation process can be any array-like type, the output buffer * (whether provided explicitly or using a fresh buffer) is always of this type. */ export class Autocorrelator { /** @private @readonly @type {number} */ _inputLength; /** @private @type {FFT} */ _fft; /** @private @type {(size: number) => T} */ _bufferSupplier; /** @private @type {T} */ _paddedInputBuffer; /** @private @type {T} */ _transformBuffer; /** @private @type {T} */ _inverseBuffer; /** * A helper method to create an {@link Autocorrelator} using * {@link Float32Array} buffers. * * @param inputLength {number} the input array length to support * @returns {Autocorrelator<Float32Array>} */ static forFloat32Array(inputLength) { return new Autocorrelator( inputLength, (length) => new Float32Array(length), ); } /** * A helper method to create an {@link Autocorrelator} using * {@link Float64Array} buffers. * * @param inputLength {number} the input array length to support * @returns {Autocorrelator<Float64Array>} */ static forFloat64Array(inputLength) { return new Autocorrelator( inputLength, (length) => new Float64Array(length), ); } /** * A helper method to create an {@link Autocorrelator} using `number[]` * buffers. * * @param inputLength {number} the input array length to support * @returns {Autocorrelator<number[]>} */ static forNumberArray(inputLength) { return new Autocorrelator(inputLength, (length) => Array(length)); } /** * Constructs a new {@link Autocorrelator} able to handle input arrays of the * given length. * * @param inputLength {number} the input array length to support. This * `Autocorrelator` will only support operation on arrays of this length. * @param bufferSupplier {(length: number) => T} the function to use for * creating buffers, accepting the length of the buffer to create and * returning a new buffer of that length. The values of the returned buffer * need not be initialized in any particular way. */ constructor(inputLength, bufferSupplier) { if (inputLength < 1) { throw new Error(`Input length must be at least one`); } this._inputLength = inputLength; // We need to double the input length to get correct results, and the FFT // algorithm we use requires a length that's a power of 2 this._fft = new FFT(ceilPow2(2 * inputLength)); this._bufferSupplier = bufferSupplier; this._paddedInputBuffer = this._bufferSupplier(this._fft.size); this._transformBuffer = this._bufferSupplier(2 * this._fft.size); this._inverseBuffer = this._bufferSupplier(2 * this._fft.size); } /** * Returns the supported input length. * * @returns {number} the supported input length */ get inputLength() { return this._inputLength; } /** * Autocorrelates the given input data. * * @param input {ArrayLike<number>} the input data to autocorrelate * @param output {T} the output buffer into which to write the autocorrelated * data. If not provided, a new buffer will be created. * @returns {T} `output` */ autocorrelate(input, output = this._bufferSupplier(input.length)) { if (input.length !== this._inputLength) { throw new Error( `Input must have length ${this._inputLength} but had length ${input.length}`, ); } // Step 0: pad the input array with zeros for (let i = 0; i < input.length; i++) { this._paddedInputBuffer[i] = input[i]; } for (let i = input.length; i < this._paddedInputBuffer.length; i++) { this._paddedInputBuffer[i] = 0; } // Step 1: get the DFT of the input array this._fft.realTransform(this._transformBuffer, this._paddedInputBuffer); // We need to fill in the right half of the array too this._fft.completeSpectrum(this._transformBuffer); // Step 2: multiply each entry by its conjugate const tb = this._transformBuffer; for (let i = 0; i < tb.length; i += 2) { tb[i] = tb[i] * tb[i] + tb[i + 1] * tb[i + 1]; tb[i + 1] = 0; } // Step 3: perform the inverse transform this._fft.inverseTransform(this._inverseBuffer, this._transformBuffer); // This last result (the inverse transform) contains the autocorrelation // data, which is completely real for (let i = 0; i < input.length; i++) { output[i] = this._inverseBuffer[2 * i]; } return output; } } /** * Returns an array of all the key maximum positions in the given input array. * * In McLeod's paper, a key maximum is the highest maximum between a positively * sloped zero crossing and a negatively sloped one. * * TODO: it may be more efficient not to construct a new output array each time, * but that would also make the code more complicated (more so than the changes * that were needed to remove the other allocations). * * @param input {ArrayLike<number>} * @returns {number[]} */ function getKeyMaximumIndices(input) { // The indices of the key maxima /** @type {number[]} */ const keyIndices = []; // Whether the last zero crossing found was positively sloped; equivalently, // whether we're looking for a key maximum let lookingForMaximum = false; // The largest local maximum found so far let max = -Infinity; // The index of the largest local maximum so far let maxIndex = -1; for (let i = 1; i < input.length - 1; i++) { if (input[i - 1] <= 0 && input[i] > 0) { // Positively sloped zero crossing lookingForMaximum = true; maxIndex = i; max = input[i]; } else if (input[i - 1] > 0 && input[i] <= 0) { // Negatively sloped zero crossing lookingForMaximum = false; if (maxIndex !== -1) { keyIndices.push(maxIndex); } } else if (lookingForMaximum && input[i] > max) { max = input[i]; maxIndex = i; } } return keyIndices; } /** * Refines the chosen key maximum index chosen from the given data by * interpolating a parabola using the key maximum index and its two neighbors * and finding the position of that parabola's maximum value. * * This is described in section 5 of the MPM paper as a way to refine the * position of the maximum. * * @param index {number} the chosen key maximum index. This must be between `1` * and `data.length - 2`, inclusive, since it and its two neighbors need to be * valid indexes of `data`. * @param data {ArrayLike<number>} the input array from which `index` was chosen * @returns {[number, number]} a pair consisting of the refined key maximum index and the * interpolated value of `data` at that index (the latter of which is used as a * measure of clarity) */ function refineResultIndex(index, data) { const [x0, x1, x2] = [index - 1, index, index + 1]; const [y0, y1, y2] = [data[x0], data[x1], data[x2]]; // The parabola going through the three data points can be written as // y = y0(x - x1)(x - x2)/(x0 - x1)(x0 - x2) // + y1(x - x0)(x - x2)/(x1 - x0)(x1 - x2) // + y2(x - x0)(x - x1)/(x2 - x0)(x2 - x1) // Given the definitions of x0, x1, and x2, we can simplify the denominators: // y = y0(x - x1)(x - x2)/2 // - y1(x - x0)(x - x2) // + y2(x - x0)(x - x1)/2 // We can expand this out and get the coefficients in standard form: // a = y0/2 - y1 + y2/2 // b = -(y0/2)(x1 + x2) + y1(x0 + x2) - (y2/2)(x0 + x1) // c = y0x1x2/2 - y1x0x2 + y2x0x1/2 // The index of the maximum is -b / 2a (by solving for x where the derivative // is 0). const a = y0 / 2 - y1 + y2 / 2; const b = -(y0 / 2) * (x1 + x2) + y1 * (x0 + x2) - (y2 / 2) * (x0 + x1); const c = (y0 * x1 * x2) / 2 - y1 * x0 * x2 + (y2 * x0 * x1) / 2; const xMax = -b / (2 * a); const yMax = a * xMax * xMax + b * xMax + c; return [xMax, yMax]; } /** * A class that can detect the pitch of a note from a time-domain input array. * * This class uses the McLeod pitch method (MPM) to detect pitches. MPM is * described in the paper 'A Smarter Way to Find Pitch' by Philip McLeod and * Geoff Wyvill * (http://miracle.otago.ac.nz/tartini/papers/A_Smarter_Way_to_Find_Pitch.pdf). * * The class holds internal buffers so that a minimal number of additional * allocations are necessary while performing the operation. * * @template {Buffer} T the buffer type to use internally. Inputs to the * pitch-detection process can be any numeric array type. */ export class PitchDetector { /** @private @type {Autocorrelator<T>} */ _autocorrelator; /** @private @type {T} */ _nsdfBuffer; /** @private @type {number} */ _clarityThreshold = 0.9; /** @private @type {number} */ _minVolumeAbsolute = 0.0; /** @private @type {number} */ _maxInputAmplitude = 1.0; /** * A helper method to create an {@link PitchDetector} using {@link Float32Array} buffers. * * @param inputLength {number} the input array length to support * @returns {PitchDetector<Float32Array>} */ static forFloat32Array(inputLength) { return new PitchDetector(inputLength, (length) => new Float32Array(length)); } /** * A helper method to create an {@link PitchDetector} using {@link Float64Array} buffers. * * @param inputLength {number} the input array length to support * @returns {PitchDetector<Float64Array>} */ static forFloat64Array(inputLength) { return new PitchDetector(inputLength, (length) => new Float64Array(length)); } /** * A helper method to create an {@link PitchDetector} using `number[]` buffers. * * @param inputLength {number} the input array length to support * @returns {PitchDetector<number[]>} */ static forNumberArray(inputLength) { return new PitchDetector(inputLength, (length) => Array(length)); } /** * Constructs a new {@link PitchDetector} able to handle input arrays of the * given length. * * @param inputLength {number} the input array length to support. This * `PitchDetector` will only support operation on arrays of this length. * @param bufferSupplier {(inputLength: number) => T} the function to use for * creating buffers, accepting the length of the buffer to create and * returning a new buffer of that length. The values of the returned buffer * need not be initialized in any particular way. */ constructor(inputLength, bufferSupplier) { this._autocorrelator = new Autocorrelator(inputLength, bufferSupplier); this._nsdfBuffer = bufferSupplier(inputLength); } /** * Returns the supported input length. * * @returns {number} the supported input length */ get inputLength() { return this._autocorrelator.inputLength; } /** * Sets the clarity threshold used when identifying the correct pitch (the constant * `k` from the MPM paper). The value must be between 0 (exclusive) and 1 * (inclusive), with the most suitable range being between 0.8 and 1. * * @param threshold {number} the clarity threshold */ set clarityThreshold(threshold) { if (!Number.isFinite(threshold) || threshold <= 0 || threshold > 1) { throw new Error("clarityThreshold must be a number in the range (0, 1]"); } this._clarityThreshold = threshold; } /** * Sets the minimum detectable volume, as an absolute number between 0 and * `maxInputAmplitude`, inclusive, to consider in a sample when detecting the * pitch. If a sample fails to meet this minimum volume, `findPitch` will * return a clarity of 0. * * Volume is calculated as the RMS (root mean square) of the input samples. * * @param volume {number} the minimum volume as an absolute amplitude value */ set minVolumeAbsolute(volume) { if ( !Number.isFinite(volume) || volume < 0 || volume > this._maxInputAmplitude ) { throw new Error( `minVolumeAbsolute must be a number in the range [0, ${this._maxInputAmplitude}]`, ); } this._minVolumeAbsolute = volume; } /** * Sets the minimum volume using a decibel measurement. Must be less than or * equal to 0: 0 indicates the loudest possible sound (see * `maxInputAmplitude`), -10 is a sound with a tenth of the volume of the * loudest possible sound, etc. * * Volume is calculated as the RMS (root mean square) of the input samples. * * @param db {number} the minimum volume in decibels, with 0 being the loudest * sound */ set minVolumeDecibels(db) { if (!Number.isFinite(db) || db > 0) { throw new Error("minVolumeDecibels must be a number <= 0"); } this._minVolumeAbsolute = this._maxInputAmplitude * 10 ** (db / 10); } /** * Sets the maximum amplitude of an input reading. Must be greater than 0. * * @param amplitude {number} the maximum amplitude (absolute value) of an input reading */ set maxInputAmplitude(amplitude) { if (!Number.isFinite(amplitude) || amplitude <= 0) { throw new Error("maxInputAmplitude must be a number > 0"); } this._maxInputAmplitude = amplitude; } /** * Returns the pitch detected using McLeod Pitch Method (MPM) along with a * measure of its clarity. * * The clarity is a value between 0 and 1 (potentially inclusive) that * represents how "clear" the pitch was. A clarity value of 1 indicates that * the pitch was very distinct, while lower clarity values indicate less * definite pitches. * * @param input {ArrayLike<number>} the time-domain input data * @param sampleRate {number} the sample rate at which the input data was * collected * @returns {[number, number]} the detected pitch, in Hz, followed by the * clarity. If a pitch cannot be determined from the input, such as if the * volume is too low (see `minVolumeAbsolute` and `minVolumeDecibels`), this * will be `[0, 0]`. */ findPitch(input, sampleRate) { // If the highest key maximum is less than the minimum volume, we don't need // to bother detecting the pitch, as the sample is too quiet. if (this._belowMinimumVolume(input)) return [0, 0]; this._nsdf(input); const keyMaximumIndices = getKeyMaximumIndices(this._nsdfBuffer); if (keyMaximumIndices.length === 0) { // No key maxima means that we either don't have enough data to analyze or // that the data was flawed (such as an input array of zeroes) return [0, 0]; } // The highest key maximum const nMax = Math.max(...keyMaximumIndices.map((i) => this._nsdfBuffer[i])); // Following the paper, we return the pitch corresponding to the first key // maximum higher than K * nMax. This is guaranteed not to be undefined, since // we know of at least one key maximum satisfying this condition (whichever // key maximum gave us nMax). const resultIndex = keyMaximumIndices.find( (i) => this._nsdfBuffer[i] >= this._clarityThreshold * nMax, ); const [refinedResultIndex, clarity] = refineResultIndex( // @ts-expect-error resultIndex is guaranteed to be defined resultIndex, this._nsdfBuffer, ); // Due to floating point errors, the clarity may occasionally come out to be // slightly over 1.0. We can avoid incorrect results by clamping the value. return [sampleRate / refinedResultIndex, Math.min(clarity, 1.0)]; } /** * Returns whether the input audio data is below the minimum volume allowed by * the pitch detector. * * @private * @param input {ArrayLike<number>} * @returns {boolean} */ _belowMinimumVolume(input) { if (this._minVolumeAbsolute === 0) return false; let squareSum = 0; for (let i = 0; i < input.length; i++) { squareSum += input[i] ** 2; } return Math.sqrt(squareSum / input.length) < this._minVolumeAbsolute; } /** * Computes the NSDF of the input and stores it in the internal buffer. This * is equation (9) in the McLeod pitch method paper. * * @private * @param input {ArrayLike<number>} */ _nsdf(input) { // The function r'(tau) is the autocorrelation this._autocorrelator.autocorrelate(input, this._nsdfBuffer); // The function m'(tau) (defined in equation (6)) can be computed starting // with m'(0), which is equal to 2r'(0), and then iteratively modified to // get m'(1), m'(2), etc. For example, to get m'(1), we take m'(0) and // subtract x_0^2 and x_{W-1}^2. Then, to get m'(2), we take m'(1) and // subtract x_1^2 and x_{W-2}^2, and further values are similar (see the // note at the end of section 6 in the MPM paper). // // The resulting array values are 2 * r'(tau) / m'(tau). We use m below as // the incremental value of m'. let m = 2 * this._nsdfBuffer[0]; /** @type {number} */ let i; // As pointed out by issuefiler on GitHub, we can take advantage of the fact // that m will never increase to avoid division by zero by ending this loop // once m === 0. The rest of the array values after m becomes 0 will just be // set to 0 themselves. We actually check for m > 0 rather than m === 0 // because there may be small floating-point errors that cause m to become // negative rather than exactly 0. for (i = 0; i < this._nsdfBuffer.length && m > 0; i++) { this._nsdfBuffer[i] = (2 * this._nsdfBuffer[i]) / m; m -= input[i] ** 2 + input[input.length - i - 1] ** 2; } // If there are any array values remaining, it means m === 0 for those // values of tau, so we can just set them to 0 for (; i < this._nsdfBuffer.length; i++) { this._nsdfBuffer[i] = 0; } } } /** * Rounds up the input to the next power of 2. * * @param {number} v * @returns {number} the next power of 2 at least as large as `v` */ function ceilPow2(v) { // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2 v--; v |= v >> 1; v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; v++; return v; }