/*
 * pitchy
 * Version:
 * A simple pitch detection library.
 * 507 lines (474 loc) • 18.4 kB
 * JavaScript
 */
import FFT from "fft.js";
/**
* @typedef {Float32Array | Float64Array | number[]} Buffer One of the supported
* buffer types. Other numeric array types may not work correctly.
*/
/**
 * A class that can perform autocorrelation on input arrays of one fixed length.
 *
 * The constructor allocates every working buffer up front, so
 * {@link Autocorrelator#autocorrelate} allocates nothing itself except an
 * output buffer when the caller does not pass one.
 *
 * @template {Buffer} T the buffer type to use. While inputs to the
 * autocorrelation process can be any array-like type, the output buffer
 * (whether provided explicitly or using a fresh buffer) is always of this type.
 */
export class Autocorrelator {
  /**
   * The only input length this instance accepts.
   * @private @readonly @type {number}
   */
  _inputLength;
  /**
   * The transform used for both the forward and the inverse step.
   * @private @type {FFT}
   */
  _fft;
  /**
   * Creates a buffer of type `T` with the requested length.
   * @private @type {(size: number) => T}
   */
  _bufferSupplier;
  /**
   * The input followed by zero padding (`_fft.size` entries).
   * @private @type {T}
   */
  _paddedInputBuffer;
  /**
   * The forward transform as interleaved real/imaginary pairs
   * (`2 * _fft.size` entries).
   * @private @type {T}
   */
  _transformBuffer;
  /**
   * The inverse transform as interleaved real/imaginary pairs
   * (`2 * _fft.size` entries).
   * @private @type {T}
   */
  _inverseBuffer;

  /**
   * A helper method to create an {@link Autocorrelator} using
   * {@link Float32Array} buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {Autocorrelator<Float32Array>}
   */
  static forFloat32Array(inputLength) {
    return new Autocorrelator(
      inputLength,
      (length) => new Float32Array(length),
    );
  }

  /**
   * A helper method to create an {@link Autocorrelator} using
   * {@link Float64Array} buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {Autocorrelator<Float64Array>}
   */
  static forFloat64Array(inputLength) {
    return new Autocorrelator(
      inputLength,
      (length) => new Float64Array(length),
    );
  }

  /**
   * A helper method to create an {@link Autocorrelator} using `number[]`
   * buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {Autocorrelator<number[]>}
   */
  static forNumberArray(inputLength) {
    return new Autocorrelator(inputLength, (length) => Array(length));
  }

  /**
   * Constructs a new {@link Autocorrelator} able to handle input arrays of the
   * given length.
   *
   * @param inputLength {number} the input array length to support; must be an
   * integer of at least one. This `Autocorrelator` will only support operation
   * on arrays of this length.
   * @param bufferSupplier {(length: number) => T} the function to use for
   * creating buffers, accepting the length of the buffer to create and
   * returning a new buffer of that length. The values of the returned buffer
   * need not be initialized in any particular way.
   * @throws {Error} if `inputLength` is less than one or is not an integer
   */
  constructor(inputLength, bufferSupplier) {
    if (inputLength < 1) {
      throw new Error(`Input length must be at least one`);
    }
    // NaN, Infinity, fractional and non-number values slip past the comparison
    // above, yet none of them can ever equal an array's `length`, so every
    // later `autocorrelate` call would be rejected. Fail at construction
    // instead.
    if (!Number.isInteger(inputLength)) {
      throw new Error(`Input length must be an integer`);
    }
    this._inputLength = inputLength;
    // We need to double the input length to get correct results, and the FFT
    // algorithm we use requires a length that's a power of 2
    this._fft = new FFT(ceilPow2(2 * inputLength));
    this._bufferSupplier = bufferSupplier;
    this._paddedInputBuffer = this._bufferSupplier(this._fft.size);
    this._transformBuffer = this._bufferSupplier(2 * this._fft.size);
    this._inverseBuffer = this._bufferSupplier(2 * this._fft.size);
  }

  /**
   * Returns the supported input length.
   *
   * @returns {number} the supported input length
   */
  get inputLength() {
    return this._inputLength;
  }

  /**
   * Autocorrelates the given input data.
   *
   * @param input {ArrayLike<number>} the input data to autocorrelate; its
   * length must equal {@link Autocorrelator#inputLength}
   * @param output {T} the output buffer into which to write the autocorrelated
   * data. Entries `0` to `input.length - 1` are written. If not provided, a new
   * buffer of length `input.length` will be created.
   * @returns {T} `output`
   * @throws {Error} if `input` does not have the supported length
   */
  autocorrelate(input, output = this._bufferSupplier(input.length)) {
    if (input.length !== this._inputLength) {
      throw new Error(
        `Input must have length ${this._inputLength} but had length ${input.length}`,
      );
    }
    // Step 0: pad the input array with zeros. The padding is rewritten on every
    // call because the supplier's buffers are not assumed to start zeroed.
    for (let i = 0; i < input.length; i++) {
      this._paddedInputBuffer[i] = input[i];
    }
    for (let i = input.length; i < this._paddedInputBuffer.length; i++) {
      this._paddedInputBuffer[i] = 0;
    }
    // Step 1: get the DFT of the input array
    this._fft.realTransform(this._transformBuffer, this._paddedInputBuffer);
    // We need to fill in the right half of the array too
    this._fft.completeSpectrum(this._transformBuffer);
    // Step 2: multiply each entry by its conjugate. Entries are stored as
    // [real, imaginary] pairs, so the product is re^2 + im^2 with a zero
    // imaginary part.
    const tb = this._transformBuffer;
    for (let i = 0; i < tb.length; i += 2) {
      tb[i] = tb[i] * tb[i] + tb[i + 1] * tb[i + 1];
      tb[i + 1] = 0;
    }
    // Step 3: perform the inverse transform
    this._fft.inverseTransform(this._inverseBuffer, this._transformBuffer);
    // This last result (the inverse transform) contains the autocorrelation
    // data, which is completely real, so only the real half of each pair is
    // copied out
    for (let i = 0; i < input.length; i++) {
      output[i] = this._inverseBuffer[2 * i];
    }
    return output;
  }
}
/**
 * Collects the positions of the key maxima of the given input array.
 *
 * Following McLeod's paper, a key maximum is the highest value found between a
 * positively sloped zero crossing and the next negatively sloped one. A
 * negatively sloped crossing that has no positively sloped crossing before it
 * contributes nothing. The last element of the input is never examined.
 *
 * TODO: it may be more efficient not to construct a new output array each time,
 * but that would also make the code more complicated (more so than the changes
 * that were needed to remove the other allocations).
 *
 * @param input {ArrayLike<number>}
 * @returns {number[]}
 */
function getKeyMaximumIndices(input) {
  /** @type {number[]} */ const peaks = [];
  // True from a rising zero crossing until the next falling one
  let insideLobe = false;
  // Best value seen in the current lobe, and where it was seen (-1 until the
  // first rising crossing has been found)
  let bestValue = -Infinity;
  let bestIndex = -1;
  const end = input.length - 1;
  for (let pos = 1; pos < end; pos += 1) {
    const previous = input[pos - 1];
    const current = input[pos];
    const rising = previous <= 0 && current > 0;
    const falling = previous > 0 && current <= 0;
    if (rising) {
      // A new lobe starts here; its first sample is the best one so far
      insideLobe = true;
      bestIndex = pos;
      bestValue = current;
    } else if (falling) {
      // The lobe has ended; record its highest point if there was one
      insideLobe = false;
      if (bestIndex !== -1) {
        peaks.push(bestIndex);
      }
    } else if (insideLobe && current > bestValue) {
      bestValue = current;
      bestIndex = pos;
    }
  }
  return peaks;
}
/**
 * Refines the chosen key maximum index chosen from the given data by
 * interpolating a parabola using the key maximum index and its two neighbors
 * and finding the position of that parabola's maximum value.
 *
 * This is described in section 5 of the MPM paper as a way to refine the
 * position of the maximum.
 *
 * If the three points lie on a straight line there is no parabola to
 * interpolate, and the unrefined `index` and `data[index]` are returned as is.
 *
 * @param index {number} the chosen key maximum index. This must be between `1`
 * and `data.length - 2`, inclusive, since it and its two neighbors need to be
 * valid indexes of `data`.
 * @param data {ArrayLike<number>} the input array from which `index` was chosen
 * @returns {[number, number]} a pair consisting of the refined key maximum index and the
 * interpolated value of `data` at that index (the latter of which is used as a
 * measure of clarity)
 */
function refineResultIndex(index, data) {
  const [x0, x1, x2] = [index - 1, index, index + 1];
  const [y0, y1, y2] = [data[x0], data[x1], data[x2]];
  // The parabola going through the three data points can be written as
  // y = y0(x - x1)(x - x2)/(x0 - x1)(x0 - x2)
  // + y1(x - x0)(x - x2)/(x1 - x0)(x1 - x2)
  // + y2(x - x0)(x - x1)/(x2 - x0)(x2 - x1)
  // Given the definitions of x0, x1, and x2, we can simplify the denominators:
  // y = y0(x - x1)(x - x2)/2
  // - y1(x - x0)(x - x2)
  // + y2(x - x0)(x - x1)/2
  // We can expand this out and get the coefficients in standard form:
  // a = y0/2 - y1 + y2/2
  // b = -(y0/2)(x1 + x2) + y1(x0 + x2) - (y2/2)(x0 + x1)
  // c = y0x1x2/2 - y1x0x2 + y2x0x1/2
  const a = y0 / 2 - y1 + y2 / 2;
  // With a === 0 the "parabola" is a straight line and has no vertex; dividing
  // by 2a below would produce NaN or an infinite index. Keep the original
  // sample in that case.
  if (a === 0) {
    return [index, y1];
  }
  const b = -(y0 / 2) * (x1 + x2) + y1 * (x0 + x2) - (y2 / 2) * (x0 + x1);
  const c = (y0 * x1 * x2) / 2 - y1 * x0 * x2 + (y2 * x0 * x1) / 2;
  // The index of the maximum is -b / 2a (by solving for x where the derivative
  // is 0).
  const xMax = -b / (2 * a);
  const yMax = a * xMax * xMax + b * xMax + c;
  return [xMax, yMax];
}
/**
 * A class that can detect the pitch of a note from a time-domain input array.
 *
 * This class uses the McLeod pitch method (MPM) to detect pitches. MPM is
 * described in the paper 'A Smarter Way to Find Pitch' by Philip McLeod and
 * Geoff Wyvill
 * (http://miracle.otago.ac.nz/tartini/papers/A_Smarter_Way_to_Find_Pitch.pdf).
 *
 * The class holds internal buffers so that a minimal number of additional
 * allocations are necessary while performing the operation.
 *
 * @template {Buffer} T the buffer type to use internally. Inputs to the
 * pitch-detection process can be any numeric array type.
 */
export class PitchDetector {
  /**
   * Computes the autocorrelation that the NSDF is built from.
   * @private @type {Autocorrelator<T>}
   */
  _autocorrelator;
  /**
   * Holds the NSDF of the most recent input (one entry per input sample).
   * @private @type {T}
   */
  _nsdfBuffer;
  /**
   * The constant `k` from the MPM paper.
   * @private @type {number}
   */
  _clarityThreshold = 0.9;
  /**
   * RMS volume below which `findPitch` gives up; 0 disables the check.
   * @private @type {number}
   */
  _minVolumeAbsolute = 0.0;
  /**
   * The amplitude treated as the loudest possible input reading.
   * @private @type {number}
   */
  _maxInputAmplitude = 1.0;

  /**
   * A helper method to create an {@link PitchDetector} using {@link Float32Array} buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {PitchDetector<Float32Array>}
   */
  static forFloat32Array(inputLength) {
    return new PitchDetector(inputLength, (length) => new Float32Array(length));
  }

  /**
   * A helper method to create an {@link PitchDetector} using {@link Float64Array} buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {PitchDetector<Float64Array>}
   */
  static forFloat64Array(inputLength) {
    return new PitchDetector(inputLength, (length) => new Float64Array(length));
  }

  /**
   * A helper method to create an {@link PitchDetector} using `number[]` buffers.
   *
   * @param inputLength {number} the input array length to support
   * @returns {PitchDetector<number[]>}
   */
  static forNumberArray(inputLength) {
    return new PitchDetector(inputLength, (length) => Array(length));
  }

  /**
   * Constructs a new {@link PitchDetector} able to handle input arrays of the
   * given length.
   *
   * @param inputLength {number} the input array length to support. This
   * `PitchDetector` will only support operation on arrays of this length.
   * @param bufferSupplier {(inputLength: number) => T} the function to use for
   * creating buffers, accepting the length of the buffer to create and
   * returning a new buffer of that length. The values of the returned buffer
   * need not be initialized in any particular way.
   */
  constructor(inputLength, bufferSupplier) {
    this._autocorrelator = new Autocorrelator(inputLength, bufferSupplier);
    this._nsdfBuffer = bufferSupplier(inputLength);
  }

  /**
   * Returns the supported input length.
   *
   * @returns {number} the supported input length
   */
  get inputLength() {
    return this._autocorrelator.inputLength;
  }

  /**
   * Sets the clarity threshold used when identifying the correct pitch (the constant
   * `k` from the MPM paper). The value must be between 0 (exclusive) and 1
   * (inclusive), with the most suitable range being between 0.8 and 1.
   *
   * @param threshold {number} the clarity threshold
   * @throws {Error} if `threshold` is not a finite number in the range (0, 1]
   */
  set clarityThreshold(threshold) {
    if (!Number.isFinite(threshold) || threshold <= 0 || threshold > 1) {
      throw new Error("clarityThreshold must be a number in the range (0, 1]");
    }
    this._clarityThreshold = threshold;
  }

  /**
   * Sets the minimum detectable volume, as an absolute number between 0 and
   * `maxInputAmplitude`, inclusive, to consider in a sample when detecting the
   * pitch. If a sample fails to meet this minimum volume, `findPitch` will
   * return a clarity of 0.
   *
   * Volume is calculated as the RMS (root mean square) of the input samples.
   *
   * The upper bound is the `maxInputAmplitude` in effect when this setter
   * runs; changing `maxInputAmplitude` afterwards does not re-check or rescale
   * the stored value.
   *
   * @param volume {number} the minimum volume as an absolute amplitude value
   * @throws {Error} if `volume` is not a finite number in the allowed range
   */
  set minVolumeAbsolute(volume) {
    if (
      !Number.isFinite(volume) ||
      volume < 0 ||
      volume > this._maxInputAmplitude
    ) {
      throw new Error(
        `minVolumeAbsolute must be a number in the range [0, ${this._maxInputAmplitude}]`,
      );
    }
    this._minVolumeAbsolute = volume;
  }

  /**
   * Sets the minimum volume using a decibel measurement. Must be less than or
   * equal to 0: 0 indicates the loudest possible sound (see
   * `maxInputAmplitude`), -10 is a sound with a tenth of the volume of the
   * loudest possible sound, etc.
   *
   * The stored absolute minimum is `maxInputAmplitude * 10 ** (db / 10)`,
   * computed from the `maxInputAmplitude` in effect when this setter runs.
   *
   * Volume is calculated as the RMS (root mean square) of the input samples.
   *
   * @param db {number} the minimum volume in decibels, with 0 being the loudest
   * sound
   * @throws {Error} if `db` is not a finite number less than or equal to 0
   */
  set minVolumeDecibels(db) {
    if (!Number.isFinite(db) || db > 0) {
      throw new Error("minVolumeDecibels must be a number <= 0");
    }
    this._minVolumeAbsolute = this._maxInputAmplitude * 10 ** (db / 10);
  }

  /**
   * Sets the maximum amplitude of an input reading. Must be greater than 0.
   *
   * This only affects later calls to the `minVolumeAbsolute` and
   * `minVolumeDecibels` setters; a minimum volume that is already stored is
   * left unchanged.
   *
   * @param amplitude {number} the maximum amplitude (absolute value) of an input reading
   * @throws {Error} if `amplitude` is not a finite number greater than 0
   */
  set maxInputAmplitude(amplitude) {
    if (!Number.isFinite(amplitude) || amplitude <= 0) {
      throw new Error("maxInputAmplitude must be a number > 0");
    }
    this._maxInputAmplitude = amplitude;
  }

  /**
   * Returns the pitch detected using McLeod Pitch Method (MPM) along with a
   * measure of its clarity.
   *
   * The clarity is a value between 0 and 1 (potentially inclusive) that
   * represents how "clear" the pitch was. A clarity value of 1 indicates that
   * the pitch was very distinct, while lower clarity values indicate less
   * definite pitches.
   *
   * @param input {ArrayLike<number>} the time-domain input data
   * @param sampleRate {number} the sample rate at which the input data was
   * collected
   * @returns {[number, number]} the detected pitch, in Hz, followed by the
   * clarity. If a pitch cannot be determined from the input, such as if the
   * volume is too low (see `minVolumeAbsolute` and `minVolumeDecibels`), this
   * will be `[0, 0]`.
   */
  findPitch(input, sampleRate) {
    // If the RMS volume of the input is below the configured minimum, we don't
    // need to bother detecting the pitch, as the sample is too quiet.
    if (this._belowMinimumVolume(input)) return [0, 0];
    this._nsdf(input);
    const keyMaximumIndices = getKeyMaximumIndices(this._nsdfBuffer);
    if (keyMaximumIndices.length === 0) {
      // No key maxima means that we either don't have enough data to analyze or
      // that the data was flawed (such as an input array of zeroes)
      return [0, 0];
    }
    // The highest key maximum. A plain loop is used rather than spreading the
    // values into Math.max: spreading passes every value as a call argument,
    // which can fail for very large inputs, and it allocates a temporary array.
    let nMax = -Infinity;
    for (let k = 0; k < keyMaximumIndices.length; k++) {
      const value = this._nsdfBuffer[keyMaximumIndices[k]];
      if (value > nMax) {
        nMax = value;
      }
    }
    // Following the paper, we return the pitch corresponding to the first key
    // maximum higher than K * nMax. This is guaranteed not to be undefined, since
    // we know of at least one key maximum satisfying this condition (whichever
    // key maximum gave us nMax).
    const cutoff = this._clarityThreshold * nMax;
    const resultIndex = keyMaximumIndices.find(
      (i) => this._nsdfBuffer[i] >= cutoff,
    );
    const [refinedResultIndex, clarity] = refineResultIndex(
      // @ts-expect-error resultIndex is guaranteed to be defined
      resultIndex,
      this._nsdfBuffer,
    );
    // Due to floating point errors, the clarity may occasionally come out to be
    // slightly over 1.0. We can avoid incorrect results by clamping the value.
    return [sampleRate / refinedResultIndex, Math.min(clarity, 1.0)];
  }

  /**
   * Returns whether the input audio data is below the minimum volume allowed by
   * the pitch detector. The volume is the RMS of the samples; when the minimum
   * volume is exactly 0 the check is skipped and `false` is returned.
   *
   * @private
   * @param input {ArrayLike<number>}
   * @returns {boolean}
   */
  _belowMinimumVolume(input) {
    if (this._minVolumeAbsolute === 0) return false;
    let squareSum = 0;
    for (let i = 0; i < input.length; i++) {
      squareSum += input[i] ** 2;
    }
    return Math.sqrt(squareSum / input.length) < this._minVolumeAbsolute;
  }

  /**
   * Computes the NSDF of the input and stores it in the internal buffer. This
   * is equation (9) in the McLeod pitch method paper.
   *
   * @private
   * @param input {ArrayLike<number>}
   */
  _nsdf(input) {
    // The function r'(tau) is the autocorrelation
    this._autocorrelator.autocorrelate(input, this._nsdfBuffer);
    // The function m'(tau) (defined in equation (6)) can be computed starting
    // with m'(0), which is equal to 2r'(0), and then iteratively modified to
    // get m'(1), m'(2), etc. For example, to get m'(1), we take m'(0) and
    // subtract x_0^2 and x_{W-1}^2. Then, to get m'(2), we take m'(1) and
    // subtract x_1^2 and x_{W-2}^2, and further values are similar (see the
    // note at the end of section 6 in the MPM paper).
    //
    // The resulting array values are 2 * r'(tau) / m'(tau). We use m below as
    // the incremental value of m'.
    let m = 2 * this._nsdfBuffer[0];
    /** @type {number} */ let i;
    // As pointed out by issuefiler on GitHub, we can take advantage of the fact
    // that m will never increase to avoid division by zero by ending this loop
    // once m === 0. The rest of the array values after m becomes 0 will just be
    // set to 0 themselves. We actually check for m > 0 rather than m === 0
    // because there may be small floating-point errors that cause m to become
    // negative rather than exactly 0.
    for (i = 0; i < this._nsdfBuffer.length && m > 0; i++) {
      this._nsdfBuffer[i] = (2 * this._nsdfBuffer[i]) / m;
      m -= input[i] ** 2 + input[input.length - i - 1] ** 2;
    }
    // If there are any array values remaining, it means m === 0 for those
    // values of tau, so we can just set them to 0
    for (; i < this._nsdfBuffer.length; i++) {
      this._nsdfBuffer[i] = 0;
    }
  }
}
/**
 * Rounds the input up to a power of 2, leaving exact powers of 2 unchanged.
 *
 * The work is done with JavaScript's 32-bit bitwise operators, so the result
 * is only meaningful for positive integers up to `2 ** 31`.
 *
 * @param {number} v
 * @returns {number} the next power of 2 at least as large as `v`
 */
function ceilPow2(v) {
  // https://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
  // Subtracting one first keeps exact powers of 2 from being doubled
  let bits = v - 1;
  // Copy the highest set bit into every lower position...
  for (const shift of [1, 2, 4, 8, 16]) {
    bits |= bits >> shift;
  }
  // ...so that adding one carries into the next power of 2
  return bits + 1;
}