@qgustavor/stream-audio-fingerprint
Version:
Audio landmark fingerprinting as a JavaScript module
231 lines (230 loc) • 11.7 kB
JavaScript
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
// Copyright (c) 2018 Alexandre Storelli
// Online implementation of the landmark audio fingerprinting algorithm.
// inspired by D. Ellis (2009), "Robust Landmark-Based Audio Fingerprinting"
// http://labrosa.ee.columbia.edu/matlab/fingerprint/
// itself inspired by Wang 2003 paper
// This module exports Fingerprinter
// By default, the process method must be fed with an input signal with the following properties:
// - single channel
// - 16bit PCM
// - 22050 Hz sampling rate
//
// It will output objects of the form
// { tcodes: [time stamps], hcodes: [fingerprints] }
import FFT from "./lib/fft.mjs";
// Resolve the fingerprinter configuration: every field falls back to its default
// when the caller's value is null or undefined. Derived defaults (step, dt, hwin,
// ifMax, eww) are computed from the already-resolved fields and only when needed.
const buildOptions = (options) => {
  // Builds [valueAt(0), valueAt(1), ...] with the same length an `Array(size)` would have.
  const tabulate = (size, valueAt) => {
    const table = new Array(size);
    for (let k = 0; k < table.length; k += 1) {
      table[k] = valueAt(k);
    }
    return table;
  };

  const resolved = {};

  // When true, process() logs progress messages to the console.
  resolved.verbose = options.verbose ?? false;

  // Sampling rate in Hz. The Nyquist frequency (samplingRate / 2) caps the frequencies
  // that can produce landmarks. If you change it, adapt windowDt and pruningDt as well.
  resolved.samplingRate = options.samplingRate ?? 22050;

  // Bytes per sample: 2 for 16-bit PCM. Changing it requires changing the
  // 16-bit reads performed by the fingerprinter.
  resolved.bps = options.bps ?? 2;

  // Maximum number of local maxima kept per spectrum; tunes the amount of fingerprints.
  resolved.mnlm = options.mnlm ?? 5;

  // Maximum number of hashes a single peak can produce; tunes the amount of fingerprints.
  resolved.mppp = options.mppp ?? 3;

  // FFT window size. Signals are real, so spectra have nfft / 2 points.
  // A larger window gives more spectral precision and less temporal precision.
  resolved.nfft = options.nfft ?? 512;

  // Hop between consecutive windows; the default is a 50 % overlap.
  // At 22050 Hz this gives about 86 frames per second, i.e. dt of about 11.61 ms.
  resolved.step = options.step ?? (resolved.nfft / 2);

  // Duration of one hop, in seconds.
  resolved.dt = options.dt ?? (1 / (resolved.samplingRate / resolved.step));

  // Hann window applied to each frame before the FFT.
  const hannWindow = () => tabulate(
    resolved.nfft,
    (n) => 0.5 * (1 - Math.cos((2 * Math.PI * n) / (resolved.nfft - 1)))
  );
  resolved.hwin = options.hwin ?? hannWindow();

  // Per-frame decay of the detection threshold (log domain).
  resolved.maskDecayLog = options.maskDecayLog ?? Math.log(0.995);

  // Frequency range scanned for peaks, in units of DF = samplingRate / nfft
  // (values between 0 and nfft / 2). Raise ifMin to skip low frequencies.
  // Rather than lowering ifMax, reduce samplingRate for faster computation.
  resolved.ifMin = options.ifMin ?? 0;
  resolved.ifMax = options.ifMax ?? resolved.nfft / 2;

  // Largest frequency gap (in DF units) allowed between the two peaks of a pair;
  // avoids fingerprints linking very different frequencies. Can be maxed at nfft / 2.
  resolved.windowDf = options.windowDf ?? 60;

  // Largest time gap (in dt units) allowed between the two peaks of a pair;
  // a little more than one second with the defaults.
  resolved.windowDt = options.windowDt ?? 96;

  // About 250 ms with the defaults: how long a peak can still be removed by later,
  // stronger peaks. Tune together with maskDecayLog. Higher value = higher latency.
  resolved.pruningDt = options.pruningDt ?? 24;

  // Mask decay scale on the frequency axis, in DF units.
  resolved.maskDf = options.maskDf ?? 3;

  // Log-domain Gaussian masks: a Gaussian becomes a polynomial once the log is taken.
  // Row `peak` holds the mask centred on that bin; its width is maskDf * sqrt(peak + 3),
  // so masks get wider at higher frequencies.
  const logMasks = () => {
    const half = resolved.nfft / 2;
    return tabulate(half, (peak) => tabulate(
      half,
      (bin) => -0.5 * Math.pow((bin - peak) / resolved.maskDf / Math.sqrt(peak + 3), 2)
    ));
  };
  resolved.eww = options.eww ?? logMasks();

  return resolved;
};
// Streaming landmark fingerprinter.
//
// Feed raw PCM bytes to process() chunk by chunk. Samples are read as little-endian
// signed 16-bit integers, `bps` bytes apart. Each call returns the fingerprints that
// became final while consuming that chunk, as { tcodes, hcodes } (parallel arrays).
class Fingerprinter {
  // options: partial overrides for the configuration resolved by buildOptions.
  constructor(options) {
    this.options = buildOptions(options ?? {});
    // Bytes received but not yet fully consumed.
    this.buffer = new Uint8Array(0);
    // Number of bytes already dropped from the head of this.buffer.
    this.bufferDelta = 0;
    // Index, in samples since the start of the stream, of the next window's first sample.
    this.stepIndex = 0;
    // One entry per processed frame: { t: frame number, i: peak bins, v: peak magnitudes }.
    this.marks = [];
    // Log-domain detection threshold per frequency bin.
    this.threshold = Array(this.options.nfft).fill(null).map(() => -3);
    this.fft = new FFT(this.options.nfft);
  }

  // Consume one chunk of PCM bytes (a Uint8Array or compatible byte array).
  // Returns { tcodes, hcodes }: for each fingerprint, tcodes[n] is the frame number of
  // its anchor peak and hcodes[n] encodes (paired bin, anchor bin, frame distance).
  process(chunk) {
    const {
      verbose, bps, mnlm, mppp, nfft, step, hwin, maskDecayLog,
      ifMin, ifMax, windowDf, windowDt, pruningDt, eww
    } = this.options;
    if (verbose) {
      const t = Math.round(this.stepIndex / step).toString();
      const received = chunk.length.toString();
      console.log(`t=${t} received ${received} bytes`);
    }
    const tcodes = [];
    const hcodes = [];

    // Append the new bytes to whatever was left over from the previous call.
    const concatedBuffer = new Uint8Array(this.buffer.length + chunk.length);
    concatedBuffer.set(this.buffer, 0);
    concatedBuffer.set(chunk, this.buffer.length);
    this.buffer = concatedBuffer;
    const bufferView = new DataView(concatedBuffer.buffer);

    // getInt16 always reads two bytes, whatever bps is.
    const sampleReadBytes = 2;
    // Full-scale value used to normalise samples to [-1, 1).
    const fullScale = Math.pow(2, 8 * bps - 1);

    // Process every window whose last sample is entirely inside the buffer.
    // The bound is the end of the last byte read, so a window that ends exactly
    // at the end of the received data is processed immediately.
    while ((this.stepIndex + nfft - 1) * bps + sampleReadBytes <= this.buffer.length + this.bufferDelta) {
      const data = new Array(nfft); // window data
      const image = new Array(nfft).fill(0);

      // fill the data, windowed (hwin) and scaled
      for (let i = 0, limit = nfft; i < limit; i++) {
        const readInt = bufferView.getInt16((this.stepIndex + i) * bps - this.bufferDelta, true);
        data[i] = (hwin[i] * readInt) / fullScale;
      }
      this.stepIndex += step;

      this.fft.forward(data, image); // compute FFT

      // log-normal surface
      for (let i = ifMin; i < ifMax; i += 1) {
        // the lower part of the spectrum is damped,
        // the higher part is boosted, leading to a better peaks detection.
        this.fft.spectrum[i] = Math.abs(this.fft.spectrum[i]) * Math.sqrt(i + 16);
      }

      // positive values of the difference between log spectrum and threshold
      const diff = new Array(nfft / 2);
      for (let i = ifMin; i < ifMax; i += 1) {
        diff[i] = Math.max(Math.log(Math.max(1e-6, this.fft.spectrum[i])) - this.threshold[i], 0);
      }

      // find at most mnlm local maxima in the spectrum at this timestamp.
      // iLocMax / vLocMax are kept sorted by decreasing magnitude.
      const iLocMax = new Array(mnlm);
      const vLocMax = new Array(mnlm);
      for (let i = 0; i < mnlm; i += 1) {
        iLocMax[i] = NaN;
        vLocMax[i] = Number.NEGATIVE_INFINITY;
      }
      for (let i = ifMin + 1; i < ifMax - 1; i += 1) {
        if (diff[i] > diff[i - 1] &&
          diff[i] > diff[i + 1] &&
          this.fft.spectrum[i] > vLocMax[mnlm - 1]) { // if local maximum big enough
          // insert the newly found local maximum in the ordered list of maxima
          for (let j = mnlm - 1; j >= 0; j -= 1) {
            // navigate the table of previously saved maxima
            if (j >= 1 && this.fft.spectrum[i] > vLocMax[j - 1]) {
              continue;
            }
            for (let k = mnlm - 1; k >= j + 1; k -= 1) {
              iLocMax[k] = iLocMax[k - 1]; // offset the bottom values
              vLocMax[k] = vLocMax[k - 1];
            }
            iLocMax[j] = i;
            vLocMax[j] = this.fft.spectrum[i];
            break;
          }
        }
      }

      // now that we have the mnlm highest local maxima of the spectrum,
      // update the local maximum threshold so that only major peaks are taken into account.
      for (let i = 0; i < mnlm; i += 1) {
        if (vLocMax[i] > Number.NEGATIVE_INFINITY) {
          for (let j = ifMin; j < ifMax; j += 1) {
            this.threshold[j] = (Math.max(this.threshold[j], Math.log(this.fft.spectrum[iLocMax[i]]) + eww[iLocMax[i]][j]));
          }
        } else {
          // fewer than mnlm maxima were found: drop the unused slots.
          vLocMax.splice(i, mnlm - i);
          iLocMax.splice(i, mnlm - i);
          break;
        }
      }

      // array that stores local maxima for each time step
      this.marks.push({ t: Math.round(this.stepIndex / step), i: iLocMax, v: vLocMax });

      // remove previous (in time) maxima that would be too close and/or too low.
      const nm = this.marks.length;
      const t0 = nm - pruningDt - 1;
      for (let i = nm - 1; i >= Math.max(t0 + 1, 0); i -= 1) {
        for (let j = 0; j < this.marks[i].v.length; j += 1) {
          // NOTE(review): peaks at bin 0 are exempt from pruning; intent not visible here.
          if (this.marks[i].i[j] !== 0 &&
            Math.log(this.marks[i].v[j]) < (this.threshold[this.marks[i].i[j]] + maskDecayLog * (nm - 1 - i))) {
            // -Infinity marks a pruned peak: it can never satisfy the pairing test below.
            this.marks[i].v[j] = Number.NEGATIVE_INFINITY;
            this.marks[i].i[j] = Number.NEGATIVE_INFINITY;
          }
        }
      }

      // generate hashes for peaks that can no longer be pruned. stepIndex:{f1:f2:deltaindex}
      let nFingersTotal = 0;
      if (t0 >= 0) {
        const m = this.marks[t0];
        for (let i = 0; i < m.i.length; i += 1) {
          let nFingers = 0;
          // set once this anchor peak has produced mppp hashes
          let canBreak = false;
          for (let j = t0; j >= Math.max(0, t0 - windowDt); j -= 1) {
            if (canBreak) {
              break;
            }
            const m2 = this.marks[j];
            for (let k = 0; k < m2.i.length; k += 1) {
              if (canBreak) {
                break;
              }
              if (m2.i[k] !== m.i[i] && Math.abs(m2.i[k] - m.i[i]) < windowDf) {
                tcodes.push(m.t);
                hcodes.push(m2.i[k] + (nfft / 2) * (m.i[i] + (nfft / 2) * (t0 - j)));
                nFingers += 1;
                nFingersTotal += 1;
                if (nFingers >= mppp) {
                  canBreak = true;
                }
              }
            }
          }
        }
      }
      if (nFingersTotal > 0 && verbose) {
        console.log(`t=${Math.round(this.stepIndex / step)} generated ${nFingersTotal} fingerprints`);
      }

      // forget frames that are too old to be paired with any future anchor.
      // A negative count is treated as zero by splice.
      this.marks.splice(0, t0 + 1 - windowDt);

      // decrease the threshold for the next iteration
      for (let j = 0; j < this.threshold.length; j += 1) {
        this.threshold[j] += maskDecayLog;
      }
    }

    // Drop the bytes located before the next window: they will never be read again.
    // Trimming on every call keeps the buffer (and the per-call copy above) bounded by
    // one window plus one chunk, whatever nfft is. The clamp covers step > nfft, where
    // the next window may start beyond the data received so far.
    const consumed = Math.min(this.stepIndex * bps - this.bufferDelta, this.buffer.length);
    if (consumed > 0) {
      this.bufferDelta += consumed;
      this.buffer = this.buffer.slice(consumed);
    }
    return { tcodes, hcodes };
  }
}
export default Fingerprinter;