UNPKG

@qgustavor/stream-audio-fingerprint

Version:
231 lines (230 loc) 11.7 kB
// This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. // Copyright (c) 2018 Alexandre Storelli // Online implementation of the landmark audio fingerprinting algorithm. // inspired by D. Ellis (2009), "Robust Landmark-Based Audio Fingerprinting" // http://labrosa.ee.columbia.edu/matlab/fingerprint/ // itself inspired by Wang 2003 paper // This module exports Fingerprinter // By default, the process method must be fed with an input signal with the following properties: // - single channel // - 16bit PCM // - 22050 Hz sampling rate // // It will output objects of the form // { tcodes: [time stamps], hcodes: [fingerprints] } import FFT from "./lib/fft.mjs"; const buildOptions = (options) => { const verbose = options.verbose ?? false; // sampling rate in Hz. If you change this, you must adapt windowDt and pruningDt below to match your needs // set the Nyquist frequency, SAMPLING_RATE/2, // so as to match the max frequencies you want to get landmark fingerprints. const samplingRate = options.samplingRate ?? 22050; // bytes per sample, 2 for 16 bit PCM. If you change this, you must change readInt16LE methods in the code. const bps = options.bps ?? 2; // maximum number of local maxima for each spectrum. useful to tune the amount of fingerprints at output const mnlm = options.mnlm ?? 5; // maximum of hashes each peak can lead to. useful to tune the amount of fingerprints at output const mppp = options.mppp ?? 3; // size of the FFT window. As we use real signals, the spectra will have nfft/2 points. // Increasing it will give more spectral precision, less temporal precision. // It may be good or bad depending on the sounds you want to match, // and on whether your input is deformed by EQ or noise. const nfft = options.nfft ?? 512; // 50 % overlap // if SAMPLING_RATE is 22050 Hz, this leads to a sampling frequency // fs = (SAMPLING_RATE / step) /s = 86/s, or dt = 1/fs = 11,61 ms. // It's not really useful to change the overlap ratio. const step = options.step ?? (nfft / 2); const dt = options.dt ?? (1 / (samplingRate / step)); const hwin = options.hwin ?? Array(nfft).fill(null).map((_f, i) => (0.5 * (1 - Math.cos(((2 * Math.PI) * i) / (nfft - 1))))); // threshold decay factor between frames. const maskDecayLog = options.maskDecayLog ?? Math.log(0.995); // frequency window to generate landmark pairs, in units of DF = SAMPLING_RATE / NFFT. Values between 0 and NFFT/2 // you can increase this to avoid having fingerprints for low frequencies const ifMin = options.ifMin ?? 0; // you don't really want to decrease this, better reduce SAMPLING_RATE instead for faster computation. const ifMax = options.ifMax ?? nfft / 2; // we set this to avoid getting fingerprints linking very different frequencies. // useful to reduce the amount of fingerprints. this can be maxed at NFFT/2 if you wish. const windowDf = options.windowDf ?? 60; // time window to generate landmark pairs. time in units of dt (see definition above) // a little more than 1 sec. const windowDt = options.windowDt ?? 96; // about 250 ms, window to remove previous peaks that are superseded by later ones. // tune the pruningDt value to match the effects of maskDecayLog. // also, pruningDt controls the latency of the pipeline. higher pruningDt = higher latency const pruningDt = options.pruningDt ?? 24; // prepare the values of exponential masks. // mask decay scale in DF units on the frequency axis. const maskDf = options.maskDf ?? 3; // gaussian mask is a polynom when working on the log-spectrum. log(exp()) = Id() // MASK_DF is multiplied by Math.sqrt(i+3) to have wider masks at higher frequencies // see the visualization out-thr.png for better insight of what is happening const eww = options.eww ?? Array(nfft / 2).fill(null).map((_f, i) => (Array(nfft / 2).fill(null).map((_f, j) => (-0.5 * Math.pow((j - i) / maskDf / Math.sqrt(i + 3), 2))))); return { verbose, samplingRate, bps, mnlm, mppp, nfft, step, dt, hwin, maskDecayLog, ifMin, ifMax, windowDf, windowDt, pruningDt, maskDf, eww }; }; class Fingerprinter { constructor(options) { this.options = buildOptions(options ?? {}); this.buffer = new Uint8Array(0); this.bufferDelta = 0; this.stepIndex = 0; this.marks = []; this.threshold = Array(this.options.nfft).fill(null).map(() => -3); this.fft = new FFT(this.options.nfft); } process(chunk) { const { verbose, bps, mnlm, mppp, nfft, step, hwin, maskDecayLog, ifMin, ifMax, windowDf, windowDt, pruningDt, eww } = this.options; if (verbose) { const t = Math.round(this.stepIndex / step).toString(); const received = chunk.length.toString(); console.log(`t=${t} received ${received} bytes`); } const tcodes = []; const hcodes = []; const concatedBuffer = new Uint8Array(this.buffer.length + chunk.length); concatedBuffer.set(this.buffer, 0); concatedBuffer.set(chunk, this.buffer.length); this.buffer = concatedBuffer; const bufferView = new DataView(concatedBuffer.buffer); while ((this.stepIndex + nfft) * bps < this.buffer.length + this.bufferDelta) { const data = new Array(nfft); // window data const image = new Array(nfft).fill(0); // fill the data, windowed (HWIN) and scaled for (let i = 0, limit = nfft; i < limit; i++) { const readInt = bufferView.getInt16((this.stepIndex + i) * bps - this.bufferDelta, true); data[i] = (hwin[i] * readInt) / Math.pow(2, 8 * bps - 1); } this.stepIndex += step; this.fft.forward(data, image); // compute FFT // log-normal surface for (let i = ifMin; i < ifMax; i += 1) { // the lower part of the spectrum is damped, // the higher part is boosted, leading to a better peaks detection. this.fft.spectrum[i] = Math.abs(this.fft.spectrum[i]) * Math.sqrt(i + 16); } // positive values of the difference between log spectrum and threshold const diff = new Array(nfft / 2); for (let i = ifMin; i < ifMax; i += 1) { diff[i] = Math.max(Math.log(Math.max(1e-6, this.fft.spectrum[i])) - this.threshold[i], 0); } // find at most MNLM local maxima in the spectrum at this timestamp. const iLocMax = new Array(mnlm); const vLocMax = new Array(mnlm); for (let i = 0; i < mnlm; i += 1) { iLocMax[i] = NaN; vLocMax[i] = Number.NEGATIVE_INFINITY; } for (let i = ifMin + 1; i < ifMax - 1; i += 1) { if (diff[i] > diff[i - 1] && diff[i] > diff[i + 1] && this.fft.spectrum[i] > vLocMax[mnlm - 1]) { // if local maximum big enough // insert the newly found local maximum in the ordered list of maxima for (let j = mnlm - 1; j >= 0; j -= 1) { // navigate the table of previously saved maxima if (j >= 1 && this.fft.spectrum[i] > vLocMax[j - 1]) continue; for (let k = mnlm - 1; k >= j + 1; k -= 1) { iLocMax[k] = iLocMax[k - 1]; // offset the bottom values vLocMax[k] = vLocMax[k - 1]; } iLocMax[j] = i; vLocMax[j] = this.fft.spectrum[i]; break; } } } // now that we have the MNLM highest local maxima of the spectrum, // update the local maximum threshold so that only major peaks are taken into account. for (let i = 0; i < mnlm; i += 1) { if (vLocMax[i] > Number.NEGATIVE_INFINITY) { for (let j = ifMin; j < ifMax; j += 1) { this.threshold[j] = (Math.max(this.threshold[j], Math.log(this.fft.spectrum[iLocMax[i]]) + eww[iLocMax[i]][j])); } } else { vLocMax.splice(i, mnlm - i); // remove the last elements. iLocMax.splice(i, mnlm - i); break; } } // array that stores local maxima for each time step this.marks.push({ t: Math.round(this.stepIndex / step), i: iLocMax, v: vLocMax }); // remove previous (in time) maxima that would be too close and/or too low. const nm = this.marks.length; const t0 = nm - pruningDt - 1; for (let i = nm - 1; i >= Math.max(t0 + 1, 0); i -= 1) { for (let j = 0; j < this.marks[i].v.length; j += 1) { if (this.marks[i].i[j] !== 0 && Math.log(this.marks[i].v[j]) < (this.threshold[this.marks[i].i[j]] + maskDecayLog * (nm - 1 - i))) { this.marks[i].v[j] = Number.NEGATIVE_INFINITY; this.marks[i].i[j] = Number.NEGATIVE_INFINITY; } } } // generate hashes for peaks that can no longer be pruned. stepIndex:{f1:f2:deltaindex} let nFingersTotal = 0; if (t0 >= 0) { const m = this.marks[t0]; for (let i = 0; i < m.i.length; i += 1) { let nFingers = 0; let canBreak = false; for (let j = t0; j >= Math.max(0, t0 - windowDt); j -= 1) { if (canBreak) break; const m2 = this.marks[j]; for (let k = 0; k < m2.i.length; k += 1) { if (canBreak) break; if (m2.i[k] !== m.i[i] && Math.abs(m2.i[k] - m.i[i]) < windowDf) { tcodes.push(m.t); hcodes.push(m2.i[k] + (nfft / 2) * (m.i[i] + (nfft / 2) * (t0 - j))); nFingers += 1; nFingersTotal += 1; if (nFingers >= mppp) canBreak = true; } } } } } if (nFingersTotal > 0 && verbose) { console.log(`t=${Math.round(this.stepIndex / step)} generated ${nFingersTotal} fingerprints`); } this.marks.splice(0, t0 + 1 - windowDt); // decrease the threshold for the next iteration for (let j = 0; j < this.threshold.length; j += 1) { this.threshold[j] += maskDecayLog; } } if (this.buffer.length > 1000000) { const delta = this.buffer.length - 20000; this.bufferDelta += delta; this.buffer = this.buffer.slice(delta); } return { tcodes, hcodes }; } } export default Fingerprinter;