@qgustavor/stream-audio-fingerprint

// This Source Code Form is subject to the terms of the Mozilla Public // License, v. 2.0. If a copy of the MPL was not distributed with this // file, You can obtain one at http://mozilla.org/MPL/2.0/. // Copyright (c) 2018 Alexandre Storelli // Online implementation of the landmark audio fingerprinting algorithm. // inspired by D. Ellis (2009), "Robust Landmark-Based Audio Fingerprinting" // http://labrosa.ee.columbia.edu/matlab/fingerprint/ // itself inspired by Wang 2003 paper // This module exports Codegen, an instance of stream.Transform // By default, the writable side must be fed with an input signal with the following properties: // - single channel // - 16bit PCM // - 22050 Hz sampling rate // // The readable side outputs objects of the form // { tcodes: [time stamps], hcodes: [fingerprints] } import FFT from './lib/fft.ts' interface CodegenOptions { verbose: boolean samplingRate: number bps: number mnlm: number mppp: number nfft: number step: number dt: number hwin: number[] maskDecayLog: number ifMin: number ifMax: number windowDf: number windowDt: number pruningDt: number maskDf: number eww: number[][] } interface CodegenUserOpts { verbose?: boolean samplingRate?: number bps?: number mnlm?: number mppp?: number nfft?: number step?: number dt?: number hwin?: number[] maskDecayLog?: number ifMin?: number ifMax?: number windowDf?: number windowDt?: number pruningDt?: number maskDf?: number eww?: number[][] } const buildOptions = (options: CodegenUserOpts): CodegenOptions => { const verbose = options.verbose ?? false // sampling rate in Hz. If you change this, you must adapt windowDt and pruningDt below to match your needs // set the Nyquist frequency, SAMPLING_RATE/2, // so as to match the max frequencies you want to get landmark fingerprints. const samplingRate = options.samplingRate ?? 22050 // bytes per sample, 2 for 16 bit PCM. If you change this, you must change readInt16LE methods in the code. const bps = options.bps ?? 2 // maximum number of local maxima for each spectrum. useful to tune the amount of fingerprints at output const mnlm = options.mnlm ?? 5 // maximum of hashes each peak can lead to. useful to tune the amount of fingerprints at output const mppp = options.mppp ?? 3 // size of the FFT window. As we use real signals, the spectra will have nfft/2 points. // Increasing it will give more spectral precision, less temporal precision. // It may be good or bad depending on the sounds you want to match, // and on whether your input is deformed by EQ or noise. const nfft = options.nfft ?? 512 // 50 % overlap // if SAMPLING_RATE is 22050 Hz, this leads to a sampling frequency // fs = (SAMPLING_RATE / step) /s = 86/s, or dt = 1/fs = 11,61 ms. // It's not really useful to change the overlap ratio. const step = options.step ?? (nfft / 2) const dt = options.dt ?? (1 / (samplingRate / step)) const hwin = options.hwin ?? Array(nfft).fill(null).map((_f, i) => ( 0.5 * (1 - Math.cos(((2 * Math.PI) * i) / (nfft - 1))) )) // threshold decay factor between frames. const maskDecayLog = options.maskDecayLog ?? Math.log(0.995) // frequency window to generate landmark pairs, in units of DF = SAMPLING_RATE / NFFT. Values between 0 and NFFT/2 // you can increase this to avoid having fingerprints for low frequencies const ifMin = options.ifMin ?? 0 // you don't really want to decrease this, better reduce SAMPLING_RATE instead for faster computation. const ifMax = options.ifMax ?? nfft / 2 // we set this to avoid getting fingerprints linking very different frequencies. // useful to reduce the amount of fingerprints. this can be maxed at NFFT/2 if you wish. const windowDf = options.windowDf ?? 60 // time window to generate landmark pairs. time in units of dt (see definition above) // a little more than 1 sec. const windowDt = options.windowDt ?? 96 // about 250 ms, window to remove previous peaks that are superseded by later ones. // tune the pruningDt value to match the effects of maskDecayLog. // also, pruningDt controls the latency of the pipeline. higher pruningDt = higher latency const pruningDt = options.pruningDt ?? 24 // prepare the values of exponential masks. // mask decay scale in DF units on the frequency axis. const maskDf = options.maskDf ?? 3 // gaussian mask is a polynom when working on the log-spectrum. log(exp()) = Id() // MASK_DF is multiplied by Math.sqrt(i+3) to have wider masks at higher frequencies // see the visualization out-thr.png for better insight of what is happening const eww = options.eww ?? Array(nfft / 2).fill(null).map((_f, i) => ( Array(nfft / 2).fill(null).map((_f, j) => ( -0.5 * Math.pow((j - i) / maskDf / Math.sqrt(i + 3), 2) )) )) return { verbose, samplingRate, bps, mnlm, mppp, nfft, step, dt, hwin, maskDecayLog, ifMin, ifMax, windowDf, windowDt, pruningDt, maskDf, eww } } interface Mark { t: number i: number[] v: number[] } export interface CodegenBuffer { tcodes: number[] hcodes: number[] } class Codegen { options: CodegenOptions buffer: Uint8Array bufferDelta: number stepIndex: number marks: Mark[] threshold: number[] fft: FFT constructor (options?: CodegenUserOpts) { this.options = buildOptions(options ?? {}) this.buffer = new Uint8Array(0) this.bufferDelta = 0 this.stepIndex = 0 this.marks = [] this.threshold = Array(this.options.nfft).fill(null).map(() => -3) this.fft = new FFT(this.options.nfft) } process (chunk: Uint8Array): CodegenBuffer { const { verbose, bps, mnlm, mppp, nfft, step, hwin, maskDecayLog, ifMin, ifMax, windowDf, windowDt, pruningDt, eww } = this.options if (verbose) { const t = Math.round(this.stepIndex / step).toString() const received = chunk.length.toString() console.log(`t=${t} received ${received} bytes`) } const tcodes: number[] = [] const hcodes: number[] = [] const concatedBuffer = new Uint8Array(this.buffer.length + chunk.length) concatedBuffer.set(this.buffer, 0) concatedBuffer.set(chunk, this.buffer.length) this.buffer = concatedBuffer const bufferView = new DataView(concatedBuffer.buffer) while ((this.stepIndex + nfft) * bps < this.buffer.length + this.bufferDelta) { const data = new Array(nfft) // window data const image = new Array(nfft).fill(0) // fill the data, windowed (HWIN) and scaled for (let i = 0, limit = nfft; i < limit; i++) { const readInt = bufferView.getInt16((this.stepIndex + i) * bps - this.bufferDelta, true) data[i] = (hwin[i] * readInt) / Math.pow(2, 8 * bps - 1) } this.stepIndex += step this.fft.forward(data, image) // compute FFT // log-normal surface for (let i = ifMin; i < ifMax; i += 1) { // the lower part of the spectrum is damped, // the higher part is boosted, leading to a better peaks detection. this.fft.spectrum[i] = Math.abs(this.fft.spectrum[i]) * Math.sqrt(i + 16) } // positive values of the difference between log spectrum and threshold const diff = new Array(nfft / 2) for (let i = ifMin; i < ifMax; i += 1) { diff[i] = Math.max(Math.log(Math.max(1e-6, this.fft.spectrum[i])) - this.threshold[i], 0) } // find at most MNLM local maxima in the spectrum at this timestamp. const iLocMax = new Array(mnlm) const vLocMax = new Array(mnlm) for (let i = 0; i < mnlm; i += 1) { iLocMax[i] = NaN vLocMax[i] = Number.NEGATIVE_INFINITY } for (let i = ifMin + 1; i < ifMax - 1; i += 1) { if ( diff[i] > diff[i - 1] && diff[i] > diff[i + 1] && this.fft.spectrum[i] > vLocMax[mnlm - 1] ) { // if local maximum big enough // insert the newly found local maximum in the ordered list of maxima for (let j = mnlm - 1; j >= 0; j -= 1) { // navigate the table of previously saved maxima if (j >= 1 && this.fft.spectrum[i] > vLocMax[j - 1]) continue for (let k = mnlm - 1; k >= j + 1; k -= 1) { iLocMax[k] = iLocMax[k - 1] // offset the bottom values vLocMax[k] = vLocMax[k - 1] } iLocMax[j] = i vLocMax[j] = this.fft.spectrum[i] break } } } // now that we have the MNLM highest local maxima of the spectrum, // update the local maximum threshold so that only major peaks are taken into account. for (let i = 0; i < mnlm; i += 1) { if (vLocMax[i] > Number.NEGATIVE_INFINITY) { for (let j = ifMin; j < ifMax; j += 1) { this.threshold[j] = ( Math.max(this.threshold[j], Math.log(this.fft.spectrum[iLocMax[i]]) + eww[iLocMax[i]][j]) ) } } else { vLocMax.splice(i, mnlm - i) // remove the last elements. iLocMax.splice(i, mnlm - i) break } } // array that stores local maxima for each time step this.marks.push({ t: Math.round(this.stepIndex / step), i: iLocMax, v: vLocMax }) // remove previous (in time) maxima that would be too close and/or too low. const nm = this.marks.length const t0 = nm - pruningDt - 1 for (let i = nm - 1; i >= Math.max(t0 + 1, 0); i -= 1) { for (let j = 0; j < this.marks[i].v.length; j += 1) { if ( this.marks[i].i[j] !== 0 && Math.log(this.marks[i].v[j]) < ( this.threshold[this.marks[i].i[j]] + maskDecayLog * (nm - 1 - i) ) ) { this.marks[i].v[j] = Number.NEGATIVE_INFINITY this.marks[i].i[j] = Number.NEGATIVE_INFINITY } } } // generate hashes for peaks that can no longer be pruned. stepIndex:{f1:f2:deltaindex} let nFingersTotal = 0 if (t0 >= 0) { const m = this.marks[t0] for (let i = 0; i < m.i.length; i += 1) { let nFingers = 0 let canBreak = false for (let j = t0; j >= Math.max(0, t0 - windowDt); j -= 1) { if (canBreak) break const m2 = this.marks[j] for (let k = 0; k < m2.i.length; k += 1) { if (canBreak) break if (m2.i[k] !== m.i[i] && Math.abs(m2.i[k] - m.i[i]) < windowDf) { tcodes.push(m.t) hcodes.push(m2.i[k] + (nfft / 2) * (m.i[i] + (nfft / 2) * (t0 - j))) nFingers += 1 nFingersTotal += 1 if (nFingers >= mppp) canBreak = true } } } } } if (nFingersTotal > 0 && verbose) { console.log(`t=${Math.round(this.stepIndex / step)} generated ${nFingersTotal} fingerprints`) } this.marks.splice(0, t0 + 1 - windowDt) // decrease the threshold for the next iteration for (let j = 0; j < this.threshold.length; j += 1) { this.threshold[j] += maskDecayLog } } if (this.buffer.length > 1000000) { const delta = this.buffer.length - 20000 this.bufferDelta += delta this.buffer = this.buffer.slice(delta) } return { tcodes, hcodes } } } export default Codegen