/*
** SpeechFlow - Speech Processing Flow Graph
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
*/
/* standard dependencies */
import Stream from "node:stream"
/* external dependencies */
import * as Deepgram from "@deepgram/sdk"
import { DateTime, Duration } from "luxon"
/* internal dependencies */
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
import * as util from "./speechflow-util"
/* SpeechFlow node for Deepgram speech-to-text conversion */
export default class SpeechFlowNodeA2TDeepgram extends SpeechFlowNode {
/* declare official node name */
public static name = "a2t-deepgram"
/* internal state */
private dg: Deepgram.LiveClient | null = null
private destroyed = false
private initTimeout: ReturnType<typeof setTimeout> | null = null
private connectionTimeout: ReturnType<typeof setTimeout> | null = null
private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
/* construct node */
constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
super(id, cfg, opts, args)
/* declare node configuration parameters */
this.configure({
key: { type: "string", val: process.env.SPEECHFLOW_DEEPGRAM_KEY },
keyAdm: { type: "string", val: process.env.SPEECHFLOW_DEEPGRAM_KEY_ADM },
model: { type: "string", val: "nova-2", pos: 0 },
version: { type: "string", val: "latest", pos: 1 },
language: { type: "string", val: "multi", pos: 2 },
interim: { type: "boolean", val: false, pos: 3 }
})
/* declare node input/output format */
this.input = "audio"
this.output = "text"
}
/* one-time status of node */
async status () {
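/* determine the remaining Deepgram account balance by summing the
   balances of all projects accessible with the administrative key */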
let balance = 0
try {
const deepgram = Deepgram.createClient(this.params.keyAdm)
const response = await deepgram.manage.getProjects()
if (response !== null && response.error === null && response.result?.projects) {
for (const project of response.result.projects) {
const balanceResponse = await deepgram.manage.getProjectBalances(project.project_id)
if (balanceResponse !== null && balanceResponse.error === null && balanceResponse.result?.balances)
balance += balanceResponse.result.balances[0]?.amount ?? 0
}
}
else if (response !== null && response.error !== null)
this.log("warning", `API error fetching projects: ${response.error}`)
}
catch (error) {
this.log("warning", `failed to fetch balance: ${error}`)
}
return { balance: balance.toFixed(2) }
}
/* open node */
async open () {
/* sanity check situation */
if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
throw new Error("Deepgram node currently supports PCM-S16LE audio only")
/* clear destruction flag */
this.destroyed = false
/* create queue for results */
this.queue = new util.SingleQueue<SpeechFlowChunk | null>()
/* create a store for the meta information */
const metastore = new util.TimeStore<Map<string, any>>()
/* connect to Deepgram API */
const deepgram = Deepgram.createClient(this.params.key)
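/* determine the transcription language: Nova-2 models accept an explicit
   language code, while Nova-3 models handle non-English input via their
   multilingual "multi" mode */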
let language = "en"
if (this.params.language !== "en") {
if (this.params.model.match(/^nova-2/))
language = this.params.language
else if (this.params.model.match(/^nova-3/))
language = "multi"
}
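/* establish a live (WebSocket-based) transcription connection,
   configured for the node's PCM-S16LE audio format */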
this.dg = deepgram.listen.live({
mip_opt_out: true,
model: this.params.model,
version: this.params.version,
language,
channels: this.config.audioChannels,
sample_rate: this.config.audioSampleRate,
encoding: "linear16",
multichannel: false,
endpointing: false,
interim_results: this.params.interim,
smart_format: true,
punctuate: true,
filler_words: true,
numerals: true,
diarize: false,
profanity_filter: false,
redact: false
})
/* hook onto Deepgram API events */
this.dg.on(Deepgram.LiveTranscriptionEvents.Transcript, async (data) => {
if (this.destroyed || this.queue === null)
return
const text = (data.channel?.alternatives[0]?.transcript ?? "") as string
const words = (data.channel?.alternatives[0]?.words ?? []) as
{ word: string, punctuated_word?: string, start: number, end: number }[]
const isFinal = (data.is_final ?? false) as boolean
if (text === "")
this.log("info", `empty/dummy text received (start: ${data.start}s, duration: ${data.duration.toFixed(2)}s)`)
else {
this.log("info", `text received (start: ${data.start}s, ` +
`duration: ${data.duration.toFixed(2)}s, ` +
`kind: ${isFinal ? "final" : "intermediate"}): ` +
`"${text}"`)
const start = Duration.fromMillis(data.start * 1000).plus(this.timeZeroOffset)
const end = start.plus({ seconds: data.duration })
const metas = metastore.fetch(start, end)
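/* merge the meta information of all stored entries overlapping this
   transcript's time range into a single map (later entries overwrite earlier ones) */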
const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
curr.forEach((val, key) => { prev.set(key, val) })
return prev
}, new Map<string, any>())
metastore.prune(start)
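/* attach per-word timing information, re-based onto the node's time zero offset */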
meta.set("words", words.map((word) => {
const start = Duration.fromMillis(word.start * 1000).plus(this.timeZeroOffset)
const end = Duration.fromMillis(word.end * 1000).plus(this.timeZeroOffset)
return { word: word.punctuated_word ?? word.word, start, end }
}))
const chunk = new SpeechFlowChunk(start, end,
isFinal ? "final" : "intermediate", "text", text, meta)
this.queue.write(chunk)
}
})
this.dg.on(Deepgram.LiveTranscriptionEvents.SpeechStarted, (data) => {
this.log("info", "speech started", data)
})
this.dg.on(Deepgram.LiveTranscriptionEvents.UtteranceEnd, (data) => {
this.log("info", "utterance end received", data)
})
this.dg.on(Deepgram.LiveTranscriptionEvents.Metadata, (data) => {
this.log("info", "metadata received")
})
this.dg.on(Deepgram.LiveTranscriptionEvents.Close, () => {
this.log("info", "connection close")
if (!this.destroyed && this.queue !== null)
this.queue.write(null)
})
this.dg.on(Deepgram.LiveTranscriptionEvents.Error, (error: Error) => {
this.log("error", `error: ${error.message}`)
if (!this.destroyed && this.queue !== null)
this.queue.write(null)
this.emit("error", error)
})
/* wait for Deepgram API to be available */
await new Promise((resolve, reject) => {
this.connectionTimeout = setTimeout(() => {
this.connectionTimeout = null
reject(new Error("Deepgram: timeout waiting for connection open"))
}, 8000)
this.dg!.once(Deepgram.LiveTranscriptionEvents.Open, () => {
this.log("info", "connection open")
if (this.connectionTimeout !== null) {
clearTimeout(this.connectionTimeout)
this.connectionTimeout = null
}
resolve(true)
})
})
/* remember opening time to receive time zero offset */
this.timeOpen = DateTime.now()
/* provide Duplex stream and internally attach to Deepgram API */
const self = this
this.stream = new Stream.Duplex({
writableObjectMode: true,
readableObjectMode: true,
decodeStrings: false,
highWaterMark: 1,
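/* receive audio chunks from the writable side and forward their raw PCM payload to Deepgram */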
write (chunk: SpeechFlowChunk, encoding, callback) {
if (self.destroyed || self.dg === null) {
callback(new Error("stream already destroyed"))
return
}
if (chunk.type !== "audio")
callback(new Error("expected audio input chunk"))
else if (!Buffer.isBuffer(chunk.payload))
callback(new Error("expected Buffer input chunk"))
else {
if (chunk.payload.byteLength > 0) {
self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
if (chunk.meta.size > 0)
metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
try {
/* send only this chunk's own bytes (the Buffer may be a view into a larger
   pooled ArrayBuffer) and intentionally discard all time information */
self.dg.send(chunk.payload.buffer.slice(chunk.payload.byteOffset, chunk.payload.byteOffset + chunk.payload.byteLength))
}
catch (error) {
callback(error instanceof Error ? error : new Error("failed to send to Deepgram"))
return
}
}
callback()
}
},
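/* serve the readable side by asynchronously pulling transcription chunks from the internal result queue */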
read (size) {
if (self.destroyed || self.queue === null) {
this.push(null)
return
}
self.queue.read().then((chunk) => {
if (self.destroyed) {
this.push(null)
return
}
if (chunk === null) {
self.log("info", "received EOF signal")
this.push(null)
}
else {
self.log("debug", `received data (${chunk.payload.length} bytes)`)
this.push(chunk)
}
}).catch((error: unknown) => {
if (!self.destroyed) {
self.log("error", `queue read error: ${util.ensureError(error).message}`)
/* terminate the readable side so the pipeline does not stall on a failed queue read */
this.push(null)
}
})
},
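/* on end of input, ask Deepgram to close the connection; the Close event later signals EOF through the queue */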
final (callback) {
if (self.destroyed || self.dg === null) {
callback()
return
}
try {
self.dg.requestClose()
}
catch (error) {
self.log("warning", `error closing Deepgram connection: ${error}`)
}
/* NOTICE: do not push null here -- let the Deepgram close event handle it */
callback()
}
})
}
/* close node */
async close () {
/* indicate destruction first to stop all async operations */
this.destroyed = true
/* cleanup all timers */
if (this.initTimeout !== null) {
clearTimeout(this.initTimeout)
this.initTimeout = null
}
if (this.connectionTimeout !== null) {
clearTimeout(this.connectionTimeout)
this.connectionTimeout = null
}
/* close stream */
if (this.stream !== null) {
this.stream.destroy()
this.stream = null
}
/* close Deepgram connection and remove listeners */
if (this.dg !== null) {
try {
this.dg.removeAllListeners()
this.dg.requestClose()
}
catch (error) {
this.log("warning", `error during Deepgram cleanup: ${error}`)
}
this.dg = null
}
/* signal EOF to any pending read operations */
if (this.queue !== null) {
this.queue.write(null)
this.queue = null
}
}
}