/*
** SpeechFlow - Speech Processing Flow Graph
** Copyright (c) 2024-2025 Dr. Ralf S. Engelschall <rse@engelschall.com>
** Licensed under GPL 3.0 <https://spdx.org/licenses/GPL-3.0-only>
*/
/* standard dependencies */
import Stream from "node:stream"
/* external dependencies */
import OpenAI from "openai"
import { DateTime } from "luxon"
import SpeexResampler from "speex-resampler"
import ws from "ws"
/* internal dependencies */
import SpeechFlowNode, { SpeechFlowChunk } from "./speechflow-node"
import * as util from "./speechflow-util"
/* SpeechFlow node for OpenAI speech-to-text conversion */
export default class SpeechFlowNodeA2TOpenAI extends SpeechFlowNode {
    /* declare official node name */
    public static name = "a2t-openai"

    /* internal state */
    private openai: OpenAI | null = null
    private ws: ws.WebSocket | null = null
    private queue: util.SingleQueue<SpeechFlowChunk | null> | null = null
    private resampler: SpeexResampler | null = null
    private destroyed = false
    private connectionTimeout: ReturnType<typeof setTimeout> | null = null
    /* construct node */
    constructor (id: string, cfg: { [ id: string ]: any }, opts: { [ id: string ]: any }, args: any[]) {
        super(id, cfg, opts, args)

        /* declare node configuration parameters */
        this.configure({
            key:      { type: "string",  val: process.env.SPEECHFLOW_OPENAI_KEY },
            api:      { type: "string",  val: "https://api.openai.com/v1", match: /^https?:\/\/.+/ },
            model:    { type: "string",  val: "gpt-4o-mini-transcribe" },
            language: { type: "string",  val: "de", match: /^(?:en|de)$/ },
            interim:  { type: "boolean", val: false }
        })

        /* declare node input/output format */
        this.input  = "audio"
        this.output = "text"
    }
    /* one-time status of node */
    async status () {
        return {}
    }
    /* open node */
    async open () {
        /* sanity check situation */
        if (this.config.audioBitDepth !== 16 || !this.config.audioLittleEndian)
            throw new Error("OpenAI transcribe node currently supports PCM-S16LE audio only")

        /* clear destruction flag */
        this.destroyed = false

        /* create queue for results */
        this.queue = new util.SingleQueue<SpeechFlowChunk | null>()

        /* create a store for the meta information */
        const metastore = new util.TimeStore<Map<string, any>>()

        /* establish resampler from our standard audio sample rate (48kHz)
           to OpenAI's maximum 24kHz input sample rate */
        this.resampler = new SpeexResampler(1, this.config.audioSampleRate, 24000, 7)
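
        /* NOTICE: the resampler is fixed to a single (mono) channel and
           quality level 7 (the Speex resampler accepts levels 0-10,
           trading CPU time for resampling accuracy) */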
        /* instantiate OpenAI API */
        this.openai = new OpenAI({
            baseURL: this.params.api,
            apiKey:  this.params.key,
            dangerouslyAllowBrowser: true
        })
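
        /* NOTICE: the OpenAI SDK client is kept as internal state only --
           the streaming transcription below talks to the Realtime API
           directly over a raw WebSocket connection */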

        /* open the WebSocket connection for streaming */
        const url = `${this.params.api.replace(/^http/, "ws")}/realtime?intent=transcription`
        this.ws = new ws.WebSocket(url, {
            headers: {
                Authorization: `Bearer ${this.params.key}`,
                "OpenAI-Beta": "realtime=v1"
            }
        })
        const sendMessage = (obj: any) => {
            this.ws?.send(JSON.stringify(obj))
        }

        /* wait for OpenAI API to be available */
        await new Promise((resolve, reject) => {
            this.connectionTimeout = setTimeout(() => {
                if (this.connectionTimeout !== null) {
                    this.connectionTimeout = null
                    reject(new Error("OpenAI: timeout waiting for connection open"))
                }
            }, 8000)
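
            /* NOTICE: connectionTimeout doubles as a "still pending" flag:
               every settling path nulls it first, so the promise is
               resolved or rejected exactly once */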
            this.ws!.once("open", () => {
                this.log("info", "connection open")
                if (this.connectionTimeout !== null) {
                    clearTimeout(this.connectionTimeout)
                    this.connectionTimeout = null
                }
                resolve(true)
            })
            this.ws!.once("error", (err) => {
                if (this.connectionTimeout !== null) {
                    clearTimeout(this.connectionTimeout)
                    this.connectionTimeout = null
                }
                reject(err)
            })
        })

        /* configure session */
        sendMessage({
            type: "transcription_session.update",
            session: {
                input_audio_format: "pcm16",
                input_audio_transcription: {
                    model:    this.params.model,
                    language: this.params.language
                },
                turn_detection: {
                    type: "server_vad",
                    threshold: 0.5,
                    prefix_padding_ms: 300,
                    silence_duration_ms: 500
                }
            }
        })
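
        /* NOTICE: with server-side VAD, OpenAI segments the audio stream
           itself: "speech_started"/"speech_stopped" events frame each
           utterance, whose final transcript then arrives via a
           "...transcription.completed" event below */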

        /* hook onto session events */
this.ws.on("close", () => {
this.log("info", "WebSocket connection closed")
this.queue!.write(null)
})
this.ws.on("error", (err) => {
this.log("error", `WebSocket connection error: ${err}`)
})
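
        /* accumulated transcript of the current conversation item (grown
           by "delta" events, replaced wholesale by "completed" events) */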
        let text = ""
        this.ws.on("message", (data) => {
            let ev: any
            try {
                ev = JSON.parse(data.toString())
            }
            catch (err) {
                this.log("warning", `failed to parse WebSocket message: ${err}`)
                return
            }
            if (!(typeof ev === "object" && ev !== null)) {
                this.log("warning", "received invalid WebSocket message")
                return
            }
            switch (ev.type) {
                case "transcription_session.created":
                    break
                case "conversation.item.created":
                    text = ""
                    break
                case "conversation.item.input_audio_transcription.delta": {
                    text += ev.delta as string
                    if (this.params.interim) {
                        const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
                        const end = start // FIXME: OpenAI does not provide timestamps
                        const metas = metastore.fetch(start, end)
                        const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
                            curr.forEach((val, key) => { prev.set(key, val) })
                            return prev
                        }, new Map<string, any>())
                        const chunk = new SpeechFlowChunk(start, end, "intermediate", "text", text)
                        chunk.meta = meta
                        this.queue!.write(chunk)
                    }
                    break
                }
                case "conversation.item.input_audio_transcription.completed": {
                    text = ev.transcript as string
                    const start = DateTime.now().diff(this.timeOpen!) // FIXME: OpenAI does not provide timestamps
                    const end = start // FIXME: OpenAI does not provide timestamps
                    const metas = metastore.fetch(start, end)
                    const meta = metas.reduce((prev: Map<string, any>, curr: Map<string, any>) => {
                        curr.forEach((val, key) => { prev.set(key, val) })
                        return prev
                    }, new Map<string, any>())
                    metastore.prune(start)
                    const chunk = new SpeechFlowChunk(start, end, "final", "text", text)
                    chunk.meta = meta
                    this.queue!.write(chunk)
                    text = ""
                    break
                }
                case "input_audio_buffer.speech_started":
                    this.log("info", "VAD: speech started")
                    break
                case "input_audio_buffer.speech_stopped":
                    this.log("info", "VAD: speech stopped")
                    break
                case "input_audio_buffer.committed":
                    this.log("info", "input buffer committed")
                    break
                case "error":
                    this.log("error", `error: ${ev.error?.message}`)
                    break
                default:
                    break
            }
        })

        /* remember opening time to serve as the time-zero offset */
        this.timeOpen = DateTime.now()

        /* provide Duplex stream and internally attach to OpenAI API */
        const self = this
        this.stream = new Stream.Duplex({
            writableObjectMode: true,
            readableObjectMode: true,
            decodeStrings:      false,
            highWaterMark:      1,
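
            /* NOTICE: in object mode the highWaterMark counts chunks, so a
               value of 1 keeps internal buffering minimal and propagates
               backpressure to upstream nodes quickly */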
            write (chunk: SpeechFlowChunk, encoding, callback) {
                if (self.destroyed || self.ws === null) {
                    callback(new Error("stream already destroyed"))
                    return
                }
                if (chunk.type !== "audio")
                    callback(new Error("expected audio input chunk"))
                else if (!Buffer.isBuffer(chunk.payload))
                    callback(new Error("expected Buffer input chunk"))
                else {
                    if (chunk.payload.byteLength > 0) {
                        self.log("debug", `send data (${chunk.payload.byteLength} bytes)`)
                        if (chunk.meta.size > 0)
                            metastore.store(chunk.timestampStart, chunk.timestampEnd, chunk.meta)
                        try {
                            const payload  = self.resampler!.processChunk(chunk.payload)
                            const audioB64 = payload.toString("base64")
                            sendMessage({
                                type:  "input_audio_buffer.append",
                                audio: audioB64 /* intentionally discard all time information */
                            })
                        }
                        catch (error) {
                            callback(error instanceof Error ? error : new Error("failed to send to OpenAI transcribe"))
                            return
                        }
                    }
                    callback()
                }
            },
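
            /* deliver the transcription chunks which the WebSocket message
               handler above enqueued */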
            read (size) {
                if (self.destroyed || self.queue === null) {
                    this.push(null)
                    return
                }
                self.queue.read().then((chunk) => {
                    if (self.destroyed) {
                        this.push(null)
                        return
                    }
                    if (chunk === null) {
                        self.log("info", "received EOF signal")
                        this.push(null)
                    }
                    else {
                        self.log("debug", `received data (${chunk.payload.length} bytes)`)
                        this.push(chunk)
                    }
                }).catch((error: unknown) => {
                    if (!self.destroyed)
                        self.log("error", `queue read error: ${util.ensureError(error).message}`)
                })
            },
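
            /* commit any still pending audio and close the socket -- the
               resulting WebSocket "close" event then signals EOF to the
               read side */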
            final (callback) {
                if (self.destroyed || self.ws === null) {
                    callback()
                    return
                }
                try {
                    sendMessage({ type: "input_audio_buffer.commit" })
                    self.ws.close()
                    /* NOTICE: do not push null here -- let the OpenAI close event handle it */
                    callback()
                }
                catch (error) {
                    self.log("warning", `error closing OpenAI connection: ${error}`)
                    callback(error instanceof Error ? error : new Error("failed to close OpenAI connection"))
                }
            }
        })
    }

    /* close node */
    async close () {
        /* indicate destruction first to stop all async operations */
        this.destroyed = true

        /* clear connection timeout */
        if (this.connectionTimeout !== null) {
            clearTimeout(this.connectionTimeout)
            this.connectionTimeout = null
        }

        /* signal EOF to any pending read operations */
        if (this.queue !== null) {
            this.queue.write(null)
            this.queue = null
        }

        /* close OpenAI connection */
        if (this.ws !== null) {
            this.ws.close()
            this.ws = null
        }

        /* drop OpenAI API client */
        this.openai = null

        /* close stream */
        if (this.stream !== null) {
            this.stream.destroy()
            this.stream = null
        }
    }
}