multi-voice-sdk
A universal Text-to-Speech (TTS) and Speech-to-Text (STT) SDK supporting multiple providers (OpenAI, Google Gemini, Deepgram, Groq PlayAI, Cartesia, AssemblyAI) with audio merging capabilities
import fs from "fs";
import { createClient } from "@deepgram/sdk";
import { AssemblyAI } from "assemblyai";
/**
* Transcribe audio to text using various STT providers
* @param {Object} options - STT configuration options
* @param {"deepgram"|"assemblyai"} options.provider - STT provider to use (required)
* @param {string} options.apiKey - API key for the chosen provider
* @param {string} [options.audioFile] - Path to local audio file or URL of remote audio file to transcribe
* @param {string} [options.outputFile="transcription.json"] - Output file path for transcription results
* @param {string} [options.model="nova-2"] - Model to use (provider-specific)
* @param {boolean} [options.smartFormat=true] - Enable smart formatting
* @param {boolean} [options.detect_language=true] - Automatic language detection
* @param {boolean} [options.punctuate=true] - Enable punctuation
* @param {boolean} [options.diarize=false] - Enable speaker diarization
* @param {number} [options.channels=1] - Number of audio channels
* @param {boolean} [options.fullResponse=false] - Return full detailed response instead of just transcript
* @returns {Promise<string|Object>} Promise that resolves with transcript string or full transcription results object
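* @example
* // A minimal sketch; the API key and file path below are placeholders:
* const text = await stt({
*   provider: "deepgram",
*   apiKey: process.env.DEEPGRAM_API_KEY,
*   audioFile: "./audio.wav",
* });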
*/
export async function stt({
provider,
apiKey,
audioFile,
outputFile = "transcription.json",
model,
smartFormat = true,
detect_language = true,
punctuate = true,
diarize = false,
channels = 1,
fullResponse = false,
}) {
if (!provider) {
throw new Error("Missing required parameter: provider");
}
if (!apiKey) {
throw new Error("Missing required parameter: apiKey");
}
if (!audioFile) {
throw new Error(
"audioFile parameter is required (can be local file path or HTTP URL)"
);
}
// Validate provider
const supportedProviders = ["deepgram", "assemblyai"];
if (!supportedProviders.includes(provider.toLowerCase())) {
throw new Error(
`Provider "${provider}" is not supported. Supported providers: ${supportedProviders.join(
", "
)}`
);
}
// URL vs. local-path detection happens inside each provider helper,
// so the audio source is passed through unchanged.
switch (provider.toLowerCase()) {
case "deepgram":
return await transcribeWithDeepgram({
apiKey,
audioFile, // Pass the audio source (local path or URL) through unchanged
outputFile,
model,
smartFormat,
detect_language,
punctuate,
diarize,
channels,
fullResponse,
});
case "assemblyai":
return await transcribeWithAssemblyAI({
apiKey,
audioFile, // Pass the audio source (local path or URL) through unchanged
outputFile,
fullResponse,
});
default:
throw new Error(`Provider "${provider}" is not implemented yet.`);
}
}
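/*
 * Example: transcribing a remote file with AssemblyAI (a sketch; the URL and
 * environment variable below are placeholders, not values shipped with the SDK):
 *
 *   const transcript = await stt({
 *     provider: "assemblyai",
 *     apiKey: process.env.ASSEMBLYAI_API_KEY,
 *     audioFile: "https://example.com/recording.mp3",
 *   });
 */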
/**
* Transcribe audio using Deepgram STT
*/
async function transcribeWithDeepgram({
apiKey,
audioFile,
outputFile,
model = "nova-3", // Default model
smartFormat,
detect_language,
punctuate,
diarize,
channels,
fullResponse,
}) {
try {
// STEP 1: Create a Deepgram client using the API key
const deepgram = createClient(apiKey);
// STEP 2: Detect if audioFile is a URL or local file path
const isUrl = /^https?:\/\//i.test(audioFile);
// STEP 3: Configure Deepgram options for audio analysis
const options = {
model,
smart_format: smartFormat,
detect_language,
punctuate,
diarize,
channels,
};
let result, error;
if (isUrl) {
// STEP 4a: Transcribe remote file via URL
console.log(`🎙️ Transcribing remote audio from URL: ${audioFile}`);
console.log(`🔧 Using model: ${model}`);
const response = await deepgram.listen.prerecorded.transcribeUrl(
{ url: audioFile },
options
);
result = response.result;
error = response.error;
} else {
// STEP 4b: Transcribe local file
console.log(`🎙️ Transcribing local audio file: ${audioFile}`);
console.log(`🔧 Using model: ${model}`);
// Check if file exists
if (!fs.existsSync(audioFile)) {
throw new Error(`Audio file not found: ${audioFile}`);
}
const audioBuffer = fs.readFileSync(audioFile);
const response = await deepgram.listen.prerecorded.transcribeFile(
audioBuffer,
options
);
result = response.result;
error = response.error;
}
if (error) {
console.error("❌ Deepgram STT error:", error);
throw error;
}
// STEP 5: Process and return results
if (result) {
const transcript =
result.results?.channels?.[0]?.alternatives?.[0]?.transcript || "";
const confidence =
result.results?.channels?.[0]?.alternatives?.[0]?.confidence || 0;
const words =
result.results?.channels?.[0]?.alternatives?.[0]?.words || [];
console.log(`✅ Transcription completed successfully`);
const transcriptionResult = {
transcript,
confidence,
words,
fullResult: result,
metadata: {
model,
language:
result.results?.channels?.[0]?.detected_language || "auto", // fall back to a label, not the boolean flag
duration: result.metadata?.duration,
channels: result.metadata?.channels,
provider: "deepgram",
},
};
// Save results to output file if specified
if (outputFile) {
try {
// Save based on fullResponse setting - just transcript or full object
const dataToSave = fullResponse
? transcriptionResult
: { transcript };
await fs.promises.writeFile(
outputFile,
JSON.stringify(dataToSave, null, 2)
);
console.log(`💾 Transcription results saved to: ${outputFile}`);
} catch (writeError) {
console.warn(
`⚠️ Failed to save transcription to file: ${writeError.message}`
);
}
}
// Return just transcript by default, or full response if requested
return fullResponse ? transcriptionResult : transcript;
} else {
throw new Error("No transcription result received");
}
} catch (err) {
console.error("❌ STT transcription failed:", err.message);
throw err;
}
}
/**
* Transcribe audio using AssemblyAI STT
*/
async function transcribeWithAssemblyAI({
apiKey,
audioFile,
outputFile,
fullResponse,
}) {
try {
// STEP 1: Create an AssemblyAI client using the API key
const client = new AssemblyAI({
apiKey: apiKey,
});
// STEP 2: Determine the audio source (AssemblyAI accepts local file paths and URLs directly)
console.log(`🎙️ Transcribing audio with AssemblyAI: ${audioFile}`);
console.log(`🔧 Using model: slam-1`);
// STEP 3: Configure AssemblyAI options for audio analysis
const params = {
audio: audioFile,
speech_model: "slam-1", // Always use slam-1 for AssemblyAI
};
// STEP 4: Start transcription
const transcript = await client.transcripts.transcribe(params);
if (transcript.status === "error") {
console.error("❌ AssemblyAI STT error:", transcript.error);
throw new Error(`AssemblyAI transcription failed: ${transcript.error}`);
}
// STEP 5: Process and return results
if (transcript.status === "completed") {
const transcriptText = transcript.text || "";
const confidence = transcript.confidence || 0;
const words = transcript.words || [];
console.log(`✅ Transcription completed successfully`);
const transcriptionResult = {
transcript: transcriptText,
confidence: confidence,
words: words,
fullResult: transcript,
metadata: {
model: "slam-1",
language: transcript.language_code || "auto",
duration: transcript.audio_duration,
channels: 1,
provider: "assemblyai",
},
};
// Save results to output file if specified
if (outputFile) {
try {
// Save based on fullResponse setting - just transcript or full object
const dataToSave = fullResponse
? transcriptionResult
: { transcript: transcriptText };
await fs.promises.writeFile(
outputFile,
JSON.stringify(dataToSave, null, 2)
);
console.log(`💾 Transcription results saved to: ${outputFile}`);
} catch (writeError) {
console.warn(
`⚠️ Failed to save transcription to file: ${writeError.message}`
);
}
}
// Return just transcript by default, or full response if requested
return fullResponse ? transcriptionResult : transcriptText;
} else {
throw new Error(`Transcription failed with status: ${transcript.status}`);
}
} catch (err) {
console.error("❌ STT transcription failed:", err.message);
throw err;
}
}
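/*
 * Example: inspecting word-level timing from a full response (a sketch; word
 * object shapes are provider-specific: Deepgram words carry `word` with
 * start/end in seconds, AssemblyAI words carry `text` with start/end in
 * milliseconds):
 *
 *   const result = await stt({
 *     provider: "deepgram",
 *     apiKey: process.env.DEEPGRAM_API_KEY,
 *     audioFile: "./audio.wav",
 *     fullResponse: true,
 *   });
 *   for (const w of result.words) {
 *     console.log(`${w.word ?? w.text}: ${w.start} -> ${w.end}`);
 *   }
 */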