audio-to-text-node
Version:
Backend audio-file-to-text transcription using the Web Speech API with Puppeteer
138 lines • 5.63 kB
JavaScript
import chalk from "chalk";
import puppeteer from "puppeteer-core";
import { playAudio } from "./routing";
import * as fs from 'fs';
// Shared Puppeteer browser handle: null until getOrCreateBrowser launches one, and reset to null by closeBrowser.
let browserInstance = null;
/**
 * Returns the shared Puppeteer browser, launching it on first use.
 * When a browser is already cached it is returned as-is and `executablePath` is ignored.
 * Otherwise the executable is taken from `executablePath` or, failing that, from the
 * first well-known install location that exists on disk.
 * @param executablePath Optional path to the browser executable. If not provided, common install paths are probed.
 * @returns A Promise that resolves to the Puppeteer browser instance.
 * @throws Error when no path was supplied and none of the well-known locations exists.
 */
async function getOrCreateBrowser(executablePath) {
    // Fast path: reuse the browser that is already running.
    if (browserInstance) {
        return browserInstance;
    }
    // Probed in order; the first existing entry wins.
    const knownLocations = [
        // Microsoft Edge
        '/usr/bin/microsoft-edge',
        '/usr/bin/microsoft-edge-stable',
        // Google Chrome
        '/usr/bin/google-chrome',
        '/usr/bin/google-chrome-stable',
        // Chromium
        '/usr/bin/chromium-browser',
        '/usr/bin/chromium',
    ];
    // An explicit path takes precedence; the filesystem is only consulted without one.
    const resolvedPath = executablePath || knownLocations.find((candidate) => fs.existsSync(candidate));
    if (!resolvedPath) {
        throw new Error('No browser executable found. Please provide executablePath parameter or install Microsoft Edge, Chrome, or Chromium.');
    }
    const launchOptions = {
        headless: true,
        executablePath: resolvedPath,
        // NOTE(review): the first flag presumably auto-accepts the media permission prompt — confirm against Chromium docs.
        args: [
            '--use-fake-ui-for-media-stream',
            '--no-sandbox',
            '--disable-setuid-sandbox',
        ],
    };
    browserInstance = await puppeteer.launch(launchOptions);
    return browserInstance;
}
/**
 * Transcribes a sequence of audio chunks through the browser's Web Speech API.
 *
 * For every chunk a fresh page is opened, a `webkitSpeechRecognition` session is
 * started inside it, the chunk is played through `playAudio`, and each final
 * recognition result is appended to the transcript. Chunks are handled one at
 * a time, in the order given. The page is closed after its chunk and the shared
 * browser is closed once all chunks are done, including when an error interrupts
 * the run, so a failure does not leave a browser process behind.
 *
 * @param files Array of audio file chunks; each entry's `path` is passed to `playAudio` and its `start` (seconds) is only logged.
 * @param language Language code assigned to the recognizer's `lang`.
 * @param executablePath Optional path to the browser executable
 * @returns A Promise that resolves to the final transcript text, with runs of whitespace collapsed and both ends trimmed.
 * @throws Propagates any error raised while launching the browser or driving a page.
 */
export async function launchRecognizer(files, language, executablePath) {
    console.log(chalk.blue("- Launching browser and setting up recognizer..."));
    const browser = await getOrCreateBrowser(executablePath);
    let transcriptText = "";
    try {
        console.log(chalk.yellow("- Playing audio..."));
        for (const file of files) {
            console.log(chalk.green(`- Processing audio chunk: ${file.path} (start: ${file.start}s)`));
            const page = await browser.newPage();
            try {
                // Bridges callable from inside the page back into this Node process.
                await page.exposeFunction("playAudio", async () => await playAudio(file.path));
                await page.exposeFunction("log", (c) => console.log(c));
                await page.exposeFunction("onSpeechResult", (text) => {
                    // A space should be added at the beginning to prevent sentences from sticking together
                    transcriptText += ` ${text}`;
                });
                await page.exposeFunction("onSpeechError", (e) => {
                    console.error(chalk.red("Speech recognition error:"), e);
                });
                // Everything in this callback runs inside the browser page, not in Node.
                await page.evaluate(async (lang) => {
                    // @ts-ignore
                    const recognition = new window.webkitSpeechRecognition();
                    recognition.lang = lang;
                    recognition.continuous = true;
                    recognition.interimResults = true;
                    recognition.onresult = (event) => {
                        // Only results flagged as final are forwarded; interim hypotheses are dropped.
                        let finalText = '';
                        for (let i = event.resultIndex; i < event.results.length; ++i) {
                            // Debug aid: window.log(event.results[i][0].transcript);
                            if (event.results[i].isFinal) {
                                finalText += event.results[i][0].transcript;
                            }
                        }
                        // @ts-ignore
                        window.onSpeechResult(finalText);
                    };
                    recognition.onerror = (e) => {
                        // Fields are copied into a plain object by hand, presumably because the
                        // event itself would not survive the page-to-Node transfer — confirm.
                        // @ts-ignore
                        window.onSpeechError({
                            name: 'SpeechRecognitionErrorEvent',
                            isTrusted: e.isTrusted,
                            bubbles: e.bubbles,
                            cancelBubble: e.cancelBubble,
                            cancelable: e.cancelable,
                            composed: e.composed,
                            defaultPrevented: e.defaultPrevented,
                            error: e.error,
                            eventPhase: e.eventPhase,
                            message: e.message,
                            returnValue: e.returnValue,
                            timeStamp: e.timeStamp,
                            type: e.type,
                            date: new Date(),
                        });
                    };
                    recognition.start();
                    // Wait 500 milliseconds for the page to load and the audio to be ready
                    await new Promise((r) => setTimeout(r, 500));
                    // @ts-ignore
                    await window.playAudio();
                    // Pause for 500 milliseconds after playback before stopping the recognizer
                    await new Promise((r) => setTimeout(r, 500));
                    recognition.stop();
                    // Pause again after stop(), presumably so late final results can still be delivered — confirm
                    await new Promise((r) => setTimeout(r, 500));
                }, language);
            }
            finally {
                // Always release the page, even when recognition setup or playback failed.
                await page.close();
            }
        }
    }
    finally {
        // Always shut the shared browser down so a failed run does not orphan the process.
        await closeBrowser();
    }
    return transcriptText.replace(/\s+/g, ' ').trim();
}
/**
 * Closes the shared Puppeteer browser instance if it exists.
 * The module-level reference is cleared before the browser is asked to close,
 * so a rejected `close()` cannot leave a dead instance cached for the next
 * `getOrCreateBrowser` call.
 * @returns A Promise that resolves when the browser is closed, or right away when no browser is open.
 * @throws Propagates any error raised by the browser's `close()`.
 */
export async function closeBrowser() {
    const browser = browserInstance;
    if (!browser) {
        return;
    }
    // Detach first: whatever happens during close(), this handle must not be reused.
    browserInstance = null;
    await browser.close();
}
//# sourceMappingURL=puppeteer.js.map