UNPKG

@juspay/neurolink

Version:

Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applications…

730 lines 28 kB
/**
 * Audio File Processor
 *
 * Handles downloading, validating, and processing audio files to extract metadata
 * and build text content suitable for LLM consumption. Audio files cannot be sent
 * raw to most LLMs, so this processor extracts structured metadata (duration, codec,
 * bitrate, tags) and formats it as text.
 *
 * Uses the `music-metadata` library (pure JavaScript, no native dependencies) for
 * metadata extraction. Supports all major audio formats: MP3, WAV, OGG, FLAC, M4A,
 * AAC, WMA, WebM, AIFF, AMR, APE, WavPack, and more.
 *
 * Key features:
 * - Metadata extraction: duration, codec, bitrate, sample rate, channels
 * - Tag extraction: title, artist, album, year, genre, track number, composer
 * - Embedded cover art extraction
 * - Best-effort Whisper transcription when `OPENAI_API_KEY` is set
 * - Graceful degradation for corrupt or partially readable files
 * - LLM-friendly text content generation
 *
 * @module processors/media/AudioProcessor
 *
 * @example
 * ```typescript
 * import { audioProcessor, processAudio, isAudioFile } from "./AudioProcessor.js";
 *
 * // Check if a file is an audio file
 * if (isAudioFile(fileInfo.mimetype, fileInfo.name)) {
 *   const result = await processAudio(fileInfo);
 *
 *   if (result.success) {
 *     console.log(`Duration: ${result.data.metadata.durationFormatted}`);
 *     console.log(`Codec: ${result.data.metadata.codec}`);
 *     console.log(`Artist: ${result.data.tags.artist}`);
 *     console.log(`Text for LLM: ${result.data.textContent}`);
 *   }
 * }
 * ```
 */
import { BaseFileProcessor } from "../base/BaseFileProcessor.js";
import { SIZE_LIMITS_MB } from "../config/index.js";
import { FileErrorCode } from "../errors/index.js";

// =============================================================================
// LAZY DEPENDENCY LOADING
// =============================================================================

/** Cached `music-metadata` module namespace; populated on first successful load. */
let musicMetadataModule = null;

/**
 * Lazily import `music-metadata` and cache the module namespace.
 *
 * The import is dynamic so the dependency is only resolved when an audio file
 * is actually processed.
 *
 * @returns The `music-metadata` module namespace
 * @throws Error with install instructions when the package cannot be resolved;
 *   any other import failure is rethrown unchanged
 */
async function loadMusicMetadata() {
  if (musicMetadataModule) {
    return musicMetadataModule;
  }
  try {
    musicMetadataModule = await import(/* @vite-ignore */ "music-metadata");
    return musicMetadataModule;
  } catch (err) {
    // Only translate the "package not installed" case; the message check avoids
    // mislabelling a missing transitive dependency as a missing music-metadata.
    const isMissingDependency =
      err instanceof Error &&
      err.code === "ERR_MODULE_NOT_FOUND" &&
      err.message.includes("music-metadata");
    if (isMissingDependency) {
      throw new Error(
        'Audio processing requires the "music-metadata" package. Install it with:\n pnpm add music-metadata',
        { cause: err },
      );
    }
    throw err;
  }
}

// =============================================================================
// CONSTANTS
// =============================================================================

/**
 * Audio processor configuration constants.
 */
const AUDIO_CONFIG = {
  /** Maximum audio file size in MB (uses centralized constant from sizeLimits) */
  MAX_SIZE_MB: SIZE_LIMITS_MB.AUDIO_MAX_MB,
  /** Processing timeout in milliseconds (audio metadata parsing is fast) */
  TIMEOUT_MS: 30000,
  /** Maximum file size for Whisper API transcription (25MB) */
  WHISPER_MAX_SIZE_MB: 25,
  /** Transcription timeout in milliseconds (120 seconds for large files) */
  TRANSCRIPTION_TIMEOUT_MS: 120_000,
  /** Whisper-supported audio formats */
  WHISPER_SUPPORTED_FORMATS: [
    "mp3",
    "mp4",
    "mpeg",
    "mpga",
    "m4a",
    "wav",
    "webm",
    "flac",
    "ogg",
  ],
};

/**
 * MIME type prefixes that are accepted for Whisper transcription when the file
 * extension alone does not identify a supported format.
 */
const WHISPER_MIME_PREFIXES = [
  "audio/mpeg",
  "audio/mp4",
  "audio/wav",
  "audio/webm",
  "audio/flac",
  "audio/ogg",
  "audio/x-m4a",
];

/**
 * Supported MIME types for audio files.
 * Covers all major audio formats including common variants and aliases.
 */
const SUPPORTED_AUDIO_MIME_TYPES = [
  "audio/mpeg",
  "audio/mp3",
  "audio/wav",
  "audio/x-wav",
  "audio/wave",
  "audio/ogg",
  "audio/vorbis",
  "audio/opus",
  "audio/flac",
  "audio/x-flac",
  "audio/mp4",
  "audio/x-m4a",
  "audio/aac",
  "audio/x-ms-wma",
  "audio/webm",
  "audio/aiff",
  "audio/x-aiff",
  "audio/amr",
  "audio/3gpp",
];

/**
 * Supported file extensions for audio files.
 * Includes common audio container formats and lossless variants.
 */
const SUPPORTED_AUDIO_EXTENSIONS = [
  ".mp3",
  ".wav",
  ".ogg",
  ".oga",
  ".opus",
  ".flac",
  ".m4a",
  ".aac",
  ".wma",
  ".webm",
  ".aiff",
  ".aif",
  ".amr",
  ".3gp",
  ".ape",
  ".wv",
];

/** Error names that are always classified as an invalid audio format. */
const PARSE_ERROR_NAMES = [
  "CouldNotDetermineFileTypeError",
  "UnsupportedFileTypeError",
];

/** Lower-case message fragments that indicate a corrupt or unparseable file. */
const PARSE_ERROR_MESSAGE_HINTS = [
  "parse",
  "codec",
  "header",
  "format",
  "unexpected end",
];

/**
 * Decide whether a thrown value looks like an audio parse failure
 * (corrupt/truncated file, unsupported codec variant, ...).
 *
 * Message matching is case-insensitive so that sentence-cased messages such as
 * "Unexpected end of ..." are classified the same as their lower-case forms.
 *
 * @param error - Value caught from the processing pipeline
 * @returns true when the error should map to `FileErrorCode.INVALID_FORMAT`
 */
function isAudioParseError(error) {
  if (!(error instanceof Error)) {
    return false;
  }
  if (PARSE_ERROR_NAMES.includes(error.name)) {
    return true;
  }
  const message = String(error.message).toLowerCase();
  return PARSE_ERROR_MESSAGE_HINTS.some((hint) => message.includes(hint));
}

// =============================================================================
// AUDIO PROCESSOR CLASS
// =============================================================================

/**
 * Audio Processor - extracts metadata and tags from audio files for LLM consumption.
 *
 * Audio files cannot be directly sent to most language models. This processor
 * parses audio file headers to extract structured metadata (duration, codec,
 * bitrate, sample rate, channels) and embedded tags (title, artist, album, etc.),
 * then builds a human-readable text summary for the AI to reason about.
 *
 * Uses the `music-metadata` library which is a pure JavaScript implementation
 * with no native dependencies, making it safe for all deployment environments.
 *
 * @example
 * ```typescript
 * const processor = new AudioProcessor();
 *
 * const result = await processor.processFile({
 *   id: 'audio-123',
 *   name: 'song.mp3',
 *   mimetype: 'audio/mpeg',
 *   size: 5242880,
 *   buffer: audioBuffer,
 * });
 *
 * if (result.success) {
 *   console.log(result.data.textContent);
 *   // "[Audio File: song.mp3]
 *   //  Duration: 3:45 | Codec: MPEG 1 Layer 3 | Bitrate: 320 kbps | ..."
 * }
 * ```
 */
export class AudioProcessor extends BaseFileProcessor {
  constructor() {
    super({
      maxSizeMB: AUDIO_CONFIG.MAX_SIZE_MB,
      timeoutMs: AUDIO_CONFIG.TIMEOUT_MS,
      // Copies, so the base class can never mutate the shared module constants.
      supportedMimeTypes: [...SUPPORTED_AUDIO_MIME_TYPES],
      supportedExtensions: [...SUPPORTED_AUDIO_EXTENSIONS],
      fileTypeName: "audio",
      defaultFilename: "audio.mp3",
    });
  }

  // ===========================================================================
  // PROCESSING OVERRIDE
  // ===========================================================================

  /**
   * Override processFile for async audio metadata parsing with music-metadata.
   *
   * Processing pipeline:
   * 1. Validate file type and size (base class)
   * 2. Get file buffer (from direct buffer or download)
   * 3. Parse audio metadata using music-metadata's parseBuffer()
   * 4. Extract structured technical metadata (duration, codec, bitrate, ...)
   * 5. Extract tags (title, artist, album, etc.)
   * 6. Extract embedded cover art if present
   * 7. Attempt Whisper transcription (best-effort, never fails the pipeline)
   * 8. Build LLM-friendly text content (including the transcript if available)
   *
   * Never throws: every failure is reported as `{ success: false, error }`.
   *
   * @param fileInfo - File information (can include URL or buffer)
   * @param options - Optional processing options (auth headers, timeout, etc.)
   * @returns Processing result with audio metadata or error
   */
  async processFile(fileInfo, options) {
    try {
      // Step 1: Validate file type and size
      const validationResult = this.validateFileWithResult(fileInfo);
      if (!validationResult.success) {
        return { success: false, error: validationResult.error };
      }

      // Step 2: Get file buffer (from direct buffer or download from URL)
      let buffer;
      if (fileInfo.buffer) {
        buffer = fileInfo.buffer;
      } else if (fileInfo.url) {
        const downloadResult = await this.downloadFileWithRetry(fileInfo, options);
        if (!downloadResult.success) {
          return { success: false, error: downloadResult.error };
        }
        if (!downloadResult.data) {
          return {
            success: false,
            error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
              reason: "Download succeeded but returned no data",
            }),
          };
        }
        buffer = downloadResult.data;

        // The declared size may be absent or wrong for remote files, so the
        // actual downloaded size is checked against the limit as well.
        if (!this.validateFileSize(buffer.length)) {
          return {
            success: false,
            error: this.createError(FileErrorCode.FILE_TOO_LARGE, {
              sizeMB: (buffer.length / (1024 * 1024)).toFixed(2),
              maxMB: this.config.maxSizeMB,
              type: this.config.fileTypeName,
            }),
          };
        }
      } else {
        return {
          success: false,
          error: this.createError(FileErrorCode.DOWNLOAD_FAILED, {
            reason: "No buffer or URL provided for file",
          }),
        };
      }

      // Step 3: Parse audio metadata using music-metadata
      const audioMetadata = await this.parseAudioMetadata(buffer, fileInfo);

      // Step 4: Extract structured metadata from parsed result
      const metadata = this.extractMetadata(audioMetadata, buffer.length);

      // Step 5: Extract tags from common metadata
      const tags = this.extractTags(audioMetadata);

      // Step 6: Extract embedded cover art if present
      const coverArt = await this.extractCoverArt(audioMetadata);

      // Step 7: Attempt transcription if API key is available
      const filename = this.getFilename(fileInfo);
      const transcriptionResult = await this.attemptTranscription(
        buffer,
        filename,
        fileInfo.mimetype,
      );

      // Step 8: Build LLM-friendly text content (includes transcript if available)
      const textContent = this.buildTextContent(
        filename,
        metadata,
        tags,
        transcriptionResult.transcript,
      );

      return {
        success: true,
        data: {
          textContent,
          metadata,
          tags,
          transcript: transcriptionResult.transcript,
          hasTranscript: transcriptionResult.hasTranscript,
          transcriptionProvider: transcriptionResult.transcriptionProvider,
          coverArt: coverArt ?? undefined,
          buffer,
          mimetype: fileInfo.mimetype || "audio/mpeg",
          // Fall back to the real byte count when the caller did not declare a size.
          size: fileInfo.size ?? buffer.length,
          filename,
        },
      };
    } catch (error) {
      // Classify music-metadata parse errors as INVALID_FORMAT
      // (corrupt/truncated files, unsupported codec variants, etc.)
      const errorCode = isAudioParseError(error)
        ? FileErrorCode.INVALID_FORMAT
        : FileErrorCode.PROCESSING_FAILED;
      return {
        success: false,
        error: this.createError(
          errorCode,
          {
            fileType: "audio",
            error: error instanceof Error ? error.message : String(error),
          },
          error instanceof Error ? error : undefined,
        ),
      };
    }
  }

  // ===========================================================================
  // PRIVATE: AUDIO TRANSCRIPTION
  // ===========================================================================

  /**
   * Attempt to transcribe audio using the Vercel AI SDK's `transcribe()` function
   * with the OpenAI Whisper model.
   *
   * Transcription is attempted when:
   * 1. `OPENAI_API_KEY` environment variable is set
   * 2. File size is within Whisper's 25MB limit
   * 3. File format is supported by Whisper (by extension or MIME type)
   *
   * The request is aborted after `AUDIO_CONFIG.TRANSCRIPTION_TIMEOUT_MS`.
   *
   * Gracefully degrades: if transcription fails for any reason, metadata-only
   * output is returned (transcription is additive, never blocks processing).
   *
   * @param buffer - Audio file content
   * @param filename - Original filename (used for format detection)
   * @param mimetype - MIME type of the audio file
   * @returns Transcription result with transcript text, or empty result
   */
  async attemptTranscription(buffer, filename, mimetype) {
    const emptyResult = {
      transcript: undefined,
      hasTranscript: false,
      transcriptionProvider: undefined,
    };

    // Check if OPENAI_API_KEY is available
    const apiKey = process.env.OPENAI_API_KEY;
    if (!apiKey) {
      return emptyResult;
    }

    // Check file size (Whisper limit is 25MB)
    const fileSizeMB = buffer.length / (1024 * 1024);
    if (fileSizeMB > AUDIO_CONFIG.WHISPER_MAX_SIZE_MB) {
      return emptyResult;
    }

    // Check if file format is supported by Whisper
    const ext = filename.split(".").pop()?.toLowerCase();
    const isFormatSupported =
      Boolean(ext) && AUDIO_CONFIG.WHISPER_SUPPORTED_FORMATS.includes(ext);
    const isMimeSupported =
      Boolean(mimetype) &&
      WHISPER_MIME_PREFIXES.some((prefix) => mimetype.startsWith(prefix));
    if (!isFormatSupported && !isMimeSupported) {
      return emptyResult;
    }

    try {
      // Dynamic imports to avoid loading these modules when transcription is not needed
      const [{ createOpenAI }, { experimental_transcribe }] = await Promise.all([
        import("@ai-sdk/openai"),
        import("ai"),
      ]);
      const openai = createOpenAI({ apiKey });
      const model = openai.transcription("whisper-1");
      const result = await experimental_transcribe({
        model,
        audio: buffer,
        // Bound the request so a stalled API call cannot hang file processing.
        abortSignal: AbortSignal.timeout(AUDIO_CONFIG.TRANSCRIPTION_TIMEOUT_MS),
      });

      const transcript = result.text?.trim();
      if (transcript) {
        return {
          transcript,
          hasTranscript: true,
          transcriptionProvider: "openai-whisper",
        };
      }
      return emptyResult;
    } catch {
      // Transcription is best-effort — never fail the entire processing pipeline
      // Common failures: rate limiting, network issues, timeout, unsupported audio encoding
      return emptyResult;
    }
  }

  // ===========================================================================
  // STUB: buildProcessedResult (required by base class, unused due to override)
  // ===========================================================================

  /**
   * Stub implementation required by BaseFileProcessor.
   * Not used because processFile is fully overridden.
   *
   * @param buffer - File buffer
   * @param fileInfo - File information
   * @returns Empty ProcessedAudio structure
   */
  buildProcessedResult(buffer, fileInfo) {
    return {
      textContent: "",
      metadata: {
        duration: 0,
        durationFormatted: "0:00",
        codec: "unknown",
        lossless: false,
        fileSize: buffer.length,
      },
      tags: {},
      hasTranscript: false,
      buffer,
      mimetype: fileInfo.mimetype || "audio/mpeg",
      size: fileInfo.size ?? buffer.length,
      filename: this.getFilename(fileInfo),
    };
  }

  // ===========================================================================
  // PRIVATE: METADATA PARSING
  // ===========================================================================

  /**
   * Parse audio metadata from a buffer using music-metadata.
   *
   * @param buffer - Audio file content
   * @param fileInfo - File information (used for MIME type hint)
   * @returns Parsed audio metadata from music-metadata
   * @throws Error if the buffer cannot be parsed (corrupt file, unsupported format)
   */
  async parseAudioMetadata(buffer, fileInfo) {
    // Provide MIME type as a string hint to music-metadata for more accurate parsing.
    // parseBuffer accepts (Uint8Array, fileInfo?: IFileInfo | string, options?)
    // where string is interpreted as MIME type.
    const mimeType = fileInfo.mimetype || undefined;
    const { parseBuffer } = await loadMusicMetadata();
    return parseBuffer(buffer, mimeType);
  }

  /**
   * Extract structured metadata from the parsed audio format information.
   *
   * Missing (nullish) format fields are normalised to `undefined`, except
   * `duration` (defaults to 0), `codec` (falls back to the container name, then
   * "unknown") and `lossless` (defaults to false).
   *
   * @param audioMetadata - Parsed audio metadata from music-metadata
   * @param fileSize - File size in bytes
   * @returns Structured metadata object
   */
  extractMetadata(audioMetadata, fileSize) {
    const { format } = audioMetadata;
    const duration = format.duration ?? 0;
    return {
      duration,
      durationFormatted: this.formatDuration(duration),
      codec: format.codec ?? format.container ?? "unknown",
      codecProfile: format.codecProfile ?? undefined,
      bitrate: format.bitrate ?? undefined,
      sampleRate: format.sampleRate ?? undefined,
      channels: format.numberOfChannels ?? undefined,
      bitsPerSample: format.bitsPerSample ?? undefined,
      lossless: format.lossless ?? false,
      fileSize,
    };
  }

  /**
   * Extract common tags from the parsed audio metadata.
   *
   * Maps music-metadata's common tag format to our simplified tag structure.
   * Handles array-to-scalar conversions (e.g., comment[] -> first comment string).
   *
   * @param audioMetadata - Parsed audio metadata from music-metadata
   * @returns Simplified tag object
   */
  extractTags(audioMetadata) {
    const { common } = audioMetadata;

    // Normalise a missing track object or undefined fields to null so that
    // "no track info" is detected uniformly and never rendered as "undefined".
    const trackNo = common.track?.no ?? null;
    const trackOf = common.track?.of ?? null;
    const hasTrack = trackNo !== null || trackOf !== null;

    // Accept a plain string as well as the `{ text }` object shape so a
    // string-valued comment is not silently dropped.
    const firstComment = common.comment?.[0];
    const commentText =
      typeof firstComment === "string" ? firstComment : firstComment?.text;

    return {
      title: common.title ?? undefined,
      artist: common.artist ?? undefined,
      album: common.album ?? undefined,
      year: common.year ?? undefined,
      genre: common.genre && common.genre.length > 0 ? common.genre : undefined,
      track: hasTrack ? { no: trackNo, of: trackOf } : undefined,
      comment: commentText ?? undefined,
      composer:
        common.composer && common.composer.length > 0
          ? common.composer[0]
          : undefined,
    };
  }

  /**
   * Extract embedded cover art from the audio file.
   *
   * Uses music-metadata's selectCover() to pick the most appropriate
   * cover image when multiple are embedded (e.g., front cover vs. back cover).
   *
   * @param audioMetadata - Parsed audio metadata from music-metadata
   * @returns Cover art as Buffer, or null if no cover art is embedded
   */
  async extractCoverArt(audioMetadata) {
    const pictures = audioMetadata.common.picture;
    if (!pictures || pictures.length === 0) {
      return null;
    }
    const { selectCover } = await loadMusicMetadata();
    const cover = selectCover(pictures);
    return cover ? Buffer.from(cover.data) : null;
  }

  // ===========================================================================
  // PRIVATE: TEXT CONTENT BUILDING
  // ===========================================================================

  /**
   * Build an LLM-friendly text representation of the audio file.
   *
   * Produces a structured text block that gives the AI context about the
   * audio file without requiring the actual audio stream. The format is
   * designed to be scannable and information-dense.
   *
   * @param filename - Original filename
   * @param metadata - Extracted audio metadata
   * @param tags - Extracted audio tags
   * @param transcript - Optional transcribed text from Whisper
   * @returns Formatted text content string
   *
   * @example Output:
   * ```
   * [Audio File: song.mp3]
   * Duration: 3:45 | Codec: MPEG 1 Layer 3 | Bitrate: 320 kbps | Sample Rate: 44100 Hz | Channels: 2 (Stereo) | Lossless: No
   * File Size: 5.00 MB
   * Title: Yesterday | Artist: The Beatles | Album: Help! | Year: 1965 | Genre: Rock, Pop
   * Track: 1/14 | Composer: Lennon-McCartney
   *
   * --- Transcript ---
   * [full transcribed text here]
   * ```
   */
  buildTextContent(filename, metadata, tags, transcript) {
    const lines = [];

    // Header line
    lines.push(`[Audio File: ${filename}]`);

    // Technical metadata line (optional fields are skipped when absent)
    const techParts = [
      `Duration: ${metadata.durationFormatted}`,
      `Codec: ${metadata.codec}`,
    ];
    if (metadata.codecProfile) {
      techParts.push(`Profile: ${metadata.codecProfile}`);
    }
    if (metadata.bitrate) {
      techParts.push(`Bitrate: ${this.formatBitrate(metadata.bitrate)}`);
    }
    if (metadata.sampleRate) {
      techParts.push(`Sample Rate: ${metadata.sampleRate} Hz`);
    }
    if (metadata.channels) {
      techParts.push(
        `Channels: ${metadata.channels} (${this.getChannelLabel(metadata.channels)})`,
      );
    }
    if (metadata.bitsPerSample) {
      techParts.push(`Bit Depth: ${metadata.bitsPerSample}-bit`);
    }
    techParts.push(`Lossless: ${metadata.lossless ? "Yes" : "No"}`);
    lines.push(techParts.join(" | "));

    // File size line
    lines.push(`File Size: ${(metadata.fileSize / (1024 * 1024)).toFixed(2)} MB`);

    // Tags line (only if any tags are present)
    const tagParts = [];
    if (tags.title) {
      tagParts.push(`Title: ${tags.title}`);
    }
    if (tags.artist) {
      tagParts.push(`Artist: ${tags.artist}`);
    }
    if (tags.album) {
      tagParts.push(`Album: ${tags.album}`);
    }
    if (tags.year) {
      tagParts.push(`Year: ${tags.year}`);
    }
    if (tags.genre && tags.genre.length > 0) {
      tagParts.push(`Genre: ${tags.genre.join(", ")}`);
    }
    if (tagParts.length > 0) {
      lines.push(tagParts.join(" | "));
    }

    // Secondary tags line (track, composer, comment)
    const secondaryParts = [];
    if (tags.track) {
      const trackNumber = tags.track.no ?? "?";
      // Nullish check so an undefined total is omitted rather than printed.
      const trackStr =
        tags.track.of == null
          ? `${trackNumber}`
          : `${trackNumber}/${tags.track.of}`;
      secondaryParts.push(`Track: ${trackStr}`);
    }
    if (tags.composer) {
      secondaryParts.push(`Composer: ${tags.composer}`);
    }
    if (tags.comment) {
      secondaryParts.push(`Comment: ${tags.comment}`);
    }
    if (secondaryParts.length > 0) {
      lines.push(secondaryParts.join(" | "));
    }

    // Transcript section (if transcription was performed)
    if (transcript) {
      lines.push("", "--- Transcript ---", transcript);
    }

    return lines.join("\n");
  }

  // ===========================================================================
  // PRIVATE: FORMATTING UTILITIES
  // ===========================================================================

  /**
   * Format a duration in seconds to a human-readable string.
   *
   * Missing, non-finite, zero or negative durations are rendered as "0:00".
   *
   * @param seconds - Duration in seconds
   * @returns Formatted string: "M:SS" for < 1 hour, "H:MM:SS" for >= 1 hour
   *
   * @example
   * formatDuration(225)  // "3:45"
   * formatDuration(3750) // "1:02:30"
   * formatDuration(0)    // "0:00"
   */
  formatDuration(seconds) {
    const value = Number(seconds);
    // Guards NaN/Infinity too, which would otherwise render as "Infinity:NaN:NaN".
    if (!Number.isFinite(value) || value <= 0) {
      return "0:00";
    }
    const totalSeconds = Math.round(value);
    const hours = Math.floor(totalSeconds / 3600);
    const minutes = Math.floor((totalSeconds % 3600) / 60);
    const secs = String(totalSeconds % 60).padStart(2, "0");
    if (hours > 0) {
      return `${hours}:${String(minutes).padStart(2, "0")}:${secs}`;
    }
    return `${minutes}:${secs}`;
  }

  /**
   * Format bitrate to a human-readable string.
   *
   * @param bitrate - Bitrate in bits per second
   * @returns Formatted string (e.g., "320 kbps", "1411 kbps")
   */
  formatBitrate(bitrate) {
    const kbps = Math.round(bitrate / 1000);
    return `${kbps} kbps`;
  }

  /**
   * Get a human-readable label for the number of audio channels.
   *
   * @param channels - Number of audio channels
   * @returns Channel label (e.g., "Mono", "Stereo", "5.1 Surround"); any other
   *   count is rendered as "<n>ch"
   */
  getChannelLabel(channels) {
    switch (channels) {
      case 1:
        return "Mono";
      case 2:
        return "Stereo";
      case 6:
        return "5.1 Surround";
      case 8:
        return "7.1 Surround";
      default:
        return `${channels}ch`;
    }
  }
}

// =============================================================================
// SINGLETON INSTANCE
// =============================================================================

/**
 * Singleton Audio processor instance.
 * Use this for standard audio processing operations.
 *
 * @example
 * ```typescript
 * import { audioProcessor } from "./AudioProcessor.js";
 *
 * const result = await audioProcessor.processFile(fileInfo);
 * ```
 */
export const audioProcessor = new AudioProcessor();

// =============================================================================
// HELPER FUNCTIONS
// =============================================================================

/**
 * Check if a file is an audio file.
 * Matches by MIME type or file extension.
 *
 * @param mimetype - MIME type of the file
 * @param filename - Filename (for extension-based detection)
 * @returns true if the file is an audio file
 *
 * @example
 * ```typescript
 * if (isAudioFile('audio/mpeg', 'song.mp3')) {
 *   // Process as audio
 * }
 *
 * if (isAudioFile('', 'recording.flac')) {
 *   // Also matches by extension
 * }
 * ```
 */
export function isAudioFile(mimetype, filename) {
  return audioProcessor.isFileSupported(mimetype, filename);
}

/**
 * Process a single audio file.
 * Convenience function that uses the singleton processor.
 *
 * @param fileInfo - File information (can include URL or buffer)
 * @param options - Optional processing options (auth headers, timeout, etc.)
 * @returns Processing result with audio metadata or error
 *
 * @example
 * ```typescript
 * import { processAudio } from "./AudioProcessor.js";
 *
 * const result = await processAudio({
 *   id: 'audio-1',
 *   name: 'podcast.mp3',
 *   mimetype: 'audio/mpeg',
 *   size: 15728640,
 *   buffer: mp3Buffer,
 * });
 *
 * if (result.success) {
 *   const { metadata, tags, textContent } = result.data;
 *   console.log(`${tags.title} by ${tags.artist} (${metadata.durationFormatted})`);
 *   // Send textContent to LLM for analysis
 * } else {
 *   console.error(`Processing failed: ${result.error?.userMessage}`);
 * }
 * ```
 */
export async function processAudio(fileInfo, options) {
  return audioProcessor.processFile(fileInfo, options);
}
//# sourceMappingURL=AudioProcessor.js.map