@juspay/neurolink
Version:
Universal AI Development Platform with working MCP integration, multi-provider support, voice (TTS/STT/realtime), and professional CLI. 58+ external MCP servers discoverable, multimodal file processing, RAG pipelines. Build, test, and deploy AI applicatio
502 lines • 20.3 kB
JavaScript
/**
* D-ID Avatar / Lip-sync Handler
*
* Async talking-head generation. Submits a /talks request with a source
* image and either an audio URL or a text+voice script, polls the talk
* status, and downloads the resulting MP4.
*
* @module avatar/providers/DIDAvatar
* @see https://docs.d-id.com/reference/talks-overview
*/
import { ErrorCategory, ErrorSeverity } from "../../constants/enums.js";
import { logger } from "../../utils/logger.js";
import { sanitizeForLog } from "../../utils/logSanitize.js";
import { AVATAR_ERROR_CODES, AvatarError, } from "../../utils/avatarProcessor.js";
import { assertSafeUrl } from "../../utils/ssrfGuard.js";
import { MAX_AUDIO_BYTES, MAX_IMAGE_BYTES, MAX_VIDEO_BYTES, readBoundedBuffer, } from "../../utils/sizeGuard.js";
const DEFAULT_BASE_URL = "https://api.d-id.com";
const REQUEST_TIMEOUT_MS = 30_000;
const POLL_INTERVAL_MS = 3_000;
const TOTAL_TIMEOUT_MS = 5 * 60_000;
/**
* D-ID Avatar Handler.
*
* Auth: `Authorization: Basic ${DID_API_KEY}` (the API key is
* already a base64-encoded `username:password` from the D-ID console).
*
* Env vars: `DID_API_KEY` (preferred) / `D_ID_API_KEY` (legacy alias).
*/
export class DIDAvatar {
maxAudioDurationSeconds = 60;
supportedFormats = ["mp4"];
apiKey;
baseUrl;
constructor(apiKey) {
const resolved = (apiKey ??
process.env.DID_API_KEY ??
process.env.D_ID_API_KEY ??
"").trim();
this.apiKey = resolved.length > 0 ? resolved : null;
this.baseUrl = (process.env.DID_BASE_URL ??
process.env.D_ID_BASE_URL ??
DEFAULT_BASE_URL).replace(/\/$/, "");
}
isConfigured() {
return this.apiKey !== null;
}
async generate(options) {
if (!this.apiKey) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.PROVIDER_NOT_CONFIGURED,
message: "DID_API_KEY not configured",
category: ErrorCategory.CONFIGURATION,
severity: ErrorSeverity.HIGH,
retriable: false,
});
}
if (!options.audio && !options.text) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.AUDIO_REQUIRED,
message: "D-ID requires either `audio` (Buffer/path) or `text` (with voice id) to drive the talk",
category: ErrorCategory.VALIDATION,
severity: ErrorSeverity.MEDIUM,
retriable: false,
});
}
const startTime = Date.now();
// 1. Upload image (D-ID needs a hosted URL).
const sourceUrl = await this.uploadImage(options.image);
// 2. Optional audio upload.
const audioUrl = options.audio
? await this.uploadAudio(options.audio)
: undefined;
// 3. Submit talk.
const talkId = await this.submitTalk(options, sourceUrl, audioUrl);
// 4. Poll for completion.
const completed = await this.pollUntilDone(talkId, options.timeout ?? TOTAL_TIMEOUT_MS, options.abortSignal);
if (!completed.result_url) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID talk ${talkId} completed but no result_url returned`,
category: ErrorCategory.EXECUTION,
severity: ErrorSeverity.HIGH,
retriable: false,
context: { talkId, completed },
});
}
// 5. Guard the provider-returned URL before fetching (SSRF — same threat
// model as caller-supplied URLs: the API response could be tampered).
try {
await assertSafeUrl(completed.result_url);
}
catch (err) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID result_url rejected as unsafe: ${err instanceof Error ? err.message : String(err)}`,
category: ErrorCategory.VALIDATION,
severity: ErrorSeverity.HIGH,
retriable: false,
context: { talkId, url: completed.result_url },
});
}
// 6. Download the MP4.
const buffer = await this.downloadResult(completed.result_url);
const latency = Date.now() - startTime;
logger.info(`[DIDAvatar] Generated ${buffer.length} bytes in ${latency}ms — talk ${talkId}`);
return {
buffer,
format: "mp4",
size: buffer.length,
duration: completed.duration,
provider: "d-id",
metadata: {
latency,
provider: "d-id",
jobId: talkId,
},
};
}
async uploadImage(image) {
// If an HTTPS URL is provided directly, use it (D-ID accepts public URLs).
if (typeof image === "string" && /^https:\/\//.test(image)) {
return image;
}
const buffer = await this.resolveBuffer(image);
const form = new FormData();
form.append("image", new Blob([new Uint8Array(buffer)], {
type: this.detectImageMime(buffer),
}), "source.png");
const response = await this.fetchWithTimeout(`${this.baseUrl}/images`, {
method: "POST",
headers: {
Authorization: `Basic ${this.apiKey}`,
},
body: form,
});
const data = (await this.assertOk(response, "image upload"));
if (!data.url) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: "D-ID image upload succeeded but returned no URL",
category: ErrorCategory.EXECUTION,
severity: ErrorSeverity.HIGH,
retriable: false,
});
}
return data.url;
}
async uploadAudio(audio) {
if (typeof audio === "string" && /^https:\/\//.test(audio)) {
return audio;
}
const buffer = await this.resolveBuffer(audio);
const audioSubtype = this.detectAudioType(buffer);
// Map the detected subtype to a file extension.
const extMap = {
mp3: "mp3",
mpeg: "mp3",
wav: "wav",
ogg: "ogg",
mp4: "m4a",
};
const ext = extMap[audioSubtype] ?? "mp3";
const form = new FormData();
form.append("audio", new Blob([new Uint8Array(buffer)], { type: `audio/${audioSubtype}` }), `narration.${ext}`);
const response = await this.fetchWithTimeout(`${this.baseUrl}/audios`, {
method: "POST",
headers: {
Authorization: `Basic ${this.apiKey}`,
},
body: form,
});
const data = (await this.assertOk(response, "audio upload"));
if (!data.url) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: "D-ID audio upload succeeded but returned no URL",
category: ErrorCategory.EXECUTION,
severity: ErrorSeverity.HIGH,
retriable: false,
});
}
return data.url;
}
async submitTalk(options, sourceUrl, audioUrl) {
const script = audioUrl
? { type: "audio", audio_url: audioUrl }
: {
type: "text",
input: options.text,
provider: {
type: "microsoft",
voice_id: options.voice ?? "en-US-JennyNeural",
},
};
const body = {
source_url: sourceUrl,
script,
config: {
result_format: "mp4",
stitch: true,
},
};
const response = await this.fetchWithTimeout(`${this.baseUrl}/talks`, {
method: "POST",
headers: {
Authorization: `Basic ${this.apiKey}`,
"Content-Type": "application/json",
},
body: JSON.stringify(body),
});
const data = (await this.assertOk(response, "talk submit"));
if (!data.id) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: "D-ID talk submit returned no id",
category: ErrorCategory.EXECUTION,
severity: ErrorSeverity.HIGH,
retriable: false,
});
}
return data.id;
}
async pollUntilDone(talkId, totalTimeoutMs, abortSignal) {
const startTime = Date.now();
while (Date.now() - startTime < totalTimeoutMs) {
if (abortSignal?.aborted) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID poll for talk ${talkId} aborted by caller`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.MEDIUM,
retriable: false,
context: { talkId },
});
}
const response = await this.fetchWithTimeout(`${this.baseUrl}/talks/${talkId}`, {
method: "GET",
headers: { Authorization: `Basic ${this.apiKey}` },
}, abortSignal);
const data = (await this.assertOk(response, "talk status"));
if (data.status === "done") {
return data;
}
if (data.status === "error" || data.status === "rejected") {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID talk ${talkId} ${data.status}: ${data.error?.description ?? "unknown"}`,
category: ErrorCategory.EXECUTION,
severity: ErrorSeverity.HIGH,
retriable: false,
context: { talkId, status: data.status, error: data.error },
});
}
// Abortable sleep.
await new Promise((resolve, reject) => {
const onAbort = () => {
clearTimeout(timer);
reject(new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID poll for talk ${talkId} aborted by caller`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.MEDIUM,
retriable: false,
context: { talkId },
}));
};
const timer = setTimeout(() => {
abortSignal?.removeEventListener("abort", onAbort);
resolve();
}, POLL_INTERVAL_MS);
abortSignal?.addEventListener("abort", onAbort, { once: true });
});
}
throw new AvatarError({
code: AVATAR_ERROR_CODES.POLL_TIMEOUT,
message: `D-ID talk ${talkId} did not complete within ${Math.round(totalTimeoutMs / 1000)}s`,
category: ErrorCategory.TIMEOUT,
severity: ErrorSeverity.MEDIUM,
retriable: true,
context: { talkId },
});
}
async downloadResult(url) {
const response = await this.fetchWithTimeout(url, { method: "GET" });
if (!response.ok) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID result download failed: ${response.status}`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.MEDIUM,
retriable: response.status >= 500,
context: { status: response.status, url },
});
}
try {
return await readBoundedBuffer(response, MAX_VIDEO_BYTES, "D-ID result");
}
catch (err) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID result download rejected: ${err instanceof Error ? err.message : String(err)}`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.HIGH,
retriable: false,
context: { url },
originalError: err instanceof Error ? err : undefined,
});
}
}
async resolveBuffer(input) {
if (Buffer.isBuffer(input)) {
return input;
}
// Reject local file paths — only Buffer or HTTPS URLs are accepted.
if (!/^https:\/\//.test(input)) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.INVALID_INPUT,
message: `Invalid input: expected Buffer or HTTPS URL, got string "${input}". Local file reads are not supported.`,
category: ErrorCategory.VALIDATION,
severity: ErrorSeverity.HIGH,
retriable: false,
});
}
try {
await assertSafeUrl(input);
}
catch (err) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.INVALID_INPUT,
message: `Unsafe URL rejected: ${err instanceof Error ? err.message : String(err)}`,
category: ErrorCategory.VALIDATION,
severity: ErrorSeverity.HIGH,
retriable: false,
context: { url: input },
});
}
const response = await this.fetchWithTimeout(input, { method: "GET" });
if (!response.ok) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.INVALID_INPUT,
message: `Failed to fetch input from ${input}: ${response.status}`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.MEDIUM,
retriable: response.status >= 500,
context: { url: input, status: response.status },
});
}
// Use the larger of the two input caps (audio 50 MiB > image 25 MiB) so
// both audio and image URLs are bounded without falsely rejecting valid audio.
const inputCap = Math.max(MAX_AUDIO_BYTES, MAX_IMAGE_BYTES);
try {
return await readBoundedBuffer(response, inputCap, "D-ID input");
}
catch (err) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.INVALID_INPUT,
message: `D-ID input download rejected: ${err instanceof Error ? err.message : String(err)}`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.HIGH,
retriable: false,
context: { url: input },
originalError: err instanceof Error ? err : undefined,
});
}
}
detectImageMime(buffer) {
if (buffer.length < 4) {
return "image/jpeg";
}
if (buffer[0] === 0x89 && buffer[1] === 0x50) {
return "image/png";
}
if (buffer[0] === 0xff && buffer[1] === 0xd8) {
return "image/jpeg";
}
// RIFF container: check offset 8 to distinguish WebP from WAV.
// A WAV file also begins with RIFF but carries "WAVE" at offset 8.
// If we can't confirm the WEBP four-CC we fall back to jpeg so callers
// that accidentally pass audio here get a visible mismatch rather than
// a silent wrong content-type.
if (buffer.length >= 12 &&
buffer[0] === 0x52 &&
buffer[1] === 0x49 &&
buffer[2] === 0x46 &&
buffer[3] === 0x46) {
if (buffer[8] === 0x57 &&
buffer[9] === 0x45 &&
buffer[10] === 0x42 &&
buffer[11] === 0x50) {
return "image/webp";
}
// RIFF but not WEBP (e.g. WAVE audio) — not a valid image.
return "image/jpeg";
}
return "image/jpeg";
}
/**
* Detect the audio subtype from magic bytes.
*
* Recognised formats:
* - WAV : RIFF header (52 49 46 46)
* - OGG : OggS capture (4F 67 67 53)
* - MP3 : ID3 tag (49 44 33) or MPEG sync word (FF Ex)
* - M4A : "ftyp" box at offset 4 (ISO base media / M4A)
*
* Falls back to "mp3" when detection is inconclusive.
*/
detectAudioType(buffer) {
if (buffer.length < 4) {
return "mp3";
}
// WAV: RIFF header
if (buffer[0] === 0x52 &&
buffer[1] === 0x49 &&
buffer[2] === 0x46 &&
buffer[3] === 0x46) {
return "wav";
}
// OGG: OggS capture pattern
if (buffer[0] === 0x4f &&
buffer[1] === 0x67 &&
buffer[2] === 0x67 &&
buffer[3] === 0x53) {
return "ogg";
}
// MP3: ID3 header
if (buffer[0] === 0x49 && buffer[1] === 0x44 && buffer[2] === 0x33) {
return "mp3";
}
// MP3: MPEG sync word (0xFF 0xE0–0xFF)
if (buffer[0] === 0xff && (buffer[1] & 0xe0) === 0xe0) {
return "mpeg";
}
// M4A / AAC: "ftyp" box at offset 4
if (buffer.length >= 8 &&
buffer[4] === 0x66 &&
buffer[5] === 0x74 &&
buffer[6] === 0x79 &&
buffer[7] === 0x70) {
return "mp4";
}
return "mp3";
}
async fetchWithTimeout(url, init, callerAbortSignal) {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), REQUEST_TIMEOUT_MS);
// Forward caller abort into this request so in-flight polls can be
// cancelled immediately without waiting for the timeout to fire.
const onCallerAbort = () => controller.abort();
callerAbortSignal?.addEventListener("abort", onCallerAbort, { once: true });
try {
return await fetch(url, { ...init, signal: controller.signal });
}
catch (err) {
if (err instanceof Error && err.name === "AbortError") {
// Distinguish an intentional caller cancellation from an internal
// timeout so the caller gets the right error semantics.
if (callerAbortSignal?.aborted) {
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID request to ${url} aborted by caller`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.MEDIUM,
retriable: false,
originalError: err,
});
}
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID request to ${url} timed out after ${REQUEST_TIMEOUT_MS / 1000}s`,
category: ErrorCategory.NETWORK,
severity: ErrorSeverity.HIGH,
retriable: true,
originalError: err,
});
}
throw err;
}
finally {
callerAbortSignal?.removeEventListener("abort", onCallerAbort);
clearTimeout(timeoutId);
}
}
async assertOk(response, label) {
if (response.ok) {
return response.json();
}
const raw = await response.text();
const retriable = response.status === 408 ||
response.status === 429 ||
response.status >= 500;
throw new AvatarError({
code: AVATAR_ERROR_CODES.GENERATION_FAILED,
message: `D-ID ${label} failed: ${response.status} — ${sanitizeForLog(raw, 500)}`,
category: retriable ? ErrorCategory.NETWORK : ErrorCategory.EXECUTION,
severity: ErrorSeverity.HIGH,
retriable,
context: { status: response.status, label },
});
}
}
//# sourceMappingURL=DIDAvatar.js.map