UNPKG

sandai-react

Version:

React components and utilities for the Sandai 3D AI Characters.

442 lines 24 kB
import {
  MotionExpression,
  MouthExpression,
  FaceExpression,
  VRMManager,
} from "@davidcks/r3f-vrm";
import { Expressions } from "piper-wasm/expressions";
import { EmotionAnimationMetadataType } from "./repo/animations/emotions";
import { VoiceNames } from "./repo/voices";
import { InteractionAnimationMetadataType } from "./repo/animations/interactions";
import { ChainManager } from "./managers/ChainManager";
import { LlmManager } from "./managers/LlmManager";
import { AsrManager } from "./managers/AsrManager";
import {
  ListenEvent,
  ListenOptions,
  ListenResult,
  MicManager,
} from "./managers/MicManager";
import { PuppeteerManager } from "./managers/PuppeteerManager";
import { Misc } from "./misc/Misc";
import { Message, ModelProvider } from "./worker/types/types";

/**
 * Set of callbacks supplied to {@link AICharacterManager} via its constructor.
 *
 * Besides the three named callbacks, the index signature allows any number of
 * additional single-argument callbacks under arbitrary string keys.
 */
export type AICharacterRenderController = {
  setVrmUrl: (vrmUrl: string) => void;
  setVoiceName: (voiceName: VoiceNames) => void;
  onRenderPropsUpdate: () => void;
  [key: string]: (args: any) => void;
};

/**
 * Names of expression data channels.
 *
 * NOTE(review): {@link AICharacterEventListenerType} also carries an
 * "interaction" event that is not listed here — confirm whether that is
 * intentional.
 */
export type AICharacterEventDataType = "motion" | "face" | "mouth";

/**
 * Listener signature for events emitted by {@link AICharacterManager}.
 * The event is a union discriminated by `type`; only the "motion" and
 * "interaction" variants carry typed animation metadata.
 */
export type AICharacterEventListenerType = (
  e:
    | {
        type: "motion";
        data: EmotionAnimationMetadataType;
      }
    | {
        type: "interaction";
        data: InteractionAnimationMetadataType;
      }
    | {
        type: "face";
        data: any;
      }
    | {
        type: "mouth";
        data: any;
      }
) => void;

/** Event names accepted by `addEventListener` / `removeEventListener`. */
export type AICharacterEventType = "change";

/**
 * Public surface of the AI character manager: emotion/animation control,
 * speech (`say`), LLM-backed replies (`respond`), voice capture (`listen`)
 * and puppeteering for a VRM character.
 */
export declare class AICharacterManager {
  private _voiceName;
  private _uid;
  private _currentEmotion;
  private _currentTargetEmotion;
  private _currentEmotionAnimationIntensity;
  private _currentFaceEmotionIntensity;
  private _currentEmotionAnimationName;
  private _chainManager;
  private _llmManager;
  private _asrManager;
  private _micManager;
  private _webCamManager;
  private _puppeteerManager;
  private _destructionManager;
  private _vrmManager;
  private _misc;
  private _renderController?;
  private _piperWarmupPromise;
  private _llmOpt;
  private _wLipSync;
  private _prevRandomIndex;

  /** The render controller passed to the constructor, if any. */
  get renderController(): AICharacterRenderController | undefined;
  get uid(): string;
  get vrmManager(): VRMManager;
  get chainManager(): ChainManager;
  get misc(): Misc;
  get llmManager(): LlmManager;
  get asrManager(): AsrManager;
  get micManager(): MicManager;
  /** The currently selected voice; one of the built-in voice names. */
  get voiceName():
    | "kareem" | "maria" | "huayan" | "jirka" | "lars" | "nathalie"
    | "louis" | "pim" | "ronnie" | "wataame-chibi" | "wataame-v1"
    | "wataame-v2" | "ruri" | "yui" | "olivia" | "lindsey" | "emma"
    | "kristin" | "norman" | "johnny" | "homer" | "gary" | "manny"
    | "noah" | "liam" | "william" | "ben" | "mason" | "bryce" | "ryan"
    | "alba" | "jenny" | "meggan" | "emily" | "gertrude" | "bridget"
    | "molly" | "alan" | "greg" | "bob" | "james" | "john" | "gray"
    | "ahmad" | "ganji" | "amir" | "harri" | "chloe" | "jessica"
    | "pierre" | "natia" | "thorsten" | "daniela" | "isabella" | "diego"
    | "lucia" | "miguel" | "vidya" | "rahul" | "berta" | "ugla"
    | "steinn" | "salka" | "bui" | "paola" | "aigerim" | "raya"
    | "alikhan" | "nurlan" | "janis" | "eva" | "meera" | "arjun"
    | "sita" | "saraswati" | "laxmi" | "palmu" | "nisha" | "usha"
    | "saraswati-slow" | "chitwan" | "aksel" | "gosia" | "antoni"
    | "cadu" | "afonso" | "mihai" | "irina" | "dimitri" | "artur"
    | "anders" | "ahmet" | "lada" | "tetiana" | "mykyta" | "alys"
    | "arthur" | "teco";
  get currentFaceEmotionIntensity(): number;
  get currentEmotion(): string;
  set currentEmotion(emotion: string);
  /** Discrete animation intensity level (1, 2 or 3). */
  get currentEmotionAnimationIntensity(): 1 | 2 | 3;
  get currentEmotionName(): string;
  get currentTargetEmotion(): string;
  get puppeteerManager(): PuppeteerManager | undefined;

  /**
   * Built-in LLM configurations, keyed by a short identifier. Each entry has
   * the `model` / `dtype` / `pipeline` shape described on the `modelProvider`
   * parameter of {@link respond}.
   */
  static get llms(): {
    readonly "gemma3-1b": {
      readonly model: "gemma3-1b";
      readonly dtype: "int8";
      readonly pipeline: "text";
    };
    readonly "gemma3-270M": {
      readonly model: "gemma-3-270m-it-ONNX";
      readonly dtype: "fp16";
      readonly pipeline: "text";
    };
    readonly "Qwen2.5-0.5B-Instruct": {
      readonly model: "Qwen2.5-0.5B-Instruct";
      readonly dtype: "q4";
      readonly pipeline: "text";
    };
    readonly "mediapipe-gemma3n-E4B-it": {
      readonly model: "/aic-runtime-deps/llm-deps/dist-mediapipe-genai/gemma-3n-E4B-it-int4-Web.litertlm";
      readonly dtype: "int4";
      readonly pipeline: "mediapipe";
    };
  };

  /**
   * @param {VRMManager} vrmManager - The VRM manager driving the character.
   * @param {VoiceNames} [voiceName] - Optional initial voice.
   * @param {AICharacterRenderController} [renderController] - Optional render callbacks.
   * @param {{ provider?: ModelProvider }} [llmOpt] - Optional LLM options; `provider`
   *        selects the model (see {@link AICharacterManager.llms}).
   */
  constructor(
    vrmManager: VRMManager,
    voiceName?: VoiceNames,
    renderController?: AICharacterRenderController,
    llmOpt?: {
      provider?: ModelProvider;
    }
  );

  /**
   * Initializes in an interactive context. You **need** to call this
   * on a **user interaction** so that audio can play. You can thank
   * Safari for having to do this.
   *
   * @returns A promise resolving to an object exposing `listen`, `render`,
   *          `disconnect` and `destroy`; `render` yields a duration plus
   *          `aa` / `ee` / `ih` / `oh` / `ou` values.
   */
  initInteractive(): Promise<{
    listen: (factory: (context: AudioContext) => AudioNode) => void;
    render: (
      normalizationOptions:
        | {
            scale: number;
          }
        | undefined
    ) => {
      duration: number;
      aa: number;
      ee: number;
      ih: number;
      oh: number;
      ou: number;
    };
    disconnect: () => void;
    destroy: () => void;
  }>;

  private ensureWLipSync;
  private _eventListeners;

  /** Registers a listener for the given event type (currently only "change"). */
  addEventListener(
    type: AICharacterEventType,
    listener: AICharacterEventListenerType
  ): void;

  /** Removes a listener previously registered with {@link addEventListener}. */
  removeEventListener(
    type: AICharacterEventType,
    listener: AICharacterEventListenerType
  ): void;

  /** Callback invoked with a motion, face or mouth expression update. */
  _onExpressionUpdate: (
    e: MotionExpression<any> | FaceExpression<any> | MouthExpression<any>
  ) => void;

  /**
   * Sets the emotion of the VRM character by applying a facial and motion expression
   * corresponding to the provided emotion and intensity. If an invalid emotion is
   * provided, the emotion will default to "neutral".
   *
   * @param {string} newEmotion - The emotion to be applied. It should be one of the following values:
   * 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
   * 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
   * 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
   * 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
   * 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'.
   * @param {number} intensity - The intensity of the emotion, a value between 0 and 1.
   * Higher values represent stronger expressions.
   *
   * @param {FaceExpression[]} faceExpressions - An override if you want to customize the face expressions
   * @param {MotionExpression[]} motionExpressions - An override if you want to customize the motion expressions
   *
   * @example
   * setEmotion('joy', 0.8);
   * setEmotion('unknownEmotion', 0.5); // Will default to "neutral"
   */
  setEmotion(
    newEmotion: string,
    intensity?: number,
    faceExpressions?: FaceExpression[],
    motionExpressions?: MotionExpression<EmotionAnimationMetadataType>[]
  ): Promise<void>;

  /**
   * Sets the intensity of the current emotion for the VRM character.
   *
   * @param {number} intensity - The new intensity for the current emotion, a value between 0 and 1 or 1 and 3.
   * Higher values represent stronger expressions.
   *
   * @example
   * character.setEmotionIntensity(0.9);
   */
  setEmotionIntensity(intensity: number): Promise<void>;

  private _streamCleanup?;

  /**
   * Starts a puppeteering session with the selected backend.
   *
   * @param opt - `backend` selects one of "ik", "profile" or "ik2".
   * @returns A promise resolving to a `snapshot` map (keyed by emotion name or
   *          an `la__…` / `bo__…` prefixed key, each mapping to a callback) and
   *          a `lifecycle` object whose `end()` presumably terminates the
   *          session — TODO confirm.
   */
  puppeteer(opt: { backend: "ik" | "profile" | "ik2" }): Promise<{
    snapshot: Map<
      | "neutral"
      | "admiration"
      | "amusement"
      | "anger"
      | "annoyance"
      | "approval"
      | "caring"
      | "confusion"
      | "curiosity"
      | "desire"
      | "disappointment"
      | "disapproval"
      | "disgust"
      | "embarrassment"
      | "excitement"
      | "fear"
      | "gratitude"
      | "grief"
      | "joy"
      | "love"
      | "nervousness"
      | "optimism"
      | "pride"
      | "realization"
      | "relief"
      | "remorse"
      | "sadness"
      | "surprise"
      | `la__${string}`
      | `la__${number}`
      | `bo__${string}`
      | `bo__${number}`,
      () => void
    >;
    lifecycle: {
      end: () => void;
    };
  }>;

  /**
   * Start voice capture. In continuous mode we transcribe each finalized mic
   * utterance (one-shot per utterance) and emit `final transcript`.
   * Single-shot remains: capture once, transcribe once, return the result.
   */
  listen(
    options?: ListenOptions,
    onEvent?: (
      e:
        | ListenEvent
        | {
            type: "final transcript";
            transcript: string;
          }
    ) => void
  ): Promise<
    | void
    | (ListenResult & {
        transcript: string;
      })
  >;

  /** Stop an active continuous listen (or abort single-shot early). */
  stopListening(): Promise<void>;

  private _generatePiperData;

  /**
   * Says the provided text.
   * Emotions and expressions are inferred from the text.
   *
   * @param {string} text - The text to be spoken.
   * @param {VoiceNames} [voiceName] - The name of the voice to be used.
   * Will default to using the voiceName provided in the props if provided,
   * else it will use the default voice.
   * @param {(progress: number) => void} [onProgress] - A callback function that will be called with the progress of the speech synthesis.
   * @param {string} [emotionOverride] - An override if you want to customize the emotion.
   * It should be one of the following values:
   * 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
   * 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
   * 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
   * 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
   * 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'.
   * @param {number} [intensityOverride] - An override if you want to customize the intensity of the emotion.
   * Should be a value between 0 and 1.
   * Higher values represent stronger expressions.
   * @param {FaceExpression[]} [faceExpressionsOverride] - An override if you want to customize the face expressions
   * @param {MotionExpression<EmotionAnimationMetadataType>[]} [motionExpressionsOverride] - An override if you want to customize the motion expressions
   * @param {string | AudioNode | MediaStream} [audioOverride] - Optional audio URL to use instead of generating TTS. When provided,
   *                                  the text will still be used for emotion inference and lip sync
   *                                  generation unless realtimeLipSync is enabled.
   *
   *                                  If the audioOverride is an AudioNode or a MediaStream instead, it will be treated as a live feed
   *                                  that you handle externally.
   *
   *                                  While audio playback will be taken care of for string urls, you'll need to manage AudioNode
   *                                  and MediaStream playback yourself.
   * @param {boolean} [realtimeLipSync] - Controls lip sync generation. When true, lip sync is generated
   *                                     on the fly based on the audio. When false or when using the
   *                                     internal TTS system, lip sync is generated from the text.
   *                                     For custom audio, realtime lip sync is recommended.
   * @param {boolean} [normalizeAudioOverride] - Scales up the generated lip sync when the audio is quiet.
   *                                     This is useful for microphone input if you want to avoid
   *                                     having to scream.
   * @param {"sentiment" | "distilbert"} [emotionInferrenceType] - The type of inference used to derive
   *                                     the emotion. "distilbert" is more accurate, but will have to
   *                                     download the ~50MB model first. "sentiment" does basic
   *                                     sentiment analysis, so it's less accurate but doesn't require
   *                                     an additional download.
   * @param {number} [emotionIntensityMultiplier] - Presumably a factor applied to the inferred
   *                                     emotion intensity — TODO confirm against the implementation.
   * @returns {Promise<{ data: object; audio: AudioNode | HTMLAudioElement; audioEndPromise: Promise<void> }>}
   * Resolves with an object containing:
   * - `data`: TTS artifacts/metadata (file, inferred expressions, duration, phonemes,
   *   phoneme ids, offsets, PCM samples and sample rate)
   * - `audio`: The audio element or audio node associated with the speech
   * - `audioEndPromise`: A promise that presumably settles once playback has
   *   finished — TODO confirm
   *
   * @example
   * // Basic usage
   * await character.say("Hello world!");
   *
   * @example
   * // With custom voice and progress tracking
   * await character.say(
   *   "Welcome to our application!",
   *   "ruri",
   *   (progress) => console.log(`Progress: ${progress * 100}%`)
   * );
   *
   * @example
   * // With emotion override
   * await character.say(
   *   "I'm so excited to see you!",
   *   undefined,
   *   undefined,
   *   "excitement",
   *   0.8
   * );
   *
   * @example
   * // Using pre-generated audio with realtime lip sync
   * await character.say(
   *   "This is pre-recorded audio",
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   "https://example.com/audio.wav",
   *   true
   * );
   */
  say<T>(
    text: string,
    voiceName?: VoiceNames,
    onProgress?: (arg0: number) => void,
    emotionOverride?: string,
    intensityOverride?: number,
    faceExpressionsOverride?: FaceExpression<T>[],
    motionExpressionsOverride?: MotionExpression<EmotionAnimationMetadataType>[],
    audioOverride?: string | AudioNode | MediaStream,
    realtimeLipSync?: boolean,
    normalizeAudioOverride?: boolean,
    emotionInferrenceType?: "sentiment" | "distilbert",
    emotionIntensityMultiplier?: number
  ): Promise<{
    data: {
      file: string;
      expressions: Expressions;
      duration: number;
      input: string;
      kind: string;
      phonemes: string[];
      // `int` is not a TypeScript type; phoneme ids are plain numbers.
      phonemeIds: number[];
      startOffset: number;
      endOffset: number;
      pcm: Float32Array;
    } & {
      sampleRate: number;
    };
    audio:
      | AudioNode
      | MediaStreamAudioSourceNode
      | AudioBufferSourceNode
      | HTMLAudioElement;
    audioEndPromise: Promise<void>;
  }>;

  private respondChain;

  /**
   * Generates a spoken reply to the provided user text.
   *
   * This method first asks the LLM for a reply to `text`, then speaks that reply
   * using {@link say}. Emotions and expressions are inferred from the generated
   * reply unless you provide overrides.
   *
   * Flow:
   * 1. Calls `llmManager.generateResponse([{ role: "user", content: text }], 48)`
   *    to produce the reply text (48 token cap by default).
   * 2. Forwards the reply to {@link say} with the same voice and expression options.
   *
   * @param {string} text - The user's input that the character should respond to.
   * @param {string} [context] - Optional conversation context to pass along to the LLM.
   * @param {VoiceNames} [voiceName] - The voice to use. Defaults to the default voice if omitted.
   * @param {(progress: number) => void} [onProgress] - Callback for TTS generation
   *        progress (0..1), forwarded to {@link say}.
   * @param {string} [emotionOverride] - Optional emotion override.
   *        One of: 'love','joy','gratitude','caring','excitement','admiration',
   *        'optimism','pride','amusement','relief','approval','desire',
   *        'curiosity','surprise','realization','neutral','confusion',
   *        'embarrassment','nervousness','annoyance','disapproval','remorse',
   *        'fear','disappointment','sadness','anger','grief','disgust'.
   * @param {number} [intensityOverride] - Emotion intensity override (0..1).
   * @param {FaceExpression[]} [faceExpressionsOverride] - Optional face expression overrides.
   * @param {MotionExpression<EmotionAnimationMetadataType>[]} [motionExpressionsOverride]
   *        - Optional body/motion expression overrides.
   * @param {(sentence: string) => string | Promise<string>} [audioGenerator] - Optional custom
   *                                  audio generator function. When provided, this function
   *                                  will be called for each sentence to generate audio.
   *                                  Should return a URL to load the audio from (optimally
   *                                  a blob URL, but any URL is fine).
   * @param {boolean} [realtimeLipSync] - Controls lip sync generation. When true, lip sync
   *                                     is generated on the fly based on the audio. When false,
   *                                     lip sync is generated from the text. Note: when using
   *                                     a custom `audioGenerator`, realtime lip sync should
   *                                     typically be enabled since pre-generation is tuned
   *                                     for the internal TTS system.
   * @param {"sentiment" | "distilbert"} [emotionInferrenceType] - The type of inference to
   *                                     run on the generated output. "distilbert" is more
   *                                     accurate, but will have to download the ~50MB model
   *                                     first. "sentiment" does basic sentiment analysis
   *                                     and tries to infer the emotion based on sentiment
   *                                     alone, so it's less accurate but doesn't require
   *                                     an additional download.
   * @param {ModelProvider} [modelProvider] - Typically set in the constructor, but that can
   *                                     be a bit inaccessible for some use-cases, so this will
   *                                     let you provide the model to use. You can grab one of
   *                                     the built-in ones from the AICharacterManager.llms
   * ```ts
   * {
   *   model: string; // local path relative to the dist folder
   *                  // in aic-runtime-deps/llm-deps
   *                  // or huggingface onnx identifier,
   *                  // e.g. onnx-community/gemma-3-1b-it-ONNX
   *                  // when running an onnx model.
   *                  // (it's relative to the worker if you're curious)
   *                  //
   *                  // for mediapipe-genai models, this is a
   *                  // local path relative to root, so "/model.litertlm"
   *                  // will load the model from 127.0.0.1:8080/model.litertlm
   *   dtype: DTYPE;  // "int8", "fp16", etc.
   *   pipeline: "text" | "auto" | "mediapipe"; // "text" and "auto" are for onnx,
   *                                            // "mediapipe" is for litertlm
   * }
   * ```
   * @param {number} [emotionIntensityMultiplier] - Presumably a factor applied to the inferred
   *                                     emotion intensity — TODO confirm against the implementation.
   * @param {Message[]} [chatHistory] - Optional prior messages; presumably prepended to the
   *                                     conversation sent to the LLM — TODO confirm.
   *
   * @returns {Promise<{ text: string; chatHistory: Message[]; responsePromise: Promise<void> }>}
   * Resolves with an object containing:
   * - `text`: a string, presumably the reply text generated by the LLM — TODO confirm.
   * - `chatHistory`: a list of messages, presumably the updated conversation — TODO confirm.
   * - `responsePromise`: a promise that presumably settles once the reply has
   *   been fully spoken — TODO confirm.
   *
   * @example
   * // Simple usage
   * await character.respond("Hey! What's the plan for today?");
   *
   * @example
   * // Choose a specific voice and receive progress callbacks
   * await character.respond(
   *   "Can you summarize the latest notes?",
   *   undefined, // context
   *   "ruri",
   *   (p) => progressBar.value = p
   * );
   *
   * @example
   * // Override emotion and intensity
   * await character.respond(
   *   "That's amazing news!",
   *   undefined, // context
   *   "ruri",
   *   undefined,
   *   "excitement",
   *   0.9
   * );
   *
   * @example
   * // Use custom audio generator with realtime lip sync
   * await character.respond(
   *   "Hello there!",
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   undefined,
   *   async (sentence) => {
   *     // Generate custom audio and return blob URL
   *     const audioBlob = await myCustomTTS(sentence);
   *     return URL.createObjectURL(audioBlob);
   *   },
   *   true // Enable realtime lip sync for custom audio
   * );
   *
   * @example
   * // If you need the raw LLM text (before TTS), call the LLM directly:
   * const reply = await character.llmManager.generateResponse(
   *   [{ role: "user", content: "Tell me a joke about space." }],
   *   48
   * );
   * // …then speak it:
   * await character.say(reply);
   */
  respond(
    text: string,
    context?: string,
    voiceName?: VoiceNames,
    onProgress?: (arg0: number) => void,
    emotionOverride?: string,
    intensityOverride?: number,
    faceExpressionsOverride?: FaceExpression[],
    motionExpressionsOverride?: MotionExpression<EmotionAnimationMetadataType>[],
    audioGenerator?: (sentence: string) => string | Promise<string>,
    realtimeLipSync?: boolean,
    emotionInferrenceType?: "sentiment" | "distilbert",
    modelProvider?: ModelProvider,
    emotionIntensityMultiplier?: number,
    chatHistory?: Message[]
  ): Promise<{
    text: string;
    chatHistory: Message[];
    responsePromise: Promise<void>;
  }>;

  /** Presumably stops ongoing speech/response activity — TODO confirm. */
  stop(): Promise<void>;

  /** Resolves the chain of motion expressions for an emotion and intensity. */
  _getEmotionMotionChain(
    emotion: string,
    intensity: number
  ): Promise<MotionExpression<EmotionAnimationMetadataType>[]>;

  /** Maps an emotion name to another emotion name (the "nearest" one). */
  _getNearestEmotion(emotion: string): string;

  /**
   * Picks an animation for the given emotion and discrete intensity level,
   * optionally restricted to a "Gesture" or "Loop" motion type.
   */
  _getClosestViableAnimation(
    emotion: string,
    intensity: 1 | 2 | 3,
    motionType?: "Gesture" | "Loop",
    isFallback?: boolean
  ): EmotionAnimationMetadataType;

  /**
   * Sets the animation of the VRM character by applying a motion animation
   * corresponding to the provided emotion, intensity and name. If an invalid emotion is
   * provided, the emotion will default to "neutral".
   *
   * @param {string} animationEmotion - The emotion to be applied. It should be one of the following values:
   * 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
   * 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
   * 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
   * 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
   * 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'.
   * @param {number} animationIntensity - The intensity of the emotion, a value between 0 and 1.
   * Higher values represent stronger expressions.
   * @param {string} newAnimationName - The name of the animation to be applied.
   */
  _setAnimation(
    animationEmotion: string,
    animationIntensity: number,
    newAnimationName: string
  ): Promise<void>;

  /**
   * Gets viable emotion animations
   * @param {string} emotion one of these strings: 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
   * 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
   * 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
   * 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
   * 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'
   * @param {number} intensity number between 0 and 1
   *
   * @returns {EmotionAnimationMetadataType[] | undefined} an array of viable emotion animations
   */
  _getExactViableEmotionAnimations(
    emotion: string,
    intensity: number
  ): EmotionAnimationMetadataType[] | undefined;

  /**
   * Normalizes intensity
   * @param {number} intensity a number between 0 and 1 or 1 and 3
   * @returns {1 | 2 | 3} 1, 2 or 3. Useful for selecting the correct animation
   * based on intensity
   */
  _normalizeIntensity(intensity: number): 1 | 2 | 3;

  /** Presumably releases the resources held by this manager — TODO confirm. */
  destroy(): Promise<void>;

  /**
   * Per-tick update hook.
   * @param {number} delta - presumably the time elapsed since the previous update — TODO confirm units.
   */
  update(delta: number): void;
}
//# sourceMappingURL=AICharacterManager.d.ts.map