/*
 * sandai-react
 * Version:
 * React components and utilities for the Sandai 3D AI Characters.
 * 442 lines • 24 kB
 * TypeScript
 */
import { MotionExpression, MouthExpression, FaceExpression, VRMManager } from "@davidcks/r3f-vrm";
import { Expressions } from "piper-wasm/expressions";
import { EmotionAnimationMetadataType } from "./repo/animations/emotions";
import { VoiceNames } from "./repo/voices";
import { InteractionAnimationMetadataType } from "./repo/animations/interactions";
import { ChainManager } from "./managers/ChainManager";
import { LlmManager } from "./managers/LlmManager";
import { AsrManager } from "./managers/AsrManager";
import { ListenEvent, ListenOptions, ListenResult, MicManager } from "./managers/MicManager";
import { PuppeteerManager } from "./managers/PuppeteerManager";
import { Misc } from "./misc/Misc";
import { Message, ModelProvider } from "./worker/types/types";
/**
 * Callback bundle that `AICharacterManager` accepts as its optional
 * `renderController` constructor argument and exposes through its
 * `renderController` getter.
 *
 * Besides the three named callbacks, the index signature allows any number of
 * additional string-keyed callbacks that take a single argument.
 */
export type AICharacterRenderController = {
/** Receives the URL of the VRM model as a string. */
setVrmUrl: (vrmUrl: string) => void;
/** Receives one of the `VoiceNames` values. */
setVoiceName: (voiceName: VoiceNames) => void;
/** Takes no arguments. NOTE(review): presumably fired after render props change — confirm against the implementation. */
onRenderPropsUpdate: () => void;
/** Open-ended extension point: any other key must map to a one-argument callback. */
[key: string]: (args: any) => void;
};
/**
 * String tags for event payload categories.
 *
 * NOTE(review): the payload union of `AICharacterEventListenerType` also
 * carries an `"interaction"` tag that is not listed here — confirm whether the
 * omission is intentional.
 */
export type AICharacterEventDataType = "motion" | "face" | "mouth";
/**
 * Signature of a listener registered through
 * `AICharacterManager.addEventListener`.
 *
 * The listener receives a single event object discriminated by its `type`
 * field: `"motion"` and `"interaction"` events carry typed animation metadata,
 * while `"face"` and `"mouth"` events carry an untyped (`any`) payload.
 */
export type AICharacterEventListenerType = (
    e:
        | { type: "motion"; data: EmotionAnimationMetadataType }
        | { type: "interaction"; data: InteractionAnimationMetadataType }
        | { type: "face"; data: any }
        | { type: "mouth"; data: any }
) => void;
/**
 * Event names accepted by `AICharacterManager.addEventListener` and
 * `AICharacterManager.removeEventListener`. Only `"change"` is declared.
 */
export type AICharacterEventType = "change";
/**
 * Public type surface of the AI character manager: speech (`say`), LLM-backed
 * replies (`respond`), voice capture (`listen`), emotion/animation control and
 * lifecycle (`initInteractive`, `update`, `stop`, `destroy`).
 *
 * This is a declaration only; behavior notes below are limited to what the
 * signatures and the original documentation state.
 */
export declare class AICharacterManager {
    private _voiceName;
    private _uid;
    private _currentEmotion;
    private _currentTargetEmotion;
    private _currentEmotionAnimationIntensity;
    private _currentFaceEmotionIntensity;
    private _currentEmotionAnimationName;
    private _chainManager;
    private _llmManager;
    private _asrManager;
    private _micManager;
    private _webCamManager;
    private _puppeteerManager;
    private _destructionManager;
    private _vrmManager;
    private _misc;
    private _renderController?;
    private _piperWarmupPromise;
    private _llmOpt;
    private _wLipSync;
    private _prevRandomIndex;
    // Read-only accessors for the collaborating managers and the current state.
    get renderController(): AICharacterRenderController | undefined;
    get uid(): string;
    get vrmManager(): VRMManager;
    get chainManager(): ChainManager;
    get misc(): Misc;
    get llmManager(): LlmManager;
    get asrManager(): AsrManager;
    get micManager(): MicManager;
    get voiceName(): "kareem" | "maria" | "huayan" | "jirka" | "lars" | "nathalie" | "louis" | "pim" | "ronnie" | "wataame-chibi" | "wataame-v1" | "wataame-v2" | "ruri" | "yui" | "olivia" | "lindsey" | "emma" | "kristin" | "norman" | "johnny" | "homer" | "gary" | "manny" | "noah" | "liam" | "william" | "ben" | "mason" | "bryce" | "ryan" | "alba" | "jenny" | "meggan" | "emily" | "gertrude" | "bridget" | "molly" | "alan" | "greg" | "bob" | "james" | "john" | "gray" | "ahmad" | "ganji" | "amir" | "harri" | "chloe" | "jessica" | "pierre" | "natia" | "thorsten" | "daniela" | "isabella" | "diego" | "lucia" | "miguel" | "vidya" | "rahul" | "berta" | "ugla" | "steinn" | "salka" | "bui" | "paola" | "aigerim" | "raya" | "alikhan" | "nurlan" | "janis" | "eva" | "meera" | "arjun" | "sita" | "saraswati" | "laxmi" | "palmu" | "nisha" | "usha" | "saraswati-slow" | "chitwan" | "aksel" | "gosia" | "antoni" | "cadu" | "afonso" | "mihai" | "irina" | "dimitri" | "artur" | "anders" | "ahmet" | "lada" | "tetiana" | "mykyta" | "alys" | "arthur" | "teco";
    get currentFaceEmotionIntensity(): number;
    get currentEmotion(): string;
    get currentEmotionAnimationIntensity(): 2 | 3 | 1;
    get currentEmotionName(): string;
    get currentTargetEmotion(): string;
    get puppeteerManager(): PuppeteerManager | undefined;
    /**
     * Built-in model presets, keyed by name. Any of them can be passed as the
     * `modelProvider` argument of {@link respond}.
     */
    static get llms(): {
        readonly "gemma3-1b": {
            readonly model: "gemma3-1b";
            readonly dtype: "int8";
            readonly pipeline: "text";
        };
        readonly "gemma3-270M": {
            readonly model: "gemma-3-270m-it-ONNX";
            readonly dtype: "fp16";
            readonly pipeline: "text";
        };
        readonly "Qwen2.5-0.5B-Instruct": {
            readonly model: "Qwen2.5-0.5B-Instruct";
            readonly dtype: "q4";
            readonly pipeline: "text";
        };
        readonly "mediapipe-gemma3n-E4B-it": {
            readonly model: "/aic-runtime-deps/llm-deps/dist-mediapipe-genai/gemma-3n-E4B-it-int4-Web.litertlm";
            readonly dtype: "int4";
            readonly pipeline: "mediapipe";
        };
    };
    set currentEmotion(emotion: string);
    /**
     * @param {VRMManager} vrmManager - The VRM manager for the character.
     * @param {VoiceNames} [voiceName] - The voice to use for speech.
     * @param {AICharacterRenderController} [renderController] - Optional render controller callbacks.
     * @param {{ provider?: ModelProvider }} [llmOpt] - Optional LLM options; `provider` selects the
     *                 model. It can also be supplied per call through the `modelProvider`
     *                 argument of {@link respond}.
     */
    constructor(vrmManager: VRMManager, voiceName?: VoiceNames, renderController?: AICharacterRenderController, llmOpt?: {
        provider?: ModelProvider;
    });
    /**
     * Initializes in an interactive context. You **need** to call this
     * on a **user interaction** so that audio can play. You can thank
     * Safari for having to do this.
     *
     * @returns A handle exposing `listen`, `render`, `disconnect` and `destroy`.
     *          NOTE(review): presumably the realtime lip-sync analyser — confirm
     *          against the implementation.
     */
    initInteractive(): Promise<{
        listen: (factory: (context: AudioContext) => AudioNode) => void;
        render: (normalizationOptions: {
            scale: number;
        } | undefined) => {
            duration: number;
            aa: number;
            ee: number;
            ih: number;
            oh: number;
            ou: number;
        };
        disconnect: () => void;
        destroy: () => void;
    }>;
    private ensureWLipSync;
    private _eventListeners;
    /**
     * Registers a listener for the given event type (currently only "change").
     */
    addEventListener(type: AICharacterEventType, listener: AICharacterEventListenerType): void;
    /**
     * Unregisters a listener for the given event type.
     */
    removeEventListener(type: AICharacterEventType, listener: AICharacterEventListenerType): void;
    /**
     * Callback that takes a motion, face or mouth expression.
     * NOTE(review): presumably invoked on expression updates and forwarded to
     * registered listeners — confirm against the implementation.
     */
    _onExpressionUpdate: (e: MotionExpression<any> | FaceExpression<any> | MouthExpression<any>) => void;
    /**
     * Sets the emotion of the VRM character by applying a facial and motion expression
     * corresponding to the provided emotion and intensity. If an invalid emotion is
     * provided, the emotion will default to "neutral".
     *
     * @param {string} newEmotion - The emotion to be applied. It should be one of the following values:
     *                 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
     *                 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
     *                 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
     *                 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
     *                 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'.
     * @param {number} intensity - The intensity of the emotion, a value between 0 and 1.
     *                 Higher values represent stronger expressions.
     *
     * @param {FaceExpression[]} faceExpressions - An override if you want to customize the face expressions
     * @param {MotionExpression[]} motionExpressions - An override if you want to customize the motion expressions
     *
     * @example
     * setEmotion('joy', 0.8);
     * setEmotion('unknownEmotion', 0.5); // Will default to "neutral"
     */
    setEmotion(newEmotion: string, intensity?: number, faceExpressions?: FaceExpression[], motionExpressions?: MotionExpression<EmotionAnimationMetadataType>[]): Promise<void>;
    /**
     * Sets the intensity of the current emotion for the VRM character.
     *
     * @param {number} intensity - The new intensity for the current emotion, a value between 0 and 1 or 1 and 3.
     *                 Higher values represent stronger expressions.
     *
     * @example
     * character.setEmotionIntensity(0.9);
     */
    setEmotionIntensity(intensity: number): Promise<void>;
    private _streamCleanup?;
    /**
     * Starts a puppeteering session with the chosen backend.
     *
     * @param opt - `backend` selects one of "ik", "profile" or "ik2".
     * @returns An object with a `snapshot` map (keyed by emotion name or by
     *          `la__…` / `bo__…` identifiers, each mapped to a zero-argument
     *          callback) and a `lifecycle.end()` function.
     *          NOTE(review): the meaning of the `la__` / `bo__` prefixes and of
     *          the callbacks is not visible here — confirm against the implementation.
     */
    puppeteer(opt: {
        backend: "ik" | "profile" | "ik2";
    }): Promise<{
        snapshot: Map<"neutral" | "admiration" | "amusement" | "anger" | "annoyance" | "approval" | "caring" | "confusion" | "curiosity" | "desire" | "disappointment" | "disapproval" | "disgust" | "embarrassment" | "excitement" | "fear" | "gratitude" | "grief" | "joy" | "love" | "nervousness" | "optimism" | "pride" | "realization" | "relief" | "remorse" | "sadness" | "surprise" | `la__${string}` | `la__${number}` | `bo__${string}` | `bo__${number}`, () => void>;
        lifecycle: {
            end: () => void;
        };
    }>;
    /**
     * Start voice capture. In continuous mode we transcribe each finalized mic
     * utterance (one-shot per utterance) and emit `final transcript`.
     * Single-shot remains: capture once, transcribe once, return the result.
     */
    listen(options?: ListenOptions, onEvent?: (e: ListenEvent | {
        type: "final transcript";
        transcript: string;
    }) => void): Promise<void | (ListenResult & {
        transcript: string;
    })>;
    /** Stop an active continuous listen (or abort single-shot early). */
    stopListening(): Promise<void>;
    private _generatePiperData;
    /**
     * Says the provided text.
     * Emotions and expressions are inferred from the text.
     *
     * @param {string} text - The text to be spoken.
     * @param {VoiceNames} [voiceName] - The name of the voice to be used.
     *                 Will default to using the voiceName provided in the props if provided,
     *                 else it will use the default voice.
     * @param {(progress: number) => void} [onProgress] - A callback function that will be called with the progress of the speech synthesis.
     * @param {string} [emotionOverride] - An override if you want to customize the emotion.
     *                 It should be one of the following values:
     *                 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
     *                 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
     *                 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
     *                 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
     *                 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'.
     * @param {number} [intensityOverride] - An override if you want to customize the intensity of the emotion.
     *                 Should be a value between 0 and 1.
     *                 Higher values represent stronger expressions.
     * @param {FaceExpression[]} [faceExpressionsOverride] - An override if you want to customize the face expressions
     * @param {MotionExpression<EmotionAnimationMetadataType>[]} [motionExpressionsOverride] - An override if you want to customize the motion expressions
     * @param {string | AudioNode | MediaStream} [audioOverride] - Optional audio URL to use instead of generating TTS. When provided,
     *                 the text will still be used for emotion inference and lip sync
     *                 generation unless realtimeLipSync is enabled.
     *
     *                 If the audioOverride is an AudioNode or a MediaStream instead, it will be treated as a live feed
     *                 that you handle externally.
     *
     *                 While audio playback will be taken care of for string urls, you'll need to manage AudioNode
     *                 and MediaStream playback yourself.
     * @param {boolean} [realtimeLipSync] - Controls lip sync generation. When true, lip sync is generated
     *                 on the fly based on the audio. When false or when using the
     *                 internal TTS system, lip sync is generated from the text.
     *                 For custom audio, realtime lip sync is recommended.
     * @param {boolean} [normalizeAudioOverride] - Scales up the generated lip-sync when the audio is
     *                 quiet. This is useful for microphone inputs if you want to avoid having to scream.
     * @param {"sentiment" | "distilbert"} [emotionInferrenceType] - The type of inference used to derive
     *                 the emotion from the text. "distilbert" is more accurate, but will have to
     *                 download the ~50MB model first. "sentiment" does basic sentiment analysis
     *                 and infers the emotion from sentiment alone, so it's less accurate but
     *                 doesn't require an additional download.
     * @param {number} [emotionIntensityMultiplier] - NOTE(review): presumably a factor applied to the
     *                 inferred emotion intensity — confirm range and default against the implementation.
     * @returns {Promise<{ data: object; audio: AudioNode | HTMLAudioElement; audioEndPromise: Promise<void> }>}
     *          Resolves with an object containing:
     *          - `data`: TTS artifacts/metadata (including inferred expressions)
     *          - `audio`: The audio source for the speech. Per the signature this is an
     *            HTMLAudioElement or a Web Audio node (AudioNode, MediaStreamAudioSourceNode,
     *            AudioBufferSourceNode); which one you get presumably depends on the audio
     *            path used — confirm against the implementation.
     *          - `audioEndPromise`: A promise that presumably resolves once playback has
     *            finished — confirm against the implementation.
     *
     * @example
     * // Basic usage
     * await character.say("Hello world!");
     *
     * @example
     * // With custom voice and progress tracking
     * await character.say(
     *   "Welcome to our application!",
     *   "ruri",
     *   (progress) => console.log(`Progress: ${progress * 100}%`)
     * );
     *
     * @example
     * // With emotion override
     * await character.say(
     *   "I'm so excited to see you!",
     *   undefined,
     *   undefined,
     *   "excitement",
     *   0.8
     * );
     *
     * @example
     * // Using pre-generated audio with realtime lip sync
     * await character.say(
     *   "This is pre-recorded audio",
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   "https://example.com/audio.wav",
     *   true
     * );
     */
    say<T>(text: string, voiceName?: VoiceNames, onProgress?: (arg0: number) => void, emotionOverride?: string, intensityOverride?: number, faceExpressionsOverride?: FaceExpression<T>[], motionExpressionsOverride?: MotionExpression<EmotionAnimationMetadataType>[], audioOverride?: string | AudioNode | MediaStream, realtimeLipSync?: boolean, normalizeAudioOverride?: boolean, emotionInferrenceType?: "sentiment" | "distilbert", emotionIntensityMultiplier?: number): Promise<{
        data: {
            file: string;
            expressions: Expressions;
            duration: number;
            input: string;
            kind: string;
            phonemes: string[];
            // `int` is not a TypeScript type; ids are plain numbers.
            phonemeIds: number[];
            startOffset: number;
            endOffset: number;
            pcm: Float32Array;
        } & {
            sampleRate: number;
        };
        audio: AudioNode | MediaStreamAudioSourceNode | AudioBufferSourceNode | HTMLAudioElement;
        audioEndPromise: Promise<void>;
    }>;
    private respondChain;
    /**
     * Generates a spoken reply to the provided user text.
     *
     * This method first asks the LLM for a reply to `text`, then speaks that reply
     * using {@link say}. Emotions and expressions are inferred from the generated
     * reply unless you provide overrides.
     *
     * Flow:
     * 1. Calls `llmManager.generateResponse([{ role: "user", content: text }], 48)`
     *    to produce the reply text (48 token cap by default).
     * 2. Forwards the reply to {@link say} with the same voice and expression options.
     *
     * @param {string} text - The user's input that the character should respond to.
     * @param {string} [context] - Optional conversation context to pass along to the LLM.
     * @param {VoiceNames} [voiceName] - The voice to use. Defaults to the default voice if omitted.
     * @param {(progress: number) => void} [onProgress] - Callback for TTS generation
     *                 progress (0..1), forwarded to {@link say}.
     * @param {string} [emotionOverride] - Optional emotion override.
     *                 One of: 'love','joy','gratitude','caring','excitement','admiration',
     *                 'optimism','pride','amusement','relief','approval','desire',
     *                 'curiosity','surprise','realization','neutral','confusion',
     *                 'embarrassment','nervousness','annoyance','disapproval','remorse',
     *                 'fear','disappointment','sadness','anger','grief','disgust'.
     * @param {number} [intensityOverride] - Emotion intensity override (0..1).
     * @param {FaceExpression[]} [faceExpressionsOverride] - Optional face expression overrides.
     * @param {MotionExpression<EmotionAnimationMetadataType>[]} [motionExpressionsOverride]
     *                 - Optional body/motion expression overrides.
     * @param {(sentence: string) => string | Promise<string>} [audioGenerator] - Optional custom
     *                 audio generator function. When provided, this function
     *                 will be called for each sentence to generate audio.
     *                 Should return a URL to load the audio from (optimally
     *                 a blob URL, but any URL is fine).
     * @param {boolean} [realtimeLipSync] - Controls lip sync generation. When true, lip sync
     *                 is generated on the fly based on the audio. When false,
     *                 lip sync is generated from the text. Note: when using
     *                 a custom `audioGenerator`, realtime lip sync should
     *                 typically be enabled since pre-generation is tuned
     *                 for the internal TTS system.
     * @param {"sentiment" | "distilbert"} [emotionInferrenceType] - The type of inference to
     *                 run on the generated output. "distilbert" is more
     *                 accurate, but will have to download the ~50MB model
     *                 first. "sentiment" does basic sentiment analysis
     *                 and tries to infer the emotion based on sentiment
     *                 alone, so it's less accurate but doesn't require
     *                 an additional download.
     * @param {ModelProvider} [modelProvider] - Typically set in the constructor, but that can
     *                 be a bit inaccessible for some use-cases, so this will
     *                 let you provide the model to use. You can grab one of
     *                 the built-in ones from the AICharacterManager.llms
     * ```ts
     * {
     *   model: string; // local path relative to the dist folder
     *                  // in aic-runtime-deps/llm-deps
     *                  // or huggingface onnx identifier,
     *                  // e.g. onnx-community/gemma-3-1b-it-ONNX
     *                  // when running an onnx model.
     *                  // (it's relative to the worker if you're curious)
     *                  //
     *                  // for mediapipe-genai models, this is a
     *                  // local path relative to root, so "/model.litertlm"
     *                  // will load the model from 127.0.0.1:8080/model.litertlm
     *   dtype: DTYPE;  // "int8", "fp16", etc.
     *   pipeline: "text" | "auto" | "mediapipe"; // "text" and "auto" are for onnx,
     *                                            // "mediapipe" is for litertlm
     * }
     * ```
     * @param {number} [emotionIntensityMultiplier] - NOTE(review): presumably a factor applied to the
     *                 inferred emotion intensity — confirm range and default against the implementation.
     * @param {Message[]} [chatHistory] - Prior conversation messages. NOTE(review): presumably
     *                 supplied to the LLM together with `text` — confirm against the implementation.
     *
     * @returns {Promise<{ text: string; chatHistory: Message[]; responsePromise: Promise<void> }>}
     *          Resolves with an object containing:
     *          - `text`: A string; presumably the reply generated by the LLM — confirm.
     *          - `chatHistory`: The conversation as `Message[]`; presumably including
     *            this exchange — confirm.
     *          - `responsePromise`: A promise that presumably settles once the reply has
     *            been spoken — confirm against the implementation.
     *
     * @example
     * // Simple usage
     * await character.respond("Hey! What's the plan for today?");
     *
     * @example
     * // Choose a specific voice and receive progress callbacks
     * await character.respond(
     *   "Can you summarize the latest notes?",
     *   undefined, // context
     *   "ruri",
     *   (p) => progressBar.value = p
     * );
     *
     * @example
     * // Override emotion and intensity
     * await character.respond(
     *   "That's amazing news!",
     *   undefined, // context
     *   undefined, // voiceName
     *   undefined, // onProgress
     *   "excitement",
     *   0.9
     * );
     *
     * @example
     * // Use custom audio generator with realtime lip sync
     * await character.respond(
     *   "Hello there!",
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   undefined,
     *   async (sentence) => {
     *     // Generate custom audio and return blob URL
     *     const audioBlob = await myCustomTTS(sentence);
     *     return URL.createObjectURL(audioBlob);
     *   },
     *   true // Enable realtime lip sync for custom audio
     * );
     *
     * @example
     * // If you need the raw LLM text (before TTS), call the LLM directly:
     * const reply = await character.llmManager.generateResponse(
     *   [{ role: "user", content: "Tell me a joke about space." }],
     *   48
     * );
     * // …then speak it:
     * await character.say(reply);
     */
    respond(text: string, context?: string, voiceName?: VoiceNames, onProgress?: (arg0: number) => void, emotionOverride?: string, intensityOverride?: number, faceExpressionsOverride?: FaceExpression[], motionExpressionsOverride?: MotionExpression<EmotionAnimationMetadataType>[], audioGenerator?: (sentence: string) => string | Promise<string>, realtimeLipSync?: boolean, emotionInferrenceType?: "sentiment" | "distilbert", modelProvider?: ModelProvider, emotionIntensityMultiplier?: number, chatHistory?: Message[]): Promise<{
        text: string;
        chatHistory: Message[];
        responsePromise: Promise<void>;
    }>;
    /**
     * NOTE(review): presumably stops ongoing speech/response activity — the exact
     * scope is not visible in this declaration; confirm against the implementation.
     */
    stop(): Promise<void>;
    /**
     * Resolves the motion expressions for an emotion at a given intensity.
     * The selection logic lives in the implementation.
     */
    _getEmotionMotionChain(emotion: string, intensity: number): Promise<MotionExpression<EmotionAnimationMetadataType>[]>;
    /**
     * Maps an emotion name to another emotion name.
     * NOTE(review): presumably the nearest emotion that has animations available — confirm.
     */
    _getNearestEmotion(emotion: string): string;
    /**
     * Returns a single animation metadata entry for the emotion at the given
     * normalized intensity (1, 2 or 3), optionally restricted to "Gesture" or "Loop".
     * NOTE(review): the effect of `isFallback` is not visible here — confirm.
     */
    _getClosestViableAnimation(emotion: string, intensity: 1 | 2 | 3, motionType?: "Gesture" | "Loop", isFallback?: boolean): EmotionAnimationMetadataType;
    /**
     * Sets the animation of the VRM character by applying a motion animation
     * corresponding to the provided emotion, intensity and name. If an invalid emotion is
     * provided, the emotion will default to "neutral".
     *
     * @param {string} animationEmotion - The emotion to be applied. It should be one of the following values:
     *                 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
     *                 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
     *                 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
     *                 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
     *                 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'.
     * @param {number} animationIntensity - The intensity of the emotion, a value between 0 and 1.
     *                 Higher values represent stronger expressions.
     * @param {string} newAnimationName - The name of the animation to be applied.
     */
    _setAnimation(animationEmotion: string, animationIntensity: number, newAnimationName: string): Promise<void>;
    /**
     * Gets viable emotion animations
     * @param {string} emotion one of these strings: 'love', 'joy', 'gratitude', 'caring', 'excitement', 'admiration',
     *                 'optimism', 'pride', 'amusement', 'relief', 'approval', 'desire',
     *                 'curiosity', 'surprise', 'realization', 'neutral', 'confusion',
     *                 'embarrassment', 'nervousness', 'annoyance', 'disapproval', 'remorse',
     *                 'fear', 'disappointment', 'sadness', 'anger', 'grief', 'disgust'
     * @param {number} intensity number between 0 and 1
     *
     * @returns {EmotionAnimationMetadataType[] | undefined} an array of viable emotion animations
     */
    _getExactViableEmotionAnimations(emotion: string, intensity: number): EmotionAnimationMetadataType[] | undefined;
    /**
     * Normalizes intensity
     * @param {number} intensity a number between 0 and 1 or 1 and 3
     * @returns {1 | 2 | 3} 1, 2 or 3. Useful for selecting the correct animation
     *          based on intensity
     */
    _normalizeIntensity(intensity: number): 1 | 2 | 3;
    /**
     * NOTE(review): presumably releases the resources held by this manager —
     * confirm against the implementation.
     */
    destroy(): Promise<void>;
    /**
     * NOTE(review): presumably the per-frame tick; the unit of `delta` is not
     * visible in this declaration — confirm against the implementation.
     */
    update(delta: number): void;
}
//# sourceMappingURL=AICharacterManager.d.ts.map