@volley/recognition-client-sdk
Version:
Recognition Service TypeScript/Node.js Client SDK
102 lines (85 loc) • 4.69 kB
text/typescript
import { z } from "zod"
/**
* VGF-style state schema for game-side recognition state/results management.
*
* This schema provides a standardized way for game developers to manage
* voice recognition state and results in their applications. It supports:
*
* STEP 1: Basic transcription flow
* STEP 2: Mic auto-stop upon correct answer (using partial transcripts)
* STEP 3: Semantic/function-call outcomes for game actions
*
* Ideally this schema would live in a centralized shared type library (VGF? Platform SDK?)
* that also provides helper functions, so game developers do not have to maintain it themselves.
*/
// Zod object schema for the recognition state. Only `audioUtteranceId` is required;
// `pendingTranscript` falls back to "" when omitted, and every other field is optional.
// Status fields are validated as plain strings, so the values listed in their comments
// are a convention and are not enforced by this schema.
export const RecognitionVGFStateSchema = z.object({
// Core STT state
audioUtteranceId: z.string(),
startRecordingStatus: z.string().optional(), // "NOT_READY", "READY", "RECORDING", "FINISHED". States follow this order.
// Streaming should only start when "READY". Other states control mic UI and recording.
transcriptionStatus: z.string().optional(), // "NOT_STARTED", "IN_PROGRESS", "FINALIZED", "ABORTED", "ERROR"
finalTranscript: z.string().optional(), // Full finalized transcript for the utterance. Will not change.
finalConfidence: z.number().optional(), // NOTE(review): presumably the confidence score for finalTranscript; range not specified here - confirm.
// Tracking-only metadata
asrConfig: z.string().optional(), // ASR config serialized as a JSON string
startRecordingTimestamp: z.string().optional(), // Start of recording. Immutable after set.
finalRecordingTimestamp: z.string().optional(), // End of recording. Immutable after set. Transcription may still be in progress.
finalTranscriptionTimestamp: z.string().optional(), // When the final transcript was produced. Immutable after set.
// STEP 2: Support for mic auto-stop upon correct answer
pendingTranscript: z.string().optional().default(""), // Non-final transcript that may change (matches existing naming). Defaults to "" when omitted.
pendingConfidence: z.number().optional(), // NOTE(review): presumably the confidence score for pendingTranscript - confirm.
// STEP 3: Support for semantic/function-call outcomes
functionCallMetadata: z.string().optional(), // Function call metadata as a JSON string, e.g. '{"artist": true, "title": true}'
functionCallConfidence: z.number().optional(), // Confidence score for the function call.
finalFunctionCallTimestamp: z.string().optional(), // When the final action after interpreting the transcript was taken. Immutable.
// Support for prompt slot mapping - passed to recognition context when present
promptSlotMap: z.record(z.string(), z.array(z.string())).optional(), // Optional map of slot names to prompt values for recognition context
// Recognition action processing state - managed externally, SDK preserves but never modifies
recognitionActionProcessingState: z.string().optional(), // "NOT_STARTED", "IN_PROGRESS", "COMPLETED"
})
// Static type inferred from the schema above, so the type and the runtime validation stay in sync.
export type RecognitionState = z.infer<typeof RecognitionVGFStateSchema>
/**
 * Named constants for the recording status strings, giving callers
 * type safety and consistent spelling instead of raw string literals.
 *
 * The entries are listed in the order the recording lifecycle moves through them.
 */
export const RecordingStatus = {
  NOT_READY: "NOT_READY",
  READY: "READY",
  RECORDING: "RECORDING",
  FINISHED: "FINISHED",
} as const

/** Union of every string literal held by `RecordingStatus`. */
export type RecordingStatusType = (typeof RecordingStatus)[keyof typeof RecordingStatus]
/**
 * Named constants for the transcription status strings, giving callers
 * type safety and consistent spelling instead of raw string literals.
 */
export const TranscriptionStatus = {
  NOT_STARTED: "NOT_STARTED",
  IN_PROGRESS: "IN_PROGRESS",
  FINALIZED: "FINALIZED",
  // Session was cancelled/abandoned by user
  ABORTED: "ABORTED",
  ERROR: "ERROR",
} as const

/** Union of every string literal held by `TranscriptionStatus`. */
export type TranscriptionStatusType = (typeof TranscriptionStatus)[keyof typeof TranscriptionStatus]
/**
 * Named constants for the recognition action processing state strings,
 * giving callers type safety and consistent spelling instead of raw string literals.
 */
export const RecognitionActionProcessingState = {
  NOT_STARTED: "NOT_STARTED",
  IN_PROGRESS: "IN_PROGRESS",
  COMPLETED: "COMPLETED",
} as const

/** Union of every string literal held by `RecognitionActionProcessingState`. */
export type RecognitionActionProcessingStateType =
  (typeof RecognitionActionProcessingState)[keyof typeof RecognitionActionProcessingState]
/**
 * Builds the starting recognition state for a new utterance.
 *
 * The returned state has recording marked NOT_READY, transcription and
 * action processing marked NOT_STARTED, and an empty pending transcript.
 * All other fields are left unset.
 *
 * @param audioUtteranceId - Identifier stored on the state as `audioUtteranceId`.
 * @returns A freshly created state object.
 */
export function createInitialRecognitionState(audioUtteranceId: string): RecognitionState {
  const initialState: RecognitionState = {
    audioUtteranceId,
    startRecordingStatus: RecordingStatus.NOT_READY,
    transcriptionStatus: TranscriptionStatus.NOT_STARTED,
    pendingTranscript: "",
    recognitionActionProcessingState: RecognitionActionProcessingState.NOT_STARTED,
  }
  return initialState
}
/**
 * Checks whether a recording status may change from `from` to `to`.
 *
 * Statuses are ordered NOT_READY, READY, RECORDING, FINISHED, and a
 * transition is valid only when it moves strictly forward in that order
 * (skipping ahead is allowed; staying on the same status is not).
 *
 * @param from - Current status. `undefined`, an empty string, or an
 *   unrecognised value is treated as being before the first status.
 * @param to - Target status. An unrecognised value is never valid.
 * @returns `true` when `to` is a known status positioned after `from`.
 */
export function isValidRecordingStatusTransition(from: string | undefined, to: string): boolean {
  const lifecycle: readonly string[] = [
    RecordingStatus.NOT_READY,
    RecordingStatus.READY,
    RecordingStatus.RECORDING,
    RecordingStatus.FINISHED,
  ]

  const targetPosition = lifecycle.indexOf(to)
  if (targetPosition === -1) {
    // Unknown target status: reject outright.
    return false
  }

  // A missing or unrecognised starting status sits before every known one.
  const currentPosition = from ? lifecycle.indexOf(from) : -1
  return targetPosition > currentPosition
}