aivmlib-web
Version:
Aivis Voice Model File (.aivm/.aivmx) Utility Library for Web
400 lines (399 loc) • 17.1 kB
TypeScript
import { z } from 'zod';
/**
* Static TypeScript type of Style-Bert-VITS2 hyperparameters,
* inferred via z.infer from StyleBertVITS2HyperParametersSchema.
*/
export type StyleBertVITS2HyperParameters = z.infer<typeof StyleBertVITS2HyperParametersSchema>;
/**
* Schema of the Style-Bert-VITS2 hyperparameters.
* Which detailed parameters are present differs depending on when the trained model was created,
* so every parameter other than those the implementation strictly requires is declared optional.
* The definition below is the Style-Bert-VITS2 v2.4.1 hyperparameter schema adapted for TypeScript.
* ref: https://github.com/litagin02/Style-Bert-VITS2/blob/2.4.1/style_bert_vits2/models/hyper_parameters.py
*/
export declare const StyleBertVITS2HyperParametersSchema: z.ZodObject<{
model_name: z.ZodString;
version: z.ZodString;
// Training settings: every field is optional.
train: z.ZodObject<{
log_interval: z.ZodOptional<z.ZodNumber>;
eval_interval: z.ZodOptional<z.ZodNumber>;
seed: z.ZodOptional<z.ZodNumber>;
epochs: z.ZodOptional<z.ZodNumber>;
learning_rate: z.ZodOptional<z.ZodNumber>;
betas: z.ZodOptional<z.ZodTuple<[z.ZodNumber, z.ZodNumber], null>>;
eps: z.ZodOptional<z.ZodNumber>;
batch_size: z.ZodOptional<z.ZodNumber>;
bf16_run: z.ZodOptional<z.ZodBoolean>;
fp16_run: z.ZodOptional<z.ZodBoolean>;
lr_decay: z.ZodOptional<z.ZodNumber>;
segment_size: z.ZodOptional<z.ZodNumber>;
init_lr_ratio: z.ZodOptional<z.ZodNumber>;
warmup_epochs: z.ZodOptional<z.ZodNumber>;
c_mel: z.ZodOptional<z.ZodNumber>;
c_kl: z.ZodOptional<z.ZodNumber>;
c_commit: z.ZodOptional<z.ZodNumber>;
skip_optimizer: z.ZodOptional<z.ZodBoolean>;
freeze_ZH_bert: z.ZodOptional<z.ZodBoolean>;
freeze_JP_bert: z.ZodOptional<z.ZodBoolean>;
freeze_EN_bert: z.ZodOptional<z.ZodBoolean>;
freeze_emo: z.ZodOptional<z.ZodBoolean>;
freeze_style: z.ZodOptional<z.ZodBoolean>;
freeze_decoder: z.ZodOptional<z.ZodBoolean>;
}, "strip", z.ZodTypeAny, {
log_interval?: number | undefined;
eval_interval?: number | undefined;
seed?: number | undefined;
epochs?: number | undefined;
learning_rate?: number | undefined;
betas?: [number, number] | undefined;
eps?: number | undefined;
batch_size?: number | undefined;
bf16_run?: boolean | undefined;
fp16_run?: boolean | undefined;
lr_decay?: number | undefined;
segment_size?: number | undefined;
init_lr_ratio?: number | undefined;
warmup_epochs?: number | undefined;
c_mel?: number | undefined;
c_kl?: number | undefined;
c_commit?: number | undefined;
skip_optimizer?: boolean | undefined;
freeze_ZH_bert?: boolean | undefined;
freeze_JP_bert?: boolean | undefined;
freeze_EN_bert?: boolean | undefined;
freeze_emo?: boolean | undefined;
freeze_style?: boolean | undefined;
freeze_decoder?: boolean | undefined;
}, {
log_interval?: number | undefined;
eval_interval?: number | undefined;
seed?: number | undefined;
epochs?: number | undefined;
learning_rate?: number | undefined;
betas?: [number, number] | undefined;
eps?: number | undefined;
batch_size?: number | undefined;
bf16_run?: boolean | undefined;
fp16_run?: boolean | undefined;
lr_decay?: number | undefined;
segment_size?: number | undefined;
init_lr_ratio?: number | undefined;
warmup_epochs?: number | undefined;
c_mel?: number | undefined;
c_kl?: number | undefined;
c_commit?: number | undefined;
skip_optimizer?: boolean | undefined;
freeze_ZH_bert?: boolean | undefined;
freeze_JP_bert?: boolean | undefined;
freeze_EN_bert?: boolean | undefined;
freeze_emo?: boolean | undefined;
freeze_style?: boolean | undefined;
freeze_decoder?: boolean | undefined;
}>;
// Data settings: n_speakers, spk2id, num_styles and style2id are required; every other field is optional.
data: z.ZodObject<{
use_jp_extra: z.ZodOptional<z.ZodBoolean>;
training_files: z.ZodOptional<z.ZodString>;
validation_files: z.ZodOptional<z.ZodString>;
max_wav_value: z.ZodOptional<z.ZodNumber>;
sampling_rate: z.ZodOptional<z.ZodNumber>;
filter_length: z.ZodOptional<z.ZodNumber>;
hop_length: z.ZodOptional<z.ZodNumber>;
win_length: z.ZodOptional<z.ZodNumber>;
n_mel_channels: z.ZodOptional<z.ZodNumber>;
mel_fmin: z.ZodOptional<z.ZodNumber>;
// Unlike the other numeric fields, mel_fmax may also be null.
mel_fmax: z.ZodOptional<z.ZodNullable<z.ZodNumber>>;
add_blank: z.ZodOptional<z.ZodBoolean>;
n_speakers: z.ZodNumber;
cleaned_text: z.ZodOptional<z.ZodBoolean>;
spk2id: z.ZodRecord<z.ZodString, z.ZodNumber>;
num_styles: z.ZodNumber;
style2id: z.ZodRecord<z.ZodString, z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
n_speakers: number;
spk2id: Record<string, number>;
num_styles: number;
style2id: Record<string, number>;
use_jp_extra?: boolean | undefined;
training_files?: string | undefined;
validation_files?: string | undefined;
max_wav_value?: number | undefined;
sampling_rate?: number | undefined;
filter_length?: number | undefined;
hop_length?: number | undefined;
win_length?: number | undefined;
n_mel_channels?: number | undefined;
mel_fmin?: number | undefined;
mel_fmax?: number | null | undefined;
add_blank?: boolean | undefined;
cleaned_text?: boolean | undefined;
}, {
n_speakers: number;
spk2id: Record<string, number>;
num_styles: number;
style2id: Record<string, number>;
use_jp_extra?: boolean | undefined;
training_files?: string | undefined;
validation_files?: string | undefined;
max_wav_value?: number | undefined;
sampling_rate?: number | undefined;
filter_length?: number | undefined;
hop_length?: number | undefined;
win_length?: number | undefined;
n_mel_channels?: number | undefined;
mel_fmin?: number | undefined;
mel_fmax?: number | null | undefined;
add_blank?: boolean | undefined;
cleaned_text?: boolean | undefined;
}>;
// Model settings: every field is optional, including the nested slm object and each of its fields.
model: z.ZodObject<{
use_spk_conditioned_encoder: z.ZodOptional<z.ZodBoolean>;
use_noise_scaled_mas: z.ZodOptional<z.ZodBoolean>;
use_mel_posterior_encoder: z.ZodOptional<z.ZodBoolean>;
use_duration_discriminator: z.ZodOptional<z.ZodBoolean>;
use_wavlm_discriminator: z.ZodOptional<z.ZodBoolean>;
inter_channels: z.ZodOptional<z.ZodNumber>;
hidden_channels: z.ZodOptional<z.ZodNumber>;
filter_channels: z.ZodOptional<z.ZodNumber>;
n_heads: z.ZodOptional<z.ZodNumber>;
n_layers: z.ZodOptional<z.ZodNumber>;
kernel_size: z.ZodOptional<z.ZodNumber>;
p_dropout: z.ZodOptional<z.ZodNumber>;
resblock: z.ZodOptional<z.ZodString>;
resblock_kernel_sizes: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
resblock_dilation_sizes: z.ZodOptional<z.ZodArray<z.ZodArray<z.ZodNumber, "many">, "many">>;
upsample_rates: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
upsample_initial_channel: z.ZodOptional<z.ZodNumber>;
upsample_kernel_sizes: z.ZodOptional<z.ZodArray<z.ZodNumber, "many">>;
n_layers_q: z.ZodOptional<z.ZodNumber>;
use_spectral_norm: z.ZodOptional<z.ZodBoolean>;
gin_channels: z.ZodOptional<z.ZodNumber>;
slm: z.ZodOptional<z.ZodObject<{
model: z.ZodOptional<z.ZodString>;
sr: z.ZodOptional<z.ZodNumber>;
hidden: z.ZodOptional<z.ZodNumber>;
nlayers: z.ZodOptional<z.ZodNumber>;
initial_channel: z.ZodOptional<z.ZodNumber>;
}, "strip", z.ZodTypeAny, {
model?: string | undefined;
sr?: number | undefined;
hidden?: number | undefined;
nlayers?: number | undefined;
initial_channel?: number | undefined;
}, {
model?: string | undefined;
sr?: number | undefined;
hidden?: number | undefined;
nlayers?: number | undefined;
initial_channel?: number | undefined;
}>>;
}, "strip", z.ZodTypeAny, {
use_spk_conditioned_encoder?: boolean | undefined;
use_noise_scaled_mas?: boolean | undefined;
use_mel_posterior_encoder?: boolean | undefined;
use_duration_discriminator?: boolean | undefined;
use_wavlm_discriminator?: boolean | undefined;
inter_channels?: number | undefined;
hidden_channels?: number | undefined;
filter_channels?: number | undefined;
n_heads?: number | undefined;
n_layers?: number | undefined;
kernel_size?: number | undefined;
p_dropout?: number | undefined;
resblock?: string | undefined;
resblock_kernel_sizes?: number[] | undefined;
resblock_dilation_sizes?: number[][] | undefined;
upsample_rates?: number[] | undefined;
upsample_initial_channel?: number | undefined;
upsample_kernel_sizes?: number[] | undefined;
n_layers_q?: number | undefined;
use_spectral_norm?: boolean | undefined;
gin_channels?: number | undefined;
slm?: {
model?: string | undefined;
sr?: number | undefined;
hidden?: number | undefined;
nlayers?: number | undefined;
initial_channel?: number | undefined;
} | undefined;
}, {
use_spk_conditioned_encoder?: boolean | undefined;
use_noise_scaled_mas?: boolean | undefined;
use_mel_posterior_encoder?: boolean | undefined;
use_duration_discriminator?: boolean | undefined;
use_wavlm_discriminator?: boolean | undefined;
inter_channels?: number | undefined;
hidden_channels?: number | undefined;
filter_channels?: number | undefined;
n_heads?: number | undefined;
n_layers?: number | undefined;
kernel_size?: number | undefined;
p_dropout?: number | undefined;
resblock?: string | undefined;
resblock_kernel_sizes?: number[] | undefined;
resblock_dilation_sizes?: number[][] | undefined;
upsample_rates?: number[] | undefined;
upsample_initial_channel?: number | undefined;
upsample_kernel_sizes?: number[] | undefined;
n_layers_q?: number | undefined;
use_spectral_norm?: boolean | undefined;
gin_channels?: number | undefined;
slm?: {
model?: string | undefined;
sr?: number | undefined;
hidden?: number | undefined;
nlayers?: number | undefined;
initial_channel?: number | undefined;
} | undefined;
}>;
}, "strip", z.ZodTypeAny, {
// Output type argument of the top-level object schema (shape of a successfully parsed value).
model_name: string;
version: string;
train: {
log_interval?: number | undefined;
eval_interval?: number | undefined;
seed?: number | undefined;
epochs?: number | undefined;
learning_rate?: number | undefined;
betas?: [number, number] | undefined;
eps?: number | undefined;
batch_size?: number | undefined;
bf16_run?: boolean | undefined;
fp16_run?: boolean | undefined;
lr_decay?: number | undefined;
segment_size?: number | undefined;
init_lr_ratio?: number | undefined;
warmup_epochs?: number | undefined;
c_mel?: number | undefined;
c_kl?: number | undefined;
c_commit?: number | undefined;
skip_optimizer?: boolean | undefined;
freeze_ZH_bert?: boolean | undefined;
freeze_JP_bert?: boolean | undefined;
freeze_EN_bert?: boolean | undefined;
freeze_emo?: boolean | undefined;
freeze_style?: boolean | undefined;
freeze_decoder?: boolean | undefined;
};
data: {
n_speakers: number;
spk2id: Record<string, number>;
num_styles: number;
style2id: Record<string, number>;
use_jp_extra?: boolean | undefined;
training_files?: string | undefined;
validation_files?: string | undefined;
max_wav_value?: number | undefined;
sampling_rate?: number | undefined;
filter_length?: number | undefined;
hop_length?: number | undefined;
win_length?: number | undefined;
n_mel_channels?: number | undefined;
mel_fmin?: number | undefined;
mel_fmax?: number | null | undefined;
add_blank?: boolean | undefined;
cleaned_text?: boolean | undefined;
};
model: {
use_spk_conditioned_encoder?: boolean | undefined;
use_noise_scaled_mas?: boolean | undefined;
use_mel_posterior_encoder?: boolean | undefined;
use_duration_discriminator?: boolean | undefined;
use_wavlm_discriminator?: boolean | undefined;
inter_channels?: number | undefined;
hidden_channels?: number | undefined;
filter_channels?: number | undefined;
n_heads?: number | undefined;
n_layers?: number | undefined;
kernel_size?: number | undefined;
p_dropout?: number | undefined;
resblock?: string | undefined;
resblock_kernel_sizes?: number[] | undefined;
resblock_dilation_sizes?: number[][] | undefined;
upsample_rates?: number[] | undefined;
upsample_initial_channel?: number | undefined;
upsample_kernel_sizes?: number[] | undefined;
n_layers_q?: number | undefined;
use_spectral_norm?: boolean | undefined;
gin_channels?: number | undefined;
slm?: {
model?: string | undefined;
sr?: number | undefined;
hidden?: number | undefined;
nlayers?: number | undefined;
initial_channel?: number | undefined;
} | undefined;
};
}, {
// Input type argument of the top-level object schema (shape accepted before parsing).
model_name: string;
version: string;
train: {
log_interval?: number | undefined;
eval_interval?: number | undefined;
seed?: number | undefined;
epochs?: number | undefined;
learning_rate?: number | undefined;
betas?: [number, number] | undefined;
eps?: number | undefined;
batch_size?: number | undefined;
bf16_run?: boolean | undefined;
fp16_run?: boolean | undefined;
lr_decay?: number | undefined;
segment_size?: number | undefined;
init_lr_ratio?: number | undefined;
warmup_epochs?: number | undefined;
c_mel?: number | undefined;
c_kl?: number | undefined;
c_commit?: number | undefined;
skip_optimizer?: boolean | undefined;
freeze_ZH_bert?: boolean | undefined;
freeze_JP_bert?: boolean | undefined;
freeze_EN_bert?: boolean | undefined;
freeze_emo?: boolean | undefined;
freeze_style?: boolean | undefined;
freeze_decoder?: boolean | undefined;
};
data: {
n_speakers: number;
spk2id: Record<string, number>;
num_styles: number;
style2id: Record<string, number>;
use_jp_extra?: boolean | undefined;
training_files?: string | undefined;
validation_files?: string | undefined;
max_wav_value?: number | undefined;
sampling_rate?: number | undefined;
filter_length?: number | undefined;
hop_length?: number | undefined;
win_length?: number | undefined;
n_mel_channels?: number | undefined;
mel_fmin?: number | undefined;
mel_fmax?: number | null | undefined;
add_blank?: boolean | undefined;
cleaned_text?: boolean | undefined;
};
model: {
use_spk_conditioned_encoder?: boolean | undefined;
use_noise_scaled_mas?: boolean | undefined;
use_mel_posterior_encoder?: boolean | undefined;
use_duration_discriminator?: boolean | undefined;
use_wavlm_discriminator?: boolean | undefined;
inter_channels?: number | undefined;
hidden_channels?: number | undefined;
filter_channels?: number | undefined;
n_heads?: number | undefined;
n_layers?: number | undefined;
kernel_size?: number | undefined;
p_dropout?: number | undefined;
resblock?: string | undefined;
resblock_kernel_sizes?: number[] | undefined;
resblock_dilation_sizes?: number[][] | undefined;
upsample_rates?: number[] | undefined;
upsample_initial_channel?: number | undefined;
upsample_kernel_sizes?: number[] | undefined;
n_layers_q?: number | undefined;
use_spectral_norm?: boolean | undefined;
gin_channels?: number | undefined;
slm?: {
model?: string | undefined;
sr?: number | undefined;
hidden?: number | undefined;
nlayers?: number | undefined;
initial_channel?: number | undefined;
} | undefined;
};
}>;