From 7ddf344f32ad1b6ec01b0b1f1df396d072fca782 Mon Sep 17 00:00:00 2001
From: indextts
Date: Sun, 7 Sep 2025 14:49:53 +0000
Subject: [PATCH] Upload config.yaml to ModelScope hub

---
# Reviewer notes. Free text between the "---" separator and the first
# "diff --git" line is commentary only; the payload below is unchanged.
#
# - Adds config.yaml as a new file (120 lines). Top-level keys: dataset, gpt,
#   semantic_codec, s2mel, gpt_checkpoint, w2v_stat, s2mel_checkpoint,
#   emo_matrix, spk_matrix, emo_num, qwen_emo_path, vocoder, version.
# - gpt.start_mel_token (8192) and gpt.stop_mel_token (8193) are the two ids
#   directly above semantic_codec.codebook_size (8192), and
#   gpt.number_mel_codes is 8194 = 8192 + 2. Presumably these must stay in
#   sync -- confirm against the loader before changing any one of them.
# - Two sample rates appear: dataset.sample_rate and dataset.mel.sample_rate
#   are 24000, while s2mel.preprocess_params.sr is 22050. The vocoder name
#   ("..._22khz_80band_256x") lines up with the s2mel values (sr 22050,
#   n_mels 80, hop_length 256). NOTE(review): looks intentional -- confirm.
# - s2mel.preprocess_params.spect_params.fmax is the quoted string "None",
#   not a YAML null. NOTE(review): presumably the consumer compares against
#   that string; verify before replacing it with null.
# - version is the unquoted scalar 2.0, which YAML loaders read as a float
#   rather than a string. NOTE(review): confirm the consumer expects a number.
# - content_codebook_size appears twice with different values (2048 under
#   length_regulator, 1024 under DiT). NOTE(review): confirm both are intended.

 config.yaml | 120 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)
 create mode 100644 config.yaml

diff --git a/config.yaml b/config.yaml
new file mode 100644
index 0000000..98d16c3
--- /dev/null
+++ b/config.yaml
@@ -0,0 +1,120 @@
+dataset:
+    bpe_model: bpe.model
+    sample_rate: 24000
+    squeeze: false
+    mel:
+        sample_rate: 24000
+        n_fft: 1024
+        hop_length: 256
+        win_length: 1024
+        n_mels: 100
+        mel_fmin: 0
+        normalize: false
+
+gpt:
+    model_dim: 1280
+    max_mel_tokens: 1815
+    max_text_tokens: 600
+    heads: 20
+    use_mel_codes_as_input: true
+    mel_length_compression: 1024
+    layers: 24
+    number_text_tokens: 12000
+    number_mel_codes: 8194
+    start_mel_token: 8192
+    stop_mel_token: 8193
+    start_text_token: 0
+    stop_text_token: 1
+    train_solo_embeddings: false
+    condition_type: "conformer_perceiver"
+    condition_module:
+        output_size: 512
+        linear_units: 2048
+        attention_heads: 8
+        num_blocks: 6
+        input_layer: "conv2d2"
+        perceiver_mult: 2
+    emo_condition_module:
+        output_size: 512
+        linear_units: 1024
+        attention_heads: 4
+        num_blocks: 4
+        input_layer: "conv2d2"
+        perceiver_mult: 2
+
+semantic_codec:
+    codebook_size: 8192
+    hidden_size: 1024
+    codebook_dim: 8
+    vocos_dim: 384
+    vocos_intermediate_dim: 2048
+    vocos_num_layers: 12
+
+s2mel:
+    preprocess_params:
+        sr: 22050
+        spect_params:
+            n_fft: 1024
+            win_length: 1024
+            hop_length: 256
+            n_mels: 80
+            fmin: 0
+            fmax: "None"
+
+    dit_type: "DiT"
+    reg_loss_type: "l1"
+    style_encoder:
+        dim: 192
+    length_regulator:
+        channels: 512
+        is_discrete: false
+        in_channels: 1024
+        content_codebook_size: 2048
+        sampling_ratios: [1, 1, 1, 1]
+        vector_quantize: false
+        n_codebooks: 1
+        quantizer_dropout: 0.0
+        f0_condition: false
+        n_f0_bins: 512
+    DiT:
+        hidden_dim: 512
+        num_heads: 8
+        depth: 13
+        class_dropout_prob: 0.1
+        block_size: 8192
+        in_channels: 80
+        style_condition: true
+        final_layer_type: 'wavenet'
+        target: 'mel'
+        content_dim: 512
+        content_codebook_size: 1024
+        content_type: 'discrete'
+        f0_condition: false
+        n_f0_bins: 512
+        content_codebooks: 1
+        is_causal: false
+        long_skip_connection: true
+        zero_prompt_speech_token: false
+        time_as_token: false
+        style_as_token: false
+        uvit_skip_connection: true
+        add_resblock_in_transformer: false
+    wavenet:
+        hidden_dim: 512
+        num_layers: 8
+        kernel_size: 5
+        dilation_rate: 1
+        p_dropout: 0.2
+        style_condition: true
+
+gpt_checkpoint: gpt.pth
+w2v_stat: wav2vec2bert_stats.pt
+s2mel_checkpoint: s2mel.pth
+emo_matrix: feat2.pt
+spk_matrix: feat1.pt
+emo_num: [3, 17, 2, 8, 4, 5, 10, 24]
+qwen_emo_path: qwen0.6bemo4-merge/
+vocoder:
+    type: "bigvgan"
+    name: "nvidia/bigvgan_v2_22khz_80band_256x"
+version: 2.0