From d551729afac338c1006e4c7f2b11ae890cb552f2 Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Wed, 14 Dec 2022 16:34:44 +0800 Subject: [PATCH] update --- voices/voices.json | 8 + voices/zhibei_emo/am/ckpt/checkpoint_0.pth | 3 + voices/zhibei_emo/am/config.yaml | 105 ++++++++++ voices/zhibei_emo/am/pytorch_model.bin | 3 + voices/zhibei_emo/audio_config.yaml | 27 +++ voices/zhibei_emo/dict/emo_category_dict.txt | 33 +++ voices/zhibei_emo/dict/speaker_dict.txt | 6 + voices/zhibei_emo/dict/sy_dict.txt | 144 ++++++++++++++ voices/zhibei_emo/dict/syllable_flag_dict.txt | 5 + voices/zhibei_emo/dict/tone_dict.txt | 7 + voices/zhibei_emo/dict/word_segment_dict.txt | 5 + voices/zhibei_emo/voc/ckpt/checkpoint_0.pth | 3 + voices/zhibei_emo/voc/config.yaml | 188 ++++++++++++++++++ voices/zhibei_emo/vocoder/pytorch_model.bin | 3 + voices/zhitian_emo/am/ckpt/checkpoint_0.pth | 3 + voices/zhitian_emo/am/config.yaml | 105 ++++++++++ voices/zhitian_emo/am/pytorch_model.bin | 3 + voices/zhitian_emo/audio_config.yaml | 27 +++ voices/zhitian_emo/dict/emo_category_dict.txt | 33 +++ voices/zhitian_emo/dict/speaker_dict.txt | 6 + voices/zhitian_emo/dict/sy_dict.txt | 144 ++++++++++++++ .../zhitian_emo/dict/syllable_flag_dict.txt | 5 + voices/zhitian_emo/dict/tone_dict.txt | 7 + voices/zhitian_emo/dict/word_segment_dict.txt | 5 + voices/zhitian_emo/voc/ckpt/checkpoint_0.pth | 3 + voices/zhitian_emo/voc/config.yaml | 188 ++++++++++++++++++ voices/zhitian_emo/vocoder/pytorch_model.bin | 3 + voices/zhiyan_emo/am/ckpt/checkpoint_0.pth | 3 + voices/zhiyan_emo/am/config.yaml | 105 ++++++++++ voices/zhiyan_emo/am/pytorch_model.bin | 3 + voices/zhiyan_emo/audio_config.yaml | 27 +++ voices/zhiyan_emo/dict/emo_category_dict.txt | 33 +++ voices/zhiyan_emo/dict/speaker_dict.txt | 6 + voices/zhiyan_emo/dict/sy_dict.txt | 144 ++++++++++++++ voices/zhiyan_emo/dict/syllable_flag_dict.txt | 5 + voices/zhiyan_emo/dict/tone_dict.txt | 7 + voices/zhiyan_emo/dict/word_segment_dict.txt | 5 + 
voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth | 3 + voices/zhiyan_emo/voc/config.yaml | 188 ++++++++++++++++++ voices/zhiyan_emo/vocoder/pytorch_model.bin | 3 + voices/zhizhe_emo/am/ckpt/checkpoint_0.pth | 3 + voices/zhizhe_emo/am/config.yaml | 105 ++++++++++ voices/zhizhe_emo/am/pytorch_model.bin | 3 + voices/zhizhe_emo/audio_config.yaml | 27 +++ voices/zhizhe_emo/dict/emo_category_dict.txt | 33 +++ voices/zhizhe_emo/dict/speaker_dict.txt | 6 + voices/zhizhe_emo/dict/sy_dict.txt | 144 ++++++++++++++ voices/zhizhe_emo/dict/syllable_flag_dict.txt | 5 + voices/zhizhe_emo/dict/tone_dict.txt | 7 + voices/zhizhe_emo/dict/word_segment_dict.txt | 5 + voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth | 3 + voices/zhizhe_emo/voc/config.yaml | 188 ++++++++++++++++++ voices/zhizhe_emo/vocoder/pytorch_model.bin | 3 + 53 files changed, 2136 insertions(+) create mode 100644 voices/voices.json create mode 100644 voices/zhibei_emo/am/ckpt/checkpoint_0.pth create mode 100644 voices/zhibei_emo/am/config.yaml create mode 100644 voices/zhibei_emo/am/pytorch_model.bin create mode 100644 voices/zhibei_emo/audio_config.yaml create mode 100755 voices/zhibei_emo/dict/emo_category_dict.txt create mode 100755 voices/zhibei_emo/dict/speaker_dict.txt create mode 100755 voices/zhibei_emo/dict/sy_dict.txt create mode 100755 voices/zhibei_emo/dict/syllable_flag_dict.txt create mode 100755 voices/zhibei_emo/dict/tone_dict.txt create mode 100755 voices/zhibei_emo/dict/word_segment_dict.txt create mode 100644 voices/zhibei_emo/voc/ckpt/checkpoint_0.pth create mode 100644 voices/zhibei_emo/voc/config.yaml create mode 100644 voices/zhibei_emo/vocoder/pytorch_model.bin create mode 100644 voices/zhitian_emo/am/ckpt/checkpoint_0.pth create mode 100644 voices/zhitian_emo/am/config.yaml create mode 100644 voices/zhitian_emo/am/pytorch_model.bin create mode 100644 voices/zhitian_emo/audio_config.yaml create mode 100755 voices/zhitian_emo/dict/emo_category_dict.txt create mode 100755 
voices/zhitian_emo/dict/speaker_dict.txt create mode 100755 voices/zhitian_emo/dict/sy_dict.txt create mode 100755 voices/zhitian_emo/dict/syllable_flag_dict.txt create mode 100755 voices/zhitian_emo/dict/tone_dict.txt create mode 100755 voices/zhitian_emo/dict/word_segment_dict.txt create mode 100644 voices/zhitian_emo/voc/ckpt/checkpoint_0.pth create mode 100644 voices/zhitian_emo/voc/config.yaml create mode 100644 voices/zhitian_emo/vocoder/pytorch_model.bin create mode 100644 voices/zhiyan_emo/am/ckpt/checkpoint_0.pth create mode 100644 voices/zhiyan_emo/am/config.yaml create mode 100644 voices/zhiyan_emo/am/pytorch_model.bin create mode 100644 voices/zhiyan_emo/audio_config.yaml create mode 100755 voices/zhiyan_emo/dict/emo_category_dict.txt create mode 100755 voices/zhiyan_emo/dict/speaker_dict.txt create mode 100755 voices/zhiyan_emo/dict/sy_dict.txt create mode 100755 voices/zhiyan_emo/dict/syllable_flag_dict.txt create mode 100755 voices/zhiyan_emo/dict/tone_dict.txt create mode 100755 voices/zhiyan_emo/dict/word_segment_dict.txt create mode 100644 voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth create mode 100644 voices/zhiyan_emo/voc/config.yaml create mode 100644 voices/zhiyan_emo/vocoder/pytorch_model.bin create mode 100644 voices/zhizhe_emo/am/ckpt/checkpoint_0.pth create mode 100644 voices/zhizhe_emo/am/config.yaml create mode 100644 voices/zhizhe_emo/am/pytorch_model.bin create mode 100644 voices/zhizhe_emo/audio_config.yaml create mode 100755 voices/zhizhe_emo/dict/emo_category_dict.txt create mode 100755 voices/zhizhe_emo/dict/speaker_dict.txt create mode 100755 voices/zhizhe_emo/dict/sy_dict.txt create mode 100755 voices/zhizhe_emo/dict/syllable_flag_dict.txt create mode 100755 voices/zhizhe_emo/dict/tone_dict.txt create mode 100755 voices/zhizhe_emo/dict/word_segment_dict.txt create mode 100644 voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth create mode 100644 voices/zhizhe_emo/voc/config.yaml create mode 100644 
voices/zhizhe_emo/vocoder/pytorch_model.bin diff --git a/voices/voices.json b/voices/voices.json new file mode 100644 index 0000000..c43f58f --- /dev/null +++ b/voices/voices.json @@ -0,0 +1,8 @@ +{ + "voices": [ + "zhitian_emo", + "zhibei_emo", + "zhizhe_emo", + "zhiyan_emo" + ] +} diff --git a/voices/zhibei_emo/am/ckpt/checkpoint_0.pth b/voices/zhibei_emo/am/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..99349c5 --- /dev/null +++ b/voices/zhibei_emo/am/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd7fde59dac89ab5a5a076a8a74519953ceb1be67a3dabc24aad3d90b4521334 +size 49315631 diff --git a/voices/zhibei_emo/am/config.yaml b/voices/zhibei_emo/am/config.yaml new file mode 100644 index 0000000..56a980a --- /dev/null +++ b/voices/zhibei_emo/am/config.yaml @@ -0,0 +1,105 @@ +model_type: sambert +Model: +######################################################### +# SAMBERT NETWORK ARCHITECTURE SETTING # +######################################################### + KanTtsSAMBERT: + params: + max_len: 800 + + embedding_dim: 512 + encoder_num_layers: 8 + encoder_num_heads: 8 + encoder_num_units: 128 + encoder_ffn_inner_dim: 1024 + encoder_dropout: 0.1 + encoder_attention_dropout: 0.1 + encoder_relu_dropout: 0.1 + encoder_projection_units: 32 + + speaker_units: 32 + emotion_units: 32 + + predictor_filter_size: 41 + predictor_fsmn_num_layers: 3 + predictor_num_memory_units: 128 + predictor_ffn_inner_dim: 256 + predictor_dropout: 0.1 + predictor_shift: 0 + predictor_lstm_units: 128 + dur_pred_prenet_units: [128, 128] + dur_pred_lstm_units: 128 + + decoder_prenet_units: [256, 256] + decoder_num_layers: 12 + decoder_num_heads: 8 + decoder_num_units: 128 + decoder_ffn_inner_dim: 1024 + decoder_dropout: 0.1 + decoder_attention_dropout: 0.1 + decoder_relu_dropout: 0.1 + + outputs_per_step: 3 + num_mels: 80 + + postnet_filter_size: 41 + postnet_fsmn_num_layers: 4 + postnet_num_memory_units: 256 + postnet_ffn_inner_dim: 512 + 
postnet_dropout: 0.1 + postnet_shift: 17 + postnet_lstm_units: 128 + MAS: False + + optimizer: + type: Adam + params: + lr: 0.001 + betas: [0.9, 0.98] + eps: 1.0e-9 + weight_decay: 0.0 + scheduler: + type: NoamLR + params: + warmup_steps: 4000 + +linguistic_unit: + cleaners: english_cleaners + lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category + speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu +#################################################### +# LOSS SETTING # +#################################################### +Loss: + MelReconLoss: + enable: True + params: + loss_type: mae + + ProsodyReconLoss: + enable: True + params: + loss_type: mae + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 32 +pin_memory: False +num_workers: 4 # FIXME: set > 0 may stuck on macos +remove_short_samples: False +allow_cache: True +grad_norm: 1.0 + +########################################################### +# INTERVAL SETTING # +########################################################### +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhibei_emo/am/pytorch_model.bin b/voices/zhibei_emo/am/pytorch_model.bin new file mode 100644 index 0000000..1f37fcb --- /dev/null +++ b/voices/zhibei_emo/am/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8750fbfe39c3b18e8a024900f1e81ea07192fb520f25ff1bad7e9ea88ebc34c +size 49234411 diff --git a/voices/zhibei_emo/audio_config.yaml b/voices/zhibei_emo/audio_config.yaml new file mode 100644 index 0000000..233817c --- /dev/null +++ b/voices/zhibei_emo/audio_config.yaml @@ -0,0 +1,27 @@ +# Audio processing configs + +audio_config: + # Preprocess + wav_normalize: True + trim_silence: True + trim_silence_threshold_db: 60 + preemphasize: False + + # Feature extraction + sampling_rate: 16000 + hop_length: 200 + win_length: 1000 + n_fft: 2048 + n_mels: 80 + fmin: 0.0 + fmax: 8000.0 + phone_level_feature: True + + # Normalization + norm_type: "mean_std" # "mean_std" or "global" + max_norm: 1.0 + symmetric: False + min_level_db: -100.0 + ref_level_db: 20 + + num_workers: 16 diff --git a/voices/zhibei_emo/dict/emo_category_dict.txt b/voices/zhibei_emo/dict/emo_category_dict.txt new file mode 100755 index 0000000..dfd88e8 --- /dev/null +++ b/voices/zhibei_emo/dict/emo_category_dict.txt @@ -0,0 +1,33 @@ +emotion_none +emotion_neutral +emotion_angry +emotion_disgust +emotion_fear +emotion_happy +emotion_sad +emotion_surprise +emotion_calm +emotion_gentle +emotion_relax +emotion_lyrical +emotion_serious +emotion_disgruntled +emotion_satisfied +emotion_disappointed +emotion_excited +emotion_anxiety +emotion_jealousy +emotion_hate +emotion_pity +emotion_pleasure +emotion_arousal +emotion_dominance +emotion_placeholder1 +emotion_placeholder2 +emotion_placeholder3 +emotion_placeholder4 +emotion_placeholder5 +emotion_placeholder6 +emotion_placeholder7 +emotion_placeholder8 +emotion_placeholder9 \ No newline at end of file diff --git a/voices/zhibei_emo/dict/speaker_dict.txt b/voices/zhibei_emo/dict/speaker_dict.txt new 
file mode 100755 index 0000000..af0ca1d --- /dev/null +++ b/voices/zhibei_emo/dict/speaker_dict.txt @@ -0,0 +1,6 @@ +F7 +F74 +FBYN +FRXL +M7 +xiaoyu diff --git a/voices/zhibei_emo/dict/sy_dict.txt b/voices/zhibei_emo/dict/sy_dict.txt new file mode 100755 index 0000000..ec54511 --- /dev/null +++ b/voices/zhibei_emo/dict/sy_dict.txt @@ -0,0 +1,144 @@ +a_c +ai_c +an_c +ang_c +ao_c +b_c +c_c +ch_c +d_c +e_c +ei_c +en_c +eng_c +er_c +f_c +g_c +h_c +i_c +ia_c +ian_c +iang_c +iao_c +ie_c +ih_c +ii_c +in_c +ing_c +io_c +iong_c +iou_c +j_c +k_c +l_c +m_c +n_c +o_c +ong_c +ou_c +p_c +q_c +r_c +s_c +sh_c +t_c +u_c +ua_c +uai_c +uan_c +uang_c +uei_c +uen_c +ueng_c +uo_c +v_c +van_c +ve_c +vn_c +xx_c +z_c +zh_c +w_c +y_c +ga +ge +go +aa +ae +ah +ao +aw +ay +b +ch +d +dh +eh +er +ey +f +g +hh +ih +iy +jh +k +l +m +n +ng +ow +oy +p +r +s +sh +t +th +uh +uw +v +w +y +z +zh +air_c +angr_c +anr_c +aor_c +ar_c +eir_c +engr_c +enr_c +iangr_c +ianr_c +iaor_c +iar_c +ier_c +ihr_c +iir_c +ingr_c +inr_c +iongr_c +iour_c +ir_c +ongr_c +or_c +our_c +uair_c +uangr_c +uanr_c +uar_c +ueir_c +uenr_c +uor_c +ur_c +vanr_c +ver_c +vnr_c +vr_c +pau +#1 +#2 +#3 +#4 \ No newline at end of file diff --git a/voices/zhibei_emo/dict/syllable_flag_dict.txt b/voices/zhibei_emo/dict/syllable_flag_dict.txt new file mode 100755 index 0000000..84a4d14 --- /dev/null +++ b/voices/zhibei_emo/dict/syllable_flag_dict.txt @@ -0,0 +1,5 @@ +s_begin +s_end +s_none +s_both +s_middle diff --git a/voices/zhibei_emo/dict/tone_dict.txt b/voices/zhibei_emo/dict/tone_dict.txt new file mode 100755 index 0000000..7af26ed --- /dev/null +++ b/voices/zhibei_emo/dict/tone_dict.txt @@ -0,0 +1,7 @@ +tone1 +tone_none +tone4 +tone2 +tone3 +tone5 +tone0 diff --git a/voices/zhibei_emo/dict/word_segment_dict.txt b/voices/zhibei_emo/dict/word_segment_dict.txt new file mode 100755 index 0000000..667bcf9 --- /dev/null +++ b/voices/zhibei_emo/dict/word_segment_dict.txt @@ -0,0 +1,5 @@ +word_begin +word_end +word_middle +word_both +word_none 
diff --git a/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth b/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..48f4107 --- /dev/null +++ b/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff65e91d6dda79c878564920b0964a9e5449eb658a8d0a8d51351ca2f56460c +size 19594437 diff --git a/voices/zhibei_emo/voc/config.yaml b/voices/zhibei_emo/voc/config.yaml new file mode 100644 index 0000000..e9853c3 --- /dev/null +++ b/voices/zhibei_emo/voc/config.yaml @@ -0,0 +1,188 @@ +model_type: hifigan +Model: +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### + Generator: + params: + in_channels: 80 + out_channels: 1 + channels: 256 + kernel_size: 7 + upsample_scales: [10, 5, 2, 2] + upsample_kernal_sizes: [20, 11, 4, 4] + resblock_kernel_sizes: [3, 7, 11] + resblock_dilations: + - [1, 3, 5, 7] + - [1, 3, 5, 7] + - [1, 3, 5, 7] + bias: true + causal: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### + MultiScaleDiscriminator: + params: + scales: 3 + downsample_pooling: "DWT" + downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [15, 41, 5, 3] + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: [4, 4, 4, 4, 1] + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + 
follow_official_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + + MultiPeriodDiscriminator: + params: + periods: [2, 3, 5, 7, 11] + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [5, 3] + channels: 32 + downsample_scales: [3, 3, 3, 3, 1] + max_downsample_channels: 1024 + bias: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_spectral_norm: false + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +#################################################### +# LOSS SETTING # +#################################################### +Loss: + generator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + discriminator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + stft_loss: + enable: False # Whether to use multi-resolution STFT loss. + + mel_loss: + enable: True + params: + fs: 16000 + fft_size: 2048 + hop_size: 200 + win_length: 1000 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 8000 + log_base: null + weights: 45.0 + + subband_stft_loss: + enable: False + params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 
+ window: "hann_window" # Window function for STFT-based loss + + feat_match_loss: + enable: True + params: + average_by_discriminators: false + average_by_layers: false + weights: 2.0 + + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 +batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size. +pin_memory: True +num_workers: 2 # FIXME: setting > 0 may hang on macOS +remove_short_samples: False +allow_cache: True + +generator_grad_norm: -1 + +discriminator_grad_norm: -1 + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps after which generator training starts. +discriminator_train_start_steps: 0 # Number of steps after which discriminator training starts. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhibei_emo/vocoder/pytorch_model.bin b/voices/zhibei_emo/vocoder/pytorch_model.bin new file mode 100644 index 0000000..1323513 --- /dev/null +++ b/voices/zhibei_emo/vocoder/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b68cd8cda09c79cca36ef6ae17b1b547a4390686a77b0a3eeadc673bb7bb139 +size 19613277 diff --git a/voices/zhitian_emo/am/ckpt/checkpoint_0.pth b/voices/zhitian_emo/am/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..7570874 --- /dev/null +++ b/voices/zhitian_emo/am/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4108ce78f7737a386a7c5127fe6f63ca6c9442d2c0ca5263dd5fc2b0dddfae4 +size 49315631 diff --git a/voices/zhitian_emo/am/config.yaml b/voices/zhitian_emo/am/config.yaml new file mode 100644 index 0000000..56a980a --- /dev/null +++ b/voices/zhitian_emo/am/config.yaml @@ -0,0 +1,105 @@ +model_type: sambert +Model: +######################################################### +# SAMBERT NETWORK ARCHITECTURE SETTING # +######################################################### + KanTtsSAMBERT: + params: + max_len: 800 + + embedding_dim: 512 + encoder_num_layers: 8 + encoder_num_heads: 8 + encoder_num_units: 128 + encoder_ffn_inner_dim: 1024 + encoder_dropout: 0.1 + encoder_attention_dropout: 0.1 + encoder_relu_dropout: 0.1 + encoder_projection_units: 32 + + speaker_units: 32 + emotion_units: 32 + + predictor_filter_size: 41 + predictor_fsmn_num_layers: 3 + predictor_num_memory_units: 128 + predictor_ffn_inner_dim: 256 + predictor_dropout: 0.1 + predictor_shift: 0 + predictor_lstm_units: 128 + dur_pred_prenet_units: [128, 128] + dur_pred_lstm_units: 128 + + decoder_prenet_units: [256, 256] + decoder_num_layers: 12 + decoder_num_heads: 8 + decoder_num_units: 128 + decoder_ffn_inner_dim: 1024 + decoder_dropout: 0.1 + decoder_attention_dropout: 0.1 + decoder_relu_dropout: 0.1 + + outputs_per_step: 3 + num_mels: 80 + + postnet_filter_size: 41 + 
postnet_fsmn_num_layers: 4 + postnet_num_memory_units: 256 + postnet_ffn_inner_dim: 512 + postnet_dropout: 0.1 + postnet_shift: 17 + postnet_lstm_units: 128 + MAS: False + + optimizer: + type: Adam + params: + lr: 0.001 + betas: [0.9, 0.98] + eps: 1.0e-9 + weight_decay: 0.0 + scheduler: + type: NoamLR + params: + warmup_steps: 4000 + +linguistic_unit: + cleaners: english_cleaners + lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category + speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu +#################################################### +# LOSS SETTING # +#################################################### +Loss: + MelReconLoss: + enable: True + params: + loss_type: mae + + ProsodyReconLoss: + enable: True + params: + loss_type: mae + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 32 +pin_memory: False +num_workers: 4 # FIXME: set > 0 may stuck on macos +remove_short_samples: False +allow_cache: True +grad_norm: 1.0 + +########################################################### +# INTERVAL SETTING # +########################################################### +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhitian_emo/am/pytorch_model.bin b/voices/zhitian_emo/am/pytorch_model.bin new file mode 100644 index 0000000..780ea34 --- /dev/null +++ b/voices/zhitian_emo/am/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61a3641e654e0621e949b0288566e840f170c32a5ba4fa51c3e7bd8247c51040 +size 49234411 diff --git a/voices/zhitian_emo/audio_config.yaml b/voices/zhitian_emo/audio_config.yaml new file mode 100644 index 0000000..233817c --- /dev/null +++ b/voices/zhitian_emo/audio_config.yaml @@ -0,0 +1,27 @@ +# Audio processing configs + +audio_config: + # Preprocess + wav_normalize: True + trim_silence: True + trim_silence_threshold_db: 60 + preemphasize: False + + # Feature extraction + sampling_rate: 16000 + hop_length: 200 + win_length: 1000 + n_fft: 2048 + n_mels: 80 + fmin: 0.0 + fmax: 8000.0 + phone_level_feature: True + + # Normalization + norm_type: "mean_std" # "mean_std" or "global" + max_norm: 1.0 + symmetric: False + min_level_db: -100.0 + ref_level_db: 20 + + num_workers: 16 diff --git a/voices/zhitian_emo/dict/emo_category_dict.txt b/voices/zhitian_emo/dict/emo_category_dict.txt new file mode 100755 index 0000000..dfd88e8 --- /dev/null +++ b/voices/zhitian_emo/dict/emo_category_dict.txt @@ -0,0 +1,33 @@ +emotion_none +emotion_neutral +emotion_angry +emotion_disgust +emotion_fear +emotion_happy +emotion_sad +emotion_surprise +emotion_calm +emotion_gentle +emotion_relax +emotion_lyrical +emotion_serious +emotion_disgruntled +emotion_satisfied +emotion_disappointed +emotion_excited +emotion_anxiety +emotion_jealousy +emotion_hate +emotion_pity +emotion_pleasure +emotion_arousal +emotion_dominance +emotion_placeholder1 +emotion_placeholder2 +emotion_placeholder3 +emotion_placeholder4 +emotion_placeholder5 +emotion_placeholder6 +emotion_placeholder7 +emotion_placeholder8 +emotion_placeholder9 \ No newline at end of file diff --git a/voices/zhitian_emo/dict/speaker_dict.txt 
b/voices/zhitian_emo/dict/speaker_dict.txt new file mode 100755 index 0000000..af0ca1d --- /dev/null +++ b/voices/zhitian_emo/dict/speaker_dict.txt @@ -0,0 +1,6 @@ +F7 +F74 +FBYN +FRXL +M7 +xiaoyu diff --git a/voices/zhitian_emo/dict/sy_dict.txt b/voices/zhitian_emo/dict/sy_dict.txt new file mode 100755 index 0000000..ec54511 --- /dev/null +++ b/voices/zhitian_emo/dict/sy_dict.txt @@ -0,0 +1,144 @@ +a_c +ai_c +an_c +ang_c +ao_c +b_c +c_c +ch_c +d_c +e_c +ei_c +en_c +eng_c +er_c +f_c +g_c +h_c +i_c +ia_c +ian_c +iang_c +iao_c +ie_c +ih_c +ii_c +in_c +ing_c +io_c +iong_c +iou_c +j_c +k_c +l_c +m_c +n_c +o_c +ong_c +ou_c +p_c +q_c +r_c +s_c +sh_c +t_c +u_c +ua_c +uai_c +uan_c +uang_c +uei_c +uen_c +ueng_c +uo_c +v_c +van_c +ve_c +vn_c +xx_c +z_c +zh_c +w_c +y_c +ga +ge +go +aa +ae +ah +ao +aw +ay +b +ch +d +dh +eh +er +ey +f +g +hh +ih +iy +jh +k +l +m +n +ng +ow +oy +p +r +s +sh +t +th +uh +uw +v +w +y +z +zh +air_c +angr_c +anr_c +aor_c +ar_c +eir_c +engr_c +enr_c +iangr_c +ianr_c +iaor_c +iar_c +ier_c +ihr_c +iir_c +ingr_c +inr_c +iongr_c +iour_c +ir_c +ongr_c +or_c +our_c +uair_c +uangr_c +uanr_c +uar_c +ueir_c +uenr_c +uor_c +ur_c +vanr_c +ver_c +vnr_c +vr_c +pau +#1 +#2 +#3 +#4 \ No newline at end of file diff --git a/voices/zhitian_emo/dict/syllable_flag_dict.txt b/voices/zhitian_emo/dict/syllable_flag_dict.txt new file mode 100755 index 0000000..84a4d14 --- /dev/null +++ b/voices/zhitian_emo/dict/syllable_flag_dict.txt @@ -0,0 +1,5 @@ +s_begin +s_end +s_none +s_both +s_middle diff --git a/voices/zhitian_emo/dict/tone_dict.txt b/voices/zhitian_emo/dict/tone_dict.txt new file mode 100755 index 0000000..7af26ed --- /dev/null +++ b/voices/zhitian_emo/dict/tone_dict.txt @@ -0,0 +1,7 @@ +tone1 +tone_none +tone4 +tone2 +tone3 +tone5 +tone0 diff --git a/voices/zhitian_emo/dict/word_segment_dict.txt b/voices/zhitian_emo/dict/word_segment_dict.txt new file mode 100755 index 0000000..667bcf9 --- /dev/null +++ b/voices/zhitian_emo/dict/word_segment_dict.txt @@ -0,0 +1,5 
@@ +word_begin +word_end +word_middle +word_both +word_none diff --git a/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth b/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..d3d253f --- /dev/null +++ b/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37ae58b77e73f1d32fda9be354662116c415803f0e9961d8c8fb935ccbcc7ada +size 19594437 diff --git a/voices/zhitian_emo/voc/config.yaml b/voices/zhitian_emo/voc/config.yaml new file mode 100644 index 0000000..e9853c3 --- /dev/null +++ b/voices/zhitian_emo/voc/config.yaml @@ -0,0 +1,188 @@ +model_type: hifigan +Model: +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### + Generator: + params: + in_channels: 80 + out_channels: 1 + channels: 256 + kernel_size: 7 + upsample_scales: [10, 5, 2, 2] + upsample_kernal_sizes: [20, 11, 4, 4] + resblock_kernel_sizes: [3, 7, 11] + resblock_dilations: + - [1, 3, 5, 7] + - [1, 3, 5, 7] + - [1, 3, 5, 7] + bias: true + causal: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### + MultiScaleDiscriminator: + params: + scales: 3 + downsample_pooling: "DWT" + downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [15, 41, 5, 3] + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: [4, 4, 4, 4, 1] + nonlinear_activation: 
"LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + follow_official_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + + MultiPeriodDiscriminator: + params: + periods: [2, 3, 5, 7, 11] + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [5, 3] + channels: 32 + downsample_scales: [3, 3, 3, 3, 1] + max_downsample_channels: 1024 + bias: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_spectral_norm: false + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +#################################################### +# LOSS SETTING # +#################################################### +Loss: + generator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + discriminator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + stft_loss: + enable: False # Whether to use multi-resolution STFT loss. + + mel_loss: + enable: True + params: + fs: 16000 + fft_size: 2048 + hop_size: 200 + win_length: 1000 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 8000 + log_base: null + weights: 45.0 + + subband_stft_loss: + enable: False + params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 
+ window: "hann_window" # Window function for STFT-based loss + + feat_match_loss: + enable: True + params: + average_by_discriminators: false + average_by_layers: false + weights: 2.0 + + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 +batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size. +pin_memory: True +num_workers: 2 # FIXME: setting > 0 may hang on macOS +remove_short_samples: False +allow_cache: True + +generator_grad_norm: -1 + +discriminator_grad_norm: -1 + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps after which generator training starts. +discriminator_train_start_steps: 0 # Number of steps after which discriminator training starts. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhitian_emo/vocoder/pytorch_model.bin b/voices/zhitian_emo/vocoder/pytorch_model.bin new file mode 100644 index 0000000..2805881 --- /dev/null +++ b/voices/zhitian_emo/vocoder/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a1e65b70ea89a5f45b38d3d04d3d843c25eaed6c9805346fd7180af08c3a0a +size 19613277 diff --git a/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth b/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..882645f --- /dev/null +++ b/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38980015065a60486f5c77204e7f20329bd6d0c54ac3ab40e6df3642520a5b5a +size 49315631 diff --git a/voices/zhiyan_emo/am/config.yaml b/voices/zhiyan_emo/am/config.yaml new file mode 100644 index 0000000..56a980a --- /dev/null +++ b/voices/zhiyan_emo/am/config.yaml @@ -0,0 +1,105 @@ +model_type: sambert +Model: +######################################################### +# SAMBERT NETWORK ARCHITECTURE SETTING # +######################################################### + KanTtsSAMBERT: + params: + max_len: 800 + + embedding_dim: 512 + encoder_num_layers: 8 + encoder_num_heads: 8 + encoder_num_units: 128 + encoder_ffn_inner_dim: 1024 + encoder_dropout: 0.1 + encoder_attention_dropout: 0.1 + encoder_relu_dropout: 0.1 + encoder_projection_units: 32 + + speaker_units: 32 + emotion_units: 32 + + predictor_filter_size: 41 + predictor_fsmn_num_layers: 3 + predictor_num_memory_units: 128 + predictor_ffn_inner_dim: 256 + predictor_dropout: 0.1 + predictor_shift: 0 + predictor_lstm_units: 128 + dur_pred_prenet_units: [128, 128] + dur_pred_lstm_units: 128 + + decoder_prenet_units: [256, 256] + decoder_num_layers: 12 + decoder_num_heads: 8 + decoder_num_units: 128 + decoder_ffn_inner_dim: 1024 + decoder_dropout: 0.1 + decoder_attention_dropout: 0.1 + decoder_relu_dropout: 0.1 + + outputs_per_step: 3 + num_mels: 80 + + postnet_filter_size: 41 + 
postnet_fsmn_num_layers: 4 + postnet_num_memory_units: 256 + postnet_ffn_inner_dim: 512 + postnet_dropout: 0.1 + postnet_shift: 17 + postnet_lstm_units: 128 + MAS: False + + optimizer: + type: Adam + params: + lr: 0.001 + betas: [0.9, 0.98] + eps: 1.0e-9 + weight_decay: 0.0 + scheduler: + type: NoamLR + params: + warmup_steps: 4000 + +linguistic_unit: + cleaners: english_cleaners + lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category + speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu +#################################################### +# LOSS SETTING # +#################################################### +Loss: + MelReconLoss: + enable: True + params: + loss_type: mae + + ProsodyReconLoss: + enable: True + params: + loss_type: mae + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 32 +pin_memory: False +num_workers: 4 # FIXME: set > 0 may stuck on macos +remove_short_samples: False +allow_cache: True +grad_norm: 1.0 + +########################################################### +# INTERVAL SETTING # +########################################################### +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhiyan_emo/am/pytorch_model.bin b/voices/zhiyan_emo/am/pytorch_model.bin new file mode 100644 index 0000000..45313fb --- /dev/null +++ b/voices/zhiyan_emo/am/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f497d5cfa4082ecbe965f624d5ea81c1a8ab781f30b7394b32664d647446e4e +size 49234411 diff --git a/voices/zhiyan_emo/audio_config.yaml b/voices/zhiyan_emo/audio_config.yaml new file mode 100644 index 0000000..233817c --- /dev/null +++ b/voices/zhiyan_emo/audio_config.yaml @@ -0,0 +1,27 @@ +# Audio processing configs + +audio_config: + # Preprocess + wav_normalize: True + trim_silence: True + trim_silence_threshold_db: 60 + preemphasize: False + + # Feature extraction + sampling_rate: 16000 + hop_length: 200 + win_length: 1000 + n_fft: 2048 + n_mels: 80 + fmin: 0.0 + fmax: 8000.0 + phone_level_feature: True + + # Normalization + norm_type: "mean_std" # "mean_std" or "global" + max_norm: 1.0 + symmetric: False + min_level_db: -100.0 + ref_level_db: 20 + + num_workers: 16 diff --git a/voices/zhiyan_emo/dict/emo_category_dict.txt b/voices/zhiyan_emo/dict/emo_category_dict.txt new file mode 100755 index 0000000..dfd88e8 --- /dev/null +++ b/voices/zhiyan_emo/dict/emo_category_dict.txt @@ -0,0 +1,33 @@ +emotion_none +emotion_neutral +emotion_angry +emotion_disgust +emotion_fear +emotion_happy +emotion_sad +emotion_surprise +emotion_calm +emotion_gentle +emotion_relax +emotion_lyrical +emotion_serious +emotion_disgruntled +emotion_satisfied +emotion_disappointed +emotion_excited +emotion_anxiety +emotion_jealousy +emotion_hate +emotion_pity +emotion_pleasure +emotion_arousal +emotion_dominance +emotion_placeholder1 +emotion_placeholder2 +emotion_placeholder3 +emotion_placeholder4 +emotion_placeholder5 +emotion_placeholder6 +emotion_placeholder7 +emotion_placeholder8 +emotion_placeholder9 \ No newline at end of file diff --git a/voices/zhiyan_emo/dict/speaker_dict.txt b/voices/zhiyan_emo/dict/speaker_dict.txt new 
file mode 100755 index 0000000..af0ca1d --- /dev/null +++ b/voices/zhiyan_emo/dict/speaker_dict.txt @@ -0,0 +1,6 @@ +F7 +F74 +FBYN +FRXL +M7 +xiaoyu diff --git a/voices/zhiyan_emo/dict/sy_dict.txt b/voices/zhiyan_emo/dict/sy_dict.txt new file mode 100755 index 0000000..ec54511 --- /dev/null +++ b/voices/zhiyan_emo/dict/sy_dict.txt @@ -0,0 +1,144 @@ +a_c +ai_c +an_c +ang_c +ao_c +b_c +c_c +ch_c +d_c +e_c +ei_c +en_c +eng_c +er_c +f_c +g_c +h_c +i_c +ia_c +ian_c +iang_c +iao_c +ie_c +ih_c +ii_c +in_c +ing_c +io_c +iong_c +iou_c +j_c +k_c +l_c +m_c +n_c +o_c +ong_c +ou_c +p_c +q_c +r_c +s_c +sh_c +t_c +u_c +ua_c +uai_c +uan_c +uang_c +uei_c +uen_c +ueng_c +uo_c +v_c +van_c +ve_c +vn_c +xx_c +z_c +zh_c +w_c +y_c +ga +ge +go +aa +ae +ah +ao +aw +ay +b +ch +d +dh +eh +er +ey +f +g +hh +ih +iy +jh +k +l +m +n +ng +ow +oy +p +r +s +sh +t +th +uh +uw +v +w +y +z +zh +air_c +angr_c +anr_c +aor_c +ar_c +eir_c +engr_c +enr_c +iangr_c +ianr_c +iaor_c +iar_c +ier_c +ihr_c +iir_c +ingr_c +inr_c +iongr_c +iour_c +ir_c +ongr_c +or_c +our_c +uair_c +uangr_c +uanr_c +uar_c +ueir_c +uenr_c +uor_c +ur_c +vanr_c +ver_c +vnr_c +vr_c +pau +#1 +#2 +#3 +#4 \ No newline at end of file diff --git a/voices/zhiyan_emo/dict/syllable_flag_dict.txt b/voices/zhiyan_emo/dict/syllable_flag_dict.txt new file mode 100755 index 0000000..84a4d14 --- /dev/null +++ b/voices/zhiyan_emo/dict/syllable_flag_dict.txt @@ -0,0 +1,5 @@ +s_begin +s_end +s_none +s_both +s_middle diff --git a/voices/zhiyan_emo/dict/tone_dict.txt b/voices/zhiyan_emo/dict/tone_dict.txt new file mode 100755 index 0000000..7af26ed --- /dev/null +++ b/voices/zhiyan_emo/dict/tone_dict.txt @@ -0,0 +1,7 @@ +tone1 +tone_none +tone4 +tone2 +tone3 +tone5 +tone0 diff --git a/voices/zhiyan_emo/dict/word_segment_dict.txt b/voices/zhiyan_emo/dict/word_segment_dict.txt new file mode 100755 index 0000000..667bcf9 --- /dev/null +++ b/voices/zhiyan_emo/dict/word_segment_dict.txt @@ -0,0 +1,5 @@ +word_begin +word_end +word_middle +word_both +word_none 
diff --git a/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..bd2113d --- /dev/null +++ b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e1eb72c06fb8cc0dd876b287430088757f6efe4ec44e0f9fff3a7e40a762c52 +size 19594437 diff --git a/voices/zhiyan_emo/voc/config.yaml b/voices/zhiyan_emo/voc/config.yaml new file mode 100644 index 0000000..e9853c3 --- /dev/null +++ b/voices/zhiyan_emo/voc/config.yaml @@ -0,0 +1,188 @@ +model_type: hifigan +Model: +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### + Generator: + params: + in_channels: 80 + out_channels: 1 + channels: 256 + kernel_size: 7 + upsample_scales: [10, 5, 2, 2] + upsample_kernal_sizes: [20, 11, 4, 4] + resblock_kernel_sizes: [3, 7, 11] + resblock_dilations: + - [1, 3, 5, 7] + - [1, 3, 5, 7] + - [1, 3, 5, 7] + bias: true + causal: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### + MultiScaleDiscriminator: + params: + scales: 3 + downsample_pooling: "DWT" + downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [15, 41, 5, 3] + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: [4, 4, 4, 4, 1] + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + 
follow_official_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + + MultiPeriodDiscriminator: + params: + periods: [2, 3, 5, 7, 11] + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [5, 3] + channels: 32 + downsample_scales: [3, 3, 3, 3, 1] + max_downsample_channels: 1024 + bias: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_spectral_norm: false + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +#################################################### +# LOSS SETTING # +#################################################### +Loss: + generator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + discriminator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + stft_loss: + enable: False # Whether to use multi-resolution STFT loss. + + mel_loss: + enable: True + params: + fs: 16000 + fft_size: 2048 + hop_size: 200 + win_length: 1000 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 8000 + log_base: null + weights: 45.0 + + subband_stft_loss: + enable: False + params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 
+ window: "hann_window" # Window function for STFT-based loss + + feat_match_loss: + enable: True + params: + average_by_discriminators: false + average_by_layers: false + weights: 2.0 + + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 +batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size. +pin_memory: True +num_workers: 2 # FIXME: setting > 0 may hang on macOS +remove_short_samples: False +allow_cache: True + +generator_grad_norm: -1 + +discriminator_grad_norm: -1 + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train generator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhiyan_emo/vocoder/pytorch_model.bin b/voices/zhiyan_emo/vocoder/pytorch_model.bin new file mode 100644 index 0000000..94859b6 --- /dev/null +++ b/voices/zhiyan_emo/vocoder/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d054c61986470740ed7c715bae1977468da993b7f5cca21da1245f759cbc3cec +size 19613277 diff --git a/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth b/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..5e48bb3 --- /dev/null +++ b/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:002731ec3a90b51d7683d13477ed74545fd7092a552899e0d676eb8473d0dbbe +size 49315631 diff --git a/voices/zhizhe_emo/am/config.yaml b/voices/zhizhe_emo/am/config.yaml new file mode 100644 index 0000000..56a980a --- /dev/null +++ b/voices/zhizhe_emo/am/config.yaml @@ -0,0 +1,105 @@ +model_type: sambert +Model: +######################################################### +# SAMBERT NETWORK ARCHITECTURE SETTING # +######################################################### + KanTtsSAMBERT: + params: + max_len: 800 + + embedding_dim: 512 + encoder_num_layers: 8 + encoder_num_heads: 8 + encoder_num_units: 128 + encoder_ffn_inner_dim: 1024 + encoder_dropout: 0.1 + encoder_attention_dropout: 0.1 + encoder_relu_dropout: 0.1 + encoder_projection_units: 32 + + speaker_units: 32 + emotion_units: 32 + + predictor_filter_size: 41 + predictor_fsmn_num_layers: 3 + predictor_num_memory_units: 128 + predictor_ffn_inner_dim: 256 + predictor_dropout: 0.1 + predictor_shift: 0 + predictor_lstm_units: 128 + dur_pred_prenet_units: [128, 128] + dur_pred_lstm_units: 128 + + decoder_prenet_units: [256, 256] + decoder_num_layers: 12 + decoder_num_heads: 8 + decoder_num_units: 128 + decoder_ffn_inner_dim: 1024 + decoder_dropout: 0.1 + decoder_attention_dropout: 0.1 + decoder_relu_dropout: 0.1 + + outputs_per_step: 3 + num_mels: 80 + + postnet_filter_size: 41 + 
postnet_fsmn_num_layers: 4 + postnet_num_memory_units: 256 + postnet_ffn_inner_dim: 512 + postnet_dropout: 0.1 + postnet_shift: 17 + postnet_lstm_units: 128 + MAS: False + + optimizer: + type: Adam + params: + lr: 0.001 + betas: [0.9, 0.98] + eps: 1.0e-9 + weight_decay: 0.0 + scheduler: + type: NoamLR + params: + warmup_steps: 4000 + +linguistic_unit: + cleaners: english_cleaners + lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category + speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu +#################################################### +# LOSS SETTING # +#################################################### +Loss: + MelReconLoss: + enable: True + params: + loss_type: mae + + ProsodyReconLoss: + enable: True + params: + loss_type: mae + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 32 +pin_memory: False +num_workers: 4 # FIXME: set > 0 may stuck on macos +remove_short_samples: False +allow_cache: True +grad_norm: 1.0 + +########################################################### +# INTERVAL SETTING # +########################################################### +train_max_steps: 1000000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. 
diff --git a/voices/zhizhe_emo/am/pytorch_model.bin b/voices/zhizhe_emo/am/pytorch_model.bin new file mode 100644 index 0000000..88077cf --- /dev/null +++ b/voices/zhizhe_emo/am/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83eb76bc45d0251fc1e609fd1c2e7ae337403165e56b1091b3dc2591b7e8520e +size 49234411 diff --git a/voices/zhizhe_emo/audio_config.yaml b/voices/zhizhe_emo/audio_config.yaml new file mode 100644 index 0000000..233817c --- /dev/null +++ b/voices/zhizhe_emo/audio_config.yaml @@ -0,0 +1,27 @@ +# Audio processing configs + +audio_config: + # Preprocess + wav_normalize: True + trim_silence: True + trim_silence_threshold_db: 60 + preemphasize: False + + # Feature extraction + sampling_rate: 16000 + hop_length: 200 + win_length: 1000 + n_fft: 2048 + n_mels: 80 + fmin: 0.0 + fmax: 8000.0 + phone_level_feature: True + + # Normalization + norm_type: "mean_std" # "mean_std" or "global" + max_norm: 1.0 + symmetric: False + min_level_db: -100.0 + ref_level_db: 20 + + num_workers: 16 diff --git a/voices/zhizhe_emo/dict/emo_category_dict.txt b/voices/zhizhe_emo/dict/emo_category_dict.txt new file mode 100755 index 0000000..dfd88e8 --- /dev/null +++ b/voices/zhizhe_emo/dict/emo_category_dict.txt @@ -0,0 +1,33 @@ +emotion_none +emotion_neutral +emotion_angry +emotion_disgust +emotion_fear +emotion_happy +emotion_sad +emotion_surprise +emotion_calm +emotion_gentle +emotion_relax +emotion_lyrical +emotion_serious +emotion_disgruntled +emotion_satisfied +emotion_disappointed +emotion_excited +emotion_anxiety +emotion_jealousy +emotion_hate +emotion_pity +emotion_pleasure +emotion_arousal +emotion_dominance +emotion_placeholder1 +emotion_placeholder2 +emotion_placeholder3 +emotion_placeholder4 +emotion_placeholder5 +emotion_placeholder6 +emotion_placeholder7 +emotion_placeholder8 +emotion_placeholder9 \ No newline at end of file diff --git a/voices/zhizhe_emo/dict/speaker_dict.txt b/voices/zhizhe_emo/dict/speaker_dict.txt new 
file mode 100755 index 0000000..af0ca1d --- /dev/null +++ b/voices/zhizhe_emo/dict/speaker_dict.txt @@ -0,0 +1,6 @@ +F7 +F74 +FBYN +FRXL +M7 +xiaoyu diff --git a/voices/zhizhe_emo/dict/sy_dict.txt b/voices/zhizhe_emo/dict/sy_dict.txt new file mode 100755 index 0000000..ec54511 --- /dev/null +++ b/voices/zhizhe_emo/dict/sy_dict.txt @@ -0,0 +1,144 @@ +a_c +ai_c +an_c +ang_c +ao_c +b_c +c_c +ch_c +d_c +e_c +ei_c +en_c +eng_c +er_c +f_c +g_c +h_c +i_c +ia_c +ian_c +iang_c +iao_c +ie_c +ih_c +ii_c +in_c +ing_c +io_c +iong_c +iou_c +j_c +k_c +l_c +m_c +n_c +o_c +ong_c +ou_c +p_c +q_c +r_c +s_c +sh_c +t_c +u_c +ua_c +uai_c +uan_c +uang_c +uei_c +uen_c +ueng_c +uo_c +v_c +van_c +ve_c +vn_c +xx_c +z_c +zh_c +w_c +y_c +ga +ge +go +aa +ae +ah +ao +aw +ay +b +ch +d +dh +eh +er +ey +f +g +hh +ih +iy +jh +k +l +m +n +ng +ow +oy +p +r +s +sh +t +th +uh +uw +v +w +y +z +zh +air_c +angr_c +anr_c +aor_c +ar_c +eir_c +engr_c +enr_c +iangr_c +ianr_c +iaor_c +iar_c +ier_c +ihr_c +iir_c +ingr_c +inr_c +iongr_c +iour_c +ir_c +ongr_c +or_c +our_c +uair_c +uangr_c +uanr_c +uar_c +ueir_c +uenr_c +uor_c +ur_c +vanr_c +ver_c +vnr_c +vr_c +pau +#1 +#2 +#3 +#4 \ No newline at end of file diff --git a/voices/zhizhe_emo/dict/syllable_flag_dict.txt b/voices/zhizhe_emo/dict/syllable_flag_dict.txt new file mode 100755 index 0000000..84a4d14 --- /dev/null +++ b/voices/zhizhe_emo/dict/syllable_flag_dict.txt @@ -0,0 +1,5 @@ +s_begin +s_end +s_none +s_both +s_middle diff --git a/voices/zhizhe_emo/dict/tone_dict.txt b/voices/zhizhe_emo/dict/tone_dict.txt new file mode 100755 index 0000000..7af26ed --- /dev/null +++ b/voices/zhizhe_emo/dict/tone_dict.txt @@ -0,0 +1,7 @@ +tone1 +tone_none +tone4 +tone2 +tone3 +tone5 +tone0 diff --git a/voices/zhizhe_emo/dict/word_segment_dict.txt b/voices/zhizhe_emo/dict/word_segment_dict.txt new file mode 100755 index 0000000..667bcf9 --- /dev/null +++ b/voices/zhizhe_emo/dict/word_segment_dict.txt @@ -0,0 +1,5 @@ +word_begin +word_end +word_middle +word_both +word_none 
diff --git a/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth b/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth new file mode 100644 index 0000000..b06f78b --- /dev/null +++ b/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31d837f8dc54d1c80762cba2fc05cddd8a6a466beab3744e102658446f303831 +size 19594437 diff --git a/voices/zhizhe_emo/voc/config.yaml b/voices/zhizhe_emo/voc/config.yaml new file mode 100644 index 0000000..e9853c3 --- /dev/null +++ b/voices/zhizhe_emo/voc/config.yaml @@ -0,0 +1,188 @@ +model_type: hifigan +Model: +########################################################### +# GENERATOR NETWORK ARCHITECTURE SETTING # +########################################################### + Generator: + params: + in_channels: 80 + out_channels: 1 + channels: 256 + kernel_size: 7 + upsample_scales: [10, 5, 2, 2] + upsample_kernal_sizes: [20, 11, 4, 4] + resblock_kernel_sizes: [3, 7, 11] + resblock_dilations: + - [1, 3, 5, 7] + - [1, 3, 5, 7] + - [1, 3, 5, 7] + bias: true + causal: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_weight_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +########################################################### +# DISCRIMINATOR NETWORK ARCHITECTURE SETTING # +########################################################### + MultiScaleDiscriminator: + params: + scales: 3 + downsample_pooling: "DWT" + downsample_pooling_params: + kernel_size: 4 + stride: 2 + padding: 2 + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [15, 41, 5, 3] + channels: 128 + max_downsample_channels: 1024 + max_groups: 16 + bias: true + downsample_scales: [4, 4, 4, 4, 1] + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + 
follow_official_norm: true + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + + MultiPeriodDiscriminator: + params: + periods: [2, 3, 5, 7, 11] + discriminator_params: + in_channels: 1 + out_channels: 1 + kernel_sizes: [5, 3] + channels: 32 + downsample_scales: [3, 3, 3, 3, 1] + max_downsample_channels: 1024 + bias: true + nonlinear_activation: "LeakyReLU" + nonlinear_activation_params: + negative_slope: 0.1 + use_spectral_norm: false + optimizer: + type: Adam + params: + lr: 2.0e-4 + betas: [0.5, 0.9] + weight_decay: 0.0 + scheduler: + type: MultiStepLR + params: + gamma: 0.5 + milestones: + - 200000 + - 400000 + - 600000 + - 800000 + +#################################################### +# LOSS SETTING # +#################################################### +Loss: + generator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + discriminator_adv_loss: + enable: True + params: + average_by_discriminators: False + weights: 1.0 + + stft_loss: + enable: False # Whether to use multi-resolution STFT loss. + + mel_loss: + enable: True + params: + fs: 16000 + fft_size: 2048 + hop_size: 200 + win_length: 1000 + window: "hann" + num_mels: 80 + fmin: 0 + fmax: 8000 + log_base: null + weights: 45.0 + + subband_stft_loss: + enable: False + params: + fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss. + hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss + win_lengths: [150, 300, 60] # List of window length for STFT-based loss. 
+ window: "hann_window" # Window function for STFT-based loss + + feat_match_loss: + enable: True + params: + average_by_discriminators: false + average_by_layers: false + weights: 2.0 + + +########################################################### +# DATA LOADER SETTING # +########################################################### +batch_size: 16 +batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size. +pin_memory: True +num_workers: 2 # FIXME: setting > 0 may hang on macOS +remove_short_samples: False +allow_cache: True + +generator_grad_norm: -1 + +discriminator_grad_norm: -1 + +########################################################### +# INTERVAL SETTING # +########################################################### +generator_train_start_steps: 1 # Number of steps to start to train generator. +discriminator_train_start_steps: 0 # Number of steps to start to train discriminator. +train_max_steps: 2500000 # Number of training steps. +save_interval_steps: 20000 # Interval steps to save checkpoint. +eval_interval_steps: 10000 # Interval steps to evaluate the network. +log_interval_steps: 1000 # Interval steps to record the training log. + +########################################################### +# OTHER SETTING # +########################################################### +num_save_intermediate_results: 4 # Number of results to be saved as intermediate results. diff --git a/voices/zhizhe_emo/vocoder/pytorch_model.bin b/voices/zhizhe_emo/vocoder/pytorch_model.bin new file mode 100644 index 0000000..8e0f5f4 --- /dev/null +++ b/voices/zhizhe_emo/vocoder/pytorch_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7df13ca0946a193b277a08e2a518cc97a58c636d1eaa34743acedf6731c199e3 +size 19613277