update

2026-07-16 13:22:52 +08:00 · 2022-12-14 16:34:44 +08:00
parent 517f2712ec
commit d551729afa
53 changed files with 2136 additions and 0 deletions
--- a/voices/voices.json
+++ b/voices/voices.json
@ -0,0 +1,8 @@
 {
  "voices": [
    "zhitian_emo",
    "zhibei_emo",
    "zhizhe_emo",
    "zhiyan_emo"
  ]
 }
--- a/voices/zhibei_emo/am/ckpt/checkpoint_0.pth
+++ b/voices/zhibei_emo/am/ckpt/checkpoint_0.pth
--- a/voices/zhibei_emo/am/config.yaml
+++ b/voices/zhibei_emo/am/config.yaml
@ -0,0 +1,105 @@
 model_type: sambert
 Model:
 #########################################################
 #         SAMBERT NETWORK ARCHITECTURE SETTING          #
 #########################################################
  KanTtsSAMBERT:
    params:
        max_len: 800
        embedding_dim: 512 
        encoder_num_layers: 8
        encoder_num_heads: 8
        encoder_num_units: 128
        encoder_ffn_inner_dim: 1024
        encoder_dropout: 0.1
        encoder_attention_dropout: 0.1
        encoder_relu_dropout: 0.1
        encoder_projection_units: 32
        speaker_units: 32
        emotion_units: 32
        predictor_filter_size: 41
        predictor_fsmn_num_layers: 3
        predictor_num_memory_units: 128
        predictor_ffn_inner_dim: 256
        predictor_dropout: 0.1
        predictor_shift: 0
        predictor_lstm_units: 128
        dur_pred_prenet_units: [128, 128]
        dur_pred_lstm_units: 128
        decoder_prenet_units: [256, 256]
        decoder_num_layers: 12
        decoder_num_heads: 8
        decoder_num_units: 128
        decoder_ffn_inner_dim: 1024
        decoder_dropout: 0.1
        decoder_attention_dropout: 0.1
        decoder_relu_dropout: 0.1
        outputs_per_step: 3
        num_mels: 80
        postnet_filter_size: 41
        postnet_fsmn_num_layers: 4
        postnet_num_memory_units: 256
        postnet_ffn_inner_dim: 512
        postnet_dropout: 0.1
        postnet_shift: 17
        postnet_lstm_units: 128
        MAS: False
    optimizer:
      type: Adam
      params:
        lr: 0.001
        betas: [0.9, 0.98]
        eps: 1.0e-9
        weight_decay: 0.0
    scheduler:
      type: NoamLR
      params:
        warmup_steps: 4000
 linguistic_unit: 
  cleaners: english_cleaners
  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  MelReconLoss:
    enable: True
    params:
      loss_type: mae
  ProsodyReconLoss:
    enable: True
    params:
      loss_type: mae
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 32              
 pin_memory: False            
 num_workers: 4 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 grad_norm: 1.0
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 train_max_steps: 1000000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhibei_emo/am/pytorch_model.bin
+++ b/voices/zhibei_emo/am/pytorch_model.bin
--- a/voices/zhibei_emo/audio_config.yaml
+++ b/voices/zhibei_emo/audio_config.yaml
@ -0,0 +1,27 @@
 # Audio processing configs
 audio_config:
  # Preprocess
  wav_normalize: True
  trim_silence: True
  trim_silence_threshold_db: 60
  preemphasize: False
  # Feature extraction
  sampling_rate: 16000
  hop_length: 200
  win_length: 1000
  n_fft: 2048
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0
  phone_level_feature: True
  # Normalization
  norm_type: "mean_std"  # "mean_std" or "global"
  max_norm: 1.0
  symmetric: False
  min_level_db: -100.0
  ref_level_db: 20
  num_workers: 16
--- a/voices/zhibei_emo/dict/emo_category_dict.txt
+++ b/voices/zhibei_emo/dict/emo_category_dict.txt
@ -0,0 +1,33 @@
 emotion_none
 emotion_neutral
 emotion_angry
 emotion_disgust
 emotion_fear
 emotion_happy
 emotion_sad
 emotion_surprise
 emotion_calm
 emotion_gentle
 emotion_relax
 emotion_lyrical
 emotion_serious
 emotion_disgruntled
 emotion_satisfied
 emotion_disappointed
 emotion_excited
 emotion_anxiety
 emotion_jealousy
 emotion_hate
 emotion_pity
 emotion_pleasure
 emotion_arousal
 emotion_dominance
 emotion_placeholder1
 emotion_placeholder2
 emotion_placeholder3
 emotion_placeholder4
 emotion_placeholder5
 emotion_placeholder6
 emotion_placeholder7
 emotion_placeholder8
 emotion_placeholder9
--- a/voices/zhibei_emo/dict/speaker_dict.txt
+++ b/voices/zhibei_emo/dict/speaker_dict.txt
@ -0,0 +1,6 @@
 F7
 F74
 FBYN
 FRXL
 M7
 xiaoyu
--- a/voices/zhibei_emo/dict/sy_dict.txt
+++ b/voices/zhibei_emo/dict/sy_dict.txt
@ -0,0 +1,144 @@
 a_c
 ai_c
 an_c
 ang_c
 ao_c
 b_c
 c_c
 ch_c
 d_c
 e_c
 ei_c
 en_c
 eng_c
 er_c
 f_c
 g_c
 h_c
 i_c
 ia_c
 ian_c
 iang_c
 iao_c
 ie_c
 ih_c
 ii_c
 in_c
 ing_c
 io_c
 iong_c
 iou_c
 j_c
 k_c
 l_c
 m_c
 n_c
 o_c
 ong_c
 ou_c
 p_c
 q_c
 r_c
 s_c
 sh_c
 t_c
 u_c
 ua_c
 uai_c
 uan_c
 uang_c
 uei_c
 uen_c
 ueng_c
 uo_c
 v_c
 van_c
 ve_c
 vn_c
 xx_c
 z_c
 zh_c
 w_c
 y_c
 ga
 ge
 go
 aa
 ae
 ah
 ao
 aw
 ay
 b
 ch
 d
 dh
 eh
 er
 ey
 f
 g
 hh
 ih
 iy
 jh
 k
 l
 m
 n
 ng
 ow
 oy
 p
 r
 s
 sh
 t
 th
 uh
 uw
 v
 w
 y
 z
 zh
 air_c
 angr_c
 anr_c
 aor_c
 ar_c
 eir_c
 engr_c
 enr_c
 iangr_c
 ianr_c
 iaor_c
 iar_c
 ier_c
 ihr_c
 iir_c
 ingr_c
 inr_c
 iongr_c
 iour_c
 ir_c
 ongr_c
 or_c
 our_c
 uair_c
 uangr_c
 uanr_c
 uar_c
 ueir_c
 uenr_c
 uor_c
 ur_c
 vanr_c
 ver_c
 vnr_c
 vr_c
 pau
 #1
 #2
 #3
 #4
--- a/voices/zhibei_emo/dict/syllable_flag_dict.txt
+++ b/voices/zhibei_emo/dict/syllable_flag_dict.txt
@ -0,0 +1,5 @@
 s_begin
 s_end
 s_none
 s_both
 s_middle
--- a/voices/zhibei_emo/dict/tone_dict.txt
+++ b/voices/zhibei_emo/dict/tone_dict.txt
@ -0,0 +1,7 @@
 tone1
 tone_none
 tone4
 tone2
 tone3
 tone5
 tone0
--- a/voices/zhibei_emo/dict/word_segment_dict.txt
+++ b/voices/zhibei_emo/dict/word_segment_dict.txt
@ -0,0 +1,5 @@
 word_begin
 word_end
 word_middle
 word_both
 word_none
--- a/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
+++ b/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
--- a/voices/zhibei_emo/voc/config.yaml
+++ b/voices/zhibei_emo/voc/config.yaml
@ -0,0 +1,188 @@
 model_type: hifigan
 Model:
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
 ###########################################################
  Generator:
    params:
      in_channels: 80                       
      out_channels: 1                      
      channels: 256                       
      kernel_size: 7                     
      upsample_scales: [10, 5, 2, 2]        
      upsample_kernal_sizes: [20, 11, 4, 4] 
      resblock_kernel_sizes: [3, 7, 11]     
      resblock_dilations:                  
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
      bias: true                           
      causal: true                           
      nonlinear_activation: "LeakyReLU"    
      nonlinear_activation_params:         
        negative_slope: 0.1
      use_weight_norm: true               
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
  MultiScaleDiscriminator:
    params:
      scales: 3                              
      downsample_pooling: "DWT"  
      downsample_pooling_params:
          kernel_size: 4                    
          stride: 2                         
          padding: 2                        
      discriminator_params:
          in_channels: 1                     
          out_channels: 1                    
          kernel_sizes: [15, 41, 5, 3]       
          channels: 128                      
          max_downsample_channels: 1024     
          max_groups: 16                   
          bias: true
          downsample_scales: [4, 4, 4, 4, 1]
          nonlinear_activation: "LeakyReLU"  
          nonlinear_activation_params:
            negative_slope: 0.1
      follow_official_norm: true    
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
  MultiPeriodDiscriminator:
    params:
      periods: [2, 3, 5, 7, 11]      
      discriminator_params:
        in_channels: 1                  
        out_channels: 1                  
        kernel_sizes: [5, 3]              
        channels: 32                       
        downsample_scales: [3, 3, 3, 3, 1] 
        max_downsample_channels: 1024      
        bias: true                       
        nonlinear_activation: "LeakyReLU"  
        nonlinear_activation_params:       
          negative_slope: 0.1
        use_spectral_norm: false           
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  generator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  discriminator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  stft_loss:
    enable: False             # Whether to use multi-resolution STFT loss.
  mel_loss:
    enable: True
    params:
      fs: 16000
      fft_size: 2048
      hop_size: 200
      win_length: 1000
      window: "hann"
      num_mels: 80
      fmin: 0
      fmax: 8000
      log_base: null
    weights: 45.0
  subband_stft_loss:
    enable: False
    params:
      fft_sizes: [384, 683, 171]  # List of FFT size for STFT-based loss.
      hop_sizes: [35, 75, 15]     # List of hop size for STFT-based loss
      win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
      window: "hann_window"       # Window function for STFT-based loss
  feat_match_loss:
    enable: True
    params:
      average_by_discriminators: false 
      average_by_layers: false         
    weights: 2.0
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 16              
 batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
 pin_memory: True            
 num_workers: 2 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 generator_grad_norm: -1
 discriminator_grad_norm: -1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 generator_train_start_steps: 1     # Number of steps to start to train generator.
 discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
 train_max_steps: 2500000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhibei_emo/vocoder/pytorch_model.bin
+++ b/voices/zhibei_emo/vocoder/pytorch_model.bin
--- a/voices/zhitian_emo/am/ckpt/checkpoint_0.pth
+++ b/voices/zhitian_emo/am/ckpt/checkpoint_0.pth
--- a/voices/zhitian_emo/am/config.yaml
+++ b/voices/zhitian_emo/am/config.yaml
@ -0,0 +1,105 @@
 model_type: sambert
 Model:
 #########################################################
 #         SAMBERT NETWORK ARCHITECTURE SETTING          #
 #########################################################
  KanTtsSAMBERT:
    params:
        max_len: 800
        embedding_dim: 512 
        encoder_num_layers: 8
        encoder_num_heads: 8
        encoder_num_units: 128
        encoder_ffn_inner_dim: 1024
        encoder_dropout: 0.1
        encoder_attention_dropout: 0.1
        encoder_relu_dropout: 0.1
        encoder_projection_units: 32
        speaker_units: 32
        emotion_units: 32
        predictor_filter_size: 41
        predictor_fsmn_num_layers: 3
        predictor_num_memory_units: 128
        predictor_ffn_inner_dim: 256
        predictor_dropout: 0.1
        predictor_shift: 0
        predictor_lstm_units: 128
        dur_pred_prenet_units: [128, 128]
        dur_pred_lstm_units: 128
        decoder_prenet_units: [256, 256]
        decoder_num_layers: 12
        decoder_num_heads: 8
        decoder_num_units: 128
        decoder_ffn_inner_dim: 1024
        decoder_dropout: 0.1
        decoder_attention_dropout: 0.1
        decoder_relu_dropout: 0.1
        outputs_per_step: 3
        num_mels: 80
        postnet_filter_size: 41
        postnet_fsmn_num_layers: 4
        postnet_num_memory_units: 256
        postnet_ffn_inner_dim: 512
        postnet_dropout: 0.1
        postnet_shift: 17
        postnet_lstm_units: 128
        MAS: False
    optimizer:
      type: Adam
      params:
        lr: 0.001
        betas: [0.9, 0.98]
        eps: 1.0e-9
        weight_decay: 0.0
    scheduler:
      type: NoamLR
      params:
        warmup_steps: 4000
 linguistic_unit: 
  cleaners: english_cleaners
  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  MelReconLoss:
    enable: True
    params:
      loss_type: mae
  ProsodyReconLoss:
    enable: True
    params:
      loss_type: mae
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 32              
 pin_memory: False            
 num_workers: 4 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 grad_norm: 1.0
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 train_max_steps: 1000000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhitian_emo/am/pytorch_model.bin
+++ b/voices/zhitian_emo/am/pytorch_model.bin
--- a/voices/zhitian_emo/audio_config.yaml
+++ b/voices/zhitian_emo/audio_config.yaml
@ -0,0 +1,27 @@
 # Audio processing configs
 audio_config:
  # Preprocess
  wav_normalize: True
  trim_silence: True
  trim_silence_threshold_db: 60
  preemphasize: False
  # Feature extraction
  sampling_rate: 16000
  hop_length: 200
  win_length: 1000
  n_fft: 2048
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0
  phone_level_feature: True
  # Normalization
  norm_type: "mean_std"  # "mean_std" or "global"
  max_norm: 1.0
  symmetric: False
  min_level_db: -100.0
  ref_level_db: 20
  num_workers: 16
--- a/voices/zhitian_emo/dict/emo_category_dict.txt
+++ b/voices/zhitian_emo/dict/emo_category_dict.txt
@ -0,0 +1,33 @@
 emotion_none
 emotion_neutral
 emotion_angry
 emotion_disgust
 emotion_fear
 emotion_happy
 emotion_sad
 emotion_surprise
 emotion_calm
 emotion_gentle
 emotion_relax
 emotion_lyrical
 emotion_serious
 emotion_disgruntled
 emotion_satisfied
 emotion_disappointed
 emotion_excited
 emotion_anxiety
 emotion_jealousy
 emotion_hate
 emotion_pity
 emotion_pleasure
 emotion_arousal
 emotion_dominance
 emotion_placeholder1
 emotion_placeholder2
 emotion_placeholder3
 emotion_placeholder4
 emotion_placeholder5
 emotion_placeholder6
 emotion_placeholder7
 emotion_placeholder8
 emotion_placeholder9
--- a/voices/zhitian_emo/dict/speaker_dict.txt
+++ b/voices/zhitian_emo/dict/speaker_dict.txt
@ -0,0 +1,6 @@
 F7
 F74
 FBYN
 FRXL
 M7
 xiaoyu
--- a/voices/zhitian_emo/dict/sy_dict.txt
+++ b/voices/zhitian_emo/dict/sy_dict.txt
@ -0,0 +1,144 @@
 a_c
 ai_c
 an_c
 ang_c
 ao_c
 b_c
 c_c
 ch_c
 d_c
 e_c
 ei_c
 en_c
 eng_c
 er_c
 f_c
 g_c
 h_c
 i_c
 ia_c
 ian_c
 iang_c
 iao_c
 ie_c
 ih_c
 ii_c
 in_c
 ing_c
 io_c
 iong_c
 iou_c
 j_c
 k_c
 l_c
 m_c
 n_c
 o_c
 ong_c
 ou_c
 p_c
 q_c
 r_c
 s_c
 sh_c
 t_c
 u_c
 ua_c
 uai_c
 uan_c
 uang_c
 uei_c
 uen_c
 ueng_c
 uo_c
 v_c
 van_c
 ve_c
 vn_c
 xx_c
 z_c
 zh_c
 w_c
 y_c
 ga
 ge
 go
 aa
 ae
 ah
 ao
 aw
 ay
 b
 ch
 d
 dh
 eh
 er
 ey
 f
 g
 hh
 ih
 iy
 jh
 k
 l
 m
 n
 ng
 ow
 oy
 p
 r
 s
 sh
 t
 th
 uh
 uw
 v
 w
 y
 z
 zh
 air_c
 angr_c
 anr_c
 aor_c
 ar_c
 eir_c
 engr_c
 enr_c
 iangr_c
 ianr_c
 iaor_c
 iar_c
 ier_c
 ihr_c
 iir_c
 ingr_c
 inr_c
 iongr_c
 iour_c
 ir_c
 ongr_c
 or_c
 our_c
 uair_c
 uangr_c
 uanr_c
 uar_c
 ueir_c
 uenr_c
 uor_c
 ur_c
 vanr_c
 ver_c
 vnr_c
 vr_c
 pau
 #1
 #2
 #3
 #4
--- a/voices/zhitian_emo/dict/syllable_flag_dict.txt
+++ b/voices/zhitian_emo/dict/syllable_flag_dict.txt
@ -0,0 +1,5 @@
 s_begin
 s_end
 s_none
 s_both
 s_middle
--- a/voices/zhitian_emo/dict/tone_dict.txt
+++ b/voices/zhitian_emo/dict/tone_dict.txt
@ -0,0 +1,7 @@
 tone1
 tone_none
 tone4
 tone2
 tone3
 tone5
 tone0
--- a/voices/zhitian_emo/dict/word_segment_dict.txt
+++ b/voices/zhitian_emo/dict/word_segment_dict.txt
@ -0,0 +1,5 @@
 word_begin
 word_end
 word_middle
 word_both
 word_none
--- a/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
+++ b/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
--- a/voices/zhitian_emo/voc/config.yaml
+++ b/voices/zhitian_emo/voc/config.yaml
@ -0,0 +1,188 @@
 model_type: hifigan
 Model:
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
 ###########################################################
  Generator:
    params:
      in_channels: 80                       
      out_channels: 1                      
      channels: 256                       
      kernel_size: 7                     
      upsample_scales: [10, 5, 2, 2]        
      upsample_kernal_sizes: [20, 11, 4, 4] 
      resblock_kernel_sizes: [3, 7, 11]     
      resblock_dilations:                  
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
      bias: true                           
      causal: true                           
      nonlinear_activation: "LeakyReLU"    
      nonlinear_activation_params:         
        negative_slope: 0.1
      use_weight_norm: true               
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
  MultiScaleDiscriminator:
    params:
      scales: 3                              
      downsample_pooling: "DWT"  
      downsample_pooling_params:
          kernel_size: 4                    
          stride: 2                         
          padding: 2                        
      discriminator_params:
          in_channels: 1                     
          out_channels: 1                    
          kernel_sizes: [15, 41, 5, 3]       
          channels: 128                      
          max_downsample_channels: 1024     
          max_groups: 16                   
          bias: true
          downsample_scales: [4, 4, 4, 4, 1]
          nonlinear_activation: "LeakyReLU"  
          nonlinear_activation_params:
            negative_slope: 0.1
      follow_official_norm: true    
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
  MultiPeriodDiscriminator:
    params:
      periods: [2, 3, 5, 7, 11]      
      discriminator_params:
        in_channels: 1                  
        out_channels: 1                  
        kernel_sizes: [5, 3]              
        channels: 32                       
        downsample_scales: [3, 3, 3, 3, 1] 
        max_downsample_channels: 1024      
        bias: true                       
        nonlinear_activation: "LeakyReLU"  
        nonlinear_activation_params:       
          negative_slope: 0.1
        use_spectral_norm: false           
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  generator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  discriminator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  stft_loss:
    enable: False             # Whether to use multi-resolution STFT loss.
  mel_loss:
    enable: True
    params:
      fs: 16000
      fft_size: 2048
      hop_size: 200
      win_length: 1000
      window: "hann"
      num_mels: 80
      fmin: 0
      fmax: 8000
      log_base: null
    weights: 45.0
  subband_stft_loss:
    enable: False
    params:
      fft_sizes: [384, 683, 171]  # List of FFT size for STFT-based loss.
      hop_sizes: [35, 75, 15]     # List of hop size for STFT-based loss
      win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
      window: "hann_window"       # Window function for STFT-based loss
  feat_match_loss:
    enable: True
    params:
      average_by_discriminators: false 
      average_by_layers: false         
    weights: 2.0
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 16              
 batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
 pin_memory: True            
 num_workers: 2 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 generator_grad_norm: -1
 discriminator_grad_norm: -1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 generator_train_start_steps: 1     # Number of steps to start to train generator.
 discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
 train_max_steps: 2500000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhitian_emo/vocoder/pytorch_model.bin
+++ b/voices/zhitian_emo/vocoder/pytorch_model.bin
--- a/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
+++ b/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
--- a/voices/zhiyan_emo/am/config.yaml
+++ b/voices/zhiyan_emo/am/config.yaml
@ -0,0 +1,105 @@
 model_type: sambert
 Model:
 #########################################################
 #         SAMBERT NETWORK ARCHITECTURE SETTING          #
 #########################################################
  KanTtsSAMBERT:
    params:
        max_len: 800
        embedding_dim: 512 
        encoder_num_layers: 8
        encoder_num_heads: 8
        encoder_num_units: 128
        encoder_ffn_inner_dim: 1024
        encoder_dropout: 0.1
        encoder_attention_dropout: 0.1
        encoder_relu_dropout: 0.1
        encoder_projection_units: 32
        speaker_units: 32
        emotion_units: 32
        predictor_filter_size: 41
        predictor_fsmn_num_layers: 3
        predictor_num_memory_units: 128
        predictor_ffn_inner_dim: 256
        predictor_dropout: 0.1
        predictor_shift: 0
        predictor_lstm_units: 128
        dur_pred_prenet_units: [128, 128]
        dur_pred_lstm_units: 128
        decoder_prenet_units: [256, 256]
        decoder_num_layers: 12
        decoder_num_heads: 8
        decoder_num_units: 128
        decoder_ffn_inner_dim: 1024
        decoder_dropout: 0.1
        decoder_attention_dropout: 0.1
        decoder_relu_dropout: 0.1
        outputs_per_step: 3
        num_mels: 80
        postnet_filter_size: 41
        postnet_fsmn_num_layers: 4
        postnet_num_memory_units: 256
        postnet_ffn_inner_dim: 512
        postnet_dropout: 0.1
        postnet_shift: 17
        postnet_lstm_units: 128
        MAS: False
    optimizer:
      type: Adam
      params:
        lr: 0.001
        betas: [0.9, 0.98]
        eps: 1.0e-9
        weight_decay: 0.0
    scheduler:
      type: NoamLR
      params:
        warmup_steps: 4000
 linguistic_unit: 
  cleaners: english_cleaners
  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  MelReconLoss:
    enable: True
    params:
      loss_type: mae
  ProsodyReconLoss:
    enable: True
    params:
      loss_type: mae
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 32              
 pin_memory: False            
 num_workers: 4 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 grad_norm: 1.0
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 train_max_steps: 1000000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhiyan_emo/am/pytorch_model.bin
+++ b/voices/zhiyan_emo/am/pytorch_model.bin
--- a/voices/zhiyan_emo/audio_config.yaml
+++ b/voices/zhiyan_emo/audio_config.yaml
@ -0,0 +1,27 @@
 # Audio processing configs
 audio_config:
  # Preprocess
  wav_normalize: True
  trim_silence: True
  trim_silence_threshold_db: 60
  preemphasize: False
  # Feature extraction
  sampling_rate: 16000
  hop_length: 200
  win_length: 1000
  n_fft: 2048
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0
  phone_level_feature: True
  # Normalization
  norm_type: "mean_std"  # "mean_std" or "global"
  max_norm: 1.0
  symmetric: False
  min_level_db: -100.0
  ref_level_db: 20
  num_workers: 16
--- a/voices/zhiyan_emo/dict/emo_category_dict.txt
+++ b/voices/zhiyan_emo/dict/emo_category_dict.txt
@ -0,0 +1,33 @@
 emotion_none
 emotion_neutral
 emotion_angry
 emotion_disgust
 emotion_fear
 emotion_happy
 emotion_sad
 emotion_surprise
 emotion_calm
 emotion_gentle
 emotion_relax
 emotion_lyrical
 emotion_serious
 emotion_disgruntled
 emotion_satisfied
 emotion_disappointed
 emotion_excited
 emotion_anxiety
 emotion_jealousy
 emotion_hate
 emotion_pity
 emotion_pleasure
 emotion_arousal
 emotion_dominance
 emotion_placeholder1
 emotion_placeholder2
 emotion_placeholder3
 emotion_placeholder4
 emotion_placeholder5
 emotion_placeholder6
 emotion_placeholder7
 emotion_placeholder8
 emotion_placeholder9
--- a/voices/zhiyan_emo/dict/speaker_dict.txt
+++ b/voices/zhiyan_emo/dict/speaker_dict.txt
@ -0,0 +1,6 @@
 F7
 F74
 FBYN
 FRXL
 M7
 xiaoyu
--- a/voices/zhiyan_emo/dict/sy_dict.txt
+++ b/voices/zhiyan_emo/dict/sy_dict.txt
@ -0,0 +1,144 @@
 a_c
 ai_c
 an_c
 ang_c
 ao_c
 b_c
 c_c
 ch_c
 d_c
 e_c
 ei_c
 en_c
 eng_c
 er_c
 f_c
 g_c
 h_c
 i_c
 ia_c
 ian_c
 iang_c
 iao_c
 ie_c
 ih_c
 ii_c
 in_c
 ing_c
 io_c
 iong_c
 iou_c
 j_c
 k_c
 l_c
 m_c
 n_c
 o_c
 ong_c
 ou_c
 p_c
 q_c
 r_c
 s_c
 sh_c
 t_c
 u_c
 ua_c
 uai_c
 uan_c
 uang_c
 uei_c
 uen_c
 ueng_c
 uo_c
 v_c
 van_c
 ve_c
 vn_c
 xx_c
 z_c
 zh_c
 w_c
 y_c
 ga
 ge
 go
 aa
 ae
 ah
 ao
 aw
 ay
 b
 ch
 d
 dh
 eh
 er
 ey
 f
 g
 hh
 ih
 iy
 jh
 k
 l
 m
 n
 ng
 ow
 oy
 p
 r
 s
 sh
 t
 th
 uh
 uw
 v
 w
 y
 z
 zh
 air_c
 angr_c
 anr_c
 aor_c
 ar_c
 eir_c
 engr_c
 enr_c
 iangr_c
 ianr_c
 iaor_c
 iar_c
 ier_c
 ihr_c
 iir_c
 ingr_c
 inr_c
 iongr_c
 iour_c
 ir_c
 ongr_c
 or_c
 our_c
 uair_c
 uangr_c
 uanr_c
 uar_c
 ueir_c
 uenr_c
 uor_c
 ur_c
 vanr_c
 ver_c
 vnr_c
 vr_c
 pau
 #1
 #2
 #3
 #4
--- a/voices/zhiyan_emo/dict/syllable_flag_dict.txt
+++ b/voices/zhiyan_emo/dict/syllable_flag_dict.txt
@ -0,0 +1,5 @@
 s_begin
 s_end
 s_none
 s_both
 s_middle
--- a/voices/zhiyan_emo/dict/tone_dict.txt
+++ b/voices/zhiyan_emo/dict/tone_dict.txt
@ -0,0 +1,7 @@
 tone1
 tone_none
 tone4
 tone2
 tone3
 tone5
 tone0
--- a/voices/zhiyan_emo/dict/word_segment_dict.txt
+++ b/voices/zhiyan_emo/dict/word_segment_dict.txt
@ -0,0 +1,5 @@
 word_begin
 word_end
 word_middle
 word_both
 word_none
--- a/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
+++ b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
--- a/voices/zhiyan_emo/voc/config.yaml
+++ b/voices/zhiyan_emo/voc/config.yaml
@ -0,0 +1,188 @@
 model_type: hifigan
 Model:
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
 ###########################################################
  Generator:
    params:
      in_channels: 80                       
      out_channels: 1                      
      channels: 256                       
      kernel_size: 7                     
      upsample_scales: [10, 5, 2, 2]        
      upsample_kernal_sizes: [20, 11, 4, 4] 
      resblock_kernel_sizes: [3, 7, 11]     
      resblock_dilations:                  
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
      bias: true                           
      causal: true                           
      nonlinear_activation: "LeakyReLU"    
      nonlinear_activation_params:         
        negative_slope: 0.1
      use_weight_norm: true               
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
  MultiScaleDiscriminator:
    params:
      scales: 3                              
      downsample_pooling: "DWT"  
      downsample_pooling_params:
          kernel_size: 4                    
          stride: 2                         
          padding: 2                        
      discriminator_params:
          in_channels: 1                     
          out_channels: 1                    
          kernel_sizes: [15, 41, 5, 3]       
          channels: 128                      
          max_downsample_channels: 1024     
          max_groups: 16                   
          bias: true
          downsample_scales: [4, 4, 4, 4, 1]
          nonlinear_activation: "LeakyReLU"  
          nonlinear_activation_params:
            negative_slope: 0.1
      follow_official_norm: true    
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
  MultiPeriodDiscriminator:
    params:
      periods: [2, 3, 5, 7, 11]      
      discriminator_params:
        in_channels: 1                  
        out_channels: 1                  
        kernel_sizes: [5, 3]              
        channels: 32                       
        downsample_scales: [3, 3, 3, 3, 1] 
        max_downsample_channels: 1024      
        bias: true                       
        nonlinear_activation: "LeakyReLU"  
        nonlinear_activation_params:       
          negative_slope: 0.1
        use_spectral_norm: false           
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  generator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  discriminator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  stft_loss:
    enable: False             # Whether to use multi-resolution STFT loss.
  mel_loss:
    enable: True
    params:
      fs: 16000
      fft_size: 2048
      hop_size: 200
      win_length: 1000
      window: "hann"
      num_mels: 80
      fmin: 0
      fmax: 8000
      log_base: null
    weights: 45.0
  subband_stft_loss:
    enable: False
    params:
      fft_sizes: [384, 683, 171]  # List of FFT sizes for STFT-based loss.
      hop_sizes: [35, 75, 15]     # List of hop sizes for STFT-based loss.
      win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
      window: "hann_window"       # Window function for STFT-based loss.
  feat_match_loss:
    enable: True
    params:
      average_by_discriminators: false 
      average_by_layers: false         
    weights: 2.0
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 16              
 batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
 pin_memory: True            
 num_workers: 2 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 generator_grad_norm: -1
 discriminator_grad_norm: -1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 generator_train_start_steps: 1     # Number of steps to start to train generator.
 discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
 train_max_steps: 2500000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhiyan_emo/vocoder/pytorch_model.bin
+++ b/voices/zhiyan_emo/vocoder/pytorch_model.bin
--- a/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
+++ b/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
--- a/voices/zhizhe_emo/am/config.yaml
+++ b/voices/zhizhe_emo/am/config.yaml
@ -0,0 +1,105 @@
 model_type: sambert
 Model:
 #########################################################
 #         SAMBERT NETWORK ARCHITECTURE SETTING          #
 #########################################################
  KanTtsSAMBERT:
    params:
        max_len: 800
        embedding_dim: 512 
        encoder_num_layers: 8
        encoder_num_heads: 8
        encoder_num_units: 128
        encoder_ffn_inner_dim: 1024
        encoder_dropout: 0.1
        encoder_attention_dropout: 0.1
        encoder_relu_dropout: 0.1
        encoder_projection_units: 32
        speaker_units: 32
        emotion_units: 32
        predictor_filter_size: 41
        predictor_fsmn_num_layers: 3
        predictor_num_memory_units: 128
        predictor_ffn_inner_dim: 256
        predictor_dropout: 0.1
        predictor_shift: 0
        predictor_lstm_units: 128
        dur_pred_prenet_units: [128, 128]
        dur_pred_lstm_units: 128
        decoder_prenet_units: [256, 256]
        decoder_num_layers: 12
        decoder_num_heads: 8
        decoder_num_units: 128
        decoder_ffn_inner_dim: 1024
        decoder_dropout: 0.1
        decoder_attention_dropout: 0.1
        decoder_relu_dropout: 0.1
        outputs_per_step: 3
        num_mels: 80
        postnet_filter_size: 41
        postnet_fsmn_num_layers: 4
        postnet_num_memory_units: 256
        postnet_ffn_inner_dim: 512
        postnet_dropout: 0.1
        postnet_shift: 17
        postnet_lstm_units: 128
        MAS: False
    optimizer:
      type: Adam
      params:
        lr: 0.001
        betas: [0.9, 0.98]
        eps: 1.0e-9
        weight_decay: 0.0
    scheduler:
      type: NoamLR
      params:
        warmup_steps: 4000
 linguistic_unit: 
  cleaners: english_cleaners
  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  MelReconLoss:
    enable: True
    params:
      loss_type: mae
  ProsodyReconLoss:
    enable: True
    params:
      loss_type: mae
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 32              
 pin_memory: False            
 num_workers: 4 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 grad_norm: 1.0
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 train_max_steps: 1000000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhizhe_emo/am/pytorch_model.bin
+++ b/voices/zhizhe_emo/am/pytorch_model.bin
--- a/voices/zhizhe_emo/audio_config.yaml
+++ b/voices/zhizhe_emo/audio_config.yaml
@ -0,0 +1,27 @@
 # Audio processing configs
 audio_config:
  # Preprocess
  wav_normalize: True
  trim_silence: True
  trim_silence_threshold_db: 60
  preemphasize: False
  # Feature extraction
  sampling_rate: 16000
  hop_length: 200
  win_length: 1000
  n_fft: 2048
  n_mels: 80
  fmin: 0.0
  fmax: 8000.0
  phone_level_feature: True
  # Normalization
  norm_type: "mean_std"  # "mean_std" or "global"
  max_norm: 1.0
  symmetric: False
  min_level_db: -100.0
  ref_level_db: 20
  num_workers: 16
--- a/voices/zhizhe_emo/dict/emo_category_dict.txt
+++ b/voices/zhizhe_emo/dict/emo_category_dict.txt
@ -0,0 +1,33 @@
 emotion_none
 emotion_neutral
 emotion_angry
 emotion_disgust
 emotion_fear
 emotion_happy
 emotion_sad
 emotion_surprise
 emotion_calm
 emotion_gentle
 emotion_relax
 emotion_lyrical
 emotion_serious
 emotion_disgruntled
 emotion_satisfied
 emotion_disappointed
 emotion_excited
 emotion_anxiety
 emotion_jealousy
 emotion_hate
 emotion_pity
 emotion_pleasure
 emotion_arousal
 emotion_dominance
 emotion_placeholder1
 emotion_placeholder2
 emotion_placeholder3
 emotion_placeholder4
 emotion_placeholder5
 emotion_placeholder6
 emotion_placeholder7
 emotion_placeholder8
 emotion_placeholder9
--- a/voices/zhizhe_emo/dict/speaker_dict.txt
+++ b/voices/zhizhe_emo/dict/speaker_dict.txt
@ -0,0 +1,6 @@
 F7
 F74
 FBYN
 FRXL
 M7
 xiaoyu
--- a/voices/zhizhe_emo/dict/sy_dict.txt
+++ b/voices/zhizhe_emo/dict/sy_dict.txt
@ -0,0 +1,144 @@
 a_c
 ai_c
 an_c
 ang_c
 ao_c
 b_c
 c_c
 ch_c
 d_c
 e_c
 ei_c
 en_c
 eng_c
 er_c
 f_c
 g_c
 h_c
 i_c
 ia_c
 ian_c
 iang_c
 iao_c
 ie_c
 ih_c
 ii_c
 in_c
 ing_c
 io_c
 iong_c
 iou_c
 j_c
 k_c
 l_c
 m_c
 n_c
 o_c
 ong_c
 ou_c
 p_c
 q_c
 r_c
 s_c
 sh_c
 t_c
 u_c
 ua_c
 uai_c
 uan_c
 uang_c
 uei_c
 uen_c
 ueng_c
 uo_c
 v_c
 van_c
 ve_c
 vn_c
 xx_c
 z_c
 zh_c
 w_c
 y_c
 ga
 ge
 go
 aa
 ae
 ah
 ao
 aw
 ay
 b
 ch
 d
 dh
 eh
 er
 ey
 f
 g
 hh
 ih
 iy
 jh
 k
 l
 m
 n
 ng
 ow
 oy
 p
 r
 s
 sh
 t
 th
 uh
 uw
 v
 w
 y
 z
 zh
 air_c
 angr_c
 anr_c
 aor_c
 ar_c
 eir_c
 engr_c
 enr_c
 iangr_c
 ianr_c
 iaor_c
 iar_c
 ier_c
 ihr_c
 iir_c
 ingr_c
 inr_c
 iongr_c
 iour_c
 ir_c
 ongr_c
 or_c
 our_c
 uair_c
 uangr_c
 uanr_c
 uar_c
 ueir_c
 uenr_c
 uor_c
 ur_c
 vanr_c
 ver_c
 vnr_c
 vr_c
 pau
 #1
 #2
 #3
 #4
--- a/voices/zhizhe_emo/dict/syllable_flag_dict.txt
+++ b/voices/zhizhe_emo/dict/syllable_flag_dict.txt
@ -0,0 +1,5 @@
 s_begin
 s_end
 s_none
 s_both
 s_middle
--- a/voices/zhizhe_emo/dict/tone_dict.txt
+++ b/voices/zhizhe_emo/dict/tone_dict.txt
@ -0,0 +1,7 @@
 tone1
 tone_none
 tone4
 tone2
 tone3
 tone5
 tone0
--- a/voices/zhizhe_emo/dict/word_segment_dict.txt
+++ b/voices/zhizhe_emo/dict/word_segment_dict.txt
@ -0,0 +1,5 @@
 word_begin
 word_end
 word_middle
 word_both
 word_none
--- a/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
+++ b/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
--- a/voices/zhizhe_emo/voc/config.yaml
+++ b/voices/zhizhe_emo/voc/config.yaml
@ -0,0 +1,188 @@
 model_type: hifigan
 Model:
 ###########################################################
 #         GENERATOR NETWORK ARCHITECTURE SETTING          #
 ###########################################################
  Generator:
    params:
      in_channels: 80                       
      out_channels: 1                      
      channels: 256                       
      kernel_size: 7                     
      upsample_scales: [10, 5, 2, 2]        
      upsample_kernal_sizes: [20, 11, 4, 4] 
      resblock_kernel_sizes: [3, 7, 11]     
      resblock_dilations:                  
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
            - [1, 3, 5, 7]
      bias: true                           
      causal: true                           
      nonlinear_activation: "LeakyReLU"    
      nonlinear_activation_params:         
        negative_slope: 0.1
      use_weight_norm: true               
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ###########################################################
 #       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
 ###########################################################
  MultiScaleDiscriminator:
    params:
      scales: 3                              
      downsample_pooling: "DWT"  
      downsample_pooling_params:
          kernel_size: 4                    
          stride: 2                         
          padding: 2                        
      discriminator_params:
          in_channels: 1                     
          out_channels: 1                    
          kernel_sizes: [15, 41, 5, 3]       
          channels: 128                      
          max_downsample_channels: 1024     
          max_groups: 16                   
          bias: true
          downsample_scales: [4, 4, 4, 4, 1]
          nonlinear_activation: "LeakyReLU"  
          nonlinear_activation_params:
            negative_slope: 0.1
      follow_official_norm: true    
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
  MultiPeriodDiscriminator:
    params:
      periods: [2, 3, 5, 7, 11]      
      discriminator_params:
        in_channels: 1                  
        out_channels: 1                  
        kernel_sizes: [5, 3]              
        channels: 32                       
        downsample_scales: [3, 3, 3, 3, 1] 
        max_downsample_channels: 1024      
        bias: true                       
        nonlinear_activation: "LeakyReLU"  
        nonlinear_activation_params:       
          negative_slope: 0.1
        use_spectral_norm: false           
    optimizer:
      type: Adam
      params:
        lr: 2.0e-4
        betas: [0.5, 0.9]
        weight_decay: 0.0
    scheduler:
      type: MultiStepLR
      params:
        gamma: 0.5
        milestones:
            - 200000
            - 400000
            - 600000
            - 800000
 ####################################################
 #                   LOSS SETTING                   #
 ####################################################
 Loss:
  generator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  discriminator_adv_loss:
    enable: True
    params:
      average_by_discriminators: False
    weights: 1.0
  stft_loss:
    enable: False             # Whether to use multi-resolution STFT loss.
  mel_loss:
    enable: True
    params:
      fs: 16000
      fft_size: 2048
      hop_size: 200
      win_length: 1000
      window: "hann"
      num_mels: 80
      fmin: 0
      fmax: 8000
      log_base: null
    weights: 45.0
  subband_stft_loss:
    enable: False
    params:
      fft_sizes: [384, 683, 171]  # List of FFT sizes for STFT-based loss.
      hop_sizes: [35, 75, 15]     # List of hop sizes for STFT-based loss.
      win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
      window: "hann_window"       # Window function for STFT-based loss.
  feat_match_loss:
    enable: True
    params:
      average_by_discriminators: false 
      average_by_layers: false         
    weights: 2.0
 ###########################################################
 #                  DATA LOADER SETTING                    #
 ###########################################################
 batch_size: 16              
 batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
 pin_memory: True            
 num_workers: 2 # FIXME: setting this > 0 may hang on macOS
 remove_short_samples: False 
 allow_cache: True           
 generator_grad_norm: -1
 discriminator_grad_norm: -1
 ###########################################################
 #                    INTERVAL SETTING                     #
 ###########################################################
 generator_train_start_steps: 1     # Number of steps to start to train generator.
 discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
 train_max_steps: 2500000           # Number of training steps.
 save_interval_steps: 20000         # Interval steps to save checkpoint.
 eval_interval_steps: 10000          # Interval steps to evaluate the network.
 log_interval_steps: 1000            # Interval steps to record the training log.
 ###########################################################
 #                     OTHER SETTING                       #
 ###########################################################
 num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhizhe_emo/vocoder/pytorch_model.bin
+++ b/voices/zhizhe_emo/vocoder/pytorch_model.bin