update

2026-07-16 13:22:52 +08:00 · 2022-12-14 16:34:44 +08:00
parent 517f2712ec
commit d551729afa
53 changed files with 2136 additions and 0 deletions
--- a/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
+++ b/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
--- a/voices/zhiyan_emo/am/config.yaml
+++ b/voices/zhiyan_emo/am/config.yaml
@ -0,0 +1,105 @@
+model_type: sambert  # Model identifier; this file configures it under Model.KanTtsSAMBERT.
+Model:
+#########################################################
+#         SAMBERT NETWORK ARCHITECTURE SETTING          #
+#########################################################
+  KanTtsSAMBERT:  # Holds three sub-sections: params, optimizer, scheduler.
+    params:  # NOTE: keys below sit 4 spaces deeper; optimizer/scheduler use 2-space steps.
+        max_len: 800
+
+        embedding_dim: 512 
+        encoder_num_layers: 8
+        encoder_num_heads: 8
+        encoder_num_units: 128
+        encoder_ffn_inner_dim: 1024
+        encoder_dropout: 0.1
+        encoder_attention_dropout: 0.1
+        encoder_relu_dropout: 0.1
+        encoder_projection_units: 32
+
+        speaker_units: 32  # presumably the speaker embedding width; confirm against the model code.
+        emotion_units: 32  # presumably the emotion embedding width; confirm against the model code.
+
+        predictor_filter_size: 41
+        predictor_fsmn_num_layers: 3
+        predictor_num_memory_units: 128
+        predictor_ffn_inner_dim: 256
+        predictor_dropout: 0.1
+        predictor_shift: 0
+        predictor_lstm_units: 128
+        dur_pred_prenet_units: [128, 128]
+        dur_pred_lstm_units: 128
+
+        decoder_prenet_units: [256, 256]
+        decoder_num_layers: 12
+        decoder_num_heads: 8
+        decoder_num_units: 128
+        decoder_ffn_inner_dim: 1024
+        decoder_dropout: 0.1
+        decoder_attention_dropout: 0.1
+        decoder_relu_dropout: 0.1
+
+        outputs_per_step: 3
+        num_mels: 80  # Same value as n_mels in audio_config.yaml.
+
+        postnet_filter_size: 41
+        postnet_fsmn_num_layers: 4
+        postnet_num_memory_units: 256
+        postnet_ffn_inner_dim: 512
+        postnet_dropout: 0.1
+        postnet_shift: 17
+        postnet_lstm_units: 128
+        MAS: False  # NOTE(review): presumably toggles monotonic alignment search; confirm.
+
+    optimizer:
+      type: Adam
+      params:
+        lr: 0.001
+        betas: [0.9, 0.98]
+        eps: 1.0e-9  # Keep the "1.0e-9" form: YAML 1.1 loaders such as PyYAML read "1e-9" as a string.
+        weight_decay: 0.0
+    scheduler:
+      type: NoamLR
+      params:
+        warmup_steps: 4000
+
+linguistic_unit:  # Cleaner name, linguistic feature types and speaker names.
+  cleaners: english_cleaners
+  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category  # One comma-separated string, not a YAML list.
+  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu  # Comma-separated string; same six names, in the same order, as dict/speaker_dict.txt.
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:  # Two loss entries; each has an enable switch and a loss_type parameter.
+  MelReconLoss:
+    enable: True
+    params:
+      loss_type: mae  # presumably mean absolute error (L1); confirm against the loss implementation.
+
+  ProsodyReconLoss:
+    enable: True
+    params:
+      loss_type: mae  # Same loss_type as MelReconLoss above.
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 32              
+pin_memory: False            
+num_workers: 4  # FIXME: values > 0 may hang on macOS (the value set here is > 0).
+remove_short_samples: False 
+allow_cache: True           
+grad_norm: 1.0  # presumably the gradient-clipping norm; confirm in the trainer.
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+train_max_steps: 1000000           # Total number of training steps.
+save_interval_steps: 20000         # Number of steps between checkpoint saves.
+eval_interval_steps: 10000          # Number of steps between evaluations of the network.
+log_interval_steps: 1000            # Number of steps between training-log records.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhiyan_emo/am/pytorch_model.bin
+++ b/voices/zhiyan_emo/am/pytorch_model.bin
--- a/voices/zhiyan_emo/audio_config.yaml
+++ b/voices/zhiyan_emo/audio_config.yaml
@ -0,0 +1,27 @@
+# Audio processing configs
+
+audio_config:
+  # Preprocessing
+  wav_normalize: True
+  trim_silence: True
+  trim_silence_threshold_db: 60
+  preemphasize: False
+
+  # Feature extraction
+  sampling_rate: 16000
+  hop_length: 200  # 200 / 16000 = 12.5 ms per hop, assuming the unit is samples.
+  win_length: 1000  # 1000 / 16000 = 62.5 ms, assuming the unit is samples.
+  n_fft: 2048  # Larger than win_length (1000).
+  n_mels: 80
+  fmin: 0.0
+  fmax: 8000.0  # Equals sampling_rate / 2.
+  phone_level_feature: True
+
+  # Normalization
+  norm_type: "mean_std"  # Either "mean_std" or "global".
+  max_norm: 1.0
+  symmetric: False
+  min_level_db: -100.0
+  ref_level_db: 20
+  
+  num_workers: 16  # presumably the worker count for audio processing; confirm against the caller.
--- a/voices/zhiyan_emo/dict/emo_category_dict.txt
+++ b/voices/zhiyan_emo/dict/emo_category_dict.txt
@ -0,0 +1,33 @@
+emotion_none
+emotion_neutral
+emotion_angry
+emotion_disgust
+emotion_fear
+emotion_happy
+emotion_sad
+emotion_surprise
+emotion_calm
+emotion_gentle
+emotion_relax
+emotion_lyrical
+emotion_serious
+emotion_disgruntled
+emotion_satisfied
+emotion_disappointed
+emotion_excited
+emotion_anxiety
+emotion_jealousy
+emotion_hate
+emotion_pity
+emotion_pleasure
+emotion_arousal
+emotion_dominance
+emotion_placeholder1
+emotion_placeholder2
+emotion_placeholder3
+emotion_placeholder4
+emotion_placeholder5
+emotion_placeholder6
+emotion_placeholder7
+emotion_placeholder8
+emotion_placeholder9
--- a/voices/zhiyan_emo/dict/speaker_dict.txt
+++ b/voices/zhiyan_emo/dict/speaker_dict.txt
@ -0,0 +1,6 @@
+F7
+F74
+FBYN
+FRXL
+M7
+xiaoyu
--- a/voices/zhiyan_emo/dict/sy_dict.txt
+++ b/voices/zhiyan_emo/dict/sy_dict.txt
@ -0,0 +1,144 @@
+a_c
+ai_c
+an_c
+ang_c
+ao_c
+b_c
+c_c
+ch_c
+d_c
+e_c
+ei_c
+en_c
+eng_c
+er_c
+f_c
+g_c
+h_c
+i_c
+ia_c
+ian_c
+iang_c
+iao_c
+ie_c
+ih_c
+ii_c
+in_c
+ing_c
+io_c
+iong_c
+iou_c
+j_c
+k_c
+l_c
+m_c
+n_c
+o_c
+ong_c
+ou_c
+p_c
+q_c
+r_c
+s_c
+sh_c
+t_c
+u_c
+ua_c
+uai_c
+uan_c
+uang_c
+uei_c
+uen_c
+ueng_c
+uo_c
+v_c
+van_c
+ve_c
+vn_c
+xx_c
+z_c
+zh_c
+w_c
+y_c
+ga
+ge
+go
+aa
+ae
+ah
+ao
+aw
+ay
+b
+ch
+d
+dh
+eh
+er
+ey
+f
+g
+hh
+ih
+iy
+jh
+k
+l
+m
+n
+ng
+ow
+oy
+p
+r
+s
+sh
+t
+th
+uh
+uw
+v
+w
+y
+z
+zh
+air_c
+angr_c
+anr_c
+aor_c
+ar_c
+eir_c
+engr_c
+enr_c
+iangr_c
+ianr_c
+iaor_c
+iar_c
+ier_c
+ihr_c
+iir_c
+ingr_c
+inr_c
+iongr_c
+iour_c
+ir_c
+ongr_c
+or_c
+our_c
+uair_c
+uangr_c
+uanr_c
+uar_c
+ueir_c
+uenr_c
+uor_c
+ur_c
+vanr_c
+ver_c
+vnr_c
+vr_c
+pau
+#1
+#2
+#3
+#4
--- a/voices/zhiyan_emo/dict/syllable_flag_dict.txt
+++ b/voices/zhiyan_emo/dict/syllable_flag_dict.txt
@ -0,0 +1,5 @@
+s_begin
+s_end
+s_none
+s_both
+s_middle
--- a/voices/zhiyan_emo/dict/tone_dict.txt
+++ b/voices/zhiyan_emo/dict/tone_dict.txt
@ -0,0 +1,7 @@
+tone1
+tone_none
+tone4
+tone2
+tone3
+tone5
+tone0
--- a/voices/zhiyan_emo/dict/word_segment_dict.txt
+++ b/voices/zhiyan_emo/dict/word_segment_dict.txt
@ -0,0 +1,5 @@
+word_begin
+word_end
+word_middle
+word_both
+word_none
--- a/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
+++ b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
--- a/voices/zhiyan_emo/voc/config.yaml
+++ b/voices/zhiyan_emo/voc/config.yaml
@ -0,0 +1,188 @@
+model_type: hifigan  # Model identifier; this file configures a Generator and two discriminators under Model.
+Model:
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+  Generator:
+    params:
+      in_channels: 80  # Same value as mel_loss.params.num_mels in this file.
+      out_channels: 1                      
+      channels: 256                       
+      kernel_size: 7                     
+      upsample_scales: [10, 5, 2, 2]  # Product is 200, the same as mel_loss.params.hop_size in this file.
+      upsample_kernal_sizes: [20, 11, 4, 4]  # NOTE(review): "kernal" spelling presumably matches the consumer's parameter name; confirm before renaming.
+      resblock_kernel_sizes: [3, 7, 11]     
+      resblock_dilations:  # Three lists, matching the three resblock_kernel_sizes entries.
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+      bias: true                           
+      causal: true                           
+      nonlinear_activation: "LeakyReLU"    
+      nonlinear_activation_params:         
+        negative_slope: 0.1
+      use_weight_norm: true               
+    optimizer:  # Same optimizer and scheduler values as both discriminators in this file.
+      type: Adam
+      params:
+        lr: 2.0e-4  # Keep the decimal point: YAML 1.1 loaders such as PyYAML read "2e-4" as a string.
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5  # presumably the learning-rate multiplier applied at each milestone; confirm.
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+  MultiScaleDiscriminator:  # Sub-sections: params, optimizer, scheduler.
+    params:
+      scales: 3                              
+      downsample_pooling: "DWT"  # NOTE(review): presumably discrete-wavelet-transform pooling; confirm.
+      downsample_pooling_params:  # NOTE: children here and under discriminator_params are indented 4 spaces.
+          kernel_size: 4                    
+          stride: 2                         
+          padding: 2                        
+      discriminator_params:
+          in_channels: 1                     
+          out_channels: 1                    
+          kernel_sizes: [15, 41, 5, 3]       
+          channels: 128                      
+          max_downsample_channels: 1024     
+          max_groups: 16                   
+          bias: true
+          downsample_scales: [4, 4, 4, 4, 1]
+          nonlinear_activation: "LeakyReLU"  
+          nonlinear_activation_params:
+            negative_slope: 0.1
+      follow_official_norm: true    
+    optimizer:  # Same optimizer and scheduler values as Generator and MultiPeriodDiscriminator.
+      type: Adam
+      params:
+        lr: 2.0e-4  # Keep the decimal point: YAML 1.1 loaders such as PyYAML read "2e-4" as a string.
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5  # presumably the learning-rate multiplier applied at each milestone; confirm.
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+  MultiPeriodDiscriminator:  # Sub-sections: params, optimizer, scheduler.
+    params:
+      periods: [2, 3, 5, 7, 11]  # Five prime periods; presumably one sub-discriminator per entry (confirm).
+      discriminator_params:
+        in_channels: 1                  
+        out_channels: 1                  
+        kernel_sizes: [5, 3]              
+        channels: 32                       
+        downsample_scales: [3, 3, 3, 3, 1] 
+        max_downsample_channels: 1024      
+        bias: true                       
+        nonlinear_activation: "LeakyReLU"  
+        nonlinear_activation_params:       
+          negative_slope: 0.1
+        use_spectral_norm: false           
+    optimizer:  # Same optimizer and scheduler values as Generator and MultiScaleDiscriminator.
+      type: Adam
+      params:
+        lr: 2.0e-4  # Keep the decimal point: YAML 1.1 loaders such as PyYAML read "2e-4" as a string.
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5  # presumably the learning-rate multiplier applied at each milestone; confirm.
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:  # Six loss entries, each with an enable switch; four are enabled, and each of those has a weights value.
+  generator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False  # NOTE: booleans in this section mix "False" and "false" spellings.
+    weights: 1.0
+
+  discriminator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  stft_loss:
+    enable: False             # Whether to use multi-resolution STFT loss.
+
+  mel_loss:
+    enable: True
+    params:  # fs, fft_size, hop_size, win_length, num_mels, fmin and fmax equal the audio_config.yaml values.
+      fs: 16000
+      fft_size: 2048
+      hop_size: 200
+      win_length: 1000
+      window: "hann"
+      num_mels: 80
+      fmin: 0
+      fmax: 8000
+      log_base: null  # NOTE(review): presumably null selects the natural logarithm; confirm in the loss code.
+    weights: 45.0  # Largest weight among the enabled losses (the others are 1.0, 1.0 and 2.0).
+
+  subband_stft_loss:  # Disabled; unlike the enabled losses it has no weights key.
+    enable: False
+    params:
+      fft_sizes: [384, 683, 171]  # List of FFT sizes for the STFT-based loss.
+      hop_sizes: [35, 75, 15]     # List of hop sizes for the STFT-based loss.
+      win_lengths: [150, 300, 60] # List of window lengths for the STFT-based loss.
+      window: "hann_window"       # Window function for the STFT-based loss (mel_loss spells its window "hann").
+
+  feat_match_loss:
+    enable: True
+    params:
+      average_by_discriminators: false 
+      average_by_layers: false         
+    weights: 2.0
+
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 16              
+batch_max_steps: 9600       # Length of each audio clip in a batch. Must be divisible by hop_size (9600 / 200 = 48).
+pin_memory: True            
+num_workers: 2  # FIXME: values > 0 may hang on macOS (the value set here is > 0).
+remove_short_samples: False 
+allow_cache: True           
+
+generator_grad_norm: -1  # NOTE(review): presumably a non-positive value disables gradient clipping; confirm.
+
+discriminator_grad_norm: -1  # NOTE(review): presumably a non-positive value disables gradient clipping; confirm.
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+generator_train_start_steps: 1     # Step at which generator training starts.
+discriminator_train_start_steps: 0 # Step at which discriminator training starts.
+train_max_steps: 2500000           # Total number of training steps.
+save_interval_steps: 20000         # Number of steps between checkpoint saves.
+eval_interval_steps: 10000          # Number of steps between evaluations of the network.
+log_interval_steps: 1000            # Number of steps between training-log records.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
--- a/voices/zhiyan_emo/vocoder/pytorch_model.bin
+++ b/voices/zhiyan_emo/vocoder/pytorch_model.bin