update

2026-07-16 13:22:52 +08:00 · 2022-12-14 16:34:44 +08:00
parent 517f2712ec
commit d551729afa
53 changed files with 2136 additions and 0 deletions
--- a/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
+++ b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
--- a/voices/zhiyan_emo/voc/config.yaml
+++ b/voices/zhiyan_emo/voc/config.yaml
@ -0,0 +1,188 @@
+model_type: hifigan
+Model:
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+  Generator:
+    params:
+      in_channels: 80                       
+      out_channels: 1                      
+      channels: 256                       
+      kernel_size: 7                     
+      upsample_scales: [10, 5, 2, 2]        
+      upsample_kernal_sizes: [20, 11, 4, 4] 
+      resblock_kernel_sizes: [3, 7, 11]     
+      resblock_dilations:                  
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+      bias: true                           
+      causal: true                           
+      nonlinear_activation: "LeakyReLU"    
+      nonlinear_activation_params:         
+        negative_slope: 0.1
+      use_weight_norm: true               
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+  MultiScaleDiscriminator:
+    params:
+      scales: 3                              
+      downsample_pooling: "DWT"  
+      downsample_pooling_params:
+          kernel_size: 4                    
+          stride: 2                         
+          padding: 2                        
+      discriminator_params:
+          in_channels: 1                     
+          out_channels: 1                    
+          kernel_sizes: [15, 41, 5, 3]       
+          channels: 128                      
+          max_downsample_channels: 1024     
+          max_groups: 16                   
+          bias: true
+          downsample_scales: [4, 4, 4, 4, 1]
+          nonlinear_activation: "LeakyReLU"  
+          nonlinear_activation_params:
+            negative_slope: 0.1
+      follow_official_norm: true    
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+  MultiPeriodDiscriminator:
+    params:
+      periods: [2, 3, 5, 7, 11]      
+      discriminator_params:
+        in_channels: 1                  
+        out_channels: 1                  
+        kernel_sizes: [5, 3]              
+        channels: 32                       
+        downsample_scales: [3, 3, 3, 3, 1] 
+        max_downsample_channels: 1024      
+        bias: true                       
+        nonlinear_activation: "LeakyReLU"  
+        nonlinear_activation_params:       
+          negative_slope: 0.1
+        use_spectral_norm: false           
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  generator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  discriminator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  stft_loss:
+    enable: False             # Whether to use multi-resolution STFT loss.
+
+  mel_loss:
+    enable: True
+    params:
+      fs: 16000
+      fft_size: 2048
+      hop_size: 200
+      win_length: 1000
+      window: "hann"
+      num_mels: 80
+      fmin: 0
+      fmax: 8000
+      log_base: null
+    weights: 45.0
+
+  subband_stft_loss:
+    enable: False
+    params:
+      fft_sizes: [384, 683, 171]  # List of FFT size for STFT-based loss.
+      hop_sizes: [35, 75, 15]     # List of hop size for STFT-based loss
+      win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
+      window: "hann_window"       # Window function for STFT-based loss
+
+  feat_match_loss:
+    enable: True
+    params:
+      average_by_discriminators: false 
+      average_by_layers: false         
+    weights: 2.0
+
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 16              
+batch_max_steps: 9600       # Length of each audio in batch. Make sure dividable by hop_size.
+pin_memory: True            
+num_workers: 2 # FIXME: set > 0 may stuck on macos              
+remove_short_samples: False 
+allow_cache: True           
+
+generator_grad_norm: -1
+
+discriminator_grad_norm: -1
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+generator_train_start_steps: 1     # Number of steps to start to train discriminator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.