From d551729afac338c1006e4c7f2b11ae890cb552f2 Mon Sep 17 00:00:00 2001
From: "jiaqi.sjq" <jiaqi.sjq@alibaba-inc.com>
Date: Wed, 14 Dec 2022 16:34:44 +0800
Subject: [PATCH] update

---
 voices/voices.json                            |   8 +
 voices/zhibei_emo/am/ckpt/checkpoint_0.pth    |   3 +
 voices/zhibei_emo/am/config.yaml              | 105 ++++++++++
 voices/zhibei_emo/am/pytorch_model.bin        |   3 +
 voices/zhibei_emo/audio_config.yaml           |  27 +++
 voices/zhibei_emo/dict/emo_category_dict.txt  |  33 +++
 voices/zhibei_emo/dict/speaker_dict.txt       |   6 +
 voices/zhibei_emo/dict/sy_dict.txt            | 144 ++++++++++++++
 voices/zhibei_emo/dict/syllable_flag_dict.txt |   5 +
 voices/zhibei_emo/dict/tone_dict.txt          |   7 +
 voices/zhibei_emo/dict/word_segment_dict.txt  |   5 +
 voices/zhibei_emo/voc/ckpt/checkpoint_0.pth   |   3 +
 voices/zhibei_emo/voc/config.yaml             | 188 ++++++++++++++++++
 voices/zhibei_emo/vocoder/pytorch_model.bin   |   3 +
 voices/zhitian_emo/am/ckpt/checkpoint_0.pth   |   3 +
 voices/zhitian_emo/am/config.yaml             | 105 ++++++++++
 voices/zhitian_emo/am/pytorch_model.bin       |   3 +
 voices/zhitian_emo/audio_config.yaml          |  27 +++
 voices/zhitian_emo/dict/emo_category_dict.txt |  33 +++
 voices/zhitian_emo/dict/speaker_dict.txt      |   6 +
 voices/zhitian_emo/dict/sy_dict.txt           | 144 ++++++++++++++
 .../zhitian_emo/dict/syllable_flag_dict.txt   |   5 +
 voices/zhitian_emo/dict/tone_dict.txt         |   7 +
 voices/zhitian_emo/dict/word_segment_dict.txt |   5 +
 voices/zhitian_emo/voc/ckpt/checkpoint_0.pth  |   3 +
 voices/zhitian_emo/voc/config.yaml            | 188 ++++++++++++++++++
 voices/zhitian_emo/vocoder/pytorch_model.bin  |   3 +
 voices/zhiyan_emo/am/ckpt/checkpoint_0.pth    |   3 +
 voices/zhiyan_emo/am/config.yaml              | 105 ++++++++++
 voices/zhiyan_emo/am/pytorch_model.bin        |   3 +
 voices/zhiyan_emo/audio_config.yaml           |  27 +++
 voices/zhiyan_emo/dict/emo_category_dict.txt  |  33 +++
 voices/zhiyan_emo/dict/speaker_dict.txt       |   6 +
 voices/zhiyan_emo/dict/sy_dict.txt            | 144 ++++++++++++++
 voices/zhiyan_emo/dict/syllable_flag_dict.txt |   5 +
 voices/zhiyan_emo/dict/tone_dict.txt          |   7 +
 voices/zhiyan_emo/dict/word_segment_dict.txt  |   5 +
 voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth   |   3 +
 voices/zhiyan_emo/voc/config.yaml             | 188 ++++++++++++++++++
 voices/zhiyan_emo/vocoder/pytorch_model.bin   |   3 +
 voices/zhizhe_emo/am/ckpt/checkpoint_0.pth    |   3 +
 voices/zhizhe_emo/am/config.yaml              | 105 ++++++++++
 voices/zhizhe_emo/am/pytorch_model.bin        |   3 +
 voices/zhizhe_emo/audio_config.yaml           |  27 +++
 voices/zhizhe_emo/dict/emo_category_dict.txt  |  33 +++
 voices/zhizhe_emo/dict/speaker_dict.txt       |   6 +
 voices/zhizhe_emo/dict/sy_dict.txt            | 144 ++++++++++++++
 voices/zhizhe_emo/dict/syllable_flag_dict.txt |   5 +
 voices/zhizhe_emo/dict/tone_dict.txt          |   7 +
 voices/zhizhe_emo/dict/word_segment_dict.txt  |   5 +
 voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth   |   3 +
 voices/zhizhe_emo/voc/config.yaml             | 188 ++++++++++++++++++
 voices/zhizhe_emo/vocoder/pytorch_model.bin   |   3 +
 53 files changed, 2136 insertions(+)
 create mode 100644 voices/voices.json
 create mode 100644 voices/zhibei_emo/am/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhibei_emo/am/config.yaml
 create mode 100644 voices/zhibei_emo/am/pytorch_model.bin
 create mode 100644 voices/zhibei_emo/audio_config.yaml
 create mode 100755 voices/zhibei_emo/dict/emo_category_dict.txt
 create mode 100755 voices/zhibei_emo/dict/speaker_dict.txt
 create mode 100755 voices/zhibei_emo/dict/sy_dict.txt
 create mode 100755 voices/zhibei_emo/dict/syllable_flag_dict.txt
 create mode 100755 voices/zhibei_emo/dict/tone_dict.txt
 create mode 100755 voices/zhibei_emo/dict/word_segment_dict.txt
 create mode 100644 voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhibei_emo/voc/config.yaml
 create mode 100644 voices/zhibei_emo/vocoder/pytorch_model.bin
 create mode 100644 voices/zhitian_emo/am/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhitian_emo/am/config.yaml
 create mode 100644 voices/zhitian_emo/am/pytorch_model.bin
 create mode 100644 voices/zhitian_emo/audio_config.yaml
 create mode 100755 voices/zhitian_emo/dict/emo_category_dict.txt
 create mode 100755 voices/zhitian_emo/dict/speaker_dict.txt
 create mode 100755 voices/zhitian_emo/dict/sy_dict.txt
 create mode 100755 voices/zhitian_emo/dict/syllable_flag_dict.txt
 create mode 100755 voices/zhitian_emo/dict/tone_dict.txt
 create mode 100755 voices/zhitian_emo/dict/word_segment_dict.txt
 create mode 100644 voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhitian_emo/voc/config.yaml
 create mode 100644 voices/zhitian_emo/vocoder/pytorch_model.bin
 create mode 100644 voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhiyan_emo/am/config.yaml
 create mode 100644 voices/zhiyan_emo/am/pytorch_model.bin
 create mode 100644 voices/zhiyan_emo/audio_config.yaml
 create mode 100755 voices/zhiyan_emo/dict/emo_category_dict.txt
 create mode 100755 voices/zhiyan_emo/dict/speaker_dict.txt
 create mode 100755 voices/zhiyan_emo/dict/sy_dict.txt
 create mode 100755 voices/zhiyan_emo/dict/syllable_flag_dict.txt
 create mode 100755 voices/zhiyan_emo/dict/tone_dict.txt
 create mode 100755 voices/zhiyan_emo/dict/word_segment_dict.txt
 create mode 100644 voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhiyan_emo/voc/config.yaml
 create mode 100644 voices/zhiyan_emo/vocoder/pytorch_model.bin
 create mode 100644 voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhizhe_emo/am/config.yaml
 create mode 100644 voices/zhizhe_emo/am/pytorch_model.bin
 create mode 100644 voices/zhizhe_emo/audio_config.yaml
 create mode 100755 voices/zhizhe_emo/dict/emo_category_dict.txt
 create mode 100755 voices/zhizhe_emo/dict/speaker_dict.txt
 create mode 100755 voices/zhizhe_emo/dict/sy_dict.txt
 create mode 100755 voices/zhizhe_emo/dict/syllable_flag_dict.txt
 create mode 100755 voices/zhizhe_emo/dict/tone_dict.txt
 create mode 100755 voices/zhizhe_emo/dict/word_segment_dict.txt
 create mode 100644 voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
 create mode 100644 voices/zhizhe_emo/voc/config.yaml
 create mode 100644 voices/zhizhe_emo/vocoder/pytorch_model.bin

diff --git a/voices/voices.json b/voices/voices.json
new file mode 100644
index 0000000..c43f58f
--- /dev/null
+++ b/voices/voices.json
@@ -0,0 +1,8 @@
+{
+  "voices": [
+    "zhitian_emo",
+    "zhibei_emo",
+    "zhizhe_emo",
+    "zhiyan_emo"
+  ]
+}
diff --git a/voices/zhibei_emo/am/ckpt/checkpoint_0.pth b/voices/zhibei_emo/am/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..99349c5
--- /dev/null
+++ b/voices/zhibei_emo/am/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cd7fde59dac89ab5a5a076a8a74519953ceb1be67a3dabc24aad3d90b4521334
+size 49315631
diff --git a/voices/zhibei_emo/am/config.yaml b/voices/zhibei_emo/am/config.yaml
new file mode 100644
index 0000000..56a980a
--- /dev/null
+++ b/voices/zhibei_emo/am/config.yaml
@@ -0,0 +1,105 @@
+model_type: sambert
+Model:
+#########################################################
+#         SAMBERT NETWORK ARCHITECTURE SETTING          #
+#########################################################
+  KanTtsSAMBERT:
+    params:
+        max_len: 800
+
+        embedding_dim: 512 
+        encoder_num_layers: 8
+        encoder_num_heads: 8
+        encoder_num_units: 128
+        encoder_ffn_inner_dim: 1024
+        encoder_dropout: 0.1
+        encoder_attention_dropout: 0.1
+        encoder_relu_dropout: 0.1
+        encoder_projection_units: 32
+
+        speaker_units: 32
+        emotion_units: 32
+
+        predictor_filter_size: 41
+        predictor_fsmn_num_layers: 3
+        predictor_num_memory_units: 128
+        predictor_ffn_inner_dim: 256
+        predictor_dropout: 0.1
+        predictor_shift: 0
+        predictor_lstm_units: 128
+        dur_pred_prenet_units: [128, 128]
+        dur_pred_lstm_units: 128
+
+        decoder_prenet_units: [256, 256]
+        decoder_num_layers: 12
+        decoder_num_heads: 8
+        decoder_num_units: 128
+        decoder_ffn_inner_dim: 1024
+        decoder_dropout: 0.1
+        decoder_attention_dropout: 0.1
+        decoder_relu_dropout: 0.1
+
+        outputs_per_step: 3
+        num_mels: 80
+
+        postnet_filter_size: 41
+        postnet_fsmn_num_layers: 4
+        postnet_num_memory_units: 256
+        postnet_ffn_inner_dim: 512
+        postnet_dropout: 0.1
+        postnet_shift: 17
+        postnet_lstm_units: 128
+        MAS: False
+
+    optimizer:
+      type: Adam
+      params:
+        lr: 0.001
+        betas: [0.9, 0.98]
+        eps: 1.0e-9
+        weight_decay: 0.0
+    scheduler:
+      type: NoamLR
+      params:
+        warmup_steps: 4000
+
+linguistic_unit: 
+  cleaners: english_cleaners
+  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
+  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  MelReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+  ProsodyReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 32              
+pin_memory: False            
+num_workers: 4 # FIXME: setting this > 0 may hang on macOS
+remove_short_samples: False 
+allow_cache: True           
+grad_norm: 1.0
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+train_max_steps: 1000000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhibei_emo/am/pytorch_model.bin b/voices/zhibei_emo/am/pytorch_model.bin
new file mode 100644
index 0000000..1f37fcb
--- /dev/null
+++ b/voices/zhibei_emo/am/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f8750fbfe39c3b18e8a024900f1e81ea07192fb520f25ff1bad7e9ea88ebc34c
+size 49234411
diff --git a/voices/zhibei_emo/audio_config.yaml b/voices/zhibei_emo/audio_config.yaml
new file mode 100644
index 0000000..233817c
--- /dev/null
+++ b/voices/zhibei_emo/audio_config.yaml
@@ -0,0 +1,27 @@
+# Audio processing configs
+
+audio_config:
+  # Preprocess
+  wav_normalize: True
+  trim_silence: True
+  trim_silence_threshold_db: 60
+  preemphasize: False
+
+  # Feature extraction
+  sampling_rate: 16000
+  hop_length: 200
+  win_length: 1000
+  n_fft: 2048
+  n_mels: 80
+  fmin: 0.0
+  fmax: 8000.0
+  phone_level_feature: True
+
+  # Normalization
+  norm_type: "mean_std"  # "mean_std" or "global"
+  max_norm: 1.0
+  symmetric: False
+  min_level_db: -100.0
+  ref_level_db: 20
+  
+  num_workers: 16
diff --git a/voices/zhibei_emo/dict/emo_category_dict.txt b/voices/zhibei_emo/dict/emo_category_dict.txt
new file mode 100755
index 0000000..dfd88e8
--- /dev/null
+++ b/voices/zhibei_emo/dict/emo_category_dict.txt
@@ -0,0 +1,33 @@
+emotion_none
+emotion_neutral
+emotion_angry
+emotion_disgust
+emotion_fear
+emotion_happy
+emotion_sad
+emotion_surprise
+emotion_calm
+emotion_gentle
+emotion_relax
+emotion_lyrical
+emotion_serious
+emotion_disgruntled
+emotion_satisfied
+emotion_disappointed
+emotion_excited
+emotion_anxiety
+emotion_jealousy
+emotion_hate
+emotion_pity
+emotion_pleasure
+emotion_arousal
+emotion_dominance
+emotion_placeholder1
+emotion_placeholder2
+emotion_placeholder3
+emotion_placeholder4
+emotion_placeholder5
+emotion_placeholder6
+emotion_placeholder7
+emotion_placeholder8
+emotion_placeholder9
\ No newline at end of file
diff --git a/voices/zhibei_emo/dict/speaker_dict.txt b/voices/zhibei_emo/dict/speaker_dict.txt
new file mode 100755
index 0000000..af0ca1d
--- /dev/null
+++ b/voices/zhibei_emo/dict/speaker_dict.txt
@@ -0,0 +1,6 @@
+F7
+F74
+FBYN
+FRXL
+M7
+xiaoyu
diff --git a/voices/zhibei_emo/dict/sy_dict.txt b/voices/zhibei_emo/dict/sy_dict.txt
new file mode 100755
index 0000000..ec54511
--- /dev/null
+++ b/voices/zhibei_emo/dict/sy_dict.txt
@@ -0,0 +1,144 @@
+a_c
+ai_c
+an_c
+ang_c
+ao_c
+b_c
+c_c
+ch_c
+d_c
+e_c
+ei_c
+en_c
+eng_c
+er_c
+f_c
+g_c
+h_c
+i_c
+ia_c
+ian_c
+iang_c
+iao_c
+ie_c
+ih_c
+ii_c
+in_c
+ing_c
+io_c
+iong_c
+iou_c
+j_c
+k_c
+l_c
+m_c
+n_c
+o_c
+ong_c
+ou_c
+p_c
+q_c
+r_c
+s_c
+sh_c
+t_c
+u_c
+ua_c
+uai_c
+uan_c
+uang_c
+uei_c
+uen_c
+ueng_c
+uo_c
+v_c
+van_c
+ve_c
+vn_c
+xx_c
+z_c
+zh_c
+w_c
+y_c
+ga
+ge
+go
+aa
+ae
+ah
+ao
+aw
+ay
+b
+ch
+d
+dh
+eh
+er
+ey
+f
+g
+hh
+ih
+iy
+jh
+k
+l
+m
+n
+ng
+ow
+oy
+p
+r
+s
+sh
+t
+th
+uh
+uw
+v
+w
+y
+z
+zh
+air_c
+angr_c
+anr_c
+aor_c
+ar_c
+eir_c
+engr_c
+enr_c
+iangr_c
+ianr_c
+iaor_c
+iar_c
+ier_c
+ihr_c
+iir_c
+ingr_c
+inr_c
+iongr_c
+iour_c
+ir_c
+ongr_c
+or_c
+our_c
+uair_c
+uangr_c
+uanr_c
+uar_c
+ueir_c
+uenr_c
+uor_c
+ur_c
+vanr_c
+ver_c
+vnr_c
+vr_c
+pau
+#1
+#2
+#3
+#4
\ No newline at end of file
diff --git a/voices/zhibei_emo/dict/syllable_flag_dict.txt b/voices/zhibei_emo/dict/syllable_flag_dict.txt
new file mode 100755
index 0000000..84a4d14
--- /dev/null
+++ b/voices/zhibei_emo/dict/syllable_flag_dict.txt
@@ -0,0 +1,5 @@
+s_begin
+s_end
+s_none
+s_both
+s_middle
diff --git a/voices/zhibei_emo/dict/tone_dict.txt b/voices/zhibei_emo/dict/tone_dict.txt
new file mode 100755
index 0000000..7af26ed
--- /dev/null
+++ b/voices/zhibei_emo/dict/tone_dict.txt
@@ -0,0 +1,7 @@
+tone1
+tone_none
+tone4
+tone2
+tone3
+tone5
+tone0
diff --git a/voices/zhibei_emo/dict/word_segment_dict.txt b/voices/zhibei_emo/dict/word_segment_dict.txt
new file mode 100755
index 0000000..667bcf9
--- /dev/null
+++ b/voices/zhibei_emo/dict/word_segment_dict.txt
@@ -0,0 +1,5 @@
+word_begin
+word_end
+word_middle
+word_both
+word_none
diff --git a/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth b/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..48f4107
--- /dev/null
+++ b/voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1ff65e91d6dda79c878564920b0964a9e5449eb658a8d0a8d51351ca2f56460c
+size 19594437
diff --git a/voices/zhibei_emo/voc/config.yaml b/voices/zhibei_emo/voc/config.yaml
new file mode 100644
index 0000000..e9853c3
--- /dev/null
+++ b/voices/zhibei_emo/voc/config.yaml
@@ -0,0 +1,188 @@
+model_type: hifigan
+Model:
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+  Generator:
+    params:
+      in_channels: 80                       
+      out_channels: 1                      
+      channels: 256                       
+      kernel_size: 7                     
+      upsample_scales: [10, 5, 2, 2]        
+      upsample_kernal_sizes: [20, 11, 4, 4] 
+      resblock_kernel_sizes: [3, 7, 11]     
+      resblock_dilations:                  
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+      bias: true                           
+      causal: true                           
+      nonlinear_activation: "LeakyReLU"    
+      nonlinear_activation_params:         
+        negative_slope: 0.1
+      use_weight_norm: true               
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+  MultiScaleDiscriminator:
+    params:
+      scales: 3                              
+      downsample_pooling: "DWT"  
+      downsample_pooling_params:
+          kernel_size: 4                    
+          stride: 2                         
+          padding: 2                        
+      discriminator_params:
+          in_channels: 1                     
+          out_channels: 1                    
+          kernel_sizes: [15, 41, 5, 3]       
+          channels: 128                      
+          max_downsample_channels: 1024     
+          max_groups: 16                   
+          bias: true
+          downsample_scales: [4, 4, 4, 4, 1]
+          nonlinear_activation: "LeakyReLU"  
+          nonlinear_activation_params:
+            negative_slope: 0.1
+      follow_official_norm: true    
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+  MultiPeriodDiscriminator:
+    params:
+      periods: [2, 3, 5, 7, 11]      
+      discriminator_params:
+        in_channels: 1                  
+        out_channels: 1                  
+        kernel_sizes: [5, 3]              
+        channels: 32                       
+        downsample_scales: [3, 3, 3, 3, 1] 
+        max_downsample_channels: 1024      
+        bias: true                       
+        nonlinear_activation: "LeakyReLU"  
+        nonlinear_activation_params:       
+          negative_slope: 0.1
+        use_spectral_norm: false           
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  generator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  discriminator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  stft_loss:
+    enable: False             # Whether to use multi-resolution STFT loss.
+
+  mel_loss:
+    enable: True
+    params:
+      fs: 16000
+      fft_size: 2048
+      hop_size: 200
+      win_length: 1000
+      window: "hann"
+      num_mels: 80
+      fmin: 0
+      fmax: 8000
+      log_base: null
+    weights: 45.0
+
+  subband_stft_loss:
+    enable: False
+    params:
+      fft_sizes: [384, 683, 171]  # List of FFT size for STFT-based loss.
+      hop_sizes: [35, 75, 15]     # List of hop size for STFT-based loss
+      win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
+      window: "hann_window"       # Window function for STFT-based loss
+
+  feat_match_loss:
+    enable: True
+    params:
+      average_by_discriminators: false 
+      average_by_layers: false         
+    weights: 2.0
+
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 16              
+batch_max_steps: 9600       # Length of each audio in batch. Must be divisible by hop_size.
+pin_memory: True            
+num_workers: 2 # FIXME: setting this > 0 may hang on macOS
+remove_short_samples: False 
+allow_cache: True           
+
+generator_grad_norm: -1
+
+discriminator_grad_norm: -1
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+generator_train_start_steps: 1     # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhibei_emo/vocoder/pytorch_model.bin b/voices/zhibei_emo/vocoder/pytorch_model.bin
new file mode 100644
index 0000000..1323513
--- /dev/null
+++ b/voices/zhibei_emo/vocoder/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3b68cd8cda09c79cca36ef6ae17b1b547a4390686a77b0a3eeadc673bb7bb139
+size 19613277
diff --git a/voices/zhitian_emo/am/ckpt/checkpoint_0.pth b/voices/zhitian_emo/am/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..7570874
--- /dev/null
+++ b/voices/zhitian_emo/am/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a4108ce78f7737a386a7c5127fe6f63ca6c9442d2c0ca5263dd5fc2b0dddfae4
+size 49315631
diff --git a/voices/zhitian_emo/am/config.yaml b/voices/zhitian_emo/am/config.yaml
new file mode 100644
index 0000000..56a980a
--- /dev/null
+++ b/voices/zhitian_emo/am/config.yaml
@@ -0,0 +1,105 @@
+model_type: sambert
+Model:
+#########################################################
+#         SAMBERT NETWORK ARCHITECTURE SETTING          #
+#########################################################
+  KanTtsSAMBERT:
+    params:
+        max_len: 800
+
+        embedding_dim: 512 
+        encoder_num_layers: 8
+        encoder_num_heads: 8
+        encoder_num_units: 128
+        encoder_ffn_inner_dim: 1024
+        encoder_dropout: 0.1
+        encoder_attention_dropout: 0.1
+        encoder_relu_dropout: 0.1
+        encoder_projection_units: 32
+
+        speaker_units: 32
+        emotion_units: 32
+
+        predictor_filter_size: 41
+        predictor_fsmn_num_layers: 3
+        predictor_num_memory_units: 128
+        predictor_ffn_inner_dim: 256
+        predictor_dropout: 0.1
+        predictor_shift: 0
+        predictor_lstm_units: 128
+        dur_pred_prenet_units: [128, 128]
+        dur_pred_lstm_units: 128
+
+        decoder_prenet_units: [256, 256]
+        decoder_num_layers: 12
+        decoder_num_heads: 8
+        decoder_num_units: 128
+        decoder_ffn_inner_dim: 1024
+        decoder_dropout: 0.1
+        decoder_attention_dropout: 0.1
+        decoder_relu_dropout: 0.1
+
+        outputs_per_step: 3
+        num_mels: 80
+
+        postnet_filter_size: 41
+        postnet_fsmn_num_layers: 4
+        postnet_num_memory_units: 256
+        postnet_ffn_inner_dim: 512
+        postnet_dropout: 0.1
+        postnet_shift: 17
+        postnet_lstm_units: 128
+        MAS: False
+
+    optimizer:
+      type: Adam
+      params:
+        lr: 0.001
+        betas: [0.9, 0.98]
+        eps: 1.0e-9
+        weight_decay: 0.0
+    scheduler:
+      type: NoamLR
+      params:
+        warmup_steps: 4000
+
+linguistic_unit: 
+  cleaners: english_cleaners
+  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
+  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  MelReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+  ProsodyReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 32              
+pin_memory: False            
+num_workers: 4 # FIXME: setting this > 0 may hang on macOS
+remove_short_samples: False 
+allow_cache: True           
+grad_norm: 1.0
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+train_max_steps: 1000000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhitian_emo/am/pytorch_model.bin b/voices/zhitian_emo/am/pytorch_model.bin
new file mode 100644
index 0000000..780ea34
--- /dev/null
+++ b/voices/zhitian_emo/am/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:61a3641e654e0621e949b0288566e840f170c32a5ba4fa51c3e7bd8247c51040
+size 49234411
diff --git a/voices/zhitian_emo/audio_config.yaml b/voices/zhitian_emo/audio_config.yaml
new file mode 100644
index 0000000..233817c
--- /dev/null
+++ b/voices/zhitian_emo/audio_config.yaml
@@ -0,0 +1,27 @@
+# Audio processing configs
+
+audio_config:
+  # Preprocess
+  wav_normalize: True
+  trim_silence: True
+  trim_silence_threshold_db: 60
+  preemphasize: False
+
+  # Feature extraction
+  sampling_rate: 16000
+  hop_length: 200
+  win_length: 1000
+  n_fft: 2048
+  n_mels: 80
+  fmin: 0.0
+  fmax: 8000.0
+  phone_level_feature: True
+
+  # Normalization
+  norm_type: "mean_std"  # "mean_std" or "global"
+  max_norm: 1.0
+  symmetric: False
+  min_level_db: -100.0
+  ref_level_db: 20
+  
+  num_workers: 16
diff --git a/voices/zhitian_emo/dict/emo_category_dict.txt b/voices/zhitian_emo/dict/emo_category_dict.txt
new file mode 100755
index 0000000..dfd88e8
--- /dev/null
+++ b/voices/zhitian_emo/dict/emo_category_dict.txt
@@ -0,0 +1,33 @@
+emotion_none
+emotion_neutral
+emotion_angry
+emotion_disgust
+emotion_fear
+emotion_happy
+emotion_sad
+emotion_surprise
+emotion_calm
+emotion_gentle
+emotion_relax
+emotion_lyrical
+emotion_serious
+emotion_disgruntled
+emotion_satisfied
+emotion_disappointed
+emotion_excited
+emotion_anxiety
+emotion_jealousy
+emotion_hate
+emotion_pity
+emotion_pleasure
+emotion_arousal
+emotion_dominance
+emotion_placeholder1
+emotion_placeholder2
+emotion_placeholder3
+emotion_placeholder4
+emotion_placeholder5
+emotion_placeholder6
+emotion_placeholder7
+emotion_placeholder8
+emotion_placeholder9
\ No newline at end of file
diff --git a/voices/zhitian_emo/dict/speaker_dict.txt b/voices/zhitian_emo/dict/speaker_dict.txt
new file mode 100755
index 0000000..af0ca1d
--- /dev/null
+++ b/voices/zhitian_emo/dict/speaker_dict.txt
@@ -0,0 +1,6 @@
+F7
+F74
+FBYN
+FRXL
+M7
+xiaoyu
diff --git a/voices/zhitian_emo/dict/sy_dict.txt b/voices/zhitian_emo/dict/sy_dict.txt
new file mode 100755
index 0000000..ec54511
--- /dev/null
+++ b/voices/zhitian_emo/dict/sy_dict.txt
@@ -0,0 +1,144 @@
+a_c
+ai_c
+an_c
+ang_c
+ao_c
+b_c
+c_c
+ch_c
+d_c
+e_c
+ei_c
+en_c
+eng_c
+er_c
+f_c
+g_c
+h_c
+i_c
+ia_c
+ian_c
+iang_c
+iao_c
+ie_c
+ih_c
+ii_c
+in_c
+ing_c
+io_c
+iong_c
+iou_c
+j_c
+k_c
+l_c
+m_c
+n_c
+o_c
+ong_c
+ou_c
+p_c
+q_c
+r_c
+s_c
+sh_c
+t_c
+u_c
+ua_c
+uai_c
+uan_c
+uang_c
+uei_c
+uen_c
+ueng_c
+uo_c
+v_c
+van_c
+ve_c
+vn_c
+xx_c
+z_c
+zh_c
+w_c
+y_c
+ga
+ge
+go
+aa
+ae
+ah
+ao
+aw
+ay
+b
+ch
+d
+dh
+eh
+er
+ey
+f
+g
+hh
+ih
+iy
+jh
+k
+l
+m
+n
+ng
+ow
+oy
+p
+r
+s
+sh
+t
+th
+uh
+uw
+v
+w
+y
+z
+zh
+air_c
+angr_c
+anr_c
+aor_c
+ar_c
+eir_c
+engr_c
+enr_c
+iangr_c
+ianr_c
+iaor_c
+iar_c
+ier_c
+ihr_c
+iir_c
+ingr_c
+inr_c
+iongr_c
+iour_c
+ir_c
+ongr_c
+or_c
+our_c
+uair_c
+uangr_c
+uanr_c
+uar_c
+ueir_c
+uenr_c
+uor_c
+ur_c
+vanr_c
+ver_c
+vnr_c
+vr_c
+pau
+#1
+#2
+#3
+#4
\ No newline at end of file
diff --git a/voices/zhitian_emo/dict/syllable_flag_dict.txt b/voices/zhitian_emo/dict/syllable_flag_dict.txt
new file mode 100755
index 0000000..84a4d14
--- /dev/null
+++ b/voices/zhitian_emo/dict/syllable_flag_dict.txt
@@ -0,0 +1,5 @@
+s_begin
+s_end
+s_none
+s_both
+s_middle
diff --git a/voices/zhitian_emo/dict/tone_dict.txt b/voices/zhitian_emo/dict/tone_dict.txt
new file mode 100755
index 0000000..7af26ed
--- /dev/null
+++ b/voices/zhitian_emo/dict/tone_dict.txt
@@ -0,0 +1,7 @@
+tone1
+tone_none
+tone4
+tone2
+tone3
+tone5
+tone0
diff --git a/voices/zhitian_emo/dict/word_segment_dict.txt b/voices/zhitian_emo/dict/word_segment_dict.txt
new file mode 100755
index 0000000..667bcf9
--- /dev/null
+++ b/voices/zhitian_emo/dict/word_segment_dict.txt
@@ -0,0 +1,5 @@
+word_begin
+word_end
+word_middle
+word_both
+word_none
diff --git a/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth b/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..d3d253f
--- /dev/null
+++ b/voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:37ae58b77e73f1d32fda9be354662116c415803f0e9961d8c8fb935ccbcc7ada
+size 19594437
diff --git a/voices/zhitian_emo/voc/config.yaml b/voices/zhitian_emo/voc/config.yaml
new file mode 100644
index 0000000..e9853c3
--- /dev/null
+++ b/voices/zhitian_emo/voc/config.yaml
@@ -0,0 +1,188 @@
+model_type: hifigan
+Model:
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+  Generator:
+    params:
+      in_channels: 80                       
+      out_channels: 1                      
+      channels: 256                       
+      kernel_size: 7                     
+      upsample_scales: [10, 5, 2, 2]        
+      upsample_kernal_sizes: [20, 11, 4, 4] 
+      resblock_kernel_sizes: [3, 7, 11]     
+      resblock_dilations:                  
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+      bias: true                           
+      causal: true                           
+      nonlinear_activation: "LeakyReLU"    
+      nonlinear_activation_params:         
+        negative_slope: 0.1
+      use_weight_norm: true               
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+  MultiScaleDiscriminator:
+    params:
+      scales: 3                              
+      downsample_pooling: "DWT"  
+      downsample_pooling_params:
+          kernel_size: 4                    
+          stride: 2                         
+          padding: 2                        
+      discriminator_params:
+          in_channels: 1                     
+          out_channels: 1                    
+          kernel_sizes: [15, 41, 5, 3]       
+          channels: 128                      
+          max_downsample_channels: 1024     
+          max_groups: 16                   
+          bias: true
+          downsample_scales: [4, 4, 4, 4, 1]
+          nonlinear_activation: "LeakyReLU"  
+          nonlinear_activation_params:
+            negative_slope: 0.1
+      follow_official_norm: true    
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+  MultiPeriodDiscriminator:
+    params:
+      periods: [2, 3, 5, 7, 11]      
+      discriminator_params:
+        in_channels: 1                  
+        out_channels: 1                  
+        kernel_sizes: [5, 3]              
+        channels: 32                       
+        downsample_scales: [3, 3, 3, 3, 1] 
+        max_downsample_channels: 1024      
+        bias: true                       
+        nonlinear_activation: "LeakyReLU"  
+        nonlinear_activation_params:       
+          negative_slope: 0.1
+        use_spectral_norm: false           
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  generator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  discriminator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  stft_loss:
+    enable: False             # Whether to use multi-resolution STFT loss.
+
+  mel_loss:
+    enable: True
+    params:
+      fs: 16000
+      fft_size: 2048
+      hop_size: 200
+      win_length: 1000
+      window: "hann"
+      num_mels: 80
+      fmin: 0
+      fmax: 8000
+      log_base: null
+    weights: 45.0
+
+  subband_stft_loss:
+    enable: False
+    params:
+      fft_sizes: [384, 683, 171]  # List of FFT sizes for STFT-based loss.
+      hop_sizes: [35, 75, 15]     # List of hop sizes for STFT-based loss.
+      win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
+      window: "hann_window"       # Window function for STFT-based loss.
+
+  feat_match_loss:
+    enable: True
+    params:
+      average_by_discriminators: false 
+      average_by_layers: false         
+    weights: 2.0
+
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 16              
+batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
+pin_memory: True            
+num_workers: 2 # FIXME: setting this > 0 may get stuck on macOS
+remove_short_samples: False 
+allow_cache: True           
+
+generator_grad_norm: -1
+
+discriminator_grad_norm: -1
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+generator_train_start_steps: 1     # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhitian_emo/vocoder/pytorch_model.bin b/voices/zhitian_emo/vocoder/pytorch_model.bin
new file mode 100644
index 0000000..2805881
--- /dev/null
+++ b/voices/zhitian_emo/vocoder/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f2a1e65b70ea89a5f45b38d3d04d3d843c25eaed6c9805346fd7180af08c3a0a
+size 19613277
diff --git a/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth b/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..882645f
--- /dev/null
+++ b/voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:38980015065a60486f5c77204e7f20329bd6d0c54ac3ab40e6df3642520a5b5a
+size 49315631
diff --git a/voices/zhiyan_emo/am/config.yaml b/voices/zhiyan_emo/am/config.yaml
new file mode 100644
index 0000000..56a980a
--- /dev/null
+++ b/voices/zhiyan_emo/am/config.yaml
@@ -0,0 +1,105 @@
+model_type: sambert
+Model:
+#########################################################
+#         SAMBERT NETWORK ARCHITECTURE SETTING          #
+#########################################################
+  KanTtsSAMBERT:
+    params:
+        max_len: 800
+
+        embedding_dim: 512 
+        encoder_num_layers: 8
+        encoder_num_heads: 8
+        encoder_num_units: 128
+        encoder_ffn_inner_dim: 1024
+        encoder_dropout: 0.1
+        encoder_attention_dropout: 0.1
+        encoder_relu_dropout: 0.1
+        encoder_projection_units: 32
+
+        speaker_units: 32
+        emotion_units: 32
+
+        predictor_filter_size: 41
+        predictor_fsmn_num_layers: 3
+        predictor_num_memory_units: 128
+        predictor_ffn_inner_dim: 256
+        predictor_dropout: 0.1
+        predictor_shift: 0
+        predictor_lstm_units: 128
+        dur_pred_prenet_units: [128, 128]
+        dur_pred_lstm_units: 128
+
+        decoder_prenet_units: [256, 256]
+        decoder_num_layers: 12
+        decoder_num_heads: 8
+        decoder_num_units: 128
+        decoder_ffn_inner_dim: 1024
+        decoder_dropout: 0.1
+        decoder_attention_dropout: 0.1
+        decoder_relu_dropout: 0.1
+
+        outputs_per_step: 3
+        num_mels: 80
+
+        postnet_filter_size: 41
+        postnet_fsmn_num_layers: 4
+        postnet_num_memory_units: 256
+        postnet_ffn_inner_dim: 512
+        postnet_dropout: 0.1
+        postnet_shift: 17
+        postnet_lstm_units: 128
+        MAS: False
+
+    optimizer:
+      type: Adam
+      params:
+        lr: 0.001
+        betas: [0.9, 0.98]
+        eps: 1.0e-9
+        weight_decay: 0.0
+    scheduler:
+      type: NoamLR
+      params:
+        warmup_steps: 4000
+
+linguistic_unit: 
+  cleaners: english_cleaners
+  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
+  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  MelReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+  ProsodyReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 32              
+pin_memory: False            
+num_workers: 4 # FIXME: setting this > 0 may get stuck on macOS
+remove_short_samples: False 
+allow_cache: True           
+grad_norm: 1.0
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+train_max_steps: 1000000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhiyan_emo/am/pytorch_model.bin b/voices/zhiyan_emo/am/pytorch_model.bin
new file mode 100644
index 0000000..45313fb
--- /dev/null
+++ b/voices/zhiyan_emo/am/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f497d5cfa4082ecbe965f624d5ea81c1a8ab781f30b7394b32664d647446e4e
+size 49234411
diff --git a/voices/zhiyan_emo/audio_config.yaml b/voices/zhiyan_emo/audio_config.yaml
new file mode 100644
index 0000000..233817c
--- /dev/null
+++ b/voices/zhiyan_emo/audio_config.yaml
@@ -0,0 +1,27 @@
+# Audio processing configs
+
+audio_config:
+  # Preprocess
+  wav_normalize: True
+  trim_silence: True
+  trim_silence_threshold_db: 60
+  preemphasize: False
+
+  # Feature extraction
+  sampling_rate: 16000
+  hop_length: 200
+  win_length: 1000
+  n_fft: 2048
+  n_mels: 80
+  fmin: 0.0
+  fmax: 8000.0
+  phone_level_feature: True
+
+  # Normalization
+  norm_type: "mean_std"  # "mean_std" or "global"
+  max_norm: 1.0
+  symmetric: False
+  min_level_db: -100.0
+  ref_level_db: 20
+  
+  num_workers: 16
diff --git a/voices/zhiyan_emo/dict/emo_category_dict.txt b/voices/zhiyan_emo/dict/emo_category_dict.txt
new file mode 100755
index 0000000..dfd88e8
--- /dev/null
+++ b/voices/zhiyan_emo/dict/emo_category_dict.txt
@@ -0,0 +1,33 @@
+emotion_none
+emotion_neutral
+emotion_angry
+emotion_disgust
+emotion_fear
+emotion_happy
+emotion_sad
+emotion_surprise
+emotion_calm
+emotion_gentle
+emotion_relax
+emotion_lyrical
+emotion_serious
+emotion_disgruntled
+emotion_satisfied
+emotion_disappointed
+emotion_excited
+emotion_anxiety
+emotion_jealousy
+emotion_hate
+emotion_pity
+emotion_pleasure
+emotion_arousal
+emotion_dominance
+emotion_placeholder1
+emotion_placeholder2
+emotion_placeholder3
+emotion_placeholder4
+emotion_placeholder5
+emotion_placeholder6
+emotion_placeholder7
+emotion_placeholder8
+emotion_placeholder9
\ No newline at end of file
diff --git a/voices/zhiyan_emo/dict/speaker_dict.txt b/voices/zhiyan_emo/dict/speaker_dict.txt
new file mode 100755
index 0000000..af0ca1d
--- /dev/null
+++ b/voices/zhiyan_emo/dict/speaker_dict.txt
@@ -0,0 +1,6 @@
+F7
+F74
+FBYN
+FRXL
+M7
+xiaoyu
diff --git a/voices/zhiyan_emo/dict/sy_dict.txt b/voices/zhiyan_emo/dict/sy_dict.txt
new file mode 100755
index 0000000..ec54511
--- /dev/null
+++ b/voices/zhiyan_emo/dict/sy_dict.txt
@@ -0,0 +1,144 @@
+a_c
+ai_c
+an_c
+ang_c
+ao_c
+b_c
+c_c
+ch_c
+d_c
+e_c
+ei_c
+en_c
+eng_c
+er_c
+f_c
+g_c
+h_c
+i_c
+ia_c
+ian_c
+iang_c
+iao_c
+ie_c
+ih_c
+ii_c
+in_c
+ing_c
+io_c
+iong_c
+iou_c
+j_c
+k_c
+l_c
+m_c
+n_c
+o_c
+ong_c
+ou_c
+p_c
+q_c
+r_c
+s_c
+sh_c
+t_c
+u_c
+ua_c
+uai_c
+uan_c
+uang_c
+uei_c
+uen_c
+ueng_c
+uo_c
+v_c
+van_c
+ve_c
+vn_c
+xx_c
+z_c
+zh_c
+w_c
+y_c
+ga
+ge
+go
+aa
+ae
+ah
+ao
+aw
+ay
+b
+ch
+d
+dh
+eh
+er
+ey
+f
+g
+hh
+ih
+iy
+jh
+k
+l
+m
+n
+ng
+ow
+oy
+p
+r
+s
+sh
+t
+th
+uh
+uw
+v
+w
+y
+z
+zh
+air_c
+angr_c
+anr_c
+aor_c
+ar_c
+eir_c
+engr_c
+enr_c
+iangr_c
+ianr_c
+iaor_c
+iar_c
+ier_c
+ihr_c
+iir_c
+ingr_c
+inr_c
+iongr_c
+iour_c
+ir_c
+ongr_c
+or_c
+our_c
+uair_c
+uangr_c
+uanr_c
+uar_c
+ueir_c
+uenr_c
+uor_c
+ur_c
+vanr_c
+ver_c
+vnr_c
+vr_c
+pau
+#1
+#2
+#3
+#4
\ No newline at end of file
diff --git a/voices/zhiyan_emo/dict/syllable_flag_dict.txt b/voices/zhiyan_emo/dict/syllable_flag_dict.txt
new file mode 100755
index 0000000..84a4d14
--- /dev/null
+++ b/voices/zhiyan_emo/dict/syllable_flag_dict.txt
@@ -0,0 +1,5 @@
+s_begin
+s_end
+s_none
+s_both
+s_middle
diff --git a/voices/zhiyan_emo/dict/tone_dict.txt b/voices/zhiyan_emo/dict/tone_dict.txt
new file mode 100755
index 0000000..7af26ed
--- /dev/null
+++ b/voices/zhiyan_emo/dict/tone_dict.txt
@@ -0,0 +1,7 @@
+tone1
+tone_none
+tone4
+tone2
+tone3
+tone5
+tone0
diff --git a/voices/zhiyan_emo/dict/word_segment_dict.txt b/voices/zhiyan_emo/dict/word_segment_dict.txt
new file mode 100755
index 0000000..667bcf9
--- /dev/null
+++ b/voices/zhiyan_emo/dict/word_segment_dict.txt
@@ -0,0 +1,5 @@
+word_begin
+word_end
+word_middle
+word_both
+word_none
diff --git a/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..bd2113d
--- /dev/null
+++ b/voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8e1eb72c06fb8cc0dd876b287430088757f6efe4ec44e0f9fff3a7e40a762c52
+size 19594437
diff --git a/voices/zhiyan_emo/voc/config.yaml b/voices/zhiyan_emo/voc/config.yaml
new file mode 100644
index 0000000..e9853c3
--- /dev/null
+++ b/voices/zhiyan_emo/voc/config.yaml
@@ -0,0 +1,188 @@
+model_type: hifigan
+Model:
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+  Generator:
+    params:
+      in_channels: 80                       
+      out_channels: 1                      
+      channels: 256                       
+      kernel_size: 7                     
+      upsample_scales: [10, 5, 2, 2]        
+      upsample_kernal_sizes: [20, 11, 4, 4] 
+      resblock_kernel_sizes: [3, 7, 11]     
+      resblock_dilations:                  
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+      bias: true                           
+      causal: true                           
+      nonlinear_activation: "LeakyReLU"    
+      nonlinear_activation_params:         
+        negative_slope: 0.1
+      use_weight_norm: true               
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+  MultiScaleDiscriminator:
+    params:
+      scales: 3                              
+      downsample_pooling: "DWT"  
+      downsample_pooling_params:
+          kernel_size: 4                    
+          stride: 2                         
+          padding: 2                        
+      discriminator_params:
+          in_channels: 1                     
+          out_channels: 1                    
+          kernel_sizes: [15, 41, 5, 3]       
+          channels: 128                      
+          max_downsample_channels: 1024     
+          max_groups: 16                   
+          bias: true
+          downsample_scales: [4, 4, 4, 4, 1]
+          nonlinear_activation: "LeakyReLU"  
+          nonlinear_activation_params:
+            negative_slope: 0.1
+      follow_official_norm: true    
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+  MultiPeriodDiscriminator:
+    params:
+      periods: [2, 3, 5, 7, 11]      
+      discriminator_params:
+        in_channels: 1                  
+        out_channels: 1                  
+        kernel_sizes: [5, 3]              
+        channels: 32                       
+        downsample_scales: [3, 3, 3, 3, 1] 
+        max_downsample_channels: 1024      
+        bias: true                       
+        nonlinear_activation: "LeakyReLU"  
+        nonlinear_activation_params:       
+          negative_slope: 0.1
+        use_spectral_norm: false           
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  generator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  discriminator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  stft_loss:
+    enable: False             # Whether to use multi-resolution STFT loss.
+
+  mel_loss:
+    enable: True
+    params:
+      fs: 16000
+      fft_size: 2048
+      hop_size: 200
+      win_length: 1000
+      window: "hann"
+      num_mels: 80
+      fmin: 0
+      fmax: 8000
+      log_base: null
+    weights: 45.0
+
+  subband_stft_loss:
+    enable: False
+    params:
+      fft_sizes: [384, 683, 171]  # List of FFT sizes for STFT-based loss.
+      hop_sizes: [35, 75, 15]     # List of hop sizes for STFT-based loss.
+      win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
+      window: "hann_window"       # Window function for STFT-based loss.
+
+  feat_match_loss:
+    enable: True
+    params:
+      average_by_discriminators: false 
+      average_by_layers: false         
+    weights: 2.0
+
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 16              
+batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
+pin_memory: True            
+num_workers: 2 # FIXME: setting this > 0 may get stuck on macOS
+remove_short_samples: False 
+allow_cache: True           
+
+generator_grad_norm: -1
+
+discriminator_grad_norm: -1
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+generator_train_start_steps: 1     # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhiyan_emo/vocoder/pytorch_model.bin b/voices/zhiyan_emo/vocoder/pytorch_model.bin
new file mode 100644
index 0000000..94859b6
--- /dev/null
+++ b/voices/zhiyan_emo/vocoder/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d054c61986470740ed7c715bae1977468da993b7f5cca21da1245f759cbc3cec
+size 19613277
diff --git a/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth b/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..5e48bb3
--- /dev/null
+++ b/voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:002731ec3a90b51d7683d13477ed74545fd7092a552899e0d676eb8473d0dbbe
+size 49315631
diff --git a/voices/zhizhe_emo/am/config.yaml b/voices/zhizhe_emo/am/config.yaml
new file mode 100644
index 0000000..56a980a
--- /dev/null
+++ b/voices/zhizhe_emo/am/config.yaml
@@ -0,0 +1,105 @@
+model_type: sambert
+Model:
+#########################################################
+#         SAMBERT NETWORK ARCHITECTURE SETTING          #
+#########################################################
+  KanTtsSAMBERT:
+    params:
+        max_len: 800
+
+        embedding_dim: 512 
+        encoder_num_layers: 8
+        encoder_num_heads: 8
+        encoder_num_units: 128
+        encoder_ffn_inner_dim: 1024
+        encoder_dropout: 0.1
+        encoder_attention_dropout: 0.1
+        encoder_relu_dropout: 0.1
+        encoder_projection_units: 32
+
+        speaker_units: 32
+        emotion_units: 32
+
+        predictor_filter_size: 41
+        predictor_fsmn_num_layers: 3
+        predictor_num_memory_units: 128
+        predictor_ffn_inner_dim: 256
+        predictor_dropout: 0.1
+        predictor_shift: 0
+        predictor_lstm_units: 128
+        dur_pred_prenet_units: [128, 128]
+        dur_pred_lstm_units: 128
+
+        decoder_prenet_units: [256, 256]
+        decoder_num_layers: 12
+        decoder_num_heads: 8
+        decoder_num_units: 128
+        decoder_ffn_inner_dim: 1024
+        decoder_dropout: 0.1
+        decoder_attention_dropout: 0.1
+        decoder_relu_dropout: 0.1
+
+        outputs_per_step: 3
+        num_mels: 80
+
+        postnet_filter_size: 41
+        postnet_fsmn_num_layers: 4
+        postnet_num_memory_units: 256
+        postnet_ffn_inner_dim: 512
+        postnet_dropout: 0.1
+        postnet_shift: 17
+        postnet_lstm_units: 128
+        MAS: False
+
+    optimizer:
+      type: Adam
+      params:
+        lr: 0.001
+        betas: [0.9, 0.98]
+        eps: 1.0e-9
+        weight_decay: 0.0
+    scheduler:
+      type: NoamLR
+      params:
+        warmup_steps: 4000
+
+linguistic_unit: 
+  cleaners: english_cleaners
+  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
+  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  MelReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+  ProsodyReconLoss:
+    enable: True
+    params:
+      loss_type: mae
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 32              
+pin_memory: False            
+num_workers: 4 # FIXME: setting this > 0 may get stuck on macOS
+remove_short_samples: False 
+allow_cache: True           
+grad_norm: 1.0
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+train_max_steps: 1000000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhizhe_emo/am/pytorch_model.bin b/voices/zhizhe_emo/am/pytorch_model.bin
new file mode 100644
index 0000000..88077cf
--- /dev/null
+++ b/voices/zhizhe_emo/am/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:83eb76bc45d0251fc1e609fd1c2e7ae337403165e56b1091b3dc2591b7e8520e
+size 49234411
diff --git a/voices/zhizhe_emo/audio_config.yaml b/voices/zhizhe_emo/audio_config.yaml
new file mode 100644
index 0000000..233817c
--- /dev/null
+++ b/voices/zhizhe_emo/audio_config.yaml
@@ -0,0 +1,27 @@
+# Audio processing configs
+
+audio_config:
+  # Preprocess
+  wav_normalize: True
+  trim_silence: True
+  trim_silence_threshold_db: 60
+  preemphasize: False
+
+  # Feature extraction
+  sampling_rate: 16000
+  hop_length: 200
+  win_length: 1000
+  n_fft: 2048
+  n_mels: 80
+  fmin: 0.0
+  fmax: 8000.0
+  phone_level_feature: True
+
+  # Normalization
+  norm_type: "mean_std"  # "mean_std" or "global"
+  max_norm: 1.0
+  symmetric: False
+  min_level_db: -100.0
+  ref_level_db: 20
+  
+  num_workers: 16
diff --git a/voices/zhizhe_emo/dict/emo_category_dict.txt b/voices/zhizhe_emo/dict/emo_category_dict.txt
new file mode 100755
index 0000000..dfd88e8
--- /dev/null
+++ b/voices/zhizhe_emo/dict/emo_category_dict.txt
@@ -0,0 +1,33 @@
+emotion_none
+emotion_neutral
+emotion_angry
+emotion_disgust
+emotion_fear
+emotion_happy
+emotion_sad
+emotion_surprise
+emotion_calm
+emotion_gentle
+emotion_relax
+emotion_lyrical
+emotion_serious
+emotion_disgruntled
+emotion_satisfied
+emotion_disappointed
+emotion_excited
+emotion_anxiety
+emotion_jealousy
+emotion_hate
+emotion_pity
+emotion_pleasure
+emotion_arousal
+emotion_dominance
+emotion_placeholder1
+emotion_placeholder2
+emotion_placeholder3
+emotion_placeholder4
+emotion_placeholder5
+emotion_placeholder6
+emotion_placeholder7
+emotion_placeholder8
+emotion_placeholder9
\ No newline at end of file
diff --git a/voices/zhizhe_emo/dict/speaker_dict.txt b/voices/zhizhe_emo/dict/speaker_dict.txt
new file mode 100755
index 0000000..af0ca1d
--- /dev/null
+++ b/voices/zhizhe_emo/dict/speaker_dict.txt
@@ -0,0 +1,6 @@
+F7
+F74
+FBYN
+FRXL
+M7
+xiaoyu
diff --git a/voices/zhizhe_emo/dict/sy_dict.txt b/voices/zhizhe_emo/dict/sy_dict.txt
new file mode 100755
index 0000000..ec54511
--- /dev/null
+++ b/voices/zhizhe_emo/dict/sy_dict.txt
@@ -0,0 +1,144 @@
+a_c
+ai_c
+an_c
+ang_c
+ao_c
+b_c
+c_c
+ch_c
+d_c
+e_c
+ei_c
+en_c
+eng_c
+er_c
+f_c
+g_c
+h_c
+i_c
+ia_c
+ian_c
+iang_c
+iao_c
+ie_c
+ih_c
+ii_c
+in_c
+ing_c
+io_c
+iong_c
+iou_c
+j_c
+k_c
+l_c
+m_c
+n_c
+o_c
+ong_c
+ou_c
+p_c
+q_c
+r_c
+s_c
+sh_c
+t_c
+u_c
+ua_c
+uai_c
+uan_c
+uang_c
+uei_c
+uen_c
+ueng_c
+uo_c
+v_c
+van_c
+ve_c
+vn_c
+xx_c
+z_c
+zh_c
+w_c
+y_c
+ga
+ge
+go
+aa
+ae
+ah
+ao
+aw
+ay
+b
+ch
+d
+dh
+eh
+er
+ey
+f
+g
+hh
+ih
+iy
+jh
+k
+l
+m
+n
+ng
+ow
+oy
+p
+r
+s
+sh
+t
+th
+uh
+uw
+v
+w
+y
+z
+zh
+air_c
+angr_c
+anr_c
+aor_c
+ar_c
+eir_c
+engr_c
+enr_c
+iangr_c
+ianr_c
+iaor_c
+iar_c
+ier_c
+ihr_c
+iir_c
+ingr_c
+inr_c
+iongr_c
+iour_c
+ir_c
+ongr_c
+or_c
+our_c
+uair_c
+uangr_c
+uanr_c
+uar_c
+ueir_c
+uenr_c
+uor_c
+ur_c
+vanr_c
+ver_c
+vnr_c
+vr_c
+pau
+#1
+#2
+#3
+#4
\ No newline at end of file
diff --git a/voices/zhizhe_emo/dict/syllable_flag_dict.txt b/voices/zhizhe_emo/dict/syllable_flag_dict.txt
new file mode 100755
index 0000000..84a4d14
--- /dev/null
+++ b/voices/zhizhe_emo/dict/syllable_flag_dict.txt
@@ -0,0 +1,5 @@
+s_begin
+s_end
+s_none
+s_both
+s_middle
diff --git a/voices/zhizhe_emo/dict/tone_dict.txt b/voices/zhizhe_emo/dict/tone_dict.txt
new file mode 100755
index 0000000..7af26ed
--- /dev/null
+++ b/voices/zhizhe_emo/dict/tone_dict.txt
@@ -0,0 +1,7 @@
+tone1
+tone_none
+tone4
+tone2
+tone3
+tone5
+tone0
diff --git a/voices/zhizhe_emo/dict/word_segment_dict.txt b/voices/zhizhe_emo/dict/word_segment_dict.txt
new file mode 100755
index 0000000..667bcf9
--- /dev/null
+++ b/voices/zhizhe_emo/dict/word_segment_dict.txt
@@ -0,0 +1,5 @@
+word_begin
+word_end
+word_middle
+word_both
+word_none
diff --git a/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth b/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
new file mode 100644
index 0000000..b06f78b
--- /dev/null
+++ b/voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31d837f8dc54d1c80762cba2fc05cddd8a6a466beab3744e102658446f303831
+size 19594437
diff --git a/voices/zhizhe_emo/voc/config.yaml b/voices/zhizhe_emo/voc/config.yaml
new file mode 100644
index 0000000..e9853c3
--- /dev/null
+++ b/voices/zhizhe_emo/voc/config.yaml
@@ -0,0 +1,188 @@
+model_type: hifigan
+Model:
+###########################################################
+#         GENERATOR NETWORK ARCHITECTURE SETTING          #
+###########################################################
+  Generator:
+    params:
+      in_channels: 80                       
+      out_channels: 1                      
+      channels: 256                       
+      kernel_size: 7                     
+      upsample_scales: [10, 5, 2, 2]        
+      upsample_kernal_sizes: [20, 11, 4, 4] 
+      resblock_kernel_sizes: [3, 7, 11]     
+      resblock_dilations:                  
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+            - [1, 3, 5, 7]
+      bias: true                           
+      causal: true                           
+      nonlinear_activation: "LeakyReLU"    
+      nonlinear_activation_params:         
+        negative_slope: 0.1
+      use_weight_norm: true               
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+###########################################################
+#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
+###########################################################
+  MultiScaleDiscriminator:
+    params:
+      scales: 3                              
+      downsample_pooling: "DWT"  
+      downsample_pooling_params:
+          kernel_size: 4                    
+          stride: 2                         
+          padding: 2                        
+      discriminator_params:
+          in_channels: 1                     
+          out_channels: 1                    
+          kernel_sizes: [15, 41, 5, 3]       
+          channels: 128                      
+          max_downsample_channels: 1024     
+          max_groups: 16                   
+          bias: true
+          downsample_scales: [4, 4, 4, 4, 1]
+          nonlinear_activation: "LeakyReLU"  
+          nonlinear_activation_params:
+            negative_slope: 0.1
+      follow_official_norm: true    
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+  MultiPeriodDiscriminator:
+    params:
+      periods: [2, 3, 5, 7, 11]      
+      discriminator_params:
+        in_channels: 1                  
+        out_channels: 1                  
+        kernel_sizes: [5, 3]              
+        channels: 32                       
+        downsample_scales: [3, 3, 3, 3, 1] 
+        max_downsample_channels: 1024      
+        bias: true                       
+        nonlinear_activation: "LeakyReLU"  
+        nonlinear_activation_params:       
+          negative_slope: 0.1
+        use_spectral_norm: false           
+    optimizer:
+      type: Adam
+      params:
+        lr: 2.0e-4
+        betas: [0.5, 0.9]
+        weight_decay: 0.0
+    scheduler:
+      type: MultiStepLR
+      params:
+        gamma: 0.5
+        milestones:
+            - 200000
+            - 400000
+            - 600000
+            - 800000
+
+####################################################
+#                   LOSS SETTING                   #
+####################################################
+Loss:
+  generator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  discriminator_adv_loss:
+    enable: True
+    params:
+      average_by_discriminators: False
+    weights: 1.0
+
+  stft_loss:
+    enable: False             # Whether to use multi-resolution STFT loss.
+
+  mel_loss:
+    enable: True
+    params:
+      fs: 16000
+      fft_size: 2048
+      hop_size: 200
+      win_length: 1000
+      window: "hann"
+      num_mels: 80
+      fmin: 0
+      fmax: 8000
+      log_base: null
+    weights: 45.0
+
+  subband_stft_loss:
+    enable: False
+    params:
+      fft_sizes: [384, 683, 171]  # List of FFT sizes for STFT-based loss.
+      hop_sizes: [35, 75, 15]     # List of hop sizes for STFT-based loss.
+      win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
+      window: "hann_window"       # Window function for STFT-based loss.
+
+  feat_match_loss:
+    enable: True
+    params:
+      average_by_discriminators: false 
+      average_by_layers: false         
+    weights: 2.0
+
+
+###########################################################
+#                  DATA LOADER SETTING                    #
+###########################################################
+batch_size: 16              
+batch_max_steps: 9600       # Length of each audio in batch. Make sure it is divisible by hop_size.
+pin_memory: True            
+num_workers: 2 # FIXME: setting this > 0 may hang on macOS
+remove_short_samples: False 
+allow_cache: True           
+
+generator_grad_norm: -1
+
+discriminator_grad_norm: -1
+
+###########################################################
+#                    INTERVAL SETTING                     #
+###########################################################
+generator_train_start_steps: 1     # Number of steps to start to train generator.
+discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
+train_max_steps: 2500000           # Number of training steps.
+save_interval_steps: 20000         # Interval steps to save checkpoint.
+eval_interval_steps: 10000          # Interval steps to evaluate the network.
+log_interval_steps: 1000            # Interval steps to record the training log.
+
+###########################################################
+#                     OTHER SETTING                       #
+###########################################################
+num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.
diff --git a/voices/zhizhe_emo/vocoder/pytorch_model.bin b/voices/zhizhe_emo/vocoder/pytorch_model.bin
new file mode 100644
index 0000000..8e0f5f4
--- /dev/null
+++ b/voices/zhizhe_emo/vocoder/pytorch_model.bin
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7df13ca0946a193b277a08e2a518cc97a58c636d1eaa34743acedf6731c199e3
+size 19613277