From d64726031f02479989a815b6ef205d14a4528b4a Mon Sep 17 00:00:00 2001
From: "jiaqi.sjq" <jiaqi.sjq@alibaba-inc.com>
Date: Mon, 19 Sep 2022 15:46:21 +0800
Subject: [PATCH] [Update] add PyTorch acoustic model (am) config

---
 configuration.json | 132 +++++++++++++++++++++++----------------------
 voices.zip         |   4 +-
 2 files changed, 69 insertions(+), 67 deletions(-)

diff --git a/configuration.json b/configuration.json
index b637e3c..4660430 100644
--- a/configuration.json
+++ b/configuration.json
@@ -1,87 +1,89 @@
 {
-  "framework": "tensorflow",
+  "framework": "pytorch",
   "task" : "text-to-speech",
   "model" : {
     "type" : "sambert-hifigan",
     "lang_type" : "zhcn",
     "sample_rate" : 16000,
     "am": {
-      "cleaners":"english_cleaners",
+      "am": {
+        "max_len": 800,
 
-      "num_mels":80,
-      "sample_rate":16000,
-      "frame_shift_ms":12.5,
+        "embedding_dim": 512,
+        "encoder_num_layers": 8,
+        "encoder_num_heads": 8,
+        "encoder_num_units": 128,
+        "encoder_ffn_inner_dim": 1024,
+        "encoder_dropout": 0.1,
+        "encoder_attention_dropout": 0.1,
+        "encoder_relu_dropout": 0.1,
+        "encoder_projection_units": 32,
 
-      
-      "embedding_dim":512,
-      "encoder_n_conv_layers":3,
-      "encoder_filters":256,
-      "encoder_kernel_size":5,
+        "speaker_units": 32,
+        "emotion_units": 32,
 
-      "encoder_num_layers":8,
-      "encoder_num_units":128,
-      "encoder_num_heads":8,
-      "encoder_ffn_inner_dim":1024,
-      "encoder_dropout":0.1,
-      "encoder_attention_dropout":0.1,
-      "encoder_relu_dropout":0.1,
-      "encoder_projection_units":32,
+        "predictor_filter_size": 41,
+        "predictor_fsmn_num_layers": 3,
+        "predictor_num_memory_units": 128,
+        "predictor_ffn_inner_dim": 256,
+        "predictor_dropout": 0.1,
+        "predictor_shift": 0,
+        "predictor_lstm_units": 128,
+        "dur_pred_prenet_units": [128, 128],
+        "dur_pred_lstm_units": 128,
 
-      "predictor_filter_size":41,
-      "predictor_fsmn_num_layers":3,
-      "predictor_dnn_num_layers":0,
-      "predictor_num_memory_units":128,
-      "predictor_ffn_inner_dim":256,
-      "predictor_dropout":0.1,
-      "predictor_shift":0,
+        "decoder_prenet_units": [256, 256],
+        "decoder_num_layers": 12,
+        "decoder_num_heads": 8,
+        "decoder_num_units": 128,
+        "decoder_ffn_inner_dim": 1024,
+        "decoder_dropout": 0.1,
+        "decoder_attention_dropout": 0.1,
+        "decoder_relu_dropout": 0.1,
 
-      "predictor_prenet_units":[128, 128],
-      "predictor_lstm_units":128,
+        "outputs_per_step": 3,
+        "num_mels": 80,
 
-      "prenet_units":[256, 256],
-      "prenet_proj_units":128,
+        "postnet_filter_size": 41,
+        "postnet_fsmn_num_layers": 4,
+        "postnet_num_memory_units": 256,
+        "postnet_ffn_inner_dim": 512,
+        "postnet_dropout": 0.1,
+        "postnet_shift": 17,
+        "postnet_lstm_units": 128
 
-      "decoder_num_layers":12,
-      "decoder_num_units":128,
-      "decoder_num_heads":8,
-      "decoder_ffn_inner_dim":1024,
-      "decoder_dropout":0.1,
-      "decoder_attention_dropout":0.1,
-      "decoder_relu_dropout":0.1,
+      },
 
-      "outputs_per_step":3,
+      "audio": {
+        "frame_shift_ms": 12.5
+      },
 
-      "postnet_filter_size":41,
-      "postnet_fsmn_num_layers":4,
-      "postnet_dnn_num_layers":0,
-      "postnet_num_memory_units":256,
-      "postnet_ffn_inner_dim":512,
-      "postnet_dropout":0.1,
-      "postnet_shift":17,
-      "postnet_lstm_units":128, 
+      "linguistic_unit": {
+        "cleaners": "english_cleaners",
+        "lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category",
+        "sy": "dict/sy_dict.txt",
+        "tone": "dict/tone_dict.txt",
+        "syllable_flag": "dict/syllable_flag_dict.txt",
+        "word_segment": "dict/word_segment_dict.txt",
+        "emo_category": "dict/emo_category_dict.txt",
+        "speaker_category": "dict/speaker_dict.txt"
+      },
 
-      
-      "dur_scale":1.0,
+      "num_gpus": 1,
+      "batch_size": 32,
+      "group_size": 1024,
+      "learning_rate": 0.001,
+      "adam_b1": 0.9,
+      "adam_b2": 0.98,
+      "seed": 1234,
 
-      "batch_size":32,
-      "adam_beta1":0.9,
-      "adam_beta2":0.999,
-      "initial_learning_rate":0.002,
-      "decay_learning_rate":true,
-      "use_cmudict":false,                 
+      "num_workers": 4,
 
-      "lfeat_type_list":"sy,tone,syllable_flag,word_segment,emo_category,speaker",
-
-      "guided_attention":false,
-      "guided_attention_2g_squared":0.08,
-      "guided_attention_loss_weight":1.0,
-
-      "free_run":false,
-
-      "X_band_width":40,
-      "H_band_width":40,
-
-      "max_len":900
+      "dist_config": {
+        "dist_backend": "nccl",
+        "dist_url": "tcp://localhost:11111",
+        "world_size": 1
+      }
     },
     "vocoder" : {
       "resblock": "1",
diff --git a/voices.zip b/voices.zip
index 7c231c9..324bb84 100644
--- a/voices.zip
+++ b/voices.zip
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:3121f43f9f39860aca5ba30e007debc88864b08514a69bfb5d8f46d834eed8c7
-size 605733980
+oid sha256:971d4e23fc2cb4c93d5cdb321220cd94810d25f6a424b3dca0ee35aa7cb88ba6
+size 255948880