From d64726031f02479989a815b6ef205d14a4528b4a Mon Sep 17 00:00:00 2001 From: "jiaqi.sjq" Date: Mon, 19 Sep 2022 15:46:21 +0800 Subject: [PATCH] [Update] add pytorch am --- configuration.json | 132 +++++++++++++++++++++++---------------------- voices.zip | 4 +- 2 files changed, 69 insertions(+), 67 deletions(-) diff --git a/configuration.json b/configuration.json index b637e3c..4660430 100644 --- a/configuration.json +++ b/configuration.json @@ -1,87 +1,89 @@ { - "framework": "tensorflow", + "framework": "pytorch", "task" : "text-to-speech", "model" : { "type" : "sambert-hifigan", "lang_type" : "zhcn", "sample_rate" : 16000, "am": { - "cleaners":"english_cleaners", + "am": { + "max_len": 800, - "num_mels":80, - "sample_rate":16000, - "frame_shift_ms":12.5, + "embedding_dim": 512, + "encoder_num_layers": 8, + "encoder_num_heads": 8, + "encoder_num_units": 128, + "encoder_ffn_inner_dim": 1024, + "encoder_dropout": 0.1, + "encoder_attention_dropout": 0.1, + "encoder_relu_dropout": 0.1, + "encoder_projection_units": 32, - - "embedding_dim":512, - "encoder_n_conv_layers":3, - "encoder_filters":256, - "encoder_kernel_size":5, + "speaker_units": 32, + "emotion_units": 32, - "encoder_num_layers":8, - "encoder_num_units":128, - "encoder_num_heads":8, - "encoder_ffn_inner_dim":1024, - "encoder_dropout":0.1, - "encoder_attention_dropout":0.1, - "encoder_relu_dropout":0.1, - "encoder_projection_units":32, + "predictor_filter_size": 41, + "predictor_fsmn_num_layers": 3, + "predictor_num_memory_units": 128, + "predictor_ffn_inner_dim": 256, + "predictor_dropout": 0.1, + "predictor_shift": 0, + "predictor_lstm_units": 128, + "dur_pred_prenet_units": [128, 128], + "dur_pred_lstm_units": 128, - "predictor_filter_size":41, - "predictor_fsmn_num_layers":3, - "predictor_dnn_num_layers":0, - "predictor_num_memory_units":128, - "predictor_ffn_inner_dim":256, - "predictor_dropout":0.1, - "predictor_shift":0, + "decoder_prenet_units": [256, 256], + "decoder_num_layers": 12, + "decoder_num_heads": 8, + "decoder_num_units": 128, + "decoder_ffn_inner_dim": 1024, + "decoder_dropout": 0.1, + "decoder_attention_dropout": 0.1, + "decoder_relu_dropout": 0.1, - "predictor_prenet_units":[128, 128], - "predictor_lstm_units":128, + "outputs_per_step": 3, + "num_mels": 80, - "prenet_units":[256, 256], - "prenet_proj_units":128, + "postnet_filter_size": 41, + "postnet_fsmn_num_layers": 4, + "postnet_num_memory_units": 256, + "postnet_ffn_inner_dim": 512, + "postnet_dropout": 0.1, + "postnet_shift": 17, + "postnet_lstm_units": 128 - "decoder_num_layers":12, - "decoder_num_units":128, - "decoder_num_heads":8, - "decoder_ffn_inner_dim":1024, - "decoder_dropout":0.1, - "decoder_attention_dropout":0.1, - "decoder_relu_dropout":0.1, + }, - "outputs_per_step":3, + "audio": { + "frame_shift_ms": 12.5 + }, - "postnet_filter_size":41, - "postnet_fsmn_num_layers":4, - "postnet_dnn_num_layers":0, - "postnet_num_memory_units":256, - "postnet_ffn_inner_dim":512, - "postnet_dropout":0.1, - "postnet_shift":17, - "postnet_lstm_units":128, + "linguistic_unit": { + "cleaners": "english_cleaners", + "lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category", + "sy": "dict/sy_dict.txt", + "tone": "dict/tone_dict.txt", + "syllable_flag": "dict/syllable_flag_dict.txt", + "word_segment": "dict/word_segment_dict.txt", + "emo_category": "dict/emo_category_dict.txt", + "speaker_category": "dict/speaker_dict.txt" + }, - - "dur_scale":1.0, + "num_gpus": 1, + "batch_size": 32, + "group_size": 1024, + "learning_rate": 0.001, + "adam_b1": 0.9, + "adam_b2": 0.98, + "seed": 1234, - "batch_size":32, - "adam_beta1":0.9, - "adam_beta2":0.999, - "initial_learning_rate":0.002, - "decay_learning_rate":true, - "use_cmudict":false, + "num_workers": 4, - "lfeat_type_list":"sy,tone,syllable_flag,word_segment,emo_category,speaker", - - "guided_attention":false, - "guided_attention_2g_squared":0.08, - "guided_attention_loss_weight":1.0, - - "free_run":false, - - "X_band_width":40, - "H_band_width":40, - - "max_len":900 + "dist_config": { + "dist_backend": "nccl", + "dist_url": "tcp://localhost:11111", + "world_size": 1 + } }, "vocoder" : { "resblock": "1", diff --git a/voices.zip b/voices.zip index 7c231c9..324bb84 100644 --- a/voices.zip +++ b/voices.zip @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3121f43f9f39860aca5ba30e007debc88864b08514a69bfb5d8f46d834eed8c7 -size 605733980 +oid sha256:971d4e23fc2cb4c93d5cdb321220cd94810d25f6a424b3dca0ee35aa7cb88ba6 +size 255948880