# mirror of https://www.modelscope.cn/iic/speech_sambert-hifigan_tts_zh-cn_16k.git
# synced 2026-04-02 10:22:54 +08:00
# 106 lines, 3.2 KiB, YAML
model_type: sambert

Model:
  #########################################################
  #        SAMBERT NETWORK ARCHITECTURE SETTING           #
  #########################################################
  KanTtsSAMBERT:
    params:
      max_len: 800

      # Text encoder (multi-head self-attention stack).
      embedding_dim: 512
      encoder_num_layers: 8
      encoder_num_heads: 8
      encoder_num_units: 128
      encoder_ffn_inner_dim: 1024
      encoder_dropout: 0.1
      encoder_attention_dropout: 0.1
      encoder_relu_dropout: 0.1
      encoder_projection_units: 32

      # Speaker / emotion embedding sizes.
      speaker_units: 32
      emotion_units: 32

      # Variance predictor (FSMN + LSTM).
      predictor_filter_size: 41
      predictor_fsmn_num_layers: 3
      predictor_num_memory_units: 128
      predictor_ffn_inner_dim: 256
      predictor_dropout: 0.1
      predictor_shift: 0
      predictor_lstm_units: 128
      dur_pred_prenet_units: [128, 128]
      dur_pred_lstm_units: 128

      # Mel decoder (PNCA attention stack).
      decoder_prenet_units: [256, 256]
      decoder_num_layers: 12
      decoder_num_heads: 8
      decoder_num_units: 128
      decoder_ffn_inner_dim: 1024
      decoder_dropout: 0.1
      decoder_attention_dropout: 0.1
      decoder_relu_dropout: 0.1

      # Frames emitted per decoder step and mel spectrogram bins.
      outputs_per_step: 3
      num_mels: 80

      # Post-net (FSMN refinement of decoder output).
      postnet_filter_size: 41
      postnet_fsmn_num_layers: 4
      postnet_num_memory_units: 256
      postnet_ffn_inner_dim: 512
      postnet_dropout: 0.1
      postnet_shift: 17
      postnet_lstm_units: 128
      MAS: false

    optimizer:
      type: Adam
      params:
        lr: 0.001
        betas: [0.9, 0.98]
        eps: 1.0e-9
        weight_decay: 0.0
    scheduler:
      type: NoamLR
      params:
        warmup_steps: 4000

# Front-end linguistic feature definition.
linguistic_unit:
  cleaners: english_cleaners
  lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
  speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu

####################################################
#                  LOSS SETTING                    #
####################################################
Loss:
  MelReconLoss:
    enable: true
    params:
      loss_type: mae

  ProsodyReconLoss:
    enable: true
    params:
      loss_type: mae

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 32
pin_memory: false
num_workers: 4  # FIXME: set > 0 may stuck on macos
remove_short_samples: false
allow_cache: true
grad_norm: 1.0

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
train_max_steps: 1000000     # Number of training steps.
save_interval_steps: 20000   # Interval steps to save checkpoint.
eval_interval_steps: 10000   # Interval steps to evaluate the network.
log_interval_steps: 1000     # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.