mirror of
https://www.modelscope.cn/iic/speech_sambert-hifigan_tts_zh-cn_16k.git
synced 2026-04-02 18:32:53 +08:00
update
This commit is contained in:
BIN
voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
105
voices/zhiyan_emo/am/config.yaml
Normal file
105
voices/zhiyan_emo/am/config.yaml
Normal file
@ -0,0 +1,105 @@
|
||||
model_type: sambert
|
||||
Model:
|
||||
#########################################################
|
||||
# SAMBERT NETWORK ARCHITECTURE SETTING #
|
||||
#########################################################
|
||||
KanTtsSAMBERT:
|
||||
params:
|
||||
max_len: 800
|
||||
|
||||
embedding_dim: 512
|
||||
encoder_num_layers: 8
|
||||
encoder_num_heads: 8
|
||||
encoder_num_units: 128
|
||||
encoder_ffn_inner_dim: 1024
|
||||
encoder_dropout: 0.1
|
||||
encoder_attention_dropout: 0.1
|
||||
encoder_relu_dropout: 0.1
|
||||
encoder_projection_units: 32
|
||||
|
||||
speaker_units: 32
|
||||
emotion_units: 32
|
||||
|
||||
predictor_filter_size: 41
|
||||
predictor_fsmn_num_layers: 3
|
||||
predictor_num_memory_units: 128
|
||||
predictor_ffn_inner_dim: 256
|
||||
predictor_dropout: 0.1
|
||||
predictor_shift: 0
|
||||
predictor_lstm_units: 128
|
||||
dur_pred_prenet_units: [128, 128]
|
||||
dur_pred_lstm_units: 128
|
||||
|
||||
decoder_prenet_units: [256, 256]
|
||||
decoder_num_layers: 12
|
||||
decoder_num_heads: 8
|
||||
decoder_num_units: 128
|
||||
decoder_ffn_inner_dim: 1024
|
||||
decoder_dropout: 0.1
|
||||
decoder_attention_dropout: 0.1
|
||||
decoder_relu_dropout: 0.1
|
||||
|
||||
outputs_per_step: 3
|
||||
num_mels: 80
|
||||
|
||||
postnet_filter_size: 41
|
||||
postnet_fsmn_num_layers: 4
|
||||
postnet_num_memory_units: 256
|
||||
postnet_ffn_inner_dim: 512
|
||||
postnet_dropout: 0.1
|
||||
postnet_shift: 17
|
||||
postnet_lstm_units: 128
|
||||
MAS: False
|
||||
|
||||
optimizer:
|
||||
type: Adam
|
||||
params:
|
||||
lr: 0.001
|
||||
betas: [0.9, 0.98]
|
||||
eps: 1.0e-9
|
||||
weight_decay: 0.0
|
||||
scheduler:
|
||||
type: NoamLR
|
||||
params:
|
||||
warmup_steps: 4000
|
||||
|
||||
linguistic_unit:
|
||||
cleaners: english_cleaners
|
||||
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
|
||||
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
|
||||
####################################################
|
||||
# LOSS SETTING #
|
||||
####################################################
|
||||
Loss:
|
||||
MelReconLoss:
|
||||
enable: True
|
||||
params:
|
||||
loss_type: mae
|
||||
|
||||
ProsodyReconLoss:
|
||||
enable: True
|
||||
params:
|
||||
loss_type: mae
|
||||
|
||||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 32
|
||||
pin_memory: False
|
||||
num_workers: 4 # FIXME: setting this > 0 may hang on macOS
|
||||
remove_short_samples: False
|
||||
allow_cache: True
|
||||
grad_norm: 1.0
|
||||
|
||||
###########################################################
|
||||
# INTERVAL SETTING #
|
||||
###########################################################
|
||||
train_max_steps: 1000000 # Number of training steps.
|
||||
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||
BIN
voices/zhiyan_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
27
voices/zhiyan_emo/audio_config.yaml
Normal file
27
voices/zhiyan_emo/audio_config.yaml
Normal file
@ -0,0 +1,27 @@
|
||||
# Audio processing configs
|
||||
|
||||
audio_config:
|
||||
# Preprocess
|
||||
wav_normalize: True
|
||||
trim_silence: True
|
||||
trim_silence_threshold_db: 60
|
||||
preemphasize: False
|
||||
|
||||
# Feature extraction
|
||||
sampling_rate: 16000
|
||||
hop_length: 200
|
||||
win_length: 1000
|
||||
n_fft: 2048
|
||||
n_mels: 80
|
||||
fmin: 0.0
|
||||
fmax: 8000.0
|
||||
phone_level_feature: True
|
||||
|
||||
# Normalization
|
||||
norm_type: "mean_std" # "mean_std" or "global"
|
||||
max_norm: 1.0
|
||||
symmetric: False
|
||||
min_level_db: -100.0
|
||||
ref_level_db: 20
|
||||
|
||||
num_workers: 16
|
||||
33
voices/zhiyan_emo/dict/emo_category_dict.txt
Executable file
33
voices/zhiyan_emo/dict/emo_category_dict.txt
Executable file
@ -0,0 +1,33 @@
|
||||
emotion_none
|
||||
emotion_neutral
|
||||
emotion_angry
|
||||
emotion_disgust
|
||||
emotion_fear
|
||||
emotion_happy
|
||||
emotion_sad
|
||||
emotion_surprise
|
||||
emotion_calm
|
||||
emotion_gentle
|
||||
emotion_relax
|
||||
emotion_lyrical
|
||||
emotion_serious
|
||||
emotion_disgruntled
|
||||
emotion_satisfied
|
||||
emotion_disappointed
|
||||
emotion_excited
|
||||
emotion_anxiety
|
||||
emotion_jealousy
|
||||
emotion_hate
|
||||
emotion_pity
|
||||
emotion_pleasure
|
||||
emotion_arousal
|
||||
emotion_dominance
|
||||
emotion_placeholder1
|
||||
emotion_placeholder2
|
||||
emotion_placeholder3
|
||||
emotion_placeholder4
|
||||
emotion_placeholder5
|
||||
emotion_placeholder6
|
||||
emotion_placeholder7
|
||||
emotion_placeholder8
|
||||
emotion_placeholder9
|
||||
6
voices/zhiyan_emo/dict/speaker_dict.txt
Executable file
6
voices/zhiyan_emo/dict/speaker_dict.txt
Executable file
@ -0,0 +1,6 @@
|
||||
F7
|
||||
F74
|
||||
FBYN
|
||||
FRXL
|
||||
M7
|
||||
xiaoyu
|
||||
144
voices/zhiyan_emo/dict/sy_dict.txt
Executable file
144
voices/zhiyan_emo/dict/sy_dict.txt
Executable file
@ -0,0 +1,144 @@
|
||||
a_c
|
||||
ai_c
|
||||
an_c
|
||||
ang_c
|
||||
ao_c
|
||||
b_c
|
||||
c_c
|
||||
ch_c
|
||||
d_c
|
||||
e_c
|
||||
ei_c
|
||||
en_c
|
||||
eng_c
|
||||
er_c
|
||||
f_c
|
||||
g_c
|
||||
h_c
|
||||
i_c
|
||||
ia_c
|
||||
ian_c
|
||||
iang_c
|
||||
iao_c
|
||||
ie_c
|
||||
ih_c
|
||||
ii_c
|
||||
in_c
|
||||
ing_c
|
||||
io_c
|
||||
iong_c
|
||||
iou_c
|
||||
j_c
|
||||
k_c
|
||||
l_c
|
||||
m_c
|
||||
n_c
|
||||
o_c
|
||||
ong_c
|
||||
ou_c
|
||||
p_c
|
||||
q_c
|
||||
r_c
|
||||
s_c
|
||||
sh_c
|
||||
t_c
|
||||
u_c
|
||||
ua_c
|
||||
uai_c
|
||||
uan_c
|
||||
uang_c
|
||||
uei_c
|
||||
uen_c
|
||||
ueng_c
|
||||
uo_c
|
||||
v_c
|
||||
van_c
|
||||
ve_c
|
||||
vn_c
|
||||
xx_c
|
||||
z_c
|
||||
zh_c
|
||||
w_c
|
||||
y_c
|
||||
ga
|
||||
ge
|
||||
go
|
||||
aa
|
||||
ae
|
||||
ah
|
||||
ao
|
||||
aw
|
||||
ay
|
||||
b
|
||||
ch
|
||||
d
|
||||
dh
|
||||
eh
|
||||
er
|
||||
ey
|
||||
f
|
||||
g
|
||||
hh
|
||||
ih
|
||||
iy
|
||||
jh
|
||||
k
|
||||
l
|
||||
m
|
||||
n
|
||||
ng
|
||||
ow
|
||||
oy
|
||||
p
|
||||
r
|
||||
s
|
||||
sh
|
||||
t
|
||||
th
|
||||
uh
|
||||
uw
|
||||
v
|
||||
w
|
||||
y
|
||||
z
|
||||
zh
|
||||
air_c
|
||||
angr_c
|
||||
anr_c
|
||||
aor_c
|
||||
ar_c
|
||||
eir_c
|
||||
engr_c
|
||||
enr_c
|
||||
iangr_c
|
||||
ianr_c
|
||||
iaor_c
|
||||
iar_c
|
||||
ier_c
|
||||
ihr_c
|
||||
iir_c
|
||||
ingr_c
|
||||
inr_c
|
||||
iongr_c
|
||||
iour_c
|
||||
ir_c
|
||||
ongr_c
|
||||
or_c
|
||||
our_c
|
||||
uair_c
|
||||
uangr_c
|
||||
uanr_c
|
||||
uar_c
|
||||
ueir_c
|
||||
uenr_c
|
||||
uor_c
|
||||
ur_c
|
||||
vanr_c
|
||||
ver_c
|
||||
vnr_c
|
||||
vr_c
|
||||
pau
|
||||
#1
|
||||
#2
|
||||
#3
|
||||
#4
|
||||
5
voices/zhiyan_emo/dict/syllable_flag_dict.txt
Executable file
5
voices/zhiyan_emo/dict/syllable_flag_dict.txt
Executable file
@ -0,0 +1,5 @@
|
||||
s_begin
|
||||
s_end
|
||||
s_none
|
||||
s_both
|
||||
s_middle
|
||||
7
voices/zhiyan_emo/dict/tone_dict.txt
Executable file
7
voices/zhiyan_emo/dict/tone_dict.txt
Executable file
@ -0,0 +1,7 @@
|
||||
tone1
|
||||
tone_none
|
||||
tone4
|
||||
tone2
|
||||
tone3
|
||||
tone5
|
||||
tone0
|
||||
5
voices/zhiyan_emo/dict/word_segment_dict.txt
Executable file
5
voices/zhiyan_emo/dict/word_segment_dict.txt
Executable file
@ -0,0 +1,5 @@
|
||||
word_begin
|
||||
word_end
|
||||
word_middle
|
||||
word_both
|
||||
word_none
|
||||
BIN
voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
188
voices/zhiyan_emo/voc/config.yaml
Normal file
188
voices/zhiyan_emo/voc/config.yaml
Normal file
@ -0,0 +1,188 @@
|
||||
model_type: hifigan
|
||||
Model:
|
||||
###########################################################
|
||||
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
Generator:
|
||||
params:
|
||||
in_channels: 80
|
||||
out_channels: 1
|
||||
channels: 256
|
||||
kernel_size: 7
|
||||
upsample_scales: [10, 5, 2, 2]
|
||||
upsample_kernal_sizes: [20, 11, 4, 4]
|
||||
resblock_kernel_sizes: [3, 7, 11]
|
||||
resblock_dilations:
|
||||
- [1, 3, 5, 7]
|
||||
- [1, 3, 5, 7]
|
||||
- [1, 3, 5, 7]
|
||||
bias: true
|
||||
causal: true
|
||||
nonlinear_activation: "LeakyReLU"
|
||||
nonlinear_activation_params:
|
||||
negative_slope: 0.1
|
||||
use_weight_norm: true
|
||||
optimizer:
|
||||
type: Adam
|
||||
params:
|
||||
lr: 2.0e-4
|
||||
betas: [0.5, 0.9]
|
||||
weight_decay: 0.0
|
||||
scheduler:
|
||||
type: MultiStepLR
|
||||
params:
|
||||
gamma: 0.5
|
||||
milestones:
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
|
||||
###########################################################
|
||||
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||
###########################################################
|
||||
MultiScaleDiscriminator:
|
||||
params:
|
||||
scales: 3
|
||||
downsample_pooling: "DWT"
|
||||
downsample_pooling_params:
|
||||
kernel_size: 4
|
||||
stride: 2
|
||||
padding: 2
|
||||
discriminator_params:
|
||||
in_channels: 1
|
||||
out_channels: 1
|
||||
kernel_sizes: [15, 41, 5, 3]
|
||||
channels: 128
|
||||
max_downsample_channels: 1024
|
||||
max_groups: 16
|
||||
bias: true
|
||||
downsample_scales: [4, 4, 4, 4, 1]
|
||||
nonlinear_activation: "LeakyReLU"
|
||||
nonlinear_activation_params:
|
||||
negative_slope: 0.1
|
||||
follow_official_norm: true
|
||||
optimizer:
|
||||
type: Adam
|
||||
params:
|
||||
lr: 2.0e-4
|
||||
betas: [0.5, 0.9]
|
||||
weight_decay: 0.0
|
||||
scheduler:
|
||||
type: MultiStepLR
|
||||
params:
|
||||
gamma: 0.5
|
||||
milestones:
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
|
||||
MultiPeriodDiscriminator:
|
||||
params:
|
||||
periods: [2, 3, 5, 7, 11]
|
||||
discriminator_params:
|
||||
in_channels: 1
|
||||
out_channels: 1
|
||||
kernel_sizes: [5, 3]
|
||||
channels: 32
|
||||
downsample_scales: [3, 3, 3, 3, 1]
|
||||
max_downsample_channels: 1024
|
||||
bias: true
|
||||
nonlinear_activation: "LeakyReLU"
|
||||
nonlinear_activation_params:
|
||||
negative_slope: 0.1
|
||||
use_spectral_norm: false
|
||||
optimizer:
|
||||
type: Adam
|
||||
params:
|
||||
lr: 2.0e-4
|
||||
betas: [0.5, 0.9]
|
||||
weight_decay: 0.0
|
||||
scheduler:
|
||||
type: MultiStepLR
|
||||
params:
|
||||
gamma: 0.5
|
||||
milestones:
|
||||
- 200000
|
||||
- 400000
|
||||
- 600000
|
||||
- 800000
|
||||
|
||||
####################################################
|
||||
# LOSS SETTING #
|
||||
####################################################
|
||||
Loss:
|
||||
generator_adv_loss:
|
||||
enable: True
|
||||
params:
|
||||
average_by_discriminators: False
|
||||
weights: 1.0
|
||||
|
||||
discriminator_adv_loss:
|
||||
enable: True
|
||||
params:
|
||||
average_by_discriminators: False
|
||||
weights: 1.0
|
||||
|
||||
stft_loss:
|
||||
enable: False # Whether to use multi-resolution STFT loss.
|
||||
|
||||
mel_loss:
|
||||
enable: True
|
||||
params:
|
||||
fs: 16000
|
||||
fft_size: 2048
|
||||
hop_size: 200
|
||||
win_length: 1000
|
||||
window: "hann"
|
||||
num_mels: 80
|
||||
fmin: 0
|
||||
fmax: 8000
|
||||
log_base: null
|
||||
weights: 45.0
|
||||
|
||||
subband_stft_loss:
|
||||
enable: False
|
||||
params:
|
||||
fft_sizes: [384, 683, 171] # List of FFT sizes for STFT-based loss.
|
||||
hop_sizes: [35, 75, 15] # List of hop sizes for STFT-based loss.
|
||||
win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
|
||||
window: "hann_window" # Window function for STFT-based loss
|
||||
|
||||
feat_match_loss:
|
||||
enable: True
|
||||
params:
|
||||
average_by_discriminators: false
|
||||
average_by_layers: false
|
||||
weights: 2.0
|
||||
|
||||
|
||||
###########################################################
|
||||
# DATA LOADER SETTING #
|
||||
###########################################################
|
||||
batch_size: 16
|
||||
batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
||||
pin_memory: True
|
||||
num_workers: 2 # FIXME: setting this > 0 may hang on macOS
|
||||
remove_short_samples: False
|
||||
allow_cache: True
|
||||
|
||||
generator_grad_norm: -1
|
||||
|
||||
discriminator_grad_norm: -1
|
||||
|
||||
###########################################################
|
||||
# INTERVAL SETTING #
|
||||
###########################################################
|
||||
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
||||
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||
train_max_steps: 2500000 # Number of training steps.
|
||||
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||
|
||||
###########################################################
|
||||
# OTHER SETTING #
|
||||
###########################################################
|
||||
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||
BIN
voices/zhiyan_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
Reference in New Issue
Block a user