mirror of
https://www.modelscope.cn/iic/speech_sambert-hifigan_tts_zh-cn_16k.git
synced 2026-04-02 18:32:53 +08:00
update
This commit is contained in:
8
voices/voices.json
Normal file
8
voices/voices.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"voices": [
|
||||||
|
"zhitian_emo",
|
||||||
|
"zhibei_emo",
|
||||||
|
"zhizhe_emo",
|
||||||
|
"zhiyan_emo"
|
||||||
|
]
|
||||||
|
}
|
||||||
BIN
voices/zhibei_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhibei_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
105
voices/zhibei_emo/am/config.yaml
Normal file
105
voices/zhibei_emo/am/config.yaml
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
model_type: sambert
|
||||||
|
Model:
|
||||||
|
#########################################################
|
||||||
|
# SAMBERT NETWORK ARCHITECTURE SETTING #
|
||||||
|
#########################################################
|
||||||
|
KanTtsSAMBERT:
|
||||||
|
params:
|
||||||
|
max_len: 800
|
||||||
|
|
||||||
|
embedding_dim: 512
|
||||||
|
encoder_num_layers: 8
|
||||||
|
encoder_num_heads: 8
|
||||||
|
encoder_num_units: 128
|
||||||
|
encoder_ffn_inner_dim: 1024
|
||||||
|
encoder_dropout: 0.1
|
||||||
|
encoder_attention_dropout: 0.1
|
||||||
|
encoder_relu_dropout: 0.1
|
||||||
|
encoder_projection_units: 32
|
||||||
|
|
||||||
|
speaker_units: 32
|
||||||
|
emotion_units: 32
|
||||||
|
|
||||||
|
predictor_filter_size: 41
|
||||||
|
predictor_fsmn_num_layers: 3
|
||||||
|
predictor_num_memory_units: 128
|
||||||
|
predictor_ffn_inner_dim: 256
|
||||||
|
predictor_dropout: 0.1
|
||||||
|
predictor_shift: 0
|
||||||
|
predictor_lstm_units: 128
|
||||||
|
dur_pred_prenet_units: [128, 128]
|
||||||
|
dur_pred_lstm_units: 128
|
||||||
|
|
||||||
|
decoder_prenet_units: [256, 256]
|
||||||
|
decoder_num_layers: 12
|
||||||
|
decoder_num_heads: 8
|
||||||
|
decoder_num_units: 128
|
||||||
|
decoder_ffn_inner_dim: 1024
|
||||||
|
decoder_dropout: 0.1
|
||||||
|
decoder_attention_dropout: 0.1
|
||||||
|
decoder_relu_dropout: 0.1
|
||||||
|
|
||||||
|
outputs_per_step: 3
|
||||||
|
num_mels: 80
|
||||||
|
|
||||||
|
postnet_filter_size: 41
|
||||||
|
postnet_fsmn_num_layers: 4
|
||||||
|
postnet_num_memory_units: 256
|
||||||
|
postnet_ffn_inner_dim: 512
|
||||||
|
postnet_dropout: 0.1
|
||||||
|
postnet_shift: 17
|
||||||
|
postnet_lstm_units: 128
|
||||||
|
MAS: False
|
||||||
|
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 0.001
|
||||||
|
betas: [0.9, 0.98]
|
||||||
|
eps: 1.0e-9
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: NoamLR
|
||||||
|
params:
|
||||||
|
warmup_steps: 4000
|
||||||
|
|
||||||
|
linguistic_unit:
|
||||||
|
cleaners: english_cleaners
|
||||||
|
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
|
||||||
|
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
MelReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
ProsodyReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 32
|
||||||
|
pin_memory: False
|
||||||
|
num_workers: 4 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
grad_norm: 1.0
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
train_max_steps: 1000000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhibei_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhibei_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
27
voices/zhibei_emo/audio_config.yaml
Normal file
27
voices/zhibei_emo/audio_config.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# Audio processing configs
|
||||||
|
|
||||||
|
audio_config:
|
||||||
|
# Preprocess
|
||||||
|
wav_normalize: True
|
||||||
|
trim_silence: True
|
||||||
|
trim_silence_threshold_db: 60
|
||||||
|
preemphasize: False
|
||||||
|
|
||||||
|
# Feature extraction
|
||||||
|
sampling_rate: 16000
|
||||||
|
hop_length: 200
|
||||||
|
win_length: 1000
|
||||||
|
n_fft: 2048
|
||||||
|
n_mels: 80
|
||||||
|
fmin: 0.0
|
||||||
|
fmax: 8000.0
|
||||||
|
phone_level_feature: True
|
||||||
|
|
||||||
|
# Normalization
|
||||||
|
norm_type: "mean_std" # "mean_std" or "global"
|
||||||
|
max_norm: 1.0
|
||||||
|
symmetric: False
|
||||||
|
min_level_db: -100.0
|
||||||
|
ref_level_db: 20
|
||||||
|
|
||||||
|
num_workers: 16
|
||||||
33
voices/zhibei_emo/dict/emo_category_dict.txt
Executable file
33
voices/zhibei_emo/dict/emo_category_dict.txt
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
emotion_none
|
||||||
|
emotion_neutral
|
||||||
|
emotion_angry
|
||||||
|
emotion_disgust
|
||||||
|
emotion_fear
|
||||||
|
emotion_happy
|
||||||
|
emotion_sad
|
||||||
|
emotion_surprise
|
||||||
|
emotion_calm
|
||||||
|
emotion_gentle
|
||||||
|
emotion_relax
|
||||||
|
emotion_lyrical
|
||||||
|
emotion_serious
|
||||||
|
emotion_disgruntled
|
||||||
|
emotion_satisfied
|
||||||
|
emotion_disappointed
|
||||||
|
emotion_excited
|
||||||
|
emotion_anxiety
|
||||||
|
emotion_jealousy
|
||||||
|
emotion_hate
|
||||||
|
emotion_pity
|
||||||
|
emotion_pleasure
|
||||||
|
emotion_arousal
|
||||||
|
emotion_dominance
|
||||||
|
emotion_placeholder1
|
||||||
|
emotion_placeholder2
|
||||||
|
emotion_placeholder3
|
||||||
|
emotion_placeholder4
|
||||||
|
emotion_placeholder5
|
||||||
|
emotion_placeholder6
|
||||||
|
emotion_placeholder7
|
||||||
|
emotion_placeholder8
|
||||||
|
emotion_placeholder9
|
||||||
6
voices/zhibei_emo/dict/speaker_dict.txt
Executable file
6
voices/zhibei_emo/dict/speaker_dict.txt
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
F7
|
||||||
|
F74
|
||||||
|
FBYN
|
||||||
|
FRXL
|
||||||
|
M7
|
||||||
|
xiaoyu
|
||||||
144
voices/zhibei_emo/dict/sy_dict.txt
Executable file
144
voices/zhibei_emo/dict/sy_dict.txt
Executable file
@ -0,0 +1,144 @@
|
|||||||
|
a_c
|
||||||
|
ai_c
|
||||||
|
an_c
|
||||||
|
ang_c
|
||||||
|
ao_c
|
||||||
|
b_c
|
||||||
|
c_c
|
||||||
|
ch_c
|
||||||
|
d_c
|
||||||
|
e_c
|
||||||
|
ei_c
|
||||||
|
en_c
|
||||||
|
eng_c
|
||||||
|
er_c
|
||||||
|
f_c
|
||||||
|
g_c
|
||||||
|
h_c
|
||||||
|
i_c
|
||||||
|
ia_c
|
||||||
|
ian_c
|
||||||
|
iang_c
|
||||||
|
iao_c
|
||||||
|
ie_c
|
||||||
|
ih_c
|
||||||
|
ii_c
|
||||||
|
in_c
|
||||||
|
ing_c
|
||||||
|
io_c
|
||||||
|
iong_c
|
||||||
|
iou_c
|
||||||
|
j_c
|
||||||
|
k_c
|
||||||
|
l_c
|
||||||
|
m_c
|
||||||
|
n_c
|
||||||
|
o_c
|
||||||
|
ong_c
|
||||||
|
ou_c
|
||||||
|
p_c
|
||||||
|
q_c
|
||||||
|
r_c
|
||||||
|
s_c
|
||||||
|
sh_c
|
||||||
|
t_c
|
||||||
|
u_c
|
||||||
|
ua_c
|
||||||
|
uai_c
|
||||||
|
uan_c
|
||||||
|
uang_c
|
||||||
|
uei_c
|
||||||
|
uen_c
|
||||||
|
ueng_c
|
||||||
|
uo_c
|
||||||
|
v_c
|
||||||
|
van_c
|
||||||
|
ve_c
|
||||||
|
vn_c
|
||||||
|
xx_c
|
||||||
|
z_c
|
||||||
|
zh_c
|
||||||
|
w_c
|
||||||
|
y_c
|
||||||
|
ga
|
||||||
|
ge
|
||||||
|
go
|
||||||
|
aa
|
||||||
|
ae
|
||||||
|
ah
|
||||||
|
ao
|
||||||
|
aw
|
||||||
|
ay
|
||||||
|
b
|
||||||
|
ch
|
||||||
|
d
|
||||||
|
dh
|
||||||
|
eh
|
||||||
|
er
|
||||||
|
ey
|
||||||
|
f
|
||||||
|
g
|
||||||
|
hh
|
||||||
|
ih
|
||||||
|
iy
|
||||||
|
jh
|
||||||
|
k
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
ng
|
||||||
|
ow
|
||||||
|
oy
|
||||||
|
p
|
||||||
|
r
|
||||||
|
s
|
||||||
|
sh
|
||||||
|
t
|
||||||
|
th
|
||||||
|
uh
|
||||||
|
uw
|
||||||
|
v
|
||||||
|
w
|
||||||
|
y
|
||||||
|
z
|
||||||
|
zh
|
||||||
|
air_c
|
||||||
|
angr_c
|
||||||
|
anr_c
|
||||||
|
aor_c
|
||||||
|
ar_c
|
||||||
|
eir_c
|
||||||
|
engr_c
|
||||||
|
enr_c
|
||||||
|
iangr_c
|
||||||
|
ianr_c
|
||||||
|
iaor_c
|
||||||
|
iar_c
|
||||||
|
ier_c
|
||||||
|
ihr_c
|
||||||
|
iir_c
|
||||||
|
ingr_c
|
||||||
|
inr_c
|
||||||
|
iongr_c
|
||||||
|
iour_c
|
||||||
|
ir_c
|
||||||
|
ongr_c
|
||||||
|
or_c
|
||||||
|
our_c
|
||||||
|
uair_c
|
||||||
|
uangr_c
|
||||||
|
uanr_c
|
||||||
|
uar_c
|
||||||
|
ueir_c
|
||||||
|
uenr_c
|
||||||
|
uor_c
|
||||||
|
ur_c
|
||||||
|
vanr_c
|
||||||
|
ver_c
|
||||||
|
vnr_c
|
||||||
|
vr_c
|
||||||
|
pau
|
||||||
|
#1
|
||||||
|
#2
|
||||||
|
#3
|
||||||
|
#4
|
||||||
5
voices/zhibei_emo/dict/syllable_flag_dict.txt
Executable file
5
voices/zhibei_emo/dict/syllable_flag_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
s_begin
|
||||||
|
s_end
|
||||||
|
s_none
|
||||||
|
s_both
|
||||||
|
s_middle
|
||||||
7
voices/zhibei_emo/dict/tone_dict.txt
Executable file
7
voices/zhibei_emo/dict/tone_dict.txt
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
tone1
|
||||||
|
tone_none
|
||||||
|
tone4
|
||||||
|
tone2
|
||||||
|
tone3
|
||||||
|
tone5
|
||||||
|
tone0
|
||||||
5
voices/zhibei_emo/dict/word_segment_dict.txt
Executable file
5
voices/zhibei_emo/dict/word_segment_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
word_begin
|
||||||
|
word_end
|
||||||
|
word_middle
|
||||||
|
word_both
|
||||||
|
word_none
|
||||||
BIN
voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhibei_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
188
voices/zhibei_emo/voc/config.yaml
Normal file
188
voices/zhibei_emo/voc/config.yaml
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
model_type: hifigan
|
||||||
|
Model:
|
||||||
|
###########################################################
|
||||||
|
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
Generator:
|
||||||
|
params:
|
||||||
|
in_channels: 80
|
||||||
|
out_channels: 1
|
||||||
|
channels: 256
|
||||||
|
kernel_size: 7
|
||||||
|
upsample_scales: [10, 5, 2, 2]
|
||||||
|
upsample_kernal_sizes: [20, 11, 4, 4]
|
||||||
|
resblock_kernel_sizes: [3, 7, 11]
|
||||||
|
resblock_dilations:
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
bias: true
|
||||||
|
causal: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_weight_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
MultiScaleDiscriminator:
|
||||||
|
params:
|
||||||
|
scales: 3
|
||||||
|
downsample_pooling: "DWT"
|
||||||
|
downsample_pooling_params:
|
||||||
|
kernel_size: 4
|
||||||
|
stride: 2
|
||||||
|
padding: 2
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [15, 41, 5, 3]
|
||||||
|
channels: 128
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
max_groups: 16
|
||||||
|
bias: true
|
||||||
|
downsample_scales: [4, 4, 4, 4, 1]
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
follow_official_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
MultiPeriodDiscriminator:
|
||||||
|
params:
|
||||||
|
periods: [2, 3, 5, 7, 11]
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [5, 3]
|
||||||
|
channels: 32
|
||||||
|
downsample_scales: [3, 3, 3, 3, 1]
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
bias: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_spectral_norm: false
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
generator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
discriminator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
stft_loss:
|
||||||
|
enable: False # Whether to use multi-resolution STFT loss.
|
||||||
|
|
||||||
|
mel_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
fs: 16000
|
||||||
|
fft_size: 2048
|
||||||
|
hop_size: 200
|
||||||
|
win_length: 1000
|
||||||
|
window: "hann"
|
||||||
|
num_mels: 80
|
||||||
|
fmin: 0
|
||||||
|
fmax: 8000
|
||||||
|
log_base: null
|
||||||
|
weights: 45.0
|
||||||
|
|
||||||
|
subband_stft_loss:
|
||||||
|
enable: False
|
||||||
|
params:
|
||||||
|
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
|
||||||
|
hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
|
||||||
|
win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
|
||||||
|
window: "hann_window" # Window function for STFT-based loss
|
||||||
|
|
||||||
|
feat_match_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: false
|
||||||
|
average_by_layers: false
|
||||||
|
weights: 2.0
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 16
|
||||||
|
batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
||||||
|
pin_memory: True
|
||||||
|
num_workers: 2 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
|
||||||
|
generator_grad_norm: -1
|
||||||
|
|
||||||
|
discriminator_grad_norm: -1
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
||||||
|
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||||
|
train_max_steps: 2500000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhibei_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhibei_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
voices/zhitian_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhitian_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
105
voices/zhitian_emo/am/config.yaml
Normal file
105
voices/zhitian_emo/am/config.yaml
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
model_type: sambert
|
||||||
|
Model:
|
||||||
|
#########################################################
|
||||||
|
# SAMBERT NETWORK ARCHITECTURE SETTING #
|
||||||
|
#########################################################
|
||||||
|
KanTtsSAMBERT:
|
||||||
|
params:
|
||||||
|
max_len: 800
|
||||||
|
|
||||||
|
embedding_dim: 512
|
||||||
|
encoder_num_layers: 8
|
||||||
|
encoder_num_heads: 8
|
||||||
|
encoder_num_units: 128
|
||||||
|
encoder_ffn_inner_dim: 1024
|
||||||
|
encoder_dropout: 0.1
|
||||||
|
encoder_attention_dropout: 0.1
|
||||||
|
encoder_relu_dropout: 0.1
|
||||||
|
encoder_projection_units: 32
|
||||||
|
|
||||||
|
speaker_units: 32
|
||||||
|
emotion_units: 32
|
||||||
|
|
||||||
|
predictor_filter_size: 41
|
||||||
|
predictor_fsmn_num_layers: 3
|
||||||
|
predictor_num_memory_units: 128
|
||||||
|
predictor_ffn_inner_dim: 256
|
||||||
|
predictor_dropout: 0.1
|
||||||
|
predictor_shift: 0
|
||||||
|
predictor_lstm_units: 128
|
||||||
|
dur_pred_prenet_units: [128, 128]
|
||||||
|
dur_pred_lstm_units: 128
|
||||||
|
|
||||||
|
decoder_prenet_units: [256, 256]
|
||||||
|
decoder_num_layers: 12
|
||||||
|
decoder_num_heads: 8
|
||||||
|
decoder_num_units: 128
|
||||||
|
decoder_ffn_inner_dim: 1024
|
||||||
|
decoder_dropout: 0.1
|
||||||
|
decoder_attention_dropout: 0.1
|
||||||
|
decoder_relu_dropout: 0.1
|
||||||
|
|
||||||
|
outputs_per_step: 3
|
||||||
|
num_mels: 80
|
||||||
|
|
||||||
|
postnet_filter_size: 41
|
||||||
|
postnet_fsmn_num_layers: 4
|
||||||
|
postnet_num_memory_units: 256
|
||||||
|
postnet_ffn_inner_dim: 512
|
||||||
|
postnet_dropout: 0.1
|
||||||
|
postnet_shift: 17
|
||||||
|
postnet_lstm_units: 128
|
||||||
|
MAS: False
|
||||||
|
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 0.001
|
||||||
|
betas: [0.9, 0.98]
|
||||||
|
eps: 1.0e-9
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: NoamLR
|
||||||
|
params:
|
||||||
|
warmup_steps: 4000
|
||||||
|
|
||||||
|
linguistic_unit:
|
||||||
|
cleaners: english_cleaners
|
||||||
|
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
|
||||||
|
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
MelReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
ProsodyReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 32
|
||||||
|
pin_memory: False
|
||||||
|
num_workers: 4 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
grad_norm: 1.0
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
train_max_steps: 1000000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhitian_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhitian_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
27
voices/zhitian_emo/audio_config.yaml
Normal file
27
voices/zhitian_emo/audio_config.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# Audio processing configs
|
||||||
|
|
||||||
|
audio_config:
|
||||||
|
# Preprocess
|
||||||
|
wav_normalize: True
|
||||||
|
trim_silence: True
|
||||||
|
trim_silence_threshold_db: 60
|
||||||
|
preemphasize: False
|
||||||
|
|
||||||
|
# Feature extraction
|
||||||
|
sampling_rate: 16000
|
||||||
|
hop_length: 200
|
||||||
|
win_length: 1000
|
||||||
|
n_fft: 2048
|
||||||
|
n_mels: 80
|
||||||
|
fmin: 0.0
|
||||||
|
fmax: 8000.0
|
||||||
|
phone_level_feature: True
|
||||||
|
|
||||||
|
# Normalization
|
||||||
|
norm_type: "mean_std" # "mean_std" or "global"
|
||||||
|
max_norm: 1.0
|
||||||
|
symmetric: False
|
||||||
|
min_level_db: -100.0
|
||||||
|
ref_level_db: 20
|
||||||
|
|
||||||
|
num_workers: 16
|
||||||
33
voices/zhitian_emo/dict/emo_category_dict.txt
Executable file
33
voices/zhitian_emo/dict/emo_category_dict.txt
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
emotion_none
|
||||||
|
emotion_neutral
|
||||||
|
emotion_angry
|
||||||
|
emotion_disgust
|
||||||
|
emotion_fear
|
||||||
|
emotion_happy
|
||||||
|
emotion_sad
|
||||||
|
emotion_surprise
|
||||||
|
emotion_calm
|
||||||
|
emotion_gentle
|
||||||
|
emotion_relax
|
||||||
|
emotion_lyrical
|
||||||
|
emotion_serious
|
||||||
|
emotion_disgruntled
|
||||||
|
emotion_satisfied
|
||||||
|
emotion_disappointed
|
||||||
|
emotion_excited
|
||||||
|
emotion_anxiety
|
||||||
|
emotion_jealousy
|
||||||
|
emotion_hate
|
||||||
|
emotion_pity
|
||||||
|
emotion_pleasure
|
||||||
|
emotion_arousal
|
||||||
|
emotion_dominance
|
||||||
|
emotion_placeholder1
|
||||||
|
emotion_placeholder2
|
||||||
|
emotion_placeholder3
|
||||||
|
emotion_placeholder4
|
||||||
|
emotion_placeholder5
|
||||||
|
emotion_placeholder6
|
||||||
|
emotion_placeholder7
|
||||||
|
emotion_placeholder8
|
||||||
|
emotion_placeholder9
|
||||||
6
voices/zhitian_emo/dict/speaker_dict.txt
Executable file
6
voices/zhitian_emo/dict/speaker_dict.txt
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
F7
|
||||||
|
F74
|
||||||
|
FBYN
|
||||||
|
FRXL
|
||||||
|
M7
|
||||||
|
xiaoyu
|
||||||
144
voices/zhitian_emo/dict/sy_dict.txt
Executable file
144
voices/zhitian_emo/dict/sy_dict.txt
Executable file
@ -0,0 +1,144 @@
|
|||||||
|
a_c
|
||||||
|
ai_c
|
||||||
|
an_c
|
||||||
|
ang_c
|
||||||
|
ao_c
|
||||||
|
b_c
|
||||||
|
c_c
|
||||||
|
ch_c
|
||||||
|
d_c
|
||||||
|
e_c
|
||||||
|
ei_c
|
||||||
|
en_c
|
||||||
|
eng_c
|
||||||
|
er_c
|
||||||
|
f_c
|
||||||
|
g_c
|
||||||
|
h_c
|
||||||
|
i_c
|
||||||
|
ia_c
|
||||||
|
ian_c
|
||||||
|
iang_c
|
||||||
|
iao_c
|
||||||
|
ie_c
|
||||||
|
ih_c
|
||||||
|
ii_c
|
||||||
|
in_c
|
||||||
|
ing_c
|
||||||
|
io_c
|
||||||
|
iong_c
|
||||||
|
iou_c
|
||||||
|
j_c
|
||||||
|
k_c
|
||||||
|
l_c
|
||||||
|
m_c
|
||||||
|
n_c
|
||||||
|
o_c
|
||||||
|
ong_c
|
||||||
|
ou_c
|
||||||
|
p_c
|
||||||
|
q_c
|
||||||
|
r_c
|
||||||
|
s_c
|
||||||
|
sh_c
|
||||||
|
t_c
|
||||||
|
u_c
|
||||||
|
ua_c
|
||||||
|
uai_c
|
||||||
|
uan_c
|
||||||
|
uang_c
|
||||||
|
uei_c
|
||||||
|
uen_c
|
||||||
|
ueng_c
|
||||||
|
uo_c
|
||||||
|
v_c
|
||||||
|
van_c
|
||||||
|
ve_c
|
||||||
|
vn_c
|
||||||
|
xx_c
|
||||||
|
z_c
|
||||||
|
zh_c
|
||||||
|
w_c
|
||||||
|
y_c
|
||||||
|
ga
|
||||||
|
ge
|
||||||
|
go
|
||||||
|
aa
|
||||||
|
ae
|
||||||
|
ah
|
||||||
|
ao
|
||||||
|
aw
|
||||||
|
ay
|
||||||
|
b
|
||||||
|
ch
|
||||||
|
d
|
||||||
|
dh
|
||||||
|
eh
|
||||||
|
er
|
||||||
|
ey
|
||||||
|
f
|
||||||
|
g
|
||||||
|
hh
|
||||||
|
ih
|
||||||
|
iy
|
||||||
|
jh
|
||||||
|
k
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
ng
|
||||||
|
ow
|
||||||
|
oy
|
||||||
|
p
|
||||||
|
r
|
||||||
|
s
|
||||||
|
sh
|
||||||
|
t
|
||||||
|
th
|
||||||
|
uh
|
||||||
|
uw
|
||||||
|
v
|
||||||
|
w
|
||||||
|
y
|
||||||
|
z
|
||||||
|
zh
|
||||||
|
air_c
|
||||||
|
angr_c
|
||||||
|
anr_c
|
||||||
|
aor_c
|
||||||
|
ar_c
|
||||||
|
eir_c
|
||||||
|
engr_c
|
||||||
|
enr_c
|
||||||
|
iangr_c
|
||||||
|
ianr_c
|
||||||
|
iaor_c
|
||||||
|
iar_c
|
||||||
|
ier_c
|
||||||
|
ihr_c
|
||||||
|
iir_c
|
||||||
|
ingr_c
|
||||||
|
inr_c
|
||||||
|
iongr_c
|
||||||
|
iour_c
|
||||||
|
ir_c
|
||||||
|
ongr_c
|
||||||
|
or_c
|
||||||
|
our_c
|
||||||
|
uair_c
|
||||||
|
uangr_c
|
||||||
|
uanr_c
|
||||||
|
uar_c
|
||||||
|
ueir_c
|
||||||
|
uenr_c
|
||||||
|
uor_c
|
||||||
|
ur_c
|
||||||
|
vanr_c
|
||||||
|
ver_c
|
||||||
|
vnr_c
|
||||||
|
vr_c
|
||||||
|
pau
|
||||||
|
#1
|
||||||
|
#2
|
||||||
|
#3
|
||||||
|
#4
|
||||||
5
voices/zhitian_emo/dict/syllable_flag_dict.txt
Executable file
5
voices/zhitian_emo/dict/syllable_flag_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
s_begin
|
||||||
|
s_end
|
||||||
|
s_none
|
||||||
|
s_both
|
||||||
|
s_middle
|
||||||
7
voices/zhitian_emo/dict/tone_dict.txt
Executable file
7
voices/zhitian_emo/dict/tone_dict.txt
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
tone1
|
||||||
|
tone_none
|
||||||
|
tone4
|
||||||
|
tone2
|
||||||
|
tone3
|
||||||
|
tone5
|
||||||
|
tone0
|
||||||
5
voices/zhitian_emo/dict/word_segment_dict.txt
Executable file
5
voices/zhitian_emo/dict/word_segment_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
word_begin
|
||||||
|
word_end
|
||||||
|
word_middle
|
||||||
|
word_both
|
||||||
|
word_none
|
||||||
BIN
voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhitian_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
188
voices/zhitian_emo/voc/config.yaml
Normal file
188
voices/zhitian_emo/voc/config.yaml
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
model_type: hifigan
|
||||||
|
Model:
|
||||||
|
###########################################################
|
||||||
|
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
Generator:
|
||||||
|
params:
|
||||||
|
in_channels: 80
|
||||||
|
out_channels: 1
|
||||||
|
channels: 256
|
||||||
|
kernel_size: 7
|
||||||
|
upsample_scales: [10, 5, 2, 2]
|
||||||
|
upsample_kernal_sizes: [20, 11, 4, 4]
|
||||||
|
resblock_kernel_sizes: [3, 7, 11]
|
||||||
|
resblock_dilations:
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
bias: true
|
||||||
|
causal: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_weight_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
MultiScaleDiscriminator:
|
||||||
|
params:
|
||||||
|
scales: 3
|
||||||
|
downsample_pooling: "DWT"
|
||||||
|
downsample_pooling_params:
|
||||||
|
kernel_size: 4
|
||||||
|
stride: 2
|
||||||
|
padding: 2
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [15, 41, 5, 3]
|
||||||
|
channels: 128
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
max_groups: 16
|
||||||
|
bias: true
|
||||||
|
downsample_scales: [4, 4, 4, 4, 1]
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
follow_official_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
MultiPeriodDiscriminator:
|
||||||
|
params:
|
||||||
|
periods: [2, 3, 5, 7, 11]
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [5, 3]
|
||||||
|
channels: 32
|
||||||
|
downsample_scales: [3, 3, 3, 3, 1]
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
bias: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_spectral_norm: false
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
generator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
discriminator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
stft_loss:
|
||||||
|
enable: False # Whether to use multi-resolution STFT loss.
|
||||||
|
|
||||||
|
mel_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
fs: 16000
|
||||||
|
fft_size: 2048
|
||||||
|
hop_size: 200
|
||||||
|
win_length: 1000
|
||||||
|
window: "hann"
|
||||||
|
num_mels: 80
|
||||||
|
fmin: 0
|
||||||
|
fmax: 8000
|
||||||
|
log_base: null
|
||||||
|
weights: 45.0
|
||||||
|
|
||||||
|
subband_stft_loss:
|
||||||
|
enable: False
|
||||||
|
params:
|
||||||
|
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
|
||||||
|
hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
|
||||||
|
win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
|
||||||
|
window: "hann_window" # Window function for STFT-based loss
|
||||||
|
|
||||||
|
feat_match_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: false
|
||||||
|
average_by_layers: false
|
||||||
|
weights: 2.0
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 16
|
||||||
|
batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
||||||
|
pin_memory: True
|
||||||
|
num_workers: 2 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
|
||||||
|
generator_grad_norm: -1
|
||||||
|
|
||||||
|
discriminator_grad_norm: -1
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
||||||
|
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||||
|
train_max_steps: 2500000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhitian_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhitian_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
105
voices/zhiyan_emo/am/config.yaml
Normal file
105
voices/zhiyan_emo/am/config.yaml
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
model_type: sambert
|
||||||
|
Model:
|
||||||
|
#########################################################
|
||||||
|
# SAMBERT NETWORK ARCHITECTURE SETTING #
|
||||||
|
#########################################################
|
||||||
|
KanTtsSAMBERT:
|
||||||
|
params:
|
||||||
|
max_len: 800
|
||||||
|
|
||||||
|
embedding_dim: 512
|
||||||
|
encoder_num_layers: 8
|
||||||
|
encoder_num_heads: 8
|
||||||
|
encoder_num_units: 128
|
||||||
|
encoder_ffn_inner_dim: 1024
|
||||||
|
encoder_dropout: 0.1
|
||||||
|
encoder_attention_dropout: 0.1
|
||||||
|
encoder_relu_dropout: 0.1
|
||||||
|
encoder_projection_units: 32
|
||||||
|
|
||||||
|
speaker_units: 32
|
||||||
|
emotion_units: 32
|
||||||
|
|
||||||
|
predictor_filter_size: 41
|
||||||
|
predictor_fsmn_num_layers: 3
|
||||||
|
predictor_num_memory_units: 128
|
||||||
|
predictor_ffn_inner_dim: 256
|
||||||
|
predictor_dropout: 0.1
|
||||||
|
predictor_shift: 0
|
||||||
|
predictor_lstm_units: 128
|
||||||
|
dur_pred_prenet_units: [128, 128]
|
||||||
|
dur_pred_lstm_units: 128
|
||||||
|
|
||||||
|
decoder_prenet_units: [256, 256]
|
||||||
|
decoder_num_layers: 12
|
||||||
|
decoder_num_heads: 8
|
||||||
|
decoder_num_units: 128
|
||||||
|
decoder_ffn_inner_dim: 1024
|
||||||
|
decoder_dropout: 0.1
|
||||||
|
decoder_attention_dropout: 0.1
|
||||||
|
decoder_relu_dropout: 0.1
|
||||||
|
|
||||||
|
outputs_per_step: 3
|
||||||
|
num_mels: 80
|
||||||
|
|
||||||
|
postnet_filter_size: 41
|
||||||
|
postnet_fsmn_num_layers: 4
|
||||||
|
postnet_num_memory_units: 256
|
||||||
|
postnet_ffn_inner_dim: 512
|
||||||
|
postnet_dropout: 0.1
|
||||||
|
postnet_shift: 17
|
||||||
|
postnet_lstm_units: 128
|
||||||
|
MAS: False
|
||||||
|
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 0.001
|
||||||
|
betas: [0.9, 0.98]
|
||||||
|
eps: 1.0e-9
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: NoamLR
|
||||||
|
params:
|
||||||
|
warmup_steps: 4000
|
||||||
|
|
||||||
|
linguistic_unit:
|
||||||
|
cleaners: english_cleaners
|
||||||
|
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
|
||||||
|
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
MelReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
ProsodyReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 32
|
||||||
|
pin_memory: False
|
||||||
|
num_workers: 4 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
grad_norm: 1.0
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
train_max_steps: 1000000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhiyan_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
27
voices/zhiyan_emo/audio_config.yaml
Normal file
27
voices/zhiyan_emo/audio_config.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# Audio processing configs
|
||||||
|
|
||||||
|
audio_config:
|
||||||
|
# Preprocess
|
||||||
|
wav_normalize: True
|
||||||
|
trim_silence: True
|
||||||
|
trim_silence_threshold_db: 60
|
||||||
|
preemphasize: False
|
||||||
|
|
||||||
|
# Feature extraction
|
||||||
|
sampling_rate: 16000
|
||||||
|
hop_length: 200
|
||||||
|
win_length: 1000
|
||||||
|
n_fft: 2048
|
||||||
|
n_mels: 80
|
||||||
|
fmin: 0.0
|
||||||
|
fmax: 8000.0
|
||||||
|
phone_level_feature: True
|
||||||
|
|
||||||
|
# Normalization
|
||||||
|
norm_type: "mean_std" # "mean_std" or "global"
|
||||||
|
max_norm: 1.0
|
||||||
|
symmetric: False
|
||||||
|
min_level_db: -100.0
|
||||||
|
ref_level_db: 20
|
||||||
|
|
||||||
|
num_workers: 16
|
||||||
33
voices/zhiyan_emo/dict/emo_category_dict.txt
Executable file
33
voices/zhiyan_emo/dict/emo_category_dict.txt
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
emotion_none
|
||||||
|
emotion_neutral
|
||||||
|
emotion_angry
|
||||||
|
emotion_disgust
|
||||||
|
emotion_fear
|
||||||
|
emotion_happy
|
||||||
|
emotion_sad
|
||||||
|
emotion_surprise
|
||||||
|
emotion_calm
|
||||||
|
emotion_gentle
|
||||||
|
emotion_relax
|
||||||
|
emotion_lyrical
|
||||||
|
emotion_serious
|
||||||
|
emotion_disgruntled
|
||||||
|
emotion_satisfied
|
||||||
|
emotion_disappointed
|
||||||
|
emotion_excited
|
||||||
|
emotion_anxiety
|
||||||
|
emotion_jealousy
|
||||||
|
emotion_hate
|
||||||
|
emotion_pity
|
||||||
|
emotion_pleasure
|
||||||
|
emotion_arousal
|
||||||
|
emotion_dominance
|
||||||
|
emotion_placeholder1
|
||||||
|
emotion_placeholder2
|
||||||
|
emotion_placeholder3
|
||||||
|
emotion_placeholder4
|
||||||
|
emotion_placeholder5
|
||||||
|
emotion_placeholder6
|
||||||
|
emotion_placeholder7
|
||||||
|
emotion_placeholder8
|
||||||
|
emotion_placeholder9
|
||||||
6
voices/zhiyan_emo/dict/speaker_dict.txt
Executable file
6
voices/zhiyan_emo/dict/speaker_dict.txt
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
F7
|
||||||
|
F74
|
||||||
|
FBYN
|
||||||
|
FRXL
|
||||||
|
M7
|
||||||
|
xiaoyu
|
||||||
144
voices/zhiyan_emo/dict/sy_dict.txt
Executable file
144
voices/zhiyan_emo/dict/sy_dict.txt
Executable file
@ -0,0 +1,144 @@
|
|||||||
|
a_c
|
||||||
|
ai_c
|
||||||
|
an_c
|
||||||
|
ang_c
|
||||||
|
ao_c
|
||||||
|
b_c
|
||||||
|
c_c
|
||||||
|
ch_c
|
||||||
|
d_c
|
||||||
|
e_c
|
||||||
|
ei_c
|
||||||
|
en_c
|
||||||
|
eng_c
|
||||||
|
er_c
|
||||||
|
f_c
|
||||||
|
g_c
|
||||||
|
h_c
|
||||||
|
i_c
|
||||||
|
ia_c
|
||||||
|
ian_c
|
||||||
|
iang_c
|
||||||
|
iao_c
|
||||||
|
ie_c
|
||||||
|
ih_c
|
||||||
|
ii_c
|
||||||
|
in_c
|
||||||
|
ing_c
|
||||||
|
io_c
|
||||||
|
iong_c
|
||||||
|
iou_c
|
||||||
|
j_c
|
||||||
|
k_c
|
||||||
|
l_c
|
||||||
|
m_c
|
||||||
|
n_c
|
||||||
|
o_c
|
||||||
|
ong_c
|
||||||
|
ou_c
|
||||||
|
p_c
|
||||||
|
q_c
|
||||||
|
r_c
|
||||||
|
s_c
|
||||||
|
sh_c
|
||||||
|
t_c
|
||||||
|
u_c
|
||||||
|
ua_c
|
||||||
|
uai_c
|
||||||
|
uan_c
|
||||||
|
uang_c
|
||||||
|
uei_c
|
||||||
|
uen_c
|
||||||
|
ueng_c
|
||||||
|
uo_c
|
||||||
|
v_c
|
||||||
|
van_c
|
||||||
|
ve_c
|
||||||
|
vn_c
|
||||||
|
xx_c
|
||||||
|
z_c
|
||||||
|
zh_c
|
||||||
|
w_c
|
||||||
|
y_c
|
||||||
|
ga
|
||||||
|
ge
|
||||||
|
go
|
||||||
|
aa
|
||||||
|
ae
|
||||||
|
ah
|
||||||
|
ao
|
||||||
|
aw
|
||||||
|
ay
|
||||||
|
b
|
||||||
|
ch
|
||||||
|
d
|
||||||
|
dh
|
||||||
|
eh
|
||||||
|
er
|
||||||
|
ey
|
||||||
|
f
|
||||||
|
g
|
||||||
|
hh
|
||||||
|
ih
|
||||||
|
iy
|
||||||
|
jh
|
||||||
|
k
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
ng
|
||||||
|
ow
|
||||||
|
oy
|
||||||
|
p
|
||||||
|
r
|
||||||
|
s
|
||||||
|
sh
|
||||||
|
t
|
||||||
|
th
|
||||||
|
uh
|
||||||
|
uw
|
||||||
|
v
|
||||||
|
w
|
||||||
|
y
|
||||||
|
z
|
||||||
|
zh
|
||||||
|
air_c
|
||||||
|
angr_c
|
||||||
|
anr_c
|
||||||
|
aor_c
|
||||||
|
ar_c
|
||||||
|
eir_c
|
||||||
|
engr_c
|
||||||
|
enr_c
|
||||||
|
iangr_c
|
||||||
|
ianr_c
|
||||||
|
iaor_c
|
||||||
|
iar_c
|
||||||
|
ier_c
|
||||||
|
ihr_c
|
||||||
|
iir_c
|
||||||
|
ingr_c
|
||||||
|
inr_c
|
||||||
|
iongr_c
|
||||||
|
iour_c
|
||||||
|
ir_c
|
||||||
|
ongr_c
|
||||||
|
or_c
|
||||||
|
our_c
|
||||||
|
uair_c
|
||||||
|
uangr_c
|
||||||
|
uanr_c
|
||||||
|
uar_c
|
||||||
|
ueir_c
|
||||||
|
uenr_c
|
||||||
|
uor_c
|
||||||
|
ur_c
|
||||||
|
vanr_c
|
||||||
|
ver_c
|
||||||
|
vnr_c
|
||||||
|
vr_c
|
||||||
|
pau
|
||||||
|
#1
|
||||||
|
#2
|
||||||
|
#3
|
||||||
|
#4
|
||||||
5
voices/zhiyan_emo/dict/syllable_flag_dict.txt
Executable file
5
voices/zhiyan_emo/dict/syllable_flag_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
s_begin
|
||||||
|
s_end
|
||||||
|
s_none
|
||||||
|
s_both
|
||||||
|
s_middle
|
||||||
7
voices/zhiyan_emo/dict/tone_dict.txt
Executable file
7
voices/zhiyan_emo/dict/tone_dict.txt
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
tone1
|
||||||
|
tone_none
|
||||||
|
tone4
|
||||||
|
tone2
|
||||||
|
tone3
|
||||||
|
tone5
|
||||||
|
tone0
|
||||||
5
voices/zhiyan_emo/dict/word_segment_dict.txt
Executable file
5
voices/zhiyan_emo/dict/word_segment_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
word_begin
|
||||||
|
word_end
|
||||||
|
word_middle
|
||||||
|
word_both
|
||||||
|
word_none
|
||||||
BIN
voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
188
voices/zhiyan_emo/voc/config.yaml
Normal file
188
voices/zhiyan_emo/voc/config.yaml
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
model_type: hifigan
|
||||||
|
Model:
|
||||||
|
###########################################################
|
||||||
|
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
Generator:
|
||||||
|
params:
|
||||||
|
in_channels: 80
|
||||||
|
out_channels: 1
|
||||||
|
channels: 256
|
||||||
|
kernel_size: 7
|
||||||
|
upsample_scales: [10, 5, 2, 2]
|
||||||
|
upsample_kernal_sizes: [20, 11, 4, 4]
|
||||||
|
resblock_kernel_sizes: [3, 7, 11]
|
||||||
|
resblock_dilations:
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
bias: true
|
||||||
|
causal: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_weight_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
MultiScaleDiscriminator:
|
||||||
|
params:
|
||||||
|
scales: 3
|
||||||
|
downsample_pooling: "DWT"
|
||||||
|
downsample_pooling_params:
|
||||||
|
kernel_size: 4
|
||||||
|
stride: 2
|
||||||
|
padding: 2
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [15, 41, 5, 3]
|
||||||
|
channels: 128
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
max_groups: 16
|
||||||
|
bias: true
|
||||||
|
downsample_scales: [4, 4, 4, 4, 1]
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
follow_official_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
MultiPeriodDiscriminator:
|
||||||
|
params:
|
||||||
|
periods: [2, 3, 5, 7, 11]
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [5, 3]
|
||||||
|
channels: 32
|
||||||
|
downsample_scales: [3, 3, 3, 3, 1]
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
bias: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_spectral_norm: false
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
generator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
discriminator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
stft_loss:
|
||||||
|
enable: False # Whether to use multi-resolution STFT loss.
|
||||||
|
|
||||||
|
mel_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
fs: 16000
|
||||||
|
fft_size: 2048
|
||||||
|
hop_size: 200
|
||||||
|
win_length: 1000
|
||||||
|
window: "hann"
|
||||||
|
num_mels: 80
|
||||||
|
fmin: 0
|
||||||
|
fmax: 8000
|
||||||
|
log_base: null
|
||||||
|
weights: 45.0
|
||||||
|
|
||||||
|
subband_stft_loss:
|
||||||
|
enable: False
|
||||||
|
params:
|
||||||
|
fft_sizes: [384, 683, 171] # List of FFT size for STFT-based loss.
|
||||||
|
hop_sizes: [35, 75, 15] # List of hop size for STFT-based loss
|
||||||
|
win_lengths: [150, 300, 60] # List of window length for STFT-based loss.
|
||||||
|
window: "hann_window" # Window function for STFT-based loss
|
||||||
|
|
||||||
|
feat_match_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: false
|
||||||
|
average_by_layers: false
|
||||||
|
weights: 2.0
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 16
|
||||||
|
batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
||||||
|
pin_memory: True
|
||||||
|
num_workers: 2 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
|
||||||
|
generator_grad_norm: -1
|
||||||
|
|
||||||
|
discriminator_grad_norm: -1
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
||||||
|
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||||
|
train_max_steps: 2500000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhiyan_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhiyan_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
BIN
voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhizhe_emo/am/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
105
voices/zhizhe_emo/am/config.yaml
Normal file
105
voices/zhizhe_emo/am/config.yaml
Normal file
@ -0,0 +1,105 @@
|
|||||||
|
model_type: sambert
|
||||||
|
Model:
|
||||||
|
#########################################################
|
||||||
|
# SAMBERT NETWORK ARCHITECTURE SETTING #
|
||||||
|
#########################################################
|
||||||
|
KanTtsSAMBERT:
|
||||||
|
params:
|
||||||
|
max_len: 800
|
||||||
|
|
||||||
|
embedding_dim: 512
|
||||||
|
encoder_num_layers: 8
|
||||||
|
encoder_num_heads: 8
|
||||||
|
encoder_num_units: 128
|
||||||
|
encoder_ffn_inner_dim: 1024
|
||||||
|
encoder_dropout: 0.1
|
||||||
|
encoder_attention_dropout: 0.1
|
||||||
|
encoder_relu_dropout: 0.1
|
||||||
|
encoder_projection_units: 32
|
||||||
|
|
||||||
|
speaker_units: 32
|
||||||
|
emotion_units: 32
|
||||||
|
|
||||||
|
predictor_filter_size: 41
|
||||||
|
predictor_fsmn_num_layers: 3
|
||||||
|
predictor_num_memory_units: 128
|
||||||
|
predictor_ffn_inner_dim: 256
|
||||||
|
predictor_dropout: 0.1
|
||||||
|
predictor_shift: 0
|
||||||
|
predictor_lstm_units: 128
|
||||||
|
dur_pred_prenet_units: [128, 128]
|
||||||
|
dur_pred_lstm_units: 128
|
||||||
|
|
||||||
|
decoder_prenet_units: [256, 256]
|
||||||
|
decoder_num_layers: 12
|
||||||
|
decoder_num_heads: 8
|
||||||
|
decoder_num_units: 128
|
||||||
|
decoder_ffn_inner_dim: 1024
|
||||||
|
decoder_dropout: 0.1
|
||||||
|
decoder_attention_dropout: 0.1
|
||||||
|
decoder_relu_dropout: 0.1
|
||||||
|
|
||||||
|
outputs_per_step: 3
|
||||||
|
num_mels: 80
|
||||||
|
|
||||||
|
postnet_filter_size: 41
|
||||||
|
postnet_fsmn_num_layers: 4
|
||||||
|
postnet_num_memory_units: 256
|
||||||
|
postnet_ffn_inner_dim: 512
|
||||||
|
postnet_dropout: 0.1
|
||||||
|
postnet_shift: 17
|
||||||
|
postnet_lstm_units: 128
|
||||||
|
MAS: False
|
||||||
|
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 0.001
|
||||||
|
betas: [0.9, 0.98]
|
||||||
|
eps: 1.0e-9
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: NoamLR
|
||||||
|
params:
|
||||||
|
warmup_steps: 4000
|
||||||
|
|
||||||
|
linguistic_unit:
|
||||||
|
cleaners: english_cleaners
|
||||||
|
lfeat_type_list: sy,tone,syllable_flag,word_segment,emo_category,speaker_category
|
||||||
|
speaker_list: F7,F74,FBYN,FRXL,M7,xiaoyu
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
MelReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
ProsodyReconLoss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
loss_type: mae
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 32
|
||||||
|
pin_memory: False
|
||||||
|
num_workers: 4 # FIXME: setting this > 0 may get stuck on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
grad_norm: 1.0
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
train_max_steps: 1000000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhizhe_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhizhe_emo/am/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
27
voices/zhizhe_emo/audio_config.yaml
Normal file
27
voices/zhizhe_emo/audio_config.yaml
Normal file
@ -0,0 +1,27 @@
|
|||||||
|
# Audio processing configs
|
||||||
|
|
||||||
|
audio_config:
|
||||||
|
# Preprocess
|
||||||
|
wav_normalize: True
|
||||||
|
trim_silence: True
|
||||||
|
trim_silence_threshold_db: 60
|
||||||
|
preemphasize: False
|
||||||
|
|
||||||
|
# Feature extraction
|
||||||
|
sampling_rate: 16000
|
||||||
|
hop_length: 200
|
||||||
|
win_length: 1000
|
||||||
|
n_fft: 2048
|
||||||
|
n_mels: 80
|
||||||
|
fmin: 0.0
|
||||||
|
fmax: 8000.0
|
||||||
|
phone_level_feature: True
|
||||||
|
|
||||||
|
# Normalization
|
||||||
|
norm_type: "mean_std" # "mean_std" or "global"
|
||||||
|
max_norm: 1.0
|
||||||
|
symmetric: False
|
||||||
|
min_level_db: -100.0
|
||||||
|
ref_level_db: 20
|
||||||
|
|
||||||
|
num_workers: 16
|
||||||
33
voices/zhizhe_emo/dict/emo_category_dict.txt
Executable file
33
voices/zhizhe_emo/dict/emo_category_dict.txt
Executable file
@ -0,0 +1,33 @@
|
|||||||
|
emotion_none
|
||||||
|
emotion_neutral
|
||||||
|
emotion_angry
|
||||||
|
emotion_disgust
|
||||||
|
emotion_fear
|
||||||
|
emotion_happy
|
||||||
|
emotion_sad
|
||||||
|
emotion_surprise
|
||||||
|
emotion_calm
|
||||||
|
emotion_gentle
|
||||||
|
emotion_relax
|
||||||
|
emotion_lyrical
|
||||||
|
emotion_serious
|
||||||
|
emotion_disgruntled
|
||||||
|
emotion_satisfied
|
||||||
|
emotion_disappointed
|
||||||
|
emotion_excited
|
||||||
|
emotion_anxiety
|
||||||
|
emotion_jealousy
|
||||||
|
emotion_hate
|
||||||
|
emotion_pity
|
||||||
|
emotion_pleasure
|
||||||
|
emotion_arousal
|
||||||
|
emotion_dominance
|
||||||
|
emotion_placeholder1
|
||||||
|
emotion_placeholder2
|
||||||
|
emotion_placeholder3
|
||||||
|
emotion_placeholder4
|
||||||
|
emotion_placeholder5
|
||||||
|
emotion_placeholder6
|
||||||
|
emotion_placeholder7
|
||||||
|
emotion_placeholder8
|
||||||
|
emotion_placeholder9
|
||||||
6
voices/zhizhe_emo/dict/speaker_dict.txt
Executable file
6
voices/zhizhe_emo/dict/speaker_dict.txt
Executable file
@ -0,0 +1,6 @@
|
|||||||
|
F7
|
||||||
|
F74
|
||||||
|
FBYN
|
||||||
|
FRXL
|
||||||
|
M7
|
||||||
|
xiaoyu
|
||||||
144
voices/zhizhe_emo/dict/sy_dict.txt
Executable file
144
voices/zhizhe_emo/dict/sy_dict.txt
Executable file
@ -0,0 +1,144 @@
|
|||||||
|
a_c
|
||||||
|
ai_c
|
||||||
|
an_c
|
||||||
|
ang_c
|
||||||
|
ao_c
|
||||||
|
b_c
|
||||||
|
c_c
|
||||||
|
ch_c
|
||||||
|
d_c
|
||||||
|
e_c
|
||||||
|
ei_c
|
||||||
|
en_c
|
||||||
|
eng_c
|
||||||
|
er_c
|
||||||
|
f_c
|
||||||
|
g_c
|
||||||
|
h_c
|
||||||
|
i_c
|
||||||
|
ia_c
|
||||||
|
ian_c
|
||||||
|
iang_c
|
||||||
|
iao_c
|
||||||
|
ie_c
|
||||||
|
ih_c
|
||||||
|
ii_c
|
||||||
|
in_c
|
||||||
|
ing_c
|
||||||
|
io_c
|
||||||
|
iong_c
|
||||||
|
iou_c
|
||||||
|
j_c
|
||||||
|
k_c
|
||||||
|
l_c
|
||||||
|
m_c
|
||||||
|
n_c
|
||||||
|
o_c
|
||||||
|
ong_c
|
||||||
|
ou_c
|
||||||
|
p_c
|
||||||
|
q_c
|
||||||
|
r_c
|
||||||
|
s_c
|
||||||
|
sh_c
|
||||||
|
t_c
|
||||||
|
u_c
|
||||||
|
ua_c
|
||||||
|
uai_c
|
||||||
|
uan_c
|
||||||
|
uang_c
|
||||||
|
uei_c
|
||||||
|
uen_c
|
||||||
|
ueng_c
|
||||||
|
uo_c
|
||||||
|
v_c
|
||||||
|
van_c
|
||||||
|
ve_c
|
||||||
|
vn_c
|
||||||
|
xx_c
|
||||||
|
z_c
|
||||||
|
zh_c
|
||||||
|
w_c
|
||||||
|
y_c
|
||||||
|
ga
|
||||||
|
ge
|
||||||
|
go
|
||||||
|
aa
|
||||||
|
ae
|
||||||
|
ah
|
||||||
|
ao
|
||||||
|
aw
|
||||||
|
ay
|
||||||
|
b
|
||||||
|
ch
|
||||||
|
d
|
||||||
|
dh
|
||||||
|
eh
|
||||||
|
er
|
||||||
|
ey
|
||||||
|
f
|
||||||
|
g
|
||||||
|
hh
|
||||||
|
ih
|
||||||
|
iy
|
||||||
|
jh
|
||||||
|
k
|
||||||
|
l
|
||||||
|
m
|
||||||
|
n
|
||||||
|
ng
|
||||||
|
ow
|
||||||
|
oy
|
||||||
|
p
|
||||||
|
r
|
||||||
|
s
|
||||||
|
sh
|
||||||
|
t
|
||||||
|
th
|
||||||
|
uh
|
||||||
|
uw
|
||||||
|
v
|
||||||
|
w
|
||||||
|
y
|
||||||
|
z
|
||||||
|
zh
|
||||||
|
air_c
|
||||||
|
angr_c
|
||||||
|
anr_c
|
||||||
|
aor_c
|
||||||
|
ar_c
|
||||||
|
eir_c
|
||||||
|
engr_c
|
||||||
|
enr_c
|
||||||
|
iangr_c
|
||||||
|
ianr_c
|
||||||
|
iaor_c
|
||||||
|
iar_c
|
||||||
|
ier_c
|
||||||
|
ihr_c
|
||||||
|
iir_c
|
||||||
|
ingr_c
|
||||||
|
inr_c
|
||||||
|
iongr_c
|
||||||
|
iour_c
|
||||||
|
ir_c
|
||||||
|
ongr_c
|
||||||
|
or_c
|
||||||
|
our_c
|
||||||
|
uair_c
|
||||||
|
uangr_c
|
||||||
|
uanr_c
|
||||||
|
uar_c
|
||||||
|
ueir_c
|
||||||
|
uenr_c
|
||||||
|
uor_c
|
||||||
|
ur_c
|
||||||
|
vanr_c
|
||||||
|
ver_c
|
||||||
|
vnr_c
|
||||||
|
vr_c
|
||||||
|
pau
|
||||||
|
#1
|
||||||
|
#2
|
||||||
|
#3
|
||||||
|
#4
|
||||||
5
voices/zhizhe_emo/dict/syllable_flag_dict.txt
Executable file
5
voices/zhizhe_emo/dict/syllable_flag_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
s_begin
|
||||||
|
s_end
|
||||||
|
s_none
|
||||||
|
s_both
|
||||||
|
s_middle
|
||||||
7
voices/zhizhe_emo/dict/tone_dict.txt
Executable file
7
voices/zhizhe_emo/dict/tone_dict.txt
Executable file
@ -0,0 +1,7 @@
|
|||||||
|
tone1
|
||||||
|
tone_none
|
||||||
|
tone4
|
||||||
|
tone2
|
||||||
|
tone3
|
||||||
|
tone5
|
||||||
|
tone0
|
||||||
5
voices/zhizhe_emo/dict/word_segment_dict.txt
Executable file
5
voices/zhizhe_emo/dict/word_segment_dict.txt
Executable file
@ -0,0 +1,5 @@
|
|||||||
|
word_begin
|
||||||
|
word_end
|
||||||
|
word_middle
|
||||||
|
word_both
|
||||||
|
word_none
|
||||||
BIN
voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
BIN
voices/zhizhe_emo/voc/ckpt/checkpoint_0.pth
(Stored with Git LFS)
Normal file
Binary file not shown.
188
voices/zhizhe_emo/voc/config.yaml
Normal file
188
voices/zhizhe_emo/voc/config.yaml
Normal file
@ -0,0 +1,188 @@
|
|||||||
|
model_type: hifigan
|
||||||
|
Model:
|
||||||
|
###########################################################
|
||||||
|
# GENERATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
Generator:
|
||||||
|
params:
|
||||||
|
in_channels: 80
|
||||||
|
out_channels: 1
|
||||||
|
channels: 256
|
||||||
|
kernel_size: 7
|
||||||
|
upsample_scales: [10, 5, 2, 2]
|
||||||
|
upsample_kernal_sizes: [20, 11, 4, 4]
|
||||||
|
resblock_kernel_sizes: [3, 7, 11]
|
||||||
|
resblock_dilations:
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
- [1, 3, 5, 7]
|
||||||
|
bias: true
|
||||||
|
causal: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_weight_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DISCRIMINATOR NETWORK ARCHITECTURE SETTING #
|
||||||
|
###########################################################
|
||||||
|
MultiScaleDiscriminator:
|
||||||
|
params:
|
||||||
|
scales: 3
|
||||||
|
downsample_pooling: "DWT"
|
||||||
|
downsample_pooling_params:
|
||||||
|
kernel_size: 4
|
||||||
|
stride: 2
|
||||||
|
padding: 2
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [15, 41, 5, 3]
|
||||||
|
channels: 128
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
max_groups: 16
|
||||||
|
bias: true
|
||||||
|
downsample_scales: [4, 4, 4, 4, 1]
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
follow_official_norm: true
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
MultiPeriodDiscriminator:
|
||||||
|
params:
|
||||||
|
periods: [2, 3, 5, 7, 11]
|
||||||
|
discriminator_params:
|
||||||
|
in_channels: 1
|
||||||
|
out_channels: 1
|
||||||
|
kernel_sizes: [5, 3]
|
||||||
|
channels: 32
|
||||||
|
downsample_scales: [3, 3, 3, 3, 1]
|
||||||
|
max_downsample_channels: 1024
|
||||||
|
bias: true
|
||||||
|
nonlinear_activation: "LeakyReLU"
|
||||||
|
nonlinear_activation_params:
|
||||||
|
negative_slope: 0.1
|
||||||
|
use_spectral_norm: false
|
||||||
|
optimizer:
|
||||||
|
type: Adam
|
||||||
|
params:
|
||||||
|
lr: 2.0e-4
|
||||||
|
betas: [0.5, 0.9]
|
||||||
|
weight_decay: 0.0
|
||||||
|
scheduler:
|
||||||
|
type: MultiStepLR
|
||||||
|
params:
|
||||||
|
gamma: 0.5
|
||||||
|
milestones:
|
||||||
|
- 200000
|
||||||
|
- 400000
|
||||||
|
- 600000
|
||||||
|
- 800000
|
||||||
|
|
||||||
|
####################################################
|
||||||
|
# LOSS SETTING #
|
||||||
|
####################################################
|
||||||
|
Loss:
|
||||||
|
generator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
discriminator_adv_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: False
|
||||||
|
weights: 1.0
|
||||||
|
|
||||||
|
stft_loss:
|
||||||
|
enable: False # Whether to use multi-resolution STFT loss.
|
||||||
|
|
||||||
|
mel_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
fs: 16000
|
||||||
|
fft_size: 2048
|
||||||
|
hop_size: 200
|
||||||
|
win_length: 1000
|
||||||
|
window: "hann"
|
||||||
|
num_mels: 80
|
||||||
|
fmin: 0
|
||||||
|
fmax: 8000
|
||||||
|
log_base: null
|
||||||
|
weights: 45.0
|
||||||
|
|
||||||
|
subband_stft_loss:
|
||||||
|
enable: False
|
||||||
|
params:
|
||||||
|
fft_sizes: [384, 683, 171] # List of FFT sizes for STFT-based loss.
|
||||||
|
hop_sizes: [35, 75, 15] # List of hop sizes for STFT-based loss.
|
||||||
|
win_lengths: [150, 300, 60] # List of window lengths for STFT-based loss.
|
||||||
|
window: "hann_window" # Window function for STFT-based loss
|
||||||
|
|
||||||
|
feat_match_loss:
|
||||||
|
enable: True
|
||||||
|
params:
|
||||||
|
average_by_discriminators: false
|
||||||
|
average_by_layers: false
|
||||||
|
weights: 2.0
|
||||||
|
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# DATA LOADER SETTING #
|
||||||
|
###########################################################
|
||||||
|
batch_size: 16
|
||||||
|
batch_max_steps: 9600 # Length of each audio in batch. Make sure it is divisible by hop_size.
|
||||||
|
pin_memory: True
|
||||||
|
num_workers: 2 # FIXME: setting this > 0 may hang on macOS
|
||||||
|
remove_short_samples: False
|
||||||
|
allow_cache: True
|
||||||
|
|
||||||
|
generator_grad_norm: -1
|
||||||
|
|
||||||
|
discriminator_grad_norm: -1
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# INTERVAL SETTING #
|
||||||
|
###########################################################
|
||||||
|
generator_train_start_steps: 1 # Number of steps to start to train generator.
|
||||||
|
discriminator_train_start_steps: 0 # Number of steps to start to train discriminator.
|
||||||
|
train_max_steps: 2500000 # Number of training steps.
|
||||||
|
save_interval_steps: 20000 # Interval steps to save checkpoint.
|
||||||
|
eval_interval_steps: 10000 # Interval steps to evaluate the network.
|
||||||
|
log_interval_steps: 1000 # Interval steps to record the training log.
|
||||||
|
|
||||||
|
###########################################################
|
||||||
|
# OTHER SETTING #
|
||||||
|
###########################################################
|
||||||
|
num_save_intermediate_results: 4 # Number of results to be saved as intermediate results.
|
||||||
BIN
voices/zhizhe_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
BIN
voices/zhizhe_emo/vocoder/pytorch_model.bin
(Stored with Git LFS)
Normal file
Binary file not shown.
Reference in New Issue
Block a user