This commit is contained in:
jiaqi.sjq
2023-01-04 15:49:38 +08:00
parent 1b91ff809f
commit 9ccdc580f3
13 changed files with 1942 additions and 123 deletions

View File

@ -1,127 +1,10 @@
{
"framework": "Tensorflow",
"framework": "pytorch",
"task" : "text-to-speech",
"model" : {
"type" : "sambert-hifigan",
"lang_type" : "zhcn",
"sample_rate" : 16000,
"am": {
"am": {
"max_len": 800,
"embedding_dim": 512,
"encoder_num_layers": 8,
"encoder_num_heads": 8,
"encoder_num_units": 128,
"encoder_ffn_inner_dim": 1024,
"encoder_dropout": 0.1,
"encoder_attention_dropout": 0.1,
"encoder_relu_dropout": 0.1,
"encoder_projection_units": 32,
"speaker_units": 32,
"emotion_units": 32,
"predictor_filter_size": 41,
"predictor_fsmn_num_layers": 3,
"predictor_num_memory_units": 128,
"predictor_ffn_inner_dim": 256,
"predictor_dropout": 0.1,
"predictor_shift": 0,
"predictor_lstm_units": 128,
"dur_pred_prenet_units": [128, 128],
"dur_pred_lstm_units": 128,
"decoder_prenet_units": [256, 256],
"decoder_num_layers": 12,
"decoder_num_heads": 8,
"decoder_num_units": 128,
"decoder_ffn_inner_dim": 1024,
"decoder_dropout": 0.1,
"decoder_attention_dropout": 0.1,
"decoder_relu_dropout": 0.1,
"outputs_per_step": 3,
"num_mels": 80,
"postnet_filter_size": 41,
"postnet_fsmn_num_layers": 4,
"postnet_num_memory_units": 256,
"postnet_ffn_inner_dim": 512,
"postnet_dropout": 0.1,
"postnet_shift": 17,
"postnet_lstm_units": 128
},
"audio": {
"frame_shift_ms": 12.5
},
"linguistic_unit": {
"cleaners": "english_cleaners",
"lfeat_type_list": "sy,tone,syllable_flag,word_segment,emo_category,speaker_category",
"sy": "dict/sy_dict.txt",
"tone": "dict/tone_dict.txt",
"syllable_flag": "dict/syllable_flag_dict.txt",
"word_segment": "dict/word_segment_dict.txt",
"emo_category": "dict/emo_category_dict.txt",
"speaker_category": "dict/speaker_dict.txt"
},
"num_gpus": 1,
"batch_size": 32,
"group_size": 1024,
"learning_rate": 0.001,
"adam_b1": 0.9,
"adam_b2": 0.98,
"seed": 1234,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:11111",
"world_size": 1
}
},
"vocoder" : {
"resblock": "1",
"num_gpus": 1,
"batch_size": 16,
"learning_rate": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
"lr_decay": 0.999,
"seed": 1234,
"upsample_rates": [10,5,2,2],
"upsample_kernel_sizes": [20,10,4,4],
"upsample_initial_channel": 256,
"resblock_kernel_sizes": [3,7,11],
"resblock_dilation_sizes": [[1,3,5,7], [1,3,5,7], [1,3,5,7]],
"segment_size": 6400,
"num_mels": 80,
"num_freq": 1025,
"n_fft": 2048,
"hop_size": 200,
"win_size": 1000,
"sampling_rate": 16000,
"fmin": 0,
"fmax": 8000,
"fmax_for_loss": null,
"num_workers": 4,
"dist_config": {
"dist_backend": "nccl",
"dist_url": "tcp://localhost:54312",
"world_size": 1
}
}
"lang_type" : "sichuan",
"sample_rate" : 16000
},
"pipeline": {
"type": "sambert-hifigan-tts"

BIN
voices.zip (Stored with Git LFS)

Binary file not shown.

BIN
voices/F7/am/ckpt/checkpoint_980000.pth (Stored with Git LFS) Normal file

Binary file not shown.

79
voices/F7/am/config.yaml Normal file
View File

@ -0,0 +1,79 @@
Loss:
MelReconLoss:
enable: true
params: {loss_type: mae}
ProsodyReconLoss:
enable: true
params: {loss_type: mae}
Model:
KanTtsSAMBERT:
optimizer:
params:
betas: [0.9, 0.98]
eps: 1.0e-09
lr: 0.001
weight_decay: 0.0
type: Adam
params:
MAS: false
decoder_attention_dropout: 0.1
decoder_dropout: 0.1
decoder_ffn_inner_dim: 1024
decoder_num_heads: 8
decoder_num_layers: 12
decoder_num_units: 128
decoder_prenet_units: [256, 256]
decoder_relu_dropout: 0.1
dur_pred_lstm_units: 128
dur_pred_prenet_units: [128, 128]
embedding_dim: 512
emotion_units: 32
encoder_attention_dropout: 0.1
encoder_dropout: 0.1
encoder_ffn_inner_dim: 1024
encoder_num_heads: 8
encoder_num_layers: 8
encoder_num_units: 128
encoder_projection_units: 32
encoder_relu_dropout: 0.1
max_len: 800
num_mels: 80
outputs_per_step: 3
postnet_dropout: 0.1
postnet_ffn_inner_dim: 512
postnet_filter_size: 41
postnet_fsmn_num_layers: 4
postnet_lstm_units: 128
postnet_num_memory_units: 256
postnet_shift: 17
predictor_dropout: 0.1
predictor_ffn_inner_dim: 256
predictor_filter_size: 41
predictor_fsmn_num_layers: 3
predictor_lstm_units: 128
predictor_num_memory_units: 128
predictor_shift: 0
speaker_units: 32
scheduler:
params: {warmup_steps: 4000}
type: NoamLR
allow_cache: true
audio_config: {fmax: 8000.0, fmin: 0.0, hop_length: 200, max_norm: 1.0, min_level_db: -100.0,
n_fft: 2048, n_mels: 80, norm_type: mean_std, num_workers: 16, phone_level_feature: true,
preemphasize: false, ref_level_db: 20, sampling_rate: 16000, symmetric: false, trim_silence: true,
trim_silence_threshold_db: 60, wav_normalize: true, win_length: 1000}
batch_size: 32
create_time: '2022-12-26 11:05:43'
eval_interval_steps: 10000
git_revision_hash: 388243c0c173756d1eb34783c02cec4c302cdc25
grad_norm: 1.0
linguistic_unit: {cleaners: english_cleaners, language: Sichuan, lfeat_type_list: 'sy,tone,syllable_flag,word_segment,emo_category,speaker_category',
speaker_list: F7}
log_interval_steps: 1000
model_type: sambert
num_save_intermediate_results: 4
num_workers: 4
pin_memory: false
remove_short_samples: false
save_interval_steps: 20000
train_max_steps: 1000000

View File

@ -0,0 +1,27 @@
# Audio processing configs
audio_config:
# Preprocess
wav_normalize: True
trim_silence: True
trim_silence_threshold_db: 60
preemphasize: False
# Feature extraction
sampling_rate: 16000
hop_length: 200
win_length: 1000
n_fft: 2048
n_mels: 80
fmin: 0.0
fmax: 8000.0
phone_level_feature: True
# Normalization
norm_type: "mean_std" # "mean_std" or "global"
max_norm: 1.0
symmetric: False
min_level_db: -100.0
ref_level_db: 20
num_workers: 16

View File

@ -0,0 +1,2 @@
wu w
yi y

View File

@ -0,0 +1,984 @@
<?xml version="1.0" encoding="utf-8"?>
<phoneSet xmlns="http://schemas.alibaba-inc.com/tts">
<phone>
<id>0</id>
<name>a_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>1</id>
<name>ai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>2</id>
<name>an_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>3</id>
<name>ang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>4</id>
<name>ao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>5</id>
<name>b_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>6</id>
<name>c_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>7</id>
<name>ch_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>8</id>
<name>d_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>9</id>
<name>e_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>10</id>
<name>ei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>11</id>
<name>en_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>12</id>
<name>eng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>13</id>
<name>er_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>14</id>
<name>f_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>15</id>
<name>g_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>16</id>
<name>h_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>17</id>
<name>i_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>18</id>
<name>ia_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>19</id>
<name>ian_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>20</id>
<name>iang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>21</id>
<name>iao_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>22</id>
<name>ie_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>23</id>
<name>ih_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>24</id>
<name>ii_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>25</id>
<name>in_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>26</id>
<name>ing_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>27</id>
<name>ioo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>28</id>
<name>iong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>29</id>
<name>iou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>30</id>
<name>j_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>31</id>
<name>k_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>32</id>
<name>l_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>33</id>
<name>m_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>34</id>
<name>n_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>35</id>
<name>o_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>36</id>
<name>ong_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>37</id>
<name>ou_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>38</id>
<name>p_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>39</id>
<name>q_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>40</id>
<name>r_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>41</id>
<name>s_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>low</ap>
<am>open</am>
</phone>
<phone>
<id>42</id>
<name>sh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>43</id>
<name>t_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>middle</ap>
<am>open</am>
</phone>
<phone>
<id>44</id>
<name>u_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>45</id>
<name>ua_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>46</id>
<name>uai_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>velar</ap>
<am>fricative</am>
</phone>
<phone>
<id>47</id>
<name>uan_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>48</id>
<name>uang_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>49</id>
<name>uei_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>50</id>
<name>uen_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>51</id>
<name>ueng_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>52</id>
<name>uo_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>53</id>
<name>v_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>54</id>
<name>van_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>55</id>
<name>ve_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>56</id>
<name>vn_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>high</ap>
<am>open</am>
</phone>
<phone>
<id>57</id>
<name>xx_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>58</id>
<name>z_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>59</id>
<name>zh_c</name>
<cv>vowel</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>60</id>
<name>w_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>61</id>
<name>y_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
<phone>
<id>62</id>
<name>ng_c</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>63</id>
<name>iai_c</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>64</id>
<name>io_c</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>65</id>
<name>ue_c</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>66</id>
<name>ga</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>67</id>
<name>ge</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>68</id>
<name>go</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>69</id>
<name>aa</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>70</id>
<name>ae</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>71</id>
<name>ah</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>72</id>
<name>ao</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>73</id>
<name>aw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>74</id>
<name>ay</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>75</id>
<name>b</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>76</id>
<name>ch</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>77</id>
<name>d</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>78</id>
<name>dh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>79</id>
<name>eh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>80</id>
<name>er</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>81</id>
<name>ey</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>82</id>
<name>f</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>liptooth</ap>
<am>fricative</am>
</phone>
<phone>
<id>83</id>
<name>g</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>84</id>
<name>hh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>85</id>
<name>ih</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>86</id>
<name>iy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>87</id>
<name>jh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>88</id>
<name>k</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>velar</ap>
<am>stop</am>
</phone>
<phone>
<id>89</id>
<name>l</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>lateral</am>
</phone>
<phone>
<id>90</id>
<name>m</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>doublelips</ap>
<am>nasal</am>
</phone>
<phone>
<id>91</id>
<name>n</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>centraltongue</ap>
<am>nasal</am>
</phone>
<phone>
<id>92</id>
<name>ng</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>93</id>
<name>ow</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>94</id>
<name>oy</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>95</id>
<name>p</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>doublelips</ap>
<am>stop</am>
</phone>
<phone>
<id>96</id>
<name>r</name>
<cv>consonant</cv>
<if>initial</if>
<uv>voiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>97</id>
<name>s</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>98</id>
<name>sh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>fricative</am>
</phone>
<phone>
<id>99</id>
<name>t</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>centraltongue</ap>
<am>stop</am>
</phone>
<phone>
<id>100</id>
<name>th</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>101</id>
<name>uh</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>102</id>
<name>uw</name>
<cv>vowel</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>103</id>
<name>v</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>104</id>
<name>w</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>105</id>
<name>y</name>
<cv>consonant</cv>
<if>final</if>
<uv>voiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>106</id>
<name>z</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>fronttongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>107</id>
<name>zh</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>backtongue</ap>
<am>affricative</am>
</phone>
<phone>
<id>146</id>
<name>pau</name>
<cv>consonant</cv>
<if>initial</if>
<uv>unvoiced</uv>
<ap>high</ap>
<am>close</am>
</phone>
</phoneSet>

View File

@ -0,0 +1,147 @@
<?xml version="1.0" encoding="utf-8"?>
<posSet xmlns="http://schemas.alibaba-inc.com/tts">
<pos>
<id>1</id>
<name>a</name>
<desc>todo</desc>
</pos>
<pos>
<id>2</id>
<name>b</name>
<desc>todo</desc>
</pos>
<pos>
<id>3</id>
<name>c</name>
<desc>todo</desc>
</pos>
<pos>
<id>4</id>
<name>d</name>
<desc>todo</desc>
</pos>
<pos>
<id>5</id>
<name>e</name>
<desc>todo</desc>
</pos>
<pos>
<id>6</id>
<name>f</name>
<desc>todo</desc>
</pos>
<pos>
<id>7</id>
<name>g</name>
<desc>todo</desc>
<sub>
<pos>
<id>8</id>
<name>gb</name>
<desc>todo</desc>
</pos>
</sub>
</pos>
<pos>
<id>9</id>
<name>h</name>
<desc>todo</desc>
</pos>
<pos>
<id>10</id>
<name>i</name>
<desc>todo</desc>
</pos>
<pos>
<id>11</id>
<name>j</name>
<desc>todo</desc>
</pos>
<pos>
<id>12</id>
<name>k</name>
<desc>todo</desc>
</pos>
<pos>
<id>13</id>
<name>l</name>
<desc>todo</desc>
</pos>
<pos>
<id>14</id>
<name>m</name>
<desc>todo</desc>
</pos>
<pos>
<id>15</id>
<name>n</name>
<desc>todo</desc>
<sub>
<pos>
<id>16</id>
<name>nz</name>
<desc>todo</desc>
</pos>
</sub>
</pos>
<pos>
<id>17</id>
<name>o</name>
<desc>todo</desc>
</pos>
<pos>
<id>18</id>
<name>p</name>
<desc>todo</desc>
</pos>
<pos>
<id>19</id>
<name>q</name>
<desc>todo</desc>
</pos>
<pos>
<id>20</id>
<name>r</name>
<desc>todo</desc>
</pos>
<pos>
<id>21</id>
<name>s</name>
<desc>todo</desc>
</pos>
<pos>
<id>22</id>
<name>t</name>
<desc>todo</desc>
</pos>
<pos>
<id>23</id>
<name>u</name>
<desc>todo</desc>
</pos>
<pos>
<id>24</id>
<name>v</name>
<desc>todo</desc>
</pos>
<pos>
<id>25</id>
<name>w</name>
<desc>todo</desc>
</pos>
<pos>
<id>26</id>
<name>x</name>
<desc>todo</desc>
</pos>
<pos>
<id>27</id>
<name>y</name>
<desc>todo</desc>
</pos>
<pos>
<id>28</id>
<name>z</name>
<desc>todo</desc>
</pos>
</posSet>

View File

@ -0,0 +1,551 @@
a ga a_c
ai ga ai_c
an ga an_c
ao ga ao_c
e ge e_c
er ge er_c
o go o_c
ong go ong_c
ba b_c a_c
bai b_c ai_c
ban b_c an_c
bang b_c ang_c
bao b_c ao_c
be b_c e_c
bei b_c ei_c
ben b_c en_c
bi b_c i_c
bia b_c ia_c
bian b_c ian_c
biao b_c iao_c
bie b_c ie_c
bin b_c in_c
bo b_c o_c
bong b_c ong_c
bu b_c u_c
ca c_c a_c
cai c_c ai_c
can c_c an_c
cang c_c ang_c
cao c_c ao_c
ce c_c e_c
cen c_c en_c
ceng c_c eng_c
ci c_c ii_c
co c_c o_c
cong c_c ong_c
cou c_c ou_c
cu c_c u_c
cuai c_c uai_c
cuan c_c uan_c
cuang c_c uang_c
cui c_c uei_c
cun c_c uen_c
da d_c a_c
dai d_c ai_c
dan d_c an_c
dang d_c ang_c
dao d_c ao_c
de d_c e_c
dei d_c ei_c
den d_c en_c
deng d_c eng_c
di d_c i_c
dian d_c ian_c
diao d_c iao_c
die d_c ie_c
din d_c in_c
ding d_c ing_c
diu d_c iou_c
do d_c o_c
dong d_c ong_c
dou d_c ou_c
du d_c u_c
duan d_c uan_c
dui d_c uei_c
dun d_c uen_c
fa f_c a_c
fai f_c ai_c
fan f_c an_c
fang f_c ang_c
fei f_c ei_c
fen f_c en_c
feng f_c eng_c
fong f_c ong_c
fu f_c u_c
ga g_c a_c
gai g_c ai_c
gan g_c an_c
gang g_c ang_c
gao g_c ao_c
gua g_c ua_c
ge g_c e_c
gen g_c en_c
go g_c o_c
gon g_c iai_c
gong g_c ong_c
gou g_c ou_c
gu g_c u_c
guai g_c uai_c
guan g_c uan_c
guang g_c uang_c
gui g_c uei_c
gun g_c uen_c
ha h_c a_c
hai h_c ai_c
han h_c an_c
hang h_c ang_c
hao h_c ao_c
he h_c e_c
hen h_c en_c
ho h_c o_c
hong h_c ong_c
hou h_c ou_c
hu h_c u_c
hua h_c ua_c
huai h_c uai_c
huan h_c uan_c
huang h_c uang_c
hui h_c uei_c
hun h_c uen_c
huo h_c uo_c
ji j_c i_c
jia j_c ia_c
jiai j_c ia_c
jian j_c ian_c
jiang j_c iang_c
jiao j_c iao_c
jie j_c ie_c
jin j_c in_c
jiu j_c iou_c
ju j_c u_c
juan j_c van_c
jue j_c ve_c
juo j_c uo_c
ka k_c a_c
kai k_c ai_c
kan k_c an_c
kang k_c ang_c
kao k_c ao_c
ke k_c e_c
ken k_c en_c
ko k_c o_c
kong k_c ong_c
kou k_c ou_c
ku k_c u_c
kua k_c ua_c
kuai k_c uai_c
kuan k_c uan_c
kuang k_c uang_c
kue k_c ve_c
kui k_c uei_c
kun k_c uen_c
la l_c a_c
na n_c a_c
lai l_c ai_c
nai n_c ai_c
lan l_c an_c
nan n_c an_c
lang l_c ang_c
nang n_c ang_c
lao l_c ao_c
nao n_c ao_c
len l_c en_c
nen n_c en_c
li l_c i_c
ni n_c i_c
lian l_c ian_c
nian n_c ian_c
liang l_c iang_c
niang n_c iang_c
liao l_c iao_c
niao n_c iao_c
lie l_c ie_c
nie n_c ie_c
lin l_c in_c
nin n_c in_c
liu l_c iou_c
niu n_c iou_c
lo l_c o_c
no n_c o_c
long l_c ong_c
nong n_c ong_c
lou l_c ou_c
nou n_c ou_c
lu l_c u_c
nu n_c u_c
luan l_c uan_c
nuan n_c uan_c
lue l_c ve_c
nue n_c ve_c
lui l_c uei_c
nui n_c uei_c
lun l_c uen_c
nun n_c uen_c
luo l_c uo_c
nuo n_c uo_c
lv l_c v_c
nv n_c v_c
ma m_c a_c
mai m_c ai_c
man m_c an_c
mang m_c ang_c
mao m_c ao_c
me m_c e_c
mei m_c ei_c
men m_c en_c
meng m_c eng_c
mi m_c i_c
mian m_c ian_c
miao m_c iao_c
mie m_c ie_c
min m_c in_c
mo m_c o_c
mong m_c ong_c
mu m_c u_c
ne n_c e_c
nei n_c ei_c
pa p_c a_c
pai p_c ai_c
pan p_c an_c
pang p_c ang_c
pao p_c ao_c
pe p_c e_c
pei p_c ei_c
pen p_c en_c
peng p_c eng_c
pi p_c i_c
pian p_c ian_c
piao p_c iao_c
pie p_c ie_c
pin p_c in_c
po p_c o_c
pong p_c ong_c
pu p_c u_c
qi q_c i_c
qia q_c ia_c
qian q_c ian_c
qiang q_c iang_c
qiao q_c iao_c
qie q_c ie_c
qin q_c in_c
qing q_c ing_c
qiong q_c iong_c
qiu q_c iou_c
qu q_c u_c
quan q_c van_c
que q_c ve_c
qun q_c vn_c
quo q_c uo_c
ran r_c an_c
rang r_c ang_c
rao r_c ao_c
re r_c e_c
ren r_c en_c
ri r_c ih_c
rong r_c ong_c
rou r_c ou_c
ru r_c u_c
rua r_c ua_c
ruan r_c uan_c
sa s_c a_c
sai s_c ai_c
san s_c an_c
sang s_c ang_c
sao s_c ao_c
se s_c e_c
sen s_c en_c
si s_c ii_c
so s_c o_c
song s_c ong_c
sou s_c ou_c
su s_c u_c
sua s_c ua_c
suai s_c uai_c
suan s_c uan_c
suang s_c uang_c
sui s_c uei_c
sun s_c uen_c
ta t_c a_c
tai t_c ai_c
tan t_c an_c
tang t_c ang_c
tao t_c ao_c
ten t_c en_c
ti t_c i_c
tian t_c ian_c
tiao t_c iao_c
tie t_c ie_c
tin t_c in_c
to t_c o_c
tong t_c ong_c
tou t_c ou_c
tu t_c u_c
tuan t_c uan_c
tui t_c uei_c
tuo t_c uo_c
wa w_c a_c
wai w_c ai_c
wan w_c an_c
wang w_c ang_c
wei w_c ei_c
wen w_c en_c
wo w_c o_c
wu w_c u_c
xi xx_c i_c
xia xx_c ia_c
xian xx_c ian_c
xiang xx_c iang_c
xiao xx_c iao_c
xie xx_c ie_c
xin xx_c in_c
xing xx_c ing_c
xiong xx_c iong_c
xiu xx_c iou_c
xu xx_c u_c
xuan xx_c van_c
xue xx_c ve_c
xun xx_c vn_c
ya y_c a_c
yan y_c an_c
yang y_c ang_c
yao y_c ao_c
ye y_c e_c
yi y_c i_c
yin y_c in_c
yo y_c o_c
yong y_c ong_c
you y_c ou_c
yu y_c u_c
yuan y_c van_c
yue y_c ve_c
yun y_c vn_c
yuo y_c uo_c
za z_c a_c
zai z_c ai_c
zan z_c an_c
zang z_c ang_c
zao z_c ao_c
ze z_c e_c
zei z_c ei_c
zen z_c en_c
zi z_c ii_c
zo z_c o_c
zong z_c ong_c
zou z_c ou_c
zu z_c u_c
zua z_c ua_c
zuai z_c uai_c
zuan z_c uan_c
zuang z_c uang_c
zui z_c uei_c
zuo z_c uo_c
bing b_c ing_c
cer c_c er_c
ei ge ei_c
en ge en_c
fou f_c ou_c
gei g_c ei_c
geng g_c eng_c
heng h_c eng_c
huar h_c ua_c
huei h_c uei_c
jing j_c ing_c
jo j_c o_c
keng k_c eng_c
kuei k_c uei_c
le l_c e_c
leng l_c eng_c
neng n_c eng_c
ling l_c ing_c
ning n_c ing_c
ming m_c ing_c
nar n_c a_c
ngai ng_c ai_c
ngan ng_c an_c
ngao ng_c ao_c
ngen ng_c en_c
ngo ng_c o_c
xou xx_c ou_c
ping p_c ing_c
reng r_c eng_c
ro r_c o_c
run r_c uen_c
sei s_c ei_c
seng s_c eng_c
te t_c e_c
teng t_c eng_c
ting t_c ing_c
tun t_c uen_c
wong w_c ong_c
ying y_c ing_c
zeng z_c eng_c
zun z_c uen_c
ang ga ang_c
ou go ou_c
banr b_c an_c
benr b_c en_c
bianr b_c ian_c
dianr d_c ian_c
dunr d_c uen_c
fenr f_c en_c
fo f_c o_c
fur f_c u_c
gunr g_c uen_c
guo g_c uo_c
hair h_c ai_c
har h_c a_c
hei h_c ei_c
huir h_c uei_c
jianr j_c ian_c
jingr j_c ing_c
jiong j_c iong_c
kanr k_c an_c
kei k_c ei_c
kuo k_c uo_c
lar l_c a_c
lei l_c ei_c
lianr l_c ian_c
nianr n_c ian_c
luei l_c uei_c
nuei n_c uei_c
maor m_c ao_c
menr m_c en_c
mou m_c ou_c
nga ng_c a_c
ngang ng_c ang_c
ngei ng_c ei_c
nger ng_c er_c
ngong ng_c ong_c
ngou ng_c ou_c
ningr n_c ing_c
niur n_c iou_c
nvr n_c v_c
qio q_c io_c
qo q_c o_c
rui r_c uei_c
sengr s_c eng_c
ter t_c er_c
tour t_c ou_c
wanr w_c an_c
war w_c a_c
weng w_c eng_c
wenr w_c en_c
xingr xx_c ing_c
xo xx_c o_c
yangr y_c ang_c
yanr y_c an_c
yar y_c a_c
yuanr y_c van_c
yuer y_c ve_c
zeir z_c ei_c
zer z_c er_c
jun j_c vn_c
beir b_c ei_c
cei c_c ei_c
dengr d_c eng_c
far f_c a_c
genr g_c en_c
hor h_c o_c
kor k_c o_c
miu m_c iou_c
nia n_c ia_c
penr p_c en_c
xianr xx_c ian_c
gue g_c ve_c
hue h_c ve_c
bangr b_c ang_c
baor b_c ao_c
bar b_c a_c
bingr b_c ing_c
cangr c_c ang_c
car c_c a_c
cengr c_c eng_c
cuanr c_c uan_c
cuir c_c uei_c
cunr c_c uen_c
danr d_c an_c
dar d_c a_c
dour d_c ou_c
duir d_c uei_c
feir f_c ei_c
fengr f_c eng_c
ganr g_c an_c
gaor g_c ao_c
gar g_c a_c
gengr g_c eng_c
gor g_c o_c
gour g_c ou_c
guanr g_c uan_c
guar g_c ua_c
hanr h_c an_c
hunr h_c uen_c
hur h_c u_c
jiaor j_c iao_c
jiar j_c ia_c
juanr j_c van_c
junr j_c vn_c
kar k_c a_c
kour k_c ou_c
kuair k_c uai_c
laor l_c ao_c
naor n_c ao_c
leir l_c ei_c
neir n_c ei_c
liur l_c iou_c
lur l_c u_c
nur n_c u_c
mianr m_c ian_c
miaor m_c iao_c
mingr m_c ing_c
minr m_c in_c
mur m_c u_c
nge ng_c e_c
niaor n_c iao_c
or go o_c
pair p_c ai_c
paor p_c ao_c
pianr p_c ian_c
piaor p_c iao_c
pon p_c iai_c
pur p_c u_c
qianr q_c ian_c
qir q_c i_c
qiur q_c iou_c
quanr q_c van_c
rei r_c ei_c
ruo r_c uo_c
sir s_c ii_c
sour s_c ou_c
sunr s_c uen_c
suo s_c uo_c
tair t_c ai_c
tanr t_c an_c
tei t_c ei_c
tianr t_c ian_c
tir t_c i_c
wangr w_c ang_c
weir w_c ei_c
xiar xx_c ia_c
yei y_c ei_c
yingr y_c ing_c
zengr z_c eng_c
zir z_c ii_c
zuanr z_c uan_c
zuir z_c uei_c
zur z_c u_c
beng b_c eng_c
cua c_c ua_c
dia d_c ia_c
duo d_c uo_c
eng ge eng_c
pou p_c ou_c
xuo xx_c uo_c
shao sh_c ao_c
zhen zh_c en_c
shi sh_c i_c
zhe zh_c e_c
lia l_c ia_c
hiang h_c iang_c
cuo c_c uo_c
ngeng ng_c eng_c

View File

@ -0,0 +1,7 @@
1
4
2
3
5
0

BIN
voices/F7/voc/ckpt/checkpoint_340000.pth (Stored with Git LFS) Normal file

Binary file not shown.

131
voices/F7/voc/config.yaml Normal file
View File

@ -0,0 +1,131 @@
Loss:
discriminator_adv_loss:
enable: true
params: {average_by_discriminators: false}
weights: 1.0
feat_match_loss:
enable: true
params: {average_by_discriminators: false, average_by_layers: false}
weights: 2.0
generator_adv_loss:
enable: true
params: {average_by_discriminators: false}
weights: 1.0
mel_loss:
enable: true
params: {fft_size: 2048, fmax: 8000, fmin: 0, fs: 16000, hop_size: 200, log_base: null,
num_mels: 80, win_length: 1000, window: hann}
weights: 45.0
stft_loss: {enable: false}
subband_stft_loss:
enable: false
params:
fft_sizes: [384, 683, 171]
hop_sizes: [35, 75, 15]
win_lengths: [150, 300, 60]
window: hann_window
Model:
Generator:
optimizer:
params:
betas: [0.5, 0.9]
lr: 0.0002
weight_decay: 0.0
type: Adam
params:
bias: true
causal: false
channels: 256
in_channels: 80
kernel_size: 7
nonlinear_activation: LeakyReLU
nonlinear_activation_params: {negative_slope: 0.1}
out_channels: 1
resblock_dilations:
- [1, 3, 5, 7]
- [1, 3, 5, 7]
- [1, 3, 5, 7]
resblock_kernel_sizes: [3, 7, 11]
upsample_kernal_sizes: [20, 11, 4, 4]
upsample_scales: [10, 5, 2, 2]
use_weight_norm: true
scheduler:
params:
gamma: 0.5
milestones: [200000, 400000, 600000, 800000]
type: MultiStepLR
MultiPeriodDiscriminator:
optimizer:
params:
betas: [0.5, 0.9]
lr: 0.0002
weight_decay: 0.0
type: Adam
params:
discriminator_params:
bias: true
channels: 32
downsample_scales: [3, 3, 3, 3, 1]
in_channels: 1
kernel_sizes: [5, 3]
max_downsample_channels: 1024
nonlinear_activation: LeakyReLU
nonlinear_activation_params: {negative_slope: 0.1}
out_channels: 1
use_spectral_norm: false
periods: [2, 3, 5, 7, 11]
scheduler:
params:
gamma: 0.5
milestones: [200000, 400000, 600000, 800000]
type: MultiStepLR
MultiScaleDiscriminator:
optimizer:
params:
betas: [0.5, 0.9]
lr: 0.0002
weight_decay: 0.0
type: Adam
params:
discriminator_params:
bias: true
channels: 128
downsample_scales: [4, 4, 4, 4, 1]
in_channels: 1
kernel_sizes: [15, 41, 5, 3]
max_downsample_channels: 1024
max_groups: 16
nonlinear_activation: LeakyReLU
nonlinear_activation_params: {negative_slope: 0.1}
out_channels: 1
downsample_pooling: DWT
downsample_pooling_params: {kernel_size: 4, padding: 2, stride: 2}
follow_official_norm: true
scales: 3
scheduler:
params:
gamma: 0.5
milestones: [200000, 400000, 600000, 800000]
type: MultiStepLR
allow_cache: true
audio_config: {fmax: 8000.0, fmin: 0.0, hop_length: 200, max_norm: 1.0, min_level_db: -100.0,
n_fft: 2048, n_mels: 80, norm_type: mean_std, num_workers: 16, phone_level_feature: true,
preemphasize: false, ref_level_db: 20, sampling_rate: 16000, symmetric: false, trim_silence: true,
trim_silence_threshold_db: 60, wav_normalize: true, win_length: 1000}
batch_max_steps: 9600
batch_size: 16
create_time: '2022-12-26 11:11:35'
discriminator_grad_norm: -1
discriminator_train_start_steps: 0
eval_interval_steps: 10000
generator_grad_norm: -1
generator_train_start_steps: 1
git_revision_hash: 388243c0c173756d1eb34783c02cec4c302cdc25
log_interval_steps: 1000
model_type: hifigan
num_save_intermediate_results: 4
num_workers: 2
pin_memory: true
remove_short_samples: false
save_interval_steps: 20000
train_max_steps: 2500000

5
voices/voices.json Normal file
View File

@ -0,0 +1,5 @@
{
"voices": [
"F7"
]
}