diff --git a/.gitattributes b/.gitattributes index 53d7257..199aaf6 100644 --- a/.gitattributes +++ b/.gitattributes @@ -44,4 +44,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.tar filter=lfs diff=lfs merge=lfs -text *.wasm filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text -*tfevents* filter=lfs diff=lfs merge=lfs -text \ No newline at end of file +*tfevents* filter=lfs diff=lfs merge=lfs -text +flow.decoder.estimator.fp16.Volta.plan filter=lfs diff=lfs merge=lfs -text +campplus.onnx filter=lfs diff=lfs merge=lfs -text +flow.decoder.estimator.fp32.onnx filter=lfs diff=lfs merge=lfs -text +speech_tokenizer_v2.onnx filter=lfs diff=lfs merge=lfs -text +flow.pt filter=lfs diff=lfs merge=lfs -text +hift.pt filter=lfs diff=lfs merge=lfs -text +llm.pt filter=lfs diff=lfs merge=lfs -text diff --git a/asset/dingding.png b/asset/dingding.png new file mode 100644 index 0000000..9a64400 Binary files /dev/null and b/asset/dingding.png differ diff --git a/campplus.onnx b/campplus.onnx new file mode 100644 index 0000000..7b08523 --- /dev/null +++ b/campplus.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6ac6a63997761ae2997373e2ee1c47040854b4b759ea41ec48e4e42df0f4d73 +size 28303423 diff --git a/cosyvoice.yaml b/cosyvoice.yaml new file mode 100644 index 0000000..947409d --- /dev/null +++ b/cosyvoice.yaml @@ -0,0 +1,140 @@ +# set random seed, so that you may reproduce your result. +__set_seed1: !apply:random.seed [1986] +__set_seed2: !apply:numpy.random.seed [1986] +__set_seed3: !apply:torch.manual_seed [1986] +__set_seed4: !apply:torch.cuda.manual_seed_all [1986] + +# fixed params +sample_rate: 24000 +llm_input_size: 896 +llm_output_size: 896 +spk_embed_dim: 192 +qwen_pretrain_path: /mnt/lyuxiang.lx/CosyVoice_github/pretrained_models/CosyVoice2-0.5B/Qwen2-0.5B-CosyVoice-BlankEN + +# model params +# for all class/function included in this repo, we use ! or ! 
for initialization, so that user may find all corresponding class/function according to one single yaml. +# for system/third_party class/function, we do not require this. +llm: !new:cosyvoice.llm.llm.Qwen2LM + llm_input_size: !ref + llm_output_size: !ref + speech_token_size: 6561 + length_normalized_loss: True + lsm_weight: 0 + llm: !new:cosyvoice.llm.llm.Qwen2Encoder + pretrain_path: !ref + sampling: !name:cosyvoice.utils.common.ras_sampling + top_p: 0.8 + top_k: 25 + win_size: 10 + tau_r: 0.1 + +flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec + input_size: 512 + output_size: 80 + spk_embed_dim: !ref + output_type: 'mel' + vocab_size: 6561 + input_frame_rate: 25 + only_mask_loss: True + token_mel_ratio: 2 + pre_lookahead_len: 3 + encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder + output_size: 512 + attention_heads: 8 + linear_units: 2048 + num_blocks: 6 + dropout_rate: 0.1 + positional_dropout_rate: 0.1 + attention_dropout_rate: 0.1 + normalize_before: True + input_layer: 'linear' + pos_enc_layer_type: 'rel_pos_espnet' + selfattention_layer_type: 'rel_selfattn' + input_size: 512 + use_cnn_module: False + macaron_style: False + decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM + in_channels: 240 + n_spks: 1 + spk_emb_dim: 80 + cfm_params: !new:omegaconf.DictConfig + content: + sigma_min: 1e-06 + solver: 'euler' + t_scheduler: 'cosine' + training_cfg_rate: 0.2 + inference_cfg_rate: 0.7 + reg_loss_type: 'l1' + estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder + in_channels: 320 + out_channels: 80 + causal: True + channels: [256] + dropout: 0.0 + attention_head_dim: 64 + n_blocks: 4 + num_mid_blocks: 12 + num_heads: 8 + act_fn: 'gelu' + +hift: !new:cosyvoice.hifigan.generator.HiFTGenerator + in_channels: 80 + base_channels: 512 + nb_harmonics: 8 + sampling_rate: !ref + nsf_alpha: 0.1 + nsf_sigma: 0.003 + nsf_voiced_threshold: 10 + upsample_rates: [8, 5, 3] + upsample_kernel_sizes: [16, 11, 7] + istft_params: +
n_fft: 16 + hop_len: 4 + resblock_kernel_sizes: [3, 7, 11] + resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + source_resblock_kernel_sizes: [7, 7, 11] + source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]] + lrelu_slope: 0.1 + audio_limit: 0.99 + f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor + num_class: 1 + in_channels: 80 + cond_channels: 512 + +# processor functions +parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener +get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer + token_path: !ref + skip_special_tokens: True +allowed_special: 'all' +tokenize: !name:cosyvoice.dataset.processor.tokenize + get_tokenizer: !ref + allowed_special: !ref +filter: !name:cosyvoice.dataset.processor.filter + max_length: 40960 + min_length: 0 + token_max_length: 200 + token_min_length: 1 +resample: !name:cosyvoice.dataset.processor.resample + resample_rate: !ref +feat_extractor: !name:matcha.utils.audio.mel_spectrogram + n_fft: 1920 + num_mels: 80 + sampling_rate: !ref + hop_size: 480 + win_size: 1920 + fmin: 0 + fmax: 8000 + center: False +compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank + feat_extractor: !ref +parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding + normalize: True +shuffle: !name:cosyvoice.dataset.processor.shuffle + shuffle_size: 1000 +sort: !name:cosyvoice.dataset.processor.sort + sort_size: 500 # sort_size should be less than shuffle_size +batch: !name:cosyvoice.dataset.processor.batch + batch_type: 'dynamic' + max_frames_in_batch: 2000 +padding: !name:cosyvoice.dataset.processor.padding \ No newline at end of file diff --git a/flow.decoder.estimator.fp16.Volta.plan b/flow.decoder.estimator.fp16.Volta.plan new file mode 100644 index 0000000..1f6b504 --- /dev/null +++ b/flow.decoder.estimator.fp16.Volta.plan @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f231edf01451fafbc3dc0498a51feb3a264afad43275536c8151fff954ef3c56 +size 161799540 
diff --git a/flow.decoder.estimator.fp32.onnx b/flow.decoder.estimator.fp32.onnx new file mode 100644 index 0000000..e2c9281 --- /dev/null +++ b/flow.decoder.estimator.fp32.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51aed3efa2c153898ea53a780893c920e968dab1d7aec25402bd6c9815d94702 +size 286521895 diff --git a/flow.encoder.fp32.zip b/flow.encoder.fp32.zip new file mode 100644 index 0000000..59a01d6 --- /dev/null +++ b/flow.encoder.fp32.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cf2fdb6cc83952d20a4faade946ab867068b1f5531c95f7b61db932c44c5a43 +size 192366262 diff --git a/flow.pt b/flow.pt new file mode 100644 index 0000000..3d62976 --- /dev/null +++ b/flow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff4c2f867674411e0a08cee702996df13fa67c1cd864c06108da88d16d088541 +size 450575567 diff --git a/hift.pt b/hift.pt new file mode 100644 index 0000000..783dc8c --- /dev/null +++ b/hift.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d4af0d661a416c69544eec83ff9c070dc80c37ee53ef44af3a37d910c95bc21 +size 83364158 diff --git a/llm.pt b/llm.pt new file mode 100644 index 0000000..b8f9334 --- /dev/null +++ b/llm.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b144ef55b51ce8cfb79a73c90dbba0bdaba4e451c0ebcfab20f769264f84a608 +size 2023316821 diff --git a/speech_tokenizer_v2.onnx b/speech_tokenizer_v2.onnx new file mode 100644 index 0000000..a414010 --- /dev/null +++ b/speech_tokenizer_v2.onnx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ce1918400101d13593d64dd35368f0254cf25ea36dee2254e55303142e52d5e +size 496127134