Upload folder using ModelScope SDK

This commit is contained in:
Lyxiasum
2025-10-20 09:26:22 +00:00
parent 272415ea30
commit 85e6bd6bd8
22 changed files with 14408 additions and 41 deletions

3
.gitattributes vendored
View File

@@ -45,3 +45,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
model-00001-of-000001.safetensors filter=lfs diff=lfs merge=lfs -text
tokenizer.json filter=lfs diff=lfs merge=lfs -text

View File

@@ -0,0 +1,122 @@
---
pipeline_tag: image-text-to-text
language:
- multilingual
tags:
- deepseek
- vision-language
- ocr
- custom_code
license: mit
---
<div align="center">
<img src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/logo.svg?raw=true" width="60%" alt="DeepSeek AI" />
</div>
<hr>
<div align="center">
<a href="https://www.deepseek.com/" target="_blank">
<img alt="Homepage" src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/badge.svg?raw=true" />
</a>
<a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR" target="_blank">
<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
</a>
</div>
<div align="center">
<a href="https://discord.gg/Tc7c45Zzu5" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da" />
</a>
<a href="https://twitter.com/deepseek_ai" target="_blank">
<img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white" />
</a>
</div>
<p align="center">
<a href="https://github.com/deepseek-ai/DeepSeek-OCR"><b>🌟 Github</b></a> |
<a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR"><b>📥 Model Download</b></a> |
<a href="https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek_OCR_paper.pdf"><b>📄 Paper Link</b></a> |
<a href=""><b>📄 Arxiv Paper Link</b></a> |
</p>
<h2>
<p align="center">
<a href="">DeepSeek-OCR: Contexts Optical Compression</a>
</p>
</h2>
<p align="center">
<img src="assets/fig1.png" style="width: 1000px" align=center>
</p>
<p align="center">
<a href="">Explore the boundaries of visual-text compression.</a>
</p>
## Usage
Inference uses Hugging Face Transformers on NVIDIA GPUs. Requirements were tested on Python 3.12.9 + CUDA 11.8:
```
torch==2.6.0
transformers==4.46.3
tokenizers==0.20.3
einops
addict
easydict
pip install flash-attn==2.7.3 --no-build-isolation
```
```python
from transformers import AutoModel, AutoTokenizer
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda().to(torch.bfloat16)
# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'your_image.jpg'
output_path = 'your/output/dir'
# infer(self, tokenizer, prompt='', image_file='', output_path = ' ', base_size = 1024, image_size = 640, crop_mode = True, test_compress = False, save_results = False):
# Tiny: base_size = 512, image_size = 512, crop_mode = False
# Small: base_size = 640, image_size = 640, crop_mode = False
# Base: base_size = 1024, image_size = 1024, crop_mode = False
# Large: base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = True)
```
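For convenience, the size presets listed in the comments above can be wrapped in a small helper. This is only a sketch built on top of the documented `infer` signature; the preset dictionary and wrapper function are not part of the model API:
```python
# Hypothetical convenience wrapper around model.infer using the documented presets.
RESOLUTION_PRESETS = {
    "tiny":   dict(base_size=512,  image_size=512,  crop_mode=False),
    "small":  dict(base_size=640,  image_size=640,  crop_mode=False),
    "base":   dict(base_size=1024, image_size=1024, crop_mode=False),
    "large":  dict(base_size=1280, image_size=1280, crop_mode=False),
    "gundam": dict(base_size=1024, image_size=640,  crop_mode=True),
}

def run_ocr(model, tokenizer, image_file, output_path, prompt, mode="gundam"):
    preset = RESOLUTION_PRESETS[mode]
    return model.infer(tokenizer, prompt=prompt, image_file=image_file,
                       output_path=output_path, save_results=True, **preset)
```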
## vLLM
Refer to [🌟GitHub](https://github.com/deepseek-ai/DeepSeek-OCR/) for guidance on model inference acceleration, PDF processing, and more.
## Visualizations
<table>
<tr>
<td><img src="assets/show1.jpg" style="width: 500px"></td>
<td><img src="assets/show2.jpg" style="width: 500px"></td>
</tr>
<tr>
<td><img src="assets/show3.jpg" style="width: 500px"></td>
<td><img src="assets/show4.jpg" style="width: 500px"></td>
</tr>
</table>
## Acknowledgement
We would like to thank [Vary](https://github.com/Ucas-HaoranWei/Vary/), [GOT-OCR2.0](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/), [MinerU](https://github.com/opendatalab/MinerU), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [OneChart](https://github.com/LingyvKong/OneChart), [Slow Perception](https://github.com/Ucas-HaoranWei/Slow-Perception) for their valuable models and ideas.
We also appreciate the benchmarks: [Fox](https://github.com/ucaslcl/Fox), [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
## Citation
Coming soon!

21
LICENSE Normal file
View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 DeepSeek
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

156
README.md
View File

@@ -1,48 +1,122 @@
---
pipeline_tag: image-text-to-text
language:
- multilingual
tags:
- deepseek
- vision-language
- ocr
- custom_code
license: mit
---
### The contributors of this model have not yet provided a more detailed model introduction. The model files and weights can be found on the "Model Files" page.
#### You can download the model with the git clone command below or via the ModelScope SDK
<div align="center">
<img src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/logo.svg?raw=true" width="60%" alt="DeepSeek AI" />
</div>
<hr>
<div align="center">
<a href="https://www.deepseek.com/" target="_blank">
<img alt="Homepage" src="https://github.com/deepseek-ai/DeepSeek-V2/blob/main/figures/badge.svg?raw=true" />
</a>
<a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR" target="_blank">
<img alt="Hugging Face" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-DeepSeek%20AI-ffc107?color=ffc107&logoColor=white" />
</a>
</div>
<div align="center">
<a href="https://discord.gg/Tc7c45Zzu5" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-DeepSeek%20AI-7289da?logo=discord&logoColor=white&color=7289da" />
</a>
<a href="https://twitter.com/deepseek_ai" target="_blank">
<img alt="Twitter Follow" src="https://img.shields.io/badge/Twitter-deepseek_ai-white?logo=x&logoColor=white" />
</a>
</div>
<p align="center">
<a href="https://github.com/deepseek-ai/DeepSeek-OCR"><b>🌟 Github</b></a> |
<a href="https://huggingface.co/deepseek-ai/DeepSeek-OCR"><b>📥 Model Download</b></a> |
<a href="https://github.com/deepseek-ai/DeepSeek-OCR/blob/main/DeepSeek_OCR_paper.pdf"><b>📄 Paper Link</b></a> |
<a href=""><b>📄 Arxiv Paper Link</b></a> |
</p>
<h2>
<p align="center">
<a href="">DeepSeek-OCR: Contexts Optical Compression</a>
</p>
</h2>
<p align="center">
<img src="assets/fig1.png" style="width: 1000px" align=center>
</p>
<p align="center">
<a href="">Explore the boundaries of visual-text compression.</a>
</p>
## Usage
SDK download:
```bash
# Install ModelScope
pip install modelscope
```
```python
# Download the model via the ModelScope SDK
from modelscope import snapshot_download
model_dir = snapshot_download('deepseek-ai/DeepSeek-OCR')
```
Git download:
```bash
# Download the model via Git
git clone https://www.modelscope.cn/deepseek-ai/DeepSeek-OCR.git
```
Inference uses Hugging Face Transformers on NVIDIA GPUs. Requirements were tested on Python 3.12.9 + CUDA 11.8:
```
torch==2.6.0
transformers==4.46.3
tokenizers==0.20.3
einops
addict
easydict
pip install flash-attn==2.7.3 --no-build-isolation
```
```python
from transformers import AutoModel, AutoTokenizer
import torch
import os
os.environ["CUDA_VISIBLE_DEVICES"] = '0'
model_name = 'deepseek-ai/DeepSeek-OCR'
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, _attn_implementation='flash_attention_2', trust_remote_code=True, use_safetensors=True)
model = model.eval().cuda().to(torch.bfloat16)
# prompt = "<image>\nFree OCR. "
prompt = "<image>\n<|grounding|>Convert the document to markdown. "
image_file = 'your_image.jpg'
output_path = 'your/output/dir'
# infer(self, tokenizer, prompt='', image_file='', output_path = ' ', base_size = 1024, image_size = 640, crop_mode = True, test_compress = False, save_results = False):
# Tiny: base_size = 512, image_size = 512, crop_mode = False
# Small: base_size = 640, image_size = 640, crop_mode = False
# Base: base_size = 1024, image_size = 1024, crop_mode = False
# Large: base_size = 1280, image_size = 1280, crop_mode = False
# Gundam: base_size = 1024, image_size = 640, crop_mode = True
res = model.infer(tokenizer, prompt=prompt, image_file=image_file, output_path = output_path, base_size = 1024, image_size = 640, crop_mode=True, save_results = True, test_compress = True)
```
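If the weights were fetched with the ModelScope SDK as shown above, the local snapshot directory can be passed to `from_pretrained` instead of the Hub id. This is a sketch assuming the snapshot contains the same files as this repository:
```python
from modelscope import snapshot_download
from transformers import AutoModel, AutoTokenizer

model_dir = snapshot_download('deepseek-ai/DeepSeek-OCR')  # local path to the downloaded repo
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModel.from_pretrained(model_dir, _attn_implementation='flash_attention_2',
                                  trust_remote_code=True, use_safetensors=True)
```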
<p style="color: lightgrey;">If you are a contributor to this model, we invite you to complete the model card according to the <a href="https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88" style="color: lightgrey; text-decoration: underline;">model contribution documentation</a>.</p>
## vLLM
Refer to [🌟GitHub](https://github.com/deepseek-ai/DeepSeek-OCR/) for guidance on model inference acceleration, PDF processing, and more.
## Visualizations
<table>
<tr>
<td><img src="assets/show1.jpg" style="width: 500px"></td>
<td><img src="assets/show2.jpg" style="width: 500px"></td>
</tr>
<tr>
<td><img src="assets/show3.jpg" style="width: 500px"></td>
<td><img src="assets/show4.jpg" style="width: 500px"></td>
</tr>
</table>
## Acknowledgement
We would like to thank [Vary](https://github.com/Ucas-HaoranWei/Vary/), [GOT-OCR2.0](https://github.com/Ucas-HaoranWei/GOT-OCR2.0/), [MinerU](https://github.com/opendatalab/MinerU), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [OneChart](https://github.com/LingyvKong/OneChart), [Slow Perception](https://github.com/Ucas-HaoranWei/Slow-Perception) for their valuable models and ideas.
We also appreciate the benchmarks: [Fox](https://github.com/ucaslcl/Fox), [OmniDocBench](https://github.com/opendatalab/OmniDocBench).
## Citation
Coming soon!

BIN
assets/fig1.png Normal file

Binary file not shown.

After: 387 KiB

BIN
assets/show1.jpg Normal file

Binary file not shown.

After: 114 KiB

BIN
assets/show2.jpg Normal file

Binary file not shown.

After: 211 KiB

BIN
assets/show3.jpg Normal file

Binary file not shown.

After: 241 KiB

BIN
assets/show4.jpg Normal file

Binary file not shown.

After: 262 KiB

118
config.json Normal file
View File

@@ -0,0 +1,118 @@
{
"_name_or_path": "deepseek-ai/DeepSeek-OCR",
"candidate_resolutions": [
[
1024,
1024
]
],
"global_view_pos": "head",
"architectures": [
"DeepseekOCRForCausalLM"
],
"auto_map": {
"AutoConfig": "modeling_deepseekocr.DeepseekOCRConfig",
"AutoModel": "modeling_deepseekocr.DeepseekOCRForCausalLM"
},
"language_config": {
"architectures": [
"DeepseekV2ForCausalLM"
],
"auto_map": {
"AutoConfig": "configuration_deepseekv2.DeepseekV2Config",
"AutoModel": "modeling_deepseek.DeepseekV2Model",
"AutoModelForCausalLM": "modeling_deepseek.DeepseekV2ForCausalLM"
},
"bos_token_id": 0,
"eos_token_id": 1,
"first_k_dense_replace": 1,
"hidden_size": 1280,
"intermediate_size": 6848,
"kv_lora_rank": null,
"lm_head": true,
"max_position_embeddings": 8192,
"moe_intermediate_size": 896,
"n_group": 1,
"n_routed_experts": 64,
"n_shared_experts": 2,
"num_attention_heads": 10,
"num_experts_per_tok": 6,
"num_hidden_layers": 12,
"num_key_value_heads": 10,
"q_lora_rank": null,
"qk_nope_head_dim": 0,
"qk_rope_head_dim": 0,
"rm_head": false,
"topk_group": 1,
"topk_method": "greedy",
"torch_dtype": "bfloat16",
"use_mla": false,
"v_head_dim": 0,
"vocab_size": 129280
},
"model_type": "deepseek_vl_v2",
"projector_config": {
"input_dim": 2048,
"model_type": "mlp_projector",
"n_embed": 1280,
"projector_type": "linear"
},
"tile_tag": "2D",
"torch_dtype": "bfloat16",
"transformers_version": "4.46.3",
"vision_config": {
"image_size": 1024,
"mlp_ratio": 3.7362,
"model_name": "deeplip_b_l",
"model_type": "vision",
"width": {
"clip-l-14-224": {
"heads": 16,
"image_size": 224,
"layers": 24,
"patch_size": 14,
"width": 1024
},
"sam_vit_b": {
"downsample_channels": [
512,
1024
],
"global_attn_indexes": [
2,
5,
8,
11
],
"heads": 12,
"layers": 12,
"width": 768
}
}
},
"bos_token_id": 0,
"eos_token_id": 1,
"first_k_dense_replace": 1,
"hidden_size": 1280,
"intermediate_size": 6848,
"kv_lora_rank": null,
"lm_head": true,
"max_position_embeddings": 8192,
"moe_intermediate_size": 896,
"n_group": 1,
"n_routed_experts": 64,
"n_shared_experts": 2,
"num_attention_heads": 10,
"num_experts_per_tok": 6,
"num_hidden_layers": 12,
"num_key_value_heads": 10,
"q_lora_rank": null,
"qk_nope_head_dim": 0,
"qk_rope_head_dim": 0,
"rm_head": false,
"topk_group": 1,
"topk_method": "greedy",
"use_mla": false,
"v_head_dim": 0,
"vocab_size": 129280
}
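As a quick sanity check, the nested configuration can be inspected directly from the JSON; this sketch assumes `config.json` has been downloaded locally alongside the other files:
```python
import json

with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                            # deepseek_vl_v2
print(cfg["language_config"]["n_routed_experts"],   # 64 routed experts,
      cfg["language_config"]["num_experts_per_tok"])  # 6 active per token
print(cfg["vision_config"]["image_size"])           # 1024
```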

1
configuration.json Normal file
View File

@@ -0,0 +1 @@
{"framework": "pytorch", "task": "image-text-to-text", "allow_remote": true}

View File

@@ -0,0 +1,210 @@
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
class DeepseekV2Config(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate an DeepSeek
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the DeepSeek-V2 with multi-latent attention.
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 102400):
Vocabulary size of the Deep model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`DeepseekV2Model`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 11008):
Dimension of the MLP representations.
moe_intermediate_size (`int`, *optional*, defaults to 1407):
Dimension of the MoE representations.
num_hidden_layers (`int`, *optional*, defaults to 32):
Number of hidden layers in the Transformer decoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer decoder.
n_shared_experts (`int`, *optional*, defaults to None):
Number of shared experts, None means dense model.
n_routed_experts (`int`, *optional*, defaults to None):
Number of routed experts, None means dense model.
routed_scaling_factor (`float`, *optional*, defaults to 1.0):
Scaling factor for routed experts.
topk_method (`str`, *optional*, defaults to `gready`):
Topk method used in routed gate.
n_group (`int`, *optional*, defaults to None):
Number of groups for routed experts.
topk_group (`int`, *optional*, defaults to None):
Number of selected groups for each token (ensuring the selected experts are only within `topk_group` groups).
num_experts_per_tok (`int`, *optional*, defaults to None):
Number of selected experts, None means dense model.
moe_layer_freq (`int`, *optional*, defaults to 1):
The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
first_k_dense_replace (`int`, *optional*, defaults to 0):
Number of dense layers in the shallow layers (embed->dense->dense->...->dense->moe->moe...->lm_head); the first k layers are dense.
norm_topk_prob (`bool`, *optional*, defaults to False):
Whether to normalize the weights of the routed experts.
scoring_func (`str`, *optional*, defaults to 'softmax'):
Method of computing expert weights.
aux_loss_alpha (`float`, *optional*, defaults to 0.001):
Auxiliary loss weight coefficient.
seq_aux (`bool`, *optional*, defaults to `True`):
Whether to compute the auxiliary loss for each individual sample.
num_key_value_heads (`int`, *optional*):
This is the number of key_value heads that should be used to implement Grouped Query Attention. If
`num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
`num_key_value_heads=1` the model will use Multi Query Attention (MQA); otherwise GQA is used. When
converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
by meanpooling all the original heads within that group. For more details checkout [this
paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
`num_attention_heads`.
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
pad_token_id (`int`, *optional*):
Padding token id.
bos_token_id (`int`, *optional*, defaults to 1):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 2):
End of stream token id.
pretraining_tp (`int`, *optional*, defaults to 1):
Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
issue](https://github.com/pytorch/pytorch/issues/76232).
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie weight embeddings
rope_theta (`float`, *optional*, defaults to 10000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum.
attention_bias (`bool`, *optional*, defaults to `False`):
Whether to use a bias in the query, key, value and output projection layers during self-attention.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
use_mla (`bool`, *optional*, defaults to `True`): Use multi-latent attention or multi-head attention. If True,
the model will use multi-latent attention, otherwise, it will use multi-head attention.
```python
>>> from transformers import DeepseekV2Model, DeepseekV2Config
>>> # Initializing a Deepseek-V2 style configuration
>>> configuration = DeepseekV2Config()
>>> # Initializing a model from the configuration
>>> model = DeepseekV2Model(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "deepseek_v2"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=102400,
hidden_size=4096,
intermediate_size=11008,
moe_intermediate_size = 1407,
num_hidden_layers=30,
num_attention_heads=32,
num_key_value_heads=32,
n_shared_experts = None,
n_routed_experts = None,
ep_size = 1,
routed_scaling_factor = 1.0,
kv_lora_rank = 512,
q_lora_rank = 1536,
qk_rope_head_dim = 64,
v_head_dim = 128,
qk_nope_head_dim = 128,
topk_method = 'gready',
n_group = None,
topk_group = None,
num_experts_per_tok = None,
moe_layer_freq = 1,
first_k_dense_replace = 0,
norm_topk_prob = False,
scoring_func = 'softmax',
aux_loss_alpha = 0.001,
seq_aux = True,
hidden_act="silu",
max_position_embeddings=2048,
initializer_range=0.02,
rms_norm_eps=1e-6,
use_cache=True,
pad_token_id=None,
bos_token_id=100000,
eos_token_id=100001,
pretraining_tp=1,
tie_word_embeddings=False,
rope_theta=10000.0,
rope_scaling=None,
attention_bias=False,
attention_dropout=0.0,
use_mla=True,
**kwargs,
):
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.moe_intermediate_size = moe_intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.n_shared_experts = n_shared_experts
self.n_routed_experts = n_routed_experts
self.ep_size = ep_size
self.routed_scaling_factor = routed_scaling_factor
self.kv_lora_rank = kv_lora_rank
self.q_lora_rank = q_lora_rank
self.qk_rope_head_dim = qk_rope_head_dim
self.v_head_dim = v_head_dim
self.qk_nope_head_dim = qk_nope_head_dim
self.topk_method = topk_method
self.n_group = n_group
self.topk_group = topk_group
self.num_experts_per_tok = num_experts_per_tok
self.moe_layer_freq = moe_layer_freq
self.first_k_dense_replace = first_k_dense_replace
self.norm_topk_prob = norm_topk_prob
self.scoring_func = scoring_func
self.aux_loss_alpha = aux_loss_alpha
self.seq_aux = seq_aux
# for backward compatibility
if num_key_value_heads is None:
num_key_value_heads = num_attention_heads
self.num_key_value_heads = num_key_value_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.rms_norm_eps = float(rms_norm_eps)
self.pretraining_tp = pretraining_tp
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.attention_bias = attention_bias
self.attention_dropout = attention_dropout
self.use_mla = use_mla
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
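A minimal usage sketch, assuming the module is importable as `configuration_deepseekv2` (the name used in the `auto_map` of `config.json` above); the values mirror the `language_config` block of that file and are not exhaustive:
```python
from configuration_deepseekv2 import DeepseekV2Config

# Roughly the decoder settings from config.json's language_config (sketch only).
config = DeepseekV2Config(
    vocab_size=129280,
    hidden_size=1280,
    intermediate_size=6848,
    moe_intermediate_size=896,
    num_hidden_layers=12,
    num_attention_heads=10,
    num_key_value_heads=10,
    n_shared_experts=2,
    n_routed_experts=64,
    num_experts_per_tok=6,
    first_k_dense_replace=1,
    topk_method="greedy",
    use_mla=False,
)
print(config.num_hidden_layers, config.n_routed_experts)
```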

280
conversation.py Normal file
View File

@@ -0,0 +1,280 @@
"""
From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
"""
import dataclasses
from enum import IntEnum, auto
from typing import Any, Dict, List
class SeparatorStyle(IntEnum):
"""Separator styles."""
DeepSeek = auto()
DeepSeekV2 = auto()
PLAIN = auto()
ALIGNMENT = auto()
@dataclasses.dataclass
class Conversation:
"""A class that manages prompt templates and keeps all conversation history."""
# The name of this template
name: str
# The template of the system prompt
system_template: str = "{system_message}"
# The system message
system_message: str = ""
# The names of two roles
roles: List[str] = (("USER", "ASSISTANT"),)
# All messages. Each item is (role, message).
messages: List[List[str]] = ()
# The number of few shot examples
offset: int = 0
# The separator style and configurations
sep_style: SeparatorStyle = SeparatorStyle.DeepSeek
sep: str = "\n"
sep2: str = None
# Stop criteria (the default one is EOS token)
stop_str: str = None
# Stops generation if meeting any token in this list
stop_token_ids: List[int] = None
def get_prompt(self) -> str:
"""Get the prompt for generation."""
system_prompt = self.system_template.format(system_message=self.system_message)
if self.sep_style == SeparatorStyle.DeepSeek:
seps = [self.sep, self.sep2]
if system_prompt == "" or system_prompt is None:
ret = ""
else:
ret = system_prompt + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
ret += role + ": " + message + seps[i % 2]
else:
ret += role + ":"
return ret
elif self.sep_style == SeparatorStyle.DeepSeekV2:
seps = [self.sep, self.sep2]
if system_prompt == "" or system_prompt is None:
ret = ""
else:
ret = system_prompt + seps[0]
for i, (role, message) in enumerate(self.messages):
if message:
if role == "User":
ret += "<sft▁begin>\n" + message + self.sep #<sft▁begin>User Input<sft▁end>\nResponse<end▁of▁sentence>
else:
ret += message + self.sep2
else:
ret = ret
return ret
elif self.sep_style == SeparatorStyle.PLAIN:
seps = [self.sep, self.sep2]
ret = ""
for i, (role, message) in enumerate(self.messages):
if message:
if type(message) is tuple:
message, _, _ = message
if i % 2 == 0:
ret += message + seps[i % 2]
else:
ret += message + seps[i % 2]
else:
ret += ""
return ret
elif self.sep_style == SeparatorStyle.ALIGNMENT:
seps = [self.sep, self.sep2]
ret = ""
for i, (role, message) in enumerate(self.messages):
if message:
if type(message) is tuple:
message, _, _ = message
if i % 2 == 0:
ret += '<image>\n' + seps[i % 2]
else:
ret += message + seps[i % 2]
else:
ret += ""
return ret
else:
raise ValueError(f"Invalid style: {self.sep_style}")
def set_system_message(self, system_message: str):
"""Set the system message."""
self.system_message = system_message
def append_message(self, role: str, message: str):
"""Append a new message."""
self.messages.append([role, message])
def update_last_message(self, message: str):
"""Update the last output.
The last message is typically set to be None when constructing the prompt,
so we need to update it in-place after getting the response from a model.
"""
self.messages[-1][1] = message
def reset_message(self):
"""Reset a new message."""
self.messages = []
def to_gradio_chatbot(self):
"""Convert the conversation to gradio chatbot format."""
ret = []
for i, (role, msg) in enumerate(self.messages[self.offset :]):
if i % 2 == 0:
ret.append([msg, None])
else:
ret[-1][-1] = msg
return ret
def to_openai_api_messages(self):
"""Convert the conversation to OpenAI chat completion format."""
system_prompt = self.system_template.format(system_message=self.system_message)
ret = [{"role": "system", "content": system_prompt}]
for i, (_, msg) in enumerate(self.messages[self.offset :]):
if i % 2 == 0:
ret.append({"role": "user", "content": msg})
else:
if msg is not None:
ret.append({"role": "assistant", "content": msg})
return ret
def copy(self):
return Conversation(
name=self.name,
system_template=self.system_template,
system_message=self.system_message,
roles=self.roles,
messages=[[x, y] for x, y in self.messages],
offset=self.offset,
sep_style=self.sep_style,
sep=self.sep,
sep2=self.sep2,
stop_str=self.stop_str,
stop_token_ids=self.stop_token_ids,
)
def dict(self):
return {
"template_name": self.name,
"system_message": self.system_message,
"roles": self.roles,
"messages": self.messages,
"offset": self.offset,
}
# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}
def register_conv_template(template: Conversation, override: bool = False):
"""Register a new conversation template."""
if not override:
assert template.name not in conv_templates, f"{template.name} has been registered."
conv_templates[template.name] = template
def get_conv_template(name: str) -> Conversation:
"""Get a conversation template."""
return conv_templates[name].copy()
register_conv_template(
Conversation(
name="deepseek",
system_template="{system_message}",
# system_message="You are a helpful assistant. Please answer truthfully and write out your "
# "thinking step by step to be sure you get the right answer.",
system_message="",
roles=("<|User|>", "<|Assistant|>"),
messages=(),
offset=0,
sep_style=SeparatorStyle.DeepSeek,
sep="\n\n",
sep2="<end▁of▁sentence>",
stop_token_ids=[100001],
stop_str=["User:", "<end▁of▁sentence>"]
)
)
register_conv_template(
Conversation(
name="deepseekv2",
system_template="{system_message}",
# system_message="You are a helpful assistant. Please answer truthfully and write out your "
# "thinking step by step to be sure you get the right answer.",
system_message="",
roles=("<User>", "<Assistant>"),
messages=(),
offset=0,
sep_style=SeparatorStyle.DeepSeek,
sep="",
sep2="<end▁of▁sentence>",
stop_token_ids=[100001],
stop_str=["User:", "<end▁of▁sentence>"]
)
)
register_conv_template(
Conversation(
name="plain",
system_template="",
system_message="",
roles=("", ""),
messages=(),
offset=0,
sep_style=SeparatorStyle.PLAIN,
sep="",
sep2="",
stop_token_ids=[100001],
stop_str=['</s>'],
)
)
register_conv_template(
Conversation(
name="alignment",
system_template="",
system_message="",
roles=("", ""),
messages=(),
offset=0,
sep_style=SeparatorStyle.ALIGNMENT,
sep="",
sep2="",
stop_token_ids=[100001],
stop_str=['</s>'],
)
)
if __name__ == "__main__":
print("deepseek template:")
conv = get_conv_template("deepseek")
conv.append_message(conv.roles[0], "Hello!")
conv.append_message(conv.roles[1], "Hi! This is Tony.")
conv.append_message(conv.roles[0], "Who are you?")
conv.append_message(conv.roles[1], "I am a helpful assistant.")
conv.append_message(conv.roles[0], "How are you?")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
print("deepseekv2 template:")
conv = get_conv_template("deepseekv2")
conv.append_message(conv.roles[0], "Hello!")
conv.append_message(conv.roles[1], "Hi! This is Tony.")
conv.append_message(conv.roles[0], "Who are you?")
conv.append_message(conv.roles[1], "I am a helpful assistant.")
conv.append_message(conv.roles[0], "How are you?")
conv.append_message(conv.roles[1], None)
print(conv.get_prompt())
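As a further usage sketch, the registered "deepseek" template can also render the OCR-style prompt shown in the README; whether the OCR model applies exactly this template internally is an assumption here:
```python
from conversation import get_conv_template

conv = get_conv_template("deepseek")
conv.append_message(conv.roles[0], "<image>\n<|grounding|>Convert the document to markdown. ")
conv.append_message(conv.roles[1], None)  # leave the assistant slot open for generation
print(conv.get_prompt())
# <|User|>: <image>
# <|grounding|>Convert the document to markdown. 
#
# <|Assistant|>:
```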

1058
deepencoder.py Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1169e7cdc28ff2fb6186556acb2175db148ad26a62097df4c45a17e523180d3f
size 6672547120

2717
model.safetensors.index.json Normal file

File diff suppressed because it is too large Load Diff

1037
modeling_deepseekocr.py Normal file

File diff suppressed because it is too large Load Diff

1992
modeling_deepseekv2.py Normal file

File diff suppressed because it is too large Load Diff

28
processor_config.json Normal file
View File

@@ -0,0 +1,28 @@
{
"add_special_token": false,
"candidate_resolutions": [
[
1024,
1024
]
],
"downsample_ratio": 4,
"ignore_id": -100,
"image_mean": [
0.5,
0.5,
0.5
],
"image_std": [
0.5,
0.5,
0.5
],
"image_token": "<image>",
"mask_prompt": false,
"normalize": true,
"pad_token": "<\uff5c\u2581pad\u2581\uff5c>",
"patch_size": 16,
"processor_class": "DeepseekVLV2Processor",
"sft_format": "deepseek"
}
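A minimal sketch of what the normalization fields above imply for preprocessing, assuming the conventional reading of `image_mean`/`image_std` and the 1024x1024 candidate resolution (the real processor may add tiling and padding on top of this):
```python
import numpy as np
from PIL import Image

mean = np.array([0.5, 0.5, 0.5], dtype=np.float32)
std = np.array([0.5, 0.5, 0.5], dtype=np.float32)

img = Image.open("your_image.jpg").convert("RGB").resize((1024, 1024))
pixels = np.asarray(img, dtype=np.float32) / 255.0  # scale to [0, 1]
pixels = (pixels - mean) / std                       # map to roughly [-1, 1]
print(pixels.shape)                                  # (1024, 1024, 3)
```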

39
special_tokens_map.json Normal file
View File

@@ -0,0 +1,39 @@
{
"additional_special_tokens": [
{
"content": "<|User|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
{
"content": "<|Assistant|>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
],
"bos_token": {
"content": "<begin▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"eos_token": {
"content": "<end▁of▁sentence>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
},
"pad_token": {
"content": "<▁pad▁>",
"lstrip": false,
"normalized": false,
"rstrip": false,
"single_word": false
}
}
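The special tokens can be checked after loading the tokenizer; a small sketch, assuming the tokenizer files are fetched from the model id used in the README:
```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('deepseek-ai/DeepSeek-OCR', trust_remote_code=True)
print(tokenizer.bos_token)                  # <begin▁of▁sentence>
print(tokenizer.eos_token)                  # <end▁of▁sentence>
print(tokenizer.pad_token)                  # <▁pad▁>
print(tokenizer.additional_special_tokens)  # ['<|User|>', '<|Assistant|>']
```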

3
tokenizer.json Normal file
View File

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a02f8fd5228c90256bb4f6554c34a579d48f909e5beb232dc4afad870b55a8b4
size 9979544

6661
tokenizer_config.json Normal file

File diff suppressed because it is too large Load Diff