diff --git a/README.md b/README.md index 6626e50..87feae6 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ Compared to previous versions of DeepSeek-R1, the usage recommendations for Deep 1. System prompt is supported now. 2. It is not required to add "\<think\>\n" at the beginning of the output to force the model into thinking pattern. -The model architecture of DeepSeek-R1-0528-Qwen3-8B is identical to that of Qwen3-8B, but it shares the same tokenizer configuration as DeepSeek-R1-0528. This model can be run in the same manner as Qwen3-8B. +The model architecture of DeepSeek-R1-0528-Qwen3-8B is identical to that of Qwen3-8B, but it shares the same tokenizer configuration as DeepSeek-R1-0528. This model can be run in the same manner as Qwen3-8B, but it is essential to ensure that all configuration files are sourced from our repository rather than the original Qwen3 project. ### System Prompt In the official DeepSeek web/app, we use the same system prompt with a specific date. diff --git a/config.json b/config.json index 57c7962..2bbb88e 100644 --- a/config.json +++ b/config.json @@ -21,7 +21,8 @@ "rope_scaling": { "rope_type": "yarn", "factor": 4.0, - "original_max_position_embeddings": 32768 + "original_max_position_embeddings": 32768, + "attn_factor": 0.8782488562869419 }, "rope_theta": 1000000, "sliding_window": null,