From 5ee5fa4832e658ab84298806de06ed2a167834bd Mon Sep 17 00:00:00 2001 From: msr2000 Date: Thu, 29 May 2025 21:16:31 +0800 Subject: [PATCH] Small fix --- README.md | 2 +- config.json | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6626e50..87feae6 100644 --- a/README.md +++ b/README.md @@ -118,7 +118,7 @@ Compared to previous versions of DeepSeek-R1, the usage recommendations for Deep 1. System prompt is supported now. 2. It is not required to add "\\n" at the beginning of the output to force the model into thinking pattern. -The model architecture of DeepSeek-R1-0528-Qwen3-8B is identical to that of Qwen3-8B, but it shares the same tokenizer configuration as DeepSeek-R1-0528. This model can be run in the same manner as Qwen3-8B. +The model architecture of DeepSeek-R1-0528-Qwen3-8B is identical to that of Qwen3-8B, but it shares the same tokenizer configuration as DeepSeek-R1-0528. This model can be run in the same manner as Qwen3-8B, but it is essential to ensure that all configuration files are sourced from our repository rather than the original Qwen3 project. ### System Prompt In the official DeepSeek web/app, we use the same system prompt with a specific date. diff --git a/config.json b/config.json index 57c7962..2bbb88e 100644 --- a/config.json +++ b/config.json @@ -21,7 +21,8 @@ "rope_scaling": { "rope_type": "yarn", "factor": 4.0, - "original_max_position_embeddings": 32768 + "original_max_position_embeddings": 32768, + "attn_factor": 0.8782488562869419 }, "rope_theta": 1000000, "sliding_window": null,