From 4a65fd9221103ff03864337453c238e65d1f4a1b Mon Sep 17 00:00:00 2001 From: oyzh Date: Sat, 15 Feb 2025 11:02:28 +0800 Subject: [PATCH 1/6] fix an args description. --- inference/kernel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/inference/kernel.py b/inference/kernel.py index ae907ad..ba18dca 100644 --- a/inference/kernel.py +++ b/inference/kernel.py @@ -87,7 +87,7 @@ def weight_dequant(x: torch.Tensor, s: torch.Tensor, block_size: int = 128) -> t Args: x (torch.Tensor): The quantized weight tensor of shape (M, N). - s (torch.Tensor): The scale tensor of shape (M, N). + s (torch.Tensor): The scale tensor of shape (M//block_size, N//block_size). block_size (int, optional): The block size to use for dequantization. Defaults to 128. Returns: From d29a967601cc772ede6c475870e3b591f2f89c45 Mon Sep 17 00:00:00 2001 From: huxuedan Date: Wed, 26 Feb 2025 17:06:54 +0800 Subject: [PATCH 2/6] modify the explanation of MLA --- inference/model.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/inference/model.py b/inference/model.py index 8f1ab81..c143e97 100644 --- a/inference/model.py +++ b/inference/model.py @@ -392,7 +392,7 @@ def apply_rotary_emb(x: torch.Tensor, freqs_cis: torch.Tensor) -> torch.Tensor: class MLA(nn.Module): """ - Multi-Headed Attention Layer (MLA). + Multi-Head Latent Attention (MLA) Layer. Attributes: dim (int): Dimensionality of the input features. @@ -442,7 +442,7 @@ class MLA(nn.Module): def forward(self, x: torch.Tensor, start_pos: int, freqs_cis: torch.Tensor, mask: Optional[torch.Tensor]): """ - Forward pass for the Multi-Headed Attention Layer (MLA). + Forward pass for the Multi-Head Latent Attention (MLA) Layer. Args: x (torch.Tensor): Input tensor of shape (batch_size, seq_len, dim). From 1ab09c8780cfb8459aefdc46e3965889ecec9881 Mon Sep 17 00:00:00 2001 From: shihaobai Date: Mon, 3 Mar 2025 19:23:08 +0800 Subject: [PATCH 3/6] Docs: add LightLLM as supported engine --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b1fdbef..b388ae9 100644 --- a/README.md +++ b/README.md @@ -233,8 +233,9 @@ DeepSeek-V3 can be deployed locally using the following hardware and open-source 3. **LMDeploy**: Enables efficient FP8 and BF16 inference for local and cloud deployment. 4. **TensorRT-LLM**: Currently supports BF16 inference and INT4/8 quantization, with FP8 support coming soon. 5. **vLLM**: Support DeepSeek-V3 model with FP8 and BF16 modes for tensor parallelism and pipeline parallelism. -6. **AMD GPU**: Enables running the DeepSeek-V3 model on AMD GPUs via SGLang in both BF16 and FP8 modes. -7. **Huawei Ascend NPU**: Supports running DeepSeek-V3 on Huawei Ascend devices. +6. **LightLLM**: Supports single-node or multi-node deployment with DeepSeek-V3 FP8 and BF16. +7. **AMD GPU**: Enables running the DeepSeek-V3 model on AMD GPUs via SGLang in both BF16 and FP8 modes. +8. **Huawei Ascend NPU**: Supports running DeepSeek-V3 on Huawei Ascend devices. Since FP8 training is natively adopted in our framework, we only provide FP8 weights. If you require BF16 weights for experimentation, you can use the provided conversion script to perform the transformation. @@ -328,11 +329,15 @@ For comprehensive step-by-step instructions on running DeepSeek-V3 with LMDeploy [vLLM](https://github.com/vllm-project/vllm) v0.6.6 supports DeepSeek-V3 inference for FP8 and BF16 modes on both NVIDIA and AMD GPUs. Aside from standard techniques, vLLM offers _pipeline parallelism_ allowing you to run this model on multiple machines connected by networks. For detailed guidance, please refer to the [vLLM instructions](https://docs.vllm.ai/en/latest/serving/distributed_serving.html). Please feel free to follow [the enhancement plan](https://github.com/vllm-project/vllm/issues/11539) as well. -### 6.6 Recommended Inference Functionality with AMD GPUs +### 6.6 Inference with LightLLM (recommended) + +[LightLLM](https://github.com/ModelTC/lightllm/tree/main) LightLLM v1.0.1 supports single-machine and multi-machine tensor parallelism deployment for DeepSeek-R1 (FP8/BF16), achieving state-of-the-art performance. For more details, please refer to [LightLLM instructions](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html). Additionally, LightLLM offers PD-disaggregation deployment for DeepSeek-V2, and the implementation of PD-disaggregation for DeepSeek-V3 is in development. + +### 6.7 Recommended Inference Functionality with AMD GPUs In collaboration with the AMD team, we have achieved Day-One support for AMD GPUs using SGLang, with full compatibility for both FP8 and BF16 precision. For detailed guidance, please refer to the [SGLang instructions](#63-inference-with-lmdeploy-recommended). -### 6.7 Recommended Inference Functionality with Huawei Ascend NPUs +### 6.8 Recommended Inference Functionality with Huawei Ascend NPUs The [MindIE](https://www.hiascend.com/en/software/mindie) framework from the Huawei Ascend community has successfully adapted the BF16 version of DeepSeek-V3. For step-by-step guidance on Ascend NPUs, please follow the [instructions here](https://modelers.cn/models/MindIE/deepseekv3). From 73f2954fa800fb133cd58072f6b5a2ee49e69251 Mon Sep 17 00:00:00 2001 From: shihaobai Date: Mon, 3 Mar 2025 20:10:18 +0800 Subject: [PATCH 4/6] polish --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b388ae9..920ca51 100644 --- a/README.md +++ b/README.md @@ -233,7 +233,7 @@ DeepSeek-V3 can be deployed locally using the following hardware and open-source 3. **LMDeploy**: Enables efficient FP8 and BF16 inference for local and cloud deployment. 4. **TensorRT-LLM**: Currently supports BF16 inference and INT4/8 quantization, with FP8 support coming soon. 5. **vLLM**: Support DeepSeek-V3 model with FP8 and BF16 modes for tensor parallelism and pipeline parallelism. -6. **LightLLM**: Supports single-node or multi-node deployment with DeepSeek-V3 FP8 and BF16. +6. **LightLLM**: Supports efficient single-node or multi-node deployment for FP8 and BF16. 7. **AMD GPU**: Enables running the DeepSeek-V3 model on AMD GPUs via SGLang in both BF16 and FP8 modes. 8. **Huawei Ascend NPU**: Supports running DeepSeek-V3 on Huawei Ascend devices. @@ -331,7 +331,7 @@ For comprehensive step-by-step instructions on running DeepSeek-V3 with LMDeploy ### 6.6 Inference with LightLLM (recommended) -[LightLLM](https://github.com/ModelTC/lightllm/tree/main) LightLLM v1.0.1 supports single-machine and multi-machine tensor parallelism deployment for DeepSeek-R1 (FP8/BF16), achieving state-of-the-art performance. For more details, please refer to [LightLLM instructions](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html). Additionally, LightLLM offers PD-disaggregation deployment for DeepSeek-V2, and the implementation of PD-disaggregation for DeepSeek-V3 is in development. +[LightLLM](https://github.com/ModelTC/lightllm/tree/main) LightLLM v1.0.1 supports single-machine and multi-machine tensor parallel deployment for DeepSeek-R1 (FP8/BF16) and provides mixed-precision deployment, with more quantization modes continuously integrated. For more details, please refer to [LightLLM instructions](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html). Additionally, LightLLM offers PD-disaggregation deployment for DeepSeek-V2, and the implementation of PD-disaggregation for DeepSeek-V3 is in development. ### 6.7 Recommended Inference Functionality with AMD GPUs From 408e6e188a6a583370c66d3c0f85fc8a68b8a4c6 Mon Sep 17 00:00:00 2001 From: shihaobai <42648726+shihaobai@users.noreply.github.com> Date: Mon, 3 Mar 2025 20:16:37 +0800 Subject: [PATCH 5/6] Update README.md polish --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 920ca51..9e5f08b 100644 --- a/README.md +++ b/README.md @@ -331,7 +331,7 @@ For comprehensive step-by-step instructions on running DeepSeek-V3 with LMDeploy ### 6.6 Inference with LightLLM (recommended) -[LightLLM](https://github.com/ModelTC/lightllm/tree/main) LightLLM v1.0.1 supports single-machine and multi-machine tensor parallel deployment for DeepSeek-R1 (FP8/BF16) and provides mixed-precision deployment, with more quantization modes continuously integrated. For more details, please refer to [LightLLM instructions](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html). Additionally, LightLLM offers PD-disaggregation deployment for DeepSeek-V2, and the implementation of PD-disaggregation for DeepSeek-V3 is in development. +[LightLLM](https://github.com/ModelTC/lightllm/tree/main) v1.0.1 supports single-machine and multi-machine tensor parallel deployment for DeepSeek-R1 (FP8/BF16) and provides mixed-precision deployment, with more quantization modes continuously integrated. For more details, please refer to [LightLLM instructions](https://lightllm-en.readthedocs.io/en/latest/getting_started/quickstart.html). Additionally, LightLLM offers PD-disaggregation deployment for DeepSeek-V2, and the implementation of PD-disaggregation for DeepSeek-V3 is in development. ### 6.7 Recommended Inference Functionality with AMD GPUs From a5d2ad229e6c7162f290f4376e2ecf6292f966c4 Mon Sep 17 00:00:00 2001 From: Shixian Sheng Date: Wed, 26 Mar 2025 08:58:35 -0400 Subject: [PATCH 6/6] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a67a28d..fd120d2 100644 --- a/README.md +++ b/README.md @@ -321,7 +321,7 @@ For comprehensive step-by-step instructions on running DeepSeek-V3 with LMDeploy ### 6.4 Inference with TRT-LLM (recommended) -[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) now supports the DeepSeek-V3 model, offering precision options such as BF16 and INT4/INT8 weight-only. Support for FP8 is currently in progress and will be released soon. You can access the custom branch of TRTLLM specifically for DeepSeek-V3 support through the following link to experience the new features directly: https://github.com/NVIDIA/TensorRT-LLM/tree/deepseek/examples/deepseek_v3. +[TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) now supports the DeepSeek-V3 model, offering precision options such as BF16 and INT4/INT8 weight-only. Support for FP8 is currently in progress and will be released soon. You can access the custom branch of TRTLLM specifically for DeepSeek-V3 support through the following link to experience the new features directly: https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/deepseek_v3. ### 6.5 Inference with vLLM (recommended)