From 5ee97a83f0457d0d805b862aeb387358e1801e6d Mon Sep 17 00:00:00 2001
From: Xingkai Yu <38156925+GeeeekExplorer@users.noreply.github.com>
Date: Fri, 7 Feb 2025 16:42:55 +0800
Subject: [PATCH] fix comment

---
 inference/model.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/inference/model.py b/inference/model.py
index 2ec1b20..40bbf4d 100644
--- a/inference/model.py
+++ b/inference/model.py
@@ -143,7 +143,7 @@ def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] =
         quantization-aware computations depending on the input parameters.
 
     Notes:
-        - If `weight` is quantized (e.g., `element_size() > 1`), a dequantized version
+        - If `weight` is quantized (e.g., `element_size() == 1`), a dequantized version
           is used for computation.
         - If `gemm_impl == "bf16"`, dequantization and a `bf16` GEMM operation are applied.
         - For other cases, the function applies quantization to `x` and uses `fp8_gemm` for computation.
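
The corrected comment matches the dispatch logic the docstring describes: an
fp8-quantized weight stores one byte per element, so `element_size() == 1` is the
quantized case, while bf16/fp32 weights report 2/4 bytes per element. A minimal
PyTorch sketch (illustrative only, not the repository's code) confirming the check:

import torch

# element_size() returns the number of bytes per element, so an fp8 weight
# reports 1 while a bf16 weight reports 2; hence `element_size() == 1`,
# not `> 1`, identifies a quantized weight tensor.
w_fp8 = torch.empty(4, 4, dtype=torch.float8_e4m3fn)
w_bf16 = torch.empty(4, 4, dtype=torch.bfloat16)
print(w_fp8.element_size())   # 1
print(w_bf16.element_size())  # 2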