From 4e570a99a705502948c29519bebff9fb43ea079b Mon Sep 17 00:00:00 2001
From: iamvalenciia
Date: Fri, 14 Feb 2025 03:09:07 -0500
Subject: [PATCH] Fix incorrect comment in linear function regarding weight.element_size()

---
 inference/model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/inference/model.py b/inference/model.py
index 40bbf4d..e56f599 100644
--- a/inference/model.py
+++ b/inference/model.py
@@ -143,8 +143,8 @@ def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] =
         quantization-aware computations depending on the input parameters.
 
     Notes:
-        - If `weight` is quantized (e.g., `element_size() == 1`), a dequantized version
-          is used for computation.
+        - If `weight` is in a higher precision format (e.g., float32 or bfloat16), then
+          `element_size() > 1`, and the original weight tensor is used for computation.
         - If `gemm_impl == "bf16"`, dequantization and a `bf16` GEMM operation are applied.
         - For other cases, the function applies quantization to `x` and uses `fp8_gemm` for computation.
     """
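
For context, below is a minimal sketch of the dispatch the corrected note documents. It assumes module-level `gemm_impl` and `block_size` values plus helper functions `weight_dequant`, `act_quant`, and `fp8_gemm` matching the names referenced in the docstring, and that a quantized weight carries a `.scale` attribute; the helpers are stubbed here, so only the control flow is illustrative, not the actual implementation in inference/model.py.

    from typing import Optional

    import torch
    import torch.nn.functional as F

    gemm_impl = "bf16"  # assumed module-level switch between bf16 and fp8 GEMM paths
    block_size = 128    # assumed activation-quantization block size

    def weight_dequant(weight: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
        raise NotImplementedError  # stub: dequantize FP8 weights to bf16

    def act_quant(x: torch.Tensor, block_size: int):
        raise NotImplementedError  # stub: quantize activations, return (x_q, scale)

    def fp8_gemm(x: torch.Tensor, x_scale, weight: torch.Tensor, w_scale):
        raise NotImplementedError  # stub: FP8 matrix multiplication

    def linear(x: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None) -> torch.Tensor:
        if weight.element_size() > 1:
            # Higher-precision weight (e.g., float32 or bfloat16): use it as-is.
            return F.linear(x, weight, bias)
        elif gemm_impl == "bf16":
            # Quantized weight (element_size() == 1): dequantize, then run a bf16 GEMM.
            return F.linear(x, weight_dequant(weight, weight.scale), bias)
        else:
            # Otherwise quantize the activations and dispatch to fp8_gemm.
            x_q, scale = act_quant(x, block_size)
            y = fp8_gemm(x_q, scale, weight, weight.scale)
            return y if bias is None else y + bias

The first branch is the one the old comment mischaracterized: when `element_size() > 1` the weight is already in a higher-precision format and is used directly, with no dequantization involved.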