# DeepSeek-V3/inference/auto_fp8/config.py
# Adapted from AutoFP8 (deprecated project)
import re
from typing import List, Optional, Tuple


class BaseQuantizeConfig:
    """Configuration for model quantization.

    Args:
        quant_method: Type/precision of the quantization method to use.
            At the moment this is just "fp8", which specifically means
            the fp8_e4m3 format in PyTorch.
        activation_scheme: Choice of either "dynamic" or "static" quantization
            of activations. If "static", calibration samples are required
            during quantization to produce accurate per-tensor scales for
            the activations of Linear modules.
        ignore_patterns: List of patterns used to ignore layers. If a string
            starts with "re:", everything after the prefix is used for
            Python-regex-style matching, i.e. re.search(), against each
            Linear layer's name. By default, "re:.*lm_head" is included to
            ignore the embedding Linear layer usually found at the end of
            decoder LLMs.
        kv_cache_quant_targets: Tuple of Linear module names to target for
            calibration of the output scales for KV cache quantization.
            Usually, these should be `("k_proj", "v_proj")`.
    """

    def __init__(
        self,
        quant_method: str = "fp8",
        activation_scheme: str = "static",
        ignore_patterns: Optional[List[str]] = None,
        kv_cache_quant_targets: Optional[Tuple[str, ...]] = None,
    ):
        if quant_method != "fp8":
            raise ValueError("Only FP8 quantization is supported.")
        if activation_scheme not in ["static", "dynamic"]:
            raise ValueError(
                "Invalid activation_scheme. Choose either 'static' or 'dynamic'."
            )
        # Use None as the default to avoid a shared mutable default argument.
        if ignore_patterns is None:
            ignore_patterns = ["re:.*lm_head"]
        self.quant_method = quant_method
        self.activation_scheme = activation_scheme
        # Patterns prefixed with "re:" have the prefix stripped and are
        # compiled as regexes; plain names are escaped and matched literally.
        self.re_ignore_patterns = ignore_patterns
        self.ignore_patterns = [
            re.compile(pat[len("re:"):]) if pat.startswith("re:")
            else re.compile(re.escape(pat))
            for pat in ignore_patterns
        ]
        self.kv_cache_quant_targets = kv_cache_quant_targets
        # Filled in later with the names of layers excluded from quantization.
        self.ignored_layers = []
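

# Usage sketch (hypothetical, not from the adapted AutoFP8 code): build a
# config and check which Linear layer names the compiled ignore patterns
# would skip. The module names below are made up for illustration.
if __name__ == "__main__":
    cfg = BaseQuantizeConfig(
        quant_method="fp8",
        activation_scheme="dynamic",
        ignore_patterns=["re:.*lm_head", "re:.*gate$"],
        kv_cache_quant_targets=("k_proj", "v_proj"),
    )
    for name in (
        "model.layers.0.self_attn.q_proj",
        "model.layers.0.mlp.gate",
        "lm_head",
    ):
        skipped = any(pat.search(name) for pat in cfg.ignore_patterns)
        print(f"{name}: {'ignored' if skipped else 'quantized'}")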