Calibration utils
Setup and run quantization calibration
QATCalibrate
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
class QATCalibrate:
    def __init__(self, method: str = "histogram", percentile: float = 99.999, per_channel: bool = True):
        """
        Calibration will learn how a float tensor should be mapped to an integer tensor.
        It will learn the range, bias and scale.
        Quantization targets signed 8-bit integers as it's the best supported type for Nvidia GPUs
        (there are dedicated 8-bit integer tensor cores on most modern Nvidia GPU architectures).
        Don't forget to call setup_model_qat at some point.
        :param method: the calibration method to use. One of [histogram, max].
            Recommended method for transformers is "histogram".
        :param percentile: for the histogram method, the percentile above which values are treated as outliers
        :param per_channel: calibration granularity: per channel (per dimension) if True, per tensor otherwise
        """
        assert torch.cuda.is_available(), "CUDA not available"
        self.model: Optional[PreTrainedModel] = None
        assert method in [
            "histogram",
            "max",
        ], f"unknown calibration method (for NLP): {method}"
        self.calib_method: str = method
        self.calibration_percentile: float = percentile
        self.calibration_per_channel: bool = per_channel

    def setup_nvidia_qat(self) -> None:
        """
        Setup Nvidia QAT library global variables.
        Should be called before initializing a model.
        """
        input_desc = QuantDescriptor(num_bits=8, calib_method=self.calib_method)
        axis = (0,) if self.calibration_per_channel else None
        weight_desc = QuantDescriptor(num_bits=8, axis=axis)
        quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
        quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)

    def setup_model_qat(self, model: PreTrainedModel) -> None:
        """
        Enable calibration on each tensor to quantize.
        :param model: model to optimize
        """
        self.model = model
        model = self.model.cuda()
        # Find the TensorQuantizer and enable calibration
        for name, module in model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                if module._calibrator is not None:
                    module.disable_quant()
                    module.enable_calib()
                else:
                    module.disable()

    def finalize_calibration(self) -> None:
        """
        Disable calibration process and enable quantized nodes.
        """
        calib_method = "max" if self.calib_method == "max" else "percentile"
        for _, module in self.model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                if module._calibrator is not None:
                    if isinstance(module._calibrator, calib.MaxCalibrator):
                        module.load_calib_amax()
                    else:
                        # strict=False -> avoid an exception when some quantizers are never used
                        # (because of a condition, for instance)
                        module.load_calib_amax(calib_method, percentile=self.calibration_percentile, strict=False)
                    module.enable_quant()
                    module.disable_calib()
                else:
                    module.enable()
        # move the model back to GPU memory
        self.model.cuda()

    @staticmethod
    def restore():
        """
        Restore behavior without quantization support.
        """
        remove_qdq()

    def __enter__(self):
        add_qdq()
        self.setup_nvidia_qat()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            self.finalize_calibration()
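
For context, here is a minimal end-to-end usage sketch of the class as a context manager. The model name, tokenizer, and calibration sentences are placeholder assumptions, not part of this module:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from transformer_deploy.QDQModels.calibration_utils import QATCalibrate

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
with QATCalibrate(method="histogram", percentile=99.999) as qat:
    # instantiate the model *inside* the context manager so the QDQ patches
    # and QuantDescriptor defaults set by __enter__ apply to it
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    qat.setup_model_qat(model)  # switch TensorQuantizer nodes to calibration mode
    with torch.no_grad():
        for text in ["a first calibration sentence", "a second one"]:
            inputs = tokenizer(text, return_tensors="pt").to("cuda")
            model(**inputs)  # forward pass only: quantizers observe tensor ranges

On a clean exit, __exit__ calls finalize_calibration(): amax values are loaded and the quantized (QDQ) nodes are enabled.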
__init__(self, method='histogram', percentile=99.999, per_channel=True)
special
Calibration will learn how a float tensor should be mapped to an integer tensor. It will learn the range, bias and scale. Quantization targets signed 8-bit integers as it's the best supported type for Nvidia GPUs (there are dedicated 8-bit integer tensor cores on most modern Nvidia GPU architectures). Don't forget to call setup_model_qat at some point.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| method | str | the calibration method to use. One of [histogram, max]. Recommended method for transformers is "histogram". | 'histogram' |
| percentile | float | for the histogram method, the percentile above which values are treated as outliers | 99.999 |
| per_channel | bool | calibration granularity: per channel (per dimension) if True, per tensor otherwise | True |
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def __init__(self, method: str = "histogram", percentile: float = 99.999, per_channel: bool = True):
    """
    Calibration will learn how a float tensor should be mapped to an integer tensor.
    It will learn the range, bias and scale.
    Quantization targets signed 8-bit integers as it's the best supported type for Nvidia GPUs
    (there are dedicated 8-bit integer tensor cores on most modern Nvidia GPU architectures).
    Don't forget to call setup_model_qat at some point.
    :param method: the calibration method to use. One of [histogram, max].
        Recommended method for transformers is "histogram".
    :param percentile: for the histogram method, the percentile above which values are treated as outliers
    :param per_channel: calibration granularity: per channel (per dimension) if True, per tensor otherwise
    """
    assert torch.cuda.is_available(), "CUDA not available"
    self.model: Optional[PreTrainedModel] = None
    assert method in [
        "histogram",
        "max",
    ], f"unknown calibration method (for NLP): {method}"
    self.calib_method: str = method
    self.calibration_percentile: float = percentile
    self.calibration_per_channel: bool = per_channel
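
A small sketch of valid constructor calls, following the assertion above. Note that percentile only matters for the histogram method (finalize_calibration ignores it when a MaxCalibrator is in use), and that a CUDA-capable GPU is required:

qat_hist = QATCalibrate(method="histogram", percentile=99.99, per_channel=True)
qat_max = QATCalibrate(method="max")  # percentile is unused with max calibration
# QATCalibrate(method="entropy")  # would fail the assert: unknown calibration method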
finalize_calibration(self)
Disable calibration process and enable quantized nodes.
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def finalize_calibration(self) -> None:
    """
    Disable calibration process and enable quantized nodes.
    """
    calib_method = "max" if self.calib_method == "max" else "percentile"
    for _, module in self.model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    # strict=False -> avoid an exception when some quantizers are never used
                    # (because of a condition, for instance)
                    module.load_calib_amax(calib_method, percentile=self.calibration_percentile, strict=False)
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()
    # move the model back to GPU memory
    self.model.cuda()
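
Once finalize_calibration has run, each TensorQuantizer carries its learned amax (clipping range). A hedged sanity-check sketch, assuming a calibrated model is at hand:

from pytorch_quantization import nn as quant_nn

for name, module in model.named_modules():
    if isinstance(module, quant_nn.TensorQuantizer):
        print(name, module)  # the repr of a calibrated quantizer includes its amax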
restore()
staticmethod
Restore behavior without quantization support.
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
@staticmethod
def restore():
    """
    Restore behavior without quantization support.
    """
    remove_qdq()
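
Since restore is a static method, it can be called without any instance once calibration and export are done, e.g.:

QATCalibrate.restore()  # remove the QDQ patches and return to vanilla modules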
setup_model_qat(self, model)
Enable calibration on each tensor to quantize.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model | PreTrainedModel | model to optimize | required |
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def setup_model_qat(self, model: PreTrainedModel) -> None:
    """
    Enable calibration on each tensor to quantize.
    :param model: model to optimize
    """
    self.model = model
    model = self.model.cuda()
    # Find the TensorQuantizer and enable calibration
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
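
A sketch of the calibration pass that typically follows this call; calibration_dataloader is a placeholder for any iterable of tokenized batches, not part of this module:

qat.setup_model_qat(model)  # quantizers now observe instead of quantizing
model.eval()
with torch.no_grad():
    for batch in calibration_dataloader:  # hypothetical iterable of dicts of tensors
        model(**{k: v.to("cuda") for k, v in batch.items()})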
setup_nvidia_qat(self)
Setup Nvidia QAT library global variables. Should be called before initializing a model.
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def setup_nvidia_qat(self) -> None:
    """
    Setup Nvidia QAT library global variables.
    Should be called before initializing a model.
    """
    input_desc = QuantDescriptor(num_bits=8, calib_method=self.calib_method)
    axis = (0,) if self.calibration_per_channel else None
    weight_desc = QuantDescriptor(num_bits=8, axis=axis)
    quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
    quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
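
Note that these QuantDescriptor defaults are global and only affect QuantLinear modules instantiated afterwards, which is why this method must run before the model is built. A minimal illustration, assuming pytorch-quantization is installed:

from pytorch_quantization import nn as quant_nn

qat = QATCalibrate(per_channel=False)
qat.setup_nvidia_qat()
layer = quant_nn.QuantLinear(16, 8)  # picks up the per-tensor weight descriptor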