Calibration utils
Setup and run quantization calibration
QATCalibrate
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
class QATCalibrate:
    def __init__(self, method: str = "histogram", percentile: float = 99.999, per_channel: bool = True):
        """
        Calibration will learn how a float tensor should be mapped to an integer tensor.
        It will learn the range, bias and scale.
        Quantization targets signed 8-bit integers as it's the best supported type for Nvidia GPUs
        (there are dedicated 8-bit integer tensor cores on most modern Nvidia GPU architectures).
        Don't forget to call setup_model_qat at some point.
        :param method: the calibration method to use. One of [histogram, max].
            Recommended method for transformers is "histogram".
        :param percentile: for the histogram method, the percentile above which values are treated as outliers
        :param per_channel: calibration granularity: per channel (per dimension) if True, per tensor otherwise
        """
        assert torch.cuda.is_available(), "CUDA not available"
        self.model: Optional[PreTrainedModel] = None
        assert method in [
            "histogram",
            "max",
        ], f"unknown calibration method (for NLP): {method}"
        self.calib_method: str = method
        self.calibration_percentile: float = percentile
        self.calibration_per_channel: bool = per_channel

    def setup_nvidia_qat(self) -> None:
        """
        Setup Nvidia QAT library global variables.
        Should be called before initializing a model.
        """
        input_desc = QuantDescriptor(num_bits=8, calib_method=self.calib_method)
        axis = (0,) if self.calibration_per_channel else None
        weight_desc = QuantDescriptor(num_bits=8, axis=axis)
        quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
        quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)

    def setup_model_qat(self, model: PreTrainedModel) -> None:
        """
        Enable calibration on each tensor to quantize.
        :param model: model to optimize
        """
        self.model = model
        model = self.model.cuda()
        # Find the TensorQuantizer and enable calibration
        for name, module in model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                if module._calibrator is not None:
                    module.disable_quant()
                    module.enable_calib()
                else:
                    module.disable()

    def finalize_calibration(self) -> None:
        """
        Disable calibration process and enable quantized nodes.
        """
        calib_method = "max" if self.calib_method == "max" else "percentile"
        for _, module in self.model.named_modules():
            if isinstance(module, quant_nn.TensorQuantizer):
                if module._calibrator is not None:
                    if isinstance(module._calibrator, calib.MaxCalibrator):
                        module.load_calib_amax()
                    else:
                        # strict=False -> avoid an exception when some quantizers are never used
                        # (because of a condition, for instance)
                        module.load_calib_amax(calib_method, percentile=self.calibration_percentile, strict=False)
                    module.enable_quant()
                    module.disable_calib()
                else:
                    module.enable()
        # move the model back to GPU memory
        self.model.cuda()

    @staticmethod
    def restore():
        """
        Restore behavior without quantization support.
        """
        remove_qdq()

    def __enter__(self):
        add_qdq()
        self.setup_nvidia_qat()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if exc_type is None:
            self.finalize_calibration()
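
For context, here is a minimal end-to-end usage sketch of the class as a context manager. The model name, tokenizer, and calibration sentences are placeholder assumptions, not part of this module:

import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

from transformer_deploy.QDQModels.calibration_utils import QATCalibrate

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
with QATCalibrate(method="histogram", percentile=99.999) as qat:
    # instantiate the model *inside* the context manager so the QDQ patches
    # and QuantDescriptor defaults set by __enter__ apply to it
    model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
    qat.setup_model_qat(model)  # switch TensorQuantizer nodes to calibration mode
    with torch.no_grad():
        for text in ["a first calibration sentence", "a second one"]:
            inputs = tokenizer(text, return_tensors="pt").to("cuda")
            model(**inputs)  # forward pass only: quantizers observe tensor ranges

On a clean exit, __exit__ calls finalize_calibration(): amax values are loaded and the quantized (QDQ) nodes are enabled.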
__init__(self, method='histogram', percentile=99.999, per_channel=True)
special
Calibration will learn how a float tensor should be mapped to an integer tensor. It will learn the range, bias and scale. Quantization targets signed 8-bit integers as it's the best supported type for Nvidia GPUs (there are dedicated 8-bit integer tensor cores on most modern Nvidia GPU architectures). Don't forget to call setup_model_qat at some point.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| method | str | the calibration method to use. One of [histogram, max]. Recommended method for transformers is "histogram". | 'histogram' |
| percentile | float | for the histogram method, the percentile above which values are treated as outliers | 99.999 |
| per_channel | bool | calibration granularity: per channel (per dimension) if True, per tensor otherwise | True |
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def __init__(self, method: str = "histogram", percentile: float = 99.999, per_channel: bool = True):
    """
    Calibration will learn how a float tensor should be mapped to an integer tensor.
    It will learn the range, bias and scale.
    Quantization targets signed 8-bit integers as it's the best supported type for Nvidia GPUs
    (there are dedicated 8-bit integer tensor cores on most modern Nvidia GPU architectures).
    Don't forget to call setup_model_qat at some point.
    :param method: the calibration method to use. One of [histogram, max].
        Recommended method for transformers is "histogram".
    :param percentile: for the histogram method, the percentile above which values are treated as outliers
    :param per_channel: calibration granularity: per channel (per dimension) if True, per tensor otherwise
    """
    assert torch.cuda.is_available(), "CUDA not available"
    self.model: Optional[PreTrainedModel] = None
    assert method in [
        "histogram",
        "max",
    ], f"unknown calibration method (for NLP): {method}"
    self.calib_method: str = method
    self.calibration_percentile: float = percentile
    self.calibration_per_channel: bool = per_channel
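
A small sketch of valid constructor calls, following the assertion above. Note that percentile only matters for the histogram method (finalize_calibration ignores it when a MaxCalibrator is in use), and that a CUDA-capable GPU is required:

qat_hist = QATCalibrate(method="histogram", percentile=99.99, per_channel=True)
qat_max = QATCalibrate(method="max")  # percentile is unused with max calibration
# QATCalibrate(method="entropy")  # would fail the assert: unknown calibration method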
finalize_calibration(self)
Disable calibration process and enable quantized nodes.
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def finalize_calibration(self) -> None:
    """
    Disable calibration process and enable quantized nodes.
    """
    calib_method = "max" if self.calib_method == "max" else "percentile"
    for _, module in self.model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                if isinstance(module._calibrator, calib.MaxCalibrator):
                    module.load_calib_amax()
                else:
                    # strict=False -> avoid an exception when some quantizers are never used
                    # (because of a condition, for instance)
                    module.load_calib_amax(calib_method, percentile=self.calibration_percentile, strict=False)
                module.enable_quant()
                module.disable_calib()
            else:
                module.enable()
    # move the model back to GPU memory
    self.model.cuda()
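
Once finalize_calibration has run, each TensorQuantizer carries its learned amax (clipping range). A hedged sanity-check sketch, assuming a calibrated model is at hand:

from pytorch_quantization import nn as quant_nn

for name, module in model.named_modules():
    if isinstance(module, quant_nn.TensorQuantizer):
        print(name, module)  # the repr of a calibrated quantizer includes its amax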
restore()
staticmethod
Restore behavior without quantization support.
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
@staticmethod
def restore():
    """
    Restore behavior without quantization support.
    """
    remove_qdq()
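
Since restore is a static method, it can be called without any instance once calibration and export are done, e.g.:

QATCalibrate.restore()  # remove the QDQ patches and return to vanilla modules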
setup_model_qat(self, model)
Enable calibration on each tensor to quantize.
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| model | PreTrainedModel | model to optimize | required |
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def setup_model_qat(self, model: PreTrainedModel) -> None:
    """
    Enable calibration on each tensor to quantize.
    :param model: model to optimize
    """
    self.model = model
    model = self.model.cuda()
    # Find the TensorQuantizer and enable calibration
    for name, module in model.named_modules():
        if isinstance(module, quant_nn.TensorQuantizer):
            if module._calibrator is not None:
                module.disable_quant()
                module.enable_calib()
            else:
                module.disable()
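
A sketch of the calibration pass that typically follows this call; calibration_dataloader is a placeholder for any iterable of tokenized batches, not part of this module:

qat.setup_model_qat(model)  # quantizers now observe instead of quantizing
model.eval()
with torch.no_grad():
    for batch in calibration_dataloader:  # hypothetical iterable of dicts of tensors
        model(**{k: v.to("cuda") for k, v in batch.items()})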
setup_nvidia_qat(self)
Setup Nvidia QAT library global variables. Should be called before initializing a model.
Source code in src/transformer_deploy/QDQModels/calibration_utils.py
def setup_nvidia_qat(self) -> None:
    """
    Setup Nvidia QAT library global variables.
    Should be called before initializing a model.
    """
    input_desc = QuantDescriptor(num_bits=8, calib_method=self.calib_method)
    axis = (0,) if self.calibration_per_channel else None
    weight_desc = QuantDescriptor(num_bits=8, axis=axis)
    quant_nn.QuantLinear.set_default_quant_desc_input(input_desc)
    quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc)
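
Note that these QuantDescriptor defaults are global and only affect QuantLinear modules instantiated afterwards, which is why this method must run before the model is built. A minimal illustration, assuming pytorch-quantization is installed:

from pytorch_quantization import nn as quant_nn

qat = QATCalibrate(per_channel=False)
qat.setup_nvidia_qat()
layer = quant_nn.QuantLinear(16, 8)  # picks up the per-tensor weight descriptor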