Utils
Shared functions related to benchmarks.
compare_outputs(pytorch_output, engine_output)
Compare two model outputs by computing the mean absolute difference between them.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| pytorch_output | ndarray | reference output | required |
| engine_output | ndarray | other engine output | required |
Returns:
| Type | Description |
|---|---|
| float | difference between outputs as a single float |
Source code in src/transformer_deploy/benchmarks/utils.py
```python
def compare_outputs(pytorch_output: np.ndarray, engine_output: np.ndarray) -> float:
    """
    Compare 2 model outputs by computing the mean of absolute value difference between them.
    :param pytorch_output: reference output
    :param engine_output: other engine output
    :return: difference between outputs as a single float
    """
    return np.mean(np.abs(pytorch_output - engine_output))
```
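A minimal usage sketch; the arrays are made-up stand-ins for real model outputs, and the import path is inferred from the source location above:

```python
import numpy as np

from transformer_deploy.benchmarks.utils import compare_outputs

# Made-up outputs standing in for a PyTorch model and an inference engine.
reference = np.array([0.10, 0.20, 0.30], dtype=np.float32)
candidate = np.array([0.11, 0.19, 0.30], dtype=np.float32)

diff = compare_outputs(pytorch_output=reference, engine_output=candidate)
print(f"mean absolute difference: {diff:.4f}")  # ≈ 0.0067
```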
generate_input(seq_len, batch_size, input_names, device='cuda')
Generate dummy inputs.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| seq_len | int | number of tokens per input | required |
| batch_size | int | first dimension of the tensor | required |
| input_names | List[str] | tensor input names to generate | required |
| device | str | where to store tensors (PyTorch only). One of [cpu, cuda] | 'cuda' |
Returns:
| Type | Description |
|---|---|
| Tuple[Dict[str, torch.Tensor], Dict[str, numpy.ndarray]] | a tuple of input dicts, one of PyTorch tensors and one of numpy arrays |
Source code in src/transformer_deploy/benchmarks/utils.py
```python
def generate_input(
    seq_len: int, batch_size: int, input_names: List[str], device: str = "cuda"
) -> Tuple[Dict[str, torch.Tensor], Dict[str, np.ndarray]]:
    """
    Generate dummy inputs.
    :param seq_len: number of tokens per input
    :param batch_size: first dimension of the tensor
    :param input_names: tensor input names to generate
    :param device: where to store tensors (PyTorch only). One of [cpu, cuda]
    :return: a tuple of input dicts, PyTorch and numpy
    """
    assert device in ["cpu", "cuda"]
    shape = (batch_size, seq_len)
    inputs_pytorch: Dict[str, torch.Tensor] = dict()
    for name in input_names:
        inputs_pytorch[name] = torch.ones(size=shape, dtype=torch.int32, device=device)
    inputs_onnx: Dict[str, np.ndarray] = {
        k: np.ascontiguousarray(v.detach().cpu().numpy()) for k, v in inputs_pytorch.items()
    }
    return inputs_pytorch, inputs_onnx
```
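A minimal usage sketch; the input names below are typical for BERT-like models but are an assumption, not something the function requires:

```python
from transformer_deploy.benchmarks.utils import generate_input

inputs_pytorch, inputs_onnx = generate_input(
    seq_len=16,
    batch_size=4,
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    device="cpu",  # switch to "cuda" when benchmarking on GPU
)
print(inputs_pytorch["input_ids"].shape)  # torch.Size([4, 16])
print(inputs_onnx["input_ids"].dtype)     # int32
```

Note that the tensors are filled with ones, so these inputs exercise shapes and dtypes rather than realistic token distributions.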
generate_multiple_inputs(seq_len, batch_size, input_names, nb_inputs_to_gen, device)
Generate multiple dummy inputs (each batch is produced by generate_input above).
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| seq_len | int | sequence length to generate | required |
| batch_size | int | number of sequences per batch to generate | required |
| input_names | List[str] | tensor input names to generate | required |
| nb_inputs_to_gen | int | number of batches of sequences to generate | required |
| device | str | one of [cpu, cuda] | required |
Returns:
| Type | Description |
|---|---|
| Tuple[List[Dict[str, torch.Tensor]], List[Dict[str, numpy.ndarray]]] | generated sequences |
Source code in src/transformer_deploy/benchmarks/utils.py
```python
def generate_multiple_inputs(
    seq_len: int, batch_size: int, input_names: List[str], nb_inputs_to_gen: int, device: str
) -> Tuple[List[Dict[str, torch.Tensor]], List[Dict[str, np.ndarray]]]:
    """
    Generate multiple dummy inputs.
    :param seq_len: sequence length to generate
    :param batch_size: number of sequences per batch to generate
    :param input_names: tensor input names to generate
    :param nb_inputs_to_gen: number of batches of sequences to generate
    :param device: one of [cpu, cuda]
    :return: generated sequences
    """
    all_inputs_pytorch: List[Dict[str, torch.Tensor]] = list()
    all_inputs_onnx: List[Dict[str, np.ndarray]] = list()
    for _ in range(nb_inputs_to_gen):
        inputs_pytorch, inputs_onnx = generate_input(
            seq_len=seq_len, batch_size=batch_size, input_names=input_names, device=device
        )
        all_inputs_pytorch.append(inputs_pytorch)
        all_inputs_onnx.append(inputs_onnx)
    return all_inputs_pytorch, all_inputs_onnx
```
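Building on generate_input, a short sketch that prepares several identical batches for a benchmarking loop (names and sizes are illustrative):

```python
from transformer_deploy.benchmarks.utils import generate_multiple_inputs

all_inputs_pytorch, all_inputs_onnx = generate_multiple_inputs(
    seq_len=16,
    batch_size=4,
    input_names=["input_ids", "attention_mask"],
    nb_inputs_to_gen=10,
    device="cpu",
)
assert len(all_inputs_pytorch) == len(all_inputs_onnx) == 10
```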
print_timings(name, timings)
Format and print inference latencies.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| name | str | inference engine name | required |
| timings | List[float] | latencies measured during inference | required |
Source code in src/transformer_deploy/benchmarks/utils.py
```python
def print_timings(name: str, timings: List[float]) -> None:
    """
    Format and print inference latencies.
    :param name: inference engine name
    :param timings: latencies measured during inference
    """
    mean_time = 1e3 * np.mean(timings)
    std_time = 1e3 * np.std(timings)
    min_time = 1e3 * np.min(timings)
    max_time = 1e3 * np.max(timings)
    median, percent_95_time, percent_99_time = 1e3 * np.percentile(timings, [50, 95, 99])
    print(
        f"[{name}] "
        f"mean={mean_time:.2f}ms, "
        f"sd={std_time:.2f}ms, "
        f"min={min_time:.2f}ms, "
        f"max={max_time:.2f}ms, "
        f"median={median:.2f}ms, "
        f"95p={percent_95_time:.2f}ms, "
        f"99p={percent_99_time:.2f}ms"
    )
```
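An illustrative call with made-up latencies; timings are expected in seconds, since the function multiplies by 1e3 to report milliseconds:

```python
from transformer_deploy.benchmarks.utils import print_timings

fake_latencies = [0.012, 0.011, 0.013, 0.012, 0.025]  # seconds per inference
print_timings(name="some engine", timings=fake_latencies)
# -> [some engine] mean=14.60ms, sd=5.24ms, min=11.00ms, max=25.00ms, ...
```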
setup_logging(level=20)
Set up the generic Python logger.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| level | int | logger level | 20 |
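The source listing for this helper is not reproduced here; note that the default level of 20 corresponds to logging.INFO. A minimal usage sketch:

```python
import logging

from transformer_deploy.benchmarks.utils import setup_logging

setup_logging(level=logging.DEBUG)  # 10; the default (20) is logging.INFO
```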
to_numpy(tensors)
Convert a list of torch / numpy tensors to a single numpy tensor.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| tensors | List[Union[numpy.ndarray, torch.Tensor]] | list of torch / numpy tensors | required |
Returns:
| Type | Description |
|---|---|
| ndarray | numpy tensor |
Source code in src/transformer_deploy/benchmarks/utils.py
```python
def to_numpy(tensors: List[Union[np.ndarray, torch.Tensor]]) -> np.ndarray:
    """
    Convert a list of torch / numpy tensors to a numpy tensor
    :param tensors: list of torch / numpy tensors
    :return: numpy tensor
    """
    if isinstance(tensors[0], torch.Tensor):
        pytorch_output = [t.detach().cpu().numpy() for t in tensors]
    elif isinstance(tensors[0], np.ndarray):
        pytorch_output = tensors
    else:
        raise Exception(f"unknown tensor type: {type(tensors[0])}")
    return np.asarray(pytorch_output)
```
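A quick sketch showing how a list of same-shape torch tensors is stacked into a single numpy array (shapes are illustrative):

```python
import torch

from transformer_deploy.benchmarks.utils import to_numpy

outputs = [torch.ones(2, 3), torch.zeros(2, 3)]
stacked = to_numpy(outputs)
print(stacked.shape)  # (2, 2, 3)
```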
track_infer_time(buffer)
A context manager to perform latency measurements.
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| buffer | List[int] | a list in which to save the latency of each input | required |
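The source listing is not reproduced here; assuming the context manager appends the elapsed time of its block (in seconds) to buffer, as the description suggests, a typical benchmarking loop looks like:

```python
import time

from transformer_deploy.benchmarks.utils import print_timings, track_infer_time

timings = []
for _ in range(5):
    with track_infer_time(buffer=timings):
        time.sleep(0.01)  # stand-in for a real model inference call
print_timings(name="dummy model", timings=timings)
```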