Generative model
This module is copied into the generated Triton configuration folder to perform the tokenization step.
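In the generated repository, this file typically becomes the Python-backend model.py of the <name>_generate model, with the tokenizer and config files stored next to it in the version folder; at inference time it tokenizes the incoming text and delegates the actual forward passes to the companion <name>_model endpoint.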
GPTModelWrapper (Module, GenerationMixin)
Source code in src/transformer_deploy/utils/generative_model.py
class GPTModelWrapper(Module, GenerationMixin):
    def __init__(
        self, config: PretrainedConfig, device: torch.device, inference: Callable[[torch.Tensor], torch.Tensor]
    ):
        super().__init__()
        self.config: PretrainedConfig = config
        self.device: torch.device = device
        self.inference: Callable[[torch.Tensor], torch.Tensor] = inference
        self.main_input_name = "input_ids"  # https://github.com/huggingface/transformers/pull/14803

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {
            self.main_input_name: input_ids,
        }

    def forward(self, input_ids, **_):
        logits = self.inference(input_ids)
        return CausalLMOutputWithCrossAttentions(logits=logits)
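As a rough usage sketch (not part of the module), the wrapper can be driven outside Triton by any callable that maps input_ids to logits. The dummy_inference callable and GPT2Config below are made up for illustration, and the snippet assumes a transformers release contemporary with this wrapper, in which GenerationMixin can drive a plain torch.nn.Module:

```python
import torch
from transformers import GPT2Config

from transformer_deploy.utils.generative_model import GPTModelWrapper

config = GPT2Config()  # any decoder config exposing vocab_size works here

def dummy_inference(input_ids: torch.Tensor) -> torch.Tensor:
    # stand-in for the Triton call: random logits of shape (batch, seq_len, vocab_size)
    return torch.rand(input_ids.shape[0], input_ids.shape[1], config.vocab_size)

wrapper = GPTModelWrapper(config=config, device=torch.device("cpu"), inference=dummy_inference)
output_ids = wrapper.generate(torch.tensor([[0, 1, 2]]), max_length=8)  # greedy decoding
print(output_ids.shape)  # at most (1, 8)
```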
forward(self, input_ids, **_)
Defines the computation performed at every call.
Should be overridden by all subclasses.
Note: although the recipe for the forward pass needs to be defined within this function, one should call the Module instance afterwards instead of this, since the former takes care of running the registered hooks while the latter silently ignores them.
prepare_inputs_for_generation(self, input_ids, **kwargs)
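Builds the inputs for one generation step: it returns the accumulated input_ids under the "input_ids" key and nothing else, so the full sequence is re-sent to the backend model at every step (no past key/value cache is reused).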
TritonPythonModel
Source code in src/transformer_deploy/utils/generative_model.py
class TritonPythonModel:
    tokenizer: PreTrainedTokenizer
    device: str

    def initialize(self, args: Dict[str, str]) -> None:
        """
        Initialize the tokenization process
        :param args: arguments from Triton config file
        """
        current_path: str = os.path.join(args["model_repository"], args["model_version"])
        self.device = "cpu" if args["model_instance_kind"] == "CPU" else "cuda"
        # more variables in https://github.com/triton-inference-server/python_backend/blob/main/src/python.cc
        model_config = AutoConfig.from_pretrained(current_path)
        target_model = args["model_name"].replace("_generate", "_model")

        def inference_triton(input_ids: torch.Tensor) -> torch.Tensor:
            input_ids = input_ids.type(dtype=torch.int32)
            inputs = [pb_utils.Tensor.from_dlpack("input_ids", torch.to_dlpack(input_ids))]
            inference_request = pb_utils.InferenceRequest(
                model_name=target_model, requested_output_names=["output"], inputs=inputs
            )
            inference_response = inference_request.exec()
            if inference_response.has_error():
                raise pb_utils.TritonModelException(inference_response.error().message())
            else:
                output = pb_utils.get_output_tensor_by_name(inference_response, "output")
                tensor: torch.Tensor = torch.from_dlpack(output.to_dlpack())
                tensor = tensor.cuda()
                return tensor

        self.model = GPTModelWrapper(config=model_config, device=self.device, inference=inference_triton)
        if self.device == "cuda":
            self.model = self.model.cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(current_path)
        # to silence a warning during sequence generation
        self.model.config.pad_token_id = self.tokenizer.eos_token_id

    def execute(self, requests) -> "List[List[pb_utils.Tensor]]":
        """
        Parse and tokenize each request
        :param requests: 1 or more requests received by Triton server.
        :return: text as input tensors
        """
        responses = []
        # for loop for batch requests (batching is disabled in our case)
        for request in requests:
            # binary data typed back to string
            query = [t.decode("UTF-8") for t in pb_utils.get_input_tensor_by_name(request, "TEXT").as_numpy().tolist()]
            tokens: BatchEncoding = self.tokenizer(
                text=query[0], return_tensors=TensorType.PYTORCH, return_attention_mask=False
            )
            # TensorRT uses int32 as input type; ORT does too because we force the format
            input_ids = tokens.input_ids.type(dtype=torch.int32)
            if self.device == "cuda":
                input_ids = input_ids.to("cuda")
            output_seq: torch.Tensor = self.model.generate(input_ids, max_length=32)
            decoded_texts: List[str] = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in output_seq]
            tensor_output = [pb_utils.Tensor("output", np.array(t, dtype=object)) for t in decoded_texts]
            responses.append(pb_utils.InferenceResponse(tensor_output))
        return responses
execute(self, requests)
Parse and tokenize each request
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| requests | | 1 or more requests received by Triton server. | required |
Returns:

| Type | Description |
|---|---|
| List[List[pb_utils.Tensor]] | text as input tensors |
Source code in src/transformer_deploy/utils/generative_model.py
def execute(self, requests) -> "List[List[pb_utils.Tensor]]":
    """
    Parse and tokenize each request
    :param requests: 1 or more requests received by Triton server.
    :return: text as input tensors
    """
    responses = []
    # for loop for batch requests (batching is disabled in our case)
    for request in requests:
        # binary data typed back to string
        query = [t.decode("UTF-8") for t in pb_utils.get_input_tensor_by_name(request, "TEXT").as_numpy().tolist()]
        tokens: BatchEncoding = self.tokenizer(
            text=query[0], return_tensors=TensorType.PYTORCH, return_attention_mask=False
        )
        # TensorRT uses int32 as input type; ORT does too because we force the format
        input_ids = tokens.input_ids.type(dtype=torch.int32)
        if self.device == "cuda":
            input_ids = input_ids.to("cuda")
        output_seq: torch.Tensor = self.model.generate(input_ids, max_length=32)
        decoded_texts: List[str] = [self.tokenizer.decode(seq, skip_special_tokens=True) for seq in output_seq]
        tensor_output = [pb_utils.Tensor("output", np.array(t, dtype=object)) for t in decoded_texts]
        responses.append(pb_utils.InferenceResponse(tensor_output))
    return responses
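For context, a client could call the generated endpoint roughly as in the sketch below, using tritonclient over HTTP. The model name transformer_generate, the server URL and the [1] input shape are assumptions; the generated config.pbtxt is authoritative for all three:

```python
import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="127.0.0.1:8000")
# "TEXT" is the input name read by execute(); one UTF-8 string per request here
text = httpclient.InferInput(name="TEXT", shape=[1], datatype="BYTES")
text.set_data_from_numpy(np.asarray(["My name is"], dtype=object))
result = client.infer(
    model_name="transformer_generate",  # hypothetical name of the *_generate model
    inputs=[text],
    outputs=[httpclient.InferRequestedOutput(name="output", binary_data=False)],
)
print(result.as_numpy("output"))  # the decoded generated text
```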
initialize(self, args)
Initialize the tokenization process
Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| args | Dict[str, str] | arguments from Triton config file | required |
Source code in src/transformer_deploy/utils/generative_model.py
def initialize(self, args: Dict[str, str]) -> None:
    """
    Initialize the tokenization process
    :param args: arguments from Triton config file
    """
    current_path: str = os.path.join(args["model_repository"], args["model_version"])
    self.device = "cpu" if args["model_instance_kind"] == "CPU" else "cuda"
    # more variables in https://github.com/triton-inference-server/python_backend/blob/main/src/python.cc
    model_config = AutoConfig.from_pretrained(current_path)
    target_model = args["model_name"].replace("_generate", "_model")

    def inference_triton(input_ids: torch.Tensor) -> torch.Tensor:
        input_ids = input_ids.type(dtype=torch.int32)
        inputs = [pb_utils.Tensor.from_dlpack("input_ids", torch.to_dlpack(input_ids))]
        inference_request = pb_utils.InferenceRequest(
            model_name=target_model, requested_output_names=["output"], inputs=inputs
        )
        inference_response = inference_request.exec()
        if inference_response.has_error():
            raise pb_utils.TritonModelException(inference_response.error().message())
        else:
            output = pb_utils.get_output_tensor_by_name(inference_response, "output")
            tensor: torch.Tensor = torch.from_dlpack(output.to_dlpack())
            tensor = tensor.cuda()
            return tensor

    self.model = GPTModelWrapper(config=model_config, device=self.device, inference=inference_triton)
    if self.device == "cuda":
        self.model = self.model.cuda()
    self.tokenizer = AutoTokenizer.from_pretrained(current_path)
    # to silence a warning during sequence generation
    self.model.config.pad_token_id = self.tokenizer.eos_token_id
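For reference, a sketch of the args dictionary the Triton Python backend passes to initialize; the values are purely illustrative and only the keys read above are shown (the backend provides more):

```python
args = {
    "model_repository": "/models/transformer_generate",  # path made up for illustration
    "model_version": "1",
    "model_instance_kind": "GPU",  # anything other than "CPU" selects CUDA above
    "model_name": "transformer_generate",  # "_generate" is swapped for "_model" to find the target model
}
```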