diff --git a/requirements.txt b/requirements.txt index faef06cf8..a16a15778 100644 --- a/requirements.txt +++ b/requirements.txt @@ -12,4 +12,8 @@ multiprocess packaging x2paddle rarfile -onnx >= 1.6.0 \ No newline at end of file +gradio +tritonclient[all] +attrdict +psutil +onnx >= 1.6.0 diff --git a/visualdl/component/inference/fastdeploy_client/__init__.py b/visualdl/component/inference/fastdeploy_client/__init__.py new file mode 100644 index 000000000..9c19f7b87 --- /dev/null +++ b/visualdl/component/inference/fastdeploy_client/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================= diff --git a/visualdl/component/inference/fastdeploy_client/client_app.py b/visualdl/component/inference/fastdeploy_client/client_app.py new file mode 100644 index 000000000..397b8255a --- /dev/null +++ b/visualdl/component/inference/fastdeploy_client/client_app.py @@ -0,0 +1,409 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ======================================================================= +import gradio as gr +import numpy as np + +from .http_client_manager import get_metric_data +from .http_client_manager import HttpClientManager +from .http_client_manager import metrics_table_head +from .visualizer import visualize_detection +from .visualizer import visualize_face_alignment +from .visualizer import visualize_face_detection +from .visualizer import visualize_headpose +from .visualizer import visualize_keypoint_detection +from .visualizer import visualize_matting +from .visualizer import visualize_ocr +from .visualizer import visualize_segmentation + +_http_manager = HttpClientManager() + +supported_tasks = { + 'detection': visualize_detection, + 'facedet': visualize_face_detection, + 'keypointdetection': visualize_keypoint_detection, + 'segmentation': visualize_segmentation, + 'matting': visualize_matting, + 'ocr': visualize_ocr, + 'facealignment': visualize_face_alignment, + 'headpose': visualize_headpose, + 'unspecified': lambda x: str(x) +} + + +def create_gradio_client_app(): # noqa:C901 + css = """ + .gradio-container { + font-family: 'IBM Plex Sans', sans-serif; + } + .gr-button { + color: white; + border-color: black; + background: black; + } + input[type='range'] { + accent-color: black; + } + .dark input[type='range'] { + accent-color: #dfdfdf; + } + #gallery { + min-height: 22rem; + margin-bottom: 15px; + margin-left: auto; + margin-right: auto; + border-bottom-right-radius: .5rem !important; + border-bottom-left-radius: .5rem !important; + } + #gallery>div>.h-full { + min-height: 20rem; + } + .details:hover { + text-decoration: underline; + } + .gr-button { + white-space: nowrap; + } + .gr-button:focus { + border-color: rgb(147 197 253 / var(--tw-border-opacity)); + outline: none; + box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); + --tw-border-opacity: 1; + --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) \ + var(--tw-ring-offset-color); + --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color); + --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity)); + --tw-ring-opacity: .5; + } + .footer { + margin-bottom: 45px; + margin-top: 35px; + text-align: center; + border-bottom: 1px solid #e5e5e5; + } + .footer>p { + font-size: .8rem; + display: inline-block; + padding: 0 10px; + transform: translateY(10px); + background: white; + } + .dark .footer { + border-color: #303030; + } + .dark .footer>p { + background: #0b0f19; + } + .prompt h4{ + margin: 1.25em 0 .25em 0; + font-weight: bold; + font-size: 115%; + } + """ + + block = gr.Blocks(css=css) + + with block: + gr.HTML(""" +
+            <div style="text-align: center; max-width: 650px; margin: 0 auto;">
+              <h1 style="font-weight: 900; margin-bottom: 7px;">
+                FastDeploy Client
+              </h1>
+              <p style="margin-bottom: 10px; font-size: 94%">
+                The client is used for creating requests to fastdeploy server.
+              </p>
+            </div>
+ """) + with gr.Group(): + with gr.Box(): + with gr.Column(): + with gr.Row(): + server_addr_text = gr.Textbox( + label="服务ip", + show_label=True, + max_lines=1, + placeholder="localhost", + ) + + server_http_port_text = gr.Textbox( + label="推理服务端口", + show_label=True, + max_lines=1, + placeholder="8000", + ) + + server_metric_port_text = gr.Textbox( + label="性能服务端口", + show_label=True, + max_lines=1, + placeholder="8002", + ) + with gr.Row(): + model_name_text = gr.Textbox( + label="模型名称", + show_label=True, + max_lines=1, + placeholder="yolov5", + ) + model_version_text = gr.Textbox( + label="模型版本", + show_label=True, + max_lines=1, + placeholder="1", + ) + + with gr.Box(): + with gr.Tab("组件形式"): + check_button = gr.Button("获取模型输入输出") + component_format_column = gr.Column(visible=False) + with component_format_column: + task_radio = gr.Radio( + choices=list(supported_tasks.keys()), + value='unspecified', + label='任务类型', + visible=True) + gr.Markdown("根据模型需要,挑选文本框或者图像框进行输入") + with gr.Row(): + with gr.Column(): + gr.Markdown("模型输入") + input_accordions = [] + input_name_texts = [] + input_images = [] + input_texts = [] + for i in range(6): + accordion = gr.Accordion( + "输入变量 {}".format(i), + open=True, + visible=False) + with accordion: + input_name_text = gr.Textbox( + label="变量名", interactive=False) + input_image = gr.Image(type='numpy') + input_text = gr.Textbox( + label="文本框", max_lines=1000) + input_accordions.append(accordion) + input_name_texts.append(input_name_text) + input_images.append(input_image) + input_texts.append(input_text) + + with gr.Column(): + gr.Markdown("模型输出") + output_accordions = [] + output_name_texts = [] + output_images = [] + output_texts = [] + for i in range(6): + accordion = gr.Accordion( + "输出变量 {}".format(i), + open=True, + visible=False) + with accordion: + output_name_text = gr.Textbox( + label="变量名", interactive=False) + output_text = gr.Textbox( + label="服务返回的原数据", + interactive=False, + show_label=True) + output_image = gr.Image( + interactive=False) + output_accordions.append(accordion) + output_name_texts.append(output_name_text) + output_images.append(output_image) + output_texts.append(output_text) + component_submit_button = gr.Button("提交请求") + with gr.Tab("原始形式"): + gr.Markdown("模型输入") + raw_payload_text = gr.Textbox( + label="负载数据", max_lines=10000) + with gr.Column(): + gr.Markdown("输出") + output_raw_text = gr.Textbox( + label="服务返回的原始数据", interactive=False) + raw_submit_button = gr.Button("提交请求") + + with gr.Box(): + with gr.Column(): + gr.Markdown("服务性能统计(每次提交请求会自动更新数据,您也可以手动点击更新)") + output_html_table = gr.HTML( + label="metrics", + interactive=False, + show_label=False, + value=metrics_table_head.format('', '')) + update_metric_button = gr.Button("更新统计数据") + + status_text = gr.Textbox( + label="status", + show_label=True, + max_lines=1, + interactive=False) + + all_input_output_components = input_accordions + input_name_texts + input_images + \ + input_texts + output_accordions + output_name_texts + output_images + output_texts + + def get_input_output_name(server_ip, server_port, model_name, + model_version): + try: + server_addr = server_ip + ':' + server_port + input_metas, output_metas = _http_manager.get_model_meta( + server_addr, model_name, model_version) + except Exception as e: + return {status_text: str(e)} + results = { + component: None + for component in all_input_output_components + } + results[component_format_column] = gr.update(visible=True) + # results[check_button] = gr.update(visible=False) + for input_accordio in 
input_accordions: + results[input_accordio] = gr.update(visible=False) + for output_accordio in output_accordions: + results[output_accordio] = gr.update(visible=False) + results[status_text] = 'GetInputOutputName Successful' + for i, input_meta in enumerate(input_metas): + results[input_accordions[i]] = gr.update(visible=True) + results[input_name_texts[i]] = input_meta['name'] + for i, output_meta in enumerate(output_metas): + results[output_accordions[i]] = gr.update(visible=True) + results[output_name_texts[i]] = output_meta['name'] + return results + + def component_inference(*args): + server_ip = args[0] + http_port = args[1] + metric_port = args[2] + model_name = args[3] + model_version = args[4] + names = args[5:5 + len(input_name_texts)] + images = args[5 + len(input_name_texts):5 + len(input_name_texts) + + len(input_images)] + texts = args[5 + len(input_name_texts) + len(input_images):5 + + len(input_name_texts) + len(input_images) + + len(input_texts)] + task_type = args[-1] + server_addr = server_ip + ':' + http_port + if server_ip and http_port and model_name and model_version: + inputs = {} + for i, input_name in enumerate(names): + if input_name: + if images[i] is not None: + inputs[input_name] = np.array([images[i]]) + if texts[i]: + inputs[input_name] = np.array( + [[texts[i].encode('utf-8')]], dtype=np.object_) + try: + infer_results = _http_manager.infer( + server_addr, model_name, model_version, inputs) + results = {status_text: 'Inference Successful'} + for i, (output_name, + data) in enumerate(infer_results.items()): + results[output_name_texts[i]] = output_name + results[output_texts[i]] = str(data) + if task_type != 'unspecified': + try: + results[output_images[i]] = supported_tasks[ + task_type](images[0], data) + except Exception: + results[output_images[i]] = None + if metric_port: + html_table = get_metric_data(server_ip, metric_port) + results[output_html_table] = html_table + return results + except Exception as e: + return {status_text: 'Error: {}'.format(e)} + else: + return { + status_text: + 'Please input server addr, model name and model version.' + } + + def raw_inference(*args): + server_ip = args[0] + http_port = args[1] + metric_port = args[2] + model_name = args[3] + model_version = args[4] + payload_text = args[5] + server_addr = server_ip + ':' + http_port + try: + result = _http_manager.raw_infer(server_addr, model_name, + model_version, payload_text) + results = { + status_text: 'Get response from server', + output_raw_text: result + } + if server_ip and metric_port: + html_table = get_metric_data(server_ip, metric_port) + results[output_html_table] = html_table + return results + except Exception as e: + return {status_text: 'Error: {}'.format(e)} + + def update_metric(server_ip, metrics_port): + if server_ip and metrics_port: + try: + html_table = get_metric_data(server_ip, metrics_port) + return { + output_html_table: html_table, + status_text: "Successfully update metrics." + } + except Exception as e: + return {status_text: 'Error: {}'.format(e)} + else: + return { + status_text: 'Please input server ip and metrics_port.' 
+ } + + check_button.click( + fn=get_input_output_name, + inputs=[ + server_addr_text, server_http_port_text, model_name_text, + model_version_text + ], + outputs=[ + *all_input_output_components, check_button, + component_format_column, status_text + ]) + component_submit_button.click( + fn=component_inference, + inputs=[ + server_addr_text, server_http_port_text, + server_metric_port_text, model_name_text, model_version_text, + *input_name_texts, *input_images, *input_texts, task_radio + ], + outputs=[ + *output_name_texts, *output_images, *output_texts, status_text, + output_html_table + ]) + raw_submit_button.click( + fn=raw_inference, + inputs=[ + server_addr_text, server_http_port_text, + server_metric_port_text, model_name_text, model_version_text, + raw_payload_text + ], + outputs=[output_raw_text, status_text, output_html_table]) + update_metric_button.click( + fn=update_metric, + inputs=[server_addr_text, server_metric_port_text], + outputs=[output_html_table, status_text]) + return block diff --git a/visualdl/component/inference/fastdeploy_client/http_client_manager.py b/visualdl/component/inference/fastdeploy_client/http_client_manager.py new file mode 100644 index 000000000..691594152 --- /dev/null +++ b/visualdl/component/inference/fastdeploy_client/http_client_manager.py @@ -0,0 +1,304 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================= +import json +import re + +import numpy as np +import requests +import tritonclient.http as httpclient +from attrdict import AttrDict +from tritonclient.utils import InferenceServerException + + +def convert_http_metadata_config(metadata): + metadata = AttrDict(metadata) + + return metadata + + +def prepare_request(inputs_meta, inputs_data, outputs_meta): + ''' + inputs_meta: inputs meta information from model. name: info + inputs_data: users input data. 
name: data + ''' + # Set the input data + inputs = [] + for input_dict in inputs_meta: + input_name = input_dict['name'] + if input_name not in inputs_data: + raise RuntimeError( + 'Error: input name {} required for model not existed.'.format( + input_name)) + if input_dict['datatype'] == 'FP32': + inputs_data[input_name] = inputs_data[input_name].astype( + np.float32 + ) / 255 # image data returned by gradio is uint8, convert to fp32 + if len(input_dict['shape'] + ) == 3 and input_dict['shape'][0] == 3: # NCHW + inputs_data[input_name] = inputs_data[input_name][0].transpose( + 2, 0, 1) + elif len(input_dict['shape'] + ) == 4 and input_dict['shape'][1] == 3: # NCHW + inputs_data[input_name] = inputs_data[input_name].transpose( + 0, 3, 1, 2) + infer_input = httpclient.InferInput( + input_name, inputs_data[input_name].shape, input_dict['datatype']) + infer_input.set_data_from_numpy(inputs_data[input_name]) + inputs.append(infer_input) + outputs = [] + for output_dict in outputs_meta: + infer_output = httpclient.InferRequestedOutput(output_dict.name) + outputs.append(infer_output) + return inputs, outputs + + +metrics_table_head = """ + + +
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2">模型名称</th>
+      <th colspan="4">执行统计</th>
+      <th colspan="5">延迟统计</th>
+    </tr>
+    <tr>
+      <th>请求处理成功数</th>
+      <th>请求处理失败数</th>
+      <th>推理batch数</th>
+      <th>推理样本数</th>
+      <th>请求处理时间(ms)</th>
+      <th>任务队列等待时间(ms)</th>
+      <th>输入处理时间(ms)</th>
+      <th>模型推理时间(ms)</th>
+      <th>输出处理时间(ms)</th>
+    </tr>
+  </thead>
+  <tbody>
+    {}
+  </tbody>
+</table>
+<table>
+  <thead>
+    <tr>
+      <th rowspan="2">GPU</th>
+      <th colspan="4">性能指标</th>
+      <th colspan="2">显存</th>
+    </tr>
+    <tr>
+      <th>利用率(%)</th>
+      <th>功率(W)</th>
+      <th>功率限制(W)</th>
+      <th>耗电量(W)</th>
+      <th>总量(GB)</th>
+      <th>已使用(GB)</th>
+    </tr>
+  </thead>
+  <tbody>
+    {}
+  </tbody>
+</table>
+""" + + +def get_metric_data(server_addr, metric_port): # noqa:C901 + ''' + Get metrics data from fastdeploy server, and transform it into html table. + Args: + server_addr(str): fastdeployserver ip address + metric_port(int): fastdeployserver metrics port + Returns: + htmltable(str): html table to show metrics data + ''' + model_table = {} + gpu_table = {} + metric_column_name = { + "Model": { + "nv_inference_request_success", "nv_inference_request_failure", + "nv_inference_count", "nv_inference_exec_count", + "nv_inference_request_duration_us", + "nv_inference_queue_duration_us", + "nv_inference_compute_input_duration_us", + "nv_inference_compute_infer_duration_us", + "nv_inference_compute_output_duration_us" + }, + "GPU": { + "nv_gpu_power_usage", "nv_gpu_power_limit", + "nv_energy_consumption", "nv_gpu_utilization", + "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" + }, + "CPU": { + "nv_cpu_utilization", "nv_cpu_memory_total_bytes", + "nv_cpu_memory_used_bytes" + } + } + try: + res = requests.get("http://{}:{}/metrics".format( + server_addr, metric_port)) + except Exception: + return metrics_table_head.format('', '') + metric_content = res.text + for content in metric_content.split('\n'): + if content.startswith('#'): + continue + else: + res = re.match(r'(\w+){(.*)} (\w+)', + content) # match output by server metrics interface + if not res: + continue + metric_name = res.group(1) + model = res.group(2) + value = res.group(3) + infos = {} + for info in model.split(','): + k, v = info.split('=') + v = v.strip('"') + infos[k] = v + if metric_name in [ + "nv_inference_request_duration_us", + "nv_inference_queue_duration_us", + "nv_inference_compute_input_duration_us", + "nv_inference_compute_infer_duration_us", + "nv_inference_compute_output_duration_us" + ]: + value = str(float(value) / 1000) + elif metric_name in [ + "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" + ]: + value = str(float(value) / 1024 / 1024 / 1024) + for key, metric_names in metric_column_name.items(): + if metric_name in metric_names: + if key == 'Model': + model_name = infos['model'] + if model_name not in model_table: + model_table[model_name] = {} + model_table[model_name][metric_name] = value + elif key == 'GPU': + gpu_name = infos['gpu_uuid'] + if gpu_name not in gpu_table: + gpu_table[gpu_name] = {} + gpu_table[gpu_name][metric_name] = value + elif key == 'CPU': + pass + model_data_list = [] + gpu_data_list = [] + model_data_metric_names = [ + "nv_inference_request_success", "nv_inference_request_failure", + "nv_inference_exec_count", "nv_inference_count", + "nv_inference_request_duration_us", "nv_inference_queue_duration_us", + "nv_inference_compute_input_duration_us", + "nv_inference_compute_infer_duration_us", + "nv_inference_compute_output_duration_us" + ] + gpu_data_metric_names = [ + "nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit", + "nv_energy_consumption", "nv_gpu_memory_total_bytes", + "nv_gpu_memory_used_bytes" + ] + for k, v in model_table.items(): + data = [] + data.append(k) + for data_metric in model_data_metric_names: + data.append(v[data_metric]) + model_data_list.append(data) + for k, v in gpu_table.items(): + data = [] + data.append(k) + for data_metric in gpu_data_metric_names: + data.append(v[data_metric]) + gpu_data_list.append(data) + model_data = '\n'.join([ + "" + '\n'.join(["" + item + "" + for item in data]) + "" + for data in model_data_list + ]) + gpu_data = '\n'.join([ + "" + '\n'.join(["" + item + "" + for item in data]) + "" + for data in 
gpu_data_list + ]) + return metrics_table_head.format(model_data, gpu_data) + + +class HttpClientManager: + def __init__(self): + self.clients = {} # server url: httpclient + + def _create_client(self, server_url): + if server_url in self.clients: + return self.clients[server_url] + try: + fastdeploy_client = httpclient.InferenceServerClient(server_url) + self.clients[server_url] = fastdeploy_client + return fastdeploy_client + except Exception: + raise RuntimeError( + 'Can not connect to server {}, please check your \ + server address'.format(server_url)) + + def infer(self, server_url, model_name, model_version, inputs): + fastdeploy_client = self._create_client(server_url) + input_metadata, output_metadata = self.get_model_meta( + server_url, model_name, model_version) + inputs, outputs = prepare_request(input_metadata, inputs, + output_metadata) + response = fastdeploy_client.infer( + model_name, inputs, model_version=model_version, outputs=outputs) + + results = {} + for output in output_metadata: + result = response.as_numpy(output.name) # datatype: numpy + if output.datatype == 'BYTES': # datatype: bytes + try: + value = result + if len(result.shape) == 1: + value = result[0] + elif len(result.shape) == 2: + value = result[0][0] + elif len(result.shape) == 3: + value = result[0][0][0] + result = json.loads(value) # datatype: json + except Exception: + pass + else: + result = result[0] + results[output.name] = result + return results + + def raw_infer(self, server_url, model_name, model_version, raw_input): + url = 'http://{}/v2/models/{}/versions/{}/infer'.format( + server_url, model_name, model_version) + res = requests.post(url, data=json.dumps(json.loads(raw_input))) + return json.dumps(res.json()) + + def get_model_meta(self, server_url, model_name, model_version): + fastdeploy_client = self._create_client(server_url) + try: + model_metadata = fastdeploy_client.get_model_metadata( + model_name=model_name, model_version=model_version) + except InferenceServerException as e: + raise RuntimeError("Failed to retrieve the metadata: " + str(e)) + + model_metadata = convert_http_metadata_config(model_metadata) + + input_metadata = model_metadata.inputs + output_metadata = model_metadata.outputs + return input_metadata, output_metadata diff --git a/visualdl/component/inference/fastdeploy_client/visualizer.py b/visualdl/component/inference/fastdeploy_client/visualizer.py new file mode 100644 index 000000000..2c6abe0b4 --- /dev/null +++ b/visualdl/component/inference/fastdeploy_client/visualizer.py @@ -0,0 +1,184 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ======================================================================= +import numpy as np + +__all__ = [ + 'visualize_detection', 'visualize_keypoint_detection', + 'visualize_face_detection', 'visualize_face_alignment', + 'visualize_segmentation', 'visualize_matting', 'visualize_ocr', + 'visualize_headpose' +] + + +def visualize_detection(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + boxes = np.array(data['boxes']) + scores = np.array(data['scores']) + label_ids = np.array(data['label_ids']) + masks = np.array(data['masks']) + contain_masks = data['contain_masks'] + detection_result = fd.C.vision.DetectionResult() + detection_result.boxes = boxes + detection_result.scores = scores + detection_result.label_ids = label_ids + detection_result.masks = masks + detection_result.contain_masks = contain_masks + result = fd.vision.vis_detection(image, detection_result) + return result + + +def visualize_keypoint_detection(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + keypoints = np.array(data['keypoints']) + scores = np.array(data['scores']) + num_joints = np.array(data['num_joints']) + + detection_result = fd.C.vision.KeyPointDetectionResult() + detection_result.keypoints = keypoints + detection_result.scores = scores + detection_result.num_joints = num_joints + + result = fd.vision.vis_keypoint_detection(image, detection_result) + return result + + +def visualize_face_detection(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + data = np.array(data['data']) + scores = np.array(data['scores']) + landmarks = np.array(data['landmarks']) + landmarks_per_face = data['landmarks_per_face'] + + detection_result = fd.C.vision.FaceDetectionResult() + detection_result.data = data + detection_result.scores = scores + detection_result.landmarks = landmarks + detection_result.landmarks_per_face = landmarks_per_face + + result = fd.vision.vis_face_detection(image, detection_result) + return result + + +def visualize_face_alignment(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + landmarks = np.array(data['landmarks']) + + facealignment_result = fd.C.vision.FaceAlignmentResult() + facealignment_result.landmarks = landmarks + + result = fd.vision.vis_face_alignment(image, facealignment_result) + return result + + +def visualize_segmentation(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + label_ids = np.array(data['label_ids']) + score_map = np.array(data['score_map']) + shape = np.array(data['shape']) + + segmentation_result = fd.C.vision.SegmentationResult() + segmentation_result.shape = shape + segmentation_result.score_map = score_map + segmentation_result.label_ids = label_ids + + result = 
fd.vision.vis_segmentation(image, segmentation_result) + return result + + +def visualize_matting(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + alpha = np.array(data['alpha']) + foreground = np.array(data['foreground']) + contain_foreground = data['contain_foreground'] + shape = np.array(data['shape']) + + matting_result = fd.C.vision.MattingResult() + matting_result.alpha = alpha + matting_result.foreground = foreground + matting_result.contain_foreground = contain_foreground + matting_result.shape = shape + + result = fd.vision.vis_matting(image, matting_result) + return result + + +def visualize_ocr(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + boxes = np.array(data['boxes']) + text = np.array(data['text']) + rec_scores = np.array(data['rec_scores']) + cls_scores = np.array(data['cls_scores']) + cls_labels = data['cls_labels'] + + ocr_result = fd.C.vision.OCRResult() + ocr_result.boxes = boxes + ocr_result.text = text + ocr_result.rec_scores = rec_scores + ocr_result.cls_scores = cls_scores + ocr_result.cls_labels = cls_labels + + result = fd.vision.vis_ppocr(image, ocr_result) + return result + + +def visualize_headpose(image, data): + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + euler_angles = np.array(data['euler_angles']) + + headpose_result = fd.C.vision.HeadPoseResult() + headpose_result.euler_angles = euler_angles + + result = fd.vision.vis_headpose(image, headpose_result) + return result diff --git a/visualdl/component/inference/fastdeploy_lib.py b/visualdl/component/inference/fastdeploy_lib.py new file mode 100644 index 000000000..5264c6e77 --- /dev/null +++ b/visualdl/component/inference/fastdeploy_lib.py @@ -0,0 +1,790 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================= +import copy +import json +import os +import random +import re +import signal +import string +from collections import defaultdict +from subprocess import Popen +from subprocess import STDOUT + +import google.protobuf.json_format as json_format +import google.protobuf.text_format as text_format +import psutil +import requests + +from .proto.model_config_pb2 import ModelConfig +from visualdl.utils.dir import FASTDEPLOYSERVER_PATH + + +def pbtxt2json(content: str): + ''' + Convert protocol messages in text format to json format string. 
+ ''' + message = text_format.Parse(content, ModelConfig()) + json_string = json_format.MessageToJson(message) + return json_string + + +def json2pbtxt(content: str): + ''' + Convert json format string to protocol messages in text format. + ''' + message = json_format.Parse(content, ModelConfig()) + text_proto = text_format.MessageToString(message) + return text_proto + + +def validate_data(model_config): + ''' + Validate data in model config, we should check empty value recieved from front end. + The easiest way to handle it is to drop empty value. + Args: + model_config: model config to be saved in config file + Return: + model config after filtering. + ''' + model_config_filtered = {} + for key, value in model_config.items(): + if value: + model_config_filtered[key] = value + return model_config_filtered + + +def analyse_config(cur_dir: str): + ''' + Analyse the model config in specified directory. + Return a json object to describe configuration. + ''' + all_model_configs = {} + all_model_versions = {} + parent_dir, sub_dirs, filenames = os.walk(cur_dir).send( + None) # models can only put directory in model repository, + # so we should only search depth 1 directories. + for model_dir_name in sub_dirs: + model_dir, model_sub_dirs, filenames = os.walk( + os.path.join(parent_dir, model_dir_name)).send(None) + model_name = os.path.basename(model_dir) + config_filenames = [] + for filename in filenames: + if '.pbtxt' in filename: + config_filenames.append( + filename + ) # filenames with extension .pbtxt are all config files + if config_filenames: + default_config_filename = config_filenames[0] + if 'config.pbtxt' in config_filenames: + default_config_filename = 'config.pbtxt' + config_filenames.remove(default_config_filename) + config_filenames.insert(0, default_config_filename) + else: + # if no config.pbtxt, we choose the first file in config_filenames list to create config.pbtxt + copy_config_file_to_default_config(model_dir, + default_config_filename) + default_config_filename = 'config.pbtxt' + config_filenames.insert(0, default_config_filename) + json_config = json.loads( + pbtxt2json( + open(os.path.join(model_dir, + default_config_filename)).read())) + json_config["config_filenames"] = config_filenames[ + 0] # add config_filenames to config data (frontend developer said he only wanted one filename, + # and to request config_filenames by get_config_filenames_for_one_model later) + all_model_configs[ + model_name] = json_config # store original config file content in json format + json_config[ + 'name'] = model_name # because name in config data may be different from model_name, + # model_name is model directory name actually, we should conform name with model_name. 
+ else: + continue + for model_sub_dir in model_sub_dirs: + if re.match( + r'\d+', + model_sub_dir): # version directory consists of numbers + if model_name not in all_model_versions: + all_model_versions[model_name] = {} + if model_sub_dir not in all_model_versions[model_name]: + all_model_versions[model_name][model_sub_dir] = [] + for version_resource_file in os.listdir( + os.path.join(model_dir, model_sub_dir)): + all_model_versions[model_name][model_sub_dir].append( + version_resource_file) + if model_name not in all_model_versions: # if a model has config but no version directory, + # to convenient users, we create one + all_model_versions[model_name] = {} + os.mkdir(os.path.join(model_dir, '1')) + all_model_versions[model_name]['1'] = [] + + if not all_model_configs: + raise Exception( + 'The path you choose is not a valid model repository, please choose a valid path.' + ) + return all_model_configs, all_model_versions + + +def exchange_format_to_original_format(exchange_format): + ''' + Change config exchange format to original format. + ''' + ensembles = [] + models = [] + all_models = {} + if 'ensembles' in exchange_format: + ensembles = exchange_format['ensembles'] + if 'models' in exchange_format: + models = exchange_format['models'] + alls = ensembles + models + for model_config in alls: + # 1. add 'executionAccelerators' keyword + if 'optimization' in model_config: + optimization_config = model_config['optimization'] + del model_config['optimization'] + model_config['optimization'] = {} + model_config['optimization'][ + 'executionAccelerators'] = optimization_config + # 2. delete versions information + if 'versions' in model_config: + del model_config['versions'] + if 'config_filenames' in model_config: + del model_config['config_filenames'] + if 'platform' in model_config and model_config[ + 'platform'] == 'ensemble': # emsemble model + # 3. add 'ensembleScheduling' keyword + if 'step' in model_config: + step_configs = model_config['step'] + if 'ensembleScheduling' not in model_config: + model_config['ensembleScheduling'] = {} + model_config['ensembleScheduling']['step'] = step_configs + del model_config['step'] + # 4. remove two virtual models(feed, fetch), and + # "modelType", "inputModels", "outputModels", "inputVars", "outputVars" + remove_list = [] + for model_config_in_step in step_configs: + if model_config_in_step[ + 'modelName'] == 'feed' or model_config_in_step[ + 'modelName'] == 'fetch': + remove_list.append(model_config_in_step) + continue + del model_config_in_step['modelType'] + del model_config_in_step['inputModels'] + del model_config_in_step['outputModels'] + del model_config_in_step['inputVars'] + del model_config_in_step['outputVars'] + for remove_item in remove_list: + step_configs.remove(remove_item) + all_models[model_config['name']] = model_config + return all_models + + +def copy_config_file_to_default_config(model_dir, config_name): + json_config = json.loads( + pbtxt2json(open(os.path.join(model_dir, config_name)).read())) + model_name = os.path.basename(model_dir) + json_config['name'] = model_name + text_proto = json2pbtxt(json.dumps(json_config)) + with open(os.path.join(model_dir, 'config.pbtxt'), 'w') as f: + f.write(text_proto) + + +def original_format_to_exchange_format(original_format, version_info): + ''' + Change config original format to exchange format. + ''' + exchange_format = {} + exchange_format['ensembles'] = [] + exchange_format['models'] = [] + # 0. 
transform version info into component format in frontend + for model_name, version_filenames_dict in version_info.items(): + version_info_for_frontend = [] + for version_name, filenames in version_filenames_dict.items(): + version_filenames_dict_for_frontend = {} + version_filenames_dict_for_frontend['title'] = version_name + version_filenames_dict_for_frontend['key'] = version_name + version_filenames_dict_for_frontend['children'] = [] + for filename in filenames: + version_filenames_dict_for_frontend['children'].append({ + 'title': + filename, + 'key': + filename + }) + version_info_for_frontend.append( + version_filenames_dict_for_frontend) + version_info[model_name] = version_info_for_frontend + + for model_name, model_config in original_format.items(): + # 1. remove 'executionAccelerators' keyword + transformed_config = copy.deepcopy(model_config) + if 'optimization' in model_config: + if 'executionAccelerators' in model_config['optimization']: + transformed_optimization_config = model_config['optimization'][ + 'executionAccelerators'] + del transformed_config['optimization'] + transformed_config[ + 'optimization'] = transformed_optimization_config + # 2. add versions information + if model_name in version_info: + transformed_config['versions'] = version_info[model_name] + if 'platform' in model_config and model_config[ + 'platform'] == 'ensemble': # emsemble model + # 3. remove ensembleScheduling + if 'ensembleScheduling' in model_config: + if 'step' in model_config['ensembleScheduling']: + del transformed_config['ensembleScheduling'] + transformed_config['step'] = model_config[ + 'ensembleScheduling']['step'] + # 4. add two virtual models(feed, fetch), and + # "modelType", "inputModels", "outputModels", "inputVars", "outputVars" + for model_config_in_step in transformed_config['step']: + model_config_in_step['modelType'] = 'normal' + model_config_in_step['inputModels'] = [] + model_config_in_step['outputModels'] = [] + model_config_in_step['inputVars'] = [] + model_config_in_step['outputVars'] = [] + + transformed_config['step'].append({ + "modelName": "feed", + "modelType": "virtual", + "inputModels": [], + "outputModels": [], + "inputVars": [], + "outputVars": [] + }) + transformed_config['step'].append({ + "modelName": "fetch", + "modelType": "virtual", + "inputModels": [], + "outputModels": [], + "inputVars": [], + "outputVars": [] + }) + analyse_step_relationships(transformed_config['step'], + transformed_config['input'], + transformed_config['output']) + exchange_format['ensembles'].append(transformed_config) + elif 'backend' in model_config: # single model + exchange_format['models'].append(transformed_config) + return exchange_format + + +def analyse_step_relationships(step_config, inputs, outputs): # noqa: C901 + ''' + Analyse model relationships in ensemble step. And fill \ + "inputModels", "outputModels", "inputVars", "outputVars" in step_config. + step_config: step data in ensemble model config. + inputs: inputs in ensemble model config. + outputs: outputs in ensemble model config. 
+ ''' + models_dict = {} + vars_dict = {} + for model_config_in_step in step_config: + models_dict[model_config_in_step['modelName']] = model_config_in_step + if model_config_in_step['modelType'] == 'virtual': + for var in inputs: + if var['name'] not in vars_dict: + vars_dict[var['name']] = {} + vars_dict[var['name']]['from_models'] = set() + vars_dict[var['name']]['to_models'] = set() + vars_dict[var['name']]['from_models'].add('feed') + for var in outputs: + if var['name'] not in vars_dict: + vars_dict[var['name']] = {} + vars_dict[var['name']]['from_models'] = set() + vars_dict[var['name']]['to_models'] = set() + vars_dict[var['name']]['to_models'].add('fetch') + else: + for var_placehold_name, var_name in model_config_in_step[ + 'inputMap'].items(): + if var_name not in vars_dict: + vars_dict[var_name] = {} + vars_dict[var_name]['from_models'] = set() + vars_dict[var_name]['to_models'] = set() + vars_dict[var_name]['to_models'].add( + model_config_in_step['modelName']) + + for var_placehold_name, var_name in model_config_in_step[ + 'outputMap'].items(): + if var_name not in vars_dict: + vars_dict[var_name] = {} + vars_dict[var_name]['from_models'] = set() + vars_dict[var_name]['to_models'] = set() + vars_dict[var_name]['from_models'].add( + model_config_in_step['modelName']) + for var_name, relationships in vars_dict.items(): + for from_model in relationships['from_models']: + models_dict[from_model]['outputVars'].append(var_name) + for var_to_model in relationships['to_models']: + if var_to_model not in models_dict[from_model]['outputModels']: + models_dict[from_model]['outputModels'].append( + var_to_model) + for to_model in relationships['to_models']: + models_dict[to_model]['inputVars'].append(var_name) + for var_from_model in relationships['from_models']: + if var_from_model not in models_dict[to_model]['inputModels']: + models_dict[to_model]['inputModels'].append(var_from_model) + calculate_layout_for_frontend(models_dict) + + +def get_config_filenames_for_one_model(cur_dir, name): + _, _, filenames = os.walk(os.path.join(cur_dir, name)).send(None) + config_filenames = [] + backup_config_filenames = [] + for filename in filenames: + if '.pbtxt' in filename and 'vdlbackup' not in filename: + config_filenames.append( + filename + ) # filenames with extension .pbtxt and not contain 'vdlbackup' are normal config files + elif '.pbtxt' in filename and 'vdlbackup' in filename: + backup_config_filenames.append( + filename + ) # filenames with extension .pbtxt and contain 'vdlbackup' are backup config files + config_filenames = sorted(config_filenames) + sorted( + backup_config_filenames) + return config_filenames + + +def get_config_for_one_model(cur_dir, name, config_filename): + all_model_configs = {} + all_model_versions = {} + filename = os.path.join(cur_dir, name, config_filename) + json_config = json.loads(pbtxt2json(open(filename).read())) + json_config[ + 'name'] = name # because name in config data may be different from model_name, + # model_name is model directory name actually, we should conform name with model_name. 
+ json_config["config_filenames"] = config_filename + all_model_configs[ + name] = json_config # store original config file content in json format + all_model_versions[name] = {} + for model_sub_dir in os.listdir(os.path.join(cur_dir, name)): + if re.match(r'\d+', + model_sub_dir): # version directory consists of numbers + if model_sub_dir not in all_model_versions[name]: + all_model_versions[name][model_sub_dir] = [] + for version_resource_file in os.listdir( + os.path.join(cur_dir, name, model_sub_dir)): + all_model_versions[name][model_sub_dir].append( + version_resource_file) + model_config = original_format_to_exchange_format(all_model_configs, + all_model_versions) + if model_config['ensembles']: + return model_config['ensembles'][0] + elif model_config['models']: + return model_config['models'][0] + + +def calculate_layout_for_frontend(model_config_in_step): + ''' + Analyse model topology connections and prepare the positions for each model in layout. + Dynamic program algorithm: + depth(cur_node) = max([depth(prev_node) for prev_node in cur_node['inputModels']]) + Args: + model_config_in_step(dict): model config in ensemble models' step, indexed by model name. + Returns: + None. Results calculated will be saved in place. + ''' + path_depth = defaultdict(int) + + def depth_recursive(model): + if model['modelName'] == 'feed': + path_depth[model['modelName']] = 0 + return 0 + if path_depth[model['modelName']] != 0: + return path_depth[model['modelName']] + path_depth[model['modelName']] = max([ + depth_recursive(model_config_in_step[model_name]) for model_name in + model_config_in_step[model['modelName']]['inputModels'] + ]) + 1 + return path_depth[model['modelName']] + + depth_recursive(model_config_in_step['fetch']) + path_depth_tuple = [ + (k, v) + for k, v in sorted(path_depth.items(), key=lambda item: item[1]) + ] + cur_x = 0 + last_depth = -1 + for model_name, depth in path_depth_tuple: + if depth == last_depth: + model_config_in_step[model_name]['pos_y'] = depth + model_config_in_step[model_name]['pos_x'] = cur_x + cur_x += 1 + else: + cur_x = 0 + model_config_in_step[model_name]['pos_y'] = depth + model_config_in_step[model_name]['pos_x'] = cur_x + cur_x += 1 + last_depth = depth + return + + +def launch_process(kwargs: dict): + ''' + Launch a fastdeploy server according to specified arguments. + ''' + cmd = ['fastdeployserver'] + launch_env = os.environ.copy() + start_args = {} + for key, value in kwargs.items(): + if key == 'default_model_name': # Used to fill client model_name automatically + start_args[key] = value + continue + if key == 'server-name' or key == 'ensemble-img': # extra information + start_args[key] = value + continue + if key == 'gpus': + if value: + launch_env['CUDA_VISIBLE_DEVICES'] = value + start_args[key] = value + continue + cmd.append('--{}'.format(key)) + cmd.append('{}'.format(value)) + start_args[key] = value + if start_args['server-name'] and start_args['server-name'] in os.listdir( + FASTDEPLOYSERVER_PATH): + raise RuntimeError( + "Failed to launch server,server name {} has been used,please write a different server name." 
+ .format(start_args['server-name'])) + all_model_configs, all_model_versions = analyse_config( + start_args['model-repository']) + model_repo_config = original_format_to_exchange_format( + all_model_configs, all_model_versions) + model_repo_config['ensemble-img'] = start_args['ensemble-img'] + logfilename = 'logfile-{}'.format(get_random_string(8)) + while os.path.exists(os.path.join(FASTDEPLOYSERVER_PATH, logfilename)): + logfilename = 'logfile-{}'.format(get_random_string(8)) + p = Popen( + cmd, + stdout=open( + os.path.join(FASTDEPLOYSERVER_PATH, logfilename), 'w', + buffering=1), + stderr=STDOUT, + universal_newlines=True, + env=launch_env) + server_name = start_args['server-name'] if start_args[ + 'server-name'] else p.pid + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_name)), + 'w') as f: + # filename ${server_name} contain 4 lines: + # line1 : the real log filename ${logfilename} + # line2 : pid + # line3 : launch arguments + # line4 : model-repository configuration + f.write(logfilename + '\n' + str(p.pid) + '\n' + + json.dumps(start_args) + '\n' + json.dumps(model_repo_config)) + return p + + +def get_random_string(length): + # choose from all lowercase letter + letters = string.ascii_lowercase + result_str = ''.join([random.choice(letters) for i in range(length)]) + return result_str + + +def get_start_arguments(server_id): + ''' + Get the start arguments for fastdeployserver process. + Args: + server_id(str): fastdeployserver process name + Returns: + args(dict): launch arguments when start fastdeployserver process. + ''' + args = {} + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)), + 'r') as f: + arguments_json = f.read().split('\n')[2] + args = json.loads(arguments_json) + return args + + +def get_process_pid(server_id): + ''' + Get the process id for fastdeployserver process. + Args: + server_id(str): fastdeployserver process name + Returns: + pid(int): process id. + ''' + pid = None + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)), + 'r') as f: + pid = int(f.read().split('\n')[1]) + return pid + + +def get_process_logfile_name(server_id): + ''' + Get the process logfile name for fastdeployserver process. + Args: + server_id(str): fastdeployserver process name + Returns: + logfile(str): logfile name. + ''' + filename = None + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)), + 'r') as f: + filename = f.read().split('\n')[0] + return filename + + +def get_process_model_configuration(server_id): + ''' + Get the model repository configuration for fastdeployserver process. + Args: + server_id(str): fastdeployserver process name + Returns: + configuration(dict): model repository configuration + ''' + conf = {} + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)), + 'r') as f: + conf_json = f.read().split('\n')[3] + conf = json.loads(conf_json) + return conf + + +def get_process_output(server_id, length): + ''' + Get the standard output of a opened subprocess. 
+ ''' + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + logfilename = get_process_logfile_name(server_id) + # delete file ${logfilename} if exists + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(logfilename))): + with open( + os.path.join(FASTDEPLOYSERVER_PATH, + '{}'.format(logfilename)), 'r') as f: + f.seek(length) + data = f.read() + return data + + +def mark_pid_for_dead_process(server_id): + ''' + Resource files for a dead server only deleted when user closes the server in frontend. + When user close the server, pid recorded in logfile will be killed. + In case a dead process id is reassigned for a new process, we should mark the pid recorded in logfile as outdated. + Here, we choose to replace the pid to -1 in logfile to denote the zombie process \ + which has been polled and becomes dead. + Args: + server_id(str): fastdeployserver process name + ''' + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)), + 'r') as f: + contents = f.read().split('\n') + contents[1] = '-1' # we replace pid to -1 + with open( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id)), + 'w') as f: + f.write('\n'.join(contents)) + + +def delete_files_for_process(server_id): + ''' + Delete logfile for fastdeployserver process. + Args: + server_id(str): fastdeployserver process name + ''' + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))): + logfilename = get_process_logfile_name(server_id) + # delete file ${logfilename} if exists + if os.path.exists( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(logfilename))): + os.remove( + os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(logfilename))) + os.remove(os.path.join(FASTDEPLOYSERVER_PATH, '{}'.format(server_id))) + + +def kill_process(process): + ''' + Stop a opened subprocess. + ''' + if type(process) == str: # server_id, use os.kill to terminate + pid = get_process_pid(process) + if pid == -1: # we use -1 to mark dead process + return + try: + os.kill(pid, signal.SIGKILL) + except Exception: + pass + else: + pid = process.pid + process.kill() + try: + process.wait(10) + except Exception: + pass + + +def get_alive_fastdeploy_servers(): + ''' + Search server names in `FASTDEPLOYSERVER_PATH`, if process is dead and log still exists due to \ + some unexpectable reasons, delete log file. + ''' + server_names = [ + name for name in os.listdir(FASTDEPLOYSERVER_PATH) + if 'logfile' not in name + ] + should_delete_servers = [] + for server_name in server_names: + if check_process_alive(server_name) is False: + delete_files_for_process(server_name) + should_delete_servers.append(server_name) + for server_name in should_delete_servers: + server_names.remove(server_name) + return server_names + + +def check_process_zombie(server_id): + ''' + Given a server id, check whether the process became zoombie and mark pid as -1. + Args: + server_id(str): fastdeployserver process name + Return: + status(bool): True if process became zoombie. + ''' + pid = get_process_pid(server_id) + if pid == -1: + return True + else: + return False + + +def check_process_alive(server_id): + ''' + Given a server id, check whether the process is alive or not. + Args: + server_id(str): fastdeployserver process name + Return: + status(bool): True if process is still alive. 
+ ''' + pid = get_process_pid(server_id) + if pid is None: + return False + if pid == -1: # We use -1 to mark zombie process which has been dead process. + # Consider user wants to know the reason for dead process due to exception, + # we return True to let user in frontend can get the log for dead process. + return True + try: + os.kill(pid, 0) + except OSError: + return False + else: + if 'fastdeployserve' not in psutil.Process(pid).name( + ): # We should judge the pid is fastdeployserver process, in case pid has been reassigned. + # Note: I do not know why psutil.Process(pid).name() is fastdeployserve but not fastdeployserver. + return False + else: + return True + + +_metric_column_name = { + "Model": { + "nv_inference_request_success", "nv_inference_request_failure", + "nv_inference_count", "nv_inference_exec_count", + "nv_inference_request_duration_us", "nv_inference_queue_duration_us", + "nv_inference_compute_input_duration_us", + "nv_inference_compute_infer_duration_us", + "nv_inference_compute_output_duration_us" + }, + "GPU": { + "nv_gpu_power_usage", "nv_gpu_power_limit", "nv_energy_consumption", + "nv_gpu_utilization", "nv_gpu_memory_total_bytes", + "nv_gpu_memory_used_bytes" + }, + "CPU": { + "nv_cpu_utilization", "nv_cpu_memory_total_bytes", + "nv_cpu_memory_used_bytes" + } +} + + +def generate_metric_table(server_addr, server_port): # noqa:C901 + model_table = {} + gpu_table = {} + try: + res = requests.get("http://{}:{}/metrics".format( + server_addr, server_port)) + except Exception: + return None + metric_content = res.text + for content in metric_content.split('\n'): + if content.startswith('#'): + continue + else: + res = re.match(r'(\w+){(.*)} (\w+)', + content) # match output by server metrics interface + if not res: + continue + metric_name = res.group(1) + model = res.group(2) + value = res.group(3) + infos = {} + for info in model.split(','): + k, v = info.split('=') + v = v.strip('"') + infos[k] = v + if metric_name in [ + "nv_inference_request_duration_us", + "nv_inference_queue_duration_us", + "nv_inference_compute_input_duration_us", + "nv_inference_compute_infer_duration_us", + "nv_inference_compute_output_duration_us" + ]: + value = float(value) / 1000 + elif metric_name in [ + "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes" + ]: + value = float(value) / 1024 / 1024 / 1024 + for key, metric_names in _metric_column_name.items(): + if metric_name in metric_names: + if key == 'Model': + model_name = infos['model'] + if model_name not in model_table: + model_table[model_name] = {} + model_table[model_name][metric_name] = value + elif key == 'GPU': + gpu_name = infos['gpu_uuid'] + if gpu_name not in gpu_table: + gpu_table[gpu_name] = {} + gpu_table[gpu_name][metric_name] = value + elif key == 'CPU': + pass + results = {} + results['Model'] = model_table + results['GPU'] = gpu_table + return results diff --git a/visualdl/component/inference/fastdeploy_server.py b/visualdl/component/inference/fastdeploy_server.py new file mode 100644 index 000000000..89b0b13ff --- /dev/null +++ b/visualdl/component/inference/fastdeploy_server.py @@ -0,0 +1,439 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================= +import datetime +import json +import os +import re +import shutil +import socket +import time +from multiprocessing import Process +from pathlib import Path + +import requests + +from .fastdeploy_client.client_app import create_gradio_client_app +from .fastdeploy_lib import analyse_config +from .fastdeploy_lib import check_process_zombie +from .fastdeploy_lib import copy_config_file_to_default_config +from .fastdeploy_lib import delete_files_for_process +from .fastdeploy_lib import exchange_format_to_original_format +from .fastdeploy_lib import generate_metric_table +from .fastdeploy_lib import get_alive_fastdeploy_servers +from .fastdeploy_lib import get_config_filenames_for_one_model +from .fastdeploy_lib import get_config_for_one_model +from .fastdeploy_lib import get_process_model_configuration +from .fastdeploy_lib import get_process_output +from .fastdeploy_lib import get_start_arguments +from .fastdeploy_lib import json2pbtxt +from .fastdeploy_lib import kill_process +from .fastdeploy_lib import launch_process +from .fastdeploy_lib import mark_pid_for_dead_process +from .fastdeploy_lib import original_format_to_exchange_format +from .fastdeploy_lib import validate_data +from visualdl.server.api import gen_result +from visualdl.server.api import result +from visualdl.utils.dir import FASTDEPLOYSERVER_PATH + + +class FastDeployServerApi(object): + def __init__(self): + self.root_dir = Path(os.getcwd()) + self.opened_servers = { + } # Use to store the opened server process pid and process itself + self.client_port = None + + @result() + def get_directory(self, cur_dir): + if self.root_dir not in Path(os.path.abspath(cur_dir)).parents: + cur_dir = '.' 
+        cur_dir, sub_dirs, filenames = next(os.walk(cur_dir))
+        if Path(self.root_dir) != Path(os.path.abspath(cur_dir)):
+            sub_dirs.append('..')
+        sub_dirs = sorted(sub_dirs)
+        directories = {
+            'parent_dir':
+            os.path.relpath(Path(os.path.abspath(cur_dir)), self.root_dir),
+            'sub_dir':
+            sub_dirs
+        }
+        return directories
+
+    @result()
+    def get_config(self, cur_dir):
+        all_model_configs, all_model_versions = analyse_config(cur_dir)
+        return original_format_to_exchange_format(all_model_configs,
+                                                  all_model_versions)
+
+    @result()
+    def config_update(self, cur_dir, model_name, config, config_filename):
+        config = json.loads(config)
+        all_models = exchange_format_to_original_format(config)
+        model_dir = os.path.join(os.path.abspath(cur_dir), model_name)
+        filtered_config = validate_data(all_models[model_name])
+        text_proto = json2pbtxt(json.dumps(filtered_config))
+        # Back up the user's config file first, so the data can still be
+        # recovered if it is corrupted by the frontend.
+        # Backup config filename: {original_name}_vdlbackup_{datetime}.pbtxt
+        # A backup config can only be used to restore config.pbtxt.
+        if 'vdlbackup' in config_filename:
+            raise RuntimeError(
+                "Backup config files are not permitted to be updated.")
+        basename = os.path.splitext(config_filename)[0]
+        shutil.copy(
+            os.path.join(model_dir, config_filename),
+            os.path.join(
+                model_dir, '{}_vdlbackup_{}.pbtxt'.format(
+                    basename,
+                    datetime.datetime.now().isoformat())))
+        with open(os.path.join(model_dir, config_filename), 'w') as f:
+            f.write(text_proto)
+        return
+
+    @result()
+    def start_server(self, configs):
+        configs = json.loads(configs)
+        process = launch_process(configs)
+        if process.poll() is not None:
+            raise RuntimeError(
+                "Failed to launch fastdeployserver, please check that "
+                "fastdeployserver is installed in the environment.")
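+        # Prefer the user-supplied 'server-name'; otherwise fall back to the
+        # subprocess pid, which is the identifier later used by stop_server,
+        # get_server_output and get_server_metric.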
+        server_name = configs['server-name'] if configs[
+            'server-name'] else str(process.pid)
+        self.opened_servers[server_name] = process
+        return server_name
+
+    @result()
+    def stop_server(self, server_id):
+        if server_id in self.opened_servers:
+            # server launched and tracked by this app instance
+            kill_process(self.opened_servers[server_id])
+            del self.opened_servers[server_id]
+        elif server_id in set(os.listdir(FASTDEPLOYSERVER_PATH)):
+            # server may have been launched by another vdl app instance
+            # (e.g. another worker under gunicorn)
+            kill_process(server_id)
+        delete_files_for_process(server_id)
+        self._poll_zombie_process()
+
+    @result('text/plain')
+    def get_server_output(self, server_id, length):
+        length = int(length)
+        if server_id in self.opened_servers:
+            # server launched and tracked by this app instance
+            return get_process_output(server_id, length)
+        elif str(server_id) in set(os.listdir(FASTDEPLOYSERVER_PATH)):
+            # server may have been launched by another vdl app instance
+            # (e.g. another worker under gunicorn)
+            return get_process_output(server_id, length)
+        else:
+            return
+
+    @result()
+    def get_server_metric(self, server_id):
+        args = get_start_arguments(server_id)
+        host = 'localhost'
+        port = args.get('metrics-port', 8002)
+        return generate_metric_table(host, port)
+
+    @result()
+    def get_server_list(self):
+        return get_alive_fastdeploy_servers()
+
+    @result()
+    def check_server_alive(self, server_id):
+        self._poll_zombie_process()
+        if check_process_zombie(server_id):
+            raise RuntimeError(
+                "Server {} is down because of an exception or because it was "
+                "killed. Please check the reason in the log, then close this "
+                "server.".format(server_id))
+        return
+
+    @result()
+    def get_pretrain_model_list(self):
+        '''
+        Get all available FastDeploy pretrained models from the hub server.
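+
+        Returns a cascader-style list for the frontend: each entry has the form
+        {'value': <category>, 'label': <category>, 'children': [...]}, and every
+        child has the form {'value': <model_name>, 'label': <model_name>}.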
+ ''' + res = requests.get( + 'http://paddlepaddle.org.cn/paddlehub/fastdeploy_listmodels') + result = res.json() + if result['status'] != 0: + raise RuntimeError( + "Failed to get pre-trained model list from hub server.") + else: + data = result['data'] + model_list = {} + for category, models in data.items(): + if category not in model_list: + model_list[category] = set() + for model in models: + model_list[category].add(model['name']) + # adapt data format for frontend + models_info = [] + for category, model_names in model_list.items(): + models_info.append({ + "value": category, + "label": category, + "children": [] + }) + for model_name in sorted(model_names): + models_info[-1]["children"].append({ + "value": model_name, + "label": model_name + }) + return models_info + + @result() + def download_pretrain_model(self, cur_dir, model_name, version, + pretrain_model_name): + version_resource_dir = os.path.join( + os.path.abspath(cur_dir), model_name, version) + try: + import fastdeploy as fd + except Exception: + raise RuntimeError( + "fastdeploy is required for visualizing results,please refer to \ + https://github.com/PaddlePaddle/FastDeploy to install fastdeploy") + model_path = fd.download_model( + name=pretrain_model_name, path=version_resource_dir) + if model_path: + if '.onnx' in model_path: + shutil.move( + model_path, + os.path.join(os.path.dirname(model_path), 'model.onnx')) + else: + for filename in os.listdir(model_path): + if '.pdmodel' in filename or '.pdiparams' in filename: + shutil.move( + os.path.join(model_path, filename), + os.path.join( + os.path.dirname(model_path), 'model{}'.format( + os.path.splitext(filename)[1]))) + else: + shutil.move( + os.path.join(model_path, filename), + os.path.join( + os.path.dirname(model_path), filename)) + shutil.rmtree(model_path) + version_info_for_frontend = [] + for version_name in os.listdir(os.path.join(cur_dir, model_name)): + if re.match( + r'\d+', + version_name): # version directory consists of numbers + version_filenames_dict_for_frontend = {} + version_filenames_dict_for_frontend['title'] = version_name + version_filenames_dict_for_frontend['key'] = version_name + version_filenames_dict_for_frontend['children'] = [] + for filename in os.listdir( + os.path.join(cur_dir, model_name, version_name)): + version_filenames_dict_for_frontend['children'].append( + { + 'title': filename, + 'key': filename + }) + version_info_for_frontend.append( + version_filenames_dict_for_frontend) + return version_info_for_frontend + else: + raise RuntimeError( + "Failed to download pre-trained model {}.".format( + pretrain_model_name)) + + @result() + def get_config_for_model(self, cur_dir, name, config_filename): + return get_config_for_one_model(cur_dir, name, config_filename) + + @result() + def get_config_filenames_for_model(self, cur_dir, name): + return get_config_filenames_for_one_model(cur_dir, name) + + @result() + def delete_config_for_model(self, cur_dir, name, config_filename): + if self.root_dir not in Path( + os.path.abspath(cur_dir) + ).parents: # should prevent user remove files outside model-repository + raise RuntimeError( + 'Failed to delete config file, please check filepath.') + if os.path.exists(os.path.join(cur_dir, name, config_filename)): + os.remove(os.path.join(cur_dir, name, config_filename)) + return get_config_filenames_for_one_model(cur_dir, name) + + @result() + def set_default_config_for_model(self, cur_dir, name, config_filename): + model_dir = os.path.join(os.path.abspath(cur_dir), name) + # backup config.pbtxt 
to config_vdlbackup_{datetime}.pbtxt + if os.path.exists(os.path.join(model_dir, 'config.pbtxt')): + shutil.copy( + os.path.join(model_dir, 'config.pbtxt'), + os.path.join( + model_dir, 'config_vdlbackup_{}.pbtxt'.format( + datetime.datetime.now().isoformat()))) + if config_filename != 'config.pbtxt': + copy_config_file_to_default_config(model_dir, config_filename) + return + + @result() + def delete_resource_for_model(self, cur_dir, model_name, version, + resource_filename): + if self.root_dir not in Path( + os.path.abspath(cur_dir) + ).parents: # should prevent user remove files outside model-repository + raise RuntimeError( + 'Failed to delete resource file, please check filepath.') + resource_path = os.path.join( + os.path.abspath(cur_dir), model_name, version, resource_filename) + if os.path.exists(resource_path): + os.remove(resource_path) + version_info_for_frontend = [] + for version_name in os.listdir(os.path.join(cur_dir, model_name)): + if re.match(r'\d+', + version_name): # version directory consists of numbers + version_filenames_dict_for_frontend = {} + version_filenames_dict_for_frontend['title'] = version_name + version_filenames_dict_for_frontend['key'] = version_name + version_filenames_dict_for_frontend['children'] = [] + for filename in os.listdir( + os.path.join(cur_dir, model_name, version_name)): + version_filenames_dict_for_frontend['children'].append({ + 'title': + filename, + 'key': + filename + }) + version_info_for_frontend.append( + version_filenames_dict_for_frontend) + return version_info_for_frontend + + @result() + def rename_resource_for_model(self, cur_dir, model_name, version, + resource_filename, new_filename): + if self.root_dir not in Path( + os.path.abspath(cur_dir) + ).parents: # should prevent user remove files outside model-repository + raise RuntimeError( + 'Failed to rename resource file, please check filepath.') + resource_path = os.path.join( + os.path.abspath(cur_dir), model_name, version, resource_filename) + new_file_path = os.path.join( + os.path.abspath(cur_dir), model_name, version, new_filename) + if os.path.exists(resource_path): + shutil.move(resource_path, new_file_path) + version_info_for_frontend = [] + for version_name in os.listdir(os.path.join(cur_dir, model_name)): + if re.match(r'\d+', + version_name): # version directory consists of numbers + version_filenames_dict_for_frontend = {} + version_filenames_dict_for_frontend['title'] = version_name + version_filenames_dict_for_frontend['key'] = version_name + version_filenames_dict_for_frontend['children'] = [] + for filename in os.listdir( + os.path.join(cur_dir, model_name, version_name)): + version_filenames_dict_for_frontend['children'].append({ + 'title': + filename, + 'key': + filename + }) + version_info_for_frontend.append( + version_filenames_dict_for_frontend) + return version_info_for_frontend + + def create_fastdeploy_client(self): + if self.client_port is None: + + def get_free_tcp_port(): + tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + # tcp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, 1) + tcp.bind(('localhost', 0)) + addr, port = tcp.getsockname() + tcp.close() + return port + + self.client_port = get_free_tcp_port() + app = create_gradio_client_app() + thread = Process( + target=app.launch, kwargs={'server_port': self.client_port}) + thread.start() + + def check_alive(): + while True: + try: + requests.get('http://localhost:{}/'.format( + self.client_port)) + break + except Exception: + time.sleep(1) + + check_alive() + return self.client_port + 
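+    # Design note on create_fastdeploy_client(): the gradio client app runs in
+    # a separate process on a port picked by binding a socket to port 0 and
+    # letting the OS choose a free one; the parent then polls
+    # http://localhost:<port>/ until the app answers, so the port is only
+    # returned once the client UI is reachable.
+    # Illustrative usage (hypothetical, not wired into the app):
+    #     api = FastDeployServerApi()
+    #     port = api.create_fastdeploy_client()
+    #     # the client UI is now served at http://localhost:<port>/
+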
+ def _poll_zombie_process(self): + # check if there are servers killed by other vdl app instance and become zoombie + should_delete = [] + for server_id, process in self.opened_servers.items(): + if process.poll() is not None: + mark_pid_for_dead_process(server_id) + should_delete.append(server_id) + + for server_id in should_delete: + del self.opened_servers[server_id] + + +def create_fastdeploy_api_call(): + api = FastDeployServerApi() + routes = { + 'get_directory': (api.get_directory, ['dir']), + 'config_update': (api.config_update, + ['dir', 'name', 'config', 'config_filename']), + 'get_config': (api.get_config, ['dir']), + 'get_config_filenames_for_model': (api.get_config_filenames_for_model, + ['dir', 'name']), + 'get_config_for_model': (api.get_config_for_model, + ['dir', 'name', 'config_filename']), + 'set_default_config_for_model': (api.set_default_config_for_model, + ['dir', 'name', 'config_filename']), + 'delete_config_for_model': (api.delete_config_for_model, + ['dir', 'name', 'config_filename']), + 'start_server': (api.start_server, ['config']), + 'stop_server': (api.stop_server, ['server_id']), + 'get_server_output': (api.get_server_output, ['server_id', 'length']), + 'create_fastdeploy_client': (api.create_fastdeploy_client, []), + 'get_server_list': (api.get_server_list, []), + 'get_server_metric': (api.get_server_metric, ['server_id']), + 'get_server_config': (api.get_server_config, ['server_id']), + 'get_pretrain_model_list': (api.get_pretrain_model_list, []), + 'check_server_alive': (api.check_server_alive, ['server_id']), + 'download_pretrain_model': + (api.download_pretrain_model, + ['dir', 'name', 'version', 'pretrain_model_name']), + 'delete_resource_for_model': + (api.delete_resource_for_model, + ['dir', 'name', 'version', 'resource_filename']), + 'rename_resource_for_model': (api.rename_resource_for_model, [ + 'dir', 'name', 'version', 'resource_filename', 'new_filename' + ]) + } + + def call(path: str, args): + route = routes.get(path) + if not route: + return json.dumps(gen_result( + status=1, msg='api not found')), 'application/json', None + method, call_arg_names = route + call_args = [args.get(name) for name in call_arg_names] + return method(*call_args) + + return call diff --git a/visualdl/component/inference/proto/__init__.py b/visualdl/component/inference/proto/__init__.py new file mode 100644 index 000000000..9c19f7b87 --- /dev/null +++ b/visualdl/component/inference/proto/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022 VisualDL Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ======================================================================= diff --git a/visualdl/component/inference/proto/model_config.protxt b/visualdl/component/inference/proto/model_config.protxt new file mode 100644 index 000000000..1751f02f7 --- /dev/null +++ b/visualdl/component/inference/proto/model_config.protxt @@ -0,0 +1,1981 @@ +// Copyright 2018-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions +// are met: +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// * Neither the name of NVIDIA CORPORATION nor the names of its +// contributors may be used to endorse or promote products derived +// from this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY +// OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Copyright (c) 2018, TensorFlow Authors. All rights reserved. + +syntax = "proto3"; + +package inference; + +//@@.. cpp:namespace:: inference + +//@@ +//@@.. cpp:enum:: DataType +//@@ +//@@ Data types supported for input and output tensors. +//@@ +enum DataType { + //@@ .. cpp:enumerator:: DataType::INVALID = 0 + TYPE_INVALID = 0; + + //@@ .. cpp:enumerator:: DataType::BOOL = 1 + TYPE_BOOL = 1; + + //@@ .. cpp:enumerator:: DataType::UINT8 = 2 + TYPE_UINT8 = 2; + //@@ .. cpp:enumerator:: DataType::UINT16 = 3 + TYPE_UINT16 = 3; + //@@ .. cpp:enumerator:: DataType::UINT32 = 4 + TYPE_UINT32 = 4; + //@@ .. cpp:enumerator:: DataType::UINT64 = 5 + TYPE_UINT64 = 5; + + //@@ .. cpp:enumerator:: DataType::INT8 = 6 + TYPE_INT8 = 6; + //@@ .. cpp:enumerator:: DataType::INT16 = 7 + TYPE_INT16 = 7; + //@@ .. cpp:enumerator:: DataType::INT32 = 8 + TYPE_INT32 = 8; + //@@ .. cpp:enumerator:: DataType::INT64 = 9 + TYPE_INT64 = 9; + + //@@ .. cpp:enumerator:: DataType::FP16 = 10 + TYPE_FP16 = 10; + //@@ .. cpp:enumerator:: DataType::FP32 = 11 + TYPE_FP32 = 11; + //@@ .. cpp:enumerator:: DataType::FP64 = 12 + TYPE_FP64 = 12; + + //@@ .. cpp:enumerator:: DataType::STRING = 13 + TYPE_STRING = 13; + + //@@ .. cpp:enumerator:: DataType::BF16 = 14 + TYPE_BF16 = 14; +} + +//@@ +//@@ .. cpp:var:: message ModelRateLimiter +//@@ +//@@ The specifications required by the rate limiter to properly +//@@ schedule the inference requests across the different models +//@@ and their instances. +//@@ +message ModelRateLimiter +{ + //@@ .. cpp:var:: message Resource + //@@ + //@@ The resource property. + //@@ + message Resource + { + //@@ .. cpp:var:: string name + //@@ + //@@ The name associated with the resource. + //@@ + string name = 1; + + //@@ .. cpp:var:: bool global + //@@ + //@@ Whether or not the resource is global. If true then the resource + //@@ is assumed to be shared among the devices otherwise specified + //@@ count of the resource is assumed for each device associated + //@@ with the instance. + //@@ + bool global = 2; + + //@@ .. 
cpp:var:: uint32 count + //@@ + //@@ The number of resources required for the execution of the model + //@@ instance. + //@@ + uint32 count = 3; + } + + //@@ .. cpp:var:: Resource resources (repeated) + //@@ + //@@ The resources required to execute the request on a model instance. + //@@ Resources are just names with a corresponding count. The execution + //@@ of the instance will be blocked until the specificied resources are + //@@ available. By default an instance uses no rate-limiter resources. + //@@ + repeated Resource resources = 1; + + //@@ .. cpp:var:: uint32 priority + //@@ + //@@ The optional weighting value to be used for prioritizing across + //@@ instances. An instance with priority 2 will be given 1/2 the + //@@ number of scheduling chances as an instance_group with priority + //@@ 1. The default priority is 1. The priority of value 0 will be + //@@ treated as priority 1. + //@@ + uint32 priority = 2; +} + +//@@ +//@@.. cpp:var:: message ModelInstanceGroup +//@@ +//@@ A group of one or more instances of a model and resources made +//@@ available for those instances. +//@@ +message ModelInstanceGroup +{ + //@@ + //@@ .. cpp:enum:: Kind + //@@ + //@@ Kind of this instance group. + //@@ + enum Kind { + //@@ .. cpp:enumerator:: Kind::KIND_AUTO = 0 + //@@ + //@@ This instance group represents instances that can run on either + //@@ CPU or GPU. If all GPUs listed in 'gpus' are available then + //@@ instances will be created on GPU(s), otherwise instances will + //@@ be created on CPU. + //@@ + KIND_AUTO = 0; + + //@@ .. cpp:enumerator:: Kind::KIND_GPU = 1 + //@@ + //@@ This instance group represents instances that must run on the + //@@ GPU. + //@@ + KIND_GPU = 1; + + //@@ .. cpp:enumerator:: Kind::KIND_CPU = 2 + //@@ + //@@ This instance group represents instances that must run on the + //@@ CPU. + //@@ + KIND_CPU = 2; + + //@@ .. cpp:enumerator:: Kind::KIND_MODEL = 3 + //@@ + //@@ This instance group represents instances that should run on the + //@@ CPU and/or GPU(s) as specified by the model or backend itself. + //@@ The inference server will not override the model/backend + //@@ settings. + //@@ + KIND_MODEL = 3; + } + + //@@ + //@@ .. cpp:var:: message SecondaryDevice + //@@ + //@@ A secondary device required for a model instance. + //@@ + message SecondaryDevice + { + //@@ + //@@ .. cpp:enum:: SecondaryDeviceKind + //@@ + //@@ The kind of the secondary device. + //@@ + enum SecondaryDeviceKind { + //@@ .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0 + //@@ + //@@ An NVDLA core. http://nvdla.org + //@@ Currently KIND_NVDLA is only supported by the TensorRT backend. + //@@ + KIND_NVDLA = 0; + } + + //@@ .. cpp:var:: SecondaryDeviceKind kind + //@@ + //@@ The secondary device kind. + //@@ + SecondaryDeviceKind kind = 1; + + //@@ .. cpp:var:: int64 device_id + //@@ + //@@ Identifier for the secondary device. + //@@ + int64 device_id = 2; + } + + //@@ .. cpp:var:: string name + //@@ + //@@ Optional name of this group of instances. If not specified the + //@@ name will be formed as _. The name of + //@@ individual instances will be further formed by a unique instance + //@@ number and GPU index: + //@@ + string name = 1; + + //@@ .. cpp:var:: Kind kind + //@@ + //@@ The kind of this instance group. Default is KIND_AUTO. If + //@@ KIND_AUTO or KIND_GPU then both 'count' and 'gpu' are valid and + //@@ may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid + //@@ and 'gpu' cannot be specified. + //@@ + Kind kind = 4; + + //@@ .. 
cpp:var:: int32 count + //@@ + //@@ For a group assigned to GPU, the number of instances created for + //@@ each GPU listed in 'gpus'. For a group assigned to CPU the number + //@@ of instances created. Default is 1. + int32 count = 2; + + //@@ .. cpp:var:: ModelRateLimiter rate_limiter + //@@ + //@@ The rate limiter specific settings to be associated with this + //@@ instance group. Optional, if not specified no rate limiting + //@@ will be applied to this instance group. + //@@ + ModelRateLimiter rate_limiter = 6; + + //@@ .. cpp:var:: int32 gpus (repeated) + //@@ + //@@ GPU(s) where instances should be available. For each GPU listed, + //@@ 'count' instances of the model will be available. Setting 'gpus' + //@@ to empty (or not specifying at all) is eqivalent to listing all + //@@ available GPUs. + //@@ + repeated int32 gpus = 3; + + //@@ .. cpp:var:: SecondaryDevice secondary_devices (repeated) + //@@ + //@@ Secondary devices that are required by instances specified by this + //@@ instance group. Optional. + //@@ + repeated SecondaryDevice secondary_devices = 8; + + //@@ .. cpp:var:: string profile (repeated) + //@@ + //@@ For TensorRT models containing multiple optimization profile, this + //@@ parameter specifies a set of optimization profiles available to this + //@@ instance group. The inference server will choose the optimal profile + //@@ based on the shapes of the input tensors. This field should lie + //@@ between 0 and - 1 + //@@ and be specified only for TensorRT backend, otherwise an error will + //@@ be generated. If not specified, the server will select the first + //@@ optimization profile by default. + //@@ + repeated string profile = 5; + + //@@ .. cpp:var:: bool passive + //@@ + //@@ Whether the instances within this instance group will be accepting + //@@ inference requests from the scheduler. If true, the instances will + //@@ not be added to the scheduler. Default value is false. + //@@ + bool passive = 7; + + //@@ .. cpp:var:: string host_policy + //@@ + //@@ The host policy name that the instance to be associated with. + //@@ The default value is set to reflect the device kind of the instance, + //@@ for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and + //@@ KIND_GPU is "gpu_". + //@@ + string host_policy = 9; +} + +//@@ +//@@.. cpp:var:: message ModelTensorReshape +//@@ +//@@ Reshape specification for input and output tensors. +//@@ +message ModelTensorReshape +{ + //@@ .. cpp:var:: int64 shape (repeated) + //@@ + //@@ The shape to use for reshaping. + //@@ + repeated int64 shape = 1; +} + +//@@ +//@@.. cpp:var:: message ModelInput +//@@ +//@@ An input required by the model. +//@@ +message ModelInput +{ + //@@ + //@@ .. cpp:enum:: Format + //@@ + //@@ The format for the input. + //@@ + enum Format { + //@@ .. cpp:enumerator:: Format::FORMAT_NONE = 0 + //@@ + //@@ The input has no specific format. This is the default. + //@@ + FORMAT_NONE = 0; + + //@@ .. cpp:enumerator:: Format::FORMAT_NHWC = 1 + //@@ + //@@ HWC image format. Tensors with this format require 3 dimensions + //@@ if the model does not support batching (max_batch_size = 0) or 4 + //@@ dimensions if the model does support batching (max_batch_size + //@@ >= 1). In either case the 'dims' below should only specify the + //@@ 3 non-batch dimensions (i.e. HWC or CHW). + //@@ + FORMAT_NHWC = 1; + + //@@ .. cpp:enumerator:: Format::FORMAT_NCHW = 2 + //@@ + //@@ CHW image format. 
Tensors with this format require 3 dimensions + //@@ if the model does not support batching (max_batch_size = 0) or 4 + //@@ dimensions if the model does support batching (max_batch_size + //@@ >= 1). In either case the 'dims' below should only specify the + //@@ 3 non-batch dimensions (i.e. HWC or CHW). + //@@ + FORMAT_NCHW = 2; + } + + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the input. + //@@ + string name = 1; + + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the input. + //@@ + DataType data_type = 2; + + //@@ .. cpp:var:: Format format + //@@ + //@@ The format of the input. Optional. + //@@ + Format format = 3; + + //@@ .. cpp:var:: int64 dims (repeated) + //@@ + //@@ The dimensions/shape of the input tensor that must be provided + //@@ when invoking the inference API for this model. + //@@ + repeated int64 dims = 4; + + //@@ .. cpp:var:: ModelTensorReshape reshape + //@@ + //@@ The shape expected for this input by the backend. The input will + //@@ be reshaped to this before being presented to the backend. The + //@@ reshape must have the same number of elements as the input shape + //@@ specified by 'dims'. Optional. + //@@ + ModelTensorReshape reshape = 5; + + //@@ .. cpp:var:: bool is_shape_tensor + //@@ + //@@ Whether or not the input is a shape tensor to the model. This field + //@@ is currently supported only for the TensorRT model. An error will be + //@@ generated if this specification does not comply with underlying + //@@ model. + //@@ + bool is_shape_tensor = 6; + + //@@ .. cpp:var:: bool allow_ragged_batch + //@@ + //@@ Whether or not the input is allowed to be "ragged" in a dynamically + //@@ created batch. Default is false indicating that two requests will + //@@ only be batched if this tensor has the same shape in both requests. + //@@ True indicates that two requests can be batched even if this tensor + //@@ has a different shape in each request. + //@@ + bool allow_ragged_batch = 7; + + //@@ .. cpp:var:: bool optional + //@@ + //@@ Whether or not the input is optional for the model execution. + //@@ If true, the input is not required in the inference request. + //@@ Default value is false. + //@@ + bool optional = 8; +} + +//@@ +//@@.. cpp:var:: message ModelOutput +//@@ +//@@ An output produced by the model. +//@@ +message ModelOutput +{ + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the output. + //@@ + string name = 1; + + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the output. + //@@ + DataType data_type = 2; + + //@@ .. cpp:var:: int64 dims (repeated) + //@@ + //@@ The dimensions/shape of the output tensor. + //@@ + repeated int64 dims = 3; + + //@@ .. cpp:var:: ModelTensorReshape reshape + //@@ + //@@ The shape produced for this output by the backend. The output will + //@@ be reshaped from this to the shape specifed in 'dims' before being + //@@ returned in the inference response. The reshape must have the same + //@@ number of elements as the output shape specified by 'dims'. Optional. + //@@ + ModelTensorReshape reshape = 5; + + //@@ .. cpp:var:: string label_filename + //@@ + //@@ The label file associated with this output. Should be specified only + //@@ for outputs that represent classifications. Optional. + //@@ + string label_filename = 4; + + + //@@ .. cpp:var:: bool is_shape_tensor + //@@ + //@@ Whether or not the output is a shape tensor to the model. This field + //@@ is currently supported only for the TensorRT model. 
An error will be + //@@ generated if this specification does not comply with underlying + //@@ model. + //@@ + bool is_shape_tensor = 6; +} + +//@@ .. cpp:var:: message BatchInput +//@@ +//@@ A batch input is an additional input that must be added by +//@@ the backend based on all the requests in a batch. +//@@ +message BatchInput +{ + //@@ + //@@ .. cpp:enum:: Kind + //@@ + //@@ The kind of the batch input. + //@@ + enum Kind { + //@@ .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0 + //@@ + //@@ The element count of the 'source_input' will be added as + //@@ input with shape [1]. + //@@ + BATCH_ELEMENT_COUNT = 0; + + //@@ .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1 + //@@ + //@@ The accumulated element count of the 'source_input' will be + //@@ added as input with shape [1]. For example, if there is a + //@@ batch of two request, each with 2 elements, an input of value + //@@ 2 will be added to the first request, and an input of value + //@@ 4 will be added to the second request. + //@@ + BATCH_ACCUMULATED_ELEMENT_COUNT = 1; + + //@@ .. cpp:enumerator:: + //@@ Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2 + //@@ + //@@ The accumulated element count of the 'source_input' will be + //@@ added as input with shape [1], except for the first request + //@@ in the batch. For the first request in the batch, the input + //@@ will have shape [2] where the first element is value 0. + //@@ + BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2; + + //@@ .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3 + //@@ + //@@ Among the requests in the batch, the max element count of the + //@@ 'source_input' will be added as input with shape + //@@ [max_element_count] for the first request in the batch. + //@@ For other requests, such input will be with shape [0]. + //@@ The data of the tensor will be uninitialized. + //@@ + BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3; + + //@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4 + //@@ + //@@ Among the requests in the batch, the shape of the + //@@ 'source_input' will be added as input with shape + //@@ [batch_size, len(input_dim)]. For example, if one + //@@ batch-2 input with shape [3, 1] and batch-1 input + //@@ with shape [2, 2] are batched, the batch input will + //@@ have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]]. + //@@ + BATCH_ITEM_SHAPE = 4; + + //@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5 + //@@ + //@@ Among the requests in the batch, the shape of the + //@@ 'source_input' will be added as input with single dimensional + //@@ shape [batch_size * len(input_dim)]. For example, if one + //@@ batch-2 input with shape [3, 1] and batch-1 input + //@@ with shape [2, 2] are batched, the batch input will + //@@ have shape [6] and value [3, 1, 3, 1, 2, 2]. + //@@ + BATCH_ITEM_SHAPE_FLATTEN = 5; + } + + //@@ .. cpp:var:: Kind kind + //@@ + //@@ The kind of this batch input. + //@@ + Kind kind = 1; + + //@@ .. cpp:var:: string target_name (repeated) + //@@ + //@@ The name of the model inputs that the backend will create + //@@ for this batch input. + //@@ + repeated string target_name = 2; + + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The input's datatype. The data type can be TYPE_INT32 or + //@@ TYPE_FP32. + //@@ + DataType data_type = 3; + + //@@ .. cpp:var:: string source_input (repeated) + //@@ + //@@ The backend derives the value for each batch input from one or + //@@ more other inputs. 'source_input' gives the names of those + //@@ inputs. + //@@ + repeated string source_input = 4; +} + +//@@.. 
cpp:var:: message BatchOutput +//@@ +//@@ A batch output is an output produced by the model that must be handled +//@@ differently by the backend based on all the requests in a batch. +//@@ +message BatchOutput +{ + //@@ + //@@ .. cpp:enum:: Kind + //@@ + //@@ The kind of the batch output. + //@@ + enum Kind { + //@@ .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0 + //@@ + //@@ The output should be scattered according to the shape of + //@@ 'source_input'. The dynamic dimension of the output will + //@@ be set to the value of the same dimension in the input. + //@@ + BATCH_SCATTER_WITH_INPUT_SHAPE = 0; + } + + //@@ .. cpp:var:: string target_name (repeated) + //@@ + //@@ The name of the outputs to be produced by this batch output + //@@ specification. + //@@ + repeated string target_name = 1; + + //@@ .. cpp:var:: Kind kind + //@@ + //@@ The kind of this batch output. + //@@ + Kind kind = 2; + + //@@ .. cpp:var:: string source_input (repeated) + //@@ + //@@ The backend derives each batch output from one or more inputs. + //@@ 'source_input' gives the names of those inputs. + //@@ + repeated string source_input = 3; +} + +//@@ +//@@.. cpp:var:: message ModelVersionPolicy +//@@ +//@@ Policy indicating which versions of a model should be made +//@@ available by the inference server. +//@@ +message ModelVersionPolicy +{ + //@@ .. cpp:var:: message Latest + //@@ + //@@ Serve only the latest version(s) of a model. This is + //@@ the default policy. + //@@ + message Latest + { + //@@ .. cpp:var:: uint32 num_versions + //@@ + //@@ Serve only the 'num_versions' highest-numbered versions. T + //@@ The default value of 'num_versions' is 1, indicating that by + //@@ default only the single highest-number version of a + //@@ model will be served. + //@@ + uint32 num_versions = 1; + } + + //@@ .. cpp:var:: message All + //@@ + //@@ Serve all versions of the model. + //@@ + message All {} + + //@@ .. cpp:var:: message Specific + //@@ + //@@ Serve only specific versions of the model. + //@@ + message Specific + { + //@@ .. cpp:var:: int64 versions (repeated) + //@@ + //@@ The specific versions of the model that will be served. + //@@ + repeated int64 versions = 1; + } + + //@@ .. cpp:var:: oneof policy_choice + //@@ + //@@ Each model must implement only a single version policy. The + //@@ default policy is 'Latest'. + //@@ + oneof policy_choice + { + //@@ .. cpp:var:: Latest latest + //@@ + //@@ Serve only latest version(s) of the model. + //@@ + Latest latest = 1; + + //@@ .. cpp:var:: All all + //@@ + //@@ Serve all versions of the model. + //@@ + All all = 2; + + //@@ .. cpp:var:: Specific specific + //@@ + //@@ Serve only specific version(s) of the model. + //@@ + Specific specific = 3; + } +} + +//@@ +//@@.. cpp:var:: message ModelOptimizationPolicy +//@@ +//@@ Optimization settings for a model. These settings control if/how a +//@@ model is optimized and prioritized by the backend framework when +//@@ it is loaded. +//@@ +message ModelOptimizationPolicy +{ + //@@ + //@@ .. cpp:var:: message Graph + //@@ + //@@ Enable generic graph optimization of the model. If not specified + //@@ the framework's default level of optimization is used. Supports + //@@ TensorFlow graphdef and savedmodel and Onnx models. For TensorFlow + //@@ causes XLA to be enabled/disabled for the model. For Onnx defaults + //@@ to enabling all optimizations, -1 enables only basic optimizations, + //@@ +1 enables only basic and extended optimizations. + //@@ + message Graph + { + //@@ .. 
cpp:var:: int32 level + //@@ + //@@ The optimization level. Defaults to 0 (zero) if not specified. + //@@ + //@@ - -1: Disabled + //@@ - 0: Framework default + //@@ - 1+: Enable optimization level (greater values indicate + //@@ higher optimization levels) + //@@ + int32 level = 1; + } + + //@@ + //@@ .. cpp:enum:: ModelPriority + //@@ + //@@ Model priorities. A model will be given scheduling and execution + //@@ preference over models at lower priorities. Current model + //@@ priorities only work for TensorRT models. + //@@ + enum ModelPriority { + //@@ .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0 + //@@ + //@@ The default model priority. + //@@ + PRIORITY_DEFAULT = 0; + + //@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1 + //@@ + //@@ The maximum model priority. + //@@ + PRIORITY_MAX = 1; + + //@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2 + //@@ + //@@ The minimum model priority. + //@@ + PRIORITY_MIN = 2; + } + + //@@ + //@@ .. cpp:var:: message Cuda + //@@ + //@@ CUDA-specific optimization settings. + //@@ + message Cuda + { + //@@ .. cpp:var:: message GraphSpec + //@@ + //@@ Specification of the CUDA graph to be captured. + //@@ + message GraphSpec + { + //@@ .. cpp:var:: message Dims + //@@ + //@@ Specification of tensor dimension. + //@@ + message Shape + { + //@@ .. cpp:var:: int64 dim (repeated) + //@@ + //@@ The dimension. + //@@ + repeated int64 dim = 1; + } + + message LowerBound + { + //@@ .. cpp:var:: int32 batch_size + //@@ + //@@ The batch size of the CUDA graph. If 'max_batch_size' is 0, + //@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must + //@@ be set to value between 1 and 'max_batch_size'. + //@@ + int32 batch_size = 1; + + //@@ .. cpp:var:: map input + //@@ + //@@ The specification of the inputs. 'Shape' is the shape of + //@@ the input without batching dimension. + //@@ + map input = 2; + } + + //@@ .. cpp:var:: int32 batch_size + //@@ + //@@ The batch size of the CUDA graph. If 'max_batch_size' is 0, + //@@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must + //@@ be set to value between 1 and 'max_batch_size'. + //@@ + int32 batch_size = 1; + + //@@ .. cpp:var:: map input + //@@ + //@@ The specification of the inputs. 'Shape' is the shape of the + //@@ input without batching dimension. + //@@ + map input = 2; + + //@@ .. cpp:var:: LowerBound graph_lower_bound + //@@ + //@@ Specify the lower bound of the CUDA graph. Optional. + //@@ If specified, the graph can be used for input shapes and + //@@ batch sizes that are in closed interval between the lower + //@@ bound specification and graph specification. For dynamic + //@@ shape model, this allows CUDA graphs to be launched + //@@ frequently without capturing all possible shape combinations. + //@@ However, using graph for shape combinations different from + //@@ the one used for capturing introduces uninitialized data for + //@@ execution and it may distort the inference result if + //@@ the model is sensitive to uninitialized data. + //@@ + LowerBound graph_lower_bound = 3; + } + + //@@ .. cpp:var:: bool graphs + //@@ + //@@ Use CUDA graphs API to capture model operations and execute + //@@ them more efficiently. Default value is false. + //@@ Currently only recognized by TensorRT backend. + //@@ + bool graphs = 1; + + //@@ .. cpp:var:: bool busy_wait_events + //@@ + //@@ Use busy-waiting to synchronize CUDA events to achieve minimum + //@@ latency from event complete to host thread to be notified, with + //@@ the cost of high CPU load. Default value is false. 
+ //@@ Currently only recognized by TensorRT backend. + //@@ + bool busy_wait_events = 2; + + //@@ .. cpp:var:: GraphSpec graph_spec (repeated) + //@@ + //@@ Specification of the CUDA graph to be captured. If not specified + //@@ and 'graphs' is true, the default CUDA graphs will be captured + //@@ based on model settings. + //@@ Currently only recognized by TensorRT backend. + //@@ + repeated GraphSpec graph_spec = 3; + + //@@ .. cpp:var:: bool output_copy_stream + //@@ + //@@ Uses a CUDA stream separate from the inference stream to copy the + //@@ output to host. However, be aware that setting this option to + //@@ true will lead to an increase in the memory consumption of the + //@@ model as Triton will allocate twice as much GPU memory for its + //@@ I/O tensor buffers. Default value is false. + //@@ Currently only recognized by TensorRT backend. + //@@ + bool output_copy_stream = 4; + } + + //@@ + //@@ .. cpp:var:: message ExecutionAccelerators + //@@ + //@@ Specify the preferred execution accelerators to be used to execute + //@@ the model. Currently only recognized by ONNX Runtime backend and + //@@ TensorFlow backend. + //@@ + //@@ For ONNX Runtime backend, it will deploy the model with the execution + //@@ accelerators by priority, the priority is determined based on the + //@@ order that they are set, i.e. the provider at the front has highest + //@@ priority. Overall, the priority will be in the following order: + //@@ (if instance is on GPU) + //@@ CUDA Execution Provider (if instance is on GPU) + //@@ + //@@ Default CPU Execution Provider + //@@ + message ExecutionAccelerators + { + //@@ + //@@ .. cpp:var:: message Accelerator + //@@ + //@@ Specify the accelerator to be used to execute the model. + //@@ Accelerator with the same name may accept different parameters + //@@ depending on the backends. + //@@ + message Accelerator + { + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the execution accelerator. + //@@ + string name = 1; + + //@@ .. cpp:var:: map parameters + //@@ + //@@ Additional paremeters used to configure the accelerator. + //@@ + map parameters = 2; + } + + //@@ .. cpp:var:: Accelerator gpu_execution_accelerator (repeated) + //@@ + //@@ The preferred execution provider to be used if the model instance + //@@ is deployed on GPU. + //@@ + //@@ For ONNX Runtime backend, possible value is "tensorrt" as name, + //@@ and no parameters are required. + //@@ + //@@ For TensorFlow backend, possible values are "tensorrt", + //@@ "auto_mixed_precision", "gpu_io". + //@@ + //@@ For "tensorrt", the following parameters can be specified: + //@@ "precision_mode": The precision used for optimization. + //@@ Allowed values are "FP32" and "FP16". Default value is "FP32". + //@@ + //@@ "max_cached_engines": The maximum number of cached TensorRT + //@@ engines in dynamic TensorRT ops. Default value is 100. + //@@ + //@@ "minimum_segment_size": The smallest model subgraph that will + //@@ be considered for optimization by TensorRT. Default value is 3. + //@@ + //@@ "max_workspace_size_bytes": The maximum GPU memory the model + //@@ can use temporarily during execution. Default value is 1GB. + //@@ + //@@ For "auto_mixed_precision", no parameters are required. If set, + //@@ the model will try to use FP16 for better performance. + //@@ This optimization can not be set with "tensorrt". + //@@ + //@@ For "gpu_io", no parameters are required. 
If set, the model will + //@@ be executed using TensorFlow Callable API to set input and output + //@@ tensors in GPU memory if possible, which can reduce data transfer + //@@ overhead if the model is used in ensemble. However, the Callable + //@@ object will be created on model creation and it will request all + //@@ outputs for every model execution, which may impact the + //@@ performance if a request does not require all outputs. This + //@@ optimization will only take affect if the model instance is + //@@ created with KIND_GPU. + //@@ + repeated Accelerator gpu_execution_accelerator = 1; + + //@@ .. cpp:var:: Accelerator cpu_execution_accelerator (repeated) + //@@ + //@@ The preferred execution provider to be used if the model instance + //@@ is deployed on CPU. + //@@ + //@@ For ONNX Runtime backend, possible value is "openvino" as name, + //@@ and no parameters are required. + //@@ + repeated Accelerator cpu_execution_accelerator = 2; + } + + //@@ + //@@ .. cpp:var:: message PinnedMemoryBuffer + //@@ + //@@ Specify whether to use a pinned memory buffer when transferring data + //@@ between non-pinned system memory and GPU memory. Using a pinned + //@@ memory buffer for system from/to GPU transfers will typically provide + //@@ increased performance. For example, in the common use case where the + //@@ request provides inputs and delivers outputs via non-pinned system + //@@ memory, if the model instance accepts GPU IOs, the inputs will be + //@@ processed by two copies: from non-pinned system memory to pinned + //@@ memory, and from pinned memory to GPU memory. Similarly, pinned + //@@ memory will be used for delivering the outputs. + //@@ + message PinnedMemoryBuffer + { + //@@ .. cpp:var:: bool enable + //@@ + //@@ Use pinned memory buffer. Default is true. + //@@ + bool enable = 1; + } + + //@@ .. cpp:var:: Graph graph + //@@ + //@@ The graph optimization setting for the model. Optional. + //@@ + Graph graph = 1; + + //@@ .. cpp:var:: ModelPriority priority + //@@ + //@@ The priority setting for the model. Optional. + //@@ + ModelPriority priority = 2; + + //@@ .. cpp:var:: Cuda cuda + //@@ + //@@ CUDA-specific optimization settings. Optional. + //@@ + Cuda cuda = 3; + + //@@ .. cpp:var:: ExecutionAccelerators execution_accelerators + //@@ + //@@ The accelerators used for the model. Optional. + //@@ + ExecutionAccelerators execution_accelerators = 4; + + //@@ .. cpp:var:: PinnedMemoryBuffer input_pinned_memory + //@@ + //@@ Use pinned memory buffer when the data transfer for inputs + //@@ is between GPU memory and non-pinned system memory. + //@@ Default is true. + //@@ + PinnedMemoryBuffer input_pinned_memory = 5; + + //@@ .. cpp:var:: PinnedMemoryBuffer output_pinned_memory + //@@ + //@@ Use pinned memory buffer when the data transfer for outputs + //@@ is between GPU memory and non-pinned system memory. + //@@ Default is true. + //@@ + PinnedMemoryBuffer output_pinned_memory = 6; + + //@@ .. cpp:var:: uint32 gather_kernel_buffer_threshold + //@@ + //@@ The backend may use a gather kernel to gather input data if the + //@@ device has direct access to the source buffer and the destination + //@@ buffer. In such case, the gather kernel will be used only if the + //@@ number of buffers to be gathered is greater or equal to + //@@ the specifed value. If 0, the gather kernel will be disabled. + //@@ Default value is 0. + //@@ Currently only recognized by TensorRT backend. + //@@ + uint32 gather_kernel_buffer_threshold = 7; + + //@@ .. 
cpp:var:: bool eager_batching + //@@ + //@@ Start preparing the next batch before the model instance is ready + //@@ for the next inference. This option can be used to overlap the + //@@ batch preparation with model execution, with the trade-off that + //@@ the next batch might be smaller than what it could have been. + //@@ Default value is false. + //@@ Currently only recognized by TensorRT backend. + //@@ + bool eager_batching = 8; +} + +//@@ +//@@.. cpp:var:: message ModelQueuePolicy +//@@ +//@@ Queue policy for inference requests. +//@@ +message ModelQueuePolicy +{ + //@@ + //@@ .. cpp:enum:: TimeoutAction + //@@ + //@@ The action applied to timed-out requests. + //@@ + enum TimeoutAction { + //@@ .. cpp:enumerator:: Action::REJECT = 0 + //@@ + //@@ Reject the request and return error message accordingly. + //@@ + REJECT = 0; + + //@@ .. cpp:enumerator:: Action::DELAY = 1 + //@@ + //@@ Delay the request until all other requests at the same + //@@ (or higher) priority levels that have not reached their timeouts + //@@ are processed. A delayed request will eventually be processed, + //@@ but may be delayed indefinitely due to newly arriving requests. + //@@ + DELAY = 1; + } + + //@@ + //@@ .. cpp:var:: TimeoutAction timeout_action + //@@ + //@@ The action applied to timed-out request. + //@@ The default action is REJECT. + //@@ + TimeoutAction timeout_action = 1; + + //@@ + //@@ .. cpp:var:: uint64 default_timeout_microseconds + //@@ + //@@ The default timeout for every request, in microseconds. + //@@ The default value is 0 which indicates that no timeout is set. + //@@ + uint64 default_timeout_microseconds = 2; + + //@@ + //@@ .. cpp:var:: bool allow_timeout_override + //@@ + //@@ Whether individual request can override the default timeout value. + //@@ When true, individual requests can set a timeout that is less than + //@@ the default timeout value but may not increase the timeout. + //@@ The default value is false. + //@@ + bool allow_timeout_override = 3; + + //@@ + //@@ .. cpp:var:: uint32 max_queue_size + //@@ + //@@ The maximum queue size for holding requests. A request will be + //@@ rejected immediately if it can't be enqueued because the queue is + //@@ full. The default value is 0 which indicates that no maximum + //@@ queue size is enforced. + //@@ + uint32 max_queue_size = 4; +} + +//@@ +//@@.. cpp:var:: message ModelDynamicBatching +//@@ +//@@ Dynamic batching configuration. These settings control how dynamic +//@@ batching operates for the model. +//@@ +message ModelDynamicBatching +{ + //@@ .. cpp:var:: int32 preferred_batch_size (repeated) + //@@ + //@@ Preferred batch sizes for dynamic batching. If a batch of one of + //@@ these sizes can be formed it will be executed immediately. If + //@@ not specified a preferred batch size will be chosen automatically + //@@ based on model and GPU characteristics. + //@@ + repeated int32 preferred_batch_size = 1; + + //@@ .. cpp:var:: uint64 max_queue_delay_microseconds + //@@ + //@@ The maximum time, in microseconds, a request will be delayed in + //@@ the scheduling queue to wait for additional requests for + //@@ batching. Default is 0. + //@@ + uint64 max_queue_delay_microseconds = 2; + + //@@ .. cpp:var:: bool preserve_ordering + //@@ + //@@ Should the dynamic batcher preserve the ordering of responses to + //@@ match the order of requests received by the scheduler. Default is + //@@ false. If true, the responses will be returned in the same order as + //@@ the order of requests sent to the scheduler. 
If false, the responses + //@@ may be returned in arbitrary order. This option is specifically + //@@ needed when a sequence of related inference requests (i.e. inference + //@@ requests with the same correlation ID) are sent to the dynamic + //@@ batcher to ensure that the sequence responses are in the correct + //@@ order. + //@@ + bool preserve_ordering = 3; + + //@@ .. cpp:var:: uint32 priority_levels + //@@ + //@@ The number of priority levels to be enabled for the model, + //@@ the priority level starts from 1 and 1 is the highest priority. + //@@ Requests are handled in priority order with all priority 1 requests + //@@ processed before priority 2, all priority 2 requests processed before + //@@ priority 3, etc. Requests with the same priority level will be + //@@ handled in the order that they are received. + //@@ + uint32 priority_levels = 4; + + //@@ .. cpp:var:: uint32 default_priority_level + //@@ + //@@ The priority level used for requests that don't specify their + //@@ priority. The value must be in the range [ 1, 'priority_levels' ]. + //@@ + uint32 default_priority_level = 5; + + //@@ .. cpp:var:: ModelQueuePolicy default_queue_policy + //@@ + //@@ The default queue policy used for requests that don't require + //@@ priority handling and requests that specify priority levels where + //@@ there is no specific policy given. If not specified, a policy with + //@@ default field values will be used. + //@@ + ModelQueuePolicy default_queue_policy = 6; + + //@@ .. cpp:var:: map priority_queue_policy + //@@ + //@@ Specify the queue policy for the priority level. The default queue + //@@ policy will be used if a priority level doesn't specify a queue + //@@ policy. + //@@ + map priority_queue_policy = 7; +} + +//@@ +//@@.. cpp:var:: message ModelSequenceBatching +//@@ +//@@ Sequence batching configuration. These settings control how sequence +//@@ batching operates for the model. +//@@ +message ModelSequenceBatching +{ + //@@ .. cpp:var:: message Control + //@@ + //@@ A control is a signal that the sequence batcher uses to + //@@ communicate with a backend. + //@@ + message Control + { + //@@ + //@@ .. cpp:enum:: Kind + //@@ + //@@ The kind of the control. + //@@ + enum Kind { + //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0 + //@@ + //@@ A new sequence is/is-not starting. If true a sequence is + //@@ starting, if false a sequence is continuing. Must + //@@ specify either int32_false_true, fp32_false_true or + //@@ bool_false_true for this control. This control is optional. + //@@ + CONTROL_SEQUENCE_START = 0; + + //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1 + //@@ + //@@ A sequence is/is-not ready for inference. If true the + //@@ input tensor data is valid and should be used. If false + //@@ the input tensor data is invalid and inferencing should + //@@ be "skipped". Must specify either int32_false_true, + //@@ fp32_false_true or bool_false_true for this control. This + //@@ control is optional. + //@@ + CONTROL_SEQUENCE_READY = 1; + + //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2 + //@@ + //@@ A sequence is/is-not ending. If true a sequence is + //@@ ending, if false a sequence is continuing. Must specify + //@@ either int32_false_true, fp32_false_true or bool_false_true + //@@ for this control. This control is optional. + //@@ + CONTROL_SEQUENCE_END = 2; + + //@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3 + //@@ + //@@ The correlation ID of the sequence. 
The correlation ID + //@@ is an uint64_t value that is communicated in whole or + //@@ in part by the tensor. The tensor's datatype must be + //@@ specified by data_type and must be TYPE_UINT64, TYPE_INT64, + //@@ TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified + //@@ the correlation ID will be truncated to the low-order 32 + //@@ bits. This control is optional. + //@@ + CONTROL_SEQUENCE_CORRID = 3; + } + + //@@ .. cpp:var:: Kind kind + //@@ + //@@ The kind of this control. + //@@ + Kind kind = 1; + + //@@ .. cpp:var:: int32 int32_false_true (repeated) + //@@ + //@@ The control's true and false setting is indicated by setting + //@@ a value in an int32 tensor. The tensor must be a + //@@ 1-dimensional tensor with size equal to the batch size of + //@@ the request. 'int32_false_true' must have two entries: the + //@@ first the false value and the second the true value. + //@@ + repeated int32 int32_false_true = 2; + + //@@ .. cpp:var:: float fp32_false_true (repeated) + //@@ + //@@ The control's true and false setting is indicated by setting + //@@ a value in a fp32 tensor. The tensor must be a + //@@ 1-dimensional tensor with size equal to the batch size of + //@@ the request. 'fp32_false_true' must have two entries: the + //@@ first the false value and the second the true value. + //@@ + repeated float fp32_false_true = 3; + + //@@ .. cpp:var:: bool bool_false_true (repeated) + //@@ + //@@ The control's true and false setting is indicated by setting + //@@ a value in a bool tensor. The tensor must be a + //@@ 1-dimensional tensor with size equal to the batch size of + //@@ the request. 'bool_false_true' must have two entries: the + //@@ first the false value and the second the true value. + //@@ + repeated bool bool_false_true = 5; + + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The control's datatype. + //@@ + DataType data_type = 4; + } + + //@@ .. cpp:var:: message ControlInput + //@@ + //@@ The sequence control values to communicate by a model input. + //@@ + message ControlInput + { + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the model input. + //@@ + string name = 1; + + //@@ .. cpp:var:: Control control (repeated) + //@@ + //@@ The control value(s) that should be communicated to the + //@@ model using this model input. + //@@ + repeated Control control = 2; + } + + //@@ + //@@ .. cpp:var:: message InitialState + //@@ + //@@ Settings used to initialize data for implicit state. + //@@ + message InitialState + { + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the state. + //@@ + DataType data_type = 1; + + //@@ .. cpp:var:: int64 dims (repeated) + //@@ + //@@ The shape of the state tensor, not including the batch dimension. + //@@ + repeated int64 dims = 2; + + //@@ .. cpp:var:: oneof state_data + //@@ + //@@ Specify how the initial state data is generated. + //@@ + oneof state_data + { + //@@ + //@@ .. cpp:var:: bool zero_data + //@@ + //@@ The identifier for using zeros as initial state data. + //@@ Note that the value of 'zero_data' will not be checked, + //@@ instead, zero data will be used as long as the field is set. + //@@ + bool zero_data = 3; + + //@@ .. cpp:var:: string data_file + //@@ + //@@ The file whose content will be used as the initial data for + //@@ the state in row-major order. The file must be provided in + //@@ sub-directory 'initial_state' under the model directory. + //@@ + string data_file = 4; + } + + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the state initialization. 
+ //@@ + string name = 5; + } + + //@@ .. cpp:var:: message State + //@@ + //@@ An input / output pair of tensors that carry state for the sequence. + //@@ + message State + { + //@@ .. cpp:var:: string input_name + //@@ + //@@ The name of the model state input. + //@@ + string input_name = 1; + + //@@ .. cpp:var:: string output_name + //@@ + //@@ The name of the model state output. + //@@ + string output_name = 2; + + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the state. + //@@ + DataType data_type = 3; + + //@@ .. cpp:var:: int64 dim (repeated) + //@@ + //@@ The dimension. + //@@ + repeated int64 dims = 4; + + //@@ .. cpp:var:: InitialState initial_state (repeated) + //@@ + //@@ The optional field to specify the initial state for the model. + //@@ + repeated InitialState initial_state = 5; + } + + //@@ .. cpp:var:: message StrategyDirect + //@@ + //@@ The sequence batcher uses a specific, unique batch + //@@ slot for each sequence. All inference requests in a + //@@ sequence are directed to the same batch slot in the same + //@@ model instance over the lifetime of the sequence. This + //@@ is the default strategy. + //@@ + message StrategyDirect + { + //@@ .. cpp:var:: uint64 max_queue_delay_microseconds + //@@ + //@@ The maximum time, in microseconds, a candidate request + //@@ will be delayed in the sequence batch scheduling queue to + //@@ wait for additional requests for batching. Default is 0. + //@@ + uint64 max_queue_delay_microseconds = 1; + + //@@ .. cpp:var:: float minimum_slot_utilization + //@@ + //@@ The minimum slot utilization that must be satisfied to + //@@ execute the batch before 'max_queue_delay_microseconds' expires. + //@@ For example, a value of 0.5 indicates that the batch should be + //@@ executed as soon as 50% or more of the slots are ready even if + //@@ the 'max_queue_delay_microseconds' timeout has not expired. + //@@ The default is 0.0, indicating that a batch will be executed + //@@ before 'max_queue_delay_microseconds' timeout expires if at least + //@@ one batch slot is ready. 'max_queue_delay_microseconds' will be + //@@ ignored unless minimum_slot_utilization is set to a non-zero + //@@ value. + //@@ + float minimum_slot_utilization = 2; + } + + //@@ .. cpp:var:: message StrategyOldest + //@@ + //@@ The sequence batcher maintains up to 'max_candidate_sequences' + //@@ candidate sequences. 'max_candidate_sequences' can be greater + //@@ than the model's 'max_batch_size'. For inferencing the batcher + //@@ chooses from the candidate sequences up to 'max_batch_size' + //@@ inference requests. Requests are chosen in an oldest-first + //@@ manner across all candidate sequences. A given sequence is + //@@ not guaranteed to be assigned to the same batch slot for + //@@ all inference requests of that sequence. + //@@ + message StrategyOldest + { + //@@ .. cpp:var:: int32 max_candidate_sequences + //@@ + //@@ Maximum number of candidate sequences that the batcher + //@@ maintains. Excess seqences are kept in an ordered backlog + //@@ and become candidates when existing candidate sequences + //@@ complete. + //@@ + int32 max_candidate_sequences = 1; + + //@@ .. cpp:var:: int32 preferred_batch_size (repeated) + //@@ + //@@ Preferred batch sizes for dynamic batching of candidate + //@@ sequences. If a batch of one of these sizes can be formed + //@@ it will be executed immediately. If not specified a + //@@ preferred batch size will be chosen automatically + //@@ based on model and GPU characteristics. 
+ //@@ + repeated int32 preferred_batch_size = 2; + + //@@ .. cpp:var:: uint64 max_queue_delay_microseconds + //@@ + //@@ The maximum time, in microseconds, a candidate request + //@@ will be delayed in the dynamic batch scheduling queue to + //@@ wait for additional requests for batching. Default is 0. + //@@ + uint64 max_queue_delay_microseconds = 3; + } + + //@@ .. cpp:var:: oneof strategy_choice + //@@ + //@@ The strategy used by the sequence batcher. Default strategy + //@@ is 'direct'. + //@@ + oneof strategy_choice + { + //@@ .. cpp:var:: StrategyDirect direct + //@@ + //@@ StrategyDirect scheduling strategy. + //@@ + StrategyDirect direct = 3; + + //@@ .. cpp:var:: StrategyOldest oldest + //@@ + //@@ StrategyOldest scheduling strategy. + //@@ + StrategyOldest oldest = 4; + } + + //@@ .. cpp:var:: uint64 max_sequence_idle_microseconds + //@@ + //@@ The maximum time, in microseconds, that a sequence is allowed to + //@@ be idle before it is aborted. The inference server considers a + //@@ sequence idle when it does not have any inference request queued + //@@ for the sequence. If this limit is exceeded, the inference server + //@@ will free the sequence slot allocated by the sequence and make it + //@@ available for another sequence. If not specified (or specified as + //@@ zero) a default value of 1000000 (1 second) is used. + //@@ + uint64 max_sequence_idle_microseconds = 1; + + //@@ .. cpp:var:: ControlInput control_input (repeated) + //@@ + //@@ The model input(s) that the server should use to communicate + //@@ sequence start, stop, ready and similar control values to the + //@@ model. + //@@ + repeated ControlInput control_input = 2; + + //@@ .. cpp:var:: State state (repeated) + //@@ + //@@ The optional state that can be stored in Triton for performing + //@@ inference requests on a sequence. Each sequence holds an implicit + //@@ state local to itself. The output state tensor provided by the + //@@ model in 'output_name' field of the current inference request will + //@@ be transferred as an input tensor named 'input_name' in the next + //@@ request of the same sequence. The input state of the first request + //@@ in the sequence contains garbage data. + //@@ + repeated State state = 5; +} + +//@@ +//@@.. cpp:var:: message ModelEnsembling +//@@ +//@@ Model ensembling configuration. These settings specify the models that +//@@ compose the ensemble and how data flows between the models. +//@@ +message ModelEnsembling +{ + //@@ .. cpp:var:: message Step + //@@ + //@@ Each step specifies a model included in the ensemble, + //@@ maps ensemble tensor names to the model input tensors, + //@@ and maps model output tensors to ensemble tensor names + //@@ + message Step + { + //@@ .. cpp:var:: string model_name + //@@ + //@@ The name of the model to execute for this step of the ensemble. + //@@ + string model_name = 1; + + //@@ .. cpp:var:: int64 model_version + //@@ + //@@ The version of the model to use for inference. If -1 + //@@ the latest/most-recent version of the model is used. + //@@ + int64 model_version = 2; + + //@@ .. cpp:var:: map input_map + //@@ + //@@ Map from name of an input tensor on this step's model to ensemble + //@@ tensor name. The ensemble tensor must have the same data type and + //@@ shape as the model input. Each model input must be assigned to + //@@ one ensemble tensor, but the same ensemble tensor can be assigned + //@@ to multiple model inputs. + //@@ + map input_map = 3; + + //@@ .. 
cpp:var:: map output_map + //@@ + //@@ Map from name of an output tensor on this step's model to ensemble + //@@ tensor name. The data type and shape of the ensemble tensor will + //@@ be inferred from the model output. It is optional to assign all + //@@ model outputs to ensemble tensors. One ensemble tensor name + //@@ can appear in an output map only once. + //@@ + map output_map = 4; + } + + //@@ .. cpp:var:: Step step (repeated) + //@@ + //@@ The models and the input / output mappings used within the ensemble. + //@@ + repeated Step step = 1; +} + +//@@ +//@@.. cpp:var:: message ModelParameter +//@@ +//@@ A model parameter. +//@@ +message ModelParameter +{ + //@@ .. cpp:var:: string string_value + //@@ + //@@ The string value of the parameter. + //@@ + string string_value = 1; +} + +//@@ +//@@.. cpp:var:: message ModelWarmup +//@@ +//@@ Settings used to construct the request sample for model warmup. +//@@ +message ModelWarmup +{ + //@@ + //@@ .. cpp:var:: message Input + //@@ + //@@ Meta data associated with an input. + //@@ + message Input + { + //@@ .. cpp:var:: DataType data_type + //@@ + //@@ The data-type of the input. + //@@ + DataType data_type = 1; + + //@@ .. cpp:var:: int64 dims (repeated) + //@@ + //@@ The shape of the input tensor, not including the batch dimension. + //@@ + repeated int64 dims = 2; + + //@@ .. cpp:var:: oneof input_data_type + //@@ + //@@ Specify how the input data is generated. If the input has STRING + //@@ data type and 'random_data' is set, the data generation will fall + //@@ back to 'zero_data'. + //@@ + oneof input_data_type + { + //@@ + //@@ .. cpp:var:: bool zero_data + //@@ + //@@ The identifier for using zeros as input data. Note that the + //@@ value of 'zero_data' will not be checked, instead, zero data + //@@ will be used as long as the field is set. + //@@ + bool zero_data = 3; + + //@@ + //@@ .. cpp:var:: bool random_data + //@@ + //@@ The identifier for using random data as input data. Note that + //@@ the value of 'random_data' will not be checked, instead, + //@@ random data will be used as long as the field is set. + //@@ + bool random_data = 4; + + //@@ .. cpp:var:: string input_data_file + //@@ + //@@ The file whose content will be used as raw input data in + //@@ row-major order. The file must be provided in a sub-directory + //@@ 'warmup' under the model directory. The file contents should be + //@@ in binary format. For TYPE_STRING data-type, an element is + //@@ represented by a 4-byte unsigned integer giving the length + //@@ followed by the actual bytes. + //@@ + string input_data_file = 5; + } + } + + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the request sample. + //@@ + string name = 1; + + //@@ .. cpp:var:: uint32 batch_size + //@@ + //@@ The batch size of the inference request. This must be >= 1. For + //@@ models that don't support batching, batch_size must be 1. If + //@@ batch_size > 1, the 'inputs' specified below will be duplicated to + //@@ match the batch size requested. + //@@ + uint32 batch_size = 2; + + //@@ .. cpp:var:: map inputs + //@@ + //@@ The warmup meta data associated with every model input, including + //@@ control tensors. + //@@ + map inputs = 3; + + //@@ .. cpp:var:: uint32 count + //@@ + //@@ The number of iterations that this warmup sample will be executed. + //@@ For example, if this field is set to 2, 2 model executions using this + //@@ sample will be scheduled for warmup. Default value is 0 which + //@@ indicates that this sample will be used only once. 
+ //@@ Note that for sequence model, 'count' may not work well + //@@ because the model often expect a valid sequence of requests which + //@@ should be represented by a series of warmup samples. 'count > 1' + //@@ essentially "resends" one of the sample, which may invalidate the + //@@ sequence and result in unexpected warmup failure. + //@@ + uint32 count = 4; +} + +//@@ +//@@ .. cpp:var:: message ModelOperations +//@@ +//@@ The metadata of libraries providing custom operations for this model. +//@@ +message ModelOperations +{ + //@@ .. cpp:var:: string op_library_filename (repeated) + //@@ + //@@ Optional paths of the libraries providing custom operations for + //@@ this model. Valid only for ONNX models. + //@@ + repeated string op_library_filename = 1; +} + +//@@ +//@@ .. cpp:var:: message ModelTransactionPolicy +//@@ +//@@ The specification that describes the nature of transactions +//@@ to be expected from the model. +//@@ +message ModelTransactionPolicy +{ + //@@ .. cpp:var:: bool decoupled + //@@ + //@@ Indicates whether responses generated by the model are decoupled with + //@@ the requests issued to it, which means the number of responses + //@@ generated by model may differ from number of requests issued, and + //@@ that the responses may be out of order relative to the order of + //@@ requests. The default is false, which means the model will generate + //@@ exactly one response for each request. + //@@ + bool decoupled = 1; +} + +//@@ +//@@.. cpp:var:: message ModelRepositoryAgents +//@@ +//@@ The repository agents for the model. +//@@ +message ModelRepositoryAgents +{ + //@@ + //@@ .. cpp:var:: message Agent + //@@ + //@@ A repository agent that should be invoked for the specified + //@@ repository actions for this model. + //@@ + message Agent + { + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the agent. + //@@ + string name = 1; + + //@@ .. cpp:var:: map parameters + //@@ + //@@ The parameters for the agent. + //@@ + map parameters = 2; + } + + //@@ + //@@ .. cpp:var:: Agent agents (repeated) + //@@ + //@@ The ordered list of agents for the model. These agents will be + //@@ invoked in order to respond to repository actions occuring for the + //@@ model. + //@@ + repeated Agent agents = 1; +} + +//@@ +//@@.. cpp:var:: message ModelResponseCache +//@@ +//@@ The response cache setting for the model. +//@@ +message ModelResponseCache +{ + //@@ + //@@ .. cpp::var:: bool enable + //@@ + //@@ Whether or not to use response cache for the model. If True, the + //@@ responses from the model are cached and when identical request + //@@ is encountered, instead of going through the model execution, + //@@ the response from the cache is utilized. By default, response + //@@ cache is disabled for the models. + //@@ + bool enable = 1; +} + +//@@ +//@@.. cpp:var:: message ModelConfig +//@@ +//@@ A model configuration. +//@@ +message ModelConfig +{ + //@@ .. cpp:var:: string name + //@@ + //@@ The name of the model. + //@@ + string name = 1; + + //@@ .. cpp:var:: string platform + //@@ + //@@ The framework for the model. Possible values are + //@@ "tensorrt_plan", "tensorflow_graphdef", + //@@ "tensorflow_savedmodel", "onnxruntime_onnx", + //@@ "pytorch_libtorch". + //@@ + string platform = 2; + + //@@ .. cpp:var:: string backend + //@@ + //@@ The backend used by the model. + //@@ + string backend = 17; + + //@@ .. cpp:var:: ModelVersionPolicy version_policy + //@@ + //@@ Policy indicating which version(s) of the model will be served. 
+ //@@ + ModelVersionPolicy version_policy = 3; + + //@@ .. cpp:var:: int32 max_batch_size + //@@ + //@@ Maximum batch size allowed for inference. This can only decrease + //@@ what is allowed by the model itself. A max_batch_size value of 0 + //@@ indicates that batching is not allowed for the model and the + //@@ dimension/shape of the input and output tensors must exactly + //@@ match what is specified in the input and output configuration. A + //@@ max_batch_size value > 0 indicates that batching is allowed and + //@@ so the model expects the input tensors to have an additional + //@@ initial dimension for the batching that is not specified in the + //@@ input (for example, if the model supports batched inputs of + //@@ 2-dimensional tensors then the model configuration will specify + //@@ the input shape as [ X, Y ] but the model will expect the actual + //@@ input tensors to have shape [ N, X, Y ]). For max_batch_size > 0 + //@@ returned outputs will also have an additional initial dimension + //@@ for the batch. + //@@ + int32 max_batch_size = 4; + + //@@ .. cpp:var:: ModelInput input (repeated) + //@@ + //@@ The inputs request by the model. + //@@ + repeated ModelInput input = 5; + + //@@ .. cpp:var:: ModelOutput output (repeated) + //@@ + //@@ The outputs produced by the model. + //@@ + repeated ModelOutput output = 6; + + //@@ .. cpp:var:: BatchInput batch_input (repeated) + //@@ + //@@ The model input(s) that the server should use to communicate + //@@ batch related values to the model. + //@@ + repeated BatchInput batch_input = 20; + + //@@ .. cpp:var:: BatchOutput batch_output (repeated) + //@@ + //@@ The outputs produced by the model that requires special handling + //@@ by the model backend. + //@@ + repeated BatchOutput batch_output = 21; + + //@@ .. cpp:var:: ModelOptimizationPolicy optimization + //@@ + //@@ Optimization configuration for the model. If not specified + //@@ then default optimization policy is used. + //@@ + ModelOptimizationPolicy optimization = 12; + + //@@ .. cpp:var:: oneof scheduling_choice + //@@ + //@@ The scheduling policy for the model. If not specified the + //@@ default scheduling policy is used for the model. The default + //@@ policy is to execute each inference request independently. + //@@ + oneof scheduling_choice + { + //@@ .. cpp:var:: ModelDynamicBatching dynamic_batching + //@@ + //@@ If specified, enables the dynamic-batching scheduling + //@@ policy. With dynamic-batching the scheduler may group + //@@ together independent requests into a single batch to + //@@ improve inference throughput. + //@@ + ModelDynamicBatching dynamic_batching = 11; + + //@@ .. cpp:var:: ModelSequenceBatching sequence_batching + //@@ + //@@ If specified, enables the sequence-batching scheduling + //@@ policy. With sequence-batching, inference requests + //@@ with the same correlation ID are routed to the same + //@@ model instance. Multiple sequences of inference requests + //@@ may be batched together into a single batch to + //@@ improve inference throughput. + //@@ + ModelSequenceBatching sequence_batching = 13; + + //@@ .. cpp:var:: ModelEnsembling ensemble_scheduling + //@@ + //@@ If specified, enables the model-ensembling scheduling + //@@ policy. With model-ensembling, inference requests + //@@ will be processed according to the specification, such as an + //@@ execution sequence of models. The input specified in this model + //@@ config will be the input for the ensemble, and the output + //@@ specified will be the output of the ensemble. 
+ //@@ + ModelEnsembling ensemble_scheduling = 15; + } + + //@@ .. cpp:var:: ModelInstanceGroup instance_group (repeated) + //@@ + //@@ Instances of this model. If not specified, one instance + //@@ of the model will be instantiated on each available GPU. + //@@ + repeated ModelInstanceGroup instance_group = 7; + + //@@ .. cpp:var:: string default_model_filename + //@@ + //@@ Optional filename of the model file to use if a + //@@ compute-capability specific model is not specified in + //@@ :cpp:var:`cc_model_filenames`. If not specified the default name + //@@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or + //@@ 'model.pt' depending on the model type. + //@@ + string default_model_filename = 8; + + //@@ .. cpp:var:: map cc_model_filenames + //@@ + //@@ Optional map from CUDA compute capability to the filename of + //@@ the model that supports that compute capability. The filename + //@@ refers to a file within the model version directory. + //@@ + map cc_model_filenames = 9; + + //@@ .. cpp:var:: map metric_tags + //@@ + //@@ Optional metric tags. User-specific key-value pairs for metrics + //@@ reported for this model. These tags are applied to the metrics + //@@ reported on the HTTP metrics port. + //@@ + map metric_tags = 10; + + //@@ .. cpp:var:: map parameters + //@@ + //@@ Optional model parameters. User-specified parameter values. + //@@ + map parameters = 14; + + //@@ .. cpp:var:: ModelWarmup model_warmup (repeated) + //@@ + //@@ Warmup setting of this model. If specified, all instances + //@@ will be run with the request samples in sequence before + //@@ serving the model. + //@@ This field can only be specified if the model is not an ensemble + //@@ model. + //@@ + repeated ModelWarmup model_warmup = 16; + + //@@ .. cpp:var:: ModelOperations model_operations + //@@ + //@@ Optional metadata of the libraries providing custom operations for + //@@ this model. + //@@ + ModelOperations model_operations = 18; + + //@@ .. cpp:var:: ModelTransactionPolicy model_transaction_policy + //@@ + //@@ Optional specification that describes the nature of transactions + //@@ to be expected from the model. + //@@ + ModelTransactionPolicy model_transaction_policy = 19; + + //@@ .. cpp:var:: ModelRepositoryAgents model_repository_agents + //@@ + //@@ Optional specification of the agent(s) that should be invoked + //@@ with repository actions are performed for this model. + //@@ + ModelRepositoryAgents model_repository_agents = 23; + + //@@ .. cpp:var:: ModelResponseCache response_cache + //@@ + //@@ Optional setting for utilizing the response cache for this + //@@ model. + //@@ + ModelResponseCache response_cache = 24; +} \ No newline at end of file diff --git a/visualdl/component/inference/proto/model_config_pb2.py b/visualdl/component/inference/proto/model_config_pb2.py new file mode 100644 index 000000000..70bf7b906 --- /dev/null +++ b/visualdl/component/inference/proto/model_config_pb2.py @@ -0,0 +1,856 @@ +# flake8: noqa +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! 
+# source: model_config.protxt +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import descriptor_pool as _descriptor_pool +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database +from google.protobuf.internal import enum_type_wrapper +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x13model_config.protxt\x12\tinference\"\x96\x01\n\x10ModelRateLimiter\x12\x37\n\tresources\x18\x01 \x03(\x0b\x32$.inference.ModelRateLimiter.Resource\x12\x10\n\x08priority\x18\x02 \x01(\r\x1a\x37\n\x08Resource\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0e\n\x06global\x18\x02 \x01(\x08\x12\r\n\x05\x63ount\x18\x03 \x01(\r\"\x87\x04\n\x12ModelInstanceGroup\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x30\n\x04kind\x18\x04 \x01(\x0e\x32\".inference.ModelInstanceGroup.Kind\x12\r\n\x05\x63ount\x18\x02 \x01(\x05\x12\x31\n\x0crate_limiter\x18\x06 \x01(\x0b\x32\x1b.inference.ModelRateLimiter\x12\x0c\n\x04gpus\x18\x03 \x03(\x05\x12H\n\x11secondary_devices\x18\x08 \x03(\x0b\x32-.inference.ModelInstanceGroup.SecondaryDevice\x12\x0f\n\x07profile\x18\x05 \x03(\t\x12\x0f\n\x07passive\x18\x07 \x01(\x08\x12\x13\n\x0bhost_policy\x18\t \x01(\t\x1a\x9c\x01\n\x0fSecondaryDevice\x12O\n\x04kind\x18\x01 \x01(\x0e\x32\x41.inference.ModelInstanceGroup.SecondaryDevice.SecondaryDeviceKind\x12\x11\n\tdevice_id\x18\x02 \x01(\x03\"%\n\x13SecondaryDeviceKind\x12\x0e\n\nKIND_NVDLA\x10\x00\"A\n\x04Kind\x12\r\n\tKIND_AUTO\x10\x00\x12\x0c\n\x08KIND_GPU\x10\x01\x12\x0c\n\x08KIND_CPU\x10\x02\x12\x0e\n\nKIND_MODEL\x10\x03\"#\n\x12ModelTensorReshape\x12\r\n\x05shape\x18\x01 \x03(\x03\"\xb2\x02\n\nModelInput\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\tdata_type\x18\x02 \x01(\x0e\x32\x13.inference.DataType\x12,\n\x06\x66ormat\x18\x03 \x01(\x0e\x32\x1c.inference.ModelInput.Format\x12\x0c\n\x04\x64ims\x18\x04 \x03(\x03\x12.\n\x07reshape\x18\x05 \x01(\x0b\x32\x1d.inference.ModelTensorReshape\x12\x17\n\x0fis_shape_tensor\x18\x06 \x01(\x08\x12\x1a\n\x12\x61llow_ragged_batch\x18\x07 \x01(\x08\x12\x10\n\x08optional\x18\x08 \x01(\x08\";\n\x06\x46ormat\x12\x0f\n\x0b\x46ORMAT_NONE\x10\x00\x12\x0f\n\x0b\x46ORMAT_NHWC\x10\x01\x12\x0f\n\x0b\x46ORMAT_NCHW\x10\x02\"\xb2\x01\n\x0bModelOutput\x12\x0c\n\x04name\x18\x01 \x01(\t\x12&\n\tdata_type\x18\x02 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x03 \x03(\x03\x12.\n\x07reshape\x18\x05 \x01(\x0b\x32\x1d.inference.ModelTensorReshape\x12\x16\n\x0elabel_filename\x18\x04 \x01(\t\x12\x17\n\x0fis_shape_tensor\x18\x06 \x01(\x08\"\xd9\x02\n\nBatchInput\x12(\n\x04kind\x18\x01 \x01(\x0e\x32\x1a.inference.BatchInput.Kind\x12\x13\n\x0btarget_name\x18\x02 \x03(\t\x12&\n\tdata_type\x18\x03 \x01(\x0e\x32\x13.inference.DataType\x12\x14\n\x0csource_input\x18\x04 \x03(\t\"\xcd\x01\n\x04Kind\x12\x17\n\x13\x42\x41TCH_ELEMENT_COUNT\x10\x00\x12#\n\x1f\x42\x41TCH_ACCUMULATED_ELEMENT_COUNT\x10\x01\x12-\n)BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO\x10\x02\x12$\n BATCH_MAX_ELEMENT_COUNT_AS_SHAPE\x10\x03\x12\x14\n\x10\x42\x41TCH_ITEM_SHAPE\x10\x04\x12\x1c\n\x18\x42\x41TCH_ITEM_SHAPE_FLATTEN\x10\x05\"\x8f\x01\n\x0b\x42\x61tchOutput\x12\x13\n\x0btarget_name\x18\x01 \x03(\t\x12)\n\x04kind\x18\x02 \x01(\x0e\x32\x1b.inference.BatchOutput.Kind\x12\x14\n\x0csource_input\x18\x03 
\x03(\t\"*\n\x04Kind\x12\"\n\x1e\x42\x41TCH_SCATTER_WITH_INPUT_SHAPE\x10\x00\"\x90\x02\n\x12ModelVersionPolicy\x12\x36\n\x06latest\x18\x01 \x01(\x0b\x32$.inference.ModelVersionPolicy.LatestH\x00\x12\x30\n\x03\x61ll\x18\x02 \x01(\x0b\x32!.inference.ModelVersionPolicy.AllH\x00\x12:\n\x08specific\x18\x03 \x01(\x0b\x32&.inference.ModelVersionPolicy.SpecificH\x00\x1a\x1e\n\x06Latest\x12\x14\n\x0cnum_versions\x18\x01 \x01(\r\x1a\x05\n\x03\x41ll\x1a\x1c\n\x08Specific\x12\x10\n\x08versions\x18\x01 \x03(\x03\x42\x0f\n\rpolicy_choice\"\xfd\r\n\x17ModelOptimizationPolicy\x12\x37\n\x05graph\x18\x01 \x01(\x0b\x32(.inference.ModelOptimizationPolicy.Graph\x12\x42\n\x08priority\x18\x02 \x01(\x0e\x32\x30.inference.ModelOptimizationPolicy.ModelPriority\x12\x35\n\x04\x63uda\x18\x03 \x01(\x0b\x32\'.inference.ModelOptimizationPolicy.Cuda\x12X\n\x16\x65xecution_accelerators\x18\x04 \x01(\x0b\x32\x38.inference.ModelOptimizationPolicy.ExecutionAccelerators\x12R\n\x13input_pinned_memory\x18\x05 \x01(\x0b\x32\x35.inference.ModelOptimizationPolicy.PinnedMemoryBuffer\x12S\n\x14output_pinned_memory\x18\x06 \x01(\x0b\x32\x35.inference.ModelOptimizationPolicy.PinnedMemoryBuffer\x12&\n\x1egather_kernel_buffer_threshold\x18\x07 \x01(\r\x12\x16\n\x0e\x65\x61ger_batching\x18\x08 \x01(\x08\x1a\x16\n\x05Graph\x12\r\n\x05level\x18\x01 \x01(\x05\x1a\xba\x05\n\x04\x43uda\x12\x0e\n\x06graphs\x18\x01 \x01(\x08\x12\x18\n\x10\x62usy_wait_events\x18\x02 \x01(\x08\x12\x45\n\ngraph_spec\x18\x03 \x03(\x0b\x32\x31.inference.ModelOptimizationPolicy.Cuda.GraphSpec\x12\x1a\n\x12output_copy_stream\x18\x04 \x01(\x08\x1a\xa4\x04\n\tGraphSpec\x12\x12\n\nbatch_size\x18\x01 \x01(\x05\x12K\n\x05input\x18\x02 \x03(\x0b\x32<.inference.ModelOptimizationPolicy.Cuda.GraphSpec.InputEntry\x12W\n\x11graph_lower_bound\x18\x03 \x01(\x0b\x32<.inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound\x1a\x14\n\x05Shape\x12\x0b\n\x03\x64im\x18\x01 \x03(\x03\x1a\xdf\x01\n\nLowerBound\x12\x12\n\nbatch_size\x18\x01 \x01(\x05\x12V\n\x05input\x18\x02 \x03(\x0b\x32G.inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound.InputEntry\x1a\x65\n\nInputEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x46\n\x05value\x18\x02 \x01(\x0b\x32\x37.inference.ModelOptimizationPolicy.Cuda.GraphSpec.Shape:\x02\x38\x01\x1a\x65\n\nInputEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\x46\n\x05value\x18\x02 \x01(\x0b\x32\x37.inference.ModelOptimizationPolicy.Cuda.GraphSpec.Shape:\x02\x38\x01\x1a\xa4\x03\n\x15\x45xecutionAccelerators\x12g\n\x19gpu_execution_accelerator\x18\x01 \x03(\x0b\x32\x44.inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator\x12g\n\x19\x63pu_execution_accelerator\x18\x02 \x03(\x0b\x32\x44.inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator\x1a\xb8\x01\n\x0b\x41\x63\x63\x65lerator\x12\x0c\n\x04name\x18\x01 \x01(\t\x12h\n\nparameters\x18\x02 \x03(\x0b\x32T.inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator.ParametersEntry\x1a\x31\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a$\n\x12PinnedMemoryBuffer\x12\x0e\n\x06\x65nable\x18\x01 \x01(\x08\"I\n\rModelPriority\x12\x14\n\x10PRIORITY_DEFAULT\x10\x00\x12\x10\n\x0cPRIORITY_MAX\x10\x01\x12\x10\n\x0cPRIORITY_MIN\x10\x02\"\xdb\x01\n\x10ModelQueuePolicy\x12\x41\n\x0etimeout_action\x18\x01 \x01(\x0e\x32).inference.ModelQueuePolicy.TimeoutAction\x12$\n\x1c\x64\x65\x66\x61ult_timeout_microseconds\x18\x02 \x01(\x04\x12\x1e\n\x16\x61llow_timeout_override\x18\x03 \x01(\x08\x12\x16\n\x0emax_queue_size\x18\x04 
\x01(\r\"&\n\rTimeoutAction\x12\n\n\x06REJECT\x10\x00\x12\t\n\x05\x44\x45LAY\x10\x01\"\x9b\x03\n\x14ModelDynamicBatching\x12\x1c\n\x14preferred_batch_size\x18\x01 \x03(\x05\x12$\n\x1cmax_queue_delay_microseconds\x18\x02 \x01(\x04\x12\x19\n\x11preserve_ordering\x18\x03 \x01(\x08\x12\x17\n\x0fpriority_levels\x18\x04 \x01(\r\x12\x1e\n\x16\x64\x65\x66\x61ult_priority_level\x18\x05 \x01(\r\x12\x39\n\x14\x64\x65\x66\x61ult_queue_policy\x18\x06 \x01(\x0b\x32\x1b.inference.ModelQueuePolicy\x12W\n\x15priority_queue_policy\x18\x07 \x03(\x0b\x32\x38.inference.ModelDynamicBatching.PriorityQueuePolicyEntry\x1aW\n\x18PriorityQueuePolicyEntry\x12\x0b\n\x03key\x18\x01 \x01(\r\x12*\n\x05value\x18\x02 \x01(\x0b\x32\x1b.inference.ModelQueuePolicy:\x02\x38\x01\"\xef\t\n\x15ModelSequenceBatching\x12\x41\n\x06\x64irect\x18\x03 \x01(\x0b\x32/.inference.ModelSequenceBatching.StrategyDirectH\x00\x12\x41\n\x06oldest\x18\x04 \x01(\x0b\x32/.inference.ModelSequenceBatching.StrategyOldestH\x00\x12&\n\x1emax_sequence_idle_microseconds\x18\x01 \x01(\x04\x12\x44\n\rcontrol_input\x18\x02 \x03(\x0b\x32-.inference.ModelSequenceBatching.ControlInput\x12\x35\n\x05state\x18\x05 \x03(\x0b\x32&.inference.ModelSequenceBatching.State\x1a\xb1\x02\n\x07\x43ontrol\x12;\n\x04kind\x18\x01 \x01(\x0e\x32-.inference.ModelSequenceBatching.Control.Kind\x12\x18\n\x10int32_false_true\x18\x02 \x03(\x05\x12\x17\n\x0f\x66p32_false_true\x18\x03 \x03(\x02\x12\x17\n\x0f\x62ool_false_true\x18\x05 \x03(\x08\x12&\n\tdata_type\x18\x04 \x01(\x0e\x32\x13.inference.DataType\"u\n\x04Kind\x12\x1a\n\x16\x43ONTROL_SEQUENCE_START\x10\x00\x12\x1a\n\x16\x43ONTROL_SEQUENCE_READY\x10\x01\x12\x18\n\x14\x43ONTROL_SEQUENCE_END\x10\x02\x12\x1b\n\x17\x43ONTROL_SEQUENCE_CORRID\x10\x03\x1aW\n\x0c\x43ontrolInput\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x39\n\x07\x63ontrol\x18\x02 \x03(\x0b\x32(.inference.ModelSequenceBatching.Control\x1a\x8a\x01\n\x0cInitialState\x12&\n\tdata_type\x18\x01 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x12\x13\n\tzero_data\x18\x03 \x01(\x08H\x00\x12\x13\n\tdata_file\x18\x04 \x01(\tH\x00\x12\x0c\n\x04name\x18\x05 \x01(\tB\x0c\n\nstate_data\x1a\xac\x01\n\x05State\x12\x12\n\ninput_name\x18\x01 \x01(\t\x12\x13\n\x0boutput_name\x18\x02 \x01(\t\x12&\n\tdata_type\x18\x03 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x04 \x03(\x03\x12\x44\n\rinitial_state\x18\x05 \x03(\x0b\x32-.inference.ModelSequenceBatching.InitialState\x1aX\n\x0eStrategyDirect\x12$\n\x1cmax_queue_delay_microseconds\x18\x01 \x01(\x04\x12 \n\x18minimum_slot_utilization\x18\x02 \x01(\x02\x1au\n\x0eStrategyOldest\x12\x1f\n\x17max_candidate_sequences\x18\x01 \x01(\x05\x12\x1c\n\x14preferred_batch_size\x18\x02 \x03(\x05\x12$\n\x1cmax_queue_delay_microseconds\x18\x03 \x01(\x04\x42\x11\n\x0fstrategy_choice\"\xdd\x02\n\x0fModelEnsembling\x12-\n\x04step\x18\x01 \x03(\x0b\x32\x1f.inference.ModelEnsembling.Step\x1a\x9a\x02\n\x04Step\x12\x12\n\nmodel_name\x18\x01 \x01(\t\x12\x15\n\rmodel_version\x18\x02 \x01(\x03\x12@\n\tinput_map\x18\x03 \x03(\x0b\x32-.inference.ModelEnsembling.Step.InputMapEntry\x12\x42\n\noutput_map\x18\x04 \x03(\x0b\x32..inference.ModelEnsembling.Step.OutputMapEntry\x1a/\n\rInputMapEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x30\n\x0eOutputMapEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"&\n\x0eModelParameter\x12\x14\n\x0cstring_value\x18\x01 \x01(\t\"\xd9\x02\n\x0bModelWarmup\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x12\n\nbatch_size\x18\x02 
\x01(\r\x12\x32\n\x06inputs\x18\x03 \x03(\x0b\x32\".inference.ModelWarmup.InputsEntry\x12\r\n\x05\x63ount\x18\x04 \x01(\r\x1a\x97\x01\n\x05Input\x12&\n\tdata_type\x18\x01 \x01(\x0e\x32\x13.inference.DataType\x12\x0c\n\x04\x64ims\x18\x02 \x03(\x03\x12\x13\n\tzero_data\x18\x03 \x01(\x08H\x00\x12\x15\n\x0brandom_data\x18\x04 \x01(\x08H\x00\x12\x19\n\x0finput_data_file\x18\x05 \x01(\tH\x00\x42\x11\n\x0finput_data_type\x1aK\n\x0bInputsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12+\n\x05value\x18\x02 \x01(\x0b\x32\x1c.inference.ModelWarmup.Input:\x02\x38\x01\".\n\x0fModelOperations\x12\x1b\n\x13op_library_filename\x18\x01 \x03(\t\"+\n\x16ModelTransactionPolicy\x12\x11\n\tdecoupled\x18\x01 \x01(\x08\"\xe6\x01\n\x15ModelRepositoryAgents\x12\x36\n\x06\x61gents\x18\x01 \x03(\x0b\x32&.inference.ModelRepositoryAgents.Agent\x1a\x94\x01\n\x05\x41gent\x12\x0c\n\x04name\x18\x01 \x01(\t\x12J\n\nparameters\x18\x02 \x03(\x0b\x32\x36.inference.ModelRepositoryAgents.Agent.ParametersEntry\x1a\x31\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"$\n\x12ModelResponseCache\x12\x0e\n\x06\x65nable\x18\x01 \x01(\x08\"\xb2\n\n\x0bModelConfig\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08platform\x18\x02 \x01(\t\x12\x0f\n\x07\x62\x61\x63kend\x18\x11 \x01(\t\x12\x35\n\x0eversion_policy\x18\x03 \x01(\x0b\x32\x1d.inference.ModelVersionPolicy\x12\x16\n\x0emax_batch_size\x18\x04 \x01(\x05\x12$\n\x05input\x18\x05 \x03(\x0b\x32\x15.inference.ModelInput\x12&\n\x06output\x18\x06 \x03(\x0b\x32\x16.inference.ModelOutput\x12*\n\x0b\x62\x61tch_input\x18\x14 \x03(\x0b\x32\x15.inference.BatchInput\x12,\n\x0c\x62\x61tch_output\x18\x15 \x03(\x0b\x32\x16.inference.BatchOutput\x12\x38\n\x0coptimization\x18\x0c \x01(\x0b\x32\".inference.ModelOptimizationPolicy\x12;\n\x10\x64ynamic_batching\x18\x0b \x01(\x0b\x32\x1f.inference.ModelDynamicBatchingH\x00\x12=\n\x11sequence_batching\x18\r \x01(\x0b\x32 .inference.ModelSequenceBatchingH\x00\x12\x39\n\x13\x65nsemble_scheduling\x18\x0f \x01(\x0b\x32\x1a.inference.ModelEnsemblingH\x00\x12\x35\n\x0einstance_group\x18\x07 \x03(\x0b\x32\x1d.inference.ModelInstanceGroup\x12\x1e\n\x16\x64\x65\x66\x61ult_model_filename\x18\x08 \x01(\t\x12H\n\x12\x63\x63_model_filenames\x18\t \x03(\x0b\x32,.inference.ModelConfig.CcModelFilenamesEntry\x12;\n\x0bmetric_tags\x18\n \x03(\x0b\x32&.inference.ModelConfig.MetricTagsEntry\x12:\n\nparameters\x18\x0e \x03(\x0b\x32&.inference.ModelConfig.ParametersEntry\x12,\n\x0cmodel_warmup\x18\x10 \x03(\x0b\x32\x16.inference.ModelWarmup\x12\x34\n\x10model_operations\x18\x12 \x01(\x0b\x32\x1a.inference.ModelOperations\x12\x43\n\x18model_transaction_policy\x18\x13 \x01(\x0b\x32!.inference.ModelTransactionPolicy\x12\x41\n\x17model_repository_agents\x18\x17 \x01(\x0b\x32 .inference.ModelRepositoryAgents\x12\x35\n\x0eresponse_cache\x18\x18 \x01(\x0b\x32\x1d.inference.ModelResponseCache\x1a\x37\n\x15\x43\x63ModelFilenamesEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1a\x31\n\x0fMetricTagsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x1aL\n\x0fParametersEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12(\n\x05value\x18\x02 
\x01(\x0b\x32\x19.inference.ModelParameter:\x02\x38\x01\x42\x13\n\x11scheduling_choice*\xfa\x01\n\x08\x44\x61taType\x12\x10\n\x0cTYPE_INVALID\x10\x00\x12\r\n\tTYPE_BOOL\x10\x01\x12\x0e\n\nTYPE_UINT8\x10\x02\x12\x0f\n\x0bTYPE_UINT16\x10\x03\x12\x0f\n\x0bTYPE_UINT32\x10\x04\x12\x0f\n\x0bTYPE_UINT64\x10\x05\x12\r\n\tTYPE_INT8\x10\x06\x12\x0e\n\nTYPE_INT16\x10\x07\x12\x0e\n\nTYPE_INT32\x10\x08\x12\x0e\n\nTYPE_INT64\x10\t\x12\r\n\tTYPE_FP16\x10\n\x12\r\n\tTYPE_FP32\x10\x0b\x12\r\n\tTYPE_FP64\x10\x0c\x12\x0f\n\x0bTYPE_STRING\x10\r\x12\r\n\tTYPE_BF16\x10\x0e\x62\x06proto3' +) + +_DATATYPE = DESCRIPTOR.enum_types_by_name['DataType'] +DataType = enum_type_wrapper.EnumTypeWrapper(_DATATYPE) +TYPE_INVALID = 0 +TYPE_BOOL = 1 +TYPE_UINT8 = 2 +TYPE_UINT16 = 3 +TYPE_UINT32 = 4 +TYPE_UINT64 = 5 +TYPE_INT8 = 6 +TYPE_INT16 = 7 +TYPE_INT32 = 8 +TYPE_INT64 = 9 +TYPE_FP16 = 10 +TYPE_FP32 = 11 +TYPE_FP64 = 12 +TYPE_STRING = 13 +TYPE_BF16 = 14 + +_MODELRATELIMITER = DESCRIPTOR.message_types_by_name['ModelRateLimiter'] +_MODELRATELIMITER_RESOURCE = _MODELRATELIMITER.nested_types_by_name['Resource'] +_MODELINSTANCEGROUP = DESCRIPTOR.message_types_by_name['ModelInstanceGroup'] +_MODELINSTANCEGROUP_SECONDARYDEVICE = _MODELINSTANCEGROUP.nested_types_by_name[ + 'SecondaryDevice'] +_MODELTENSORRESHAPE = DESCRIPTOR.message_types_by_name['ModelTensorReshape'] +_MODELINPUT = DESCRIPTOR.message_types_by_name['ModelInput'] +_MODELOUTPUT = DESCRIPTOR.message_types_by_name['ModelOutput'] +_BATCHINPUT = DESCRIPTOR.message_types_by_name['BatchInput'] +_BATCHOUTPUT = DESCRIPTOR.message_types_by_name['BatchOutput'] +_MODELVERSIONPOLICY = DESCRIPTOR.message_types_by_name['ModelVersionPolicy'] +_MODELVERSIONPOLICY_LATEST = _MODELVERSIONPOLICY.nested_types_by_name['Latest'] +_MODELVERSIONPOLICY_ALL = _MODELVERSIONPOLICY.nested_types_by_name['All'] +_MODELVERSIONPOLICY_SPECIFIC = _MODELVERSIONPOLICY.nested_types_by_name[ + 'Specific'] +_MODELOPTIMIZATIONPOLICY = DESCRIPTOR.message_types_by_name[ + 'ModelOptimizationPolicy'] +_MODELOPTIMIZATIONPOLICY_GRAPH = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[ + 'Graph'] +_MODELOPTIMIZATIONPOLICY_CUDA = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[ + 'Cuda'] +_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC = _MODELOPTIMIZATIONPOLICY_CUDA.nested_types_by_name[ + 'GraphSpec'] +_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC.nested_types_by_name[ + 'Shape'] +_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC.nested_types_by_name[ + 'LowerBound'] +_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND.nested_types_by_name[ + 'InputEntry'] +_MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY = _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC.nested_types_by_name[ + 'InputEntry'] +_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[ + 'ExecutionAccelerators'] +_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR = _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS.nested_types_by_name[ + 'Accelerator'] +_MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY = _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR.nested_types_by_name[ + 'ParametersEntry'] +_MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER = _MODELOPTIMIZATIONPOLICY.nested_types_by_name[ + 'PinnedMemoryBuffer'] +_MODELQUEUEPOLICY = DESCRIPTOR.message_types_by_name['ModelQueuePolicy'] +_MODELDYNAMICBATCHING = DESCRIPTOR.message_types_by_name[ + 
'ModelDynamicBatching'] +_MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY = _MODELDYNAMICBATCHING.nested_types_by_name[ + 'PriorityQueuePolicyEntry'] +_MODELSEQUENCEBATCHING = DESCRIPTOR.message_types_by_name[ + 'ModelSequenceBatching'] +_MODELSEQUENCEBATCHING_CONTROL = _MODELSEQUENCEBATCHING.nested_types_by_name[ + 'Control'] +_MODELSEQUENCEBATCHING_CONTROLINPUT = _MODELSEQUENCEBATCHING.nested_types_by_name[ + 'ControlInput'] +_MODELSEQUENCEBATCHING_INITIALSTATE = _MODELSEQUENCEBATCHING.nested_types_by_name[ + 'InitialState'] +_MODELSEQUENCEBATCHING_STATE = _MODELSEQUENCEBATCHING.nested_types_by_name[ + 'State'] +_MODELSEQUENCEBATCHING_STRATEGYDIRECT = _MODELSEQUENCEBATCHING.nested_types_by_name[ + 'StrategyDirect'] +_MODELSEQUENCEBATCHING_STRATEGYOLDEST = _MODELSEQUENCEBATCHING.nested_types_by_name[ + 'StrategyOldest'] +_MODELENSEMBLING = DESCRIPTOR.message_types_by_name['ModelEnsembling'] +_MODELENSEMBLING_STEP = _MODELENSEMBLING.nested_types_by_name['Step'] +_MODELENSEMBLING_STEP_INPUTMAPENTRY = _MODELENSEMBLING_STEP.nested_types_by_name[ + 'InputMapEntry'] +_MODELENSEMBLING_STEP_OUTPUTMAPENTRY = _MODELENSEMBLING_STEP.nested_types_by_name[ + 'OutputMapEntry'] +_MODELPARAMETER = DESCRIPTOR.message_types_by_name['ModelParameter'] +_MODELWARMUP = DESCRIPTOR.message_types_by_name['ModelWarmup'] +_MODELWARMUP_INPUT = _MODELWARMUP.nested_types_by_name['Input'] +_MODELWARMUP_INPUTSENTRY = _MODELWARMUP.nested_types_by_name['InputsEntry'] +_MODELOPERATIONS = DESCRIPTOR.message_types_by_name['ModelOperations'] +_MODELTRANSACTIONPOLICY = DESCRIPTOR.message_types_by_name[ + 'ModelTransactionPolicy'] +_MODELREPOSITORYAGENTS = DESCRIPTOR.message_types_by_name[ + 'ModelRepositoryAgents'] +_MODELREPOSITORYAGENTS_AGENT = _MODELREPOSITORYAGENTS.nested_types_by_name[ + 'Agent'] +_MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY = _MODELREPOSITORYAGENTS_AGENT.nested_types_by_name[ + 'ParametersEntry'] +_MODELRESPONSECACHE = DESCRIPTOR.message_types_by_name['ModelResponseCache'] +_MODELCONFIG = DESCRIPTOR.message_types_by_name['ModelConfig'] +_MODELCONFIG_CCMODELFILENAMESENTRY = _MODELCONFIG.nested_types_by_name[ + 'CcModelFilenamesEntry'] +_MODELCONFIG_METRICTAGSENTRY = _MODELCONFIG.nested_types_by_name[ + 'MetricTagsEntry'] +_MODELCONFIG_PARAMETERSENTRY = _MODELCONFIG.nested_types_by_name[ + 'ParametersEntry'] +_MODELINSTANCEGROUP_SECONDARYDEVICE_SECONDARYDEVICEKIND = _MODELINSTANCEGROUP_SECONDARYDEVICE.enum_types_by_name[ + 'SecondaryDeviceKind'] +_MODELINSTANCEGROUP_KIND = _MODELINSTANCEGROUP.enum_types_by_name['Kind'] +_MODELINPUT_FORMAT = _MODELINPUT.enum_types_by_name['Format'] +_BATCHINPUT_KIND = _BATCHINPUT.enum_types_by_name['Kind'] +_BATCHOUTPUT_KIND = _BATCHOUTPUT.enum_types_by_name['Kind'] +_MODELOPTIMIZATIONPOLICY_MODELPRIORITY = _MODELOPTIMIZATIONPOLICY.enum_types_by_name[ + 'ModelPriority'] +_MODELQUEUEPOLICY_TIMEOUTACTION = _MODELQUEUEPOLICY.enum_types_by_name[ + 'TimeoutAction'] +_MODELSEQUENCEBATCHING_CONTROL_KIND = _MODELSEQUENCEBATCHING_CONTROL.enum_types_by_name[ + 'Kind'] +ModelRateLimiter = _reflection.GeneratedProtocolMessageType( + 'ModelRateLimiter', + (_message.Message, ), + { + 'Resource': + _reflection.GeneratedProtocolMessageType( + 'Resource', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELRATELIMITER_RESOURCE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelRateLimiter.Resource) + }), + 'DESCRIPTOR': + _MODELRATELIMITER, + '__module__': + 'model_config.protxt_pb2' + # 
@@protoc_insertion_point(class_scope:inference.ModelRateLimiter) + }) +_sym_db.RegisterMessage(ModelRateLimiter) +_sym_db.RegisterMessage(ModelRateLimiter.Resource) + +ModelInstanceGroup = _reflection.GeneratedProtocolMessageType( + 'ModelInstanceGroup', + (_message.Message, ), + { + 'SecondaryDevice': + _reflection.GeneratedProtocolMessageType( + 'SecondaryDevice', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELINSTANCEGROUP_SECONDARYDEVICE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelInstanceGroup.SecondaryDevice) + }), + 'DESCRIPTOR': + _MODELINSTANCEGROUP, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelInstanceGroup) + }) +_sym_db.RegisterMessage(ModelInstanceGroup) +_sym_db.RegisterMessage(ModelInstanceGroup.SecondaryDevice) + +ModelTensorReshape = _reflection.GeneratedProtocolMessageType( + 'ModelTensorReshape', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELTENSORRESHAPE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelTensorReshape) + }) +_sym_db.RegisterMessage(ModelTensorReshape) + +ModelInput = _reflection.GeneratedProtocolMessageType( + 'ModelInput', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELINPUT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelInput) + }) +_sym_db.RegisterMessage(ModelInput) + +ModelOutput = _reflection.GeneratedProtocolMessageType( + 'ModelOutput', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELOUTPUT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOutput) + }) +_sym_db.RegisterMessage(ModelOutput) + +BatchInput = _reflection.GeneratedProtocolMessageType( + 'BatchInput', + (_message.Message, ), + { + 'DESCRIPTOR': _BATCHINPUT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.BatchInput) + }) +_sym_db.RegisterMessage(BatchInput) + +BatchOutput = _reflection.GeneratedProtocolMessageType( + 'BatchOutput', + (_message.Message, ), + { + 'DESCRIPTOR': _BATCHOUTPUT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.BatchOutput) + }) +_sym_db.RegisterMessage(BatchOutput) + +ModelVersionPolicy = _reflection.GeneratedProtocolMessageType( + 'ModelVersionPolicy', + (_message.Message, ), + { + 'Latest': + _reflection.GeneratedProtocolMessageType( + 'Latest', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELVERSIONPOLICY_LATEST, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy.Latest) + }), + 'All': + _reflection.GeneratedProtocolMessageType( + 'All', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELVERSIONPOLICY_ALL, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy.All) + }), + 'Specific': + _reflection.GeneratedProtocolMessageType( + 'Specific', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELVERSIONPOLICY_SPECIFIC, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy.Specific) + }), + 'DESCRIPTOR': + _MODELVERSIONPOLICY, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelVersionPolicy) + }) +_sym_db.RegisterMessage(ModelVersionPolicy) +_sym_db.RegisterMessage(ModelVersionPolicy.Latest) +_sym_db.RegisterMessage(ModelVersionPolicy.All) 
+_sym_db.RegisterMessage(ModelVersionPolicy.Specific) + +ModelOptimizationPolicy = _reflection.GeneratedProtocolMessageType( + 'ModelOptimizationPolicy', + (_message.Message, ), + { + 'Graph': + _reflection.GeneratedProtocolMessageType( + 'Graph', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELOPTIMIZATIONPOLICY_GRAPH, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Graph) + }), + 'Cuda': + _reflection.GeneratedProtocolMessageType( + 'Cuda', + (_message.Message, ), + { + 'GraphSpec': + _reflection.GeneratedProtocolMessageType( + 'GraphSpec', + (_message.Message, ), + { + 'Shape': + _reflection.GeneratedProtocolMessageType( + 'Shape', + (_message.Message, ), + { + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.Shape) + }), + 'LowerBound': + _reflection.GeneratedProtocolMessageType( + 'LowerBound', + (_message.Message, ), + { + 'InputEntry': + _reflection.GeneratedProtocolMessageType( + 'InputEntry', + (_message.Message, ), + { + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound.InputEntry) + }), + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound) + }), + 'InputEntry': + _reflection.GeneratedProtocolMessageType( + 'InputEntry', + (_message.Message, ), + { + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec.InputEntry) + }), + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda.GraphSpec) + }), + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_CUDA, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.Cuda) + }), + 'ExecutionAccelerators': + _reflection.GeneratedProtocolMessageType( + 'ExecutionAccelerators', + (_message.Message, ), + { + 'Accelerator': + _reflection.GeneratedProtocolMessageType( + 'Accelerator', + (_message.Message, ), + { + 'ParametersEntry': + _reflection.GeneratedProtocolMessageType( + 'ParametersEntry', + (_message.Message, ), + { + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator.ParametersEntry) + }), + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.ExecutionAccelerators.Accelerator) + }), + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.ExecutionAccelerators) + }), + 'PinnedMemoryBuffer': + _reflection.GeneratedProtocolMessageType( + 'PinnedMemoryBuffer', + (_message.Message, ), + { + 'DESCRIPTOR': 
_MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy.PinnedMemoryBuffer) + }), + 'DESCRIPTOR': + _MODELOPTIMIZATIONPOLICY, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOptimizationPolicy) + }) +_sym_db.RegisterMessage(ModelOptimizationPolicy) +_sym_db.RegisterMessage(ModelOptimizationPolicy.Graph) +_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda) +_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec) +_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec.Shape) +_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound) +_sym_db.RegisterMessage( + ModelOptimizationPolicy.Cuda.GraphSpec.LowerBound.InputEntry) +_sym_db.RegisterMessage(ModelOptimizationPolicy.Cuda.GraphSpec.InputEntry) +_sym_db.RegisterMessage(ModelOptimizationPolicy.ExecutionAccelerators) +_sym_db.RegisterMessage( + ModelOptimizationPolicy.ExecutionAccelerators.Accelerator) +_sym_db.RegisterMessage( + ModelOptimizationPolicy.ExecutionAccelerators.Accelerator.ParametersEntry) +_sym_db.RegisterMessage(ModelOptimizationPolicy.PinnedMemoryBuffer) + +ModelQueuePolicy = _reflection.GeneratedProtocolMessageType( + 'ModelQueuePolicy', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELQUEUEPOLICY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelQueuePolicy) + }) +_sym_db.RegisterMessage(ModelQueuePolicy) + +ModelDynamicBatching = _reflection.GeneratedProtocolMessageType( + 'ModelDynamicBatching', + (_message.Message, ), + { + 'PriorityQueuePolicyEntry': + _reflection.GeneratedProtocolMessageType( + 'PriorityQueuePolicyEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelDynamicBatching.PriorityQueuePolicyEntry) + }), + 'DESCRIPTOR': + _MODELDYNAMICBATCHING, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelDynamicBatching) + }) +_sym_db.RegisterMessage(ModelDynamicBatching) +_sym_db.RegisterMessage(ModelDynamicBatching.PriorityQueuePolicyEntry) + +ModelSequenceBatching = _reflection.GeneratedProtocolMessageType( + 'ModelSequenceBatching', + (_message.Message, ), + { + 'Control': + _reflection.GeneratedProtocolMessageType( + 'Control', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELSEQUENCEBATCHING_CONTROL, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.Control) + }), + 'ControlInput': + _reflection.GeneratedProtocolMessageType( + 'ControlInput', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELSEQUENCEBATCHING_CONTROLINPUT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.ControlInput) + }), + 'InitialState': + _reflection.GeneratedProtocolMessageType( + 'InitialState', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELSEQUENCEBATCHING_INITIALSTATE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.InitialState) + }), + 'State': + _reflection.GeneratedProtocolMessageType( + 'State', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELSEQUENCEBATCHING_STATE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.State) + }), + 
'StrategyDirect': + _reflection.GeneratedProtocolMessageType( + 'StrategyDirect', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELSEQUENCEBATCHING_STRATEGYDIRECT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.StrategyDirect) + }), + 'StrategyOldest': + _reflection.GeneratedProtocolMessageType( + 'StrategyOldest', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELSEQUENCEBATCHING_STRATEGYOLDEST, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching.StrategyOldest) + }), + 'DESCRIPTOR': + _MODELSEQUENCEBATCHING, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelSequenceBatching) + }) +_sym_db.RegisterMessage(ModelSequenceBatching) +_sym_db.RegisterMessage(ModelSequenceBatching.Control) +_sym_db.RegisterMessage(ModelSequenceBatching.ControlInput) +_sym_db.RegisterMessage(ModelSequenceBatching.InitialState) +_sym_db.RegisterMessage(ModelSequenceBatching.State) +_sym_db.RegisterMessage(ModelSequenceBatching.StrategyDirect) +_sym_db.RegisterMessage(ModelSequenceBatching.StrategyOldest) + +ModelEnsembling = _reflection.GeneratedProtocolMessageType( + 'ModelEnsembling', + (_message.Message, ), + { + 'Step': + _reflection.GeneratedProtocolMessageType( + 'Step', + (_message.Message, ), + { + 'InputMapEntry': + _reflection.GeneratedProtocolMessageType( + 'InputMapEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELENSEMBLING_STEP_INPUTMAPENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelEnsembling.Step.InputMapEntry) + }), + 'OutputMapEntry': + _reflection.GeneratedProtocolMessageType( + 'OutputMapEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELENSEMBLING_STEP_OUTPUTMAPENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelEnsembling.Step.OutputMapEntry) + }), + 'DESCRIPTOR': + _MODELENSEMBLING_STEP, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelEnsembling.Step) + }), + 'DESCRIPTOR': + _MODELENSEMBLING, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelEnsembling) + }) +_sym_db.RegisterMessage(ModelEnsembling) +_sym_db.RegisterMessage(ModelEnsembling.Step) +_sym_db.RegisterMessage(ModelEnsembling.Step.InputMapEntry) +_sym_db.RegisterMessage(ModelEnsembling.Step.OutputMapEntry) + +ModelParameter = _reflection.GeneratedProtocolMessageType( + 'ModelParameter', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELPARAMETER, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelParameter) + }) +_sym_db.RegisterMessage(ModelParameter) + +ModelWarmup = _reflection.GeneratedProtocolMessageType( + 'ModelWarmup', + (_message.Message, ), + { + 'Input': + _reflection.GeneratedProtocolMessageType( + 'Input', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELWARMUP_INPUT, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelWarmup.Input) + }), + 'InputsEntry': + _reflection.GeneratedProtocolMessageType( + 'InputsEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELWARMUP_INPUTSENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelWarmup.InputsEntry) + }), + 'DESCRIPTOR': + _MODELWARMUP, + '__module__': + 'model_config.protxt_pb2' + # 
@@protoc_insertion_point(class_scope:inference.ModelWarmup) + }) +_sym_db.RegisterMessage(ModelWarmup) +_sym_db.RegisterMessage(ModelWarmup.Input) +_sym_db.RegisterMessage(ModelWarmup.InputsEntry) + +ModelOperations = _reflection.GeneratedProtocolMessageType( + 'ModelOperations', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELOPERATIONS, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelOperations) + }) +_sym_db.RegisterMessage(ModelOperations) + +ModelTransactionPolicy = _reflection.GeneratedProtocolMessageType( + 'ModelTransactionPolicy', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELTRANSACTIONPOLICY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelTransactionPolicy) + }) +_sym_db.RegisterMessage(ModelTransactionPolicy) + +ModelRepositoryAgents = _reflection.GeneratedProtocolMessageType( + 'ModelRepositoryAgents', + (_message.Message, ), + { + 'Agent': + _reflection.GeneratedProtocolMessageType( + 'Agent', + (_message.Message, ), + { + 'ParametersEntry': + _reflection.GeneratedProtocolMessageType( + 'ParametersEntry', + (_message.Message, ), + { + 'DESCRIPTOR': + _MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelRepositoryAgents.Agent.ParametersEntry) + }), + 'DESCRIPTOR': + _MODELREPOSITORYAGENTS_AGENT, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelRepositoryAgents.Agent) + }), + 'DESCRIPTOR': + _MODELREPOSITORYAGENTS, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelRepositoryAgents) + }) +_sym_db.RegisterMessage(ModelRepositoryAgents) +_sym_db.RegisterMessage(ModelRepositoryAgents.Agent) +_sym_db.RegisterMessage(ModelRepositoryAgents.Agent.ParametersEntry) + +ModelResponseCache = _reflection.GeneratedProtocolMessageType( + 'ModelResponseCache', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELRESPONSECACHE, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelResponseCache) + }) +_sym_db.RegisterMessage(ModelResponseCache) + +ModelConfig = _reflection.GeneratedProtocolMessageType( + 'ModelConfig', + (_message.Message, ), + { + 'CcModelFilenamesEntry': + _reflection.GeneratedProtocolMessageType( + 'CcModelFilenamesEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELCONFIG_CCMODELFILENAMESENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelConfig.CcModelFilenamesEntry) + }), + 'MetricTagsEntry': + _reflection.GeneratedProtocolMessageType( + 'MetricTagsEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELCONFIG_METRICTAGSENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelConfig.MetricTagsEntry) + }), + 'ParametersEntry': + _reflection.GeneratedProtocolMessageType( + 'ParametersEntry', + (_message.Message, ), + { + 'DESCRIPTOR': _MODELCONFIG_PARAMETERSENTRY, + '__module__': 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelConfig.ParametersEntry) + }), + 'DESCRIPTOR': + _MODELCONFIG, + '__module__': + 'model_config.protxt_pb2' + # @@protoc_insertion_point(class_scope:inference.ModelConfig) + }) +_sym_db.RegisterMessage(ModelConfig) +_sym_db.RegisterMessage(ModelConfig.CcModelFilenamesEntry) +_sym_db.RegisterMessage(ModelConfig.MetricTagsEntry) +_sym_db.RegisterMessage(ModelConfig.ParametersEntry) 
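
Editor's note: at this point every message class from model_config.protxt, including ModelConfig itself, has been generated and registered. As a minimal sketch (not part of the patch), the snippet below shows one way this module could be used to round-trip a Triton-style config.pbtxt through google.protobuf.text_format. It assumes the protobuf runtime is installed and that the package path shown in the diff header (visualdl.component.inference.proto) is importable; the model name, tensor names, and shapes are hypothetical.

# Sketch only -- not part of the generated file or this patch.
from google.protobuf import text_format

from visualdl.component.inference.proto import model_config_pb2

CONFIG_PBTXT = """
name: "example_model"        # hypothetical model
backend: "onnxruntime"
max_batch_size: 4
input { name: "INPUT0" data_type: TYPE_FP32 dims: [3, 224, 224] }
output { name: "OUTPUT0" data_type: TYPE_FP32 dims: [1000] }
"""

# config.pbtxt text -> ModelConfig message
config = model_config_pb2.ModelConfig()
text_format.Parse(CONFIG_PBTXT, config)

# Edit the message programmatically, e.g. add a CPU instance group.
group = config.instance_group.add()
group.kind = model_config_pb2.ModelInstanceGroup.KIND_CPU
group.count = 1

# ModelConfig message -> config.pbtxt text
print(text_format.MessageToString(config))
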
+ +if _descriptor._USE_C_DESCRIPTORS == False: + + DESCRIPTOR._options = None + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._options = None + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._serialized_options = b'8\001' + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._options = None + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._serialized_options = b'8\001' + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._options = None + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._serialized_options = b'8\001' + _MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._options = None + _MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._serialized_options = b'8\001' + _MODELENSEMBLING_STEP_INPUTMAPENTRY._options = None + _MODELENSEMBLING_STEP_INPUTMAPENTRY._serialized_options = b'8\001' + _MODELENSEMBLING_STEP_OUTPUTMAPENTRY._options = None + _MODELENSEMBLING_STEP_OUTPUTMAPENTRY._serialized_options = b'8\001' + _MODELWARMUP_INPUTSENTRY._options = None + _MODELWARMUP_INPUTSENTRY._serialized_options = b'8\001' + _MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._options = None + _MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._serialized_options = b'8\001' + _MODELCONFIG_CCMODELFILENAMESENTRY._options = None + _MODELCONFIG_CCMODELFILENAMESENTRY._serialized_options = b'8\001' + _MODELCONFIG_METRICTAGSENTRY._options = None + _MODELCONFIG_METRICTAGSENTRY._serialized_options = b'8\001' + _MODELCONFIG_PARAMETERSENTRY._options = None + _MODELCONFIG_PARAMETERSENTRY._serialized_options = b'8\001' + _DATATYPE._serialized_start = 8137 + _DATATYPE._serialized_end = 8387 + _MODELRATELIMITER._serialized_start = 35 + _MODELRATELIMITER._serialized_end = 185 + _MODELRATELIMITER_RESOURCE._serialized_start = 130 + _MODELRATELIMITER_RESOURCE._serialized_end = 185 + _MODELINSTANCEGROUP._serialized_start = 188 + _MODELINSTANCEGROUP._serialized_end = 707 + _MODELINSTANCEGROUP_SECONDARYDEVICE._serialized_start = 484 + _MODELINSTANCEGROUP_SECONDARYDEVICE._serialized_end = 640 + _MODELINSTANCEGROUP_SECONDARYDEVICE_SECONDARYDEVICEKIND._serialized_start = 603 + _MODELINSTANCEGROUP_SECONDARYDEVICE_SECONDARYDEVICEKIND._serialized_end = 640 + _MODELINSTANCEGROUP_KIND._serialized_start = 642 + _MODELINSTANCEGROUP_KIND._serialized_end = 707 + _MODELTENSORRESHAPE._serialized_start = 709 + _MODELTENSORRESHAPE._serialized_end = 744 + _MODELINPUT._serialized_start = 747 + _MODELINPUT._serialized_end = 1053 + _MODELINPUT_FORMAT._serialized_start = 994 + _MODELINPUT_FORMAT._serialized_end = 1053 + _MODELOUTPUT._serialized_start = 1056 + _MODELOUTPUT._serialized_end = 1234 + _BATCHINPUT._serialized_start = 1237 + _BATCHINPUT._serialized_end = 1582 + _BATCHINPUT_KIND._serialized_start = 1377 + _BATCHINPUT_KIND._serialized_end = 1582 + _BATCHOUTPUT._serialized_start = 1585 + _BATCHOUTPUT._serialized_end = 1728 + _BATCHOUTPUT_KIND._serialized_start = 1686 + _BATCHOUTPUT_KIND._serialized_end = 1728 + _MODELVERSIONPOLICY._serialized_start = 1731 + _MODELVERSIONPOLICY._serialized_end = 2003 + _MODELVERSIONPOLICY_LATEST._serialized_start = 1919 + _MODELVERSIONPOLICY_LATEST._serialized_end = 1949 + _MODELVERSIONPOLICY_ALL._serialized_start = 1951 + _MODELVERSIONPOLICY_ALL._serialized_end = 1956 + _MODELVERSIONPOLICY_SPECIFIC._serialized_start = 1958 + _MODELVERSIONPOLICY_SPECIFIC._serialized_end = 1986 + _MODELOPTIMIZATIONPOLICY._serialized_start = 2006 + _MODELOPTIMIZATIONPOLICY._serialized_end = 3795 + _MODELOPTIMIZATIONPOLICY_GRAPH._serialized_start = 2536 + 
_MODELOPTIMIZATIONPOLICY_GRAPH._serialized_end = 2558 + _MODELOPTIMIZATIONPOLICY_CUDA._serialized_start = 2561 + _MODELOPTIMIZATIONPOLICY_CUDA._serialized_end = 3259 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC._serialized_start = 2711 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC._serialized_end = 3259 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE._serialized_start = 2910 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_SHAPE._serialized_end = 2930 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND._serialized_start = 2933 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND._serialized_end = 3156 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._serialized_start = 3055 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_LOWERBOUND_INPUTENTRY._serialized_end = 3156 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._serialized_start = 3055 + _MODELOPTIMIZATIONPOLICY_CUDA_GRAPHSPEC_INPUTENTRY._serialized_end = 3156 + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS._serialized_start = 3262 + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS._serialized_end = 3682 + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR._serialized_start = 3498 + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR._serialized_end = 3682 + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._serialized_start = 3633 + _MODELOPTIMIZATIONPOLICY_EXECUTIONACCELERATORS_ACCELERATOR_PARAMETERSENTRY._serialized_end = 3682 + _MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER._serialized_start = 3684 + _MODELOPTIMIZATIONPOLICY_PINNEDMEMORYBUFFER._serialized_end = 3720 + _MODELOPTIMIZATIONPOLICY_MODELPRIORITY._serialized_start = 3722 + _MODELOPTIMIZATIONPOLICY_MODELPRIORITY._serialized_end = 3795 + _MODELQUEUEPOLICY._serialized_start = 3798 + _MODELQUEUEPOLICY._serialized_end = 4017 + _MODELQUEUEPOLICY_TIMEOUTACTION._serialized_start = 3979 + _MODELQUEUEPOLICY_TIMEOUTACTION._serialized_end = 4017 + _MODELDYNAMICBATCHING._serialized_start = 4020 + _MODELDYNAMICBATCHING._serialized_end = 4431 + _MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._serialized_start = 4344 + _MODELDYNAMICBATCHING_PRIORITYQUEUEPOLICYENTRY._serialized_end = 4431 + _MODELSEQUENCEBATCHING._serialized_start = 4434 + _MODELSEQUENCEBATCHING._serialized_end = 5697 + _MODELSEQUENCEBATCHING_CONTROL._serialized_start = 4759 + _MODELSEQUENCEBATCHING_CONTROL._serialized_end = 5064 + _MODELSEQUENCEBATCHING_CONTROL_KIND._serialized_start = 4947 + _MODELSEQUENCEBATCHING_CONTROL_KIND._serialized_end = 5064 + _MODELSEQUENCEBATCHING_CONTROLINPUT._serialized_start = 5066 + _MODELSEQUENCEBATCHING_CONTROLINPUT._serialized_end = 5153 + _MODELSEQUENCEBATCHING_INITIALSTATE._serialized_start = 5156 + _MODELSEQUENCEBATCHING_INITIALSTATE._serialized_end = 5294 + _MODELSEQUENCEBATCHING_STATE._serialized_start = 5297 + _MODELSEQUENCEBATCHING_STATE._serialized_end = 5469 + _MODELSEQUENCEBATCHING_STRATEGYDIRECT._serialized_start = 5471 + _MODELSEQUENCEBATCHING_STRATEGYDIRECT._serialized_end = 5559 + _MODELSEQUENCEBATCHING_STRATEGYOLDEST._serialized_start = 5561 + _MODELSEQUENCEBATCHING_STRATEGYOLDEST._serialized_end = 5678 + _MODELENSEMBLING._serialized_start = 5700 + _MODELENSEMBLING._serialized_end = 6049 + _MODELENSEMBLING_STEP._serialized_start = 5767 + _MODELENSEMBLING_STEP._serialized_end = 6049 + _MODELENSEMBLING_STEP_INPUTMAPENTRY._serialized_start = 5952 + _MODELENSEMBLING_STEP_INPUTMAPENTRY._serialized_end = 5999 + _MODELENSEMBLING_STEP_OUTPUTMAPENTRY._serialized_start = 6001 + _MODELENSEMBLING_STEP_OUTPUTMAPENTRY._serialized_end = 6049 + 
_MODELPARAMETER._serialized_start = 6051 + _MODELPARAMETER._serialized_end = 6089 + _MODELWARMUP._serialized_start = 6092 + _MODELWARMUP._serialized_end = 6437 + _MODELWARMUP_INPUT._serialized_start = 6209 + _MODELWARMUP_INPUT._serialized_end = 6360 + _MODELWARMUP_INPUTSENTRY._serialized_start = 6362 + _MODELWARMUP_INPUTSENTRY._serialized_end = 6437 + _MODELOPERATIONS._serialized_start = 6439 + _MODELOPERATIONS._serialized_end = 6485 + _MODELTRANSACTIONPOLICY._serialized_start = 6487 + _MODELTRANSACTIONPOLICY._serialized_end = 6530 + _MODELREPOSITORYAGENTS._serialized_start = 6533 + _MODELREPOSITORYAGENTS._serialized_end = 6763 + _MODELREPOSITORYAGENTS_AGENT._serialized_start = 6615 + _MODELREPOSITORYAGENTS_AGENT._serialized_end = 6763 + _MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._serialized_start = 3633 + _MODELREPOSITORYAGENTS_AGENT_PARAMETERSENTRY._serialized_end = 3682 + _MODELRESPONSECACHE._serialized_start = 6765 + _MODELRESPONSECACHE._serialized_end = 6801 + _MODELCONFIG._serialized_start = 6804 + _MODELCONFIG._serialized_end = 8134 + _MODELCONFIG_CCMODELFILENAMESENTRY._serialized_start = 7929 + _MODELCONFIG_CCMODELFILENAMESENTRY._serialized_end = 7984 + _MODELCONFIG_METRICTAGSENTRY._serialized_start = 7986 + _MODELCONFIG_METRICTAGSENTRY._serialized_end = 8035 + _MODELCONFIG_PARAMETERSENTRY._serialized_start = 8037 + _MODELCONFIG_PARAMETERSENTRY._serialized_end = 8113 +# @@protoc_insertion_point(module_scope) diff --git a/visualdl/server/api.py b/visualdl/server/api.py index 502bf48f0..0ef7b6dc1 100644 --- a/visualdl/server/api.py +++ b/visualdl/server/api.py @@ -417,7 +417,10 @@ def get_component_tabs(*apis, vdl_args, request_args): all_tabs.update(api('component_tabs', request_args)) all_tabs.add('static_graph') else: - return ['static_graph', 'x2paddle', 'fastdeploy_server'] + return [ + 'static_graph', 'x2paddle', 'fastdeploy_server', + 'fastdeploy_client' + ] return list(all_tabs) diff --git a/visualdl/server/app.py b/visualdl/server/app.py index 5f9454fa9..e451c4e21 100644 --- a/visualdl/server/app.py +++ b/visualdl/server/app.py @@ -13,12 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. 
 # =======================================================================
+import json
 import multiprocessing
 import os
 import re
 import sys
 import threading
 import time
+import urllib
 import webbrowser

 import requests
@@ -32,6 +34,8 @@
 import visualdl.server
 from visualdl import __version__
+from visualdl.component.inference.fastdeploy_lib import get_start_arguments
+from visualdl.component.inference.fastdeploy_server import create_fastdeploy_api_call
 from visualdl.component.inference.model_convert_server import create_model_convert_api_call
 from visualdl.component.profiler.profiler_server import create_profiler_api_call
 from visualdl.server.api import create_api_call
@@ -71,6 +75,7 @@ def create_app(args):  # noqa: C901
     api_call = create_api_call(args.logdir, args.model, args.cache_timeout)
     profiler_api_call = create_profiler_api_call(args.logdir)
     inference_api_call = create_model_convert_api_call()
+    fastdeploy_api_call = create_fastdeploy_api_call()
     if args.telemetry:
         update_util.PbUpdater(args.product).start()
@@ -153,6 +158,141 @@ def serve_inference_api(method):
         return make_response(
             Response(data, mimetype=mimetype, headers=headers))

+    @app.route(api_path + '/fastdeploy/<method>', methods=["GET", "POST"])
+    def serve_fastdeploy_api(method):
+        if request.method == 'POST':
+            data, mimetype, headers = fastdeploy_api_call(method, request.form)
+        else:
+            data, mimetype, headers = fastdeploy_api_call(method, request.args)
+        return make_response(
+            Response(data, mimetype=mimetype, headers=headers))
+
+    @app.route(
+        api_path + '/fastdeploy/fastdeploy_client', methods=["GET", "POST"])
+    def serve_fastdeploy_create_fastdeploy_client():
+        try:
+            if request.method == 'POST':
+                fastdeploy_api_call('create_fastdeploy_client', request.form)
+                request_args = request.form
+            else:
+                fastdeploy_api_call('create_fastdeploy_client', request.args)
+                request_args = request.args
+        except Exception as e:
+            error_msg = '{}'.format(e)
+            return make_response(error_msg)
+        args = urllib.parse.urlencode(request_args)
+        if args:
+            return redirect(
+                api_path + "/fastdeploy/fastdeploy_client/app?{}".format(args),
+                code=302)
+        return redirect(
+            api_path + "/fastdeploy/fastdeploy_client/app", code=302)
+
+    @app.route(
+        api_path + "/fastdeploy/fastdeploy_client/<path:path>",
+        methods=["GET", "POST"])
+    def request_fastdeploy_create_fastdeploy_client_app(path: str):
+        '''
+        Gradio app server URL interface. URLs for the gradio app are routed to the gradio server.
+
+        Args:
+            path(str): The resource path requested from the gradio server.
+
+        Returns:
+            Anything returned by the gradio server.
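+
+        Note:
+            Requests are transparently proxied to the per-client gradio server
+            started via create_fastdeploy_client; for the 'app' path the page
+            content is additionally rewritten so that the server address, port,
+            model name and model version fields are pre-filled from the saved
+            start arguments of the requested server_id.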
+ ''' + if request.method == 'POST': + port = fastdeploy_api_call('create_fastdeploy_client', + request.form) + request_args = request.form + else: + port = fastdeploy_api_call('create_fastdeploy_client', + request.args) + request_args = request.args + if path == 'app': + proxy_url = request.url.replace( + request.host_url.rstrip('/') + api_path + + '/fastdeploy/fastdeploy_client/app', + 'http://localhost:{}/'.format(port)) + else: + proxy_url = request.url.replace( + request.host_url.rstrip('/') + api_path + + '/fastdeploy/fastdeploy_client/', + 'http://localhost:{}/'.format(port)) + resp = requests.request( + method=request.method, + url=proxy_url, + headers={ + key: value + for (key, value) in request.headers if key != 'Host' + }, + data=request.get_data(), + cookies=request.cookies, + allow_redirects=False) + if path == 'app': + content = resp.content + if request_args and 'server_id' in request_args: + server_id = request_args.get('server_id') + start_args = get_start_arguments(server_id) + http_port = start_args.get('http-port', '') + metrics_port = start_args.get('metrics-port', '') + model_name = start_args.get('default_model_name', '') + content = content.decode() + try: + default_server_addr = re.search( + '"label": {}.*?"value": "".*?}}'.format( + json.dumps("服务ip", ensure_ascii=True).replace( + '\\', '\\\\')), content).group(0) + cur_server_addr = default_server_addr.replace( + '"value": ""', '"value": "localhost"') + default_http_port = re.search( + '"label": {}.*?"value": "".*?}}'.format( + json.dumps("推理服务端口", ensure_ascii=True).replace( + '\\', '\\\\')), content).group(0) + cur_http_port = default_http_port.replace( + '"value": ""', '"value": "{}"'.format(http_port)) + default_metrics_port = re.search( + '"label": {}.*?"value": "".*?}}'.format( + json.dumps("性能服务端口", ensure_ascii=True).replace( + '\\', '\\\\')), content).group(0) + cur_metrics_port = default_metrics_port.replace( + '"value": ""', '"value": "{}"'.format(metrics_port)) + default_model_name = re.search( + '"label": {}.*?"value": "".*?}}'.format( + json.dumps("模型名称", ensure_ascii=True).replace( + '\\', '\\\\')), content).group(0) + cur_model_name = default_model_name.replace( + '"value": ""', '"value": "{}"'.format(model_name)) + default_model_version = re.search( + '"label": {}.*?"value": "".*?}}'.format( + json.dumps("模型版本", ensure_ascii=True).replace( + '\\', '\\\\')), content).group(0) + cur_model_version = default_model_version.replace( + '"value": ""', '"value": "{}"'.format('1')) + content = content.replace(default_server_addr, + cur_server_addr) + if http_port: + content = content.replace(default_http_port, + cur_http_port) + if metrics_port: + content = content.replace(default_metrics_port, + cur_metrics_port) + if model_name: + content = content.replace(default_model_name, + cur_model_name) + + content = content.replace(default_model_version, + cur_model_version) + except Exception: + pass + finally: + content = content.encode() + else: + content = resp.content + headers = [(name, value) for (name, value) in resp.raw.headers.items()] + response = Response(content, resp.status_code, headers) + return response + @app.route(api_path + '/component_tabs') def component_tabs(): data, mimetype, headers = get_component_tabs( diff --git a/visualdl/server/args.py b/visualdl/server/args.py index cb42422c7..71f97afb1 100644 --- a/visualdl/server/args.py +++ b/visualdl/server/args.py @@ -78,7 +78,8 @@ def validate_args(args): supported_tabs = [ 'scalar', 'image', 'text', 'embeddings', 'audio', 'histogram', 
'hyper_parameters', 'static_graph', 'dynamic_graph', 'pr_curve', - 'roc_curve', 'profiler', 'x2paddle', 'fastdeploy_server' + 'roc_curve', 'profiler', 'x2paddle', 'fastdeploy_server', + 'fastdeploy_client' ] if args.component_tabs is not None: for component_tab in args.component_tabs: diff --git a/visualdl/utils/dir.py b/visualdl/utils/dir.py index 64199f4cd..b22ed4246 100644 --- a/visualdl/utils/dir.py +++ b/visualdl/utils/dir.py @@ -23,6 +23,7 @@ VDL_HOME = os.path.join(USER_HOME, '.visualdl') CONF_HOME = os.path.join(VDL_HOME, 'conf') CONFIG_PATH = os.path.join(CONF_HOME, 'config.json') +FASTDEPLOYSERVER_PATH = os.path.join(VDL_HOME, 'fastdeployserver') X2PADDLE_CACHE_PATH = os.path.join(VDL_HOME, 'x2paddle') @@ -32,5 +33,7 @@ def init_vdl_config(): if not os.path.exists(CONFIG_PATH) or 0 == os.path.getsize(CONFIG_PATH): with open(CONFIG_PATH, 'w') as fp: fp.write(json.dumps(default_vdl_config)) + if not os.path.exists(FASTDEPLOYSERVER_PATH): + os.makedirs(FASTDEPLOYSERVER_PATH, exist_ok=True) if not os.path.exists(X2PADDLE_CACHE_PATH): os.makedirs(X2PADDLE_CACHE_PATH, exist_ok=True)
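With the route dispatcher added to app.py, the 'fastdeploy_client' tab registered in api.py and args.py, and the cache directory prepared in dir.py, the client page is reachable through the regular VisualDL HTTP API. The following minimal sketch only smoke-tests the new entry point; it assumes a locally running VisualDL instance on the default port 8040 with the default '/api' prefix, neither of which is stated in this diff.

import requests

# On success the entry point answers with a 302 redirect to the proxied gradio
# app; if the gradio client cannot be started, a plain error message is
# returned instead (see serve_fastdeploy_create_fastdeploy_client above).
resp = requests.get(
    'http://localhost:8040/api/fastdeploy/fastdeploy_client',
    allow_redirects=False)
print(resp.status_code, resp.headers.get('Location'))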