I0627 23:07:12.842238 1660 cache_manager.cc:480] Create CacheManager with cache_dir: '/opt/tritonserver/caches'
I0627 23:07:13.126047 1660 pinned_memory_manager.cc:275] Pinned memory pool is created at '0x7fe648000000' with size 268435456
I0627 23:07:13.126470 1660 cuda_memory_manager.cc:107] CUDA memory pool is created on device 0 with size 67108864
I0627 23:07:13.128860 1660 model_config_utils.cc:680] Server side auto-completed config: name: "ensemble" platform: "ensemble" max_batch_size: 64 input { name: "text_input" data_type: TYPE_STRING dims: -1 } input { name: "max_tokens" data_type: TYPE_INT32 dims: -1 } input { name: "bad_words" data_type: TYPE_STRING dims: -1 optional: true } input { name: "stop_words" data_type: TYPE_STRING dims: -1 optional: true } input { name: "end_id" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "pad_id" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "top_k" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "top_p" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "temperature" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "length_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "repetition_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "min_length" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "presence_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "frequency_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "random_seed" data_type: TYPE_UINT64 dims: 1 optional: true } input { name: "return_log_probs" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "return_context_logits" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "return_generation_logits" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "beam_width" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "stream" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "prompt_embedding_table" data_type: TYPE_FP16 dims: -1 dims: -1 optional: true } input { name: "prompt_vocab_size" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "embedding_bias_words" data_type: TYPE_STRING dims: -1 optional: true } input { name: "embedding_bias_weights" data_type: TYPE_FP32 dims: -1 optional: true } output { name: "text_output" data_type: TYPE_STRING dims: -1 } output { name: "cum_log_probs" data_type: TYPE_FP32 dims: -1 } output { name: "output_log_probs" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "context_logits" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "generation_logits" data_type: TYPE_FP32 dims: -1 dims: -1 dims: -1 } ensemble_scheduling { step { model_name: "preprocessing" model_version: -1 input_map { key: "BAD_WORDS_DICT" value: "bad_words" } input_map { key: "EMBEDDING_BIAS_WEIGHTS" value: "embedding_bias_weights" } input_map { key: "EMBEDDING_BIAS_WORDS" value: "embedding_bias_words" } input_map { key: "END_ID" value: "end_id" } input_map { key: "PAD_ID" value: "pad_id" } input_map { key: "QUERY" value: "text_input" } input_map { key: "REQUEST_OUTPUT_LEN" value: "max_tokens" } input_map { key: "STOP_WORDS_DICT" value: "stop_words" } output_map { key: "BAD_WORDS_IDS" value: "_BAD_WORDS_IDS" } output_map { key: "EMBEDDING_BIAS" value: "_EMBEDDING_BIAS" } output_map { key: "INPUT_ID" value: "_INPUT_ID" } output_map { key: "OUT_END_ID" value: "_PREPROCESSOR_END_ID" } output_map { key: "OUT_PAD_ID" value: "_PREPROCESSOR_PAD_ID" } output_map { key: "REQUEST_INPUT_LEN" value: "_REQUEST_INPUT_LEN" } output_map { key: "REQUEST_OUTPUT_LEN" value: "_REQUEST_OUTPUT_LEN" } output_map { key: "STOP_WORDS_IDS" value: "_STOP_WORDS_IDS" } } step { model_name: "tensorrt_llm" model_version: -1 input_map { key: "bad_words_list" value: "_BAD_WORDS_IDS" } input_map { key: "beam_width" value: "beam_width" } input_map { key: "embedding_bias" value: "_EMBEDDING_BIAS" } input_map { key: "end_id" value: "_PREPROCESSOR_END_ID" } input_map { key: "frequency_penalty" value: "frequency_penalty" } input_map { key: "input_ids" value: "_INPUT_ID" } input_map { key: "input_lengths" value: "_REQUEST_INPUT_LEN" } input_map { key: "len_penalty" value: "length_penalty" } input_map { key: "min_length" value: "min_length" } input_map { key: "pad_id" value: "_PREPROCESSOR_PAD_ID" } input_map { key: "presence_penalty" value: "presence_penalty" } input_map { key: "prompt_embedding_table" value: "prompt_embedding_table" } input_map { key: "prompt_vocab_size" value: "prompt_vocab_size" } input_map { key: "random_seed" value: "random_seed" } input_map { key: "repetition_penalty" value: "repetition_penalty" } input_map { key: "request_output_len" value: "_REQUEST_OUTPUT_LEN" } input_map { key: "return_context_logits" value: "return_context_logits" } input_map { key: "return_generation_logits" value: "return_generation_logits" } input_map { key: "return_log_probs" value: "return_log_probs" } input_map { key: "runtime_top_k" value: "top_k" } input_map { key: "runtime_top_p" value: "top_p" } input_map { key: "stop_words_list" value: "_STOP_WORDS_IDS" } input_map { key: "streaming" value: "stream" } input_map { key: "temperature" value: "temperature" } output_map { key: "context_logits" value: "_CONTEXT_LOGITS" } output_map { key: "cum_log_probs" value: "_CUM_LOG_PROBS" } output_map { key: "generation_logits" value: "_GENERATION_LOGITS" } output_map { key: "output_ids" value: "_TOKENS_BATCH" } output_map { key: "output_log_probs" value: "_OUTPUT_LOG_PROBS" } output_map { key: "sequence_length" value: "_SEQUENCE_LENGTH" } } step { model_name: "postprocessing" model_version: -1 input_map { key: "CONTEXT_LOGITS" value: "_CONTEXT_LOGITS" } input_map { key: "CUM_LOG_PROBS" value: "_CUM_LOG_PROBS" } input_map { key: "GENERATION_LOGITS" value: "_GENERATION_LOGITS" } input_map { key: "OUTPUT_LOG_PROBS" value: "_OUTPUT_LOG_PROBS" } input_map { key: "SEQUENCE_LENGTH" value: "_SEQUENCE_LENGTH" } input_map { key: "TOKENS_BATCH" value: "_TOKENS_BATCH" } output_map { key: "OUTPUT" value: "text_output" } output_map { key: "OUT_CONTEXT_LOGITS" value: "context_logits" } output_map { key: "OUT_CUM_LOG_PROBS" value: "cum_log_probs" } output_map { key: "OUT_GENERATION_LOGITS" value: "generation_logits" } output_map { key: "OUT_OUTPUT_LOG_PROBS" value: "output_log_probs" } } }
I0627 23:07:13.129637 1660 model_config_utils.cc:680] Server side auto-completed config: name: "postprocessing" max_batch_size: 64 input { name: "TOKENS_BATCH" data_type: TYPE_INT32 dims: -1 dims: -1 } input { name: "SEQUENCE_LENGTH" data_type: TYPE_INT32 dims: -1 } input { name: "CUM_LOG_PROBS" data_type: TYPE_FP32 dims: -1 } input { name: "OUTPUT_LOG_PROBS" data_type: TYPE_FP32 dims: -1 dims: -1 } input { name: "CONTEXT_LOGITS" data_type: TYPE_FP32 dims: -1 dims: -1 optional: true } input { name: "GENERATION_LOGITS" data_type: TYPE_FP32 dims: -1 dims: -1 dims: -1 optional: true } output { name: "OUTPUT" data_type: TYPE_STRING dims: -1 } output { name: "OUT_CUM_LOG_PROBS" data_type: TYPE_FP32 dims: -1 } output { name: "OUT_OUTPUT_LOG_PROBS" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "OUT_CONTEXT_LOGITS" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "OUT_GENERATION_LOGITS" data_type: TYPE_FP32 dims: -1 dims: -1 dims: -1 } instance_group { count: 1 kind: KIND_CPU } default_model_filename: "model.py" parameters { key: "skip_special_tokens" value { string_value: "True" } } parameters { key: "tokenizer_dir" value { string_value: "/tensorrt/models/Meta-Llama-Guard-2-8B" } } parameters { key: "tokenizer_type" value { string_value: "llama" } } backend: "python"
I0627 23:07:13.129958 1660 model_config_utils.cc:680] Server side auto-completed config: name: "preprocessing" max_batch_size: 64 input { name: "QUERY" data_type: TYPE_STRING dims: -1 } input { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 dims: -1 } input { name: "BAD_WORDS_DICT" data_type: TYPE_STRING dims: -1 optional: true } input { name: "STOP_WORDS_DICT" data_type: TYPE_STRING dims: -1 optional: true } input { name: "EMBEDDING_BIAS_WORDS" data_type: TYPE_STRING dims: -1 optional: true } input { name: "EMBEDDING_BIAS_WEIGHTS" data_type: TYPE_FP32 dims: -1 optional: true } input { name: "END_ID" data_type: TYPE_INT32 dims: -1 optional: true } input { name: "PAD_ID" data_type: TYPE_INT32 dims: -1 optional: true } output { name: "INPUT_ID" data_type: TYPE_INT32 dims: -1 } output { name: "REQUEST_INPUT_LEN" data_type: TYPE_INT32 dims: 1 } output { name: "BAD_WORDS_IDS" data_type: TYPE_INT32 dims: 2 dims: -1 } output { name: "STOP_WORDS_IDS" data_type: TYPE_INT32 dims: 2 dims: -1 } output { name: "EMBEDDING_BIAS" data_type: TYPE_FP32 dims: -1 } output { name: "REQUEST_OUTPUT_LEN" data_type: TYPE_INT32 dims: -1 } output { name: "OUT_END_ID" data_type: TYPE_INT32 dims: -1 } output { name: "OUT_PAD_ID" data_type: TYPE_INT32 dims: -1 } instance_group { count: 1 kind: KIND_CPU } default_model_filename: "model.py" parameters { key: "add_special_tokens" value { string_value: "False" } } parameters { key: "tokenizer_dir" value { string_value: "/tensorrt/models/Meta-Llama-Guard-2-8B" } } parameters { key: "tokenizer_type" value { string_value: "llama" } } backend: "python"
I0627 23:07:13.130386 1660 model_config_utils.cc:680] Server side auto-completed config: name: "tensorrt_llm" max_batch_size: 64 input { name: "input_ids" data_type: TYPE_INT32 dims: -1 allow_ragged_batch: true } input { name: "input_lengths" data_type: TYPE_INT32 dims: 1 reshape { } } input { name: "request_output_len" data_type: TYPE_INT32 dims: 1 } input { name: "draft_input_ids" data_type: TYPE_INT32 dims: -1 allow_ragged_batch: true optional: true } input { name: "end_id" data_type: TYPE_INT32 dims: 1 reshape { } optional: true } input { name: "pad_id" data_type: TYPE_INT32 dims: 1 reshape { } optional: true } input { name: "stop_words_list" data_type: TYPE_INT32 dims: 2 dims: -1 allow_ragged_batch: true optional: true } input { name: "bad_words_list" data_type: TYPE_INT32 dims: 2 dims: -1 allow_ragged_batch: true optional: true } input { name: "embedding_bias" data_type: TYPE_FP32 dims: -1 allow_ragged_batch: true optional: true } input { name: "beam_width" data_type: TYPE_INT32 dims: 1 reshape { } optional: true } input { name: "temperature" data_type: TYPE_FP32 dims: 1 reshape { } optional: true } input { name: "runtime_top_k" data_type: TYPE_INT32 dims: 1 reshape { } optional: true } input { name: "runtime_top_p" data_type: TYPE_FP32 dims: 1 reshape { } optional: true } input { name: "len_penalty" data_type: TYPE_FP32 dims: 1 reshape { } optional: true } input { name: "repetition_penalty" data_type: TYPE_FP32 dims: 1 reshape { } optional: true } input { name: "min_length" data_type: TYPE_INT32 dims: 1 reshape { } optional: true } input { name: "presence_penalty" data_type: TYPE_FP32 dims: 1 reshape { } optional: true } input { name: "frequency_penalty" data_type: TYPE_FP32 dims: 1 reshape { } optional: true } input { name: "random_seed" data_type: TYPE_UINT64 dims: 1 reshape { } optional: true } input { name: "return_log_probs" data_type: TYPE_BOOL dims: 1 reshape { } optional: true } input { name: "return_context_logits" data_type: TYPE_BOOL dims: 1 reshape { } optional: true } input { name: "return_generation_logits" data_type: TYPE_BOOL dims: 1 reshape { } optional: true } input { name: "stop" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "streaming" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "prompt_embedding_table" data_type: TYPE_FP16 dims: -1 dims: -1 allow_ragged_batch: true optional: true } input { name: "prompt_vocab_size" data_type: TYPE_INT32 dims: 1 reshape { } optional: true } input { name: "lora_weights" data_type: TYPE_FP16 dims: -1 dims: -1 allow_ragged_batch: true optional: true } input { name: "lora_config" data_type: TYPE_INT32 dims: -1 dims: 3 allow_ragged_batch: true optional: true } output { name: "output_ids" data_type: TYPE_INT32 dims: -1 dims: -1 } output { name: "sequence_length" data_type: TYPE_INT32 dims: -1 } output { name: "cum_log_probs" data_type: TYPE_FP32 dims: -1 } output { name: "output_log_probs" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "context_logits" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "generation_logits" data_type: TYPE_FP32 dims: -1 dims: -1 dims: -1 } instance_group { count: 1 kind: KIND_CPU } dynamic_batching { preferred_batch_size: 64 max_queue_delay_microseconds: 600 } parameters { key: "FORCE_CPU_ONLY_INPUT_TENSORS" value { string_value: "no" } } parameters { key: "batch_scheduler_policy" value { string_value: "${batch_scheduler_policy}" } } parameters { key: "enable_chunked_context" value { string_value: "${enable_chunked_context}" } } parameters { key: "enable_kv_cache_reuse" value { string_value: "False" } } parameters { key: "enable_trt_overlap" value { string_value: "${enable_trt_overlap}" } } parameters { key: "exclude_input_in_output" value { string_value: "True" } } parameters { key: "gpt_model_path" value { string_value: "/tensorrt/tensorrt-models/Meta-Llama-Guard-2-8B/v0.8.0/trt-engines/fp16/1-gpu/" } } parameters { key: "gpt_model_type" value { string_value: "inflight_batching" } } parameters { key: "gpu_device_ids" value { string_value: "${gpu_device_ids}" } } parameters { key: "kv_cache_free_gpu_mem_fraction" value { string_value: "0.9" } } parameters { key: "max_attention_window_size" value { string_value: "2560" } } parameters { key: "max_beam_width" value { string_value: "1" } } parameters { key: "max_tokens_in_paged_kv_cache" value { string_value: "2560" } } parameters { key: "normalize_log_probs" value { string_value: "${normalize_log_probs}" } } backend: "tensorrtllm" model_transaction_policy { }
I0627 23:07:13.130914 1660 model_config_utils.cc:680] Server side auto-completed config: name: "tensorrt_llm_bls" max_batch_size: 64 input { name: "text_input" data_type: TYPE_STRING dims: -1 } input { name: "max_tokens" data_type: TYPE_INT32 dims: -1 } input { name: "bad_words" data_type: TYPE_STRING dims: -1 optional: true } input { name: "stop_words" data_type: TYPE_STRING dims: -1 optional: true } input { name: "end_id" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "pad_id" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "top_k" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "top_p" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "temperature" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "length_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "repetition_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "min_length" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "presence_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "frequency_penalty" data_type: TYPE_FP32 dims: 1 optional: true } input { name: "random_seed" data_type: TYPE_UINT64 dims: 1 optional: true } input { name: "return_log_probs" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "return_context_logits" data_type: TYPE_BOOL dims: 1 reshape { } optional: true } input { name: "return_generation_logits" data_type: TYPE_BOOL dims: 1 reshape { } optional: true } input { name: "beam_width" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "stream" data_type: TYPE_BOOL dims: 1 optional: true } input { name: "prompt_embedding_table" data_type: TYPE_FP16 dims: -1 dims: -1 optional: true } input { name: "prompt_vocab_size" data_type: TYPE_INT32 dims: 1 optional: true } input { name: "embedding_bias_words" data_type: TYPE_STRING dims: -1 optional: true } input { name: "embedding_bias_weights" data_type: TYPE_FP32 dims: -1 optional: true } output { name: "text_output" data_type: TYPE_STRING dims: -1 } output { name: "cum_log_probs" data_type: TYPE_FP32 dims: -1 } output { name: "output_log_probs" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "context_logits" data_type: TYPE_FP32 dims: -1 dims: -1 } output { name: "generation_logits" data_type: TYPE_FP32 dims: -1 dims: -1 dims: -1 } instance_group { count: 1 kind: KIND_CPU } default_model_filename: "model.py" parameters { key: "accumulate_tokens" value { string_value: "False" } } backend: "python" model_transaction_policy { }
I0627 23:07:13.131264 1660 model_lifecycle.cc:438] AsyncLoad() 'postprocessing'
I0627 23:07:13.131325 1660 model_lifecycle.cc:469] loading: postprocessing:1
I0627 23:07:13.131354 1660 model_lifecycle.cc:438] AsyncLoad() 'preprocessing'
I0627 23:07:13.131418 1660 model_lifecycle.cc:469] loading: preprocessing:1
I0627 23:07:13.131441 1660 model_lifecycle.cc:438] AsyncLoad() 'tensorrt_llm'
I0627 23:07:13.131440 1660 model_lifecycle.cc:547] CreateModel() 'postprocessing' version 1
I0627 23:07:13.131496 1660 model_lifecycle.cc:469] loading: tensorrt_llm:1
I0627 23:07:13.131501 1660 model_lifecycle.cc:547] CreateModel() 'preprocessing' version 1
I0627 23:07:13.131554 1660 backend_model.cc:502] Adding default backend config setting: default-max-batch-size,4
I0627 23:07:13.131517 1660 model_lifecycle.cc:438] AsyncLoad() 'tensorrt_llm_bls'
I0627 23:07:13.131608 1660 model_lifecycle.cc:547] CreateModel() 'tensorrt_llm' version 1
I0627 23:07:13.131644 1660 model_lifecycle.cc:469] loading: tensorrt_llm_bls:1
I0627 23:07:13.131650 1660 backend_model.cc:502] Adding default backend config setting: default-max-batch-size,4
I0627 23:07:13.131595 1660 shared_library.cc:112] OpenLibraryHandle: /opt/tritonserver/backends/python/libtriton_python.so
I0627 23:07:13.131719 1660 model_lifecycle.cc:547] CreateModel() 'tensorrt_llm_bls' version 1
I0627 23:07:13.131733 1660 backend_model.cc:502] Adding default backend config setting: default-max-batch-size,4
I0627 23:07:13.131819 1660 backend_model.cc:502] Adding default backend config setting: default-max-batch-size,4
I0627 23:07:13.132878 1660 python_be.cc:2075] 'python' TRITONBACKEND API version: 1.18
I0627 23:07:13.132901 1660 python_be.cc:2097] backend configuration: {"cmdline":{"auto-complete-config":"true","backend-directory":"/opt/tritonserver/backends","min-compute-capability":"6.000000","default-max-batch-size":"4"}}
I0627 23:07:13.132943 1660 python_be.cc:2236] Shared memory configuration is shm-default-byte-size=1048576,shm-growth-byte-size=1048576,stub-timeout-seconds=30
I0627 23:07:13.133093 1660 python_be.cc:2559] TRITONBACKEND_GetBackendAttribute: setting attributes
I0627 23:07:13.133214 1660 shared_library.cc:112] OpenLibraryHandle: /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so
I0627 23:07:13.134783 1660 python_be.cc:2337] TRITONBACKEND_ModelInitialize: postprocessing (version 1)
I0627 23:07:13.134810 1660 python_be.cc:2337] TRITONBACKEND_ModelInitialize: preprocessing (version 1)
I0627 23:07:13.135505 1660 model_config_utils.cc:1902] ModelConfig 64-bit fields:
I0627 23:07:13.135525 1660 model_config_utils.cc:1904] ModelConfig::dynamic_batching::default_priority_level
I0627 23:07:13.135532 1660 model_config_utils.cc:1904] ModelConfig::dynamic_batching::default_queue_policy::default_timeout_microseconds
I0627 23:07:13.135546 1660 model_config_utils.cc:1904] ModelConfig::dynamic_batching::max_queue_delay_microseconds
I0627 23:07:13.135553 1660 model_config_utils.cc:1904] ModelConfig::dynamic_batching::priority_levels
I0627 23:07:13.135566 1660 model_config_utils.cc:1904] ModelConfig::dynamic_batching::priority_queue_policy::key
I0627 23:07:13.135573 1660 model_config_utils.cc:1904] ModelConfig::dynamic_batching::priority_queue_policy::value::default_timeout_microseconds
I0627 23:07:13.135582 1660 model_config_utils.cc:1904] ModelConfig::ensemble_scheduling::step::model_version
I0627 23:07:13.135598 1660 model_config_utils.cc:1904] ModelConfig::input::dims
I0627 23:07:13.135616 1660 model_config_utils.cc:1904] ModelConfig::input::reshape::shape
I0627 23:07:13.135624 1660 model_config_utils.cc:1904] ModelConfig::instance_group::secondary_devices::device_id
I0627 23:07:13.135632 1660 model_config_utils.cc:1904] ModelConfig::model_warmup::inputs::value::dims
I0627 23:07:13.135649 1660 model_config_utils.cc:1904] ModelConfig::optimization::cuda::graph_spec::graph_lower_bound::input::value::dim
I0627 23:07:13.135674 1660 model_config_utils.cc:1904] ModelConfig::optimization::cuda::graph_spec::input::value::dim
I0627 23:07:13.135693 1660 model_config_utils.cc:1904] ModelConfig::output::dims
I0627 23:07:13.135710 1660 model_config_utils.cc:1904] ModelConfig::output::reshape::shape
I0627 23:07:13.135726 1660 model_config_utils.cc:1904] ModelConfig::sequence_batching::direct::max_queue_delay_microseconds
I0627 23:07:13.135733 1660 model_config_utils.cc:1904] ModelConfig::sequence_batching::max_sequence_idle_microseconds
I0627 23:07:13.135749 1660 model_config_utils.cc:1904] ModelConfig::sequence_batching::oldest::max_queue_delay_microseconds
I0627 23:07:13.135765 1660 model_config_utils.cc:1904] ModelConfig::sequence_batching::state::dims
I0627 23:07:13.135781 1660 model_config_utils.cc:1904] ModelConfig::sequence_batching::state::initial_state::dims
I0627 23:07:13.135799 1660 model_config_utils.cc:1904] ModelConfig::version_policy::specific::versions
I0627 23:07:13.162495 1660 stub_launcher.cc:388] Starting Python backend stub: exec /opt/tritonserver/backends/python/triton_python_backend_stub /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py triton_python_backend_shm_region_2 1048576 1048576 1660 /opt/tritonserver/backends/python 336 preprocessing DEFAULT
I0627 23:07:13.162580 1660 stub_launcher.cc:388] Starting Python backend stub: exec /opt/tritonserver/backends/python/triton_python_backend_stub /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py triton_python_backend_shm_region_1 1048576 1048576 1660 /opt/tritonserver/backends/python 336 postprocessing DEFAULT
I0627 23:07:13.227083 1660 python_be.cc:2337] TRITONBACKEND_ModelInitialize: tensorrt_llm_bls (version 1)
I0627 23:07:13.228908 1660 stub_launcher.cc:388] Starting Python backend stub: exec /opt/tritonserver/backends/python/triton_python_backend_stub /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/tensorrt_llm_bls/1/model.py triton_python_backend_shm_region_3 1048576 1048576 1660 /opt/tritonserver/backends/python 336 tensorrt_llm_bls DEFAULT
[TensorRT-LLM][WARNING] gpu_device_ids is not specified, will be automatically set
[TensorRT-LLM][WARNING] batch_scheduler_policy parameter was not found or is invalid (must be max_utilization or guaranteed_no_evict)
[TensorRT-LLM][WARNING] enable_chunked_context is not specified, will be set to false.
[TensorRT-LLM][WARNING] enable_trt_overlap is not specified, will be set to false
[TensorRT-LLM][WARNING] normalize_log_probs is not specified, will be set to true
[TensorRT-LLM][INFO] Engine version 0.8.0 found in the config file, assuming engine(s) built by new builder API.
[TensorRT-LLM][WARNING] [json.exception.type_error.302] type must be array, but is null
[TensorRT-LLM][WARNING] Optional value for parameter lora_target_modules will not be set.
[TensorRT-LLM][WARNING] Parameter max_draft_len cannot be read from json:
[TensorRT-LLM][WARNING] [json.exception.out_of_range.403] key 'max_draft_len' not found
[TensorRT-LLM][WARNING] [json.exception.type_error.302] type must be string, but is null
[TensorRT-LLM][WARNING] Optional value for parameter quant_algo will not be set.
[TensorRT-LLM][WARNING] [json.exception.type_error.302] type must be string, but is null
[TensorRT-LLM][WARNING] Optional value for parameter kv_cache_quant_algo will not be set.
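The warnings above trace back to the tensorrt_llm config earlier in this log: several parameters were left as literal ${...} template placeholders (batch_scheduler_policy, enable_chunked_context, enable_trt_overlap, normalize_log_probs, gpu_device_ids), so the backend rejects or defaults them. A hedged sketch of pinning them with a simple substitution pass over config.pbtxt; the chosen values are examples, not from the log, and the tensorrtllm_backend repository also ships a tools/fill_template.py helper for this same job:

```python
# Hypothetical one-off script to fill the unexpanded template variables
# reported in the warnings above. Path and values are assumptions.
from pathlib import Path

cfg = Path("/tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/tensorrt_llm/config.pbtxt")
text = cfg.read_text()
for placeholder, value in {
    "${batch_scheduler_policy}": "max_utilization",  # or guaranteed_no_evict
    "${enable_chunked_context}": "false",
    "${enable_trt_overlap}": "false",
    "${normalize_log_probs}": "true",
    "${gpu_device_ids}": "0",
}.items():
    text = text.replace(placeholder, value)
cfg.write_text(text)
```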
[TensorRT-LLM][INFO] Initializing MPI with thread mode 1
[TensorRT-LLM][INFO] MPI size: 1, rank: 0
[TensorRT-LLM][INFO] Rank 0 is using GPU 0
I0627 23:07:14.542780 1660 python_be.cc:2031] model configuration: { "name": "tensorrt_llm_bls", "platform": "", "backend": "python", "runtime": "", "version_policy": { "latest": { "num_versions": 1 } }, "max_batch_size": 64, "input": [ { "name": "text_input", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "max_tokens", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "bad_words", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "stop_words", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "end_id", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "pad_id", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "top_k", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "top_p", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "temperature", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "length_penalty", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "repetition_penalty", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "min_length", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "presence_penalty", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "frequency_penalty", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "random_seed", "data_type": "TYPE_UINT64", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "return_log_probs", "data_type": "TYPE_BOOL", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "return_context_logits", "data_type": "TYPE_BOOL", "format": "FORMAT_NONE", "dims": [ 1 ], "reshape": { "shape": [] }, "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "return_generation_logits", "data_type": "TYPE_BOOL", "format": "FORMAT_NONE", "dims": [ 1 ], "reshape": { "shape": [] }, "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "beam_width", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "stream", "data_type": "TYPE_BOOL", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "prompt_embedding_table", "data_type": "TYPE_FP16", "format": "FORMAT_NONE", "dims": [ -1, -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "prompt_vocab_size", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ 1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "embedding_bias_words", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "embedding_bias_weights", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true } ], "output": [ { "name": "text_output", "data_type": "TYPE_STRING", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "cum_log_probs", "data_type": "TYPE_FP32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "output_log_probs", "data_type": "TYPE_FP32", "dims": [ -1, -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "context_logits", "data_type": "TYPE_FP32", "dims": [ -1, -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "generation_logits", "data_type": "TYPE_FP32", "dims": [ -1, -1, -1 ], "label_filename": "", "is_shape_tensor": false } ], "batch_input": [], "batch_output": [], "optimization": { "priority": "PRIORITY_DEFAULT", "input_pinned_memory": { "enable": true }, "output_pinned_memory": { "enable": true }, "gather_kernel_buffer_threshold": 0, "eager_batching": false }, "instance_group": [ { "name": "tensorrt_llm_bls_0", "kind": "KIND_CPU", "count": 1, "gpus": [], "secondary_devices": [], "profile": [], "passive": false, "host_policy": "" } ], "default_model_filename": "model.py", "cc_model_filenames": {}, "metric_tags": {}, "parameters": { "accumulate_tokens": { "string_value": "False" } }, "model_warmup": [], "model_transaction_policy": { "decoupled": false } }
I0627 23:07:14.543360 1660 python_be.cc:2381] TRITONBACKEND_ModelInstanceInitialize: tensorrt_llm_bls_0_0 (CPU device 0)
I0627 23:07:14.543393 1660 backend_model_instance.cc:69] Creating instance tensorrt_llm_bls_0_0 on CPU using artifact 'model.py'
I0627 23:07:14.544331 1660 stub_launcher.cc:388] Starting Python backend stub: exec /opt/tritonserver/backends/python/triton_python_backend_stub /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/tensorrt_llm_bls/1/model.py triton_python_backend_shm_region_4 1048576 1048576 1660 /opt/tritonserver/backends/python 336 tensorrt_llm_bls_0_0 DEFAULT
I0627 23:07:14.837605 1660 python_be.cc:2402] TRITONBACKEND_ModelInstanceInitialize: instance initialization successful tensorrt_llm_bls_0_0 (device 0)
I0627 23:07:14.837845 1660 backend_model_instance.cc:772] Starting backend thread for tensorrt_llm_bls_0_0 at nice 0 on device 0...
I0627 23:07:14.837954 1660 backend_model.cc:674] Created model instance named 'tensorrt_llm_bls_0_0' with device id '0'
I0627 23:07:14.838117 1660 model_lifecycle.cc:692] OnLoadComplete() 'tensorrt_llm_bls' version 1
I0627 23:07:14.838147 1660 model_lifecycle.cc:730] OnLoadFinal() 'tensorrt_llm_bls' for all version(s)
I0627 23:07:14.838156 1660 model_lifecycle.cc:835] successfully loaded 'tensorrt_llm_bls'
I0627 23:07:16.875996 1660 python_be.cc:2031] model configuration: { "name": "preprocessing", "platform": "", "backend": "python", "runtime": "", "version_policy": { "latest": { "num_versions": 1 } }, "max_batch_size": 64, "input": [ { "name": "QUERY", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "REQUEST_OUTPUT_LEN", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "BAD_WORDS_DICT", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "STOP_WORDS_DICT", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "EMBEDDING_BIAS_WORDS", "data_type": "TYPE_STRING", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "EMBEDDING_BIAS_WEIGHTS", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "END_ID", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "PAD_ID", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true } ], "output": [ { "name": "INPUT_ID", "data_type": "TYPE_INT32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "REQUEST_INPUT_LEN", "data_type": "TYPE_INT32", "dims": [ 1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "BAD_WORDS_IDS", "data_type": "TYPE_INT32", "dims": [ 2, -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "STOP_WORDS_IDS", "data_type": "TYPE_INT32", "dims": [ 2, -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "EMBEDDING_BIAS", "data_type": "TYPE_FP32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "REQUEST_OUTPUT_LEN", "data_type": "TYPE_INT32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "OUT_END_ID", "data_type": "TYPE_INT32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "OUT_PAD_ID", "data_type": "TYPE_INT32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false } ], "batch_input": [], "batch_output": [], "optimization": { "priority": "PRIORITY_DEFAULT", "input_pinned_memory": { "enable": true }, "output_pinned_memory": { "enable": true }, "gather_kernel_buffer_threshold": 0, "eager_batching": false }, "instance_group": [ { "name": "preprocessing_0", "kind": "KIND_CPU", "count": 1, "gpus": [], "secondary_devices": [], "profile": [], "passive": false, "host_policy": "" } ], "default_model_filename": "model.py", "cc_model_filenames": {}, "metric_tags": {}, "parameters": { "add_special_tokens": { "string_value": "False" }, "tokenizer_type": { "string_value": "llama" }, "tokenizer_dir": { "string_value": "/tensorrt/models/Meta-Llama-Guard-2-8B" } }, "model_warmup": [] }
I0627 23:07:16.876554 1660 python_be.cc:2381] TRITONBACKEND_ModelInstanceInitialize: preprocessing_0_0 (CPU device 0)
I0627 23:07:16.876582 1660 backend_model_instance.cc:69] Creating instance preprocessing_0_0 on CPU using artifact 'model.py'
I0627 23:07:16.877359 1660 stub_launcher.cc:388] Starting Python backend stub: exec /opt/tritonserver/backends/python/triton_python_backend_stub /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py triton_python_backend_shm_region_5 1048576 1048576 1660 /opt/tritonserver/backends/python 336 preprocessing_0_0 DEFAULT
I0627 23:07:16.961337 1660 python_be.cc:2031] model configuration: { "name": "postprocessing", "platform": "", "backend": "python", "runtime": "", "version_policy": { "latest": { "num_versions": 1 } }, "max_batch_size": 64, "input": [ { "name": "TOKENS_BATCH", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ -1, -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "SEQUENCE_LENGTH", "data_type": "TYPE_INT32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "CUM_LOG_PROBS", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "OUTPUT_LOG_PROBS", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ -1, -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": false }, { "name": "CONTEXT_LOGITS", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ -1, -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true }, { "name": "GENERATION_LOGITS", "data_type": "TYPE_FP32", "format": "FORMAT_NONE", "dims": [ -1, -1, -1 ], "is_shape_tensor": false, "allow_ragged_batch": false, "optional": true } ], "output": [ { "name": "OUTPUT", "data_type": "TYPE_STRING", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "OUT_CUM_LOG_PROBS", "data_type": "TYPE_FP32", "dims": [ -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "OUT_OUTPUT_LOG_PROBS", "data_type": "TYPE_FP32", "dims": [ -1, -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "OUT_CONTEXT_LOGITS", "data_type": "TYPE_FP32", "dims": [ -1, -1 ], "label_filename": "", "is_shape_tensor": false }, { "name": "OUT_GENERATION_LOGITS", "data_type": "TYPE_FP32", "dims": [ -1, -1, -1 ], "label_filename": "", "is_shape_tensor": false } ], "batch_input": [], "batch_output": [], "optimization": { "priority": "PRIORITY_DEFAULT", "input_pinned_memory": { "enable": true }, "output_pinned_memory": { "enable": true }, "gather_kernel_buffer_threshold": 0, "eager_batching": false }, "instance_group": [ { "name": "postprocessing_0", "kind": "KIND_CPU", "count": 1, "gpus": [], "secondary_devices": [], "profile": [], "passive": false, "host_policy": "" } ], "default_model_filename": "model.py", "cc_model_filenames": {}, "metric_tags": {}, "parameters": { "skip_special_tokens": { "string_value": "True" }, "tokenizer_dir": { "string_value": "/tensorrt/models/Meta-Llama-Guard-2-8B" }, "tokenizer_type": { "string_value": "llama" } }, "model_warmup": [] }
I0627 23:07:16.962207 1660 python_be.cc:2381] TRITONBACKEND_ModelInstanceInitialize: postprocessing_0_0 (CPU device 0)
I0627 23:07:16.962244 1660 backend_model_instance.cc:69] Creating instance postprocessing_0_0 on CPU using artifact 'model.py'
I0627 23:07:16.963229 1660 stub_launcher.cc:388] Starting Python backend stub: exec /opt/tritonserver/backends/python/triton_python_backend_stub /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py triton_python_backend_shm_region_6 1048576 1048576 1660 /opt/tritonserver/backends/python 336 postprocessing_0_0 DEFAULT
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization.
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'.
The class this function is called from is 'LlamaTokenizer'.
I0627 23:07:19.244434 1660 pb_stub.cc:358] Failed to initialize Python stub: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py(74): initialize
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization.
The tokenizer class you load from this checkpoint is 'PreTrainedTokenizerFast'.
The class this function is called from is 'LlamaTokenizer'.
I0627 23:07:19.302859 1660 pb_stub.cc:358] Failed to initialize Python stub: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize
I0627 23:07:19.652459 1660 python_be.cc:2545] TRITONBACKEND_ModelInstanceFinalize: delete instance state
E0627 23:07:19.653028 1660 backend_model.cc:691] ERROR: Failed to create instance: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py(74): initialize
I0627 23:07:19.653069 1660 python_be.cc:2360] TRITONBACKEND_ModelFinalize: delete model state
E0627 23:07:19.653148 1660 model_lifecycle.cc:638] failed to load 'preprocessing' version 1: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py(74): initialize
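This TypeError is the crux of the log. Both pre- and postprocessing configs set tokenizer_type to "llama", which in these v0.8.0-era model.py templates selects the slow, SentencePiece-based LlamaTokenizer; the get_spm_processor frame shows it trying to load a SentencePiece tokenizer.model file. Llama-3-family checkpoints such as Meta-Llama-Guard-2-8B ship only the fast tokenizer.json (hence the 'PreTrainedTokenizerFast' warnings just above), so vocab_file resolves to None and SentencePiece raises the TypeError. A hedged repro sketch, assuming the tokenizer_dir contains only fast-tokenizer files:

```python
# Minimal sketch of the stub failure, assuming tokenizer.json is present
# but tokenizer.model (SentencePiece) is not -- standard for Llama 3 era
# checkpoints. The path comes from the tokenizer_dir parameter in the log.
from transformers import AutoTokenizer, LlamaTokenizer

path = "/tensorrt/models/Meta-Llama-Guard-2-8B"

# What tokenizer_type "llama" effectively does: the slow SentencePiece
# class. With no tokenizer.model, vocab_file is None, and loading raises
# TypeError: expected str, bytes or os.PathLike object, not NoneType
# tok = LlamaTokenizer.from_pretrained(path)

# What works: let transformers pick PreTrainedTokenizerFast from
# tokenizer.json, matching the class named in the warnings above.
tok = AutoTokenizer.from_pretrained(path)
```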
I0627 23:07:19.653173 1660 model_lifecycle.cc:692] OnLoadComplete() 'preprocessing' version 1
I0627 23:07:19.653203 1660 model_lifecycle.cc:730] OnLoadFinal() 'preprocessing' for all version(s)
I0627 23:07:19.653219 1660 model_lifecycle.cc:773] failed to load 'preprocessing'
I0627 23:07:19.741453 1660 python_be.cc:2545] TRITONBACKEND_ModelInstanceFinalize: delete instance state
E0627 23:07:19.741972 1660 backend_model.cc:691] ERROR: Failed to create instance: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize
I0627 23:07:19.742043 1660 python_be.cc:2360] TRITONBACKEND_ModelFinalize: delete model state
E0627 23:07:19.742111 1660 model_lifecycle.cc:638] failed to load 'postprocessing' version 1: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize
I0627 23:07:19.742145 1660 model_lifecycle.cc:692] OnLoadComplete() 'postprocessing' version 1
I0627 23:07:19.742177 1660 model_lifecycle.cc:730] OnLoadFinal() 'postprocessing' for all version(s)
I0627 23:07:19.742193 1660 model_lifecycle.cc:773] failed to load 'postprocessing'
I0627 23:07:19.742284 1660 model_lifecycle.cc:294] VersionStates() 'postprocessing'
I0627 23:07:19.742342 1660 model_lifecycle.cc:294] VersionStates() 'preprocessing'
[TensorRT-LLM][INFO] TRTGptModel maxNumSequences: 1
[TensorRT-LLM][INFO] TRTGptModel maxBatchSize: 1
[TensorRT-LLM][INFO] TRTGptModel mMaxAttentionWindowSize: 2560
[TensorRT-LLM][INFO] TRTGptModel enableTrtOverlap: 0
[TensorRT-LLM][INFO] TRTGptModel normalizeLogProbs: 1
[TensorRT-LLM][INFO] Loaded engine size: 15319 MiB
[TensorRT-LLM][INFO] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 15364, GPU 15656 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] Init cuDNN: CPU +2, GPU +10, now: CPU 15366, GPU 15666 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in engine deserialization: CPU +0, GPU +15316, now: CPU 0, GPU 15316 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 15398, GPU 19536 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] Init cuDNN: CPU +0, GPU +8, now: CPU 15398, GPU 19544 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 15316 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] Init cuBLAS/cuBLASLt: CPU +0, GPU +8, now: CPU 15419, GPU 19564 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] Init cuDNN: CPU +0, GPU +10, now: CPU 15419, GPU 19574 (MiB)
[TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 15316 (MiB)
[TensorRT-LLM][WARNING] Both freeGpuMemoryFraction (aka kv_cache_free_gpu_mem_fraction) and maxTokens (aka max_tokens_in_paged_kv_cache) are set (to 0.900000 and 2560, respectively). The smaller value will be used.
[TensorRT-LLM][INFO] Allocate 335544320 bytes for k/v cache.
[TensorRT-LLM][INFO] Using 2560 total tokens in paged KV cache, and 20 blocks per sequence
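The 335544320-byte allocation is consistent with the 2560-token cap winning over the 0.9 memory fraction, as the warning above states. A back-of-envelope check, assuming the Llama-3-8B geometry commonly attributed to Meta-Llama-Guard-2-8B (32 layers, 8 GQA KV heads, head dim 128, fp16 cache; these architecture numbers are assumptions, not in the log):

```python
# Sanity-check the KV cache figures reported by TensorRT-LLM above.
layers, kv_heads, head_dim, dtype_bytes = 32, 8, 128, 2      # assumed geometry
per_token = 2 * layers * kv_heads * head_dim * dtype_bytes   # K and V planes
assert per_token == 131072                                   # 128 KiB per token

tokens = 2560        # max_tokens_in_paged_kv_cache, the smaller of the two caps
assert tokens * per_token == 335544320                       # matches the log

tokens_per_block = 128                                       # assumed block size
assert tokens // tokens_per_block == 20                      # "20 blocks per sequence"
```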
I0627 23:07:28.130344 1660 backend_model_instance.cc:772] Starting backend thread for tensorrt_llm_0_0 at nice 0 on device 0...
I0627 23:07:28.130538 1660 backend_model.cc:674] Created model instance named 'tensorrt_llm_0_0' with device id '0'
I0627 23:07:28.130650 1660 model_lifecycle.cc:692] OnLoadComplete() 'tensorrt_llm' version 1
I0627 23:07:28.130665 1660 model_lifecycle.cc:730] OnLoadFinal() 'tensorrt_llm' for all version(s)
I0627 23:07:28.130681 1660 model_lifecycle.cc:835] successfully loaded 'tensorrt_llm'
I0627 23:07:28.130680 1660 dynamic_batch_scheduler.cc:297] Starting dynamic-batcher thread for tensorrt_llm at nice 0...
I0627 23:07:28.130739 1660 model_lifecycle.cc:294] VersionStates() 'tensorrt_llm'
I0627 23:07:28.130761 1660 model_lifecycle.cc:294] VersionStates() 'tensorrt_llm_bls'
I0627 23:07:28.130786 1660 model_lifecycle.cc:390] AsyncUnload() 'ensemble'
E0627 23:07:28.130794 1660 model_repository_manager.cc:579] Invalid argument: ensemble 'ensemble' depends on 'postprocessing' which has no loaded version. Model 'postprocessing' loading failed with error: version 1 is at UNAVAILABLE state: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize
;
I0627 23:07:28.131028 1660 server.cc:607]
+------------------+------+
| Repository Agent | Path |
+------------------+------+
+------------------+------+
I0627 23:07:28.131084 1660 server.cc:634]
+-------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+
| Backend     | Path                                                            | Config                                                                                                                             |
+-------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+
| python      | /opt/tritonserver/backends/python/libtriton_python.so          | {"cmdline":{"auto-complete-config":"true","backend-directory":"/opt/tritonserver/backends","min-compute-capability":"6.000000","de |
|             |                                                                 | fault-max-batch-size":"4"}}                                                                                                        |
| tensorrtllm | /opt/tritonserver/backends/tensorrtllm/libtriton_tensorrtllm.so | {"cmdline":{"auto-complete-config":"true","backend-directory":"/opt/tritonserver/backends","min-compute-capability":"6.000000","de |
|             |                                                                 | fault-max-batch-size":"4"}}                                                                                                        |
+-------------+-----------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------+
I0627 23:07:28.131103 1660 model_lifecycle.cc:273] ModelStates()
I0627 23:07:28.131154 1660 server.cc:677]
+------------------+---------+-------------------------------------------------------------------------------------------------------------------+
| Model            | Version | Status                                                                                                            |
+------------------+---------+-------------------------------------------------------------------------------------------------------------------+
| postprocessing   | 1       | UNAVAILABLE: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType                         |
|                  |         |                                                                                                                   |
|                  |         | At:                                                                                                               |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor  |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__           |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained          |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained           |
|                  |         | /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize                       |
| preprocessing    | 1       | UNAVAILABLE: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType                         |
|                  |         |                                                                                                                   |
|                  |         | At:                                                                                                               |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor  |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__           |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained          |
|                  |         | /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained           |
|                  |         | /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py(74): initialize                        |
| tensorrt_llm     | 1       | READY                                                                                                             |
| tensorrt_llm_bls | 1       | READY                                                                                                             |
+------------------+---------+-------------------------------------------------------------------------------------------------------------------+
I0627 23:07:28.172692 1660 metrics.cc:877] Collecting metrics for GPU 0: NVIDIA A10G
I0627 23:07:28.175665 1660 metrics.cc:770] Collecting CPU metrics
I0627 23:07:28.175987 1660 tritonserver.cc:2508]
+----------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Option                           | Value                                                                                                                                                                            |
+----------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| server_id                        | triton                                                                                                                                                                           |
| server_version                   | 2.43.0                                                                                                                                                                           |
| server_extensions                | classification sequence model_repository model_repository(unload_dependents) schedule_policy model_configuration system_shared_memory cuda_shared_memory binary_tensor_data para |
|                                  | meters statistics trace logging                                                                                                                                                  |
| model_repository_path[0]         | /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B                                                                                                                                 |
| model_control_mode               | MODE_EXPLICIT                                                                                                                                                                    |
| startup_models_0                 | ensemble                                                                                                                                                                         |
| startup_models_1                 | postprocessing                                                                                                                                                                   |
| startup_models_2                 | preprocessing                                                                                                                                                                    |
| startup_models_3                 | tensorrt_llm                                                                                                                                                                     |
| startup_models_4                 | tensorrt_llm_bls                                                                                                                                                                 |
| strict_model_config              | 0                                                                                                                                                                                |
| rate_limit                       | OFF                                                                                                                                                                              |
| pinned_memory_pool_byte_size     | 268435456                                                                                                                                                                        |
| cuda_memory_pool_byte_size{0}    | 67108864                                                                                                                                                                         |
| min_supported_compute_capability | 6.0                                                                                                                                                                              |
| strict_readiness                 | 1                                                                                                                                                                                |
| exit_timeout                     | 30                                                                                                                                                                               |
| cache_enabled                    | 0                                                                                                                                                                                |
+----------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
I0627 23:07:28.176092 1660 server.cc:307] Waiting for in-flight requests to complete.
I0627 23:07:28.176103 1660 model_lifecycle.cc:223] StopAllModels()
I0627 23:07:28.176118 1660 model_lifecycle.cc:241] InflightStatus()
I0627 23:07:28.176129 1660 server.cc:323] Timeout 30: Found 0 model versions that have in-flight inferences
I0627 23:07:28.176592 1660 model_lifecycle.cc:390] AsyncUnload() 'ensemble'
I0627 23:07:28.176625 1660 model_lifecycle.cc:390] AsyncUnload() 'postprocessing'
I0627 23:07:28.176635 1660 model_lifecycle.cc:390] AsyncUnload() 'preprocessing'
I0627 23:07:28.176645 1660 model_lifecycle.cc:390] AsyncUnload() 'tensorrt_llm'
I0627 23:07:28.176717 1660 model_lifecycle.cc:390] AsyncUnload() 'tensorrt_llm_bls'
I0627 23:07:28.176762 1660 dynamic_batch_scheduler.cc:454] Stopping dynamic-batcher thread for tensorrt_llm...
I0627 23:07:28.176814 1660 server.cc:338] All models are stopped, unloading models
I0627 23:07:28.176829 1660 model_lifecycle.cc:190] LiveModelStates()
I0627 23:07:28.176836 1660 model_lifecycle.cc:265] BackgroundModelsSize()
I0627 23:07:28.176841 1660 server.cc:347] Timeout 30: Found 2 live models and 0 in-flight non-inference requests
I0627 23:07:28.176864 1660 server.cc:353] tensorrt_llm v1: UNLOADING
I0627 23:07:28.176871 1660 server.cc:353] tensorrt_llm_bls v1: UNLOADING
I0627 23:07:28.176876 1660 backend_model_instance.cc:795] Stopping backend thread for tensorrt_llm_bls_0_0...
I0627 23:07:28.176894 1660 backend_model_instance.cc:795] Stopping backend thread for tensorrt_llm_0_0...
I0627 23:07:28.176951 1660 python_be.cc:2545] TRITONBACKEND_ModelInstanceFinalize: delete instance state
I0627 23:07:28.219896 1660 model_lifecycle.cc:618] OnDestroy callback() 'tensorrt_llm' version 1
I0627 23:07:28.219934 1660 model_lifecycle.cc:620] successfully unloaded 'tensorrt_llm' version 1
I0627 23:07:29.176968 1660 model_lifecycle.cc:190] LiveModelStates()
I0627 23:07:29.176995 1660 model_lifecycle.cc:265] BackgroundModelsSize()
I0627 23:07:29.177001 1660 server.cc:347] Timeout 29: Found 1 live models and 0 in-flight non-inference requests
I0627 23:07:29.177006 1660 server.cc:353] tensorrt_llm_bls v1: UNLOADING
Cleaning up...
I0627 23:07:29.429256 1660 python_be.cc:2360] TRITONBACKEND_ModelFinalize: delete model state
I0627 23:07:29.429347 1660 model_lifecycle.cc:618] OnDestroy callback() 'tensorrt_llm_bls' version 1
I0627 23:07:29.429361 1660 model_lifecycle.cc:620] successfully unloaded 'tensorrt_llm_bls' version 1
I0627 23:07:30.177116 1660 model_lifecycle.cc:190] LiveModelStates()
I0627 23:07:30.177156 1660 model_lifecycle.cc:265] BackgroundModelsSize()
I0627 23:07:30.177162 1660 server.cc:347] Timeout 28: Found 0 live models and 0 in-flight non-inference requests
I0627 23:07:30.265508 1660 backend_manager.cc:138] unloading backend 'tensorrtllm'
I0627 23:07:30.265540 1660 backend_manager.cc:138] unloading backend 'python'
I0627 23:07:30.265547 1660 python_be.cc:2317] TRITONBACKEND_Finalize: Start
I0627 23:07:30.265807 1660 python_be.cc:2322] TRITONBACKEND_Finalize: End
error: creating server: Invalid argument - load failed for model 'preprocessing': version 1 is at UNAVAILABLE state: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/preprocessing/1/model.py(74): initialize
;
load failed for model 'postprocessing': version 1 is at UNAVAILABLE state: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize
;
load failed for model 'ensemble': ensemble 'ensemble' depends on 'postprocessing' which has no loaded version. Model 'postprocessing' loading failed with error: version 1 is at UNAVAILABLE state: Internal: TypeError: expected str, bytes or os.PathLike object, not NoneType
At:
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(206): get_spm_processor
  /usr/local/lib/python3.10/dist-packages/transformers/models/llama/tokenization_llama.py(178): __init__
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2261): _from_pretrained
  /usr/local/lib/python3.10/dist-packages/transformers/tokenization_utils_base.py(2028): from_pretrained
  /tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B/postprocessing/1/model.py(73): initialize
;
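In summary: the engine (tensorrt_llm) and BLS models load fine, but because the server runs with model_control_mode MODE_EXPLICIT, strict_readiness 1, and all five models listed as startup models (see the options table above), the two tokenizer load failures abort server creation and trigger the unload sequence. A hedged sketch of the likely fix, assuming the v0.8.0-era templates where tokenizer_type "auto" selects AutoTokenizer; if your model.py lacks that branch, edit its initialize() to call AutoTokenizer directly:

```python
# Hypothetical one-off script: switch the tokenizer_type parameter from
# "llama" to "auto" in both tokenizer-owning models, so the fast
# tokenizer.json that this checkpoint ships is actually used.
from pathlib import Path

repo = Path("/tensorrt/triton-repos/trt-Meta-Llama-Guard-2-8B")
for model in ("preprocessing", "postprocessing"):
    cfg = repo / model / "config.pbtxt"
    cfg.write_text(
        cfg.read_text().replace('string_value: "llama"', 'string_value: "auto"')
    )
```

Alternatively, dropping a SentencePiece tokenizer.model into /tensorrt/models/Meta-Llama-Guard-2-8B would satisfy LlamaTokenizer, but Llama 3 family checkpoints do not provide one, so switching to the auto/fast tokenizer path is the practical route.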