diff --git a/doc/markdown-doc/cpp-inference.md b/doc/markdown-doc/cpp-inference.md index 336cc92f..469250b7 100644 --- a/doc/markdown-doc/cpp-inference.md +++ b/doc/markdown-doc/cpp-inference.md @@ -46,6 +46,8 @@ Then you'll see the output images in the `cmake-build/Linux` folder. If you cannot execute the binary file successfully, you can refer to the `tensorrt.log` for more details. +> To see the best performance, test `example-batch-detector` with a large amount of data, or run it more than once to avoid cold starts. + ### Speed Results If you want to share your results on your machine, welcome to PR! diff --git a/src/post_process.hpp b/src/post_process.hpp index b38732c3..42c05769 100644 --- a/src/post_process.hpp +++ b/src/post_process.hpp @@ -134,8 +134,6 @@ template class peak_finder_t ksize(ksize), smoothed_cpu(channel, height, width), pooled_cpu(channel, height, width), - pool_input_gpu(channel, height, width), - pooled_gpu(channel, height, width), same_max_pool_3x3_gpu(1, channel, height, width, 3, 3) { // std::cout << "Appread Once\n" << '\n'; @@ -154,6 +152,8 @@ template class peak_finder_t if (use_gpu) { TRACE_SCOPE("find_peak_coords::max pooling on GPU"); + ttl::cuda_tensor pool_input_gpu(channel, height, width), + pooled_gpu(channel, height, width); ttl::copy(ttl::ref(pool_input_gpu), ttl::view(smoothed_cpu)); // FIXME: pass ttl::tensor_{ref/view} same_max_pool_3x3_gpu(pool_input_gpu.data(), pooled_gpu.data()); @@ -211,8 +211,6 @@ template class peak_finder_t ttl::tensor smoothed_cpu; ttl::tensor pooled_cpu; - ttl::cuda_tensor pool_input_gpu; - ttl::cuda_tensor pooled_gpu; Pool_NCHW_PaddingSame_Max same_max_pool_3x3_gpu; -}; +}; \ No newline at end of file