From d60e5d72e8fbfa60486c1ab7151786bc0d407a24 Mon Sep 17 00:00:00 2001 From: Stephen Nicholas Swatman Date: Fri, 30 Aug 2024 15:11:53 +0200 Subject: [PATCH] Fuse inner loop kernels in device CKF The inner loop of the device CKF consists of five loops: material interaction application, measurement counting, candidate finding, hole writing, and propagation. I believe that the middle three can be easily merged into a single kernel, reducing the amount of work we have to do on the host and simplifying thd code a lot. This commit makes that change. --- .../include/traccc/finding/finding_config.hpp | 6 - device/common/CMakeLists.txt | 4 - .../edm/device/finding_global_counter.hpp | 3 - .../finding/device/add_links_for_holes.hpp | 31 -- .../finding/device/count_measurements.hpp | 48 --- .../traccc/finding/device/find_tracks.hpp | 32 +- .../device/impl/add_links_for_holes.ipp | 90 ----- .../device/impl/count_measurements.ipp | 57 --- .../finding/device/impl/find_tracks.ipp | 329 ++++++++++++------ device/cuda/src/finding/finding_algorithm.cu | 160 ++------- 10 files changed, 283 insertions(+), 477 deletions(-) delete mode 100644 device/common/include/traccc/finding/device/add_links_for_holes.hpp delete mode 100644 device/common/include/traccc/finding/device/count_measurements.hpp delete mode 100644 device/common/include/traccc/finding/device/impl/add_links_for_holes.ipp delete mode 100644 device/common/include/traccc/finding/device/impl/count_measurements.ipp diff --git a/core/include/traccc/finding/finding_config.hpp b/core/include/traccc/finding/finding_config.hpp index 3c670a6069..cac0855f0d 100644 --- a/core/include/traccc/finding/finding_config.hpp +++ b/core/include/traccc/finding/finding_config.hpp @@ -48,12 +48,6 @@ struct finding_config { /// Particle hypothesis detray::pdg_particle ptc_hypothesis = detray::muon(); - - /**************************** - * GPU-specfic parameters - ****************************/ - /// The number of measurements to be iterated per thread - unsigned int n_measurements_per_thread = 8; }; } // namespace traccc diff --git a/device/common/CMakeLists.txt b/device/common/CMakeLists.txt index 735e9b56c0..5e13be3086 100644 --- a/device/common/CMakeLists.txt +++ b/device/common/CMakeLists.txt @@ -63,17 +63,13 @@ traccc_add_library( traccc_device_common device_common TYPE SHARED # Track finding funtions(s). "include/traccc/finding/device/apply_interaction.hpp" "include/traccc/finding/device/build_tracks.hpp" - "include/traccc/finding/device/count_measurements.hpp" "include/traccc/finding/device/find_tracks.hpp" - "include/traccc/finding/device/add_links_for_holes.hpp" "include/traccc/finding/device/make_barcode_sequence.hpp" "include/traccc/finding/device/propagate_to_next_surface.hpp" "include/traccc/finding/device/prune_tracks.hpp" "include/traccc/finding/device/impl/apply_interaction.ipp" "include/traccc/finding/device/impl/build_tracks.ipp" - "include/traccc/finding/device/impl/count_measurements.ipp" "include/traccc/finding/device/impl/find_tracks.ipp" - "include/traccc/finding/device/impl/add_links_for_holes.ipp" "include/traccc/finding/device/impl/make_barcode_sequence.ipp" "include/traccc/finding/device/impl/propagate_to_next_surface.ipp" "include/traccc/finding/device/impl/prune_tracks.ipp" diff --git a/device/common/include/traccc/edm/device/finding_global_counter.hpp b/device/common/include/traccc/edm/device/finding_global_counter.hpp index 81143854cf..d58b9b3523 100644 --- a/device/common/include/traccc/edm/device/finding_global_counter.hpp +++ b/device/common/include/traccc/edm/device/finding_global_counter.hpp @@ -11,9 +11,6 @@ namespace traccc::device { struct finding_global_counter { - // Sum of the number of measurements for every parameter - unsigned int n_measurements_sum; - // Number of found measurements for the current step unsigned int n_candidates; diff --git a/device/common/include/traccc/finding/device/add_links_for_holes.hpp b/device/common/include/traccc/finding/device/add_links_for_holes.hpp deleted file mode 100644 index df356f9552..0000000000 --- a/device/common/include/traccc/finding/device/add_links_for_holes.hpp +++ /dev/null @@ -1,31 +0,0 @@ -/** TRACCC library, part of the ACTS project (R&D line) - * - * (c) 2024 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -#pragma once - -// Project include(s). -#include "traccc/definitions/qualifiers.hpp" - -namespace traccc::device { - -/// Function to add a dummy link in case of a hole - -TRACCC_DEVICE inline void add_links_for_holes( - std::size_t globalIndex, - vecmem::data::vector_view n_candidates_view, - bound_track_parameters_collection_types::const_view in_params_view, - vecmem::data::vector_view prev_links_view, - vecmem::data::vector_view prev_param_to_link_view, - const unsigned int step, const unsigned int& n_max_candidates, - bound_track_parameters_collection_types::view out_params_view, - vecmem::data::vector_view links_view, - unsigned int& n_total_candidates); - -} // namespace traccc::device - -// Include the implementation. -#include "traccc/finding/device/impl/add_links_for_holes.ipp" \ No newline at end of file diff --git a/device/common/include/traccc/finding/device/count_measurements.hpp b/device/common/include/traccc/finding/device/count_measurements.hpp deleted file mode 100644 index 041ad31670..0000000000 --- a/device/common/include/traccc/finding/device/count_measurements.hpp +++ /dev/null @@ -1,48 +0,0 @@ -/** TRACCC library, part of the ACTS project (R&D line) - * - * (c) 2023 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -#pragma once - -// Project include(s). -#include "traccc/definitions/qualifiers.hpp" - -// Thrust include(s) -#include - -// System include(s) -#include - -namespace traccc::device { - -/// Function evalulating the number of measurements to be iterated per thread -/// and the total number of measurements -/// -/// @param[in] globalIndex The index of the current thread -/// @param[in] params_view Input parameters view object -/// @param[in] barcodes_view Barcodes view object -/// @param[in] upper_bounds Upper bounds of measurements w.r.t geometry -/// ID -/// @param[out] n_measurements_view The number of measurements per parameter -/// @param[out] ref_meas_idx The first index of measurements per -/// parameter -/// @param[out] n_measurements_sum The sum of the number of measurements per -/// parameter -/// -TRACCC_DEVICE inline void count_measurements( - std::size_t globalIndex, - bound_track_parameters_collection_types::const_view params_view, - vecmem::data::vector_view barcodes_view, - vecmem::data::vector_view upper_bounds_view, - const unsigned int n_in_params, - vecmem::data::vector_view n_measurements_view, - vecmem::data::vector_view ref_meas_idx_view, - unsigned int& n_measurements_sum); - -} // namespace traccc::device - -// Include the implementation. -#include "traccc/finding/device/impl/count_measurements.ipp" diff --git a/device/common/include/traccc/finding/device/find_tracks.hpp b/device/common/include/traccc/finding/device/find_tracks.hpp index 96fc3e8ece..619e00b961 100644 --- a/device/common/include/traccc/finding/device/find_tracks.hpp +++ b/device/common/include/traccc/finding/device/find_tracks.hpp @@ -10,6 +10,8 @@ // Project include(s). #include "traccc/definitions/primitives.hpp" #include "traccc/definitions/qualifiers.hpp" +#include "traccc/device/concepts/barrier.hpp" +#include "traccc/device/concepts/thread_id.hpp" #include "traccc/edm/measurement.hpp" #include "traccc/edm/track_parameters.hpp" @@ -22,43 +24,47 @@ namespace traccc::device { /// If the chi2 of the measurement < chi2_max, its measurement index and the /// index of the link from the previous step are added to the link container. /// -/// @param[in] globalIndex The index of the current thread +/// @param[in] thread_id A thread identifier object +/// @param[in] barrier A block-wide barrier /// @param[in] cfg Track finding config object /// @param[in] det_data Detector view object /// @param[in] measurements_view Measurements container view +/// @param[in] in_params_view Input parameters +/// @param[in] n_in_params The number of input params +/// @param[in] barcodes_view View of a measurement -> barcode map /// @param[in] upper_bounds_view Upper bounds of measurements unique w.r.t /// barcode -/// @param[in] in_params_view Input parameters -/// @param[in] n_measurements_prefix_sum_view Prefix sum of the number of -/// measurements per parameter -/// @param[in] ref_meas_idx_view The first index of measurements per parameter /// @param[in] prev_links_view link container from the previous step /// @param[in] prev_param_to_link_view param_to_link container from the /// previous step /// @param[in] step Step index /// @param[in] n_max_candidates Number of maximum candidates /// @param[out] out_params_view Output parameters -/// @param[out] n_candidates_view Number of candidates per input parameter /// @param[out] links_view link container for the current step /// @param[out] n_total_candidates The number of total candidates for the /// current step +/// @param shared_num_candidates Shared memory scratch space +/// @param shared_candidates Shared memory scratch space +/// @param shared_candidates_size Shared memory scratch space /// -template +template TRACCC_DEVICE inline void find_tracks( - std::size_t globalIndex, const config_t cfg, + thread_id_t& thread_id, barrier_t& barrier, const config_t cfg, typename detector_t::view_type det_data, measurement_collection_types::const_view measurements_view, bound_track_parameters_collection_types::const_view in_params_view, - vecmem::data::vector_view - n_measurements_prefix_sum_view, - vecmem::data::vector_view ref_meas_idx_view, + const unsigned int n_in_params, + vecmem::data::vector_view barcodes_view, + vecmem::data::vector_view upper_bounds_view, vecmem::data::vector_view prev_links_view, vecmem::data::vector_view prev_param_to_link_view, const unsigned int step, const unsigned int& n_max_candidates, bound_track_parameters_collection_types::view out_params_view, - vecmem::data::vector_view n_candidates_view, vecmem::data::vector_view links_view, - unsigned int& n_total_candidates); + unsigned int& n_total_candidates, unsigned int* shared_num_candidates, + std::pair* shared_candidates, + unsigned int& shared_candidates_size); } // namespace traccc::device diff --git a/device/common/include/traccc/finding/device/impl/add_links_for_holes.ipp b/device/common/include/traccc/finding/device/impl/add_links_for_holes.ipp deleted file mode 100644 index 511f43bd81..0000000000 --- a/device/common/include/traccc/finding/device/impl/add_links_for_holes.ipp +++ /dev/null @@ -1,90 +0,0 @@ -/** TRACCC library, part of the ACTS project (R&D line) - * - * (c) 2024 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -#pragma once - -#include - -namespace traccc::device { - -TRACCC_DEVICE inline void add_links_for_holes( - std::size_t globalIndex, - vecmem::data::vector_view n_candidates_view, - bound_track_parameters_collection_types::const_view in_params_view, - vecmem::data::vector_view prev_links_view, - vecmem::data::vector_view prev_param_to_link_view, - const unsigned int step, const unsigned int& n_max_candidates, - bound_track_parameters_collection_types::view out_params_view, - vecmem::data::vector_view links_view, - unsigned int& n_total_candidates) { - - // Number of candidates per parameter - vecmem::device_vector n_candidates(n_candidates_view); - - if (globalIndex >= n_candidates.size()) { - return; - } - - // Input parameters - bound_track_parameters_collection_types::const_device in_params( - in_params_view); - - // Previous links - vecmem::device_vector prev_links(prev_links_view); - - // Previous param_to_link - vecmem::device_vector prev_param_to_link( - prev_param_to_link_view); - - // Output parameters - bound_track_parameters_collection_types::device out_params(out_params_view); - - // Links - vecmem::device_vector links(links_view); - - // Last step ID - const candidate_link::link_index_type::first_type previous_step = - (step == 0) ? std::numeric_limits< - candidate_link::link_index_type::first_type>::max() - : step - 1; - - if (n_candidates[globalIndex] == 0u) { - - // Add measurement candidates to link - vecmem::device_atomic_ref num_total_candidates( - n_total_candidates); - - const unsigned int l_pos = num_total_candidates.fetch_add(1); - - if (l_pos >= n_max_candidates) { - - n_total_candidates = n_max_candidates; - return; - } - - // Seed id - unsigned int orig_param_id = - (step == 0 ? globalIndex - : prev_links[prev_param_to_link[globalIndex]].seed_idx); - // Skip counter - unsigned int skip_counter = - (step == 0 ? 0 - : prev_links[prev_param_to_link[globalIndex]].n_skipped); - - // Add a dummy link - links.at(l_pos) = { - {previous_step, globalIndex}, - std::numeric_limits().meas_idx)>>::max(), - orig_param_id, - skip_counter + 1}; - - out_params.at(l_pos) = in_params.at(globalIndex); - } -} - -} // namespace traccc::device diff --git a/device/common/include/traccc/finding/device/impl/count_measurements.ipp b/device/common/include/traccc/finding/device/impl/count_measurements.ipp deleted file mode 100644 index 0fe0402532..0000000000 --- a/device/common/include/traccc/finding/device/impl/count_measurements.ipp +++ /dev/null @@ -1,57 +0,0 @@ -/** TRACCC library, part of the ACTS project (R&D line) - * - * (c) 2023 CERN for the benefit of the ACTS project - * - * Mozilla Public License Version 2.0 - */ - -#pragma once - -namespace traccc::device { - -TRACCC_DEVICE inline void count_measurements( - std::size_t globalIndex, - bound_track_parameters_collection_types::const_view params_view, - vecmem::data::vector_view barcodes_view, - vecmem::data::vector_view upper_bounds_view, - const unsigned int n_in_params, - vecmem::data::vector_view n_measurements_view, - vecmem::data::vector_view ref_meas_idx_view, - unsigned int& n_measurements_sum) { - - bound_track_parameters_collection_types::const_device params(params_view); - vecmem::device_vector barcodes( - barcodes_view); - vecmem::device_vector upper_bounds(upper_bounds_view); - vecmem::device_vector n_measurements(n_measurements_view); - vecmem::device_vector ref_meas_idx(ref_meas_idx_view); - - if (globalIndex >= n_in_params) { - return; - } - - // Get barcode - const auto bcd = params.at(globalIndex).surface_link(); - const auto lo = - thrust::lower_bound(thrust::seq, barcodes.begin(), barcodes.end(), bcd); - - // If barcode is not found (no measurement) - if (lo == barcodes.end()) { - return; - } - - const auto bcd_id = std::distance(barcodes.begin(), lo); - - // Get the reference measurement index and the number of measurements per - // parameter - ref_meas_idx.at(globalIndex) = - lo == barcodes.begin() ? 0u : upper_bounds[bcd_id - 1]; - n_measurements.at(globalIndex) = - upper_bounds[bcd_id] - ref_meas_idx.at(globalIndex); - - // Increase the total number of measurements with atomic addition - vecmem::device_atomic_ref n_meas_sum(n_measurements_sum); - n_meas_sum.fetch_add(n_measurements.at(globalIndex)); -} - -} // namespace traccc::device diff --git a/device/common/include/traccc/finding/device/impl/find_tracks.ipp b/device/common/include/traccc/finding/device/impl/find_tracks.ipp index 913b38d5b0..9c1c15b6b6 100644 --- a/device/common/include/traccc/finding/device/impl/find_tracks.ipp +++ b/device/common/include/traccc/finding/device/impl/find_tracks.ipp @@ -8,6 +8,9 @@ #pragma once // Project include(s). +#include "traccc/device/concepts/barrier.hpp" +#include "traccc/device/concepts/thread_id.hpp" +#include "traccc/finding/candidate_link.hpp" #include "traccc/fitting/kalman_filter/gain_matrix_updater.hpp" // System include(s). @@ -15,137 +18,265 @@ namespace traccc::device { -template +template TRACCC_DEVICE inline void find_tracks( - std::size_t globalIndex, const config_t cfg, + thread_id_t& thread_id, barrier_t& barrier, const config_t cfg, typename detector_t::view_type det_data, measurement_collection_types::const_view measurements_view, bound_track_parameters_collection_types::const_view in_params_view, - vecmem::data::vector_view - n_measurements_prefix_sum_view, - vecmem::data::vector_view ref_meas_idx_view, + const unsigned int n_in_params, + vecmem::data::vector_view barcodes_view, + vecmem::data::vector_view upper_bounds_view, vecmem::data::vector_view prev_links_view, vecmem::data::vector_view prev_param_to_link_view, const unsigned int step, const unsigned int& n_max_candidates, bound_track_parameters_collection_types::view out_params_view, - vecmem::data::vector_view n_candidates_view, vecmem::data::vector_view links_view, - unsigned int& n_total_candidates) { + unsigned int& n_total_candidates, unsigned int* shared_num_candidates, + std::pair* shared_candidates, + unsigned int& shared_candidates_size) { + + /* + * Initialize the block-shared data; in particular, set the total size of + * the candidate buffer to zero, and then set the number of candidates for + * each parameter to zero. + */ + if (thread_id.getLocalThreadIdX() == 0) { + shared_candidates_size = 0; + } - // Detector - detector_t det(det_data); + shared_num_candidates[thread_id.getLocalThreadIdX()] = 0; - // Measurement - measurement_collection_types::const_device measurements(measurements_view); + barrier.blockBarrier(); - // Input parameters + /* + * Initialize all of the device vectors from their vecmem views. + */ + detector_t det(det_data); + measurement_collection_types::const_device measurements(measurements_view); bound_track_parameters_collection_types::const_device in_params( in_params_view); - - // Previous links vecmem::device_vector prev_links(prev_links_view); - - // Previous param_to_link vecmem::device_vector prev_param_to_link( prev_param_to_link_view); - - // Output parameters bound_track_parameters_collection_types::device out_params(out_params_view); - - // Number of candidates per parameter - vecmem::device_vector n_candidates(n_candidates_view); - - // Links vecmem::device_vector links(links_view); - - // Prefix sum of the number of measurements per parameter - vecmem::device_vector n_measurements_prefix_sum( - n_measurements_prefix_sum_view); - - // Reference (first) measurement index per parameter - vecmem::device_vector ref_meas_idx(ref_meas_idx_view); - - // Last step ID + vecmem::device_atomic_ref num_total_candidates( + n_total_candidates); + vecmem::device_vector barcodes( + barcodes_view); + vecmem::device_vector upper_bounds(upper_bounds_view); + + /* + * Compute the last step ID, using a sentinel value if the current step is + * step 0. + */ const candidate_link::link_index_type::first_type previous_step = (step == 0) ? std::numeric_limits< candidate_link::link_index_type::first_type>::max() : step - 1; - const unsigned int n_measurements_sum = n_measurements_prefix_sum.back(); - const unsigned int stride = globalIndex * cfg.n_measurements_per_thread; - - vecmem::device_vector::iterator lo1; - - for (unsigned int i_meas = 0; i_meas < cfg.n_measurements_per_thread; - i_meas++) { - const unsigned int idx = stride + i_meas; - - if (idx >= n_measurements_sum) { - break; + const unsigned int in_param_id = thread_id.getGlobalThreadIdX(); + + /* + * Step 1 of this kernel is to determine which indices belong to which + * parameter. Because the measurements are guaranteed to be grouped, we can + * simply find the first measurement's index and the total number of + * indices. + * + * This entire step is executed on a one-thread-one-parameter model. + */ + unsigned int init_meas; + unsigned int num_meas = 0; + + if (in_param_id < n_in_params) { + /* + * Get the barcode of this thread's parameters, then find the first + * measurement that matches it. + */ + const auto bcd = in_params.at(in_param_id).surface_link(); + const auto lo = thrust::lower_bound(thrust::seq, barcodes.begin(), + barcodes.end(), bcd); + + /* + * If we cannot find any corresponding measurements, set the number of + * measurements to zero. + */ + if (lo == barcodes.end()) { + init_meas = 0; + } + /* + * If measurements are found, use the previously (outside this kernel) + * computed upper bound array to compute the range of measurements for + * this thread. + */ + else { + const auto bcd_id = std::distance(barcodes.begin(), lo); + + init_meas = lo == barcodes.begin() ? 0u : upper_bounds[bcd_id - 1]; + num_meas = upper_bounds[bcd_id] - init_meas; } + } - if (i_meas == 0 || idx == *lo1) { - lo1 = thrust::lower_bound(thrust::seq, - n_measurements_prefix_sum.begin(), - n_measurements_prefix_sum.end(), idx + 1); + /* + * Step 2 of this kernel involves processing the candidate measurements and + * updating them on their corresponding surface. + * + * Because the number of measurements per parameter can vary wildly + * (between 0 and 20), a naive one-thread-one-parameter model would incur a + * lot of thread divergence here. Instead, we use a load-balanced model in + * which threads process each others' measurements. + * + * The core idea is that each thread places its measurements into a shared + * pool. We keep track of how many measurements each thread has placed into + * the pool. + */ + unsigned int curr_meas = 0; + + /* + * This loop keeps running until all threads have processed all of their + * measurements. + */ + while ( + barrier.blockOr(curr_meas < num_meas || shared_candidates_size > 0)) { + /* + * The outer loop consists of three general components. The first + * components is that each thread starts to fill a shared buffer of + * measurements. The buffer is twice the size of the block to + * accomodate any overflow. + * + * Threads insert their measurements into the shared buffer until they + * either run out of measurements, or until the shared buffer is full. + */ + for (; curr_meas < num_meas && + shared_candidates_size < thread_id.getBlockDimX(); + curr_meas++) { + unsigned int idx = + vecmem::device_atomic_ref(shared_candidates_size) + .fetch_add(1u); + + /* + * The buffer elemements are tuples of the measurement index and + * the index of the thread that originally inserted that + * measurement. + */ + shared_candidates[idx] = {init_meas + curr_meas, + thread_id.getLocalThreadIdX()}; } - const unsigned int in_param_id = - std::distance(n_measurements_prefix_sum.begin(), lo1); - const detray::geometry::barcode bcd = - in_params.at(in_param_id).surface_link(); - const unsigned int offset = - lo1 == n_measurements_prefix_sum.begin() ? idx : idx - *(lo1 - 1); - const unsigned int meas_idx = ref_meas_idx.at(in_param_id) + offset; - bound_track_parameters in_par = in_params.at(in_param_id); - - const auto& meas = measurements.at(meas_idx); - track_state trk_state(meas); - const detray::tracking_surface sf{det, bcd}; - - // Run the Kalman update - sf.template visit_mask< - gain_matrix_updater>(trk_state, - in_par); - // Get the chi-square - const auto chi2 = trk_state.filtered_chi2(); - - if (chi2 < cfg.chi2_max) { - - // Add measurement candidates to link - vecmem::device_atomic_ref num_total_candidates( - n_total_candidates); - - const unsigned int l_pos = num_total_candidates.fetch_add(1); - - if (l_pos >= n_max_candidates) { - n_total_candidates = n_max_candidates; - return; + barrier.blockBarrier(); + + /* + * The shared buffer is now full; each thread picks out zero or one of + * the measurements and processes it. + */ + if (thread_id.getLocalThreadIdX() < shared_candidates_size) { + const unsigned int owner_local_thread_id = + shared_candidates[thread_id.getLocalThreadIdX()].second; + const unsigned int owner_global_thread_id = + owner_local_thread_id + + thread_id.getBlockDimX() * thread_id.getBlockIdX(); + bound_track_parameters in_par = + in_params.at(owner_global_thread_id); + const unsigned int meas_idx = + shared_candidates[thread_id.getLocalThreadIdX()].first; + + const auto& meas = measurements.at(meas_idx); + + track_state trk_state(meas); + const detray::tracking_surface sf{det, in_par.surface_link()}; + + // Run the Kalman update + sf.template visit_mask< + gain_matrix_updater>( + trk_state, in_par); + // Get the chi-square + const auto chi2 = trk_state.filtered_chi2(); + + if (chi2 < cfg.chi2_max) { + // Add measurement candidates to link + const unsigned int l_pos = num_total_candidates.fetch_add(1); + + if (l_pos >= n_max_candidates) { + n_total_candidates = n_max_candidates; + } else { + if (step == 0) { + links.at(l_pos) = { + {previous_step, owner_global_thread_id}, + meas_idx, + owner_global_thread_id, + 0}; + } else { + const candidate_link& prev_link = prev_links + [prev_param_to_link[owner_global_thread_id]]; + + links.at(l_pos) = { + {previous_step, owner_global_thread_id}, + meas_idx, + prev_link.seed_idx, + prev_link.n_skipped}; + } + + // Increase the number of candidates (or branches) per input + // parameter + vecmem::device_atomic_ref( + shared_num_candidates[owner_local_thread_id]) + .fetch_add(1u); + + out_params.at(l_pos) = trk_state.filtered(); + } } + } - // Seed id - unsigned int orig_param_id = - (step == 0 - ? in_param_id - : prev_links[prev_param_to_link[in_param_id]].seed_idx); - // Skip counter - unsigned int skip_counter = - (step == 0 - ? 0 - : prev_links[prev_param_to_link[in_param_id]].n_skipped); - - links[l_pos] = {{previous_step, in_param_id}, - meas_idx, - orig_param_id, - skip_counter}; + barrier.blockBarrier(); + + /* + * The reason the buffer is twice the size of the block is that we + * might end up having some spill-over; this spill-over should be moved + * to the front of the buffer. + */ + shared_candidates[thread_id.getLocalThreadIdX()] = + shared_candidates[thread_id.getLocalThreadIdX() + + thread_id.getBlockDimX()]; + + if (thread_id.getLocalThreadIdX() == 0) { + if (shared_candidates_size >= thread_id.getBlockDimX()) { + shared_candidates_size -= thread_id.getBlockDimX(); + } else { + shared_candidates_size = 0; + } + } + } - // Increase the number of candidates (or branches) per input - // parameter - vecmem::device_atomic_ref num_candidates( - n_candidates[in_param_id]); - num_candidates.fetch_add(1); + /* + * Part three of the kernel inserts holes for parameters which did not + * match any measurements. + */ + if (in_param_id < n_in_params && + shared_num_candidates[thread_id.getLocalThreadIdX()] == 0u) { + // Add measurement candidates to link + const unsigned int l_pos = num_total_candidates.fetch_add(1); + + if (l_pos >= n_max_candidates) { + n_total_candidates = n_max_candidates; + } else { + if (step == 0) { + links.at(l_pos) = {{previous_step, in_param_id}, + std::numeric_limits::max(), + in_param_id, + 1}; + } else { + const candidate_link& prev_link = + prev_links[prev_param_to_link[in_param_id]]; + + links.at(l_pos) = {{previous_step, in_param_id}, + std::numeric_limits::max(), + prev_link.seed_idx, + prev_link.n_skipped + 1}; + } - out_params[l_pos] = trk_state.filtered(); + out_params.at(l_pos) = in_params.at(in_param_id); } } } diff --git a/device/cuda/src/finding/finding_algorithm.cu b/device/cuda/src/finding/finding_algorithm.cu index d1d6234868..213745fed5 100644 --- a/device/cuda/src/finding/finding_algorithm.cu +++ b/device/cuda/src/finding/finding_algorithm.cu @@ -7,17 +7,17 @@ // Project include(s). #include "../sanity/contiguous_on.cuh" +#include "../utils/barrier.hpp" #include "../utils/cuda_error_handling.hpp" #include "../utils/utils.hpp" #include "traccc/cuda/finding/finding_algorithm.hpp" +#include "traccc/cuda/utils/thread_id.hpp" #include "traccc/definitions/primitives.hpp" #include "traccc/definitions/qualifiers.hpp" #include "traccc/edm/device/finding_global_counter.hpp" #include "traccc/finding/candidate_link.hpp" -#include "traccc/finding/device/add_links_for_holes.hpp" #include "traccc/finding/device/apply_interaction.hpp" #include "traccc/finding/device/build_tracks.hpp" -#include "traccc/finding/device/count_measurements.hpp" #include "traccc/finding/device/find_tracks.hpp" #include "traccc/finding/device/make_barcode_sequence.hpp" #include "traccc/finding/device/propagate_to_next_surface.hpp" @@ -75,66 +75,37 @@ __global__ void apply_interaction( params_view); } -/// CUDA kernel for running @c traccc::device::count_measurements -__global__ void count_measurements( - bound_track_parameters_collection_types::const_view params_view, - vecmem::data::vector_view barcodes_view, - vecmem::data::vector_view upper_bounds_view, - const unsigned int n_in_params, - vecmem::data::vector_view n_measurements_view, - vecmem::data::vector_view ref_meas_idx_view, - unsigned int& n_measurements_sum) { - - int gid = threadIdx.x + blockIdx.x * blockDim.x; - - device::count_measurements( - gid, params_view, barcodes_view, upper_bounds_view, n_in_params, - n_measurements_view, ref_meas_idx_view, n_measurements_sum); -} - /// CUDA kernel for running @c traccc::device::find_tracks template __global__ void find_tracks( const config_t cfg, typename detector_t::view_type det_data, measurement_collection_types::const_view measurements_view, bound_track_parameters_collection_types::const_view in_params_view, - vecmem::data::vector_view - n_measurements_prefix_sum_view, - vecmem::data::vector_view ref_meas_idx_view, + const unsigned int n_in_params, + vecmem::data::vector_view barcodes_view, + vecmem::data::vector_view upper_bounds_view, vecmem::data::vector_view prev_links_view, vecmem::data::vector_view prev_param_to_link_view, const unsigned int step, const unsigned int n_max_candidates, bound_track_parameters_collection_types::view out_params_view, - vecmem::data::vector_view n_candidates_view, vecmem::data::vector_view links_view, unsigned int& n_candidates) { - - int gid = threadIdx.x + blockIdx.x * blockDim.x; - - device::find_tracks( - gid, cfg, det_data, measurements_view, in_params_view, - n_measurements_prefix_sum_view, ref_meas_idx_view, prev_links_view, + __shared__ unsigned int shared_candidates_size; + extern __shared__ unsigned int s[]; + unsigned int* shared_num_candidates = s; + std::pair* shared_candidates = + reinterpret_cast*>( + &shared_num_candidates[blockDim.x]); + + cuda::barrier barrier; + cuda::thread_id1 thread_id; + + device::find_tracks( + thread_id, barrier, cfg, det_data, measurements_view, in_params_view, + n_in_params, barcodes_view, upper_bounds_view, prev_links_view, prev_param_to_link_view, step, n_max_candidates, out_params_view, - n_candidates_view, links_view, n_candidates); -} - -/// CUDA kernel for running @c traccc::device::add_links_for_holes -__global__ void add_links_for_holes( - vecmem::data::vector_view n_candidates_view, - bound_track_parameters_collection_types::const_view in_params_view, - vecmem::data::vector_view prev_links_view, - vecmem::data::vector_view prev_param_to_link_view, - const unsigned int step, const unsigned int n_max_candidates, - bound_track_parameters_collection_types::view out_params_view, - vecmem::data::vector_view links_view, - unsigned int& n_total_candidates) { - - int gid = threadIdx.x + blockIdx.x * blockDim.x; - - device::add_links_for_holes(gid, n_candidates_view, in_params_view, - prev_links_view, prev_param_to_link_view, step, - n_max_candidates, out_params_view, links_view, - n_total_candidates); + links_view, n_candidates, shared_num_candidates, shared_candidates, + shared_candidates_size); } /// CUDA kernel for running @c traccc::device::propagate_to_next_surface @@ -353,49 +324,7 @@ finding_algorithm::operator()( TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); /***************************************************************** - * Kernel3: Count the number of measurements per parameter - ****************************************************************/ - - vecmem::data::vector_buffer n_measurements_buffer( - n_in_params, m_mr.main); - vecmem::device_vector n_measurements_device( - n_measurements_buffer); - thrust::fill(thrust::cuda::par.on(stream), - n_measurements_device.begin(), n_measurements_device.end(), - 0u); - - // Create a buffer for the first measurement index of parameter - vecmem::data::vector_buffer ref_meas_idx_buffer( - n_in_params, m_mr.main); - - nThreads = m_warp_size * 2; - nBlocks = (n_in_params + nThreads - 1) / nThreads; - kernels::count_measurements<<>>( - in_params_buffer, barcodes_buffer, upper_bounds_buffer, n_in_params, - n_measurements_buffer, ref_meas_idx_buffer, - (*global_counter_device).n_measurements_sum); - TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); - - // Global counter object: Device -> Host - TRACCC_CUDA_ERROR_CHECK( - cudaMemcpyAsync(&global_counter_host, global_counter_device.get(), - sizeof(device::finding_global_counter), - cudaMemcpyDeviceToHost, stream)); - - m_stream.synchronize(); - - // Create the buffer for the prefix sum of the number of measurements - // per parameter - vecmem::data::vector_buffer - n_measurements_prefix_sum_buffer(n_in_params, m_mr.main); - vecmem::device_vector n_measurements_prefix_sum( - n_measurements_prefix_sum_buffer); - thrust::inclusive_scan( - thrust::cuda::par.on(stream), n_measurements_device.begin(), - n_measurements_device.end(), n_measurements_prefix_sum.begin()); - - /***************************************************************** - * Kernel4: Find valid tracks + * Kernel3: Find valid tracks *****************************************************************/ // Buffer for kalman-updated parameters spawned by the measurement @@ -403,13 +332,6 @@ finding_algorithm::operator()( const unsigned int n_max_candidates = n_in_params * m_cfg.max_num_branches_per_surface; - vecmem::data::vector_buffer n_candidates_buffer{ - n_in_params, m_mr.main}; - vecmem::device_vector n_candidates_device( - n_candidates_buffer); - thrust::fill(thrust::cuda::par.on(stream), n_candidates_device.begin(), - n_candidates_device.end(), 0u); - bound_track_parameters_collection_types::buffer updated_params_buffer( n_in_params * m_cfg.max_num_branches_per_surface, m_mr.main); @@ -417,34 +339,20 @@ finding_algorithm::operator()( link_map[step] = {n_in_params * m_cfg.max_num_branches_per_surface, m_mr.main}; m_copy.setup(link_map[step]); - nBlocks = (global_counter_host.n_measurements_sum + - nThreads * m_cfg.n_measurements_per_thread - 1) / - (nThreads * m_cfg.n_measurements_per_thread); - - if (nBlocks > 0) { - kernels::find_tracks - <<>>( - m_cfg, det_view, measurements, in_params_buffer, - n_measurements_prefix_sum_buffer, ref_meas_idx_buffer, - link_map[prev_step], param_to_link_map[prev_step], step, - n_max_candidates, updated_params_buffer, - n_candidates_buffer, link_map[step], - (*global_counter_device).n_candidates); - TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); - } - - /***************************************************************** - * Kernel5: Add a dummy links in case of no branches - *****************************************************************/ - nBlocks = (n_in_params + nThreads - 1) / nThreads; if (nBlocks > 0) { - kernels::add_links_for_holes<<>>( - n_candidates_buffer, in_params_buffer, link_map[prev_step], - param_to_link_map[prev_step], step, n_max_candidates, - updated_params_buffer, link_map[step], - (*global_counter_device).n_candidates); + kernels::find_tracks + <<), + stream>>>(m_cfg, det_view, measurements, in_params_buffer, + n_in_params, barcodes_buffer, upper_bounds_buffer, + link_map[prev_step], param_to_link_map[prev_step], + step, n_max_candidates, updated_params_buffer, + link_map[step], + (*global_counter_device).n_candidates); TRACCC_CUDA_ERROR_CHECK(cudaGetLastError()); } @@ -457,7 +365,7 @@ finding_algorithm::operator()( m_stream.synchronize(); /***************************************************************** - * Kernel6: Propagate to the next surface + * Kernel4: Propagate to the next surface *****************************************************************/ // Buffer for out parameters for the next step @@ -569,7 +477,7 @@ finding_algorithm::operator()( } /***************************************************************** - * Kernel7: Build tracks + * Kernel5: Build tracks *****************************************************************/ // Create track candidate buffer