From fc1f7a0c9d23710899ae1ae8e6a317b9390c2ba7 Mon Sep 17 00:00:00 2001
From: Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>
Date: Wed, 26 Aug 2020 15:29:52 +0200
Subject: [PATCH 1/4] Made the CUDA seed finding process triplets for multiple
 middle spacepoints at once.

This was done with a **lot** of different changes, which were developed in
a separate branch. This is just a cleaned up version of all of those
developments.

The code now includes the ability to use CUDA streams, and now manages
CUDA device memory using its own manager class (Acts::Cuda::MemoryManager).
---
 Plugins/Cuda/CMakeLists.txt                   |   7 +
 .../Cuda/Seeding2/Details/FindTriplets.hpp    |  13 +-
 .../Plugins/Cuda/Seeding2/Details/Types.hpp   |  12 +-
 .../Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp |  15 +-
 .../Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp |  33 +-
 .../Acts/Plugins/Cuda/Utilities/Arrays.hpp    |  13 +
 .../Acts/Plugins/Cuda/Utilities/Info.hpp      |  73 ++++
 .../Plugins/Cuda/Utilities/MemoryManager.hpp  | 104 ++++++
 .../Plugins/Cuda/Utilities/StreamWrapper.hpp  |  64 ++++
 Plugins/Cuda/src/Seeding2/CountDublets.cu     |   4 +-
 Plugins/Cuda/src/Seeding2/FindTriplets.cu     | 336 +++++++++++-------
 Plugins/Cuda/src/Utilities/Arrays.cu          |  54 ++-
 Plugins/Cuda/src/Utilities/Info.cu            |  76 ++++
 Plugins/Cuda/src/Utilities/MemoryManager.cu   | 144 ++++++++
 Plugins/Cuda/src/Utilities/StreamHandlers.cuh |  26 ++
 Plugins/Cuda/src/Utilities/StreamWrapper.cu   |  75 ++++
 16 files changed, 889 insertions(+), 160 deletions(-)
 create mode 100644 Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Info.hpp
 create mode 100644 Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/MemoryManager.hpp
 create mode 100644 Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp
 create mode 100644 Plugins/Cuda/src/Utilities/Info.cu
 create mode 100644 Plugins/Cuda/src/Utilities/MemoryManager.cu
 create mode 100644 Plugins/Cuda/src/Utilities/StreamHandlers.cuh
 create mode 100644 Plugins/Cuda/src/Utilities/StreamWrapper.cu
diff --git a/Plugins/Cuda/CMakeLists.txt b/Plugins/Cuda/CMakeLists.txt
index 092c2ba5e1e..49e7c1d5c65 100644
--- a/Plugins/Cuda/CMakeLists.txt
+++ b/Plugins/Cuda/CMakeLists.txt
@@ -28,13 +28,20 @@ add_library(
   include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
   include/Acts/Plugins/Cuda/Seeding2/TripletFilterConfig.hpp
   include/Acts/Plugins/Cuda/Utilities/Arrays.hpp
+  include/Acts/Plugins/Cuda/Utilities/Info.hpp
+  include/Acts/Plugins/Cuda/Utilities/MemoryManager.hpp
+  include/Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp
   src/Seeding2/CountDublets.cu
   src/Seeding2/FindDublets.cu
   src/Seeding2/FindTriplets.cu
   src/Utilities/Arrays.cu
   src/Utilities/ErrorCheck.cuh
   src/Utilities/ErrorCheck.cu
+  src/Utilities/Info.cu
   src/Utilities/MatrixMacros.hpp
+  src/Utilities/MemoryManager.cu
+  src/Utilities/StreamHandlers.cuh
+  src/Utilities/StreamWrapper.cu
 )
 
 target_include_directories(
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/FindTriplets.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/FindTriplets.hpp
index 849061caf1b..e4b17d0f17c 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/FindTriplets.hpp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/FindTriplets.hpp
@@ -11,6 +11,7 @@
 // CUDA plugin include(s).
 #include "Acts/Plugins/Cuda/Seeding2/Details/Types.hpp"
 #include "Acts/Plugins/Cuda/Utilities/Arrays.hpp"
+#include "Acts/Plugins/Cuda/Utilities/Info.hpp"
 
 // System include(s).
 #include <cstddef>
@@ -34,6 +35,7 @@ namespace Details {
 /// that still need to be filtered through
 /// @c Acts::SeedFilter::filterSeeds_1SpFixed before returning it to the user.
 ///
+/// @param[in] device Properties of the device that the code will be running on
 /// @param[in] maxBlockSize The maximum block size to use on the GPU
 /// @param[in] dubletCounts The output object from
 ///            @c Acts::Cuda::Details::countDublets
@@ -70,11 +72,12 @@ namespace Details {
 ///         for each middle spacepoint
 ///
 std::vector<std::vector<Triplet> > findTriplets(
-    std::size_t maxBlockSize, const DubletCounts& dubletCounts,
-    const SeedFilterConfig& seedConfig, const TripletFilterConfig& filterConfig,
-    std::size_t nBottomSPs, const device_array<SpacePoint>& bottomSPs,
-    std::size_t nMiddleSPs, const device_array<SpacePoint>& middleSPs,
-    std::size_t nTopSPs, const device_array<SpacePoint>& topSPs,
+    const Info::Device& device, std::size_t maxBlockSize,
+    const DubletCounts& dubletCounts, const SeedFilterConfig& seedConfig,
+    const TripletFilterConfig& filterConfig, std::size_t nBottomSPs,
+    const device_array<SpacePoint>& bottomSPs, std::size_t nMiddleSPs,
+    const device_array<SpacePoint>& middleSPs, std::size_t nTopSPs,
+    const device_array<SpacePoint>& topSPs,
     const device_array<unsigned int>& middleBottomCounts,
     const device_array<std::size_t>& middleBottomDublets,
     const device_array<unsigned int>& middleTopCounts,
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/Types.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/Types.hpp
index 168a970ade0..1e7ff758a3b 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/Types.hpp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/Details/Types.hpp
@@ -8,9 +8,6 @@
 
 #pragma once
 
-// System include(s).
-#include <cstddef>
-
 namespace Acts {
 namespace Cuda {
 namespace Details {
@@ -28,9 +25,9 @@ struct SpacePoint {
 /// Helper struct summarising the results of the dublet search
 struct DubletCounts {
   /// The total number of dublets (M-B and M-T) found
-  std::size_t nDublets = 0;
+  unsigned int nDublets = 0;
   /// The total number of triplet candidates found
-  std::size_t nTriplets = 0;
+  unsigned int nTriplets = 0;
   /// The maximal number of middle-bottom dublets
   unsigned int maxMBDublets = 0;
   /// The maximal number of middle-top dublets
@@ -51,8 +48,9 @@ struct LinCircle {
 
 /// Structure used in the CUDA-based triplet finding
 struct Triplet {
-  std::size_t bottomIndex = static_cast<std::size_t>(-1);
-  std::size_t topIndex = static_cast<std::size_t>(-1);
+  unsigned int bottomIndex = static_cast<unsigned int>(-1);
+  unsigned int middleIndex = static_cast<unsigned int>(-1);
+  unsigned int topIndex = static_cast<unsigned int>(-1);
   float impactParameter = 0.0f;
   float invHelixDiameter = 0.0f;
   float weight = 0.0f;
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp
index dabd607bb8f..1592f65a691 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp
@@ -15,6 +15,7 @@
 #include "Acts/Seeding/Seed.hpp"
 #include "Acts/Seeding/SeedFilterConfig.hpp"
 #include "Acts/Seeding/SeedfinderConfig.hpp"
+#include "Acts/Utilities/Logger.hpp"
 
 namespace Acts {
 namespace Cuda {
@@ -26,9 +27,19 @@ class SeedFinder {
   ///////////////////////////////////////////////////////////////////
 
  public:
+  /// Create a CUDA backed seed finder object
+  ///
+  /// @param commonConfig Configuration shared with @c Acts::Seedfinder
+  /// @param seedFilterConfig Configuration shared with @c Acts::SeedFilter
+  /// @param tripletFilterConfig Configuration for the GPU based triplet
+  ///        filtering
+  /// @param device The identifier of the CUDA device to run on
+  /// @param loggerLevel Output level of messages coming from the object
+  ///
   SeedFinder(SeedfinderConfig<external_spacepoint_t> commonConfig,
              const SeedFilterConfig& seedFilterConfig,
-             const TripletFilterConfig& tripletFilterConfig);
+             const TripletFilterConfig& tripletFilterConfig, int device = 0,
+             Acts::Logging::Level loggerLevel = Acts::Logging::INFO);
 
   /// Create all seeds from the space points in the three iterators.
   /// Can be used to parallelize the seed creation
@@ -49,6 +60,8 @@ class SeedFinder {
   SeedFilterConfig m_seedFilterConfig;
   /// Configuration for the (device) triplet filter
   TripletFilterConfig m_tripletFilterConfig;
+  /// CUDA device identifier
+  int m_device;
 };
 
 }  // namespace Cuda
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
index d5014464dfb..5fed388d88d 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
@@ -14,6 +14,8 @@
 #include "Acts/Plugins/Cuda/Seeding2/Details/FindTriplets.hpp"
 #include "Acts/Plugins/Cuda/Seeding2/Details/Types.hpp"
 #include "Acts/Plugins/Cuda/Utilities/Arrays.hpp"
+#include "Acts/Plugins/Cuda/Utilities/Info.hpp"
+#include "Acts/Plugins/Cuda/Utilities/MemoryManager.hpp"
 
 // Acts include(s).
 #include "Acts/Seeding/InternalSeed.hpp"
@@ -30,10 +32,12 @@ template <typename external_spacepoint_t>
 SeedFinder<external_spacepoint_t>::SeedFinder(
     Acts::SeedfinderConfig<external_spacepoint_t> commonConfig,
     const SeedFilterConfig& seedFilterConfig,
-    const TripletFilterConfig& tripletFilterConfig)
+    const TripletFilterConfig& tripletFilterConfig, int device,
+    Acts::Logging::Level loggerLevel)
     : m_commonConfig(std::move(commonConfig)),
       m_seedFilterConfig(seedFilterConfig),
-      m_tripletFilterConfig(tripletFilterConfig) {
+      m_tripletFilterConfig(tripletFilterConfig),
+      m_device(device) {
   // calculation of scattering using the highland formula
   // convert pT to p once theta angle is known
   m_commonConfig.highland =
@@ -50,6 +54,17 @@ SeedFinder<external_spacepoint_t>::SeedFinder(
       std::pow(m_commonConfig.minPt * 2 / m_commonConfig.pTPerHelixRadius, 2);
   m_commonConfig.pT2perRadius =
       std::pow(m_commonConfig.highland / m_commonConfig.pTPerHelixRadius, 2);
+
+  // Tell the user what CUDA device will be used by the object.
+  ACTS_LOCAL_LOGGER(
+      Acts::getDefaultLogger("Acts::Cuda::SeedFinder", loggerLevel));
+  if (static_cast<std::size_t>(m_device) < Info::instance().devices().size()) {
+    ACTS_DEBUG("Will be using device:\n"
+               << Info::instance().devices()[m_device]);
+  } else {
+    ACTS_FATAL("Invalid CUDA device requested");
+    throw std::runtime_error("Invalid CUDA device requested");
+  }
 }
 
 template <typename external_spacepoint_t>
@@ -161,11 +176,12 @@ SeedFinder<external_spacepoint_t>::createSeedsForGroup(
 
   // Launch the triplet finding code on all of the previously found dublets.
   auto tripletCandidates = Details::findTriplets(
-      m_commonConfig.maxBlockSize, dubletCounts, m_seedFilterConfig,
-      m_tripletFilterConfig, bottomSPVec.size(), bottomSPDeviceArray,
-      middleSPVec.size(), middleSPDeviceArray, topSPVec.size(),
-      topSPDeviceArray, middleBottomCounts, middleBottomDublets,
-      middleTopCounts, middleTopDublets, m_commonConfig.maxScatteringAngle2,
+      Info::instance().devices()[m_device], m_commonConfig.maxBlockSize,
+      dubletCounts, m_seedFilterConfig, m_tripletFilterConfig,
+      bottomSPVec.size(), bottomSPDeviceArray, middleSPVec.size(),
+      middleSPDeviceArray, topSPVec.size(), topSPDeviceArray,
+      middleBottomCounts, middleBottomDublets, middleTopCounts,
+      middleTopDublets, m_commonConfig.maxScatteringAngle2,
       m_commonConfig.sigmaScattering, m_commonConfig.minHelixDiameter2,
       m_commonConfig.pT2perRadius, m_commonConfig.impactMax);
   assert(tripletCandidates.size() == middleSPVec.size());
@@ -192,6 +208,9 @@ SeedFinder<external_spacepoint_t>::createSeedsForGroup(
     m_commonConfig.seedFilter->filterSeeds_1SpFixed(seedsPerSPM, outputVec);
   }
 
+  // Free up all allocated device memory.
+  MemoryManager::instance().reset(m_device);
+
   // Return the collected spacepoints.
   return outputVec;
 }
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Arrays.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Arrays.hpp
index 5f287fda5e7..c08e480a185 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Arrays.hpp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Arrays.hpp
@@ -8,6 +8,9 @@
 
 #pragma once
 
+// CUDA plugin include(s).
+#include "Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp"
+
 // System include(s).
 #include <cstddef>
 #include <memory>
@@ -58,10 +61,20 @@ template <typename T>
 void copyToDevice(device_array<T>& dev, const host_array<T>& host,
                   std::size_t arraySize);
 
+/// Copy one array from the host to the device asynchronously
+template <typename T>
+void copyToDevice(device_array<T>& dev, const host_array<T>& host,
+                  std::size_t arraySize, const StreamWrapper& stream);
+
 /// Copy one array from the device to the host
 template <typename T>
 void copyToHost(host_array<T>& host, const device_array<T>& dev,
                 std::size_t arraySize);
 
+/// Copy one array from the device to the host asynchronously
+template <typename T>
+void copyToHost(host_array<T>& host, const device_array<T>& dev,
+                std::size_t arraySize, const StreamWrapper& stream);
+
 }  // namespace Cuda
 }  // namespace Acts
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Info.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Info.hpp
new file mode 100644
index 00000000000..989aea2e2df
--- /dev/null
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/Info.hpp
@@ -0,0 +1,73 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+// System include(s).
+#include <iosfwd>
+#include <string>
+#include <vector>
+
+namespace Acts {
+namespace Cuda {
+
+/// Class providing information about the CUDA devices at runtime
+///
+/// Without exposing any CUDA dependencies publicly to the clients.
+///
+class Info {
+ public:
+  /// @name Declarations preventing any copies of the singleton object
+  /// @{
+
+  /// Explicitly delete the copy constructor
+  Info(const Info&) = delete;
+  /// Explicitly delete the move constructor
+  Info(Info&&) = delete;
+
+  /// Explicitly delete the copy assignment operator
+  Info& operator=(const Info&) = delete;
+  /// Explicitly delete the move assignment operator
+  Info& operator=(Info&&) = delete;
+
+  /// @}
+
+  /// Singleton accessor function
+  static Info& instance();
+
+  /// Helper struct describing one available CUDA device
+  struct Device {
+    /// Identifier that CUDA knows this device by
+    int id = -1;
+    /// The name of this device
+    std::string name;
+    /// The maximal number of threads per block for this device
+    int maxThreadsPerBlock = -1;
+    /// Whether the device supports multiple kernel executions in parallel
+    bool concurrentKernels = false;
+    /// The total amount of (global) memory on the device
+    std::size_t totalMemory = 0;
+  };  // struct Device
+
+  /// Get all the available CUDA devices
+  const std::vector<Device>& devices() const;
+
+ private:
+  /// The constructor is private to implement the singleton behaviour
+  Info();
+
+  /// Information about all available devices
+  std::vector<Device> m_devices;
+
+};  // class Info
+
+/// Print operator for @c Acts::Cuda::Info::Device
+std::ostream& operator<<(std::ostream& out, const Info::Device& device);
+
+}  // namespace Cuda
+}  // namespace Acts
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/MemoryManager.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/MemoryManager.hpp
new file mode 100644
index 00000000000..6225cf4f7a3
--- /dev/null
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/MemoryManager.hpp
@@ -0,0 +1,104 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+// System include(s).
+#include <cstddef>
+#include <vector>
+
+namespace Acts {
+namespace Cuda {
+
+/// Singleton class used for allocating memory on CUDA device(s)
+///
+/// In order to avoid calling @c cudaMalloc(...) and @c cudaFree(...) too many
+/// times in the code (which can turn out to be pretty slow), device memory
+/// is allocated using this singleton memory manager for the
+/// @c Acts::Cuda::device_array arrays.
+///
+/// It is implemented in a **very** simple way. It allocates a big blob of
+/// memory, and then hands out pointers from this blob to anyone that asks for
+/// device memory.
+///
+/// The class doesn't handle memory returns in any sophisticated way. It assumes
+/// that any calculation will need all allocated memory until the end of that
+/// calculation. At which point all of that memory gets re-purpused in one call.
+///
+/// The code is not thread safe currently in any shape or form. But there should
+/// be ways of making it at least "thread friendly" later on.
+///
+class MemoryManager {
+ public:
+  /// Destructor, freeing up all allocated memory
+  ~MemoryManager();
+
+  /// @name Declarations preventing any copies of the singleton object
+  /// @{
+
+  /// Disallow copy construction
+  MemoryManager(const MemoryManager&) = delete;
+  /// Disallow move construction
+  MemoryManager(MemoryManager&&) = delete;
+
+  /// Disallow copy assignment
+  MemoryManager& operator=(const MemoryManager&) = delete;
+  /// Disallow move assignment
+  MemoryManager& operator=(MemoryManager&&) = delete;
+
+  /// @}
+
+  /// @name Functions that the users of Acts may be interacting with
+  /// @{
+
+  /// Singleton object accessor
+  static MemoryManager& instance();
+
+  /// Set the amount of memory to use on a particular device
+  void setMemorySize(std::size_t sizeInBytes, int device = -1);
+
+  /// @}
+
+  /// @name Functions used internally by the Acts code
+  /// @{
+
+  /// Get the amount of memory still available on a specific device
+  std::size_t availableMemory(int device = -1) const;
+
+  /// Get a pointer to an available memory block on the device
+  void* allocate(std::size_t sizeInBytes, int device = -1);
+
+  /// Reset all allocations
+  void reset(int device = -1);
+
+  /// @}
+
+ private:
+  /// Hide the constructor of the class
+  MemoryManager();
+
+  /// Struct describing the state of the memory allocation on a particular
+  /// device
+  struct DeviceMemory {
+    /// The amount of memory allocated on the CUDA device
+    std::size_t m_size = 0;
+    /// Pointer to the beginning of the memory allocation
+    char* m_ptr = nullptr;
+    /// Pointer to the next available memory block in the "current round"
+    char* m_nextAllocation = nullptr;
+    /// The maximum amount of memory used at a time during the job
+    std::ptrdiff_t m_maxUsage = 0;
+  };
+
+  /// Object holding information about memory allocations on all devices
+  std::vector<DeviceMemory> m_memory;
+
+};  // class MemoryManager
+
+}  // namespace Cuda
+}  // namespace Acts
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp
new file mode 100644
index 00000000000..39d8c646143
--- /dev/null
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp
@@ -0,0 +1,64 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+// CUDA plugin include(s).
+#include "Acts/Plugins/Cuda/Utilities/Info.hpp"
+
+namespace Acts {
+namespace Cuda {
+
+/// Helper class for passing around @c cudaStream_t objects (pointers)
+///
+/// In order to be able to create user interfaces that return/receive CUDA
+/// streams, while not exposing the users of those interfaces to the CUDA
+/// Runtime API, this class helps us hiding the concrete CUDA types from our
+/// interfaces.
+///
+class StreamWrapper {
+/// Declare the @c Acts::Cuda::getStreamFrom function a frient of the class
+///
+/// Note that it's not practical to put that function into the
+/// @c Acts::Cuda::details namespace, because then we would be forced to
+/// forward declare it in this header.
+#ifdef __CUDACC__
+  friend cudaStream_t getStreamFrom(const StreamWrapper&);
+#endif  // __CUDACC__
+
+ public:
+  /// Constructor with the stream to be wrapped
+  StreamWrapper(void* stream, bool ownsStream = true);
+  /// Move constructor
+  StreamWrapper(StreamWrapper&& parent);
+  /// Disabled copy constructor
+  StreamWrapper(const StreamWrapper&) = delete;
+  /// Destructor
+  ~StreamWrapper();
+
+  /// Move assignment operator
+  StreamWrapper& operator=(StreamWrapper&& rhs);
+  /// Disabled copy assignment operator
+  StreamWrapper& operator=(const StreamWrapper&) = delete;
+
+  /// Wait for all scheduled operations to finish in the stream
+  void synchronize() const;
+
+ private:
+  /// Type erased pointer, managed by this wrapper class
+  void* m_stream;
+  /// Flag showing whether the object owns the stream that it wraps
+  bool m_ownsStream;
+
+};  // class StreamWrapper
+
+/// Create a stream for a particular CUDA device
+StreamWrapper createStreamFor(const Acts::Cuda::Info::Device& device);
+
+}  // namespace Cuda
+}  // namespace Acts
diff --git a/Plugins/Cuda/src/Seeding2/CountDublets.cu b/Plugins/Cuda/src/Seeding2/CountDublets.cu
index c59de6f8e31..9d9a0f682ac 100644
--- a/Plugins/Cuda/src/Seeding2/CountDublets.cu
+++ b/Plugins/Cuda/src/Seeding2/CountDublets.cu
@@ -120,9 +120,7 @@ DubletCounts countDublets(
 
   // Copy the sum(s) back to the host.
   auto dubletCountsHost = make_host_array<DubletCounts>(numBlocks);
-  ACTS_CUDA_ERROR_CHECK(
-      cudaMemcpy(dubletCountsHost.get(), dubletCountsDevice.get(),
-                 numBlocks * sizeof(DubletCounts), cudaMemcpyDeviceToHost));
+  copyToHost(dubletCountsHost, dubletCountsDevice, numBlocks);
 
   // Perform the final summation on the host. Assuming that the number of
   // middle space points is not so large that it would make sense to do the
diff --git a/Plugins/Cuda/src/Seeding2/FindTriplets.cu b/Plugins/Cuda/src/Seeding2/FindTriplets.cu
index 49536fecb70..dba08ddc591 100644
--- a/Plugins/Cuda/src/Seeding2/FindTriplets.cu
+++ b/Plugins/Cuda/src/Seeding2/FindTriplets.cu
@@ -10,6 +10,7 @@
 #include "Acts/Plugins/Cuda/Seeding2/Details/FindTriplets.hpp"
 #include "Acts/Plugins/Cuda/Seeding2/Details/Types.hpp"
 #include "Acts/Plugins/Cuda/Seeding2/TripletFilterConfig.hpp"
+#include "Acts/Plugins/Cuda/Utilities/MemoryManager.hpp"
 #include "../Utilities/ErrorCheck.cuh"
 #include "../Utilities/MatrixMacros.hpp"
 
@@ -168,14 +169,18 @@ __global__ void transformCoordinates(
 
 /// Kernel used for finding all the triplet candidates
 ///
-/// @param[in] middleIndex The middle spacepoint index to run the triplet search
-///            for
+/// @param[in] middleIndexStart The middle spacepoint index that the kernel was
+///            "started from"
 /// @param[in] maxMBDublets The maximal number of middle-bottom dublets found
 ///            for any middle spacepoint
 /// @param[in] maxMTDublets The maximal number of middle-top dublets found for
 ///            any middle spacepoint
 /// @param[in] maxTriplets The maximum number of triplets for which memory is
 ///            booked
+/// @param[in] nParallelMiddleSPs The number of middle spacepoints that the
+///            "largest" kernels may be started on in parallel
+/// @param[in] nMiddleSPsProcessed The number of middle spacepoints that the
+///            kernel was started on in parallel
 /// @param[in] nBottomSPs The number of bottom spacepoints in @c bottomSPs
 /// @param[in] bottomSPs Properties of all of the bottom spacepoints
 /// @param[in] nMiddleSPs The number of middle spacepoints in @c middleSPs
@@ -214,11 +219,13 @@ __global__ void transformCoordinates(
 ///             candidates
 ///
 __global__ void findTriplets(
-    std::size_t middleIndex, unsigned int maxMBDublets,
-    unsigned int maxMTDublets, unsigned int maxTriplets, std::size_t nBottomSPs,
-    const Details::SpacePoint* bottomSPs, std::size_t nMiddleSPs,
-    const Details::SpacePoint* middleSPs, std::size_t nTopSPs,
-    const Details::SpacePoint* topSPs, const unsigned int* middleBottomCounts,
+    std::size_t middleIndexStart, unsigned int maxMBDublets,
+    unsigned int maxMTDublets, unsigned int maxTriplets,
+    std::size_t nParallelMiddleSPs, std::size_t nMiddleSPsProcessed,
+    std::size_t nBottomSPs, const Details::SpacePoint* bottomSPs,
+    std::size_t nMiddleSPs, const Details::SpacePoint* middleSPs,
+    std::size_t nTopSPs, const Details::SpacePoint* topSPs,
+    const unsigned int* middleBottomCounts,
     const std::size_t* middleBottomDublets, const unsigned int* middleTopCounts,
     const std::size_t* middleTopDublets,
     const Details::LinCircle* bottomSPLinTransArray,
@@ -229,26 +236,39 @@ __global__ void findTriplets(
     unsigned int* maxTripletsPerSpB, unsigned int* tripletCount,
     Details::Triplet* triplets) {
   // A sanity check.
+  assert(middleIndexStart + nMiddleSPsProcessed <= nMiddleSPs);
+
+  // Find the middle spacepoint index to operate on.
+  const unsigned int middleIndexOffset = blockIdx.x * blockDim.x + threadIdx.x;
+  if (middleIndexOffset >= nMiddleSPsProcessed) {
+    return;
+  }
+  const unsigned int middleIndex = middleIndexStart + middleIndexOffset;
   assert(middleIndex < nMiddleSPs);
 
-  // The total number of dublets for this middle spacepoint.
-  const unsigned int nMiddleBottomDublets = middleBottomCounts[middleIndex];
-  const unsigned int nMiddleTopDublets = middleTopCounts[middleIndex];
+  // Counts of middle-bottom and middle-top pairs for this middle spacepoint.
+  const unsigned int middleBottomPairCount = middleBottomCounts[middleIndex];
+  const unsigned int middleTopPairCount = middleTopCounts[middleIndex];
 
-  // Get the indices of the dublets to operate on.
-  const std::size_t bottomDubletIndex = blockIdx.x * blockDim.x + threadIdx.x;
-  const std::size_t topDubletIndex = blockIdx.y * blockDim.y + threadIdx.y;
-  if ((bottomDubletIndex >= nMiddleBottomDublets) ||
-      (topDubletIndex >= nMiddleTopDublets)) {
+  // Find the indices of the middle-bottom and middle-top pairs to operate on.
+  const unsigned int tripletCandidateIndex =
+      blockIdx.y * blockDim.y + threadIdx.y;
+  if (tripletCandidateIndex >= middleBottomPairCount * middleTopPairCount) {
     return;
   }
+  const unsigned int bottomDubletIndex =
+      tripletCandidateIndex / middleTopPairCount;
+  assert(bottomDubletIndex < middleBottomPairCount);
+  const unsigned int topDubletIndex =
+      tripletCandidateIndex - bottomDubletIndex * middleTopPairCount;
+  assert(topDubletIndex < middleTopPairCount);
 
   // Get the indices of the spacepoints to operate on.
-  const std::size_t bottomIndex =
+  const unsigned int bottomIndex =
       ACTS_CUDA_MATRIX2D_ELEMENT(middleBottomDublets, nMiddleSPs, nBottomSPs,
                                  middleIndex, bottomDubletIndex);
   assert(bottomIndex < nBottomSPs);
-  const std::size_t topIndex = ACTS_CUDA_MATRIX2D_ELEMENT(
+  const unsigned int topIndex = ACTS_CUDA_MATRIX2D_ELEMENT(
       middleTopDublets, nMiddleSPs, nTopSPs, middleIndex, topDubletIndex);
   assert(topIndex < nTopSPs);
 
@@ -347,10 +367,12 @@ __global__ void findTriplets(
   }
 
   // Reserve elements (positions) in the global matrices/arrays.
-  int tripletIndexRow =
-      atomicAdd(tripletsPerBottomDublet + bottomDubletIndex, 1);
+  unsigned int* tripletIndexRowPtr = &(ACTS_CUDA_MATRIX2D_ELEMENT(
+      tripletsPerBottomDublet, nParallelMiddleSPs, maxMBDublets,
+      middleIndexOffset, bottomDubletIndex));
+  const unsigned int tripletIndexRow = atomicAdd(tripletIndexRowPtr, 1);
   assert(tripletIndexRow < maxMTDublets);
-  int tripletIndex = atomicAdd(tripletCount, 1);
+  const unsigned int tripletIndex = atomicAdd(tripletCount, 1);
   assert(tripletIndex < maxTriplets);
 
   // Collect the maximal value of tripletIndexRow + 1 (since we want the
@@ -358,12 +380,14 @@ __global__ void findTriplets(
   atomicMax(maxTripletsPerSpB, tripletIndexRow + 1);
 
   // Save the index of the triplet candidate, which will be created now.
-  ACTS_CUDA_MATRIX2D_ELEMENT(tripletIndices, maxMBDublets, maxMTDublets,
-                             bottomDubletIndex, tripletIndexRow) = tripletIndex;
+  ACTS_CUDA_MATRIX3D_ELEMENT(tripletIndices, nParallelMiddleSPs, maxMBDublets,
+                             maxMTDublets, middleIndexOffset, bottomDubletIndex,
+                             tripletIndexRow) = tripletIndex;
 
   // Now store the triplet in the above mentioned location.
-  Details::Triplet triplet = {bottomIndex, topIndex, Im, B / sqrtf(S2),
-                              -(Im * impactWeightFactor)};
+  Details::Triplet triplet = {bottomIndex,   middleIndex,
+                              topIndex,      Im,
+                              B / sqrtf(S2), -(Im * impactWeightFactor)};
   triplets[tripletIndex] = triplet;
 
   return;
@@ -375,14 +399,22 @@ __global__ void findTriplets(
 ///            function
 /// @param[in] singleSeedCut Pointer to the user-provided seed filtering
 ///            function
-/// @param[in] middleIndex The middle spacepoint index to run the triplet search
-///            for
+/// @param[in] middleIndexStart The middle spacepoint index that the kernel was
+///            "started from"
 /// @param[in] maxMBDublets The maximal number of middle-bottom dublets found
 ///            for any middle spacepoint
 /// @param[in] maxMTDublets The maximal number of middle-top dublets found for
 ///            any middle spacepoint
-/// @param[in] nMiddleBottomDublets The total number of middle-bottom spacepoint
-///            dublets for this middle spacepoint
+/// @param[in] maxTriplets The maximum number of triplets for which memory is
+///            booked
+/// @param[in] nAllTriplets The number of triplets that were reconstructed for
+///            this middle spacepoint group
+/// @param[in] nParallelMiddleSPs The number of middle spacepoints that the
+///            "largest" kernels may be started on in parallel
+/// @param[in] nMiddleSPsProcessed The number of middle spacepoints that the
+///            kernel was started on in parallel
+/// @param[in] middleBottomCounts 1-D array of the number of middle-bottom
+///            dublets found for each middle spacepoint
 /// @param[in] nBottomSPs The number of bottom spacepoints in @c bottomSPs
 /// @param[in] bottomSPs Properties of all of the bottom spacepoints
 /// @param[in] nMiddleSPs The number of middle spacepoints in @c middleSPs
@@ -393,8 +425,6 @@ __global__ void findTriplets(
 ///            counts for each bottom spacepoint
 /// @param[in] tripletIndices 2-dimensional matrix of the indices of the
 ///            triplets created for each middle-bottom spacepoint dublet
-/// @param[in] nAllTriplets Pointer to the scalar number of triplets found in
-///            total
 /// @param[in] allTriplets 1-dimensional array of all the found triplets
 /// @param[in] deltaInvHelixDiameter Parameter from @c Acts::SeedFilterConfig
 /// @param[in] deltaRMin Parameter from @c Acts::SeedFilterConfig
@@ -408,37 +438,53 @@ __global__ void findTriplets(
 __global__ void filterTriplets2Sp(
     TripletFilterConfig::seedWeightFunc_t seedWeight,
     TripletFilterConfig::singleSeedCutFunc_t singleSeedCut,
-    std::size_t middleIndex, int maxMBDublets, int maxMTDublets,
-    unsigned int nMiddleBottomDublets, std::size_t nBottomSPs,
-    const Details::SpacePoint* bottomSPs, std::size_t nMiddleSPs,
-    const Details::SpacePoint* middleSPs, std::size_t nTopSPs,
-    const Details::SpacePoint* topSPs,
+    std::size_t middleIndexStart, unsigned int maxMBDublets,
+    unsigned int maxMTDublets, unsigned int maxTriplets,
+    unsigned int nAllTriplets, std::size_t nParallelMiddleSPs,
+    std::size_t nMiddleSPsProcessed, unsigned int* middleBottomCounts,
+    std::size_t nBottomSPs, const Details::SpacePoint* bottomSPs,
+    std::size_t nMiddleSPs, const Details::SpacePoint* middleSPs,
+    std::size_t nTopSPs, const Details::SpacePoint* topSPs,
     const unsigned int* tripletsPerBottomDublet,
-    const std::size_t* tripletIndices, const unsigned int* nAllTriplets,
-    const Details::Triplet* allTriplets, float deltaInvHelixDiameter,
-    float deltaRMin, float compatSeedWeight, std::size_t compatSeedLimit,
-    unsigned int* nFilteredTriplets, Details::Triplet* filteredTriplets) {
+    const std::size_t* tripletIndices, const Details::Triplet* allTriplets,
+    float deltaInvHelixDiameter, float deltaRMin, float compatSeedWeight,
+    std::size_t compatSeedLimit, unsigned int* nFilteredTriplets,
+    Details::Triplet* filteredTriplets) {
   // Sanity checks.
   assert(seedWeight != nullptr);
   assert(singleSeedCut != nullptr);
+  assert(middleIndexStart + nMiddleSPsProcessed <= nMiddleSPs);
+
+  // Find the middle spacepoint index to operate on.
+  const unsigned int middleIndexOffset = blockIdx.x * blockDim.x + threadIdx.x;
+  if (middleIndexOffset >= nMiddleSPsProcessed) {
+    return;
+  }
+  const unsigned int middleIndex = middleIndexStart + middleIndexOffset;
   assert(middleIndex < nMiddleSPs);
 
-  // Get the indices of the objects to operate on.
-  const std::size_t bottomDubletIndex = blockIdx.x * blockDim.x + threadIdx.x;
-  if (bottomDubletIndex >= nMiddleBottomDublets) {
+  // Find the middle-bottom dublet to operate on.
+  const unsigned int middleBottomPairCount = middleBottomCounts[middleIndex];
+  const unsigned int bottomDubletIndex = blockIdx.y * blockDim.y + threadIdx.y;
+  if (bottomDubletIndex >= middleBottomPairCount) {
     return;
   }
-  const std::size_t nTriplets = tripletsPerBottomDublet[bottomDubletIndex];
-  const std::size_t tripletMatrixIndex = blockIdx.y * blockDim.y + threadIdx.y;
-  if (tripletMatrixIndex >= nTriplets) {
+
+  // Find the triplet to operate on.
+  const unsigned int nTripletsForMiddleBottom = ACTS_CUDA_MATRIX2D_ELEMENT(
+      tripletsPerBottomDublet, nParallelMiddleSPs, maxMBDublets,
+      middleIndexOffset, bottomDubletIndex);
+  const unsigned int tripletCandidateIndex =
+      blockIdx.z * blockDim.z + threadIdx.z;
+  if (tripletCandidateIndex >= nTripletsForMiddleBottom) {
     return;
   }
 
   // Get the index of this triplet.
-  const std::size_t triplet1Index =
-      ACTS_CUDA_MATRIX2D_ELEMENT(tripletIndices, maxMBDublets, maxMTDublets,
-                                 bottomDubletIndex, tripletMatrixIndex);
-  assert(triplet1Index < *nAllTriplets);
+  const std::size_t triplet1Index = ACTS_CUDA_MATRIX3D_ELEMENT(
+      tripletIndices, nParallelMiddleSPs, maxMBDublets, maxMTDublets,
+      middleIndexOffset, bottomDubletIndex, tripletCandidateIndex);
+  assert(triplet1Index < nAllTriplets);
 
   // Load this triplet into the thread.
   Details::Triplet triplet1 = allTriplets[triplet1Index];
@@ -458,16 +504,17 @@ __global__ void filterTriplets2Sp(
   std::size_t nCompatibleSeedR = 0;
 
   // Loop over all the other triplets found for this bottom-middle dublet.
-  for (std::size_t i = 0; i < nTriplets; ++i) {
+  for (std::size_t i = 0; i < nTripletsForMiddleBottom; ++i) {
     // Don't consider the same triplet that the thread is evaluating in the
     // first place.
-    if (i == tripletMatrixIndex) {
+    if (i == tripletCandidateIndex) {
       continue;
     }
     // Get the index of the second triplet.
-    const std::size_t triplet2Index = ACTS_CUDA_MATRIX2D_ELEMENT(
-        tripletIndices, maxMBDublets, maxMTDublets, bottomDubletIndex, i);
-    assert(triplet2Index < *nAllTriplets);
+    const std::size_t triplet2Index = ACTS_CUDA_MATRIX3D_ELEMENT(
+        tripletIndices, nParallelMiddleSPs, maxMBDublets, maxMTDublets,
+        middleIndexOffset, bottomDubletIndex, i);
+    assert(triplet2Index < nAllTriplets);
     assert(triplet2Index != triplet1Index);
 
     // Load the second triplet into the thread.
@@ -522,7 +569,8 @@ __global__ void filterTriplets2Sp(
   }
 
   // Put the triplet into the "filtered list".
-  const int tripletRow = atomicAdd(nFilteredTriplets, 1);
+  const unsigned int tripletRow = atomicAdd(nFilteredTriplets, 1);
+  assert(tripletRow < nAllTriplets);
   filteredTriplets[tripletRow] = triplet1;
   return;
 }
@@ -532,11 +580,12 @@ __global__ void filterTriplets2Sp(
 namespace Details {
 
 std::vector<std::vector<Triplet>> findTriplets(
-    std::size_t maxBlockSize, const DubletCounts& dubletCounts,
-    const SeedFilterConfig& seedConfig, const TripletFilterConfig& filterConfig,
-    std::size_t nBottomSPs, const device_array<SpacePoint>& bottomSPs,
-    std::size_t nMiddleSPs, const device_array<SpacePoint>& middleSPs,
-    std::size_t nTopSPs, const device_array<SpacePoint>& topSPs,
+    const Info::Device& device, std::size_t maxBlockSize,
+    const DubletCounts& dubletCounts, const SeedFilterConfig& seedConfig,
+    const TripletFilterConfig& filterConfig, std::size_t nBottomSPs,
+    const device_array<SpacePoint>& bottomSPs, std::size_t nMiddleSPs,
+    const device_array<SpacePoint>& middleSPs, std::size_t nTopSPs,
+    const device_array<SpacePoint>& topSPs,
     const device_array<unsigned int>& middleBottomCounts,
     const device_array<std::size_t>& middleBottomDublets,
     const device_array<unsigned int>& middleTopCounts,
@@ -563,11 +612,29 @@ std::vector<std::vector<Triplet>> findTriplets(
   ACTS_CUDA_ERROR_CHECK(cudaGetLastError());
   ACTS_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
 
-  // Copy the dublet counts back to the host.
-  auto middleBottomCountsHost = make_host_array<unsigned int>(nMiddleSPs);
-  copyToHost(middleBottomCountsHost, middleBottomCounts, nMiddleSPs);
-  auto middleTopCountsHost = make_host_array<unsigned int>(nMiddleSPs);
-  copyToHost(middleTopCountsHost, middleTopCounts, nMiddleSPs);
+  // With the information from @c Acts::Cuda::Details::DubletCounts, figure out
+  // how many middle spacepoints we could handle at the same time in the triplet
+  // finding/filtering.
+
+  // For one middle spacepoint we need the following amount:
+  const std::size_t memorySizePerMiddleSP =
+      // First let's consider the storage of the triplet objects themselves.
+      2 * dubletCounts.maxTriplets * sizeof(Triplet) +
+      // Then the objects holding indices to the triplets per middle-bottom
+      // dublet.
+      dubletCounts.maxMBDublets * sizeof(unsigned int) +
+      dubletCounts.maxMBDublets * dubletCounts.maxMTDublets *
+          sizeof(std::size_t) +
+      // Finally the array holding the filtered triplet counts per middle
+      // spacepoint.
+      sizeof(unsigned int);
+
+  // See how many we can fit into the (still) available memory.
+  const std::size_t nParallelMiddleSPs =
+      std::min(MemoryManager::instance().availableMemory(device.id) /
+                   memorySizePerMiddleSP,
+               nMiddleSPs);
+  assert(nParallelMiddleSPs > 0);
 
   // Helper variables for handling the various object counts in device memory.
   enum ObjectCountType : int {
@@ -582,67 +649,82 @@ std::vector<std::vector<Triplet>> findTriplets(
   auto objectCountsHostNull = make_host_array<unsigned int>(NObjectCountTypes);
   memset(objectCountsHostNull.get(), 0,
          NObjectCountTypes * sizeof(unsigned int));
+  auto objectCountsHost = make_host_array<unsigned int>(NObjectCountTypes);
   auto objectCounts = make_device_array<unsigned int>(NObjectCountTypes);
 
   // Allocate enough memory for triplet candidates that would suffice for every
   // middle spacepoint.
-  auto allTriplets = make_device_array<Triplet>(dubletCounts.maxTriplets);
-  auto filteredTriplets = make_device_array<Triplet>(dubletCounts.maxTriplets);
+  auto allTriplets =
+      make_device_array<Triplet>(nParallelMiddleSPs * dubletCounts.maxTriplets);
+  auto filteredTriplets =
+      make_device_array<Triplet>(nParallelMiddleSPs * dubletCounts.maxTriplets);
   auto filteredTripletsHost =
-      make_host_array<Triplet>(dubletCounts.maxTriplets);
+      make_host_array<Triplet>(nParallelMiddleSPs * dubletCounts.maxTriplets);
 
   // Allocate and initialise the array holding the per bottom dublet triplet
   // numbers.
-  auto tripletsPerBottomDubletHost =
-      make_host_array<unsigned int>(dubletCounts.maxMBDublets);
+  auto tripletsPerBottomDubletHost = make_host_array<unsigned int>(
+      nParallelMiddleSPs * dubletCounts.maxMBDublets);
   memset(tripletsPerBottomDubletHost.get(), 0,
-         dubletCounts.maxMBDublets * sizeof(int));
-  auto tripletsPerBottomDublet =
-      make_device_array<unsigned int>(dubletCounts.maxMBDublets);
+         nParallelMiddleSPs * dubletCounts.maxMBDublets * sizeof(unsigned int));
+  auto tripletsPerBottomDublet = make_device_array<unsigned int>(
+      nParallelMiddleSPs * dubletCounts.maxMBDublets);
 
   // Allocate the array holding the indices of the triplets found for a given
   // bottom-middle spacepoint combination.
   auto tripletIndices = make_device_array<std::size_t>(
-      dubletCounts.maxMBDublets * dubletCounts.maxMTDublets);
+      nParallelMiddleSPs * dubletCounts.maxMBDublets *
+      dubletCounts.maxMTDublets);
+
+  // Allocate and initialise the arrays holding the per-middle-spacepoint
+  // filtered triplet counts.
+  auto filteredTripletCountsHostNull =
+      make_host_array<unsigned int>(nParallelMiddleSPs);
+  memset(filteredTripletCountsHostNull.get(), 0,
+         nParallelMiddleSPs * sizeof(unsigned int));
+  auto filteredTripletCountsHost =
+      make_host_array<unsigned int>(nParallelMiddleSPs);
+  auto filteredTripletCounts =
+      make_device_array<unsigned int>(nParallelMiddleSPs);
 
   // Block size used in the triplet finding.
   const std::size_t blockSize = std::sqrt(maxBlockSize);
 
   // Create the result object.
-  std::vector<std::vector<Triplet>> result;
-  result.reserve(nMiddleSPs);
-
-  // Execute the triplet finding and filtering separately for each middle
-  // spacepoint.
-  for (std::size_t middleIndex = 0; middleIndex < nMiddleSPs; ++middleIndex) {
-    // The number of bottom-middle and middle-top dublets found for this middle
-    // spacepoint.
-    const unsigned int nMiddleBottomDublets =
-        middleBottomCountsHost.get()[middleIndex];
-    const unsigned int nMiddleTopDublets =
-        middleTopCountsHost.get()[middleIndex];
-    if ((nMiddleBottomDublets == 0) || (nMiddleTopDublets == 0)) {
-      result.emplace_back();
-      continue;
-    }
+  std::vector<std::vector<Triplet>> result(nMiddleSPs);
 
-    // Reset device arrays.
+  // Copy the dublet counts back to the host.
+  auto middleBottomCountsHost = make_host_array<unsigned int>(nMiddleSPs);
+  copyToHost(middleBottomCountsHost, middleBottomCounts, nMiddleSPs);
+  auto middleTopCountsHost = make_host_array<unsigned int>(nMiddleSPs);
+  copyToHost(middleTopCountsHost, middleTopCounts, nMiddleSPs);
+
+  // Execute the triplet finding and filtering in the maximal allowed groups of
+  // middle spacepoints.
+  for (std::size_t middleIndex = 0; middleIndex < nMiddleSPs;
+       middleIndex += nParallelMiddleSPs) {
+    // Reset the device arrays.
     copyToDevice(objectCounts, objectCountsHostNull, NObjectCountTypes);
     copyToDevice(tripletsPerBottomDublet, tripletsPerBottomDubletHost,
-                 dubletCounts.maxMBDublets);
+                 nParallelMiddleSPs * dubletCounts.maxMBDublets);
+
+    // The number of middle spacepoints to process in this iteration.
+    const std::size_t nMiddleSPsProcessed =
+        std::min(nParallelMiddleSPs, nMiddleSPs - middleIndex);
 
-    // Calculate the parallelisation for the triplet finding for this middle
-    // spacepoint.
-    const dim3 blockSizeFT(blockSize, blockSize);
+    // Calculate the parallelisation for the triplet finding for this collection
+    // of middle spacepoints.
+    const dim3 blockSizeFT(1, maxBlockSize);
     const dim3 numBlocksFT(
-        ((nMiddleBottomDublets + blockSizeFT.x - 1) / blockSizeFT.x),
-        ((nMiddleTopDublets + blockSizeFT.y - 1) / blockSizeFT.y));
+        (nMiddleSPsProcessed + blockSizeFT.x - 1) / blockSizeFT.x,
+        (dubletCounts.maxTriplets + blockSizeFT.y - 1) / blockSizeFT.y);
+    assert(dubletCounts.maxTriplets > 0);
 
     // Launch the triplet finding for this middle spacepoint.
     Kernels::findTriplets<<<numBlocksFT, blockSizeFT>>>(
         // Parameters needed to use all the arrays.
         middleIndex, dubletCounts.maxMBDublets, dubletCounts.maxMTDublets,
-        dubletCounts.maxTriplets,
+        dubletCounts.maxTriplets, nParallelMiddleSPs, nMiddleSPsProcessed,
         // Parameters of all of the spacepoints.
         nBottomSPs, bottomSPs.get(), nMiddleSPs, middleSPs.get(), nTopSPs,
         topSPs.get(),
@@ -662,24 +744,26 @@ std::vector<std::vector<Triplet>> findTriplets(
     ACTS_CUDA_ERROR_CHECK(cudaGetLastError());
     ACTS_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
 
-    // Retrieve the maximal number of triplets found for any given bottom-middle
-    // dublet.
-    int maxTripletsPerSpB = 0;
-    ACTS_CUDA_ERROR_CHECK(cudaMemcpy(&maxTripletsPerSpB,
-                                     objectCounts.get() + MaxTripletsPerSpB,
-                                     sizeof(int), cudaMemcpyDeviceToHost));
-    // If no such triplet has been found, stop here for this middle spacepoint.
-    if (maxTripletsPerSpB == 0) {
-      result.emplace_back();
+    // Retrieve the object counts.
+    copyToHost(objectCountsHost, objectCounts, NObjectCountTypes);
+    const unsigned int nAllTriplets = objectCountsHost.get()[AllTriplets];
+    const unsigned int nMaxTripletsPerSpB =
+        objectCountsHost.get()[MaxTripletsPerSpB];
+
+    // If no triplet has been found, stop here for this middle spacepoint range.
+    if (nAllTriplets == 0) {
       continue;
     }
 
     // Calculate the parallelisation for the "2SpFixed" filtering of the
     // triplets.
-    const dim3 blockSizeF2SP(blockSize, blockSize);
+    const dim3 blockSizeF2SP(1, blockSize, blockSize);
     const dim3 numBlocksF2SP(
-        ((nMiddleBottomDublets + blockSizeF2SP.x - 1) / blockSizeF2SP.x),
-        ((maxTripletsPerSpB + blockSizeF2SP.y - 1) / blockSizeF2SP.y));
+        (nMiddleSPsProcessed + blockSizeF2SP.x - 1) / blockSizeF2SP.x,
+        (dubletCounts.maxMBDublets + blockSizeF2SP.y - 1) / blockSizeF2SP.y,
+        (nMaxTripletsPerSpB + blockSizeF2SP.z - 1) / blockSizeF2SP.z);
+    assert(dubletCounts.maxMBDublets > 0);
+    assert(nMaxTripletsPerSpB > 0);
 
     // Launch the "2SpFixed" filtering of the triplets.
     assert(filterConfig.seedWeight != nullptr);
@@ -689,13 +773,13 @@ std::vector<std::vector<Triplet>> findTriplets(
         filterConfig.seedWeight, filterConfig.singleSeedCut,
         // Parameters needed to use all the arrays.
         middleIndex, dubletCounts.maxMBDublets, dubletCounts.maxMTDublets,
-        nMiddleBottomDublets,
+        dubletCounts.maxTriplets, nAllTriplets, nParallelMiddleSPs,
+        nMiddleSPsProcessed, middleBottomCounts.get(),
         // Parameters of all of the spacepoints.
         nBottomSPs, bottomSPs.get(), nMiddleSPs, middleSPs.get(), nTopSPs,
         topSPs.get(),
         // Variables holding the results of the triplet finding.
-        tripletsPerBottomDublet.get(), tripletIndices.get(),
-        objectCounts.get() + AllTriplets, allTriplets.get(),
+        tripletsPerBottomDublet.get(), tripletIndices.get(), allTriplets.get(),
         // Configuration constants.
         seedConfig.deltaInvHelixDiameter, seedConfig.deltaRMin,
         seedConfig.compatSeedWeight, seedConfig.compatSeedLimit,
@@ -704,22 +788,28 @@ std::vector<std::vector<Triplet>> findTriplets(
     ACTS_CUDA_ERROR_CHECK(cudaGetLastError());
     ACTS_CUDA_ERROR_CHECK(cudaDeviceSynchronize());
 
-    // Retrieve the filtered number of triplets, to know how many to copy back
-    // to the host.
-    int nFilteredTriplets = 0;
-    ACTS_CUDA_ERROR_CHECK(cudaMemcpy(&nFilteredTriplets,
-                                     objectCounts.get() + FilteredTriplets,
-                                     sizeof(int), cudaMemcpyDeviceToHost));
+    // Retrieve the result counts of the filtering.
+    copyToHost(objectCountsHost, objectCounts, NObjectCountTypes);
+
+    // The number of triplets that survived the 2Sp filtering.
+    const unsigned int nFilteredTriplets =
+        objectCountsHost.get()[FilteredTriplets];
+    if (nFilteredTriplets == 0) {
+      continue;
+    }
 
     // Move the filtered triplets back to the host for the final selection.
     ACTS_CUDA_ERROR_CHECK(cudaMemcpy(
         filteredTripletsHost.get(), filteredTriplets.get(),
         nFilteredTriplets * sizeof(Triplet), cudaMemcpyDeviceToHost));
 
-    // Remember these triplets.
-    result.push_back(
-        std::vector<Triplet>(filteredTripletsHost.get(),
-                             filteredTripletsHost.get() + nFilteredTriplets));
+    // Fill the output variable.
+    for (std::size_t i = 0; i < nFilteredTriplets; ++i) {
+      // Access the triplet.
+      const Triplet& triplet = filteredTripletsHost.get()[i];
+      // Put it into the output object.
+      result[triplet.middleIndex].push_back(triplet);
+    }
   }
 
   // Return the indices of all identified triplets.
diff --git a/Plugins/Cuda/src/Utilities/Arrays.cu b/Plugins/Cuda/src/Utilities/Arrays.cu
index bc64a26e08e..4aea17d4f6a 100644
--- a/Plugins/Cuda/src/Utilities/Arrays.cu
+++ b/Plugins/Cuda/src/Utilities/Arrays.cu
@@ -9,23 +9,23 @@
 // CUDA plugin include(s).
 #include "Acts/Plugins/Cuda/Seeding2/Details/Types.hpp"
 #include "Acts/Plugins/Cuda/Utilities/Arrays.hpp"
-#include "../Utilities/ErrorCheck.cuh"
+#include "Acts/Plugins/Cuda/Utilities/MemoryManager.hpp"
+#include "ErrorCheck.cuh"
+#include "StreamHandlers.cuh"
 
 // CUDA include(s).
 #include <cuda_runtime.h>
 
+// System include(s).
+#include <cstdlib>
+
 namespace Acts {
 namespace Cuda {
 namespace Details {
 
-void DeviceArrayDeleter::operator()(void* ptr) {
-  // Ignore null-pointers.
-  if (ptr == nullptr) {
-    return;
-  }
-
-  // Free the pinned host memory.
-  ACTS_CUDA_ERROR_CHECK(cudaFree(ptr));
+void DeviceArrayDeleter::operator()(void*) {
+  // The memory is managed by @c Acts::Cuda::MemoryManager, don't do anything
+  // here.
   return;
 }
 
@@ -35,8 +35,8 @@ void HostArrayDeleter::operator()(void* ptr) {
     return;
   }
 
-  // Free the pinned host memory.
-  ACTS_CUDA_ERROR_CHECK(cudaFreeHost(ptr));
+  // Free the host memory.
+  free(ptr);
   return;
 }
 
@@ -47,7 +47,7 @@ device_array<T> make_device_array(std::size_t size) {
   // Allocate the memory.
   T* ptr = nullptr;
   if (size != 0) {
-    ACTS_CUDA_ERROR_CHECK(cudaMalloc(&ptr, size * sizeof(T)));
+    ptr = static_cast<T*>(MemoryManager::instance().allocate(size * sizeof(T)));
   }
   // Create the smart pointer.
   return device_array<T>(ptr);
@@ -58,7 +58,7 @@ host_array<T> make_host_array(std::size_t size) {
   // Allocate the memory.
   T* ptr = nullptr;
   if (size != 0) {
-    ACTS_CUDA_ERROR_CHECK(cudaMallocHost(&ptr, size * sizeof(T)));
+    ptr = static_cast<T*>(malloc(size * sizeof(T)));
   }
   // Create the smart pointer.
   return host_array<T>(ptr);
@@ -72,6 +72,15 @@ void copyToDevice(device_array<T>& dev, const host_array<T>& host,
   return;
 }
 
+template <typename T>
+void copyToDevice(device_array<T>& dev, const host_array<T>& host,
+                  std::size_t arraySize, const StreamWrapper& stream) {
+  ACTS_CUDA_ERROR_CHECK(
+      cudaMemcpyAsync(dev.get(), host.get(), arraySize * sizeof(T),
+                      cudaMemcpyHostToDevice, getStreamFrom(stream)));
+  return;
+}
+
 template <typename T>
 void copyToHost(host_array<T>& host, const device_array<T>& dev,
                 std::size_t arraySize) {
@@ -80,6 +89,15 @@ void copyToHost(host_array<T>& host, const device_array<T>& dev,
   return;
 }
 
+template <typename T>
+void copyToHost(host_array<T>& host, const device_array<T>& dev,
+                std::size_t arraySize, const StreamWrapper& stream) {
+  ACTS_CUDA_ERROR_CHECK(
+      cudaMemcpyAsync(host.get(), dev.get(), arraySize * sizeof(T),
+                      cudaMemcpyDeviceToHost, getStreamFrom(stream)));
+  return;
+}
+
 }  // namespace Cuda
 }  // namespace Acts
 
@@ -101,10 +119,18 @@ void copyToHost(host_array<T>& host, const device_array<T>& dev,
       std::unique_ptr<TYPE, Acts::Cuda::Details::DeviceArrayDeleter>&,         \
       const std::unique_ptr<TYPE, Acts::Cuda::Details::HostArrayDeleter>&,     \
       std::size_t);                                                            \
+  template void Acts::Cuda::copyToDevice<TYPE>(                                \
+      std::unique_ptr<TYPE, Acts::Cuda::Details::DeviceArrayDeleter>&,         \
+      const std::unique_ptr<TYPE, Acts::Cuda::Details::HostArrayDeleter>&,     \
+      std::size_t, const Acts::Cuda::StreamWrapper&);                          \
+  template void Acts::Cuda::copyToHost<TYPE>(                                  \
+      std::unique_ptr<TYPE, Acts::Cuda::Details::HostArrayDeleter>&,           \
+      const std::unique_ptr<TYPE, Acts::Cuda::Details::DeviceArrayDeleter>&,   \
+      std::size_t);                                                            \
   template void Acts::Cuda::copyToHost<TYPE>(                                  \
       std::unique_ptr<TYPE, Acts::Cuda::Details::HostArrayDeleter>&,           \
       const std::unique_ptr<TYPE, Acts::Cuda::Details::DeviceArrayDeleter>&,   \
-      std::size_t)
+      std::size_t, const Acts::Cuda::StreamWrapper&)
 
 // Instantiate the templated functions for all primitive types.
 INST_ARRAY_FOR_TYPE(char);
diff --git a/Plugins/Cuda/src/Utilities/Info.cu b/Plugins/Cuda/src/Utilities/Info.cu
new file mode 100644
index 00000000000..d3faf2ee976
--- /dev/null
+++ b/Plugins/Cuda/src/Utilities/Info.cu
@@ -0,0 +1,76 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// CUDA plugin include(s).
+#include "Acts/Plugins/Cuda/Utilities/Info.hpp"
+#include "ErrorCheck.cuh"
+
+// System include(s).
+#include <cmath>
+#include <iostream>
+
+namespace Acts {
+namespace Cuda {
+
+Info& Info::instance() {
+  static Info info;
+  return info;
+}
+
+const std::vector<Info::Device>& Info::devices() const {
+  return m_devices;
+}
+
+Info::Info() {
+  // Collect all information about all the available devices on
+  // construction. Note that we explicitly ignore the return value of the call,
+  // in case the code is executed without any available CUDA devices.
+  int nDevices = 0;
+  static_cast<void>(cudaGetDeviceCount(&nDevices));
+
+  for (int i = 0; i < nDevices; ++i) {
+    // Retrieve all properties of this device.
+    cudaDeviceProp properties;
+    ACTS_CUDA_ERROR_CHECK(cudaGetDeviceProperties(&properties, i));
+
+    // Create an @c Acts::Cuda::Info::Device object from the information.
+    m_devices.push_back({i, properties.name, properties.maxThreadsPerBlock,
+                         static_cast<bool>(properties.concurrentKernels),
+                         properties.totalGlobalMem});
+  }
+}
+
+std::ostream& operator<<(std::ostream& out, const Info::Device& device) {
+  out << " /-- Device ID " << device.id << " " << std::string(31, '-') << "\\"
+      << std::endl;
+  out << " | Name: " << device.name
+      << std::string(
+             (39 > device.name.length() ? 39 - device.name.length() : 0), ' ')
+      << "|" << std::endl;
+  const std::size_t threadDigits =
+      static_cast<std::size_t>(std::log10(device.maxThreadsPerBlock)) + 1;
+  out << " | Max. threads per block: " << device.maxThreadsPerBlock
+      << std::string((21 > threadDigits ? 21 - threadDigits : 0), ' ') << "|"
+      << std::endl;
+  out << " | Concurrent kernels: "
+      << (device.concurrentKernels ? "true " : "false") << std::string(20, ' ')
+      << "|" << std::endl;
+  static constexpr double MEGABYTES = 1.0 / (1024 * 1024);
+  const double totalMem = device.totalMemory * MEGABYTES;
+  const std::size_t memDigits =
+      static_cast<std::size_t>(std::log10(totalMem)) + 1;
+  out << " | Total memory: " << totalMem << " MB"
+      << std::string((25 > memDigits ? 25 - memDigits : 0), ' ') << "|"
+      << std::endl;
+  out << " \\" << std::string(46, '-') << "/";
+
+  return out;
+}
+
+}  // namespace Cuda
+}  // namespace Acts
diff --git a/Plugins/Cuda/src/Utilities/MemoryManager.cu b/Plugins/Cuda/src/Utilities/MemoryManager.cu
new file mode 100644
index 00000000000..ccc785e10c0
--- /dev/null
+++ b/Plugins/Cuda/src/Utilities/MemoryManager.cu
@@ -0,0 +1,144 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// CUDA plugin include(s).
+#include "Acts/Plugins/Cuda/Utilities/MemoryManager.hpp"
+#include "ErrorCheck.cuh"
+
+// CUDA include(s).
+#include <cuda_runtime.h>
+
+// System include(s).
+#include <cmath>
+#include <stdexcept>
+
+namespace Acts {
+namespace Cuda {
+
+MemoryManager::~MemoryManager() {
+  // Free all the allocated memory.
+  for (DeviceMemory& mem : m_memory) {
+    if (mem.m_ptr == nullptr) {
+      continue;
+    }
+    ACTS_CUDA_ERROR_CHECK(cudaFree(mem.m_ptr));
+  }
+}
+
+MemoryManager& MemoryManager::instance() {
+  static MemoryManager mm;
+  return mm;
+}
+
+void MemoryManager::setMemorySize(std::size_t sizeInBytes, int device) {
+  // If the user didn't ask for a specific device, use the one currently used by
+  // CUDA.
+  if (device == -1) {
+    ACTS_CUDA_ERROR_CHECK(cudaGetDevice(&device));
+  }
+
+  // Make sure that the internal storage variable is large enough.
+  if (static_cast<std::size_t>(device) >= m_memory.size()) {
+    m_memory.resize(device + 1);
+  }
+
+  // Get the object responsible for this device.
+  DeviceMemory& mem = m_memory[device];
+
+  // De-allocate any previously allocated memory.
+  if (mem.m_ptr) {
+    ACTS_CUDA_ERROR_CHECK(cudaFree(mem.m_ptr));
+  }
+
+  // Allocate the newly requested amount.
+  ACTS_CUDA_ERROR_CHECK(cudaSetDevice(device));
+  ACTS_CUDA_ERROR_CHECK(cudaMalloc(&(mem.m_ptr), sizeInBytes));
+
+  // Set up the internal state of the object correctly.
+  mem.m_size = sizeInBytes;
+  mem.m_nextAllocation = mem.m_ptr;
+  return;
+}
+
+std::size_t MemoryManager::availableMemory(int device) const {
+  // If the user didn't ask for a specific device, use the one currently used by
+  // CUDA.
+  if (device == -1) {
+    ACTS_CUDA_ERROR_CHECK(cudaGetDevice(&device));
+  }
+
+  // Make sure that memory was allocated on the requested device.
+  if (m_memory.size() <= static_cast<std::size_t>(device)) {
+    throw std::bad_alloc();
+  }
+  const DeviceMemory& mem = m_memory[device];
+
+  // Return the requested information.
+  return (mem.m_size - (mem.m_nextAllocation - mem.m_ptr));
+}
+
+void* MemoryManager::allocate(std::size_t sizeInBytes, int device) {
+  // If the user didn't ask for a specific device, use the one currently used by
+  // CUDA.
+  if (device == -1) {
+    ACTS_CUDA_ERROR_CHECK(cudaGetDevice(&device));
+  }
+
+  // Make sure that memory was allocated on the requested device.
+  if (m_memory.size() <= static_cast<std::size_t>(device)) {
+    throw std::bad_alloc();
+  }
+  DeviceMemory& mem = m_memory[device];
+
+  // We already know what we want to return...
+  void* result = mem.m_nextAllocation;
+
+  // Make sure that all addresses given out are 8-byte aligned.
+  static constexpr std::size_t ALIGN_SIZE = 8;
+  const std::size_t misalignment = sizeInBytes % ALIGN_SIZE;
+  const std::size_t padding =
+      ((misalignment != 0) ? (ALIGN_SIZE - misalignment) : 0);
+
+  // Increment the internal pointer.
+  mem.m_nextAllocation += sizeInBytes + padding;
+  // And make sure that we didn't run out of memory.
+  if (mem.m_nextAllocation - mem.m_ptr >= mem.m_size) {
+    throw std::bad_alloc();
+  }
+
+  // Apparently everything is okay.
+  return result;
+}
+
+void MemoryManager::reset(int device) {
+  // If the user didn't ask for a specific device, use the one currently used by
+  // CUDA.
+  if (device == -1) {
+    ACTS_CUDA_ERROR_CHECK(cudaGetDevice(&device));
+  }
+
+  // Make sure that memory was allocated on the requested device.
+  if (m_memory.size() <= static_cast<std::size_t>(device)) {
+    throw std::bad_alloc();
+  }
+  DeviceMemory& mem = m_memory[device];
+
+  // Note down how much memory was used in total until the reset.
+  mem.m_maxUsage = std::max(mem.m_maxUsage, mem.m_nextAllocation - mem.m_ptr);
+  // Return the internal pointer to its startout location.
+  mem.m_nextAllocation = mem.m_ptr;
+  return;
+}
+
+MemoryManager::MemoryManager() {
+  // Allocate 1500 MBs of memory as a start on the default device.
+  setMemorySize(1500 * 1024l * 1024l);
+}
+
+}  // namespace Cuda
+}  // namespace Acts
diff --git a/Plugins/Cuda/src/Utilities/StreamHandlers.cuh b/Plugins/Cuda/src/Utilities/StreamHandlers.cuh
new file mode 100644
index 00000000000..e1191ef45a5
--- /dev/null
+++ b/Plugins/Cuda/src/Utilities/StreamHandlers.cuh
@@ -0,0 +1,26 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#pragma once
+
+// CUDA plugin include(s).
+#include "Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp"
+
+// CUDA include(s).
+#include <cuda_runtime.h>
+
+namespace Acts {
+namespace Cuda {
+
+/// Get the @c cudaStream_t value out of an @c Acts::Cuda::StreamWrapper object
+inline cudaStream_t getStreamFrom(const StreamWrapper& wrapper) {
+  return static_cast<cudaStream_t>(wrapper.m_stream);
+}
+
+}  // namespace Cuda
+}  // namespace Acts
diff --git a/Plugins/Cuda/src/Utilities/StreamWrapper.cu b/Plugins/Cuda/src/Utilities/StreamWrapper.cu
new file mode 100644
index 00000000000..c20e7ccba2c
--- /dev/null
+++ b/Plugins/Cuda/src/Utilities/StreamWrapper.cu
@@ -0,0 +1,75 @@
+// This file is part of the Acts project.
+//
+// Copyright (C) 2020 CERN for the benefit of the Acts project
+//
+// This Source Code Form is subject to the terms of the Mozilla Public
+// License, v. 2.0. If a copy of the MPL was not distributed with this
+// file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+// CUDA plugin include(s).
+#include "Acts/Plugins/Cuda/Utilities/StreamWrapper.hpp"
+#include "ErrorCheck.cuh"
+#include "StreamHandlers.cuh"
+
+// CUDA include(s).
+#include <cuda_runtime.h>
+
+namespace Acts {
+namespace Cuda {
+
+StreamWrapper::StreamWrapper(void* stream, bool ownsStream)
+    : m_stream(stream), m_ownsStream(ownsStream) {}
+
+StreamWrapper::StreamWrapper(StreamWrapper&& parent)
+    : m_stream(parent.m_stream), m_ownsStream(parent.m_ownsStream) {
+  parent.m_stream = nullptr;
+  parent.m_ownsStream = false;
+}
+
+StreamWrapper::~StreamWrapper() {
+  // Destroy the stream, if we still hold it.
+  if (m_stream && m_ownsStream) {
+    ACTS_CUDA_ERROR_CHECK(cudaStreamDestroy(getStreamFrom(*this)));
+  }
+}
+
+StreamWrapper& StreamWrapper::operator=(StreamWrapper&& rhs) {
+  // Check whether anything needs to be done.
+  if (this == &rhs) {
+    return *this;
+  }
+
+  // Destroy the current stream, if we hold one.
+  if (m_stream && m_ownsStream) {
+    ACTS_CUDA_ERROR_CHECK(cudaStreamDestroy(getStreamFrom(*this)));
+  }
+
+  // Perform the move.
+  m_stream = rhs.m_stream;
+  m_ownsStream = rhs.m_ownsStream;
+  rhs.m_stream = nullptr;
+  rhs.m_ownsStream = false;
+
+  // Return this object.
+  return *this;
+}
+
+void StreamWrapper::synchronize() const {
+  // Use CUDA to wait for all tasks to finish in the stream.
+  ACTS_CUDA_ERROR_CHECK(cudaStreamSynchronize(getStreamFrom(*this)));
+  return;
+}
+
+StreamWrapper createStreamFor(const Acts::Cuda::Info::Device& device) {
+  // Create the stream for the selected device.
+  ACTS_CUDA_ERROR_CHECK(cudaSetDevice(device.id));
+  cudaStream_t stream = nullptr;
+  ACTS_CUDA_ERROR_CHECK(
+      cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
+
+  // Return the new object.
+  return StreamWrapper(stream);
+}
+
+}  // namespace Cuda
+}  // namespace Acts

From 367748bff0cafdd3e024fb1e58c96038fee49543 Mon Sep 17 00:00:00 2001
From: Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>
Date: Wed, 26 Aug 2020 15:59:04 +0200
Subject: [PATCH 2/4] Taught the unit test about some of the new plugin
 features.

Mainly to be able to specify which CUDA device to run on, and
how much memory to use from that device.
---
 .../Cuda/Seeding2/CommandLineArguments.cpp    |  8 ++++++-
 .../Cuda/Seeding2/CommandLineArguments.hpp    |  6 +++++
 .../UnitTests/Plugins/Cuda/Seeding2/main.cpp  | 23 +++++++++++++++++++
 3 files changed, 36 insertions(+), 1 deletion(-)

diff --git a/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.cpp b/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.cpp
index 9b15e0f8de1..16ee26f60dc 100644
--- a/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.cpp
+++ b/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.cpp
@@ -34,7 +34,11 @@ void CommandLineArguments::interpret(int argc, char* argv[]) {
       "The number of groups to process as a maximum")(
       "filterDuplicates,d", po::bool_switch(),
       "Look for spacepoint duplicates in the input file, and remove them "
-      "(slow!)");
+      "(slow!)")("cudaDevice", po::value<int>()->default_value(0),
+                 "The CUDA device to use in the test")(
+      "cudaDeviceMemory", po::value<unsigned int>()->default_value(0),
+      "The amount of CUDA device memory to use, in megabytes. By default it is"
+      " 80% of the available amount.");
 
   // Parse the command line arguments.
   po::variables_map vm;
@@ -53,5 +57,7 @@ void CommandLineArguments::interpret(int argc, char* argv[]) {
   onlyGPU = vm["onlyGPU"].as<bool>();
   groupsToIterate = vm["groupsToIterate"].as<unsigned int>();
   filterDuplicates = vm["filterDuplicates"].as<bool>();
+  cudaDevice = vm["cudaDevice"].as<int>();
+  cudaDeviceMemory = vm["cudaDeviceMemory"].as<unsigned int>();
   return;
 }
diff --git a/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.hpp b/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.hpp
index f714e864147..6050daf6486 100644
--- a/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.hpp
+++ b/Tests/UnitTests/Plugins/Cuda/Seeding2/CommandLineArguments.hpp
@@ -26,6 +26,12 @@ struct CommandLineArguments {
   /// Look for spacepoint duplicates in the received input file, and remove them
   bool filterDuplicates = false;
 
+  /// The CUDA device to use
+  int cudaDevice = 0;
+  /// Memory to use on the CUDA device in megabytes (by default it's 80% of the
+  /// available)
+  std::size_t cudaDeviceMemory = 0;
+
   /// Interpret the command line arguments of the test executable
   void interpret(int argc, char* argv[]);
 
diff --git a/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp b/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp
index e1b9e1e5397..1b1bee1e087 100644
--- a/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp
+++ b/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp
@@ -15,6 +15,8 @@
 
 // CUDA plugin include(s).
 #include "Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp"
+#include "Acts/Plugins/Cuda/Utilities/Info.hpp"
+#include "Acts/Plugins/Cuda/Utilities/MemoryManager.hpp"
 
 // Acts include(s).
 #include "Acts/Seeding/BinFinder.hpp"
@@ -106,6 +108,27 @@ int main(int argc, char* argv[]) {
   // Make a convenient iterator that will be used multiple times later on.
   auto spGroup_end = spGroup.end();
 
+  // Allocate memory on the selected CUDA device.
+  if (Acts::Cuda::Info::instance().devices().size() <=
+      static_cast<std::size_t>(cmdl.cudaDevice)) {
+    std::cerr << "Invalid CUDA device (" << cmdl.cudaDevice << ") requested"
+              << std::endl;
+    return 1;
+  }
+  static constexpr std::size_t MEGABYTES = 1024l * 1024l;
+  std::size_t deviceMemoryAllocation = cmdl.cudaDeviceMemory * MEGABYTES;
+  if (deviceMemoryAllocation == 0) {
+    deviceMemoryAllocation =
+        Acts::Cuda::Info::instance().devices()[cmdl.cudaDevice].totalMemory *
+        0.8;
+  }
+  std::cout << "Allocating " << deviceMemoryAllocation / MEGABYTES
+            << " MB memory on device:\n"
+            << Acts::Cuda::Info::instance().devices()[cmdl.cudaDevice]
+            << std::endl;
+  Acts::Cuda::MemoryManager::instance().setMemorySize(deviceMemoryAllocation,
+                                                      cmdl.cudaDevice);
+
   // Set up the seedfinder configuration objects.
   TestHostCuts hostCuts;
   Acts::SeedFilterConfig filterConfig;

From b51e9dd91d36ed3d65212042f4e645ab02e6b4d2 Mon Sep 17 00:00:00 2001
From: Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>
Date: Fri, 9 Oct 2020 14:05:30 +0200
Subject: [PATCH 3/4] Updated Acts::Cuda::SeedFinder to allow the user to give
 it a custom logger.

---
 .../Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp  | 18 +++++++++++++++++-
 .../Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp  | 13 +++++++++----
 2 files changed, 26 insertions(+), 5 deletions(-)

diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp
index 1592f65a691..fe98542aa17 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.hpp
@@ -17,6 +17,9 @@
 #include "Acts/Seeding/SeedfinderConfig.hpp"
 #include "Acts/Utilities/Logger.hpp"
 
+// System include(s).
+#include <memory>
+
 namespace Acts {
 namespace Cuda {
 
@@ -39,7 +42,8 @@ class SeedFinder {
   SeedFinder(SeedfinderConfig<external_spacepoint_t> commonConfig,
              const SeedFilterConfig& seedFilterConfig,
              const TripletFilterConfig& tripletFilterConfig, int device = 0,
-             Acts::Logging::Level loggerLevel = Acts::Logging::INFO);
+             std::unique_ptr<const Logger> logger =
+                 getDefaultLogger("Cuda::SeedFinder", Logging::INFO));
 
   /// Create all seeds from the space points in the three iterators.
   /// Can be used to parallelize the seed creation
@@ -53,7 +57,17 @@ class SeedFinder {
   std::vector<Seed<external_spacepoint_t> > createSeedsForGroup(
       sp_range_t bottomSPs, sp_range_t middleSPs, sp_range_t topSPs) const;
 
+  /// set logging instance
+  ///
+  /// @param [in] newLogger is the logging istance to be set
+  void setLogger(std::unique_ptr<const Logger> newLogger);
+
  private:
+  /// Private access to the logger
+  ///
+  /// @return a const reference to the logger
+  const Logger& logger() const { return *m_logger; }
+
   /// Configuration for the seed finder
   SeedfinderConfig<external_spacepoint_t> m_commonConfig;
   /// Configuration for the (host) seed filter
@@ -62,6 +76,8 @@ class SeedFinder {
   TripletFilterConfig m_tripletFilterConfig;
   /// CUDA device identifier
   int m_device;
+  /// The logger object
+  std::unique_ptr<const Logger> m_logger;
 };
 
 }  // namespace Cuda
diff --git a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
index 5fed388d88d..e1862d33bf4 100644
--- a/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
+++ b/Plugins/Cuda/include/Acts/Plugins/Cuda/Seeding2/SeedFinder.ipp
@@ -33,11 +33,12 @@ SeedFinder<external_spacepoint_t>::SeedFinder(
     Acts::SeedfinderConfig<external_spacepoint_t> commonConfig,
     const SeedFilterConfig& seedFilterConfig,
     const TripletFilterConfig& tripletFilterConfig, int device,
-    Acts::Logging::Level loggerLevel)
+    std::unique_ptr<const Logger> incomingLogger)
     : m_commonConfig(std::move(commonConfig)),
       m_seedFilterConfig(seedFilterConfig),
       m_tripletFilterConfig(tripletFilterConfig),
-      m_device(device) {
+      m_device(device),
+      m_logger(std::move(incomingLogger)) {
   // calculation of scattering using the highland formula
   // convert pT to p once theta angle is known
   m_commonConfig.highland =
@@ -56,8 +57,6 @@ SeedFinder<external_spacepoint_t>::SeedFinder(
       std::pow(m_commonConfig.highland / m_commonConfig.pTPerHelixRadius, 2);
 
   // Tell the user what CUDA device will be used by the object.
-  ACTS_LOCAL_LOGGER(
-      Acts::getDefaultLogger("Acts::Cuda::SeedFinder", loggerLevel));
   if (static_cast<std::size_t>(m_device) < Info::instance().devices().size()) {
     ACTS_DEBUG("Will be using device:\n"
                << Info::instance().devices()[m_device]);
@@ -215,5 +214,11 @@ SeedFinder<external_spacepoint_t>::createSeedsForGroup(
   return outputVec;
 }
 
+template <typename external_spacepoint_t>
+void SeedFinder<external_spacepoint_t>::setLogger(
+    std::unique_ptr<const Logger> newLogger) {
+  return m_logger.swap(newLogger);
+}
+
 }  // namespace Cuda
 }  // namespace Acts

From 6f08696eb8cd5faa0418f68668b6bc4df68f53a8 Mon Sep 17 00:00:00 2001
From: Attila Krasznahorkay <Attila.Krasznahorkay@cern.ch>
Date: Fri, 9 Oct 2020 14:07:05 +0200
Subject: [PATCH 4/4] Added an explicit specification for which CUDA device
 should be used.

It was left out of the code by mistake so far...
---
 Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp b/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp
index 1b1bee1e087..05d137a049e 100644
--- a/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp
+++ b/Tests/UnitTests/Plugins/Cuda/Seeding2/main.cpp
@@ -139,7 +139,7 @@ int main(int argc, char* argv[]) {
   // Set up the seedfinder objects.
   Acts::Seedfinder<TestSpacePoint> seedfinder_host(sfConfig);
   Acts::Cuda::SeedFinder<TestSpacePoint> seedfinder_device(
-      sfConfig, filterConfig, deviceCuts);
+      sfConfig, filterConfig, deviceCuts, cmdl.cudaDevice);
 
   //
   // Perform the seed finding on the host.