From f7280ff6f41ab621cff9bb1b25179450ca567493 Mon Sep 17 00:00:00 2001
From: Vincenzo Innocente
Date: Mon, 30 Nov 2020 16:24:21 +0100
Subject: [PATCH] make clusterizer kernels independent of grid size

Run findClus and clusterChargeCut as grid-stride loops over the modules
instead of assuming one block per module, so the kernels produce the same
result for any grid size. Adapt the host-side test to call each kernel
once instead of emulating one block per module.
---
 .../plugins/gpuClusterChargeCut.h             | 173 ++++----
 .../plugins/gpuClustering.h                   | 409 +++++++++---------
 .../SiPixelClusterizer/test/gpuClustering_t.h |  22 +-
 3 files changed, 298 insertions(+), 306 deletions(-)

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
index b781b10792fff..8f45d35b267b1 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusterChargeCut.h
@@ -19,105 +19,106 @@ namespace gpuClustering {
       uint32_t const* __restrict__ moduleId,  // module id of each module
       int32_t* __restrict__ clusterId,        // modified: cluster id of each pixel
       uint32_t numElements) {
-    if (blockIdx.x >= moduleStart[0])
-      return;
-
-    auto firstPixel = moduleStart[1 + blockIdx.x];
-    auto thisModuleId = id[firstPixel];
-    assert(thisModuleId < MaxNumModules);
-    assert(thisModuleId == moduleId[blockIdx.x]);
+    __shared__ int32_t charge[MaxNumClustersPerModules];
+    __shared__ uint8_t ok[MaxNumClustersPerModules];
+    __shared__ uint16_t newclusId[MaxNumClustersPerModules];
 
-    auto nclus = nClustersInModule[thisModuleId];
-    if (nclus == 0)
-      return;
+    auto firstModule = blockIdx.x;
+    auto endModule = moduleStart[0];
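+    // grid-stride loop: each block starts at module blockIdx.x and advances by
+    // gridDim.x, so the result no longer depends on the grid size of the launch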
+    for (auto module = firstModule; module < endModule; module += gridDim.x) {
+      auto firstPixel = moduleStart[1 + module];
+      auto thisModuleId = id[firstPixel];
+      assert(thisModuleId < MaxNumModules);
+      assert(thisModuleId == moduleId[module]);
+
+      auto nclus = nClustersInModule[thisModuleId];
+      if (nclus == 0)
+        continue;
+
+      if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules)
+        printf("Warning too many clusters in module %d in block %d: %d > %d\n",
+               thisModuleId,
+               blockIdx.x,
+               nclus,
+               MaxNumClustersPerModules);
+
+      auto first = firstPixel + threadIdx.x;
+
+      if (nclus > MaxNumClustersPerModules) {
+        // remove excess FIXME find a way to cut charge first....
+        for (auto i = first; i < numElements; i += blockDim.x) {
+          if (id[i] == InvId)
+            continue;  // not valid
+          if (id[i] != thisModuleId)
+            break;  // end of module
+          if (clusterId[i] >= MaxNumClustersPerModules) {
+            id[i] = InvId;
+            clusterId[i] = InvId;
+          }
+        }
+        nclus = MaxNumClustersPerModules;
+      }
 
-    if (threadIdx.x == 0 && nclus > MaxNumClustersPerModules)
-      printf("Warning too many clusters in module %d in block %d: %d > %d\n",
-             thisModuleId,
-             blockIdx.x,
-             nclus,
-             MaxNumClustersPerModules);
+#ifdef GPU_DEBUG
+      if (thisModuleId % 100 == 1)
+        if (threadIdx.x == 0)
+          printf("start cluster charge cut for module %d in block %d\n", thisModuleId, blockIdx.x);
+#endif
 
-    auto first = firstPixel + threadIdx.x;
+      assert(nclus <= MaxNumClustersPerModules);
+      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
+        charge[i] = 0;
+      }
+      __syncthreads();
 
-    if (nclus > MaxNumClustersPerModules) {
-      // remove excess FIXME find a way to cut charge first....
       for (auto i = first; i < numElements; i += blockDim.x) {
         if (id[i] == InvId)
           continue;  // not valid
         if (id[i] != thisModuleId)
           break;  // end of module
-        if (clusterId[i] >= MaxNumClustersPerModules) {
-          id[i] = InvId;
-          clusterId[i] = InvId;
-        }
+        atomicAdd(&charge[clusterId[i]], adc[i]);
       }
-      nclus = MaxNumClustersPerModules;
-    }
+      __syncthreads();
 
-#ifdef GPU_DEBUG
-    if (thisModuleId % 100 == 1)
-      if (threadIdx.x == 0)
-        printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
-#endif
+      auto chargeCut = thisModuleId < 96 ? 2000 : 4000;  // move in constants (calib?)
+      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
+        newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0;
+      }
 
-    __shared__ int32_t charge[MaxNumClustersPerModules];
-    __shared__ uint8_t ok[MaxNumClustersPerModules];
-    __shared__ uint16_t newclusId[MaxNumClustersPerModules];
+      __syncthreads();
+
+      // renumber
+      __shared__ uint16_t ws[32];
+      cms::cuda::blockPrefixScan(newclusId, nclus, ws);
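+      // blockPrefixScan leaves in newclusId the running count of clusters that
+      // pass the cut; newclusId[nclus - 1] is the total number of survivors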
+
+      assert(nclus >= newclusId[nclus - 1]);
+
+      if (nclus == newclusId[nclus - 1])
+        continue;
+
+      nClustersInModule[thisModuleId] = newclusId[nclus - 1];
+      __syncthreads();
+
+      // mark bad cluster again
+      for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
+        if (0 == ok[i])
+          newclusId[i] = InvId + 1;
+      }
+      __syncthreads();
+
+      // reassign id
+      for (auto i = first; i < numElements; i += blockDim.x) {
+        if (id[i] == InvId)
+          continue;  // not valid
+        if (id[i] != thisModuleId)
+          break;  // end of module
+        clusterId[i] = newclusId[clusterId[i]] - 1;
+        if (clusterId[i] == InvId)
+          id[i] = InvId;
+      }
 
-    assert(nclus <= MaxNumClustersPerModules);
-    for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
-      charge[i] = 0;
-    }
-    __syncthreads();
-
-    for (auto i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)
-        continue;  // not valid
-      if (id[i] != thisModuleId)
-        break;  // end of module
-      atomicAdd(&charge[clusterId[i]], adc[i]);
-    }
-    __syncthreads();
-
-    auto chargeCut = thisModuleId < 96 ? 2000 : 4000;  // move in constants (calib?)
-    for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
-      newclusId[i] = ok[i] = charge[i] > chargeCut ? 1 : 0;
-    }
-
-    __syncthreads();
-
-    // renumber
-    __shared__ uint16_t ws[32];
-    cms::cuda::blockPrefixScan(newclusId, nclus, ws);
-
-    assert(nclus >= newclusId[nclus - 1]);
-
-    if (nclus == newclusId[nclus - 1])
-      return;
-
-    nClustersInModule[thisModuleId] = newclusId[nclus - 1];
-    __syncthreads();
-
-    // mark bad cluster again
-    for (auto i = threadIdx.x; i < nclus; i += blockDim.x) {
-      if (0 == ok[i])
-        newclusId[i] = InvId + 1;
-    }
-    __syncthreads();
-
-    // reassign id
-    for (auto i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)
-        continue;  // not valid
-      if (id[i] != thisModuleId)
-        break;  // end of module
-      clusterId[i] = newclusId[clusterId[i]] - 1;
-      if (clusterId[i] == InvId)
-        id[i] = InvId;
-    }
-
-    //done
+      //done
+    }  // loop on modules
   }
 
 }  // namespace gpuClustering

diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
index 16c181a431ce8..3bf42c8265b1e 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/gpuClustering.h
@@ -47,260 +47,261 @@ namespace gpuClustering {
       uint32_t* __restrict__ moduleId,  // output: module id of each module
       int32_t* __restrict__ clusterId,  // output: cluster id of each pixel
       int numElements) {
-    if (blockIdx.x >= moduleStart[0])
-      return;
+    __shared__ int msize;
 
-    auto firstPixel = moduleStart[1 + blockIdx.x];
-    auto thisModuleId = id[firstPixel];
-    assert(thisModuleId < MaxNumModules);
+    auto firstModule = blockIdx.x;
+    auto endModule = moduleStart[0];
+    for (auto module = firstModule; module < endModule; module += gridDim.x) {
+      auto firstPixel = moduleStart[1 + module];
+      auto thisModuleId = id[firstPixel];
+      assert(thisModuleId < MaxNumModules);
 
 #ifdef GPU_DEBUG
-    if (thisModuleId % 100 == 1)
-      if (threadIdx.x == 0)
-        printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
+      if (thisModuleId % 100 == 1)
+        if (threadIdx.x == 0)
+          printf("start clusterizer for module %d in block %d\n", thisModuleId, blockIdx.x);
 #endif
 
-    auto first = firstPixel + threadIdx.x;
+      auto first = firstPixel + threadIdx.x;
 
-    // find the index of the first pixel not belonging to this module (or invalid)
-    __shared__ int msize;
-    msize = numElements;
-    __syncthreads();
+      // find the index of the first pixel not belonging to this module (or invalid)
+      msize = numElements;
+      __syncthreads();
 
-    // skip threads not associated to an existing pixel
-    for (int i = first; i < numElements; i += blockDim.x) {
-      if (id[i] == InvId)  // skip invalid pixels
-        continue;
-      if (id[i] != thisModuleId) {  // find the first pixel in a different module
-        atomicMin(&msize, i);
-        break;
+      // skip threads not associated to an existing pixel
+      for (int i = first; i < numElements; i += blockDim.x) {
+        if (id[i] == InvId)  // skip invalid pixels
+          continue;
+        if (id[i] != thisModuleId) {  // find the first pixel in a different module
+          atomicMin(&msize, i);
+          break;
+        }
       }
-    }
 
-    //init hist (ymax=416 < 512 : 9bits)
-    constexpr uint32_t maxPixInModule = 4000;
-    constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
-    using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
-    __shared__ Hist hist;
-    __shared__ typename Hist::Counter ws[32];
-    for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
-      hist.off[j] = 0;
-    }
-    __syncthreads();
+      //init hist (ymax=416 < 512 : 9bits)
+      constexpr uint32_t maxPixInModule = 4000;
+      constexpr auto nbins = phase1PixelTopology::numColsInModule + 2;  //2+2;
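+      // bin the pixels by column (y): a pixel's neighbours can then be found by
+      // scanning only its own column and the next one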
+      using Hist = cms::cuda::HistoContainer<uint16_t, nbins, maxPixInModule, 9, uint16_t>;
+      __shared__ Hist hist;
+      __shared__ typename Hist::Counter ws[32];
+      for (auto j = threadIdx.x; j < Hist::totbins(); j += blockDim.x) {
+        hist.off[j] = 0;
+      }
+      __syncthreads();
 
-    assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
+      assert((msize == numElements) or ((msize < numElements) and (id[msize] != thisModuleId)));
 
-    // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
-    if (0 == threadIdx.x) {
-      if (msize - firstPixel > maxPixInModule) {
-        printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
-        msize = maxPixInModule + firstPixel;
+      // limit to maxPixInModule (FIXME if recurrent (and not limited to simulation with low threshold) one will need to implement something cleverer)
+      if (0 == threadIdx.x) {
+        if (msize - firstPixel > maxPixInModule) {
+          printf("too many pixels in module %d: %d > %d\n", thisModuleId, msize - firstPixel, maxPixInModule);
+          msize = maxPixInModule + firstPixel;
+        }
       }
-    }
-    __syncthreads();
-    assert(msize - firstPixel <= maxPixInModule);
+      __syncthreads();
+      assert(msize - firstPixel <= maxPixInModule);
 
 #ifdef GPU_DEBUG
-    __shared__ uint32_t totGood;
-    totGood = 0;
-    __syncthreads();
+      __shared__ uint32_t totGood;
+      totGood = 0;
+      __syncthreads();
 #endif
 
-    // fill histo
-    for (int i = first; i < msize; i += blockDim.x) {
-      if (id[i] == InvId)  // skip invalid pixels
-        continue;
-      hist.count(y[i]);
+      // fill histo
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)  // skip invalid pixels
+          continue;
+        hist.count(y[i]);
 #ifdef GPU_DEBUG
-      atomicAdd(&totGood, 1);
+        atomicAdd(&totGood, 1);
 #endif
-    }
-    __syncthreads();
-    if (threadIdx.x < 32)
-      ws[threadIdx.x] = 0;  // used by prefix scan...
-    __syncthreads();
-    hist.finalize(ws);
-    __syncthreads();
+      }
+      __syncthreads();
+      if (threadIdx.x < 32)
+        ws[threadIdx.x] = 0;  // used by prefix scan...
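+      // finalize() runs a block-wide prefix scan (using ws) over the per-column
+      // counts, turning them into offsets into the histogram storage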
+      __syncthreads();
+      hist.finalize(ws);
+      __syncthreads();
 
 #ifdef GPU_DEBUG
-    assert(hist.size() == totGood);
-    if (thisModuleId % 100 == 1)
-      if (threadIdx.x == 0)
-        printf("histo size %d\n", hist.size());
+      assert(hist.size() == totGood);
+      if (thisModuleId % 100 == 1)
+        if (threadIdx.x == 0)
+          printf("histo size %d\n", hist.size());
 #endif
 
-    for (int i = first; i < msize; i += blockDim.x) {
-      if (id[i] == InvId)  // skip invalid pixels
-        continue;
-      hist.fill(y[i], i - firstPixel);
-    }
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)  // skip invalid pixels
+          continue;
+        hist.fill(y[i], i - firstPixel);
+      }
 
 #ifdef __CUDA_ARCH__
-    // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
-    constexpr int maxiter = 16;
+      // assume that we can cover the whole module with up to 16 blockDim.x-wide iterations
+      constexpr int maxiter = 16;
 #else
-    auto maxiter = hist.size();
+      auto maxiter = hist.size();
 #endif
-    // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
-    constexpr int maxNeighbours = 10;
-    assert((hist.size() / blockDim.x) <= maxiter);
-    // nearest neighbour
-    uint16_t nn[maxiter][maxNeighbours];
-    uint8_t nnn[maxiter];  // number of nn
-    for (uint32_t k = 0; k < maxiter; ++k)
-      nnn[k] = 0;
+      // allocate space for duplicate pixels: a pixel can appear more than once with different charge in the same event
+      constexpr int maxNeighbours = 10;
+      assert((hist.size() / blockDim.x) <= maxiter);
+      // nearest neighbour
+      uint16_t nn[maxiter][maxNeighbours];
+      uint8_t nnn[maxiter];  // number of nn
+      for (uint32_t k = 0; k < maxiter; ++k)
+        nnn[k] = 0;
 
-    __syncthreads();  // for hit filling!
+      __syncthreads();  // for hit filling!
 
 #ifdef GPU_DEBUG
-    // look for anomalous high occupancy
-    __shared__ uint32_t n40, n60;
-    n40 = n60 = 0;
-    __syncthreads();
-    for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
-      if (hist.size(j) > 60)
-        atomicAdd(&n60, 1);
-      if (hist.size(j) > 40)
-        atomicAdd(&n40, 1);
-    }
-    __syncthreads();
-    if (0 == threadIdx.x) {
-      if (n60 > 0)
-        printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
-      else if (n40 > 0)
-        printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
-    }
-    __syncthreads();
+      // look for anomalous high occupancy
+      __shared__ uint32_t n40, n60;
+      n40 = n60 = 0;
+      __syncthreads();
+      for (auto j = threadIdx.x; j < Hist::nbins(); j += blockDim.x) {
+        if (hist.size(j) > 60)
+          atomicAdd(&n60, 1);
+        if (hist.size(j) > 40)
+          atomicAdd(&n40, 1);
+      }
+      __syncthreads();
+      if (0 == threadIdx.x) {
+        if (n60 > 0)
+          printf("columns with more than 60 px %d in %d\n", n60, thisModuleId);
+        else if (n40 > 0)
+          printf("columns with more than 40 px %d in %d\n", n40, thisModuleId);
+      }
+      __syncthreads();
 #endif
 
-    // fill NN
-    for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
-      assert(k < maxiter);
-      auto p = hist.begin() + j;
-      auto i = *p + firstPixel;
-      assert(id[i] != InvId);
-      assert(id[i] == thisModuleId);  // same module
-      int be = Hist::bin(y[i] + 1);
-      auto e = hist.end(be);
-      ++p;
-      assert(0 == nnn[k]);
-      for (; p < e; ++p) {
-        auto m = (*p) + firstPixel;
-        assert(m != i);
-        assert(int(y[m]) - int(y[i]) >= 0);
-        assert(int(y[m]) - int(y[i]) <= 1);
-        if (std::abs(int(x[m]) - int(x[i])) > 1)
-          continue;
-        auto l = nnn[k]++;
-        assert(l < maxNeighbours);
-        nn[k][l] = *p;
-      }
-    }
+      // fill NN
+      for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
+        assert(k < maxiter);
+        auto p = hist.begin() + j;
+        auto i = *p + firstPixel;
+        assert(id[i] != InvId);
+        assert(id[i] == thisModuleId);  // same module
+        int be = Hist::bin(y[i] + 1);
+        auto e = hist.end(be);
+        ++p;
+        assert(0 == nnn[k]);
+        for (; p < e; ++p) {
+          auto m = (*p) + firstPixel;
+          assert(m != i);
+          assert(int(y[m]) - int(y[i]) >= 0);
+          assert(int(y[m]) - int(y[i]) <= 1);
+          if (std::abs(int(x[m]) - int(x[i])) > 1)
+            continue;
+          auto l = nnn[k]++;
+          assert(l < maxNeighbours);
+          nn[k][l] = *p;
+        }
+      }
 
-    // for each pixel, look at all the pixels until the end of the module;
-    // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
-    // after the loop, all the pixel in each cluster should have the id equeal to the lowest
-    // pixel in the cluster ( clus[i] == i ).
-    bool more = true;
-    int nloops = 0;
-    while (__syncthreads_or(more)) {
-      if (1 == nloops % 2) {
-        for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
-          auto p = hist.begin() + j;
-          auto i = *p + firstPixel;
-          auto m = clusterId[i];
-          while (m != clusterId[m])
-            m = clusterId[m];
-          clusterId[i] = m;
-        }
-      } else {
-        more = false;
-        for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
-          auto p = hist.begin() + j;
-          auto i = *p + firstPixel;
-          for (int kk = 0; kk < nnn[k]; ++kk) {
-            auto l = nn[k][kk];
-            auto m = l + firstPixel;
-            assert(m != i);
-            auto old = atomicMin(&clusterId[m], clusterId[i]);
-            if (old != clusterId[i]) {
-              // end the loop only if no changes were applied
-              more = true;
-            }
-            atomicMin(&clusterId[i], old);
-          }  // nnloop
-        }    // pixel loop
-      }
-      ++nloops;
-    }  // end while
+      // for each pixel, look at all the pixels until the end of the module;
+      // when two valid pixels within +/- 1 in x or y are found, set their id to the minimum;
+      // after the loop, all the pixels in each cluster should have the id equal to the lowest
+      // pixel in the cluster ( clus[i] == i ).
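+      // even passes (including the first) propagate the minimum id to the
+      // neighbours with atomicMin; odd passes compress the id chains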
+      bool more = true;
+      int nloops = 0;
+      while (__syncthreads_or(more)) {
+        if (1 == nloops % 2) {
+          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
+            auto p = hist.begin() + j;
+            auto i = *p + firstPixel;
+            auto m = clusterId[i];
+            while (m != clusterId[m])
+              m = clusterId[m];
+            clusterId[i] = m;
+          }
+        } else {
+          more = false;
+          for (auto j = threadIdx.x, k = 0U; j < hist.size(); j += blockDim.x, ++k) {
+            auto p = hist.begin() + j;
+            auto i = *p + firstPixel;
+            for (int kk = 0; kk < nnn[k]; ++kk) {
+              auto l = nn[k][kk];
+              auto m = l + firstPixel;
+              assert(m != i);
+              auto old = atomicMin(&clusterId[m], clusterId[i]);
+              if (old != clusterId[i]) {
+                // end the loop only if no changes were applied
+                more = true;
+              }
+              atomicMin(&clusterId[i], old);
+            }  // nnloop
+          }    // pixel loop
+        }
+        ++nloops;
+      }  // end while
 
 #ifdef GPU_DEBUG
-    {
-      __shared__ int n0;
-      if (threadIdx.x == 0)
-        n0 = nloops;
-      __syncthreads();
-      auto ok = n0 == nloops;
-      assert(__syncthreads_and(ok));
-      if (thisModuleId % 100 == 1)
-        if (threadIdx.x == 0)
-          printf("# loops %d\n", nloops);
-    }
+      {
+        __shared__ int n0;
+        if (threadIdx.x == 0)
+          n0 = nloops;
+        __syncthreads();
+        auto ok = n0 == nloops;
+        assert(__syncthreads_and(ok));
+        if (thisModuleId % 100 == 1)
+          if (threadIdx.x == 0)
+            printf("# loops %d\n", nloops);
+      }
 #endif
 
-    __shared__ unsigned int foundClusters;
-    foundClusters = 0;
-    __syncthreads();
+      __shared__ unsigned int foundClusters;
+      foundClusters = 0;
+      __syncthreads();
 
-    // find the number of different clusters, identified by a pixels with clus[i] == i;
-    // mark these pixels with a negative id.
-    for (int i = first; i < msize; i += blockDim.x) {
-      if (id[i] == InvId)  // skip invalid pixels
-        continue;
-      if (clusterId[i] == i) {
-        auto old = atomicInc(&foundClusters, 0xffffffff);
-        clusterId[i] = -(old + 1);
-      }
-    }
-    __syncthreads();
+      // find the number of different clusters, identified by pixels with clus[i] == i;
+      // mark these pixels with a negative id.
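+      // atomicInc returns the previous count, so the cluster seeds receive the
+      // provisional ids -1, -2, ...; negative values mark them as seeds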
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)  // skip invalid pixels
+          continue;
+        if (clusterId[i] == i) {
+          auto old = atomicInc(&foundClusters, 0xffffffff);
+          clusterId[i] = -(old + 1);
+        }
+      }
+      __syncthreads();
 
-    // propagate the negative id to all the pixels in the cluster.
-    for (int i = first; i < msize; i += blockDim.x) {
-      if (id[i] == InvId)  // skip invalid pixels
-        continue;
-      if (clusterId[i] >= 0) {
-        // mark each pixel in a cluster with the same id as the first one
-        clusterId[i] = clusterId[clusterId[i]];
-      }
-    }
-    __syncthreads();
+      // propagate the negative id to all the pixels in the cluster.
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId)  // skip invalid pixels
+          continue;
+        if (clusterId[i] >= 0) {
+          // mark each pixel in a cluster with the same id as the first one
+          clusterId[i] = clusterId[clusterId[i]];
+        }
+      }
+      __syncthreads();
 
-    // adjust the cluster id to be a positive value starting from 0
-    for (int i = first; i < msize; i += blockDim.x) {
-      if (id[i] == InvId) {  // skip invalid pixels
-        clusterId[i] = -9999;
-        continue;
+      // adjust the cluster id to be a positive value starting from 0
+      for (int i = first; i < msize; i += blockDim.x) {
+        if (id[i] == InvId) {  // skip invalid pixels
+          clusterId[i] = -9999;
+          continue;
+        }
+        clusterId[i] = -clusterId[i] - 1;
       }
-      clusterId[i] = -clusterId[i] - 1;
-    }
-    __syncthreads();
+      __syncthreads();
 
-    if (threadIdx.x == 0) {
-      nClustersInModule[thisModuleId] = foundClusters;
-      moduleId[blockIdx.x] = thisModuleId;
+      if (threadIdx.x == 0) {
+        nClustersInModule[thisModuleId] = foundClusters;
+        moduleId[module] = thisModuleId;
 #ifdef GPU_DEBUG
-      if (foundClusters > gMaxHit) {
-        gMaxHit = foundClusters;
-        if (foundClusters > 8)
-          printf("max hit %d in %d\n", foundClusters, thisModuleId);
-      }
+        if (foundClusters > gMaxHit) {
+          gMaxHit = foundClusters;
+          if (foundClusters > 8)
+            printf("max hit %d in %d\n", foundClusters, thisModuleId);
+        }
 #endif
 #ifdef GPU_DEBUG
-      if (thisModuleId % 100 == 1)
-        printf("%d clusters in module %d\n", foundClusters, thisModuleId);
+        if (thisModuleId % 100 == 1)
+          printf("%d clusters in module %d\n", foundClusters, thisModuleId);
 #endif
-    }
+      }
+    }  // module loop
   }
-
 }  // namespace gpuClustering
 
 #endif  // RecoLocalTracker_SiPixelClusterizer_plugins_gpuClustering_h

diff --git a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
index 8ec665f8960b6..d7bb0da47984e 100644
--- a/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
+++ b/RecoLocalTracker/SiPixelClusterizer/test/gpuClustering_t.h
@@ -304,18 +304,10 @@ int main(void) {
     h_moduleStart[0] = nModules;
     countModules(h_id.get(), h_moduleStart.get(), h_clus.get(), n);
     memset(h_clusInModule.get(), 0, MaxNumModules * sizeof(uint32_t));
-    gridDim.x = MaxNumModules;  //not needed in the kernel for this specific case;
+    assert(gridDim.x == 1);
     assert(blockIdx.x == 0);
-    for (; blockIdx.x < gridDim.x; ++blockIdx.x)
-      findClus(h_id.get(),
-               h_x.get(),
-               h_y.get(),
-               h_moduleStart.get(),
-               h_clusInModule.get(),
-               h_moduleId.get(),
-               h_clus.get(),
-               n);
-    resetGrid();
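+    // the kernel now loops over the modules internally, so the host-side test
+    // calls it once instead of emulating one block per module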
+    findClus(
+        h_id.get(), h_x.get(), h_y.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
 
     nModules = h_moduleStart[0];
     auto nclus = h_clusInModule.get();
@@ -330,12 +322,10 @@ int main(void) {
     if (ncl != std::accumulate(nclus, nclus + MaxNumModules, 0))
       std::cout << "ERROR!!!!! wrong number of cluster found" << std::endl;
 
-    gridDim.x = MaxNumModules;  // no needed in the kernel for in this specific case
+    assert(gridDim.x == 1);
     assert(blockIdx.x == 0);
-    for (; blockIdx.x < gridDim.x; ++blockIdx.x)
-      clusterChargeCut(
-          h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
-    resetGrid();
+    clusterChargeCut(
+        h_id.get(), h_adc.get(), h_moduleStart.get(), h_clusInModule.get(), h_moduleId.get(), h_clus.get(), n);
 #endif
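
Usage note (illustrative, not part of the commit): with the grid-stride loops
above, a launch no longer needs one block per module. A minimal sketch of a
device-side launch, assuming device buffers and a CUDA stream with hypothetical
names (d_id, d_x, d_y, d_moduleStart, d_clusInModule, d_moduleId, d_clus,
stream):

    // any grid size now covers all modules; the numbers below are arbitrary
    constexpr int threadsPerBlock = 256;  // hypothetical choice
    constexpr int blocks = 80;            // no longer tied to MaxNumModules
    gpuClustering::findClus<<<blocks, threadsPerBlock, 0, stream>>>(
        d_id, d_x, d_y, d_moduleStart, d_clusInModule, d_moduleId, d_clus, numElements);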