From 459d6f6351cfcf622a8ef6d0edea515398ec23ce Mon Sep 17 00:00:00 2001
From: Oliver Darbyshire
Date: Thu, 15 Dec 2022 16:36:17 +0000
Subject: [PATCH 1/2] Update output for block sizes > 1

---
 include/reorder_partition.h            | 25 ++++++----
 src/amgx_c.cu                          | 64 ++++++++++++++++++--------
 src/distributed/distributed_manager.cu | 10 ++--
 src/matrix.cu                          |  1 +
 4 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/include/reorder_partition.h b/include/reorder_partition.h
index b80e8911..66f4e28a 100644
--- a/include/reorder_partition.h
+++ b/include/reorder_partition.h
@@ -29,13 +29,15 @@ template <typename index_type, typename mat_value_type, bool reorder_rows, bool reorder_cols>
 void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_type *Ac, mat_value_type *Av,
-                            index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p)
+                            index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p,
+                            index_type block_dimx, index_type block_dimy)
 {
     //applies reordering P from left and right on matrix A, so that B=P'*A*P.
     //notice that matrix A may be rectangular (does not need to be square).
     //WARNING: columns within the rows are not sorted in the output matrix B.
     using namespace amgx;
     index_type row, col, i, j, k;
+    index_type block_size = block_dimx * block_dimy;

     //(i) reorder rows
     //compute number of elements per row (in new locations)
     Bp[0] = 0;
@@ -56,22 +58,27 @@ void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_
     for (row = 0; row < n; row++)
     {
         i = reorder_rows ? p[row] : row;
-
         for (k = Bp[i], j = Ap[row]; j < Ap[row + 1]; j++, k++)
         {
             col = Ac[j];
             Bc[k] = reorder_cols ? p[col] : col;
-            Bv[k] = Av[j];
+            for (int kx = 0; kx < block_dimx; kx++)
+                for (int ky = 0; ky < block_dimy; ky++){
+                    int loc = j * block_size + kx * block_dimy + k;
+                    Bv[k * block_size + kx * block_dimy + ky] = Av[j * block_size + kx * block_dimy + ky];
+                }
         }
     }
 }

 template <typename index_type, typename mat_value_type, bool reorder_rows, bool reorder_cols>
 void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type *Ac, mat_value_type *Av,
-                       index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p)
+                       index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p,
+                       index_type block_dimx, index_type block_dimy)
 {
     using namespace amgx;
     cudaError_t st1, st2, st3, st4;
+    index_type block_size = block_dimx * block_dimy;
     //applies reordering P from left and right on matrix A, so that B=P'*A*P.
     //notice that matrix A may be rectangular (does not need to be square).
     //for now implement device by copying matrices to the host and calling host.
@@ -85,10 +92,10 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     p_h  = (index_type *)malloc(      l     * sizeof( p_h[0]));
     Ap_h = (index_type *)malloc((n + 1)     * sizeof(Ap_h[0]));
     Ac_h = (index_type *)malloc(    nnz     * sizeof(Ac_h[0]));
-    Av_h = (mat_value_type *)malloc(    nnz     * sizeof(Av_h[0]));
+    Av_h = (mat_value_type *)malloc( nnz * block_size * sizeof(Av_h[0]));
     Bp_h = (index_type *)malloc((n + 1)     * sizeof(Bp_h[0]));
     Bc_h = (index_type *)malloc(    nnz     * sizeof(Bc_h[0]));
-    Bv_h = (mat_value_type *)malloc(    nnz     * sizeof(Bv_h[0]));
+    Bv_h = (mat_value_type *)malloc( nnz * block_size * sizeof(Bv_h[0]));

     if (!p_h || !Ap_h || !Ac_h || !Av_h || !Bp_h || !Bc_h || !Bv_h)
     {
@@ -98,7 +105,7 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     st1 = cudaMemcpy(p_h,  p,       l   * sizeof( p[0]), cudaMemcpyDeviceToHost);
     st2 = cudaMemcpy(Ap_h, Ap, (n + 1) * sizeof(Ap[0]), cudaMemcpyDeviceToHost);
     st3 = cudaMemcpy(Ac_h, Ac,    nnz   * sizeof(Ac[0]), cudaMemcpyDeviceToHost);
-    st4 = cudaMemcpy(Av_h, Av,    nnz   * sizeof(Av[0]), cudaMemcpyDeviceToHost);
+    st4 = cudaMemcpy(Av_h, Av, nnz * block_size * sizeof(Av[0]), cudaMemcpyDeviceToHost);

     if ((st1 != cudaSuccess) || (st2 != cudaSuccess) || (st3 != cudaSuccess) || (st4 != cudaSuccess))
     {
@@ -106,10 +113,10 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     }

     reorder_partition_host<index_type, mat_value_type, reorder_rows, reorder_cols>
-    (n, nnz, Ap_h, Ac_h, Av_h, Bp_h, Bc_h, Bv_h, l, p_h);
+    (n, nnz, Ap_h, Ac_h, Av_h, Bp_h, Bc_h, Bv_h, l, p_h, block_dimx, block_dimy);
     st1 = cudaMemcpy(Bp, Bp_h, (n + 1) * sizeof(Bp[0]), cudaMemcpyHostToDevice);
     st2 = cudaMemcpy(Bc, Bc_h,    nnz   * sizeof(Bc[0]), cudaMemcpyHostToDevice);
-    st3 = cudaMemcpy(Bv, Bv_h,    nnz   * sizeof(Bv[0]), cudaMemcpyHostToDevice);
+    st3 = cudaMemcpy(Bv, Bv_h, nnz * block_size * sizeof(Bv[0]), cudaMemcpyHostToDevice);

     if ((st1 != cudaSuccess) || (st2 != cudaSuccess) || (st3 != cudaSuccess))
     {
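Note on the blocked value layout: AMGX stores the values of a blocked-CSR matrix as nnz contiguous blocks of block_dimx * block_dimy scalars, so when the structural entry at position j moves to slot k the whole block has to move with it; that is all the new inner loop pair does. A minimal standalone sketch of the per-block copy (illustrative only; copy_block and the std::vector signature are not AMGX code, and the row-major block layout is an assumption consistent with the indexing in the hunk above):

    #include <cassert>
    #include <vector>

    // Copy the dense block of entry j in Av into slot k of Bv, where each
    // nonzero owns block_dimx * block_dimy contiguous scalars (row-major).
    void copy_block(const std::vector<double> &Av, std::vector<double> &Bv,
                    int j, int k, int block_dimx, int block_dimy)
    {
        const int block_size = block_dimx * block_dimy;
        assert((j + 1) * block_size <= (int)Av.size());
        assert((k + 1) * block_size <= (int)Bv.size());
        for (int kx = 0; kx < block_dimx; kx++)
            for (int ky = 0; ky < block_dimy; ky++)
                Bv[k * block_size + kx * block_dimy + ky] =
                    Av[j * block_size + kx * block_dimy + ky];
    }

With block_dimx == block_dimy == 1 this collapses to the old scalar assignment Bv[k] = Av[j], which is why the change is a no-op for block size 1.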
diff --git a/src/amgx_c.cu b/src/amgx_c.cu
index b74fd718..c3117f77 100644
--- a/src/amgx_c.cu
+++ b/src/amgx_c.cu
@@ -236,6 +236,8 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     int n, nnz, offset, l, k, i;
     int start, end, shift;
     int mpist;
+    int block_dimx, block_dimy;
+
     MPI_Comm mpicm;
     //MPI call parameters
     t_IndPrec *rc_ptr, *di_ptr;
@@ -273,6 +275,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         nv_mtx->getOffsetAndSizeForView(OWNED, &offset, &n);
         nv_mtx->getNnzForView(OWNED, &nnz);
+        block_dimx = nv_mtx->get_block_dimx();
+        block_dimy = nv_mtx->get_block_dimy();
+        gA.set_block_dimx(block_dimx);
+        gA.set_block_dimy(block_dimy);

         if (nv_mtx->manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets
         {
@@ -284,10 +290,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         //some allocations/resizing
         Bp.resize(n + 1);
         Bi.resize(nnz);
-        Bv.resize(nnz);
+        Bv.resize(nnz * block_dimx * block_dimy);
         hBp.resize(n + 1);
         hBi.resize(nnz);
-        hBv.resize(nnz);
+        hBv.resize(nnz * block_dimx * block_dimy);

         if (rank == root)
         {
@@ -303,6 +309,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         nv_mtx->manager->unpack_partition(thrust::raw_pointer_cast(Bp.data()),
                                           thrust::raw_pointer_cast(Bi.data()),
                                           thrust::raw_pointer_cast(Bv.data()));
+        cudaCheckError();

         //copy to host (should be able to optimize this out later on)
         hBp = Bp;
@@ -363,7 +370,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
             //some allocations/resizing
             hAi.resize(hAp[k]); //now we know global nnz and can allocate storage
-            hAv.resize(hAp[k]); //now we know global nnz and can allocate storage
+            hAv.resize(hAp[k] * block_dimx * block_dimy); //now we know global nnz and can allocate storage
         }

         //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details)
@@ -392,13 +399,22 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         }

         //values
+        if(rank == root) {
+            for (i = 0; i < l; i++) {
+                rc[i] *= block_dimx * block_dimy;
+                di[i] *= block_dimx * block_dimy;
+            }
+        }
+        rc_ptr = thrust::raw_pointer_cast(rc.data());
+        di_ptr = thrust::raw_pointer_cast(di.data());
+
         if (typeid(t_MatPrec) == typeid(float))
         {
-            mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
+            mpist = MPI_Gatherv(hlv_ptr, nnz * block_dimx * block_dimy, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
         }
         else if (typeid(t_MatPrec) == typeid(double))
         {
-            mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
+            mpist = MPI_Gatherv(hlv_ptr, nnz * block_dimx * block_dimy, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
         }
         else
         {
@@ -447,10 +463,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
                 thrust::raw_pointer_cast(hBp.data()),
                 thrust::raw_pointer_cast(hBi.data()),
                 thrust::raw_pointer_cast(hBv.data()),
-                imap.size(), thrust::raw_pointer_cast(imap.data()));
+                imap.size(), thrust::raw_pointer_cast(imap.data()), block_dimx, block_dimy);
                 cudaCheckError();
                 gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-                gA.resize(hBp.size() - 1, hBp.size() - 1, hBi.size());
+                gA.resize(hBp.size() - 1, hBp.size() - 1, hBi.size(), block_dimx, block_dimy);
                 thrust::copy(hBp.begin(), hBp.end(), gA.row_offsets.begin());
                 thrust::copy(hBi.begin(), hBi.end(), gA.col_indices.begin());
                 thrust::copy(hBv.begin(), hBv.end(), gA.values.begin());
@@ -460,7 +476,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
             {
                 //copy (host -> host or device depending on matrix type)
                 gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-                gA.resize(hAp.size() - 1, hAp.size() - 1, hAi.size());
+                gA.resize(hAp.size() - 1, hAp.size() - 1, hAi.size(), block_dimx, block_dimy);
                 thrust::copy(hAp.begin(), hAp.end(), gA.row_offsets.begin());
                 thrust::copy(hAi.begin(), hAi.end(), gA.col_indices.begin());
                 thrust::copy(hAv.begin(), hAv.end(), gA.values.begin());
@@ -472,7 +488,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     {
         /* ASSUMPTION: when manager has not been allocated you are running on a single rank */
         gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-        gA.resize(nv_mtx->row_offsets.size() - 1, nv_mtx->row_offsets.size() - 1, nv_mtx->col_indices.size());
+        gA.resize(nv_mtx->row_offsets.size() - 1, nv_mtx->row_offsets.size() - 1, nv_mtx->col_indices.size(), nv_mtx->get_block_dimx(), nv_mtx->get_block_dimy());
         thrust::copy(nv_mtx->row_offsets.begin(), nv_mtx->row_offsets.end(), gA.row_offsets.begin());
         thrust::copy(nv_mtx->col_indices.begin(), nv_mtx->col_indices.end(), gA.col_indices.begin());
         thrust::copy(nv_mtx->values.begin(), nv_mtx->values.end(), gA.values.begin());
@@ -488,7 +504,7 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
 {
     typedef typename TConfig::IndPrec t_IndPrec;
     typedef typename TConfig::VecPrec t_VecPrec;
-    int n, nnz, offset, l;
+    int n, nnz, offset, l, block_dim;
     int mpist;
     MPI_Comm mpicm;
     //MPI call parameters
@@ -519,6 +535,7 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         nv_mtx->getOffsetAndSizeForView(OWNED, &offset, &n);
         nv_mtx->getNnzForView(OWNED, &nnz);
+        block_dim = nv_mtx->get_block_dimx(); // assume square blocks

         if (nv_mtx->manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets
         {
@@ -527,11 +544,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         l = nv_mtx->manager->part_offsets_h.size() - 1; // number of partitions

         //some allocations/resizing
-        hv.resize(nv_vec->size());                         // host copy of nv_vec
+        hv.resize(nv_vec->size() * block_dim);             // host copy of nv_vec

         if (rank == root)
         {
-            hg.resize(nv_mtx->manager->part_offsets_h[l]); // host copy of gvec
+            hg.resize(nv_mtx->manager->part_offsets_h[l] * block_dim); // host copy of gvec
             rc.resize(l);
             di.resize(l);
         }
@@ -554,6 +571,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
             cudaCheckError();
             thrust::copy(nv_mtx->manager->part_offsets_h.begin(), nv_mtx->manager->part_offsets_h.begin() + l, di.begin());
             cudaCheckError();
+
+            for(int i = 0; i < l; i++) {
+                rc[i] *= block_dim;
+                di[i] *= block_dim;
+            }
         }

         //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details)
@@ -563,11 +585,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         //gather (on the host)
         if (typeid(t_VecPrec) == typeid(float))
         {
-            mpist = MPI_Gatherv(hv_ptr, n, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
+            mpist = MPI_Gatherv(hv_ptr, n * block_dim, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
         }
         else if (typeid(t_VecPrec) == typeid(double))
         {
-            mpist = MPI_Gatherv(hv_ptr, n, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
+            mpist = MPI_Gatherv(hv_ptr, n * block_dim, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
         }
         else
         {
@@ -587,24 +609,26 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         if (partition_vector != NULL)
         {
             //sanity check
-            if (partition_vector_size != hg.size())
+            if (partition_vector_size * block_dim != hg.size())
             {
                 FatalError("partition_vector_size does not match the global vector size", AMGX_ERR_CORE);
             }

             //construct a map (based on partition vector)
-            int i, j, nranks;
+            int i, j, k, nranks;
             MPI_Comm_size(mpicm, &nranks);
             thrust::host_vector<int> c(nranks, 0);
             thrust::host_vector<int> map(hg.size());
             thrust::host_vector<int> imap(hg.size());

-            for (i = 0; i < hg.size(); i++)
+            for (i = 0; i < nv_mtx->manager->part_offsets_h[l]; i++)
             {
                 j = partition_vector[i];
-                map[i] = nv_mtx->manager->part_offsets_h[j] + c[j];
-                imap[map[i]] = i;
-                c[j]++;
+                for(k = 0 ; k < block_dim; ++k) {
+                    map[i * block_dim + k] = nv_mtx->manager->part_offsets_h[j] + c[j];
+                    imap[map[i * block_dim + k]] = i * block_dim + k;
+                    c[j]++;
+                }
             }

             //permute according to map during copy (host -> host or device depending on vector type)
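Note on the MPI_Gatherv changes: Gatherv counts and displacements are measured in elements of the MPI datatype, i.e. in scalars, so once every nonzero carries block_dimx * block_dimy scalars, both the send counts and the root's receive counts/displacements must be scaled by the block size. A small self-contained example of that bookkeeping (hypothetical numbers; rc/di mirror the patch's count and displacement vectors):

    #include <cstdio>
    #include <vector>

    int main()
    {
        // Two ranks contribute 3 and 2 nonzeros; displacements are the
        // exclusive prefix sums. Both are in units of matrix entries (blocks).
        std::vector<int> rc = {3, 2};
        std::vector<int> di = {0, 3};
        const int block_dimx = 2, block_dimy = 2;
        const int block_size = block_dimx * block_dimy; // 4 scalars per entry

        // MPI_Gatherv expects scalar counts, so scale both vectors.
        for (size_t i = 0; i < rc.size(); i++)
        {
            rc[i] *= block_size;
            di[i] *= block_size;
        }

        for (size_t i = 0; i < rc.size(); i++)
            printf("rank %zu: recvcount=%d displ=%d\n", i, rc[i], di[i]);
        // prints: rank 0: recvcount=12 displ=0
        //         rank 1: recvcount=8 displ=12
        return 0;
    }

The same reasoning covers the vector path: each of the n owned rows now contributes block_dim scalars, hence n * block_dim in the gather and the *= block_dim loop over rc and di.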
diff --git a/src/distributed/distributed_manager.cu b/src/distributed/distributed_manager.cu
index 229f61ea..3235d1d4 100644
--- a/src/distributed/distributed_manager.cu
+++ b/src/distributed/distributed_manager.cu
@@ -3253,7 +3253,7 @@ void DistributedManager
 template <AMGX_VecPrecision t_vecPrec, AMGX_MatPrecision t_matPrec, AMGX_IndPrecision t_indPrec>
 void DistributedManager<TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indPrec> >::unpack_partition(index_type *Bp, index_type *Bc, mat_value_type *Bv)
 {
-    index_type l, n, nnz, offset;
+    index_type l, n, nnz, offset, block_dimx, block_dimy;
     index_type *ir;
     index_type *Ap;
     index_type *Ac;
@@ -3262,6 +3262,10 @@ void DistributedManager
     this->A->getOffsetAndSizeForView(OWNED, &offset, &n);
     this->A->getNnzForView(OWNED, &nnz);
+
+    block_dimx = this->A->get_block_dimx();
+    block_dimy = this->A->get_block_dimy();
+
     l = this->inverse_renumbering.size();
     ir = this->inverse_renumbering.raw();
     Ap = this->A->row_offsets.raw();
@@ -3270,7 +3274,7 @@ void DistributedManager

     reorder_partition<index_type, mat_value_type, true, true>
-    (n, nnz, Ap, Ac, Av, Bp, Bc, Bv, l, ir);
+    (n, nnz, Ap, Ac, Av, Bp, Bc, Bv, l, ir, block_dimx, block_dimy);
     cudaCheckError();
     //obtain reordering q that combines the shift of the diagonal block with the off-diagonal block indices conversion from local to global
     this->obtain_shift_l2g_reordering(n, this->local_to_global_map, this->inverse_renumbering, q);
@@ -3278,7 +3282,7 @@ void DistributedManager
     cudaCheckError();

     reorder_partition<index_type, mat_value_type, false, true>
-    (n, nnz, Bp, Bc, Bv, Bp, Bc, Bv, q.size(), q.raw());
+    (n, nnz, Bp, Bc, Bv, Bp, Bc, Bv, q.size(), q.raw(), block_dimx, block_dimy);
     cudaCheckError();
 }
diff --git a/src/matrix.cu b/src/matrix.cu
index e0a97f23..9b4c1095 100644
--- a/src/matrix.cu
+++ b/src/matrix.cu
@@ -327,6 +327,7 @@ Matrix< TemplateConfig >::print(ch
             for (xdim = 0; xdim < this->get_block_dimx(); xdim++)
             {
+                int loc = ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim;
                 a = this->values[ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim];
                 fprintf(fid, "%d %d ", i + 1, j + 1);
                 types::util<value_type>::fprintf(fid, "%20.16f", a);
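Note on unpack_partition: the first reorder_partition call permutes the matrix back through the inverse renumbering and the second translates column indices from local to global via q, and both now thread the block dimensions through so that value blocks travel with their entries. In the column-only case (reorder_rows == false) no block slot changes position, so the value copy degenerates to an in-place pass and only the column ids are rewritten; a sketch of that degenerate case (illustrative, not AMGX code, assuming q maps local column ids to global ones):

    // Column-only remap of an assembled blocked CSR: values stay in place
    // because no row, and therefore no block slot, moves.
    void remap_columns(int nnz, int *Bc, const int *q)
    {
        for (int j = 0; j < nnz; j++)
        {
            Bc[j] = q[Bc[j]]; // translate local column id to global id
        }
    }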
From 222caeea4afa816829e3d0680b810d531a4df8fb Mon Sep 17 00:00:00 2001
From: Oliver Darbyshire
Date: Tue, 3 Jan 2023 09:32:07 +0000
Subject: [PATCH 2/2] remove debug code

---
 include/reorder_partition.h | 1 -
 src/matrix.cu               | 1 -
 2 files changed, 2 deletions(-)

diff --git a/include/reorder_partition.h b/include/reorder_partition.h
index 66f4e28a..5eef963f 100644
--- a/include/reorder_partition.h
+++ b/include/reorder_partition.h
@@ -64,7 +64,6 @@ void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_
             Bc[k] = reorder_cols ? p[col] : col;
             for (int kx = 0; kx < block_dimx; kx++)
                 for (int ky = 0; ky < block_dimy; ky++){
-                    int loc = j * block_size + kx * block_dimy + k;
                     Bv[k * block_size + kx * block_dimy + ky] = Av[j * block_size + kx * block_dimy + ky];
                 }
         }
diff --git a/src/matrix.cu b/src/matrix.cu
index 9b4c1095..e0a97f23 100644
--- a/src/matrix.cu
+++ b/src/matrix.cu
@@ -327,7 +327,6 @@ Matrix< TemplateConfig >::print(ch
             for (xdim = 0; xdim < this->get_block_dimx(); xdim++)
             {
-                int loc = ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim;
                 a = this->values[ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim];
                 fprintf(fid, "%d %d ", i + 1, j + 1);
                 types::util<value_type>::fprintf(fid, "%20.16f", a);
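A quick sanity check for the new path is a host-side round trip on a tiny blocked matrix. The sketch below exercises reorder_partition_host with a 2x2 (block) matrix of 2x2 blocks and a row/column swap; it assumes the header can be built standalone and that the template parameters are <index_type, mat_value_type, reorder_rows, reorder_cols> as used above (the example values are arbitrary):

    #include <cstdio>
    #include "reorder_partition.h"

    int main()
    {
        // Sparsity: row 0 -> {0}, row 1 -> {0, 1}; 2x2 blocks (block_size = 4).
        // Entry e owns scalars [4e, 4e+4): entry 0 holds 1..4, entry 1 holds
        // 5..8, entry 2 holds 9..12.
        int Ap[] = {0, 1, 3};
        int Ac[] = {0, 0, 1};
        double Av[12];
        for (int s = 0; s < 12; s++) Av[s] = s + 1;

        int p[] = {1, 0}; // swap the two block rows (and columns)
        int Bp[3], Bc[3];
        double Bv[12];

        reorder_partition_host<int, double, true, true>
        (2, 3, Ap, Ac, Av, Bp, Bc, Bv, 2, p, 2, 2);

        // Expect new row 0 to hold old row 1's entries (blocks starting at 5
        // and 9) and new row 1 to hold old row 0's entry (block starting at 1),
        // with each 4-scalar block moved as one unit.
        for (int r = 0; r < 2; r++)
            for (int e = Bp[r]; e < Bp[r + 1]; e++)
                printf("row %d col %d block starts at %g\n", r, Bc[e], Bv[e * 4]);
        return 0;
    }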