From 459d6f6351cfcf622a8ef6d0edea515398ec23ce Mon Sep 17 00:00:00 2001
From: Oliver Darbyshire
Date: Thu, 15 Dec 2022 16:36:17 +0000
Subject: [PATCH 1/2] Update output for block sizes > 1

---
 include/reorder_partition.h            | 25 ++++++----
 src/amgx_c.cu                          | 64 ++++++++++++++++++--------
 src/distributed/distributed_manager.cu | 10 ++--
 src/matrix.cu                          |  1 +
 4 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/include/reorder_partition.h b/include/reorder_partition.h
index b80e8911..66f4e28a 100644
--- a/include/reorder_partition.h
+++ b/include/reorder_partition.h
@@ -29,13 +29,15 @@ template <typename index_type, typename mat_value_type, bool reorder_rows, bool reorder_cols>
 void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_type *Ac, mat_value_type *Av,
-                            index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p)
+                            index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p,
+                            index_type block_dimx, index_type block_dimy)
 {
     //applies reordering P from left and right on matrix A, so that B=P'*A*P.
     //notice that matrix A may be rectangular (does not need to be square).
     //WARNING: columns within the rows are not sorted in the output matrix B.
     using namespace amgx;
     index_type row, col, i, j, k;
+    index_type block_size = block_dimx * block_dimy;

     //(i) reorder rows
     //compute number of elements per row (in new locations)
     Bp[0] = 0;
@@ -56,22 +58,27 @@ void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_
     for (row = 0; row < n; row++)
     {
         i = reorder_rows ? p[row] : row;
-
         for (k = Bp[i], j = Ap[row]; j < Ap[row + 1]; j++, k++)
         {
             col = Ac[j];
             Bc[k] = reorder_cols ? p[col] : col;
-            Bv[k] = Av[j];
+            for (int kx = 0; kx < block_dimx; kx++)
+                for (int ky = 0; ky < block_dimy; ky++){
+                    int loc = j * block_size + kx * block_dimy + k;
+                    Bv[k * block_size + kx * block_dimy + ky] = Av[j * block_size + kx * block_dimy + ky];
+                }
         }
     }
 }

 template <typename index_type, typename mat_value_type, bool reorder_rows, bool reorder_cols>
 void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type *Ac, mat_value_type *Av,
-                       index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p)
+                       index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p,
+                       index_type block_dimx, index_type block_dimy)
 {
     using namespace amgx;
     cudaError_t st1, st2, st3, st4;
+    index_type block_size = block_dimx * block_dimy;
     //applies reordering P from left and right on matrix A, so that B=P'*A*P.
     //notice that matrix A may be rectangular (does not need to be square).
     //for now implement device by copying matrices to the host and calling host.
@@ -85,10 +92,10 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     p_h  = (index_type *)malloc(      l     * sizeof( p_h[0]));
     Ap_h = (index_type *)malloc((n + 1)     * sizeof(Ap_h[0]));
     Ac_h = (index_type *)malloc(    nnz     * sizeof(Ac_h[0]));
-    Av_h = (mat_value_type *)malloc(    nnz     * sizeof(Av_h[0]));
+    Av_h = (mat_value_type *)malloc( nnz * block_size * sizeof(Av_h[0]));
     Bp_h = (index_type *)malloc((n + 1)     * sizeof(Bp_h[0]));
     Bc_h = (index_type *)malloc(    nnz     * sizeof(Bc_h[0]));
-    Bv_h = (mat_value_type *)malloc(    nnz     * sizeof(Bv_h[0]));
+    Bv_h = (mat_value_type *)malloc( nnz * block_size * sizeof(Bv_h[0]));

     if (!p_h || !Ap_h || !Ac_h || !Av_h || !Bp_h || !Bc_h || !Bv_h)
     {
@@ -98,7 +105,7 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     st1 = cudaMemcpy(p_h,  p,       l   * sizeof( p[0]), cudaMemcpyDeviceToHost);
     st2 = cudaMemcpy(Ap_h, Ap, (n + 1) * sizeof(Ap[0]), cudaMemcpyDeviceToHost);
     st3 = cudaMemcpy(Ac_h, Ac,    nnz   * sizeof(Ac[0]), cudaMemcpyDeviceToHost);
-    st4 = cudaMemcpy(Av_h, Av,    nnz   * sizeof(Av[0]), cudaMemcpyDeviceToHost);
+    st4 = cudaMemcpy(Av_h, Av, nnz * block_size * sizeof(Av[0]), cudaMemcpyDeviceToHost);

     if ((st1 != cudaSuccess) || (st2 != cudaSuccess) || (st3 != cudaSuccess) || (st4 != cudaSuccess))
     {
@@ -106,10 +113,10 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     }

     reorder_partition_host<index_type, mat_value_type, reorder_rows, reorder_cols>
-    (n, nnz, Ap_h, Ac_h, Av_h, Bp_h, Bc_h, Bv_h, l, p_h);
+    (n, nnz, Ap_h, Ac_h, Av_h, Bp_h, Bc_h, Bv_h, l, p_h, block_dimx, block_dimy);
     st1 = cudaMemcpy(Bp, Bp_h, (n + 1) * sizeof(Bp[0]), cudaMemcpyHostToDevice);
     st2 = cudaMemcpy(Bc, Bc_h,    nnz   * sizeof(Bc[0]), cudaMemcpyHostToDevice);
-    st3 = cudaMemcpy(Bv, Bv_h,    nnz   * sizeof(Bv[0]), cudaMemcpyHostToDevice);
+    st3 = cudaMemcpy(Bv, Bv_h, nnz * block_size * sizeof(Bv[0]), cudaMemcpyHostToDevice);

     if ((st1 != cudaSuccess) || (st2 != cudaSuccess) || (st3 != cudaSuccess))
     {
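Note on the blocked value layout: AMGX stores the values of a blocked-CSR matrix as nnz contiguous blocks of block_dimx * block_dimy scalars, so when the structural entry at position j moves to slot k the whole block has to move with it; that is all the new inner loop pair does. A minimal standalone sketch of the per-block copy (illustrative only; copy_block and the std::vector signature are not AMGX code, and the row-major block layout is an assumption consistent with the indexing in the hunk above):

    #include <cassert>
    #include <vector>

    // Copy the dense block of entry j in Av into slot k of Bv, where each
    // nonzero owns block_dimx * block_dimy contiguous scalars (row-major).
    void copy_block(const std::vector<double> &Av, std::vector<double> &Bv,
                    int j, int k, int block_dimx, int block_dimy)
    {
        const int block_size = block_dimx * block_dimy;
        assert((j + 1) * block_size <= (int)Av.size());
        assert((k + 1) * block_size <= (int)Bv.size());
        for (int kx = 0; kx < block_dimx; kx++)
            for (int ky = 0; ky < block_dimy; ky++)
                Bv[k * block_size + kx * block_dimy + ky] =
                    Av[j * block_size + kx * block_dimy + ky];
    }

With block_dimx == block_dimy == 1 this collapses to the old scalar assignment Bv[k] = Av[j], which is why the change is a no-op for block size 1.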
diff --git a/src/amgx_c.cu b/src/amgx_c.cu
index b74fd718..c3117f77 100644
--- a/src/amgx_c.cu
+++ b/src/amgx_c.cu
@@ -236,6 +236,8 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     int n, nnz, offset, l, k, i;
     int start, end, shift;
     int mpist;
+    int block_dimx, block_dimy;
+
     MPI_Comm mpicm;
     //MPI call parameters
     t_IndPrec *rc_ptr, *di_ptr;
@@ -273,6 +275,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         nv_mtx->getOffsetAndSizeForView(OWNED, &offset, &n);
         nv_mtx->getNnzForView(OWNED, &nnz);
+        block_dimx = nv_mtx->get_block_dimx();
+        block_dimy = nv_mtx->get_block_dimy();
+        gA.set_block_dimx(block_dimx);
+        gA.set_block_dimy(block_dimy);

         if (nv_mtx->manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets
         {
@@ -284,10 +290,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         //some allocations/resizing
         Bp.resize(n + 1);
         Bi.resize(nnz);
-        Bv.resize(nnz);
+        Bv.resize(nnz * block_dimx * block_dimy);
         hBp.resize(n + 1);
         hBi.resize(nnz);
-        hBv.resize(nnz);
+        hBv.resize(nnz * block_dimx * block_dimy);

         if (rank == root)
         {
@@ -303,6 +309,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         nv_mtx->manager->unpack_partition(thrust::raw_pointer_cast(Bp.data()),
                                           thrust::raw_pointer_cast(Bi.data()),
                                           thrust::raw_pointer_cast(Bv.data()));
+        cudaCheckError();

         //copy to host (should be able to optimize this out later on)
         hBp = Bp;
@@ -363,7 +370,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
             //some allocations/resizing
             hAi.resize(hAp[k]); //now we know global nnz and can allocate storage
-            hAv.resize(hAp[k]); //now we know global nnz and can allocate storage
+            hAv.resize(hAp[k] * block_dimx * block_dimy); //now we know global nnz and can allocate storage
         }

         //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details)
@@ -392,13 +399,22 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
         }

         //values
+        if(rank == root) {
+            for (i = 0; i < l; i++) {
+                rc[i] *= block_dimx * block_dimy;
+                di[i] *= block_dimx * block_dimy;
+            }
+        }
+        rc_ptr = thrust::raw_pointer_cast(rc.data());
+        di_ptr = thrust::raw_pointer_cast(di.data());
+
         if (typeid(t_MatPrec) == typeid(float))
         {
-            mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
+            mpist = MPI_Gatherv(hlv_ptr, nnz * block_dimx * block_dimy, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
         }
         else if (typeid(t_MatPrec) == typeid(double))
         {
-            mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
+            mpist = MPI_Gatherv(hlv_ptr, nnz * block_dimx * block_dimy, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
         }
         else
         {
@@ -447,10 +463,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
                 thrust::raw_pointer_cast(hBp.data()),
                 thrust::raw_pointer_cast(hBi.data()),
                 thrust::raw_pointer_cast(hBv.data()),
-                imap.size(), thrust::raw_pointer_cast(imap.data()));
+                imap.size(), thrust::raw_pointer_cast(imap.data()), block_dimx, block_dimy);
                 cudaCheckError();
                 gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-                gA.resize(hBp.size() - 1, hBp.size() - 1, hBi.size());
+                gA.resize(hBp.size() - 1, hBp.size() - 1, hBi.size(), block_dimx, block_dimy);
                 thrust::copy(hBp.begin(), hBp.end(), gA.row_offsets.begin());
                 thrust::copy(hBi.begin(), hBi.end(), gA.col_indices.begin());
                 thrust::copy(hBv.begin(), hBv.end(), gA.values.begin());
@@ -460,7 +476,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
             {
                 //copy (host -> host or device depending on matrix type)
                 gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-                gA.resize(hAp.size() - 1, hAp.size() - 1, hAi.size());
+                gA.resize(hAp.size() - 1, hAp.size() - 1, hAi.size(), block_dimx, block_dimy);
                 thrust::copy(hAp.begin(), hAp.end(), gA.row_offsets.begin());
                 thrust::copy(hAi.begin(), hAi.end(), gA.col_indices.begin());
                 thrust::copy(hAv.begin(), hAv.end(), gA.values.begin());
@@ -472,7 +488,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     {
         /* ASSUMPTION: when manager has not been allocated you are running on a single rank */
         gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-        gA.resize(nv_mtx->row_offsets.size() - 1, nv_mtx->row_offsets.size() - 1, nv_mtx->col_indices.size());
+        gA.resize(nv_mtx->row_offsets.size() - 1, nv_mtx->row_offsets.size() - 1, nv_mtx->col_indices.size(), nv_mtx->get_block_dimx(), nv_mtx->get_block_dimy());
         thrust::copy(nv_mtx->row_offsets.begin(), nv_mtx->row_offsets.end(), gA.row_offsets.begin());
         thrust::copy(nv_mtx->col_indices.begin(), nv_mtx->col_indices.end(), gA.col_indices.begin());
         thrust::copy(nv_mtx->values.begin(), nv_mtx->values.end(), gA.values.begin());
@@ -488,7 +504,7 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
 {
     typedef typename TConfig::IndPrec t_IndPrec;
     typedef typename TConfig::VecPrec t_VecPrec;
-    int n, nnz, offset, l;
+    int n, nnz, offset, l, block_dim;
     int mpist;
     MPI_Comm mpicm;
     //MPI call parameters
@@ -519,6 +535,7 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         nv_mtx->getOffsetAndSizeForView(OWNED, &offset, &n);
         nv_mtx->getNnzForView(OWNED, &nnz);
+        block_dim = nv_mtx->get_block_dimx(); // assume square blocks

         if (nv_mtx->manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets
         {
@@ -527,11 +544,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         l = nv_mtx->manager->part_offsets_h.size() - 1; // number of partitions

         //some allocations/resizing
-        hv.resize(nv_vec->size());                         // host copy of nv_vec
+        hv.resize(nv_vec->size() * block_dim);             // host copy of nv_vec

         if (rank == root)
         {
-            hg.resize(nv_mtx->manager->part_offsets_h[l]); // host copy of gvec
+            hg.resize(nv_mtx->manager->part_offsets_h[l] * block_dim); // host copy of gvec
             rc.resize(l);
             di.resize(l);
         }
@@ -554,6 +571,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
             cudaCheckError();
             thrust::copy(nv_mtx->manager->part_offsets_h.begin(), nv_mtx->manager->part_offsets_h.begin() + l, di.begin());
             cudaCheckError();
+
+            for(int i = 0; i < l; i++) {
+                rc[i] *= block_dim;
+                di[i] *= block_dim;
+            }
         }

         //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details)
@@ -563,11 +585,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         //gather (on the host)
         if (typeid(t_VecPrec) == typeid(float))
         {
-            mpist = MPI_Gatherv(hv_ptr, n, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
+            mpist = MPI_Gatherv(hv_ptr, n * block_dim, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
         }
         else if (typeid(t_VecPrec) == typeid(double))
         {
-            mpist = MPI_Gatherv(hv_ptr, n, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
+            mpist = MPI_Gatherv(hv_ptr, n * block_dim, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
         }
         else
         {
@@ -587,24 +609,26 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         if (partition_vector != NULL)
         {
             //sanity check
-            if (partition_vector_size != hg.size())
+            if (partition_vector_size * block_dim != hg.size())
             {
                 FatalError("partition_vector_size does not match the global vector size", AMGX_ERR_CORE);
             }

             //construct a map (based on partition vector)
-            int i, j, nranks;
+            int i, j, k, nranks;
             MPI_Comm_size(mpicm, &nranks);
             thrust::host_vector<int> c(nranks, 0);
             thrust::host_vector<int> map(hg.size());
             thrust::host_vector<int> imap(hg.size());

-            for (i = 0; i < hg.size(); i++)
+            for (i = 0; i < nv_mtx->manager->part_offsets_h[l]; i++)
             {
                 j = partition_vector[i];
-                map[i] = nv_mtx->manager->part_offsets_h[j] + c[j];
-                imap[map[i]] = i;
-                c[j]++;
+                for(k = 0 ; k < block_dim; ++k) {
+                    map[i * block_dim + k] = nv_mtx->manager->part_offsets_h[j] + c[j];
+                    imap[map[i * block_dim + k]] = i * block_dim + k;
+                    c[j]++;
+                }
             }

             //permute according to map during copy (host -> host or device depending on vector type)
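Note on the MPI_Gatherv changes: Gatherv counts and displacements are measured in elements of the MPI datatype, i.e. in scalars, so once every nonzero carries block_dimx * block_dimy scalars, both the send counts and the root's receive counts/displacements must be scaled by the block size. A small self-contained example of that bookkeeping (hypothetical numbers; rc/di mirror the patch's count and displacement vectors):

    #include <cstdio>
    #include <vector>

    int main()
    {
        // Two ranks contribute 3 and 2 nonzeros; displacements are the
        // exclusive prefix sums. Both are in units of matrix entries (blocks).
        std::vector<int> rc = {3, 2};
        std::vector<int> di = {0, 3};
        const int block_dimx = 2, block_dimy = 2;
        const int block_size = block_dimx * block_dimy; // 4 scalars per entry

        // MPI_Gatherv expects scalar counts, so scale both vectors.
        for (size_t i = 0; i < rc.size(); i++)
        {
            rc[i] *= block_size;
            di[i] *= block_size;
        }

        for (size_t i = 0; i < rc.size(); i++)
            printf("rank %zu: recvcount=%d displ=%d\n", i, rc[i], di[i]);
        // prints: rank 0: recvcount=12 displ=0
        //         rank 1: recvcount=8 displ=12
        return 0;
    }

The same reasoning covers the vector path: each of the n owned rows now contributes block_dim scalars, hence n * block_dim in the gather and the *= block_dim loop over rc and di.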
diff --git a/src/distributed/distributed_manager.cu b/src/distributed/distributed_manager.cu
index 229f61ea..3235d1d4 100644
--- a/src/distributed/distributed_manager.cu
+++ b/src/distributed/distributed_manager.cu
@@ -3253,7 +3253,7 @@ void DistributedManager
 template <AMGX_VecPrecision t_vecPrec, AMGX_MatPrecision t_matPrec, AMGX_IndPrecision t_indPrec>
 void DistributedManager<TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indPrec> >::unpack_partition(index_type *Bp, index_type *Bc, mat_value_type *Bv)
 {
-    index_type l, n, nnz, offset;
+    index_type l, n, nnz, offset, block_dimx, block_dimy;
     index_type *ir;
     index_type *Ap;
     index_type *Ac;
@@ -3262,6 +3262,10 @@ void DistributedManager
     this->A->getOffsetAndSizeForView(OWNED, &offset, &n);
     this->A->getNnzForView(OWNED, &nnz);
+
+    block_dimx = this->A->get_block_dimx();
+    block_dimy = this->A->get_block_dimy();
+
     l = this->inverse_renumbering.size();
     ir = this->inverse_renumbering.raw();
     Ap = this->A->row_offsets.raw();
@@ -3270,7 +3274,7 @@ void DistributedManager

     reorder_partition<index_type, mat_value_type, true, true>
-    (n, nnz, Ap, Ac, Av, Bp, Bc, Bv, l, ir);
+    (n, nnz, Ap, Ac, Av, Bp, Bc, Bv, l, ir, block_dimx, block_dimy);
     cudaCheckError();
     //obtain reordering q that combines the shift of the diagonal block with the off-diagonal block indices conversion from local to global
     this->obtain_shift_l2g_reordering(n, this->local_to_global_map, this->inverse_renumbering, q);
@@ -3278,7 +3282,7 @@ void DistributedManager
     cudaCheckError();

     reorder_partition<index_type, mat_value_type, false, true>
-    (n, nnz, Bp, Bc, Bv, Bp, Bc, Bv, q.size(), q.raw());
+    (n, nnz, Bp, Bc, Bv, Bp, Bc, Bv, q.size(), q.raw(), block_dimx, block_dimy);
     cudaCheckError();
 }
diff --git a/src/matrix.cu b/src/matrix.cu
index e0a97f23..9b4c1095 100644
--- a/src/matrix.cu
+++ b/src/matrix.cu
@@ -327,6 +327,7 @@ Matrix< TemplateConfig >::print(ch
             for (xdim = 0; xdim < this->get_block_dimx(); xdim++)
             {
+                int loc = ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim;
                 a = this->values[ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim];
                 fprintf(fid, "%d %d ", i + 1, j + 1);
                 types::util<value_type>::fprintf(fid, "%20.16f", a);
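Note on unpack_partition: the first reorder_partition call permutes the matrix back through the inverse renumbering and the second translates column indices from local to global via q, and both now thread the block dimensions through so that value blocks travel with their entries. In the column-only case (reorder_rows == false) no block slot changes position, so the value copy degenerates to an in-place pass and only the column ids are rewritten; a sketch of that degenerate case (illustrative, not AMGX code, assuming q maps local column ids to global ones):

    // Column-only remap of an assembled blocked CSR: values stay in place
    // because no row, and therefore no block slot, moves.
    void remap_columns(int nnz, int *Bc, const int *q)
    {
        for (int j = 0; j < nnz; j++)
        {
            Bc[j] = q[Bc[j]]; // translate local column id to global id
        }
    }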
From 222caeea4afa816829e3d0680b810d531a4df8fb Mon Sep 17 00:00:00 2001
From: Oliver Darbyshire
Date: Tue, 3 Jan 2023 09:32:07 +0000
Subject: [PATCH 2/2] remove debug code

---
 include/reorder_partition.h | 1 -
 src/matrix.cu               | 1 -
 2 files changed, 2 deletions(-)

diff --git a/include/reorder_partition.h b/include/reorder_partition.h
index 66f4e28a..5eef963f 100644
--- a/include/reorder_partition.h
+++ b/include/reorder_partition.h
@@ -64,7 +64,6 @@ void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_
             Bc[k] = reorder_cols ? p[col] : col;
             for (int kx = 0; kx < block_dimx; kx++)
                 for (int ky = 0; ky < block_dimy; ky++){
-                    int loc = j * block_size + kx * block_dimy + k;
                     Bv[k * block_size + kx * block_dimy + ky] = Av[j * block_size + kx * block_dimy + ky];
                 }
         }
diff --git a/src/matrix.cu b/src/matrix.cu
index 9b4c1095..e0a97f23 100644
--- a/src/matrix.cu
+++ b/src/matrix.cu
@@ -327,7 +327,6 @@ Matrix< TemplateConfig >::print(ch
             for (xdim = 0; xdim < this->get_block_dimx(); xdim++)
             {
-                int loc = ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim;
                 a = this->values[ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim];
                 fprintf(fid, "%d %d ", i + 1, j + 1);
                 types::util<value_type>::fprintf(fid, "%20.16f", a);
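A quick sanity check for the new path is a host-side round trip on a tiny blocked matrix. The sketch below exercises reorder_partition_host with a 2x2 (block) matrix of 2x2 blocks and a row/column swap; it assumes the header can be built standalone and that the template parameters are <index_type, mat_value_type, reorder_rows, reorder_cols> as used above (the example values are arbitrary):

    #include <cstdio>
    #include "reorder_partition.h"

    int main()
    {
        // Sparsity: row 0 -> {0}, row 1 -> {0, 1}; 2x2 blocks (block_size = 4).
        // Entry e owns scalars [4e, 4e+4): entry 0 holds 1..4, entry 1 holds
        // 5..8, entry 2 holds 9..12.
        int Ap[] = {0, 1, 3};
        int Ac[] = {0, 0, 1};
        double Av[12];
        for (int s = 0; s < 12; s++) Av[s] = s + 1;

        int p[] = {1, 0}; // swap the two block rows (and columns)
        int Bp[3], Bc[3];
        double Bv[12];

        reorder_partition_host<int, double, true, true>
        (2, 3, Ap, Ac, Av, Bp, Bc, Bv, 2, p, 2, 2);

        // Expect new row 0 to hold old row 1's entries (blocks starting at 5
        // and 9) and new row 1 to hold old row 0's entry (block starting at 1),
        // with each 4-scalar block moved as one unit.
        for (int r = 0; r < 2; r++)
            for (int e = Bp[r]; e < Bp[r + 1]; e++)
                printf("row %d col %d block starts at %g\n", r, Bc[e], Bv[e * 4]);
        return 0;
    }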