Update output for block sizes > 1 #4

Open · wants to merge 2 commits into base: master (changes shown from 1 commit)
include/reorder_partition.h (25 changes: 16 additions & 9 deletions)
@@ -29,13 +29,15 @@

 template <typename index_type, typename mat_value_type, bool reorder_rows, bool reorder_cols>
 void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_type *Ac, mat_value_type *Av,
-                            index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p)
+                            index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p,
+                            index_type block_dimx, index_type block_dimy)
 {
     //applies reordering P from left and right on matrix A, so that B=P'*A*P.
     //notice that matrix A may be rectangular (does not need to be square).
     //WARNING: columns within the rows are not sorted in the output matrix B.
     using namespace amgx;
     index_type row, col, i, j, k;
+    index_type block_size = block_dimx * block_dimy;
     //(i) reorder rows
     //compute number of elements per row (in new locations)
     Bp[0] = 0;
@@ -56,22 +58,27 @@ void reorder_partition_host(index_type n, index_type nnz, index_type *Ap, index_
     for (row = 0; row < n; row++)
     {
         i = reorder_rows ? p[row] : row;
+
         for (k = Bp[i], j = Ap[row]; j < Ap[row + 1]; j++, k++)
         {
             col = Ac[j];
             Bc[k] = reorder_cols ? p[col] : col;
-            Bv[k] = Av[j];
+            for (int kx = 0; kx < block_dimx; kx++)
+                for (int ky = 0; ky < block_dimy; ky++)
+                {
+                    int loc = kx * block_dimy + ky; //offset of entry (kx, ky) within the block
+                    Bv[k * block_size + loc] = Av[j * block_size + loc];
+                }
         }
     }
 }
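Note on the layout this loop assumes: in blocked CSR storage, nonzero j owns the contiguous value range Av[j*block_size .. (j+1)*block_size - 1], so relocating nonzero j to slot k copies its whole block. A minimal standalone sketch of that copy (hypothetical test code, not part of this PR; the permutation and sizes are made up):

// Hypothetical illustration: copy whole blocks when permuting nonzeros.
#include <cstdio>

int main()
{
    const int block_dimx = 2, block_dimy = 2;
    const int block_size = block_dimx * block_dimy;
    const int nnz = 2;
    double Av[nnz * block_size] = {1, 2, 3, 4, 5, 6, 7, 8};
    double Bv[nnz * block_size];
    const int dest[nnz] = {1, 0}; //assumed permutation: nonzero 0 -> slot 1 and vice versa

    for (int j = 0; j < nnz; j++)
    {
        int k = dest[j];
        for (int kx = 0; kx < block_dimx; kx++)
            for (int ky = 0; ky < block_dimy; ky++)
            {
                int loc = kx * block_dimy + ky; //offset within the block
                Bv[k * block_size + loc] = Av[j * block_size + loc];
            }
    }

    for (int v = 0; v < nnz * block_size; v++) printf("%g ", Bv[v]); //prints: 5 6 7 8 1 2 3 4
    printf("\n");
    return 0;
}

Compiled with any C++ compiler, it prints the two 2x2 blocks swapped wholesale, which is exactly what the host routine above does per nonzero.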

 template <typename index_type, typename mat_value_type, bool reorder_rows, bool reorder_cols>
 void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type *Ac, mat_value_type *Av,
-                       index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p)
+                       index_type *Bp, index_type *Bc, mat_value_type *Bv, index_type l, index_type *p,
+                       index_type block_dimx, index_type block_dimy)
 {
     using namespace amgx;
     cudaError_t st1, st2, st3, st4;
+    index_type block_size = block_dimx * block_dimy;
     //applies reordering P from left and right on matrix A, so that B=P'*A*P.
     //notice that matrix A may be rectangular (does not need to be square).
     //for now implement device by copying matrices to the host and calling host.
@@ -85,10 +92,10 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     p_h  = (index_type *)    malloc(      l * sizeof( p_h[0]));
     Ap_h = (index_type *)    malloc((n + 1) * sizeof(Ap_h[0]));
     Ac_h = (index_type *)    malloc(    nnz * sizeof(Ac_h[0]));
-    Av_h = (mat_value_type *)malloc(    nnz * sizeof(Av_h[0]));
+    Av_h = (mat_value_type *)malloc(    nnz * block_size * sizeof(Av_h[0]));
     Bp_h = (index_type *)    malloc((n + 1) * sizeof(Bp_h[0]));
     Bc_h = (index_type *)    malloc(    nnz * sizeof(Bc_h[0]));
-    Bv_h = (mat_value_type *)malloc(    nnz * sizeof(Bv_h[0]));
+    Bv_h = (mat_value_type *)malloc(    nnz * block_size * sizeof(Bv_h[0]));

     if (!p_h || !Ap_h || !Ac_h || !Av_h || !Bp_h || !Bc_h || !Bv_h)
     {
@@ -98,18 +105,18 @@ void reorder_partition(index_type n, index_type nnz, index_type *Ap, index_type
     st1 = cudaMemcpy(p_h,  p,        l * sizeof( p[0]), cudaMemcpyDeviceToHost);
     st2 = cudaMemcpy(Ap_h, Ap, (n + 1) * sizeof(Ap[0]), cudaMemcpyDeviceToHost);
     st3 = cudaMemcpy(Ac_h, Ac,     nnz * sizeof(Ac[0]), cudaMemcpyDeviceToHost);
-    st4 = cudaMemcpy(Av_h, Av,     nnz * sizeof(Av[0]), cudaMemcpyDeviceToHost);
+    st4 = cudaMemcpy(Av_h, Av,     nnz * block_size * sizeof(Av[0]), cudaMemcpyDeviceToHost);

     if ((st1 != cudaSuccess) || (st2 != cudaSuccess) || (st3 != cudaSuccess) || (st4 != cudaSuccess))
     {
         FatalError("reorder_partition (one of the cudaMemcpy calls back to host failed)", AMGX_ERR_CORE);
     }

     reorder_partition_host<index_type, mat_value_type, reorder_rows, reorder_cols>
-        (n, nnz, Ap_h, Ac_h, Av_h, Bp_h, Bc_h, Bv_h, l, p_h);
+        (n, nnz, Ap_h, Ac_h, Av_h, Bp_h, Bc_h, Bv_h, l, p_h, block_dimx, block_dimy);
     st1 = cudaMemcpy(Bp, Bp_h, (n + 1) * sizeof(Bp[0]), cudaMemcpyHostToDevice);
     st2 = cudaMemcpy(Bc, Bc_h,     nnz * sizeof(Bc[0]), cudaMemcpyHostToDevice);
-    st3 = cudaMemcpy(Bv, Bv_h,     nnz * sizeof(Bv[0]), cudaMemcpyHostToDevice);
+    st3 = cudaMemcpy(Bv, Bv_h,     nnz * block_size * sizeof(Bv[0]), cudaMemcpyHostToDevice);

     if ((st1 != cudaSuccess) || (st2 != cudaSuccess) || (st3 != cudaSuccess))
     {
src/amgx_c.cu (64 changes: 44 additions & 20 deletions)
@@ -236,6 +236,8 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     int n, nnz, offset, l, k, i;
     int start, end, shift;
     int mpist;
+    int block_dimx, block_dimy;

     MPI_Comm mpicm;
     //MPI call parameters
     t_IndPrec *rc_ptr, *di_ptr;
@@ -273,6 +275,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri

     nv_mtx->getOffsetAndSizeForView(OWNED, &offset, &n);
     nv_mtx->getNnzForView(OWNED, &nnz);
+    block_dimx = nv_mtx->get_block_dimx();
+    block_dimy = nv_mtx->get_block_dimy();
+    gA.set_block_dimx(block_dimx);
+    gA.set_block_dimy(block_dimy);

     if (nv_mtx->manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets
     {
@@ -284,10 +290,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     //some allocations/resizing
     Bp.resize(n + 1);
     Bi.resize(nnz);
-    Bv.resize(nnz);
+    Bv.resize(nnz * block_dimx * block_dimy);
     hBp.resize(n + 1);
     hBi.resize(nnz);
-    hBv.resize(nnz);
+    hBv.resize(nnz * block_dimx * block_dimy);

     if (rank == root)
     {
@@ -303,6 +309,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     nv_mtx->manager->unpack_partition(thrust::raw_pointer_cast(Bp.data()),
                                       thrust::raw_pointer_cast(Bi.data()),
                                       thrust::raw_pointer_cast(Bv.data()));
+
     cudaCheckError();
     //copy to host (should be able to optimize this out later on)
     hBp = Bp;
@@ -363,7 +370,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri

         //some allocations/resizing
         hAi.resize(hAp[k]); //now we know global nnz and can allocate storage
-        hAv.resize(hAp[k]); //now we know global nnz and can allocate storage
+        hAv.resize(hAp[k] * block_dimx * block_dimy); //now we know global nnz and can allocate storage
     }

     //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details)
@@ -392,13 +399,22 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     }

     //values
+    if (rank == root) {
+        for (i = 0; i < l; i++) {
+            rc[i] *= block_dimx * block_dimy;
+            di[i] *= block_dimx * block_dimy;
+        }
+    }
     rc_ptr = thrust::raw_pointer_cast(rc.data());
     di_ptr = thrust::raw_pointer_cast(di.data());

     if (typeid(t_MatPrec) == typeid(float))
     {
-        mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
+        mpist = MPI_Gatherv(hlv_ptr, nnz * block_dimx * block_dimy, MPI_FLOAT, hgv_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
     }
     else if (typeid(t_MatPrec) == typeid(double))
     {
-        mpist = MPI_Gatherv(hlv_ptr, nnz, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
+        mpist = MPI_Gatherv(hlv_ptr, nnz * block_dimx * block_dimy, MPI_DOUBLE, hgv_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
     }
     else
     {
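Note: MPI_Gatherv counts and displacements are in elements of the datatype, and with blocked storage each nonzero contributes block_dimx*block_dimy values; that is why both the receive counts (rc) and displacements (di) are scaled above before the gather. A self-contained sketch of the same pattern (hypothetical, not part of this PR; here rank r owns r+1 nonzeros):

// Hypothetical MPI_Gatherv sketch with block-scaled counts and displacements.
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    int rank, nranks;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    const int block_size = 4;                 //e.g. 2x2 blocks
    int local_nnz = rank + 1;                 //ranks own different nonzero counts
    std::vector<double> vals(local_nnz * block_size, (double)rank);

    std::vector<int> rc, di;
    std::vector<double> gvals;
    if (rank == 0)
    {
        rc.resize(nranks);
        di.resize(nranks);
        int off = 0;
        for (int r = 0; r < nranks; r++)
        {
            rc[r] = (r + 1) * block_size;     //counts in values, not nonzeros
            di[r] = off;                      //displacements likewise in values
            off += rc[r];
        }
        gvals.resize(off);
    }

    MPI_Gatherv(vals.data(), local_nnz * block_size, MPI_DOUBLE,
                gvals.data(), rc.data(), di.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD);
    if (rank == 0) printf("gathered %zu values from %d ranks\n", gvals.size(), nranks);
    MPI_Finalize();
    return 0;
}

Run under mpirun; rank 0 receives each rank's block values at the block-scaled offsets, mirroring the gather of hlv_ptr above.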
@@ -447,10 +463,10 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
                      thrust::raw_pointer_cast(hBp.data()),
                      thrust::raw_pointer_cast(hBi.data()),
                      thrust::raw_pointer_cast(hBv.data()),
-                     imap.size(), thrust::raw_pointer_cast(imap.data()));
+                     imap.size(), thrust::raw_pointer_cast(imap.data()), block_dimx, block_dimy);
         cudaCheckError();
         gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-        gA.resize(hBp.size() - 1, hBp.size() - 1, hBi.size());
+        gA.resize(hBp.size() - 1, hBp.size() - 1, hBi.size(), block_dimx, block_dimy);
         thrust::copy(hBp.begin(), hBp.end(), gA.row_offsets.begin());
         thrust::copy(hBi.begin(), hBi.end(), gA.col_indices.begin());
         thrust::copy(hBv.begin(), hBv.end(), gA.values.begin());
@@ -460,7 +476,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     {
         //copy (host -> host or device depending on matrix type)
         gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-        gA.resize(hAp.size() - 1, hAp.size() - 1, hAi.size());
+        gA.resize(hAp.size() - 1, hAp.size() - 1, hAi.size(), block_dimx, block_dimy);
         thrust::copy(hAp.begin(), hAp.end(), gA.row_offsets.begin());
         thrust::copy(hAi.begin(), hAi.end(), gA.col_indices.begin());
         thrust::copy(hAv.begin(), hAv.end(), gA.values.begin());
@@ -472,7 +488,7 @@ int construct_global_matrix(int &root, int &rank, Matrix<TConfig> *nv_mtx, Matri
     {
         /* ASSUMPTION: when manager has not been allocated you are running on a single rank */
         gA.addProps(CSR); //need to add this property, so that row_offsets, col_indices & values are resized appropriately in the next call
-        gA.resize(nv_mtx->row_offsets.size() - 1, nv_mtx->row_offsets.size() - 1, nv_mtx->col_indices.size());
+        gA.resize(nv_mtx->row_offsets.size() - 1, nv_mtx->row_offsets.size() - 1, nv_mtx->col_indices.size(), nv_mtx->get_block_dimx(), nv_mtx->get_block_dimy());
         thrust::copy(nv_mtx->row_offsets.begin(), nv_mtx->row_offsets.end(), gA.row_offsets.begin());
         thrust::copy(nv_mtx->col_indices.begin(), nv_mtx->col_indices.end(), gA.col_indices.begin());
         thrust::copy(nv_mtx->values.begin(), nv_mtx->values.end(), gA.values.begin());
@@ -488,7 +504,7 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
 {
     typedef typename TConfig::IndPrec t_IndPrec;
     typedef typename TConfig::VecPrec t_VecPrec;
-    int n, nnz, offset, l;
+    int n, nnz, offset, l, block_dim;
     int mpist;
     MPI_Comm mpicm;
     //MPI call parameters
@@ -519,6 +535,7 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto

     nv_mtx->getOffsetAndSizeForView(OWNED, &offset, &n);
     nv_mtx->getNnzForView(OWNED, &nnz);
+    block_dim = nv_mtx->get_block_dimx(); // assume square blocks

     if (nv_mtx->manager->part_offsets_h.size() == 0) // create part_offsets_h & part_offsets
     {
@@ -527,11 +544,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto

l = nv_mtx->manager->part_offsets_h.size() - 1; // number of partitions
//some allocations/resizing
hv.resize(nv_vec->size()); // host copy of nv_vec
hv.resize(nv_vec->size() * block_dim); // host copy of nv_vec

if (rank == root)
{
hg.resize(nv_mtx->manager->part_offsets_h[l]); // host copy of gvec
hg.resize(nv_mtx->manager->part_offsets_h[l] * block_dim); // host copy of gvec
rc.resize(l);
di.resize(l);
}
@@ -554,6 +571,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
         cudaCheckError();
         thrust::copy(nv_mtx->manager->part_offsets_h.begin(), nv_mtx->manager->part_offsets_h.begin() + l, di.begin());
         cudaCheckError();
+
+        for (int i = 0; i < l; ++i) {
+            rc[i] *= block_dim;
+            di[i] *= block_dim;
+        }
     }

     //alias raw pointers to thrust vector data (see thrust example unwrap_pointer for details)
@@ -566,11 +588,11 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
     //gather (on the host)
     if (typeid(t_VecPrec) == typeid(float))
     {
-        mpist = MPI_Gatherv(hv_ptr, n, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
+        mpist = MPI_Gatherv(hv_ptr, n * block_dim, MPI_FLOAT, hg_ptr, rc_ptr, di_ptr, MPI_FLOAT, root, mpicm);
     }
     else if (typeid(t_VecPrec) == typeid(double))
     {
-        mpist = MPI_Gatherv(hv_ptr, n, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
+        mpist = MPI_Gatherv(hv_ptr, n * block_dim, MPI_DOUBLE, hg_ptr, rc_ptr, di_ptr, MPI_DOUBLE, root, mpicm);
     }
     else
     {
@@ -587,24 +609,26 @@ int construct_global_vector(int &root, int &rank, Matrix<TConfig> *nv_mtx, Vecto
     if (partition_vector != NULL)
     {
         //sanity check
-        if (partition_vector_size != hg.size())
+        if (partition_vector_size * block_dim != hg.size())
         {
             FatalError("partition_vector_size does not match the global vector size", AMGX_ERR_CORE);
         }

         //construct a map (based on partition vector)
-        int i, j, nranks;
+        int i, j, k, nranks;
         MPI_Comm_size(mpicm, &nranks);
         thrust::host_vector<t_IndPrec> c(nranks, 0);
         thrust::host_vector<t_IndPrec> map(hg.size());
         thrust::host_vector<t_IndPrec> imap(hg.size());

-        for (i = 0; i < hg.size(); i++)
+        for (i = 0; i < nv_mtx->manager->part_offsets_h[l]; i++)
         {
             j = partition_vector[i];
-            map[i] = nv_mtx->manager->part_offsets_h[j] + c[j];
-            imap[map[i]] = i;
-            c[j]++;
+            for (k = 0; k < block_dim; ++k) {
+                //part_offsets_h is in node units, so the base of rank j's region is scaled by block_dim;
+                //c[j] counts scalar entries already placed in that region
+                map[i * block_dim + k] = nv_mtx->manager->part_offsets_h[j] * block_dim + c[j];
+                imap[map[i * block_dim + k]] = i * block_dim + k;
+                c[j]++;
+            }
         }

         //permute according to map during copy (host -> host or device depending on vector type)
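Note: the map built above sends scalar entry k of node i to the gathered position inside its owning rank's region, with part_offsets_h in node units (hence the scaling by block_dim); imap is the inverse permutation used during the copy. A small standalone sketch of the same construction (hypothetical inputs, not part of this PR):

// Hypothetical illustration: permutation map from a partition vector with blocks.
#include <cstdio>
#include <vector>

int main()
{
    const int block_dim = 2;
    //partition_vector[i] = rank owning node i; part_offsets[r] = first node of rank r (made-up values)
    std::vector<int> partition_vector = {1, 0, 1, 0};
    std::vector<int> part_offsets = {0, 2, 4};
    const int n_nodes = (int)partition_vector.size();
    const int nranks = (int)part_offsets.size() - 1;

    std::vector<int> c(nranks, 0); //scalar entries placed per rank so far
    std::vector<int> map(n_nodes * block_dim), imap(n_nodes * block_dim);

    for (int i = 0; i < n_nodes; i++)
    {
        int j = partition_vector[i];
        for (int k = 0; k < block_dim; k++)
        {
            //rank j's region starts at node offset part_offsets[j], scaled to scalar entries
            map[i * block_dim + k] = part_offsets[j] * block_dim + c[j];
            imap[map[i * block_dim + k]] = i * block_dim + k;
            c[j]++;
        }
    }

    for (int v = 0; v < n_nodes * block_dim; v++) printf("%d -> %d\n", v, map[v]);
    return 0;
}

The resulting map is a bijection on the scalar entries, grouping each node's block_dim entries contiguously within its owner's region.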
src/distributed/distributed_manager.cu (10 changes: 7 additions & 3 deletions)
@@ -3253,7 +3253,7 @@ void DistributedManager<TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indP
 template <AMGX_VecPrecision t_vecPrec, AMGX_MatPrecision t_matPrec, AMGX_IndPrecision t_indPrec>
 void DistributedManager<TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indPrec> >::unpack_partition(index_type *Bp, index_type *Bc, mat_value_type *Bv)
 {
-    index_type l, n, nnz, offset;
+    index_type l, n, nnz, offset, block_dimx, block_dimy;
     index_type *ir;
     index_type *Ap;
     index_type *Ac;
@@ -3262,6 +3262,10 @@ void DistributedManager<TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indP
     //some initializations
     this->A->getOffsetAndSizeForView(OWNED, &offset, &n);
     this->A->getNnzForView(OWNED, &nnz);
+
+    block_dimx = this->A->get_block_dimx();
+    block_dimy = this->A->get_block_dimy();
+
     l = this->inverse_renumbering.size();
     ir = this->inverse_renumbering.raw();
     Ap = this->A->row_offsets.raw();
@@ -3270,15 +3274,15 @@ void DistributedManager<TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indP
     //(i) reorder the matrix back (into mixed interior-boundary nodes)
     //applies to rows and columns (out-of-place)
     reorder_partition<index_type, mat_value_type, true, true>
-        (n, nnz, Ap, Ac, Av, Bp, Bc, Bv, l, ir);
+        (n, nnz, Ap, Ac, Av, Bp, Bc, Bv, l, ir, block_dimx, block_dimy);
     cudaCheckError();
     //obtain reordering q that combines the shift of the diagonal block with the off-diagonal block indices conversion from local to global
     this->obtain_shift_l2g_reordering(n, this->local_to_global_map, this->inverse_renumbering, q);
     cudaCheckError();
     //(ii) reorder the matrix back (shift the diagonal block and convert off-diagonal block column indices from local to global)
     //applies columns only (in-place)
     reorder_partition<index_type, mat_value_type, false, true>
-        (n, nnz, Bp, Bc, Bv, Bp, Bc, Bv, q.size(), q.raw());
+        (n, nnz, Bp, Bc, Bv, Bp, Bc, Bv, q.size(), q.raw(), block_dimx, block_dimy);
     cudaCheckError();
 }

src/matrix.cu (1 change: 1 addition & 0 deletions)
@@ -327,6 +327,7 @@ Matrix< TemplateConfig<AMGX_device, t_vecPrec, t_matPrec, t_indPrec> >::print(ch

         for (xdim = 0; xdim < this->get_block_dimx(); xdim++)
         {
-            a = this->values[ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim];
+            int loc = ii * this->get_block_dimx() * this->get_block_dimy() + this->get_block_dimy() * ydim + xdim;
+            a = this->values[loc];
             fprintf(fid, "%d %d ", i + 1, j + 1);
             types::util<value_type>::fprintf(fid, "%20.16f", a);