Matvec pinned comm #663

Draft · wants to merge 2 commits into master
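In outline, the change adds two pinned host staging buffers (send_pinned, recv_pinned) to hypre_ParCSRMatrix, a hypre_ParCSRCommHandleCreate_v3 that posts MPI sends and receives on those buffers when GPU-aware MPI is unavailable, and a matching hypre_ParCSRCommHandleDestroy_v3 that copies the received halo back to the device. A minimal sketch of how a Matvec could drive the new routines is shown below; A, comm_pkg, num_sends, num_recvs, x_buf_d (packed device send buffer), x_ext_d (device halo buffer), and the lazy allocation itself are illustrative assumptions rather than part of this diff, and the pinned allocation mirrors the _hypre_TFree(..., hypre_MEMORY_HOST_PINNED) calls added in par_csr_matrix.c.

/* Assumed context: HYPRE_WITH_GPU_AWARE_MPI is not defined and the vector data is on the GPU */
HYPRE_Int send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Complex);
HYPRE_Int recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Complex);

/* Lazily allocate the pinned host staging buffers owned by the matrix */
if (!hypre_ParCSRMatrixSendPinned(A))
{
   hypre_ParCSRMatrixSendPinned(A) = _hypre_TAlloc(char, send_bytes, hypre_MEMORY_HOST_PINNED);
   hypre_ParCSRMatrixRecvPinned(A) = _hypre_TAlloc(char, recv_bytes, hypre_MEMORY_HOST_PINNED);
}

/* Stage the packed send entries from the device into the pinned send buffer;
   Create_v3 synchronizes the device before posting MPI on it */
hypre_TMemcpyAsync(hypre_ParCSRMatrixSendPinned(A), x_buf_d, char, send_bytes,
                   HYPRE_MEMORY_HOST, HYPRE_MEMORY_DEVICE);

/* job = 1: Matvec exchange of HYPRE_Complex; MPI operates on the pinned buffers */
hypre_ParCSRCommHandle *comm_handle =
   hypre_ParCSRCommHandleCreate_v3(1, comm_pkg,
                                   HYPRE_MEMORY_DEVICE, x_buf_d, hypre_ParCSRMatrixSendPinned(A),
                                   HYPRE_MEMORY_DEVICE, x_ext_d, hypre_ParCSRMatrixRecvPinned(A));

/* ... overlap the local (diag) part of the Matvec here ... */

/* Waits on all requests, then copies the received halo from the pinned
   recv buffer back to x_ext_d on the device */
hypre_ParCSRCommHandleDestroy_v3(comm_handle);

Because the buffers hang off the matrix and are only freed in hypre_ParCSRMatrixDestroy, repeated Matvecs can reuse the same pinned allocations instead of pinning and unpinning memory on every exchange.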
10 changes: 10 additions & 0 deletions src/parcsr_mv/_hypre_parcsr_mv.h
@@ -351,6 +351,10 @@ typedef struct hypre_ParCSRMatrix_struct
/* these two arrays are reserved for SoC matrices on GPUs to help build interpolation */
HYPRE_Int *soc_diag_j;
HYPRE_Int *soc_offd_j;

/* These arrays are reserved for pinned data transfer */
char *send_pinned;
char *recv_pinned;
#endif

} hypre_ParCSRMatrix;
@@ -389,6 +393,8 @@ typedef struct hypre_ParCSRMatrix_struct
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
#define hypre_ParCSRMatrixSocDiagJ(matrix) ((matrix) -> soc_diag_j)
#define hypre_ParCSRMatrixSocOffdJ(matrix) ((matrix) -> soc_offd_j)
#define hypre_ParCSRMatrixSendPinned(matrix) ((matrix) -> send_pinned)
#define hypre_ParCSRMatrixRecvPinned(matrix) ((matrix) -> recv_pinned)
#endif

#define hypre_ParCSRMatrixNumRows(matrix) hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(matrix))
@@ -858,7 +864,11 @@ hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate ( HYPRE_Int job, hypre_ParC
hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job,
hypre_ParCSRCommPkg *comm_pkg, HYPRE_MemoryLocation send_memory_location, void *send_data_in,
HYPRE_MemoryLocation recv_memory_location, void *recv_data_in );
hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate_v3 ( HYPRE_Int job,
hypre_ParCSRCommPkg *comm_pkg, HYPRE_MemoryLocation send_memory_location, void *send_data_in, void *send_data_pinned,
HYPRE_MemoryLocation recv_memory_location, void *recv_data_in, void *recv_data_pinned );
HYPRE_Int hypre_ParCSRCommHandleDestroy ( hypre_ParCSRCommHandle *comm_handle );
HYPRE_Int hypre_ParCSRCommHandleDestroy_v3 ( hypre_ParCSRCommHandle *comm_handle );
void hypre_ParCSRCommPkgCreate_core ( MPI_Comm comm, HYPRE_BigInt *col_map_offd,
HYPRE_BigInt first_col_diag, HYPRE_BigInt *col_starts, HYPRE_Int num_cols_diag,
HYPRE_Int num_cols_offd, HYPRE_Int *p_num_recvs, HYPRE_Int **p_recv_procs,
318 changes: 318 additions & 0 deletions src/parcsr_mv/par_csr_communication.c
@@ -611,6 +611,284 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job,
return ( comm_handle );
}

hypre_ParCSRCommHandle*
hypre_ParCSRCommHandleCreate_v3 ( HYPRE_Int job,
hypre_ParCSRCommPkg *comm_pkg,
HYPRE_MemoryLocation send_memory_location,
void *send_data_in,
void *send_data_pinned,
HYPRE_MemoryLocation recv_memory_location,
void *recv_data_in,
void *recv_data_pinned )
{
HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
MPI_Comm comm = hypre_ParCSRCommPkgComm(comm_pkg);
HYPRE_Int num_send_bytes = 0;
HYPRE_Int num_recv_bytes = 0;
hypre_ParCSRCommHandle *comm_handle;
HYPRE_Int num_requests;
hypre_MPI_Request *requests;
HYPRE_Int i, j;
HYPRE_Int my_id, num_procs;
HYPRE_Int ip, vec_start, vec_len;
void *send_data;
void *recv_data;

/*--------------------------------------------------------------------
 * This routine sets up a communication handle, posts the receives,
 * and initiates the sends. It always requires num_sends, num_recvs,
 * recv_procs and send_procs to be set in comm_pkg.
 * There are different options for job:
 * job = 1 : initializes the communication exchange for the parts of the
 *           vector needed to perform a Matvec; it requires send_data
 *           and recv_data to be of type HYPRE_Complex, and
 *           recv_vec_starts and send_map_starts to be set in comm_pkg.
 * job = 2 : initializes the communication exchange for the parts of the
 *           vector needed to perform a MatvecT; it requires send_data
 *           and recv_data to be of type HYPRE_Complex, and
 *           recv_vec_starts and send_map_starts to be set in comm_pkg.
 * job = 11: similar to job = 1, but exchanges data of type HYPRE_Int (not HYPRE_Complex);
 *           requires send_data and recv_data to be of type HYPRE_Int, and
 *           recv_vec_starts and send_map_starts to be set in comm_pkg.
 * job = 12: similar to job = 2, but exchanges data of type HYPRE_Int (not HYPRE_Complex);
 *           requires send_data and recv_data to be of type HYPRE_Int, and
 *           recv_vec_starts and send_map_starts to be set in comm_pkg.
 * job = 21: similar to job = 1, but exchanges data of type HYPRE_BigInt (not HYPRE_Complex);
 *           requires send_data and recv_data to be of type HYPRE_BigInt, and
 *           recv_vec_starts and send_map_starts to be set in comm_pkg.
 * job = 22: similar to job = 2, but exchanges data of type HYPRE_BigInt (not HYPRE_Complex);
 *           requires send_data and recv_data to be of type HYPRE_BigInt, and
 *           recv_vec_starts and send_map_starts to be set in comm_pkg.
 * default: ignores send_data and recv_data; requires send_mpi_types
 *          and recv_mpi_types to be set in comm_pkg. The datatypes need
 *          to point to absolute addresses, e.g. generated using
 *          hypre_MPI_Address.
 *--------------------------------------------------------------------*/
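/* Compared with hypre_ParCSRCommHandleCreate_v2, this version additionally takes
 * pinned host staging buffers (send_data_pinned / recv_data_pinned); when GPU-aware
 * MPI is unavailable and the caller's data resides on the device, MPI is posted on
 * these pinned buffers instead of on the device pointers. */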
#ifndef HYPRE_WITH_GPU_AWARE_MPI
switch (job)
{
case 1:
num_send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Complex);
num_recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Complex);
break;
case 2:
num_send_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Complex);
num_recv_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Complex);
break;
case 11:
num_send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Int);
num_recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Int);
break;
case 12:
num_send_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Int);
num_recv_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Int);
break;
case 21:
num_send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_BigInt);
num_recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_BigInt);
break;
case 22:
num_send_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_BigInt);
num_recv_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_BigInt);
break;
}
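/* Note: for the transpose jobs (2, 12, 22) the data flows in the reverse direction,
 * so the send buffer is sized by recv_vec_starts and the receive buffer by
 * send_map_starts, mirroring the swapped send/recv loops below. */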

hypre_MemoryLocation act_send_memory_location = hypre_GetActualMemLocation(send_memory_location);

if ( act_send_memory_location == hypre_MEMORY_DEVICE ||
act_send_memory_location == hypre_MEMORY_UNIFIED )
{
send_data = send_data_pinned;
hypre_SyncCudaDevice(hypre_handle());
}
else
{
send_data = send_data_in;
}

hypre_MemoryLocation act_recv_memory_location = hypre_GetActualMemLocation(recv_memory_location);

if ( act_recv_memory_location == hypre_MEMORY_DEVICE ||
act_recv_memory_location == hypre_MEMORY_UNIFIED )
{
recv_data = recv_data_pinned;
}
else
{
recv_data = recv_data_in;
}
#else /* #ifndef HYPRE_WITH_GPU_AWARE_MPI */
send_data = send_data_in;
recv_data = recv_data_in;
// TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI.
// Need to check the MPI documentation on whether this is actually true.
hypre_SyncCudaComputeStream(hypre_handle());
#endif
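/* At this point send_data / recv_data point at the pinned host buffers whenever
 * GPU-aware MPI is unavailable and the caller's data lives in device or unified
 * memory; otherwise they alias the caller's original buffers. */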

num_requests = num_sends + num_recvs;
requests = hypre_CTAlloc(hypre_MPI_Request, num_requests, HYPRE_MEMORY_HOST);

hypre_MPI_Comm_size(comm, &num_procs);
hypre_MPI_Comm_rank(comm, &my_id);

j = 0;
switch (job)
{
case 1:
{
HYPRE_Complex *d_send_data = (HYPRE_Complex *) send_data;
HYPRE_Complex *d_recv_data = (HYPRE_Complex *) recv_data;
for (i = 0; i < num_recvs; i++)
{
ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Irecv(&d_recv_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
ip, 0, comm, &requests[j++]);
}
for (i = 0; i < num_sends; i++)
{
ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Isend(&d_send_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
ip, 0, comm, &requests[j++]);
}
break;
}
case 2:
{
HYPRE_Complex *d_send_data = (HYPRE_Complex *) send_data;
HYPRE_Complex *d_recv_data = (HYPRE_Complex *) recv_data;
for (i = 0; i < num_sends; i++)
{
ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Irecv(&d_recv_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
ip, 0, comm, &requests[j++]);
}
for (i = 0; i < num_recvs; i++)
{
ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Isend(&d_send_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
ip, 0, comm, &requests[j++]);
}
break;
}
case 11:
{
HYPRE_Int *i_send_data = (HYPRE_Int *) send_data;
HYPRE_Int *i_recv_data = (HYPRE_Int *) recv_data;
for (i = 0; i < num_recvs; i++)
{
ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_INT,
ip, 0, comm, &requests[j++]);
}
for (i = 0; i < num_sends; i++)
{
ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_INT,
ip, 0, comm, &requests[j++]);
}
break;
}
case 12:
{
HYPRE_Int *i_send_data = (HYPRE_Int *) send_data;
HYPRE_Int *i_recv_data = (HYPRE_Int *) recv_data;
for (i = 0; i < num_sends; i++)
{
ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_INT,
ip, 0, comm, &requests[j++]);
}
for (i = 0; i < num_recvs; i++)
{
ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_INT,
ip, 0, comm, &requests[j++]);
}
break;
}
case 21:
{
HYPRE_BigInt *i_send_data = (HYPRE_BigInt *) send_data;
HYPRE_BigInt *i_recv_data = (HYPRE_BigInt *) recv_data;
for (i = 0; i < num_recvs; i++)
{
ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
ip, 0, comm, &requests[j++]);
}
for (i = 0; i < num_sends; i++)
{
vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
ip, 0, comm, &requests[j++]);
}
break;
}
case 22:
{
HYPRE_BigInt *i_send_data = (HYPRE_BigInt *) send_data;
HYPRE_BigInt *i_recv_data = (HYPRE_BigInt *) recv_data;
for (i = 0; i < num_sends; i++)
{
vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
ip, 0, comm, &requests[j++]);
}
for (i = 0; i < num_recvs; i++)
{
ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
ip, 0, comm, &requests[j++]);
}
break;
}
}
/*--------------------------------------------------------------------
* set up comm_handle and return
*--------------------------------------------------------------------*/

comm_handle = hypre_CTAlloc(hypre_ParCSRCommHandle, 1, HYPRE_MEMORY_HOST);

hypre_ParCSRCommHandleCommPkg(comm_handle) = comm_pkg;
hypre_ParCSRCommHandleSendMemoryLocation(comm_handle) = send_memory_location;
hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle) = recv_memory_location;
hypre_ParCSRCommHandleNumSendBytes(comm_handle) = num_send_bytes;
hypre_ParCSRCommHandleNumRecvBytes(comm_handle) = num_recv_bytes;
hypre_ParCSRCommHandleSendData(comm_handle) = send_data_in;
hypre_ParCSRCommHandleRecvData(comm_handle) = recv_data_in;
hypre_ParCSRCommHandleSendDataBuffer(comm_handle) = send_data;
hypre_ParCSRCommHandleRecvDataBuffer(comm_handle) = recv_data;
hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests;
hypre_ParCSRCommHandleRequests(comm_handle) = requests;

return ( comm_handle );
}

HYPRE_Int
hypre_ParCSRCommHandleDestroy( hypre_ParCSRCommHandle *comm_handle )
{
@@ -682,6 +960,46 @@ hypre_ParCSRCommHandleDestroy( hypre_ParCSRCommHandle *comm_handle )
}


HYPRE_Int
hypre_ParCSRCommHandleDestroy_v3( hypre_ParCSRCommHandle *comm_handle )
{
if ( comm_handle == NULL )
{
return hypre_error_flag;
}

if (hypre_ParCSRCommHandleNumRequests(comm_handle))
{
hypre_MPI_Status *status0;
status0 = hypre_CTAlloc(hypre_MPI_Status,
hypre_ParCSRCommHandleNumRequests(comm_handle), HYPRE_MEMORY_HOST);
hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle),
hypre_ParCSRCommHandleRequests(comm_handle), status0);
hypre_TFree(status0, HYPRE_MEMORY_HOST);
}

#ifndef HYPRE_WITH_GPU_AWARE_MPI
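/* Without GPU-aware MPI the received values were placed in the pinned host recv
 * buffer; once all requests have completed, copy them back to the caller's
 * device-side recv_data asynchronously. */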
hypre_MemoryLocation act_recv_memory_location = hypre_GetActualMemLocation(
hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle));
if ( act_recv_memory_location == hypre_MEMORY_DEVICE ||
act_recv_memory_location == hypre_MEMORY_UNIFIED )
{
hypre_TMemcpyAsync( hypre_ParCSRCommHandleRecvData(comm_handle),
hypre_ParCSRCommHandleRecvDataBuffer(comm_handle),
char,
hypre_ParCSRCommHandleNumRecvBytes(comm_handle),
HYPRE_MEMORY_DEVICE,
HYPRE_MEMORY_HOST );
}
#endif

hypre_TFree(hypre_ParCSRCommHandleRequests(comm_handle), HYPRE_MEMORY_HOST);
hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);

return hypre_error_flag;
}


/* hypre_MatCommPkgCreate_core does all the communications and computations for
hypre_MatCommPkgCreate ( hypre_ParCSRMatrix *A) and
hypre_BoolMatCommPkgCreate ( hypre_ParCSRBooleanMatrix *A) To support both
4 changes: 4 additions & 0 deletions src/parcsr_mv/par_csr_matrix.c
@@ -128,6 +128,8 @@ hypre_ParCSRMatrixCreate( MPI_Comm comm,
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
hypre_ParCSRMatrixSocDiagJ(matrix) = NULL;
hypre_ParCSRMatrixSocOffdJ(matrix) = NULL;
hypre_ParCSRMatrixSendPinned(matrix) = NULL;
hypre_ParCSRMatrixRecvPinned(matrix) = NULL;
#endif

return matrix;
@@ -204,6 +206,8 @@ hypre_ParCSRMatrixDestroy( hypre_ParCSRMatrix *matrix )
#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
hypre_TFree(hypre_ParCSRMatrixSocDiagJ(matrix), HYPRE_MEMORY_DEVICE);
hypre_TFree(hypre_ParCSRMatrixSocOffdJ(matrix), HYPRE_MEMORY_DEVICE);
_hypre_TFree(hypre_ParCSRMatrixSendPinned(matrix), hypre_MEMORY_HOST_PINNED);
_hypre_TFree(hypre_ParCSRMatrixRecvPinned(matrix), hypre_MEMORY_HOST_PINNED);
#endif

hypre_TFree(matrix, HYPRE_MEMORY_HOST);