diff --git a/src/parcsr_mv/_hypre_parcsr_mv.h b/src/parcsr_mv/_hypre_parcsr_mv.h
index a75b6f8094..5727bb5d77 100644
--- a/src/parcsr_mv/_hypre_parcsr_mv.h
+++ b/src/parcsr_mv/_hypre_parcsr_mv.h
@@ -351,6 +351,10 @@ typedef struct hypre_ParCSRMatrix_struct
    /* these two arrays are reserveed for SoC matrices on GPUs to help build interpolation */
    HYPRE_Int *soc_diag_j;
    HYPRE_Int *soc_offd_j;
+
+   /* These arrays are reserved for pinned data transfer */
+   char *send_pinned;
+   char *recv_pinned;
 #endif
 } hypre_ParCSRMatrix;
 
@@ -389,6 +393,8 @@ typedef struct hypre_ParCSRMatrix_struct
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
 #define hypre_ParCSRMatrixSocDiagJ(matrix) ((matrix) -> soc_diag_j)
 #define hypre_ParCSRMatrixSocOffdJ(matrix) ((matrix) -> soc_offd_j)
+#define hypre_ParCSRMatrixSendPinned(matrix) ((matrix) -> send_pinned)
+#define hypre_ParCSRMatrixRecvPinned(matrix) ((matrix) -> recv_pinned)
 #endif
 
 #define hypre_ParCSRMatrixNumRows(matrix) hypre_CSRMatrixNumRows(hypre_ParCSRMatrixDiag(matrix))
@@ -858,7 +864,11 @@ hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate ( HYPRE_Int job, hypre_ParC
 hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job,
                                                           hypre_ParCSRCommPkg *comm_pkg, HYPRE_MemoryLocation send_memory_location, void *send_data_in,
                                                           HYPRE_MemoryLocation recv_memory_location, void *recv_data_in );
+hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate_v3 ( HYPRE_Int job,
+                                                          hypre_ParCSRCommPkg *comm_pkg, HYPRE_MemoryLocation send_memory_location, void *send_data_in, void *send_data_pinned,
+                                                          HYPRE_MemoryLocation recv_memory_location, void *recv_data_in, void *recv_data_pinned );
 HYPRE_Int hypre_ParCSRCommHandleDestroy ( hypre_ParCSRCommHandle *comm_handle );
+HYPRE_Int hypre_ParCSRCommHandleDestroy_v3 ( hypre_ParCSRCommHandle *comm_handle );
 void hypre_ParCSRCommPkgCreate_core ( MPI_Comm comm, HYPRE_BigInt *col_map_offd,
                                       HYPRE_BigInt first_col_diag, HYPRE_BigInt *col_starts, HYPRE_Int num_cols_diag,
                                       HYPRE_Int num_cols_offd, HYPRE_Int *p_num_recvs, HYPRE_Int **p_recv_procs,
diff --git a/src/parcsr_mv/par_csr_communication.c b/src/parcsr_mv/par_csr_communication.c
index 79f83ff1bb..546402c21f 100644
--- a/src/parcsr_mv/par_csr_communication.c
+++ b/src/parcsr_mv/par_csr_communication.c
@@ -611,6 +611,284 @@ hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job,
    return ( comm_handle );
 }
 
+hypre_ParCSRCommHandle*
+hypre_ParCSRCommHandleCreate_v3 ( HYPRE_Int job,
+                                  hypre_ParCSRCommPkg *comm_pkg,
+                                  HYPRE_MemoryLocation send_memory_location,
+                                  void *send_data_in,
+                                  void *send_data_pinned,
+                                  HYPRE_MemoryLocation recv_memory_location,
+                                  void *recv_data_in,
+                                  void *recv_data_pinned )
+{
+   HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+   HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
+   MPI_Comm  comm      = hypre_ParCSRCommPkgComm(comm_pkg);
+   HYPRE_Int num_send_bytes = 0;
+   HYPRE_Int num_recv_bytes = 0;
+   hypre_ParCSRCommHandle *comm_handle;
+   HYPRE_Int num_requests;
+   hypre_MPI_Request *requests;
+   HYPRE_Int i, j;
+   HYPRE_Int my_id, num_procs;
+   HYPRE_Int ip, vec_start, vec_len;
+   void *send_data;
+   void *recv_data;
+
+   /*--------------------------------------------------------------------
+    * hypre_Initialize sets up a communication handle,
+    * posts receives and initiates sends. It always requires num_sends,
+    * num_recvs, recv_procs and send_procs to be set in comm_pkg.
+    * There are different options for job:
+    * job = 1 : is used to initialize communication exchange for the parts
+    *           of vector needed to perform a Matvec, it requires send_data
+    *           and recv_data to be doubles, recv_vec_starts and
+    *           send_map_starts need to be set in comm_pkg.
+    * job = 2 : is used to initialize communication exchange for the parts
+    *           of vector needed to perform a MatvecT, it requires send_data
+    *           and recv_data to be doubles, recv_vec_starts and
+    *           send_map_starts need to be set in comm_pkg.
+    * job = 11: similar to job = 1, but exchanges data of type HYPRE_Int (not HYPRE_Complex),
+    *           requires send_data and recv_data to be HYPRE_Int,
+    *           recv_vec_starts and send_map_starts need to be set in comm_pkg.
+    * job = 12: similar to job = 2, but exchanges data of type HYPRE_Int (not HYPRE_Complex),
+    *           requires send_data and recv_data to be HYPRE_Int,
+    *           recv_vec_starts and send_map_starts need to be set in comm_pkg.
+    * job = 21: similar to job = 1, but exchanges data of type HYPRE_BigInt (not HYPRE_Complex),
+    *           requires send_data and recv_data to be HYPRE_BigInt,
+    *           recv_vec_starts and send_map_starts need to be set in comm_pkg.
+    * job = 22: similar to job = 2, but exchanges data of type HYPRE_BigInt (not HYPRE_Complex),
+    *           requires send_data and recv_data to be HYPRE_BigInt,
+    *           recv_vec_starts and send_map_starts need to be set in comm_pkg.
+    * default: ignores send_data and recv_data, requires send_mpi_types
+    *          and recv_mpi_types to be set in comm_pkg.
+    *          datatypes need to point to absolute
+    *          addresses, e.g. generated using hypre_MPI_Address .
+    *--------------------------------------------------------------------*/
+#ifndef HYPRE_WITH_GPU_AWARE_MPI
+   switch (job)
+   {
+      case 1:
+         num_send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Complex);
+         num_recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Complex);
+         break;
+      case 2:
+         num_send_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Complex);
+         num_recv_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Complex);
+         break;
+      case 11:
+         num_send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Int);
+         num_recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Int);
+         break;
+      case 12:
+         num_send_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_Int);
+         num_recv_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_Int);
+         break;
+      case 21:
+         num_send_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_BigInt);
+         num_recv_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_BigInt);
+         break;
+      case 22:
+         num_send_bytes = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs) * sizeof(HYPRE_BigInt);
+         num_recv_bytes = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends) * sizeof(HYPRE_BigInt);
+         break;
+   }
+
+   hypre_MemoryLocation act_send_memory_location = hypre_GetActualMemLocation(send_memory_location);
+
+   if ( act_send_memory_location == hypre_MEMORY_DEVICE ||
+        act_send_memory_location == hypre_MEMORY_UNIFIED )
+   {
+      send_data = send_data_pinned;
+      hypre_SyncCudaDevice(hypre_handle());
+   }
+   else
+   {
+      send_data = send_data_in;
+   }
+
+   hypre_MemoryLocation act_recv_memory_location = hypre_GetActualMemLocation(recv_memory_location);
+
+   if ( act_recv_memory_location == hypre_MEMORY_DEVICE ||
+        act_recv_memory_location == hypre_MEMORY_UNIFIED )
+   {
+      recv_data = recv_data_pinned;
+   }
+   else
+   {
+      recv_data = recv_data_in;
+   }
+#else /* #ifndef HYPRE_WITH_GPU_AWARE_MPI */
+   send_data = send_data_in;
+   recv_data = recv_data_in;
+   // TODO RL: it seems that we need to sync the CUDA stream before doing GPU-GPU MPI.
+   // Need to check MPI documentation whether this is actually true
+   hypre_SyncCudaComputeStream(hypre_handle());
+#endif
+
+   num_requests = num_sends + num_recvs;
+   requests = hypre_CTAlloc(hypre_MPI_Request, num_requests, HYPRE_MEMORY_HOST);
+
+   hypre_MPI_Comm_size(comm, &num_procs);
+   hypre_MPI_Comm_rank(comm, &my_id);
+
+   j = 0;
+   switch (job)
+   {
+      case 1:
+      {
+         HYPRE_Complex *d_send_data = (HYPRE_Complex *) send_data;
+         HYPRE_Complex *d_recv_data = (HYPRE_Complex *) recv_data;
+         for (i = 0; i < num_recvs; i++)
+         {
+            ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Irecv(&d_recv_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
+                            ip, 0, comm, &requests[j++]);
+         }
+         for (i = 0; i < num_sends; i++)
+         {
+            ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Isend(&d_send_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
+                            ip, 0, comm, &requests[j++]);
+         }
+         break;
+      }
+      case 2:
+      {
+         HYPRE_Complex *d_send_data = (HYPRE_Complex *) send_data;
+         HYPRE_Complex *d_recv_data = (HYPRE_Complex *) recv_data;
+         for (i = 0; i < num_sends; i++)
+         {
+            ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Irecv(&d_recv_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
+                            ip, 0, comm, &requests[j++]);
+         }
+         for (i = 0; i < num_recvs; i++)
+         {
+            ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Isend(&d_send_data[vec_start], vec_len, HYPRE_MPI_COMPLEX,
+                            ip, 0, comm, &requests[j++]);
+         }
+         break;
+      }
+      case 11:
+      {
+         HYPRE_Int *i_send_data = (HYPRE_Int *) send_data;
+         HYPRE_Int *i_recv_data = (HYPRE_Int *) recv_data;
+         for (i = 0; i < num_recvs; i++)
+         {
+            ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         for (i = 0; i < num_sends; i++)
+         {
+            ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         break;
+      }
+      case 12:
+      {
+         HYPRE_Int *i_send_data = (HYPRE_Int *) send_data;
+         HYPRE_Int *i_recv_data = (HYPRE_Int *) recv_data;
+         for (i = 0; i < num_sends; i++)
+         {
+            ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         for (i = 0; i < num_recvs; i++)
+         {
+            ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         break;
+      }
+      case 21:
+      {
+         HYPRE_BigInt *i_send_data = (HYPRE_BigInt *) send_data;
+         HYPRE_BigInt *i_recv_data = (HYPRE_BigInt *) recv_data;
+         for (i = 0; i < num_recvs; i++)
+         {
+            ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         for (i = 0; i < num_sends; i++)
+         {
+            vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+            ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+            hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         break;
+      }
+      case 22:
+      {
+         HYPRE_BigInt *i_send_data = (HYPRE_BigInt *) send_data;
+         HYPRE_BigInt *i_recv_data = (HYPRE_BigInt *) recv_data;
+         for (i = 0; i < num_sends; i++)
+         {
+            vec_start = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgSendMapStart(comm_pkg, i + 1) - vec_start;
+            ip = hypre_ParCSRCommPkgSendProc(comm_pkg, i);
+            hypre_MPI_Irecv(&i_recv_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         for (i = 0; i < num_recvs; i++)
+         {
+            ip = hypre_ParCSRCommPkgRecvProc(comm_pkg, i);
+            vec_start = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i);
+            vec_len = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, i + 1) - vec_start;
+            hypre_MPI_Isend(&i_send_data[vec_start], vec_len, HYPRE_MPI_BIG_INT,
+                            ip, 0, comm, &requests[j++]);
+         }
+         break;
+      }
+   }
+   /*--------------------------------------------------------------------
+    * set up comm_handle and return
+    *--------------------------------------------------------------------*/
+
+   comm_handle = hypre_CTAlloc(hypre_ParCSRCommHandle, 1, HYPRE_MEMORY_HOST);
+
+   hypre_ParCSRCommHandleCommPkg(comm_handle) = comm_pkg;
+   hypre_ParCSRCommHandleSendMemoryLocation(comm_handle) = send_memory_location;
+   hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle) = recv_memory_location;
+   hypre_ParCSRCommHandleNumSendBytes(comm_handle) = num_send_bytes;
+   hypre_ParCSRCommHandleNumRecvBytes(comm_handle) = num_recv_bytes;
+   hypre_ParCSRCommHandleSendData(comm_handle) = send_data_in;
+   hypre_ParCSRCommHandleRecvData(comm_handle) = recv_data_in;
+   hypre_ParCSRCommHandleSendDataBuffer(comm_handle) = send_data;
+   hypre_ParCSRCommHandleRecvDataBuffer(comm_handle) = recv_data;
+   hypre_ParCSRCommHandleNumRequests(comm_handle) = num_requests;
+   hypre_ParCSRCommHandleRequests(comm_handle) = requests;
+
+   return ( comm_handle );
+}
+
 HYPRE_Int
 hypre_ParCSRCommHandleDestroy( hypre_ParCSRCommHandle *comm_handle )
 {
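Note (not part of the patch): a caller-side sketch of how the new pair is meant to be used when the vector data lives on the device and GPU-aware MPI is disabled. The names d_send_buf, d_recv_buf and the one-shot allocation below are illustrative; the real sizing and buffer caching live in par_csr_matvec.c further down. The key points are that the packed send values must already sit in send_pinned when Create_v3 is called (it posts the MPI_Isend from the pinned pointer), and that Destroy_v3 waits on all requests and then copies the received bytes back to the device asynchronously.

   /* illustrative sketch, assuming comm_pkg and device buffers d_send_buf / d_recv_buf exist */
   HYPRE_Int num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
   HYPRE_Int num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
   HYPRE_Int send_len  = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
   HYPRE_Int recv_len  = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs);

   /* pinned, device-mapped staging buffers (normally cached on the matrix) */
   char *send_pinned = _hypre_TAlloc(char, send_len * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
   char *recv_pinned = _hypre_TAlloc(char, recv_len * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);

   /* pack the outgoing halo values into send_pinned first (e.g. by gathering on the
    * device through hypre_HostGetDevicePointer(send_pinned)), then start the exchange */
   hypre_ParCSRCommHandle *h =
      hypre_ParCSRCommHandleCreate_v3(1 /* Matvec exchange */, comm_pkg,
                                      HYPRE_MEMORY_DEVICE, d_send_buf, send_pinned,
                                      HYPRE_MEMORY_DEVICE, d_recv_buf, recv_pinned);

   /* ... overlap the exchange with the local (diag) product ... */

   /* waits on all requests, then enqueues the recv_pinned -> d_recv_buf copy
    * asynchronously on hypre's compute stream */
   hypre_ParCSRCommHandleDestroy_v3(h);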
@@ -682,6 +960,46 @@ hypre_ParCSRCommHandleDestroy( hypre_ParCSRCommHandle *comm_handle )
 }
 
+HYPRE_Int
+hypre_ParCSRCommHandleDestroy_v3( hypre_ParCSRCommHandle *comm_handle )
+{
+   if ( comm_handle == NULL )
+   {
+      return hypre_error_flag;
+   }
+
+   if (hypre_ParCSRCommHandleNumRequests(comm_handle))
+   {
+      hypre_MPI_Status *status0;
+      status0 = hypre_CTAlloc(hypre_MPI_Status,
+                              hypre_ParCSRCommHandleNumRequests(comm_handle), HYPRE_MEMORY_HOST);
+      hypre_MPI_Waitall(hypre_ParCSRCommHandleNumRequests(comm_handle),
+                        hypre_ParCSRCommHandleRequests(comm_handle), status0);
+      hypre_TFree(status0, HYPRE_MEMORY_HOST);
+   }
+
+#ifndef HYPRE_WITH_GPU_AWARE_MPI
+   hypre_MemoryLocation act_recv_memory_location = hypre_GetActualMemLocation(
+                                                      hypre_ParCSRCommHandleRecvMemoryLocation(comm_handle));
+   if ( act_recv_memory_location == hypre_MEMORY_DEVICE ||
+        act_recv_memory_location == hypre_MEMORY_UNIFIED )
+   {
+      hypre_TMemcpyAsync( hypre_ParCSRCommHandleRecvData(comm_handle),
+                          hypre_ParCSRCommHandleRecvDataBuffer(comm_handle),
+                          char,
+                          hypre_ParCSRCommHandleNumRecvBytes(comm_handle),
+                          HYPRE_MEMORY_DEVICE,
+                          HYPRE_MEMORY_HOST );
+   }
+#endif
+
+   hypre_TFree(hypre_ParCSRCommHandleRequests(comm_handle), HYPRE_MEMORY_HOST);
+   hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
+
+   return hypre_error_flag;
+}
+
+
 /* hypre_MatCommPkgCreate_core does all the communications and computations for
    hypre_MatCommPkgCreate ( hypre_ParCSRMatrix *A) and
    hypre_BoolMatCommPkgCreate ( hypre_ParCSRBooleanMatrix *A) To support both
diff --git a/src/parcsr_mv/par_csr_matrix.c b/src/parcsr_mv/par_csr_matrix.c
index 2bd7854847..ace19407ad 100644
--- a/src/parcsr_mv/par_csr_matrix.c
+++ b/src/parcsr_mv/par_csr_matrix.c
@@ -128,6 +128,8 @@ hypre_ParCSRMatrixCreate( MPI_Comm comm,
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
    hypre_ParCSRMatrixSocDiagJ(matrix) = NULL;
    hypre_ParCSRMatrixSocOffdJ(matrix) = NULL;
+   hypre_ParCSRMatrixSendPinned(matrix) = NULL;
+   hypre_ParCSRMatrixRecvPinned(matrix) = NULL;
 #endif
 
    return matrix;
@@ -204,6 +206,8 @@ hypre_ParCSRMatrixDestroy( hypre_ParCSRMatrix *matrix )
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
       hypre_TFree(hypre_ParCSRMatrixSocDiagJ(matrix), HYPRE_MEMORY_DEVICE);
       hypre_TFree(hypre_ParCSRMatrixSocOffdJ(matrix), HYPRE_MEMORY_DEVICE);
+      _hypre_TFree(hypre_ParCSRMatrixSendPinned(matrix), hypre_MEMORY_HOST_PINNED);
+      _hypre_TFree(hypre_ParCSRMatrixRecvPinned(matrix), hypre_MEMORY_HOST_PINNED);
 #endif
 
       hypre_TFree(matrix, HYPRE_MEMORY_HOST);
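Note (not part of the patch): the staging buffers are owned by the matrix. hypre_ParCSRMatrixCreate initializes them to NULL, the matvec routines below allocate them on first use, and hypre_ParCSRMatrixDestroy releases them with _hypre_TFree(..., hypre_MEMORY_HOST_PINNED). A condensed sketch of that allocate-once idiom, with N the larger of the packed send and receive lengths so that a single pair of buffers serves both the Matvec and MatvecT exchanges:

   /* sketch of the lazy allocation used by the matvec routines (sizes in bytes) */
   if (!hypre_ParCSRMatrixSendPinned(A))
   {
      hypre_ParCSRMatrixSendPinned(A) =
         _hypre_TAlloc(char, N * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
   }
   if (!hypre_ParCSRMatrixRecvPinned(A))
   {
      hypre_ParCSRMatrixRecvPinned(A) =
         _hypre_TAlloc(char, N * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
   }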
diff --git a/src/parcsr_mv/par_csr_matvec.c b/src/parcsr_mv/par_csr_matvec.c
index 5da75d708d..9615a9c936 100644
--- a/src/parcsr_mv/par_csr_matvec.c
+++ b/src/parcsr_mv/par_csr_matvec.c
@@ -51,7 +51,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
    HYPRE_Int num_vectors = hypre_VectorNumVectors(x_local);
    HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
    HYPRE_Int ierr = 0;
-   HYPRE_Int num_sends, jv;
+   HYPRE_Int num_sends, num_recvs, jv;
 
    HYPRE_Int vecstride = hypre_VectorVectorStride( x_local );
    HYPRE_Int idxstride = hypre_VectorIndexStride( x_local );
@@ -118,6 +118,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
    }
 
    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+   num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
 
    hypre_assert( num_cols_offd == hypre_ParCSRCommPkgRecvVecStart(comm_pkg,
                                                                   hypre_ParCSRCommPkgNumRecvs(comm_pkg)) );
    hypre_assert( hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0) == 0 );
@@ -176,6 +177,23 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
    }
 #endif
 
+   /* allocate pinned memory buffers */
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   HYPRE_Int Nsends = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
+   HYPRE_Int Nrecvs = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs);
+   HYPRE_Int N = Nsends > Nrecvs ? Nsends : Nrecvs;
+   if (!hypre_ParCSRMatrixSendPinned(A))
+   {
+      char * temp = _hypre_TAlloc(char, N * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
+      hypre_ParCSRMatrixSendPinned(A) = temp;
+   }
+   if (!hypre_ParCSRMatrixRecvPinned(A))
+   {
+      char * temp = _hypre_TAlloc(char, N * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
+      hypre_ParCSRMatrixRecvPinned(A) = temp;
+   }
+#endif
+
    hypre_SeqVectorInitialize_v2(x_tmp, HYPRE_MEMORY_DEVICE);
    x_tmp_data = hypre_VectorData(x_tmp);
 
@@ -239,13 +257,31 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
 
          /* if on device, no need to Sync: send_data is on device memory */
 #if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
-         /* pack send data on device */
-         HYPRE_THRUST_CALL( gather,
-                            hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
-                            hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
-                            hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
-                            locl_data,
-                            send_data );
+
+         /* Write the data directly into the Pinned memory buffer */
+         if (N)
+         {
+            send_data = (HYPRE_Complex *)hypre_HostGetDevicePointer(hypre_ParCSRMatrixSendPinned(A));
+
+#if 0
+            /* This kernel seems a little bit faster than thrust */
+            cusparseHandle_t handle = hypre_HandleCusparseHandle(hypre_handle());
+            HYPRE_CUSPARSE_CALL( cusparseDgthr(handle,
+                                               hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
+                                               locl_data,
+                                               send_data,
+                                               hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
+                                               CUSPARSE_INDEX_BASE_ZERO) );
+#else
+            HYPRE_THRUST_CALL( gather,
+                               hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg),
+                               hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg) +
+                               hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
+                               locl_data,
+                               send_data );
+#endif
+         }
+
 #elif defined(HYPRE_USING_SYCL)
          auto permuted_source = oneapi::dpl::make_permutation_iterator(locl_data,
                                                                        hypre_ParCSRCommPkgDeviceSendMapElmts(comm_pkg));
@@ -253,6 +289,7 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
                             permuted_source,
                             permuted_source + hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends),
                             send_data );
+
 #elif defined(HYPRE_USING_DEVICE_OPENMP)
          /* pack send data on device */
          HYPRE_Int i;
@@ -308,8 +345,13 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
    {
       for ( jv = 0; jv < num_vectors; ++jv )
       {
-         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 1, comm_pkg, HYPRE_MEMORY_DEVICE, x_buf_data[jv],
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v3( 1, comm_pkg, HYPRE_MEMORY_DEVICE, x_buf_data[jv], hypre_ParCSRMatrixSendPinned(A),
+                                                            HYPRE_MEMORY_DEVICE, &x_tmp_data[jv * num_cols_offd], hypre_ParCSRMatrixRecvPinned(A) );
+#else
+         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 1, comm_pkg, HYPRE_MEMORY_DEVICE, x_buf_data[jv],
                                                             HYPRE_MEMORY_DEVICE, &x_tmp_data[jv * num_cols_offd] );
+#endif
       }
    }
 
@@ -337,7 +379,12 @@ hypre_ParCSRMatrixMatvecOutOfPlace( HYPRE_Complex alpha,
    {
       for ( jv = 0; jv < num_vectors; ++jv )
       {
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+         hypre_ParCSRCommHandleDestroy_v3(comm_handle[jv]);
+#else
          hypre_ParCSRCommHandleDestroy(comm_handle[jv]);
+#endif
+         comm_handle[jv] = NULL;
       }
 
      hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
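Note (not part of the patch): the gain here is that the packing kernel writes host-visible memory directly. hypre_HostGetDevicePointer returns a device alias of send_pinned, the Thrust gather fills it, and after the device synchronization inside Create_v3 MPI can send straight from the host pointer, so the explicit device-to-host copy of the send buffer disappears. A reduced sketch of that flow, assuming a CUDA/HIP build without GPU-aware MPI (n, map, d_x, dest, comm and req are illustrative names):

   /* map: device array of n indices to gather; d_x: device vector to pack from */
   HYPRE_Complex *h_buf = (HYPRE_Complex *)
      _hypre_TAlloc(char, n * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
   HYPRE_Complex *d_buf = (HYPRE_Complex *) hypre_HostGetDevicePointer(h_buf);

   /* the device gather writes straight into the mapped host allocation */
   HYPRE_THRUST_CALL(gather, map, map + n, d_x, d_buf);

   /* the gather must have completed before MPI reads h_buf on the host */
   hypre_SyncCudaDevice(hypre_handle());
   hypre_MPI_Isend(h_buf, n, HYPRE_MPI_COMPLEX, dest, 0, comm, &req);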
@@ -439,7 +486,7 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
    HYPRE_Int num_vectors = hypre_VectorNumVectors(y_local);
    HYPRE_Int num_cols_offd = hypre_CSRMatrixNumCols(offd);
    HYPRE_Int ierr = 0;
-   HYPRE_Int num_sends, jv;
+   HYPRE_Int num_sends, num_recvs, jv;
 
    HYPRE_Int vecstride = hypre_VectorVectorStride(y_local);
    HYPRE_Int idxstride = hypre_VectorIndexStride(y_local);
@@ -504,6 +551,7 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
    }
 
    num_sends = hypre_ParCSRCommPkgNumSends(comm_pkg);
+   num_recvs = hypre_ParCSRCommPkgNumRecvs(comm_pkg);
 
    hypre_assert( num_cols_offd == hypre_ParCSRCommPkgRecvVecStart(comm_pkg,
                                                                   hypre_ParCSRCommPkgNumRecvs(comm_pkg)) );
    hypre_assert( hypre_ParCSRCommPkgSendMapStart(comm_pkg, 0) == 0 );
@@ -562,6 +610,28 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
    }
 #endif
 
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+   HYPRE_Int Nsends = hypre_ParCSRCommPkgSendMapStart(comm_pkg, num_sends);
+   HYPRE_Int Nrecvs = hypre_ParCSRCommPkgRecvVecStart(comm_pkg, num_recvs);
+   HYPRE_Int N = Nsends > Nrecvs ? Nsends : Nrecvs;
+   if (!hypre_ParCSRMatrixSendPinned(A))
+   {
+      char * temp = _hypre_TAlloc(char, N * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
+      hypre_ParCSRMatrixSendPinned(A) = temp;
+   }
+   if (!hypre_ParCSRMatrixRecvPinned(A))
+   {
+      char * temp = _hypre_TAlloc(char, N * sizeof(HYPRE_Complex), hypre_MEMORY_HOST_PINNED);
+      hypre_ParCSRMatrixRecvPinned(A) = temp;
+   }
+
+   /* Reassign the pointer of the first SpMV output so that kernel writes data directly into the Pinned memory buffer */
+   if (N)
+   {
+      hypre_VectorData(y_tmp) = (HYPRE_Complex *)hypre_HostGetDevicePointer(hypre_ParCSRMatrixSendPinned(A));
+   }
+#endif
+
    hypre_SeqVectorInitialize_v2(y_tmp, HYPRE_MEMORY_DEVICE);
    y_tmp_data = hypre_VectorData(y_tmp);
 
@@ -651,10 +721,15 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
    {
       for ( jv = 0; jv < num_vectors; ++jv )
       {
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v3( 2, comm_pkg, HYPRE_MEMORY_DEVICE, &y_tmp_data[jv * num_cols_offd], hypre_ParCSRMatrixSendPinned(A),
+                                                            HYPRE_MEMORY_DEVICE, y_buf_data[jv], hypre_ParCSRMatrixRecvPinned(A) );
+#else
          /* this is where we assume multivectors are 'column' storage */
         comm_handle[jv] = hypre_ParCSRCommHandleCreate_v2( 2, comm_pkg, HYPRE_MEMORY_DEVICE,
                                                             &y_tmp_data[jv * num_cols_offd],
                                                             HYPRE_MEMORY_DEVICE, y_buf_data[jv] );
+#endif
       }
    }
 
@@ -690,7 +765,11 @@ hypre_ParCSRMatrixMatvecT( HYPRE_Complex alpha,
    {
       for ( jv = 0; jv < num_vectors; ++jv )
      {
+#if defined(HYPRE_USING_CUDA) || defined(HYPRE_USING_HIP)
+         hypre_ParCSRCommHandleDestroy_v3(comm_handle[jv]);
+#else
         hypre_ParCSRCommHandleDestroy(comm_handle[jv]);
+#endif
         comm_handle[jv] = NULL;
      }
      hypre_TFree(comm_handle, HYPRE_MEMORY_HOST);
diff --git a/src/parcsr_mv/protos.h b/src/parcsr_mv/protos.h
index 9d68befef3..d82c5f0faa 100644
--- a/src/parcsr_mv/protos.h
+++ b/src/parcsr_mv/protos.h
@@ -233,7 +233,11 @@ hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate ( HYPRE_Int job, hypre_ParC
 hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate_v2 ( HYPRE_Int job,
                                                           hypre_ParCSRCommPkg *comm_pkg, HYPRE_MemoryLocation send_memory_location, void *send_data_in,
                                                           HYPRE_MemoryLocation recv_memory_location, void *recv_data_in );
+hypre_ParCSRCommHandle *hypre_ParCSRCommHandleCreate_v3 ( HYPRE_Int job,
+                                                          hypre_ParCSRCommPkg *comm_pkg, HYPRE_MemoryLocation send_memory_location, void *send_data_in, void *send_data_pinned,
+                                                          HYPRE_MemoryLocation recv_memory_location, void *recv_data_in, void *recv_data_pinned );
 HYPRE_Int hypre_ParCSRCommHandleDestroy ( hypre_ParCSRCommHandle *comm_handle );
+HYPRE_Int hypre_ParCSRCommHandleDestroy_v3 ( hypre_ParCSRCommHandle *comm_handle );
 void hypre_ParCSRCommPkgCreate_core ( MPI_Comm comm, HYPRE_BigInt *col_map_offd,
                                       HYPRE_BigInt first_col_diag, HYPRE_BigInt *col_starts, HYPRE_Int num_cols_diag,
                                       HYPRE_Int num_cols_offd, HYPRE_Int *p_num_recvs, HYPRE_Int **p_recv_procs,
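Note (not part of the patch): the transpose product goes one step further and skips the packing gather entirely. Before hypre_SeqVectorInitialize_v2, y_tmp's data pointer is set to the device alias of send_pinned, so the off-diagonal SpMV writes its halo contributions directly into the mapped pinned buffer that Create_v3 later hands to MPI. A reduced sketch of the idea (illustrative; it relies on Initialize_v2 keeping a data pointer that is already set rather than allocating a new one):

   /* bind a work vector to the mapped pinned send buffer (sketch) */
   hypre_Vector *y_tmp = hypre_SeqVectorCreate(num_cols_offd);
   hypre_VectorData(y_tmp) = (HYPRE_Complex *)
      hypre_HostGetDevicePointer(hypre_ParCSRMatrixSendPinned(A));
   hypre_SeqVectorInitialize_v2(y_tmp, HYPRE_MEMORY_DEVICE);

   /* offd^T * x now lands in send_pinned through the mapped alias, and
    * hypre_ParCSRCommHandleCreate_v3(2, ...) can post the sends from the host
    * side without an extra device-to-host copy. */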
diff --git a/src/utilities/_hypre_utilities.h b/src/utilities/_hypre_utilities.h
index 8801caddd8..5baac9817a 100644
--- a/src/utilities/_hypre_utilities.h
+++ b/src/utilities/_hypre_utilities.h
@@ -737,6 +737,13 @@ typedef struct
 } \
 )
 
+#define hypre_TMemcpyAsync(dst, src, type, count, locdst, locsrc) \
+( \
+{ \
+   hypre_MemcpyAsync((void *)(dst), (void *)(src), (size_t)(sizeof(type) * (count)), locdst, locsrc); \
+} \
+)
+
 #define hypre_TFree(ptr, location) \
 ( \
 { \
@@ -777,6 +784,9 @@ typedef struct
 #define hypre_TMemcpy(dst, src, type, count, locdst, locsrc) \
 (hypre_Memcpy((void *)(dst), (void *)(src), (size_t)(sizeof(type) * (count)), locdst, locsrc))
 
+#define hypre_TMemcpyAsync(dst, src, type, count, locdst, locsrc) \
+(hypre_MemcpyAsync((void *)(dst), (void *)(src), (size_t)(sizeof(type) * (count)), locdst, locsrc))
+
 #define hypre_TFree(ptr, location) \
 ( hypre_Free((void *)ptr, location), ptr = NULL )
 
@@ -796,8 +806,11 @@ void hypre_MemPrefetch(void *ptr, size_t size, HYPRE_MemoryLocation location);
 void * hypre_MAlloc(size_t size, HYPRE_MemoryLocation location);
 void * hypre_CAlloc( size_t count, size_t elt_size, HYPRE_MemoryLocation location);
 void hypre_Free(void *ptr, HYPRE_MemoryLocation location);
+void * hypre_HostGetDevicePointer(void *hostPtr);
 void hypre_Memcpy(void *dst, void *src, size_t size, HYPRE_MemoryLocation loc_dst,
                   HYPRE_MemoryLocation loc_src);
+void hypre_MemcpyAsync(void *dst, void *src, size_t size, HYPRE_MemoryLocation loc_dst,
+                       HYPRE_MemoryLocation loc_src);
 void * hypre_ReAlloc(void *ptr, size_t size, HYPRE_MemoryLocation location);
 void * hypre_ReAlloc_v2(void *ptr, size_t old_size, size_t new_size, HYPRE_MemoryLocation location);
 
diff --git a/src/utilities/memory.c b/src/utilities/memory.c
index bd54868bcb..cbdb4bc079 100644
--- a/src/utilities/memory.c
+++ b/src/utilities/memory.c
@@ -324,6 +324,29 @@ hypre_UnifiedMalloc(size_t size, HYPRE_Int zeroinit)
    return ptr;
 }
 
+static inline void *
+hypre_HostGetDevicePointer_core(void * hostPtr)
+{
+   void * devPtr = NULL;
+#if defined(HYPRE_USING_CUDA)
+   HYPRE_CUDA_CALL( cudaHostGetDevicePointer(&devPtr, hostPtr, 0) );
+#endif
+#if defined(HYPRE_USING_HIP)
+   HYPRE_HIP_CALL( hipHostGetDevicePointer(&devPtr, hostPtr, 0) );
+#endif
+   return devPtr;
+}
+
+/*--------------------------------------------------------------------------
+ * hypre_HostGetDevicePointer
+ *--------------------------------------------------------------------------*/
+
+void *
+hypre_HostGetDevicePointer(void *hostPtr)
+{
+   return hypre_HostGetDevicePointer_core(hostPtr);
+}
+
 static inline void *
 hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit)
 {
@@ -334,11 +357,12 @@ hypre_HostPinnedMalloc(size_t size, HYPRE_Int zeroinit)
 #else
 
 #if defined(HYPRE_USING_CUDA)
-   HYPRE_CUDA_CALL( cudaMallocHost(&ptr, size) );
+   //HYPRE_CUDA_CALL( cudaMallocHost(&ptr, size) );
+   HYPRE_CUDA_CALL( cudaHostAlloc(&ptr, size, cudaHostAllocMapped) );
 #endif
 
 #if defined(HYPRE_USING_HIP)
-   HYPRE_HIP_CALL( hipHostMalloc(&ptr, size) );
+   HYPRE_HIP_CALL( hipHostMalloc(&ptr, size, hipHostMallocMapped) );
 #endif
 
 #if defined(HYPRE_USING_SYCL)
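Note (not part of the patch): hypre_HostPinnedMalloc now requests *mapped* pinned memory, which is what makes hypre_HostGetDevicePointer meaningful for every hypre_MEMORY_HOST_PINNED allocation. A minimal sketch of the underlying CUDA calls (HIP is analogous; nbytes is illustrative and HYPRE_CUDA_CALL error checking is omitted). On systems with unified virtual addressing the returned device alias is usually the same address as the host pointer, but querying it keeps the code correct on platforms where it is not:

   size_t nbytes = 1 << 20;   /* illustrative size */
   void  *h_ptr  = NULL, *d_ptr = NULL;

   cudaHostAlloc(&h_ptr, nbytes, cudaHostAllocMapped);   /* pinned + device-mappable */
   cudaHostGetDevicePointer(&d_ptr, h_ptr, 0);           /* flags must be 0 */

   /* kernels can read/write d_ptr; the host sees the same bytes through h_ptr */

   cudaFreeHost(h_ptr);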
@@ -546,6 +570,41 @@ _hypre_Free(void *ptr, hypre_MemoryLocation location)
    hypre_Free_core(ptr, location);
 }
 
+/*--------------------------------------------------------------------------
+ * MemcpyAsync
+ *--------------------------------------------------------------------------*/
+static inline void
+hypre_MemcpyAsync_core(void *dst, void *src, size_t size, hypre_MemoryLocation loc_dst,
+                       hypre_MemoryLocation loc_src)
+{
+   if (dst == NULL || src == NULL)
+   {
+      if (size)
+      {
+         hypre_printf("hypre_MemcpyAsync warning: copy %ld bytes from %p to %p !\n", size, src, dst);
+         hypre_assert(0);
+      }
+
+      return;
+   }
+
+   if (dst == src)
+   {
+      return;
+   }
+
+   /* Device/UVM <-- Host (pinned); the only direction needed so far */
+   if (loc_dst == hypre_MEMORY_UNIFIED || loc_dst == hypre_MEMORY_DEVICE)
+   {
+#if defined(HYPRE_USING_CUDA)
+      HYPRE_CUDA_CALL( cudaMemcpyAsync(dst, src, size, cudaMemcpyHostToDevice, hypre_HandleComputeStream(hypre_handle())) );
+#endif
+#if defined(HYPRE_USING_HIP)
+      HYPRE_HIP_CALL( hipMemcpyAsync(dst, src, size, hipMemcpyHostToDevice, hypre_HandleComputeStream(hypre_handle())) );
+#endif
+      return;
+   }
+}
 
 /*--------------------------------------------------------------------------
  * Memcpy
@@ -927,6 +986,18 @@ hypre_Memcpy(void *dst, void *src, size_t size, HYPRE_MemoryLocation loc_dst,
                      hypre_GetActualMemLocation(loc_src) );
 }
 
+/*--------------------------------------------------------------------------
+ * hypre_MemcpyAsync
+ *--------------------------------------------------------------------------*/
+
+void
+hypre_MemcpyAsync(void *dst, void *src, size_t size, HYPRE_MemoryLocation loc_dst,
+                  HYPRE_MemoryLocation loc_src)
+{
+   hypre_MemcpyAsync_core( dst, src, size, hypre_GetActualMemLocation(loc_dst),
+                           hypre_GetActualMemLocation(loc_src) );
+}
+
 /*--------------------------------------------------------------------------
  * hypre_ReAlloc
  *--------------------------------------------------------------------------*/
diff --git a/src/utilities/memory.h b/src/utilities/memory.h
index 96693e859c..093fbbf31a 100644
--- a/src/utilities/memory.h
+++ b/src/utilities/memory.h
@@ -223,6 +223,13 @@ typedef struct
 } \
 )
 
+#define hypre_TMemcpyAsync(dst, src, type, count, locdst, locsrc) \
+( \
+{ \
+   hypre_MemcpyAsync((void *)(dst), (void *)(src), (size_t)(sizeof(type) * (count)), locdst, locsrc); \
+} \
+)
+
 #define hypre_TFree(ptr, location) \
 ( \
 { \
@@ -263,6 +270,9 @@ typedef struct
 #define hypre_TMemcpy(dst, src, type, count, locdst, locsrc) \
 (hypre_Memcpy((void *)(dst), (void *)(src), (size_t)(sizeof(type) * (count)), locdst, locsrc))
 
+#define hypre_TMemcpyAsync(dst, src, type, count, locdst, locsrc) \
+(hypre_MemcpyAsync((void *)(dst), (void *)(src), (size_t)(sizeof(type) * (count)), locdst, locsrc))
+
 #define hypre_TFree(ptr, location) \
 ( hypre_Free((void *)ptr, location), ptr = NULL )
 
@@ -282,6 +292,7 @@ void hypre_MemPrefetch(void *ptr, size_t size, HYPRE_MemoryLocation location);
 void * hypre_MAlloc(size_t size, HYPRE_MemoryLocation location);
 void * hypre_CAlloc( size_t count, size_t elt_size, HYPRE_MemoryLocation location);
 void hypre_Free(void *ptr, HYPRE_MemoryLocation location);
+void * hypre_HostGetDevicePointer(void *hostPtr);
 void hypre_Memcpy(void *dst, void *src, size_t size, HYPRE_MemoryLocation loc_dst,
                   HYPRE_MemoryLocation loc_src);
 void * hypre_ReAlloc(void *ptr, size_t size, HYPRE_MemoryLocation location);
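Note (not part of the patch): hypre_TMemcpyAsync mirrors hypre_TMemcpy but enqueues the copy on hypre's compute stream and returns immediately. As written, hypre_MemcpyAsync_core only implements the pinned-host-to-device direction that hypre_ParCSRCommHandleDestroy_v3 needs (other location pairs fall through without copying), and because the copy is asynchronous, anything outside the compute stream must synchronize before touching the destination. A hedged usage sketch (d_dest, h_pinned and nbytes are illustrative names):

   /* enqueue a pinned-host -> device copy on the compute stream (non-blocking) */
   hypre_TMemcpyAsync(d_dest, h_pinned, char, nbytes,
                      HYPRE_MEMORY_DEVICE, HYPRE_MEMORY_HOST);

   /* kernels launched later on the same stream see the data in order;
    * host reads or other streams need an explicit synchronization first */
   hypre_SyncCudaComputeStream(hypre_handle());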