Cuda LU GW kernel #4

Merged: 8 commits, Apr 6, 2024
7 changes: 4 additions & 3 deletions src/cu_routines.cu
@@ -167,10 +167,11 @@ namespace green::gpu {
template <typename prec>
cugw_utils<prec>::cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
ztensor_view<5>& G_tskij_host, bool low_device_memory, const MatrixXcd& Ttn_FB,
const MatrixXcd& Tnt_BF, int _myid, int _intranode_rank, int _devCount_per_node) :
const MatrixXcd& Tnt_BF, LinearSolverType cuda_lin_solver, int _myid, int _intranode_rank,
int _devCount_per_node) :
_low_device_memory(low_device_memory), qkpts(_nqkpt), V_Qpm(_NQ, _nao, _nao), V_Qim(_NQ, _nao, _nao),
Gk1_stij(_ns, _nts, _nao, _nao), Gk_smtij(_ns, _nts, _nao, _nao),
qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data()) {
qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver) {
if (cudaSetDevice(_intranode_rank % _devCount_per_node) != cudaSuccess) throw std::runtime_error("Error in cudaSetDevice2");
if (cublasCreate(&_handle) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas");
@@ -216,7 +217,7 @@ namespace green::gpu {
qpt.verbose() = verbose;

for (size_t q_reduced_id = _devices_rank; q_reduced_id < _ink; q_reduced_id += _devices_size) {
if( verbose > 2)std::cout << "q = " << q_reduced_id << std::endl;
if (verbose > 2) std::cout << "q = " << q_reduced_id << std::endl;
size_t q = reduced_to_full[q_reduced_id];
qpt.reset_Pqk0();
for (size_t k = 0; k < _nk; ++k) {
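The new LinearSolverType argument is threaded from the cugw_utils constructor down into gw_qpt, so the solver is chosen once on the host and fixed for the lifetime of the kernel object. A minimal sketch of a call site, assuming every variable besides LinearSolverType and cugw_utils is already in scope (the actual call site is outside this diff):

    // Hedged sketch, not from this PR: choosing the solver at construction time.
    LinearSolverType solver = use_lu ? LinearSolverType::LU
                                     : LinearSolverType::Cholesky;
    cugw_utils<double> cugw(nts, nt_batch, nw_b, ns, nk, ink, nqkpt, NQ, nao,
                            G_tskij_host, low_device_memory, Ttn_FB, Tnt_BF,
                            solver, myid, intranode_rank, devCount_per_node);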
48 changes: 48 additions & 0 deletions src/cublas_routines_prec.cu
@@ -178,3 +178,51 @@ cusolverStatus_t POTRS(cusolverDnHandle_t handle,
int *devInfo) {
return cusolverDnCpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
}


cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize) {
return cublasCgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
}
cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuDoubleComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize) {
return cublasZgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
}


cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuComplex *const Aarray[],
int lda,
const int *devIpiv,
cuComplex *const Barray[],
int ldb,
int *info,
int batchSize) {
return cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
}
cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuDoubleComplex *const Aarray[],
int lda,
const int *devIpiv,
cuDoubleComplex *const Barray[],
int ldb,
int *info,
int batchSize) {
return cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
}
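These wrappers follow the same overload pattern as the existing POTRF/POTRS ones: one overload per cuBLAS precision, so templated callers resolve cublasCgetrfBatched or cublasZgetrfBatched purely by argument type. A minimal sketch of such a caller (helper name and error text are illustrative, not from this PR):

    template <typename cuda_complex>
    void lu_factor_batch(cublasHandle_t handle, int n, cuda_complex* const* Aarray,
                         int* d_pivots, int* d_info, int batch) {
      // Resolves to the C or Z variant depending on cuda_complex; pivots and
      // per-matrix status are device arrays, as cublas<t>getrfBatched requires.
      if (GETRF_BATCHED(handle, n, Aarray, n, d_pivots, d_info, batch) != CUBLAS_STATUS_SUCCESS)
        throw std::runtime_error("batched GETRF failed");
    }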
93 changes: 86 additions & 7 deletions src/cugw_qpt.cu
@@ -23,7 +23,8 @@
namespace green::gpu {
template<typename prec>
gw_qpt<prec>::gw_qpt(int nao, int naux, int nt, int nw_b,
const std::complex<double> *T_tw_fb_host, const std::complex<double> *T_wt_bf_host):
const std::complex<double> *T_tw_fb_host, const std::complex<double> *T_wt_bf_host,
LinearSolverType cuda_lin_solver):
nao_(nao),
nao2_(nao * nao),
nao3_(nao2_ * nao),
@@ -42,7 +43,7 @@ namespace green::gpu {
handle_(nullptr),
solver_handle_(nullptr),
streams_potrs_(nw_b),
potrs_ready_event_(nw_b), one_minus_P_ready_event_(nw_b) {
potrs_ready_event_(nw_b), one_minus_P_ready_event_(nw_b), cuda_lin_solver_(cuda_lin_solver) {
allocate_IR_transformation_matrices(&T_tw_, &T_wt_, T_tw_fb_host, T_wt_bf_host, nt, nw_b);
}

@@ -58,7 +59,14 @@
}
cudaEventCreateWithFlags(&polarization_ready_event_, cudaEventDisableTiming);
cudaEventCreateWithFlags(&bare_polarization_ready_event_, cudaEventDisableTiming);
cudaEventCreateWithFlags(&Cholesky_decomposition_ready_event_, cudaEventDisableTiming);
if(cuda_lin_solver_ == LinearSolverType::Cholesky) {
cudaEventCreateWithFlags(&Cholesky_decomposition_ready_event_, cudaEventDisableTiming);
}
else {
cudaEventCreateWithFlags(&LU_decomposition_ready_event_, cudaEventDisableTiming);
cudaEventCreateWithFlags(&getrs_ready_event_, cudaEventDisableTiming);
}


// We assume nt_ >= nw_b_ and reuse the same memory for P0 and P in imaginary time and frequency domain
// This is a safe assumption in IR and Chebyshev representation
@@ -77,18 +85,30 @@

if (cudaMalloc(&one_minus_P_w_ptrs_, nw_b_ * sizeof(cuda_complex*)) != cudaSuccess)
throw std::runtime_error("failure allocating one_minus_P_w_ptrs_");
if(cuda_lin_solver_ == LinearSolverType::LU) {
if (cudaMalloc(&P0_w_ptrs_, nw_b_ * sizeof(cuda_complex*)) != cudaSuccess)
throw std::runtime_error("failure allocating P0_w_ptrs_");
}
if (cudaMalloc(&d_info_, nw_b_ * sizeof(int)) != cudaSuccess)
throw std::runtime_error("failure allocating info int on device");
if(cuda_lin_solver_ == LinearSolverType::LU) {
if (cudaMalloc(&Pivot_, naux_ * nw_b_ * sizeof(int)) != cudaSuccess)
throw std::runtime_error("cudaMalloc failed to allocate Pivot");
}

if (cusolverDnSetStream(*solver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS)
throw std::runtime_error("cusolver set stream problem");

// locks so that different threads don't write the results over each other
cudaMalloc(&Pqk0_tQP_lock_, sizeof(int));
cudaMemset(Pqk0_tQP_lock_, 0, sizeof(int));
// For batched potrf
// For batched potrf/LU
set_batch_pointer<<<1, 1, 0, stream_>>>(one_minus_P_w_ptrs_, one_minus_P_wPQ_, naux2_, nw_b_);
// for LU
if(cuda_lin_solver_ == LinearSolverType::LU) {
set_batch_pointer<<<1, 1, 0, stream_>>>(P0_w_ptrs_, Pqk0_wQP_, naux2_, nw_b_);
}

}

template <typename prec>
@@ -101,14 +121,23 @@
}
cudaEventDestroy(polarization_ready_event_);
cudaEventDestroy(bare_polarization_ready_event_);
cudaEventDestroy(Cholesky_decomposition_ready_event_);
if(cuda_lin_solver_ == LinearSolverType::LU) {
cudaEventDestroy(LU_decomposition_ready_event_);
}
else {
cudaEventDestroy(Cholesky_decomposition_ready_event_);
}

cudaFree(Pqk0_tQP_);
cudaFree(Pqk0_tQP_lock_);
cudaFree(Pqk0_wQP_);

cudaFree(one_minus_P_w_ptrs_);
cudaFree(d_info_);
if(cuda_lin_solver_ == LinearSolverType::LU) {
cudaFree(P0_w_ptrs_);
cudaFree(Pivot_);
}
cudaFree(T_tw_);
cudaFree(T_wt_);
}
@@ -157,7 +186,7 @@ namespace green::gpu {
}

template <typename prec>
void gw_qpt<prec>::compute_Pq() {
void gw_qpt<prec>::compute_Pq_chol() {
int threads_per_block = 512;
int blocks_for_id = naux_ / threads_per_block + 1;
if (_verbose > 2) std::cout << "Running Cholesky solver for (I - P0)P = P0" << std::endl;
@@ -199,6 +228,56 @@
}
}

template <typename prec>
void gw_qpt<prec>::compute_Pq_lu() {
int threads_per_block = 512;
int blocks_for_id = naux_ / threads_per_block + 1;
if (_verbose > 2) std::cout << "Running pivoted LU solver for (I - P0)P = P0" << std::endl;
for (int w = 0; w < nw_b_; ++w) {
cudaStreamWaitEvent(streams_potrs_[w], bare_polarization_ready_event_, 0);
hermitian_symmetrize<<<blocks_for_id, threads_per_block, 0, streams_potrs_[w]>>>(Pqk0_wQP_ + w * naux2_, naux_);
set_up_one_minus_P<<<blocks_for_id, threads_per_block, 0, streams_potrs_[w]>>>(one_minus_P_wPQ_ + w * naux2_,
Pqk0_wQP_ + w * naux2_, naux_);
cudaEventRecord(one_minus_P_ready_event_[w], streams_potrs_[w]);
}
for (int w = 0; w < nw_b_; ++w) {
cudaStreamWaitEvent(stream_, one_minus_P_ready_event_[w], 0 /*cudaEventWaitDefault*/);
}

if (cublasSetStream(*handle_, stream_) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("cublas set stream problem");

// get LU decomposition
if(GETRF_BATCHED(*handle_, naux_, one_minus_P_w_ptrs_,
naux_, Pivot_, d_info_, nw_b_) != CUBLAS_STATUS_SUCCESS) {
throw std::runtime_error("CUDA GETRF failed!");
}
validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_);
cudaEventRecord(LU_decomposition_ready_event_, stream_);

if (cudaStreamWaitEvent(stream_, LU_decomposition_ready_event_, 0 /*cudaEventWaitDefault*/))
throw std::runtime_error("Could not wait for LU decomposition");

if (cublasSetStream(*handle_, stream_) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("cublas set stream problem");

int host_info; // getrs_batched API requires a host-side int

// Apply LU to solve linear system
if(GETRS_BATCHED(*handle_, CUBLAS_OP_N, naux_, naux_, one_minus_P_w_ptrs_,
naux_, Pivot_, P0_w_ptrs_, naux_, &host_info, nw_b_) != CUBLAS_STATUS_SUCCESS) {
throw std::runtime_error("CUDA GETRS failed!");
}

if(host_info != 0)
throw std::runtime_error("cublas GETRS info = " + std::to_string(host_info));

cudaEventRecord(getrs_ready_event_, stream_);
if (cudaStreamWaitEvent(stream_, getrs_ready_event_, 0 /*cudaEventWaitDefault*/))
throw std::runtime_error("Could not wait for GETRS");

}

template <typename prec>
void gw_qpt<prec>::wait_for_kpts() {
if (cudaStreamSynchronize(stream_) != cudaSuccess) throw std::runtime_error("could not wait for other streams");
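For each bosonic frequency w, compute_Pq_lu solves the dense auxiliary-basis system (I - P0(w)) P(w) = P0(w), i.e. P = (I - P0)^{-1} P0: GETRF_BATCHED factors all nw_b_ matrices of size naux_ in one call, and GETRS_BATCHED with nrhs = naux_ then overwrites the P0 buffers with P through P0_w_ptrs_. A standalone sketch of the same batched-LU pattern in double precision (illustrative, not the PR's code):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <stdexcept>
    #include <vector>

    // Factor a batch of n-by-n matrices once, then solve A_i X_i = B_i for all i.
    void batched_lu_solve(cuDoubleComplex* d_A,  // batch * n * n, overwritten by LU factors
                          cuDoubleComplex* d_B,  // batch * n * nrhs, overwritten by solutions
                          int n, int nrhs, int batch) {
      cublasHandle_t handle;
      if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublasCreate");

      // Both batched routines take device arrays of per-matrix device pointers.
      std::vector<cuDoubleComplex*> hA(batch), hB(batch);
      for (int i = 0; i < batch; ++i) { hA[i] = d_A + i * n * n; hB[i] = d_B + i * n * nrhs; }
      cuDoubleComplex **d_Aptrs, **d_Bptrs;
      cudaMalloc(&d_Aptrs, batch * sizeof(cuDoubleComplex*));
      cudaMalloc(&d_Bptrs, batch * sizeof(cuDoubleComplex*));
      cudaMemcpy(d_Aptrs, hA.data(), batch * sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice);
      cudaMemcpy(d_Bptrs, hB.data(), batch * sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice);

      int *d_pivots, *d_info;
      cudaMalloc(&d_pivots, batch * n * sizeof(int));
      cudaMalloc(&d_info, batch * sizeof(int));

      // Pivoted LU of every matrix; per-matrix status lands in d_info on the device
      // (the PR checks it with a validate_info kernel instead of copying it back).
      if (cublasZgetrfBatched(handle, n, d_Aptrs, n, d_pivots, d_info, batch) != CUBLAS_STATUS_SUCCESS)
        throw std::runtime_error("getrfBatched");

      // Back-substitute against all right-hand sides; this call reports through a host int.
      int info = 0;
      if (cublasZgetrsBatched(handle, CUBLAS_OP_N, n, nrhs, d_Aptrs, n, d_pivots,
                              d_Bptrs, n, &info, batch) != CUBLAS_STATUS_SUCCESS || info != 0)
        throw std::runtime_error("getrsBatched");

      cudaFree(d_pivots); cudaFree(d_info); cudaFree(d_Aptrs); cudaFree(d_Bptrs);
      cublasDestroy(handle);
    }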
4 changes: 4 additions & 0 deletions src/green/gpu/common_defs.h
@@ -133,5 +133,9 @@ namespace green::gpu {
out[i] = static_cast<std::complex<float>>(in[i]);
}
}

enum class LinearSolverType {
Cholesky, LU
};
} // namespace green::gpu
#endif // GREEN_GPU_COMMON_DEFS_H
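The enum keeps the solver choice explicit in every signature it passes through. If the choice is exposed as a run-time option, a small mapping helper is one way to wire it up; this helper is hypothetical, not part of the PR (assumes <string> and <stdexcept>):

    // Hypothetical helper, not in this PR: map an option string to the enum.
    inline LinearSolverType parse_lin_solver(const std::string& name) {
      if (name == "LU") return LinearSolverType::LU;
      if (name == "Cholesky") return LinearSolverType::Cholesky;
      throw std::invalid_argument("unknown CUDA linear solver: " + name);
    }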
12 changes: 6 additions & 6 deletions src/green/gpu/cu_routines.h
@@ -56,8 +56,8 @@ namespace green::gpu {
using cuda_complex = typename cu_type_map<std::complex<double>>::cuda_type;

public:
cuhf_utils(size_t nk, size_t ink, size_t ns, size_t nao, size_t NQ, size_t nkbatch, ztensor<4> dm_fbz, int _myid,
int _intranode_rank, int _devCount_per_node);
cuhf_utils(size_t nk, size_t ink, size_t ns, size_t nao, size_t NQ, size_t nkbatch, ztensor<4> dm_fbz, int _myid,
int _intranode_rank, int _devCount_per_node);

~cuhf_utils();

@@ -134,11 +134,11 @@
using cuda_complex = typename cu_type_map<std::complex<prec>>::cuda_type;

public:
cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
ztensor_view<5>& G_tskij_host, bool _low_device_memory, const MatrixXcd& Ttn_FB, const MatrixXcd& Tnt_BF,
int _myid, int _intranode_rank, int _devCount_per_node);
cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
ztensor_view<5>& G_tskij_host, bool _low_device_memory, const MatrixXcd& Ttn_FB, const MatrixXcd& Tnt_BF,
LinearSolverType cuda_lin_solver, int _myid, int _intranode_rank, int _devCount_per_node);

~ cugw_utils();
~cugw_utils();

void solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector<size_t>& reduced_to_full,
const std::vector<size_t>& full_to_reduced, std::complex<double>* Vk1k2_Qij, ztensor<5>& Sigma_tskij_host,
38 changes: 38 additions & 0 deletions src/green/gpu/cublas_routines_prec.h
@@ -133,3 +133,41 @@ cusolverStatus_t POTRS(cusolverDnHandle_t handle,
cuComplex *B,
int ldb,
int *devInfo);

cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize);
cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuDoubleComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize);

cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuComplex *const Aarray[],
int lda,
const int *devIpiv,
cuComplex *const Barray[],
int ldb,
int *info,
int batchSize);
cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuDoubleComplex *const Aarray[],
int lda,
const int *devIpiv,
cuDoubleComplex *const Barray[],
int ldb,
int *info,
int batchSize);
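One asymmetry shapes these declarations: in cuBLAS, getrfBatched reports per-matrix singularity through the device infoArray, while getrsBatched reports argument errors through a single host int (hence the host_info variable in compute_Pq_lu above). A minimal sketch of checking both, with illustrative variable names:

    // d_info: device array from GETRF_BATCHED, one status per matrix (0 == success).
    std::vector<int> h_info(batch);
    cudaMemcpy(h_info.data(), d_info, batch * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < batch; ++i)
      if (h_info[i] != 0) throw std::runtime_error("LU failed for matrix " + std::to_string(i));
    // host_info: filled directly by GETRS_BATCHED, no device copy needed.
    if (host_info != 0) throw std::runtime_error("GETRS argument error " + std::to_string(host_info));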