Cuda LU GW kernel #4

Merged: 8 commits, Apr 6, 2024
7 changes: 4 additions & 3 deletions src/cu_routines.cu
@@ -167,10 +167,11 @@ namespace green::gpu {
template <typename prec>
cugw_utils<prec>::cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
ztensor_view<5>& G_tskij_host, bool low_device_memory, const MatrixXcd& Ttn_FB,
const MatrixXcd& Tnt_BF, int _myid, int _intranode_rank, int _devCount_per_node) :
const MatrixXcd& Tnt_BF, LinearSolverType cuda_lin_solver, int _myid, int _intranode_rank,
int _devCount_per_node) :
_low_device_memory(low_device_memory), qkpts(_nqkpt), V_Qpm(_NQ, _nao, _nao), V_Qim(_NQ, _nao, _nao),
Gk1_stij(_ns, _nts, _nao, _nao), Gk_smtij(_ns, _nts, _nao, _nao),
qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data()) {
qpt(_nao, _NQ, _nts, _nw_b, Ttn_FB.data(), Tnt_BF.data(), cuda_lin_solver) {
if (cudaSetDevice(_intranode_rank % _devCount_per_node) != cudaSuccess) throw std::runtime_error("Error in cudaSetDevice2");
if (cublasCreate(&_handle) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("Rank " + std::to_string(_myid) + ": error initializing cublas");
@@ -216,7 +217,7 @@ namespace green::gpu {
qpt.verbose() = verbose;

for (size_t q_reduced_id = _devices_rank; q_reduced_id < _ink; q_reduced_id += _devices_size) {
if( verbose > 2)std::cout << "q = " << q_reduced_id << std::endl;
if (verbose > 2) std::cout << "q = " << q_reduced_id << std::endl;
size_t q = reduced_to_full[q_reduced_id];
qpt.reset_Pqk0();
for (size_t k = 0; k < _nk; ++k) {
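The new LinearSolverType argument is threaded from the cugw_utils constructor down into gw_qpt, so the solver is chosen once on the host and fixed for the lifetime of the kernel object. A minimal sketch of a call site, assuming every variable besides LinearSolverType and cugw_utils is already in scope (the actual call site is outside this diff):

    // Hedged sketch, not from this PR: choosing the solver at construction time.
    LinearSolverType solver = use_lu ? LinearSolverType::LU
                                     : LinearSolverType::Cholesky;
    cugw_utils<double> cugw(nts, nt_batch, nw_b, ns, nk, ink, nqkpt, NQ, nao,
                            G_tskij_host, low_device_memory, Ttn_FB, Tnt_BF,
                            solver, myid, intranode_rank, devCount_per_node);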
48 changes: 48 additions & 0 deletions src/cublas_routines_prec.cu
@@ -178,3 +178,51 @@ cusolverStatus_t POTRS(cusolverDnHandle_t handle,
int *devInfo) {
return cusolverDnCpotrs(handle, uplo, n, nrhs, A, lda, B, ldb, devInfo);
}


cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize) {
return cublasCgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
}
cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuDoubleComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize) {
return cublasZgetrfBatched(handle, n, Aarray, lda, PivotArray, infoArray, batchSize);
}


cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuComplex *const Aarray[],
int lda,
const int *devIpiv,
cuComplex *const Barray[],
int ldb,
int *info,
int batchSize) {
return cublasCgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
}
cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuDoubleComplex *const Aarray[],
int lda,
const int *devIpiv,
cuDoubleComplex *const Barray[],
int ldb,
int *info,
int batchSize) {
return cublasZgetrsBatched(handle, trans, n, nrhs, Aarray, lda, devIpiv, Barray, ldb, info, batchSize);
}
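These wrappers follow the same overload pattern as the existing POTRF/POTRS ones: one overload per cuBLAS precision, so templated callers resolve cublasCgetrfBatched or cublasZgetrfBatched purely by argument type. A minimal sketch of such a caller (helper name and error text are illustrative, not from this PR):

    template <typename cuda_complex>
    void lu_factor_batch(cublasHandle_t handle, int n, cuda_complex* const* Aarray,
                         int* d_pivots, int* d_info, int batch) {
      // Resolves to the C or Z variant depending on cuda_complex; pivots and
      // per-matrix status are device arrays, as cublas<t>getrfBatched requires.
      if (GETRF_BATCHED(handle, n, Aarray, n, d_pivots, d_info, batch) != CUBLAS_STATUS_SUCCESS)
        throw std::runtime_error("batched GETRF failed");
    }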
93 changes: 86 additions & 7 deletions src/cugw_qpt.cu
@@ -23,7 +23,8 @@
namespace green::gpu {
template<typename prec>
gw_qpt<prec>::gw_qpt(int nao, int naux, int nt, int nw_b,
const std::complex<double> *T_tw_fb_host, const std::complex<double> *T_wt_bf_host):
const std::complex<double> *T_tw_fb_host, const std::complex<double> *T_wt_bf_host,
LinearSolverType cuda_lin_solver):
nao_(nao),
nao2_(nao * nao),
nao3_(nao2_ * nao),
@@ -42,7 +43,7 @@ namespace green::gpu {
handle_(nullptr),
solver_handle_(nullptr),
streams_potrs_(nw_b),
potrs_ready_event_(nw_b), one_minus_P_ready_event_(nw_b) {
potrs_ready_event_(nw_b), one_minus_P_ready_event_(nw_b), cuda_lin_solver_(cuda_lin_solver) {
allocate_IR_transformation_matrices(&T_tw_, &T_wt_, T_tw_fb_host, T_wt_bf_host, nt, nw_b);
}

@@ -58,7 +59,14 @@
}
cudaEventCreateWithFlags(&polarization_ready_event_, cudaEventDisableTiming);
cudaEventCreateWithFlags(&bare_polarization_ready_event_, cudaEventDisableTiming);
cudaEventCreateWithFlags(&Cholesky_decomposition_ready_event_, cudaEventDisableTiming);
if(cuda_lin_solver_ == LinearSolverType::Cholesky) {
cudaEventCreateWithFlags(&Cholesky_decomposition_ready_event_, cudaEventDisableTiming);
}
else {
cudaEventCreateWithFlags(&LU_decomposition_ready_event_, cudaEventDisableTiming);
cudaEventCreateWithFlags(&getrs_ready_event_, cudaEventDisableTiming);
}


// We assume nt_ >= nw_b_ and reuse the same memory for P0 and P in imaginary time and frequency domain
// This is a safe assumption in IR and Chebyshev representation
@@ -77,18 +85,30 @@

if (cudaMalloc(&one_minus_P_w_ptrs_, nw_b_ * sizeof(cuda_complex*)) != cudaSuccess)
throw std::runtime_error("failure allocating one_minus_P_w_ptrs_");
if(cuda_lin_solver_ == LinearSolverType::LU) {
if (cudaMalloc(&P0_w_ptrs_, nw_b_ * sizeof(cuda_complex*)) != cudaSuccess)
throw std::runtime_error("failure allocating P0_w_ptrs_");
}
if (cudaMalloc(&d_info_, nw_b_ * sizeof(int)) != cudaSuccess)
throw std::runtime_error("failure allocating info int on device");
if(cuda_lin_solver_ == LinearSolverType::LU) {
if (cudaMalloc(&Pivot_, naux_ * nw_b_ * sizeof(int)) != cudaSuccess)
throw std::runtime_error("cudaMalloc failed to allocate Pivot");
}

if (cusolverDnSetStream(*solver_handle_, stream_) != CUSOLVER_STATUS_SUCCESS)
throw std::runtime_error("cusolver set stream problem");

// locks so that different threads don't write the results over each other
cudaMalloc(&Pqk0_tQP_lock_, sizeof(int));
cudaMemset(Pqk0_tQP_lock_, 0, sizeof(int));
// For batched potrf
// For batched potrf/LU
set_batch_pointer<<<1, 1, 0, stream_>>>(one_minus_P_w_ptrs_, one_minus_P_wPQ_, naux2_, nw_b_);
// for LU
if(cuda_lin_solver_ == LinearSolverType::LU) {
set_batch_pointer<<<1, 1, 0, stream_>>>(P0_w_ptrs_, Pqk0_wQP_, naux2_, nw_b_);
}

}

template <typename prec>
@@ -101,14 +121,23 @@
}
cudaEventDestroy(polarization_ready_event_);
cudaEventDestroy(bare_polarization_ready_event_);
cudaEventDestroy(Cholesky_decomposition_ready_event_);
if(cuda_lin_solver_ == LinearSolverType::LU) {
cudaEventDestroy(LU_decomposition_ready_event_);
}
else {
cudaEventDestroy(Cholesky_decomposition_ready_event_);
}

cudaFree(Pqk0_tQP_);
cudaFree(Pqk0_tQP_lock_);
cudaFree(Pqk0_wQP_);

cudaFree(one_minus_P_w_ptrs_);
cudaFree(d_info_);
if(cuda_lin_solver_ == LinearSolverType::LU) {
cudaFree(P0_w_ptrs_);
cudaFree(Pivot_);
}
cudaFree(T_tw_);
cudaFree(T_wt_);
}
@@ -157,7 +186,7 @@ namespace green::gpu {
}

template <typename prec>
void gw_qpt<prec>::compute_Pq() {
void gw_qpt<prec>::compute_Pq_chol() {
int threads_per_block = 512;
int blocks_for_id = naux_ / threads_per_block + 1;
if (_verbose > 2) std::cout << "Running Cholesky solver for (I - P0)P = P0" << std::endl;
@@ -199,6 +228,56 @@
}
}

template <typename prec>
void gw_qpt<prec>::compute_Pq_lu() {
int threads_per_block = 512;
int blocks_for_id = naux_ / threads_per_block + 1;
if (_verbose > 2) std::cout << "Running pivoted LU solver for (I - P0)P = P0" << std::endl;
for (int w = 0; w < nw_b_; ++w) {
cudaStreamWaitEvent(streams_potrs_[w], bare_polarization_ready_event_, 0);
hermitian_symmetrize<<<blocks_for_id, threads_per_block, 0, streams_potrs_[w]>>>(Pqk0_wQP_ + w * naux2_, naux_);
set_up_one_minus_P<<<blocks_for_id, threads_per_block, 0, streams_potrs_[w]>>>(one_minus_P_wPQ_ + w * naux2_,
Pqk0_wQP_ + w * naux2_, naux_);
cudaEventRecord(one_minus_P_ready_event_[w], streams_potrs_[w]);
}
for (int w = 0; w < nw_b_; ++w) {
cudaStreamWaitEvent(stream_, one_minus_P_ready_event_[w], 0 /*cudaEventWaitDefault*/);
}

if (cublasSetStream(*handle_, stream_) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("cublas set stream problem");

// get LU decomposition
if(GETRF_BATCHED(*handle_, naux_, one_minus_P_w_ptrs_,
naux_, Pivot_, d_info_, nw_b_) != CUBLAS_STATUS_SUCCESS) {
throw std::runtime_error("CUDA GETRF failed!");
}
validate_info<<<1, 1, 0, stream_>>>(d_info_, nw_b_);
cudaEventRecord(LU_decomposition_ready_event_, stream_);

if (cudaStreamWaitEvent(stream_, LU_decomposition_ready_event_, 0 /*cudaEventWaitDefault*/))
throw std::runtime_error("Could not wait for LU decomposition");

if (cublasSetStream(*handle_, stream_) != CUBLAS_STATUS_SUCCESS)
throw std::runtime_error("cublas set stream problem");

int host_info; // getrs_batched API requires a host-side int

// Apply LU to solve linear system
if(GETRS_BATCHED(*handle_, CUBLAS_OP_N, naux_, naux_, one_minus_P_w_ptrs_,
naux_, Pivot_, P0_w_ptrs_, naux_, &host_info, nw_b_) != CUBLAS_STATUS_SUCCESS) {
throw std::runtime_error("CUDA GETRS failed!");
}

if(host_info != 0)
throw std::runtime_error("cublas GETRS info = " + std::to_string(host_info));

cudaEventRecord(getrs_ready_event_, stream_);
if (cudaStreamWaitEvent(stream_, getrs_ready_event_, 0 /*cudaEventWaitDefault*/))
throw std::runtime_error("Could not wait for GETRS");

}

template <typename prec>
void gw_qpt<prec>::wait_for_kpts() {
if (cudaStreamSynchronize(stream_) != cudaSuccess) throw std::runtime_error("could not wait for other streams");
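For each bosonic frequency w, compute_Pq_lu solves the dense auxiliary-basis system (I - P0(w)) P(w) = P0(w), i.e. P = (I - P0)^{-1} P0: GETRF_BATCHED factors all nw_b_ matrices of size naux_ in one call, and GETRS_BATCHED with nrhs = naux_ then overwrites the P0 buffers with P through P0_w_ptrs_. A standalone sketch of the same batched-LU pattern in double precision (illustrative, not the PR's code):

    #include <cublas_v2.h>
    #include <cuda_runtime.h>
    #include <stdexcept>
    #include <vector>

    // Factor a batch of n-by-n matrices once, then solve A_i X_i = B_i for all i.
    void batched_lu_solve(cuDoubleComplex* d_A,  // batch * n * n, overwritten by LU factors
                          cuDoubleComplex* d_B,  // batch * n * nrhs, overwritten by solutions
                          int n, int nrhs, int batch) {
      cublasHandle_t handle;
      if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) throw std::runtime_error("cublasCreate");

      // Both batched routines take device arrays of per-matrix device pointers.
      std::vector<cuDoubleComplex*> hA(batch), hB(batch);
      for (int i = 0; i < batch; ++i) { hA[i] = d_A + i * n * n; hB[i] = d_B + i * n * nrhs; }
      cuDoubleComplex **d_Aptrs, **d_Bptrs;
      cudaMalloc(&d_Aptrs, batch * sizeof(cuDoubleComplex*));
      cudaMalloc(&d_Bptrs, batch * sizeof(cuDoubleComplex*));
      cudaMemcpy(d_Aptrs, hA.data(), batch * sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice);
      cudaMemcpy(d_Bptrs, hB.data(), batch * sizeof(cuDoubleComplex*), cudaMemcpyHostToDevice);

      int *d_pivots, *d_info;
      cudaMalloc(&d_pivots, batch * n * sizeof(int));
      cudaMalloc(&d_info, batch * sizeof(int));

      // Pivoted LU of every matrix; per-matrix status lands in d_info on the device
      // (the PR checks it with a validate_info kernel instead of copying it back).
      if (cublasZgetrfBatched(handle, n, d_Aptrs, n, d_pivots, d_info, batch) != CUBLAS_STATUS_SUCCESS)
        throw std::runtime_error("getrfBatched");

      // Back-substitute against all right-hand sides; this call reports through a host int.
      int info = 0;
      if (cublasZgetrsBatched(handle, CUBLAS_OP_N, n, nrhs, d_Aptrs, n, d_pivots,
                              d_Bptrs, n, &info, batch) != CUBLAS_STATUS_SUCCESS || info != 0)
        throw std::runtime_error("getrsBatched");

      cudaFree(d_pivots); cudaFree(d_info); cudaFree(d_Aptrs); cudaFree(d_Bptrs);
      cublasDestroy(handle);
    }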
4 changes: 4 additions & 0 deletions src/green/gpu/common_defs.h
@@ -133,5 +133,9 @@ namespace green::gpu {
out[i] = static_cast<std::complex<float>>(in[i]);
}
}

enum class LinearSolverType {
Cholesky, LU
};
} // namespace green::gpu
#endif // GREEN_GPU_COMMON_DEFS_H
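The enum keeps the solver choice explicit in every signature it passes through. If the choice is exposed as a run-time option, a small mapping helper is one way to wire it up; this helper is hypothetical, not part of the PR (assumes <string> and <stdexcept>):

    // Hypothetical helper, not in this PR: map an option string to the enum.
    inline LinearSolverType parse_lin_solver(const std::string& name) {
      if (name == "LU") return LinearSolverType::LU;
      if (name == "Cholesky") return LinearSolverType::Cholesky;
      throw std::invalid_argument("unknown CUDA linear solver: " + name);
    }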
12 changes: 6 additions & 6 deletions src/green/gpu/cu_routines.h
@@ -56,8 +56,8 @@ namespace green::gpu {
using cuda_complex = typename cu_type_map<std::complex<double>>::cuda_type;

public:
cuhf_utils(size_t nk, size_t ink, size_t ns, size_t nao, size_t NQ, size_t nkbatch, ztensor<4> dm_fbz, int _myid,
int _intranode_rank, int _devCount_per_node);
cuhf_utils(size_t nk, size_t ink, size_t ns, size_t nao, size_t NQ, size_t nkbatch, ztensor<4> dm_fbz, int _myid,
int _intranode_rank, int _devCount_per_node);

~cuhf_utils();

@@ -134,11 +134,11 @@
using cuda_complex = typename cu_type_map<std::complex<prec>>::cuda_type;

public:
cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
ztensor_view<5>& G_tskij_host, bool _low_device_memory, const MatrixXcd& Ttn_FB, const MatrixXcd& Tnt_BF,
int _myid, int _intranode_rank, int _devCount_per_node);
cugw_utils(int _nts, int _nt_batch, int _nw_b, int _ns, int _nk, int _ink, int _nqkpt, int _NQ, int _nao,
ztensor_view<5>& G_tskij_host, bool _low_device_memory, const MatrixXcd& Ttn_FB, const MatrixXcd& Tnt_BF,
LinearSolverType cuda_lin_solver, int _myid, int _intranode_rank, int _devCount_per_node);

~ cugw_utils();
~cugw_utils();

void solve(int _nts, int _ns, int _nk, int _ink, int _nao, const std::vector<size_t>& reduced_to_full,
const std::vector<size_t>& full_to_reduced, std::complex<double>* Vk1k2_Qij, ztensor<5>& Sigma_tskij_host,
38 changes: 38 additions & 0 deletions src/green/gpu/cublas_routines_prec.h
@@ -133,3 +133,41 @@ cusolverStatus_t POTRS(cusolverDnHandle_t handle,
cuComplex *B,
int ldb,
int *devInfo);

cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize);
cublasStatus_t GETRF_BATCHED(cublasHandle_t handle,
int n,
cuDoubleComplex *const Aarray[],
int lda,
int *PivotArray,
int *infoArray,
int batchSize);

cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuComplex *const Aarray[],
int lda,
const int *devIpiv,
cuComplex *const Barray[],
int ldb,
int *info,
int batchSize);
cublasStatus_t GETRS_BATCHED(cublasHandle_t handle,
cublasOperation_t trans,
int n,
int nrhs,
const cuDoubleComplex *const Aarray[],
int lda,
const int *devIpiv,
cuDoubleComplex *const Barray[],
int ldb,
int *info,
int batchSize);
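One asymmetry shapes these declarations: in cuBLAS, getrfBatched reports per-matrix singularity through the device infoArray, while getrsBatched reports argument errors through a single host int (hence the host_info variable in compute_Pq_lu above). A minimal sketch of checking both, with illustrative variable names:

    // d_info: device array from GETRF_BATCHED, one status per matrix (0 == success).
    std::vector<int> h_info(batch);
    cudaMemcpy(h_info.data(), d_info, batch * sizeof(int), cudaMemcpyDeviceToHost);
    for (int i = 0; i < batch; ++i)
      if (h_info[i] != 0) throw std::runtime_error("LU failed for matrix " + std::to_string(i));
    // host_info: filled directly by GETRS_BATCHED, no device copy needed.
    if (host_info != 0) throw std::runtime_error("GETRS argument error " + std::to_string(host_info));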