From 74c1664aa5a866f0cbcdf3ed217c678bbf10f93e Mon Sep 17 00:00:00 2001 From: liiutao <74701833+A-006@users.noreply.github.com> Date: Sat, 22 Jun 2024 21:17:47 +0800 Subject: [PATCH] delete sparse file and Optimize global memory access (#4467) * delete sparse1 * Optimize global variable memory access * [pre-commit.ci lite] apply automatic fixes * Revert "delete sparse1" This reverts commit dadce23cbf39562ac23dcd43df7a86f93bf64a07. * fix bug in compute * [pre-commit.ci lite] apply automatic fixes --------- Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com> --- .../module_gint/gint_force_gpu.cu | 44 +---- .../module_hamilt_lcao/module_gint/gint_k.h | 153 +++++++++--------- .../module_gint/kernels/cuda/gint_force.cu | 62 +++---- .../module_gint/kernels/cuda/gint_force.cuh | 25 +-- .../module_gint/kernels/cuda/interp.cuh | 64 ++++---- 5 files changed, 147 insertions(+), 201 deletions(-) diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu index f07462172c..3ed5a5a80c 100644 --- a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu +++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu @@ -65,15 +65,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, Cuda_Mem_Wrapper psi(max_phi_per_z, num_streams, false); Cuda_Mem_Wrapper psi_dm(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper dpsi_dx(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper dpsi_dy(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper dpsi_dz(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi_dxx(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi_dxy(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi_dxz(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi_dyy(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi_dyz(max_phi_per_z, num_streams, false); - Cuda_Mem_Wrapper d2psi_dzz(max_phi_per_z, num_streams, false); + Cuda_Mem_Wrapper dpsi(3 * max_phi_per_z, num_streams, false); + Cuda_Mem_Wrapper d2psi(6 * max_phi_per_z, num_streams, false); Cuda_Mem_Wrapper gemm_alpha(max_atompair_per_z, num_streams, true); Cuda_Mem_Wrapper gemm_m(max_atompair_per_z, num_streams, true); @@ -193,15 +186,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, psi.memset_device_async(streams[sid], sid, 0); psi_dm.memset_device_async(streams[sid], sid, 0); - dpsi_dx.memset_device_async(streams[sid], sid, 0); - dpsi_dy.memset_device_async(streams[sid], sid, 0); - dpsi_dz.memset_device_async(streams[sid], sid, 0); - d2psi_dxx.memset_device_async(streams[sid], sid, 0); - d2psi_dxy.memset_device_async(streams[sid], sid, 0); - d2psi_dxz.memset_device_async(streams[sid], sid, 0); - d2psi_dyy.memset_device_async(streams[sid], sid, 0); - d2psi_dyz.memset_device_async(streams[sid], sid, 0); - d2psi_dzz.memset_device_async(streams[sid], sid, 0); + dpsi.memset_device_async(streams[sid], sid, 0); + d2psi.memset_device_async(streams[sid], sid, 0); dim3 grid_psi(nbzp, 32); dim3 block_psi(64); @@ -225,15 +211,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, gridt.nr_max, gridt.psi_u_g, psi.get_device_pointer(sid), - dpsi_dx.get_device_pointer(sid), - dpsi_dy.get_device_pointer(sid), - dpsi_dz.get_device_pointer(sid), - d2psi_dxx.get_device_pointer(sid), - d2psi_dxy.get_device_pointer(sid), - d2psi_dxz.get_device_pointer(sid), - d2psi_dyy.get_device_pointer(sid), - d2psi_dyz.get_device_pointer(sid), - d2psi_dzz.get_device_pointer(sid)); + dpsi.get_device_pointer(sid), + d2psi.get_device_pointer(sid)); checkCudaLastError(); gridt.fastest_matrix_mul(max_m, @@ -259,9 +238,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, block_force, block_size * 3 * sizeof(double), streams[sid]>>>( - dpsi_dx.get_device_pointer(sid), - dpsi_dy.get_device_pointer(sid), - dpsi_dz.get_device_pointer(sid), + dpsi.get_device_pointer(sid), psi_dm.get_device_pointer(sid), force.get_device_pointer(sid), iat_per_z.get_device_pointer(sid), @@ -276,12 +253,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer* dm, block_stress, 0, streams[sid]>>>( - d2psi_dxx.get_device_pointer(sid), - d2psi_dxy.get_device_pointer(sid), - d2psi_dxz.get_device_pointer(sid), - d2psi_dyy.get_device_pointer(sid), - d2psi_dyz.get_device_pointer(sid), - d2psi_dzz.get_device_pointer(sid), + d2psi.get_device_pointer(sid), psi_dm.get_device_pointer(sid), stress.get_device_pointer(sid), max_phi_per_z); diff --git a/source/module_hamilt_lcao/module_gint/gint_k.h b/source/module_hamilt_lcao/module_gint/gint_k.h index 8f6fddc1d7..c3d7f08194 100644 --- a/source/module_hamilt_lcao/module_gint/gint_k.h +++ b/source/module_hamilt_lcao/module_gint/gint_k.h @@ -1,38 +1,44 @@ -#ifndef GINT_K_H -#define GINT_K_H +#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H +#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H -#include "module_basis/module_ao/ORB_atomic_lm.h" +#include "gint.h" #include "grid_technique.h" -#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h" +#include "module_basis/module_ao/ORB_atomic_lm.h" #include "module_elecstate/module_charge/charge.h" -#include "gint.h" +#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h" // add by jingan for map<> in 2021-12-2, will be deleted in the future #include "module_base/abfs-vector3_order.h" class Gint_k : public Gint { - public: + public: ~Gint_k() { destroy_pvpR(); } //------------------------------------------------------ - // in gint_k_pvpr.cpp + // in gint_k_pvpr.cpp //------------------------------------------------------ // pvpR and reset_spin/get_spin : auxilliary methods // for calculating hamiltonian // reset the spin. - void reset_spin(const int &spin_now_in){this->spin_now = spin_now_in;}; + void reset_spin(const int& spin_now_in) + { + this->spin_now = spin_now_in; + }; // get the spin. - int get_spin(void)const{return spin_now;} + int get_spin() const + { + return spin_now; + } - //renew gint index for new iteration + // renew gint index for new iteration void renew(const bool& soft = false) { - if(soft && this->spin_now == 0) - {//in this case, gint will not be recalculated + if (soft && this->spin_now == 0) + { // in this case, gint will not be recalculated return; } else if (this->spin_now != -1) @@ -44,34 +50,37 @@ class Gint_k : public Gint } return; } - + // allocate the matrix element. - void allocate_pvpR(void); + void allocate_pvpR(); // destroy the temporary matrix element. - void destroy_pvpR(void); + void destroy_pvpR(); // allocate the matrix element. - void allocate_pvdpR(void); + void allocate_pvdpR(); // destroy the temporary matrix element. - void destroy_pvdpR(void); + void destroy_pvdpR(); - // folding the < phi_0 | V | phi_R> matrix to + // folding the < phi_0 | V | phi_R> matrix to // // V is (Vl + Vh + Vxc) if no Vna is used, // and is (Vna + delta_Vh + Vxc) if Vna is used. - void folding_vl_k(const int &ik, LCAO_Matrix* LM, Parallel_Orbitals *pv, - const std::vector>& kvec_d, - const UnitCell& ucell,Grid_Driver& gd); + void folding_vl_k(const int& ik, + LCAO_Matrix* LM, + Parallel_Orbitals* pv, + const std::vector>& kvec_d, + const UnitCell& ucell, + Grid_Driver& gd); /** * @brief transfer pvpR to this->hRGint * then pass this->hRGint to Veff::hR - */ - void transfer_pvpR(hamilt::HContainer *hR,const UnitCell* ucell_in,Grid_Driver* gd); - void transfer_pvpR(hamilt::HContainer> *hR,const UnitCell* ucell_in,Grid_Driver* gd); + */ + void transfer_pvpR(hamilt::HContainer* hR, const UnitCell* ucell_in, Grid_Driver* gd); + void transfer_pvpR(hamilt::HContainer>* hR, const UnitCell* ucell_in, Grid_Driver* gd); //------------------------------------------------------ - // in gint_k_env.cpp + // in gint_k_env.cpp //------------------------------------------------------ // calculate the envelop function via grid integrals void cal_env_k(int ik, @@ -79,76 +88,68 @@ class Gint_k : public Gint double* rho, const std::vector>& kvec_c, const std::vector>& kvec_d, - UnitCell &ucell); + UnitCell& ucell); //------------------------------------------------------ - // in gint_k_sparse.cpp - //------------------------------------------------------ + // in gint_k_sparse.cpp + //------------------------------------------------------ // related to sparse matrix // jingan add 2021-6-4, modify 2021-12-2 void distribute_pvpR_sparseMatrix( - const int current_spin, - const double &sparse_threshold, - const std::map, - std::map>> &pvpR_sparseMatrix, - LCAO_Matrix *LM, - Parallel_Orbitals *pv); + const int current_spin, + const double& sparse_threshold, + const std::map, std::map>>& pvpR_sparseMatrix, + LCAO_Matrix* LM, + Parallel_Orbitals* pv); void distribute_pvpR_soc_sparseMatrix( - const double &sparse_threshold, - const std::map, - std::map>>> &pvpR_soc_sparseMatrix, - LCAO_Matrix *LM, - Parallel_Orbitals *pv - ); - - void cal_vlocal_R_sparseMatrix( - const int ¤t_spin, - const double &sparse_threshold, - LCAO_Matrix *LM, - Parallel_Orbitals *pv, - UnitCell &ucell, - Grid_Driver &gdriver); + const double& sparse_threshold, + const std::map, std::map>>>& + pvpR_soc_sparseMatrix, + LCAO_Matrix* LM, + Parallel_Orbitals* pv); + + void cal_vlocal_R_sparseMatrix(const int& current_spin, + const double& sparse_threshold, + LCAO_Matrix* LM, + Parallel_Orbitals* pv, + UnitCell& ucell, + Grid_Driver& gdriver); //------------------------------------------------------ - // in gint_k_sparse1.cpp - //------------------------------------------------------ + // in gint_k_sparse1.cpp + //------------------------------------------------------ // similar to the above 3, just for the derivative void distribute_pvdpR_sparseMatrix( - const int current_spin, + const int current_spin, const int dim, - const double &sparse_threshold, - const std::map, - std::map>> &pvdpR_sparseMatrix, - LCAO_Matrix *LM, - Parallel_Orbitals *pv); + const double& sparse_threshold, + const std::map, std::map>>& pvdpR_sparseMatrix, + LCAO_Matrix* LM, + Parallel_Orbitals* pv); void distribute_pvdpR_soc_sparseMatrix( const int dim, - const double &sparse_threshold, - const std::map, - std::map>>> &pvdpR_soc_sparseMatrix, - LCAO_Matrix *LM, - Parallel_Orbitals *pv); - - void cal_dvlocal_R_sparseMatrix( - const int ¤t_spin, - const double &sparse_threshold, - LCAO_Matrix *LM, - Parallel_Orbitals *pv, - UnitCell &ucell, - Grid_Driver &gdriver); - - private: - + const double& sparse_threshold, + const std::map, std::map>>>& + pvdpR_soc_sparseMatrix, + LCAO_Matrix* LM, + Parallel_Orbitals* pv); + + void cal_dvlocal_R_sparseMatrix(const int& current_spin, + const double& sparse_threshold, + LCAO_Matrix* LM, + Parallel_Orbitals* pv, + UnitCell& ucell, + Grid_Driver& gdriver); + + private: + //---------------------------- + // key variable //---------------------------- - // key variable - //---------------------------- // used only in vlocal. int spin_now = -1; - }; #endif diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu index 1442964d55..526da78247 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu @@ -1,9 +1,7 @@ -#include "gint_force.cuh" +#include "sph.cuh" #include "interp.cuh" -#include "module_hamilt_lcao/module_gint/gint_force_gpu.h" -#include "cuda_tools.cuh" #include "gint_force.cuh" -#include "sph.cuh" +#include "cuda_tools.cuh" #include "cuda_runtime.h" // CUDA kernel to calculate psi and force namespace GintKernel @@ -11,7 +9,7 @@ namespace GintKernel __global__ void get_psi_force(double* ylmcoef, double delta_r_g, int bxyz_g, - double nwmax_g, + const int nwmax_g, double* __restrict__ psi_input_double, int* __restrict__ psi_input_int, int* __restrict__ atom_num_per_bcell, @@ -24,15 +22,8 @@ __global__ void get_psi_force(double* ylmcoef, int nr_max, const double* __restrict__ psi_u, double* psi, - double* dpsi_dx, - double* dpsi_dy, - double* dpsi_dz, - double* d2psi_dxx, - double* d2psi_dxy, - double* d2psi_dxz, - double* d2psi_dyy, - double* d2psi_dyz, - double* d2psi_dzz) + double* dpsi, + double* d2psi) { const int size = atom_num_per_bcell[blockIdx.x]; const int bcell_start = start_idx_per_bcell[blockIdx.x]; @@ -57,7 +48,6 @@ __global__ void get_psi_force(double* ylmcoef, const int nwl = ucell_atom_nwl[it]; spherical_harmonics_d(dr, distance*distance, grly, nwl, ylma, ylmcoef); - interpolate_f(distance, delta_r_g, it, @@ -72,26 +62,14 @@ __global__ void get_psi_force(double* ylmcoef, dist_tmp, ylma, vlbr3_value, - dpsi_dx, + dpsi, dr, grly, - dpsi_dy, - dpsi_dz, - d2psi_dxx, - d2psi_dxy, - d2psi_dxz, - d2psi_dyy, - d2psi_dyz, - d2psi_dzz); + d2psi); } } -__global__ void dot_product_stress(double* d2psi_dxx, - double* d2psi_dxy, - double* d2psi_dxz, - double* d2psi_dyy, - double* d2psi_dyz, - double* d2psi_dzz, +__global__ void dot_product_stress(double* d2psi, double* psir_ylm_dm, double* stress_dot, int elements_num) @@ -103,12 +81,13 @@ __global__ void dot_product_stress(double* d2psi_dxx, while (tid < elements_num) { double psi_dm_2 = psir_ylm_dm[tid] * 2; - tmp[0] += d2psi_dxx[tid] * psi_dm_2; - tmp[1] += d2psi_dxy[tid] * psi_dm_2; - tmp[2] += d2psi_dxz[tid] * psi_dm_2; - tmp[3] += d2psi_dyy[tid] * psi_dm_2; - tmp[4] += d2psi_dyz[tid] * psi_dm_2; - tmp[5] += d2psi_dzz[tid] * psi_dm_2; + const int tid_stress = tid * 6; + tmp[0] += d2psi[tid_stress] * psi_dm_2; + tmp[1] += d2psi[tid_stress + 1] * psi_dm_2; + tmp[2] += d2psi[tid_stress + 2] * psi_dm_2; + tmp[3] += d2psi[tid_stress + 3] * psi_dm_2; + tmp[4] += d2psi[tid_stress + 4] * psi_dm_2; + tmp[5] += d2psi[tid_stress + 5] * psi_dm_2; tid += blockDim.x * gridDim.x; } @@ -141,9 +120,7 @@ __global__ void dot_product_stress(double* d2psi_dxx, } } -__global__ void dot_product_force(double* __restrict__ dpsi_dx, - double* __restrict__ dpsi_dy, - double* __restrict__ dpsi_dz, +__global__ void dot_product_force(double* __restrict__ dpsi, double* __restrict__ psir_ylm_dm, double* force_dot, int* iat, @@ -167,10 +144,11 @@ __global__ void dot_product_force(double* __restrict__ dpsi_dx, { int ls_offset = tid * 3; int psi_offset = offset + i; + int psi_offset_force = psi_offset * 3; double psi_dm_2 = psir_ylm_dm[psi_offset] * 2; - localsum[ls_offset] += dpsi_dx[psi_offset] * psi_dm_2; - localsum[ls_offset + 1] += dpsi_dy[psi_offset] * psi_dm_2; - localsum[ls_offset + 2] += dpsi_dz[psi_offset] * psi_dm_2; + localsum[ls_offset] += dpsi[psi_offset_force] * psi_dm_2; + localsum[ls_offset + 1] += dpsi[psi_offset_force + 1] * psi_dm_2; + localsum[ls_offset + 2] += dpsi[psi_offset_force + 2] * psi_dm_2; } __syncthreads(); diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh index afe9dcb851..3cea097d2b 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh @@ -8,7 +8,7 @@ namespace GintKernel __global__ void get_psi_force(double* ylmcoef, double delta_r_g, int bxyz_g, - double nwmax_g, + const int nwmax_g, double* __restrict__ psi_input_double, int* __restrict__ psi_input_int, int* __restrict__ atom_num_per_bcell, @@ -21,29 +21,16 @@ __global__ void get_psi_force(double* ylmcoef, int nr_max, const double* __restrict__ psi_u, double* psi, - double* dpsi_dx, - double* dpsi_dy, - double* dpsi_dz, - double* d2psi_dxx, - double* d2psi_dxy, - double* d2psi_dxz, - double* d2psi_dyy, - double* d2psi_dyz, - double* d2psi_dzz); + double* dpsi, + double* d2psi); -__global__ void dot_product_stress(double* d2psi_dxx, - double* d2psi_dxy, - double* d2psi_dxz, - double* d2psi_dyy, - double* d2psi_dyz, - double* d2psi_dzz, + +__global__ void dot_product_stress(double* d2psi, double* psir_ylm_dm, double* stress_dot, int elements_num); -__global__ void dot_product_force(double* __restrict__ dpsi_dx, - double* __restrict__ dpsi_dy, - double* __restrict__ dpsi_dz, +__global__ void dot_product_force(double* __restrict__ dpsi, double* __restrict__ psir_ylm_dm, double* force_dot, int* iat, diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh index 4522f7f7c9..ade96c6fa9 100644 --- a/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh +++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh @@ -52,28 +52,21 @@ static __device__ void interpolate(const double dist, static __device__ void interpolate_f(const double distance, const double delta_r_g, const int it, - const double nwmax_g, + const int nwmax_g, const int nr_max, const int* __restrict__ atom_nw, const bool* __restrict__ atom_iw2_new, const double* __restrict__ psi_u, const int* __restrict__ atom_iw2_l, const int* __restrict__ atom_iw2_ylm, - double* psir_r, + double* psi, int dist_tmp, const double ylma[49], const double vlbr3_value, - double* psir_lx, + double* dpsi, const double * __restrict__ dr, const double grly[49][3], - double* psir_ly, - double* psir_lz, - double* psir_lxx, - double* psir_lxy, - double* psir_lxz, - double* psir_lyy, - double* psir_lyz, - double* psir_lzz) + double* d2psi) { // Calculate normalized position for interpolation const double postion = distance / delta_r_g; @@ -92,6 +85,8 @@ static __device__ void interpolate_f(const double distance, const int it_nw = it * nwmax_g; int iw_nr = (it_nw * nr_max + ip) * 2; int it_nw_iw = it_nw; + double dpsir[150][4]={0.0}; + int dist_tmp_clac=dist_tmp; for (int iw = 0; iw < atom_nw[it]; ++iw) { if (atom_iw2_new[it_nw_iw]) @@ -110,35 +105,48 @@ static __device__ void interpolate_f(const double distance, const double rl = pow(distance, ll); const double rl_r = 1.0 / rl; const double dist_r = 1 / distance; - + const int dist_tmp_force = dist_tmp_clac * 3; + const int dist_tmp_stress = dist_tmp_clac * 6; // Compute right-hand side of the equation - psir_r[dist_tmp] = tmp * ylma[idx_lm] * rl_r * vlbr3_value; + dpsir[iw][3] = tmp * ylma[idx_lm] * rl_r * vlbr3_value; // Compute derivatives with respect to spatial // coordinates const double tmpdphi_rly = (dtmp - tmp * ll * dist_r) * rl_r * ylma[idx_lm] * dist_r; const double tmprl = tmp * rl_r; - psir_lx[dist_tmp] - = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0]; - - psir_ly[dist_tmp] - = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1]; - psir_lz[dist_tmp] - = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2]; - - psir_lxx[dist_tmp] = psir_lx[dist_tmp] * dr[0]; - psir_lxy[dist_tmp] = psir_lx[dist_tmp] * dr[1]; - psir_lxz[dist_tmp] = psir_lx[dist_tmp] * dr[2]; - psir_lyy[dist_tmp] = psir_ly[dist_tmp] * dr[1]; - psir_lyz[dist_tmp] = psir_ly[dist_tmp] * dr[2]; - psir_lzz[dist_tmp] = psir_lz[dist_tmp] * dr[2]; + const double dpsirx = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0]; + const double dpsiry = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1]; + const double dpsirz = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2]; + dpsir[iw][0] = dpsirx; + dpsir[iw][1] = dpsiry; + dpsir[iw][2] = dpsirz; // Update loop counters and indices - dist_tmp += 1; + dist_tmp_clac += 1; iw_nr += nr_max; iw_nr += nr_max; it_nw_iw++; } + + #pragma unroll + int dist_tmp_trans = dist_tmp; + for (int iw=0;iw