Delete sparse file and optimize global memory access (deepmodeling#4467)

* delete sparse1
* Optimize global variable memory access
* [pre-commit.ci lite] apply automatic fixes
* Revert "delete sparse1"

  This reverts commit dadce23.
* fix bug in compute
* [pre-commit.ci lite] apply automatic fixes

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
mohanchen · Jun 22, 2024 · 74c1664 · 74c1664
1 parent 045843d
commit 74c1664
Show file tree

Hide file tree

Showing 5 changed files with 147 additions and 201 deletions.
diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
@@ -65,15 +65,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 
     Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
     Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi_dx(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi_dy(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi_dz(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dxx(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dxy(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dxz(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dyy(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dyz(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dzz(max_phi_per_z, num_streams, false);
+    Cuda_Mem_Wrapper<double> dpsi(3 * max_phi_per_z, num_streams, false);
+    Cuda_Mem_Wrapper<double> d2psi(6 * max_phi_per_z, num_streams, false);
 
     Cuda_Mem_Wrapper<double> gemm_alpha(max_atompair_per_z, num_streams, true);
     Cuda_Mem_Wrapper<int> gemm_m(max_atompair_per_z, num_streams, true);
@@ -193,15 +186,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 
             psi.memset_device_async(streams[sid], sid, 0);
             psi_dm.memset_device_async(streams[sid], sid, 0);
-            dpsi_dx.memset_device_async(streams[sid], sid, 0);
-            dpsi_dy.memset_device_async(streams[sid], sid, 0);
-            dpsi_dz.memset_device_async(streams[sid], sid, 0);
-            d2psi_dxx.memset_device_async(streams[sid], sid, 0);
-            d2psi_dxy.memset_device_async(streams[sid], sid, 0);
-            d2psi_dxz.memset_device_async(streams[sid], sid, 0);
-            d2psi_dyy.memset_device_async(streams[sid], sid, 0);
-            d2psi_dyz.memset_device_async(streams[sid], sid, 0);
-            d2psi_dzz.memset_device_async(streams[sid], sid, 0);
+            dpsi.memset_device_async(streams[sid], sid, 0);
+            d2psi.memset_device_async(streams[sid], sid, 0);
 
             dim3 grid_psi(nbzp, 32);
             dim3 block_psi(64);
@@ -225,15 +211,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
                 gridt.nr_max,
                 gridt.psi_u_g,
                 psi.get_device_pointer(sid),
-                dpsi_dx.get_device_pointer(sid),
-                dpsi_dy.get_device_pointer(sid),
-                dpsi_dz.get_device_pointer(sid),
-                d2psi_dxx.get_device_pointer(sid),
-                d2psi_dxy.get_device_pointer(sid),
-                d2psi_dxz.get_device_pointer(sid),
-                d2psi_dyy.get_device_pointer(sid),
-                d2psi_dyz.get_device_pointer(sid),
-                d2psi_dzz.get_device_pointer(sid));
+                dpsi.get_device_pointer(sid),
+                d2psi.get_device_pointer(sid));
             checkCudaLastError();
 
             gridt.fastest_matrix_mul(max_m,
@@ -259,9 +238,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
                                 block_force,
                                 block_size * 3 * sizeof(double),
                                 streams[sid]>>>(
-                                    dpsi_dx.get_device_pointer(sid),
-                                    dpsi_dy.get_device_pointer(sid),
-                                    dpsi_dz.get_device_pointer(sid),
+                                    dpsi.get_device_pointer(sid),
                                     psi_dm.get_device_pointer(sid),
                                     force.get_device_pointer(sid),
                                     iat_per_z.get_device_pointer(sid),
@@ -276,12 +253,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
                                  block_stress,
                                  0,
                                  streams[sid]>>>(
-                                d2psi_dxx.get_device_pointer(sid),
-                                d2psi_dxy.get_device_pointer(sid),
-                                d2psi_dxz.get_device_pointer(sid),
-                                d2psi_dyy.get_device_pointer(sid),
-                                d2psi_dyz.get_device_pointer(sid),
-                                d2psi_dzz.get_device_pointer(sid),
+                                d2psi.get_device_pointer(sid),
                                 psi_dm.get_device_pointer(sid),
                                 stress.get_device_pointer(sid),
                                 max_phi_per_z);

diff --git a/source/module_hamilt_lcao/module_gint/gint_k.h b/source/module_hamilt_lcao/module_gint/gint_k.h
@@ -1,38 +1,44 @@
-#ifndef GINT_K_H
-#define GINT_K_H
+#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
+#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
 
-#include "module_basis/module_ao/ORB_atomic_lm.h"
+#include "gint.h"
 #include "grid_technique.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
+#include "module_basis/module_ao/ORB_atomic_lm.h"
 #include "module_elecstate/module_charge/charge.h"
-#include "gint.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
 
 // add by jingan for map<> in 2021-12-2, will be deleted in the future
 #include "module_base/abfs-vector3_order.h"
 
 class Gint_k : public Gint
 {
-    public:
+  public:
     ~Gint_k()
     {
         destroy_pvpR();
     }
     //------------------------------------------------------
-    // in gint_k_pvpr.cpp 
+    // in gint_k_pvpr.cpp
     //------------------------------------------------------
     // pvpR and reset_spin/get_spin : auxiliary methods
     // for calculating hamiltonian
 
     // reset the spin.
-    void reset_spin(const int &spin_now_in){this->spin_now = spin_now_in;};
+    void reset_spin(const int& spin_now_in)
+    {
+        this->spin_now = spin_now_in;
+    };
     // get the spin.
-    int get_spin(void)const{return spin_now;}
+    int get_spin() const
+    {
+        return spin_now;
+    }
 
-    //renew gint index for new iteration
+    // renew gint index for new iteration
     void renew(const bool& soft = false)
     {
-        if(soft && this->spin_now == 0) 
-        {//in this case, gint will not be recalculated
+        if (soft && this->spin_now == 0)
+        { // in this case, gint will not be recalculated
             return;
         }
         else if (this->spin_now != -1)
@@ -44,111 +50,106 @@ class Gint_k : public Gint
         }
         return;
     }
- 
+
     // allocate the <phi_0 | V | phi_R> matrix element.
-    void allocate_pvpR(void);
+    void allocate_pvpR();
     // destroy the temporary <phi_0 | V | phi_R> matrix element.
-    void destroy_pvpR(void);
+    void destroy_pvpR();
 
     // allocate the <phi_0 | V | dphi_R> matrix element.
-    void allocate_pvdpR(void);
+    void allocate_pvdpR();
     // destroy the temporary <phi_0 | V | dphi_R> matrix element.
-    void destroy_pvdpR(void);
+    void destroy_pvdpR();
 
-    // folding the < phi_0 | V | phi_R> matrix to 
+    // folding the < phi_0 | V | phi_R> matrix to
     // <phi_0i | V | phi_0j>
     // V is (Vl + Vh + Vxc) if no Vna is used,
     // and is (Vna + delta_Vh + Vxc) if Vna is used.
-    void folding_vl_k(const int &ik, LCAO_Matrix* LM, Parallel_Orbitals *pv,
-                    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                    const UnitCell& ucell,Grid_Driver& gd);
+    void folding_vl_k(const int& ik,
+                      LCAO_Matrix* LM,
+                      Parallel_Orbitals* pv,
+                      const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                      const UnitCell& ucell,
+                      Grid_Driver& gd);
 
     /**
      * @brief transfer pvpR to this->hRGint
      * then pass this->hRGint to Veff<OperatorLCAO>::hR
-    */
-    void transfer_pvpR(hamilt::HContainer<double> *hR,const UnitCell* ucell_in,Grid_Driver* gd);
-    void transfer_pvpR(hamilt::HContainer<std::complex<double>> *hR,const UnitCell* ucell_in,Grid_Driver* gd);
+     */
+    void transfer_pvpR(hamilt::HContainer<double>* hR, const UnitCell* ucell_in, Grid_Driver* gd);
+    void transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR, const UnitCell* ucell_in, Grid_Driver* gd);
 
     //------------------------------------------------------
-    // in gint_k_env.cpp 
+    // in gint_k_env.cpp
     //------------------------------------------------------
     // calculate the envelope function via grid integrals
     void cal_env_k(int ik,
                    const std::complex<double>* psi_k,
                    double* rho,
                    const std::vector<ModuleBase::Vector3<double>>& kvec_c,
                    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                   UnitCell &ucell);
+                   UnitCell& ucell);
 
     //------------------------------------------------------
-    // in gint_k_sparse.cpp 
-    //------------------------------------------------------    
+    // in gint_k_sparse.cpp
+    //------------------------------------------------------
     // related to sparse matrix
     // jingan add 2021-6-4, modify 2021-12-2
     void distribute_pvpR_sparseMatrix(
-        const int current_spin, 
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t, std::map<size_t, double>>> &pvpR_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv);
+        const int current_spin,
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvpR_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
 
     void distribute_pvpR_soc_sparseMatrix(
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t,
-        std::map<size_t, std::complex<double>>>> &pvpR_soc_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv
-        );
-
-    void cal_vlocal_R_sparseMatrix(
-        const int &current_spin,
-        const double &sparse_threshold,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv,
-        UnitCell &ucell,
-        Grid_Driver &gdriver);
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
+            pvpR_soc_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
+
+    void cal_vlocal_R_sparseMatrix(const int& current_spin,
+                                   const double& sparse_threshold,
+                                   LCAO_Matrix* LM,
+                                   Parallel_Orbitals* pv,
+                                   UnitCell& ucell,
+                                   Grid_Driver& gdriver);
 
     //------------------------------------------------------
-    // in gint_k_sparse1.cpp 
-    //------------------------------------------------------  
+    // in gint_k_sparse1.cpp
+    //------------------------------------------------------
     // similar to the above 3, just for the derivative
     void distribute_pvdpR_sparseMatrix(
-        const int current_spin, 
+        const int current_spin,
         const int dim,
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t, std::map<size_t, double>>> &pvdpR_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv);
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvdpR_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
 
     void distribute_pvdpR_soc_sparseMatrix(
         const int dim,
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t, std::map<size_t, std::complex<double>>>> &pvdpR_soc_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv);
-
-    void cal_dvlocal_R_sparseMatrix(
-        const int &current_spin,
-        const double &sparse_threshold,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv,
-        UnitCell &ucell,
-        Grid_Driver &gdriver);
-
-    private:
-
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
+            pvdpR_soc_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
+
+    void cal_dvlocal_R_sparseMatrix(const int& current_spin,
+                                    const double& sparse_threshold,
+                                    LCAO_Matrix* LM,
+                                    Parallel_Orbitals* pv,
+                                    UnitCell& ucell,
+                                    Grid_Driver& gdriver);
+
+  private:
+    //----------------------------
+    // key variable
     //----------------------------
-    // key variable 
-    //----------------------------  
 
     // used only in vlocal.
     int spin_now = -1;
-
 };
 
 #endif