Skip to content

Commit

Permalink
Delete sparse file and optimize global memory access (deepmodeling#4467)
Browse files Browse the repository at this point in the history
* delete sparse1

* Optimize global variable memory access

* [pre-commit.ci lite] apply automatic fixes

* Revert "delete sparse1"

This reverts commit dadce23.

* fix bug in compute

* [pre-commit.ci lite] apply automatic fixes

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
  • Loading branch information
A-006 and pre-commit-ci-lite[bot] authored Jun 22, 2024
1 parent 045843d commit 74c1664
Show file tree
Hide file tree
Showing 5 changed files with 147 additions and 201 deletions.
44 changes: 8 additions & 36 deletions source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,

Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> dpsi_dx(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> dpsi_dy(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> dpsi_dz(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi_dxx(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi_dxy(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi_dxz(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi_dyy(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi_dyz(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi_dzz(max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> dpsi(3 * max_phi_per_z, num_streams, false);
Cuda_Mem_Wrapper<double> d2psi(6 * max_phi_per_z, num_streams, false);

Cuda_Mem_Wrapper<double> gemm_alpha(max_atompair_per_z, num_streams, true);
Cuda_Mem_Wrapper<int> gemm_m(max_atompair_per_z, num_streams, true);
Expand Down Expand Up @@ -193,15 +186,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,

psi.memset_device_async(streams[sid], sid, 0);
psi_dm.memset_device_async(streams[sid], sid, 0);
dpsi_dx.memset_device_async(streams[sid], sid, 0);
dpsi_dy.memset_device_async(streams[sid], sid, 0);
dpsi_dz.memset_device_async(streams[sid], sid, 0);
d2psi_dxx.memset_device_async(streams[sid], sid, 0);
d2psi_dxy.memset_device_async(streams[sid], sid, 0);
d2psi_dxz.memset_device_async(streams[sid], sid, 0);
d2psi_dyy.memset_device_async(streams[sid], sid, 0);
d2psi_dyz.memset_device_async(streams[sid], sid, 0);
d2psi_dzz.memset_device_async(streams[sid], sid, 0);
dpsi.memset_device_async(streams[sid], sid, 0);
d2psi.memset_device_async(streams[sid], sid, 0);

dim3 grid_psi(nbzp, 32);
dim3 block_psi(64);
Expand All @@ -225,15 +211,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
gridt.nr_max,
gridt.psi_u_g,
psi.get_device_pointer(sid),
dpsi_dx.get_device_pointer(sid),
dpsi_dy.get_device_pointer(sid),
dpsi_dz.get_device_pointer(sid),
d2psi_dxx.get_device_pointer(sid),
d2psi_dxy.get_device_pointer(sid),
d2psi_dxz.get_device_pointer(sid),
d2psi_dyy.get_device_pointer(sid),
d2psi_dyz.get_device_pointer(sid),
d2psi_dzz.get_device_pointer(sid));
dpsi.get_device_pointer(sid),
d2psi.get_device_pointer(sid));
checkCudaLastError();

gridt.fastest_matrix_mul(max_m,
Expand All @@ -259,9 +238,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
block_force,
block_size * 3 * sizeof(double),
streams[sid]>>>(
dpsi_dx.get_device_pointer(sid),
dpsi_dy.get_device_pointer(sid),
dpsi_dz.get_device_pointer(sid),
dpsi.get_device_pointer(sid),
psi_dm.get_device_pointer(sid),
force.get_device_pointer(sid),
iat_per_z.get_device_pointer(sid),
Expand All @@ -276,12 +253,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
block_stress,
0,
streams[sid]>>>(
d2psi_dxx.get_device_pointer(sid),
d2psi_dxy.get_device_pointer(sid),
d2psi_dxz.get_device_pointer(sid),
d2psi_dyy.get_device_pointer(sid),
d2psi_dyz.get_device_pointer(sid),
d2psi_dzz.get_device_pointer(sid),
d2psi.get_device_pointer(sid),
psi_dm.get_device_pointer(sid),
stress.get_device_pointer(sid),
max_phi_per_z);
Expand Down
153 changes: 77 additions & 76 deletions source/module_hamilt_lcao/module_gint/gint_k.h
Original file line number Diff line number Diff line change
@@ -1,38 +1,44 @@
#ifndef GINT_K_H
#define GINT_K_H
#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H

#include "module_basis/module_ao/ORB_atomic_lm.h"
#include "gint.h"
#include "grid_technique.h"
#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
#include "module_basis/module_ao/ORB_atomic_lm.h"
#include "module_elecstate/module_charge/charge.h"
#include "gint.h"
#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"

// added by jingan for map<> on 2021-12-2; will be deleted in the future
#include "module_base/abfs-vector3_order.h"

class Gint_k : public Gint
{
public:
public:
~Gint_k()
{
destroy_pvpR();
}
//------------------------------------------------------
// in gint_k_pvpr.cpp
// in gint_k_pvpr.cpp
//------------------------------------------------------
// pvpR and reset_spin/get_spin : auxiliary methods
// for calculating hamiltonian

// reset the spin.
void reset_spin(const int &spin_now_in){this->spin_now = spin_now_in;};
void reset_spin(const int& spin_now_in)
{
this->spin_now = spin_now_in;
};
// get the spin.
int get_spin(void)const{return spin_now;}
int get_spin() const
{
return spin_now;
}

//renew gint index for new iteration
// renew gint index for new iteration
void renew(const bool& soft = false)
{
if(soft && this->spin_now == 0)
{//in this case, gint will not be recalculated
if (soft && this->spin_now == 0)
{ // in this case, gint will not be recalculated
return;
}
else if (this->spin_now != -1)
Expand All @@ -44,111 +50,106 @@ class Gint_k : public Gint
}
return;
}

// allocate the <phi_0 | V | phi_R> matrix element.
void allocate_pvpR(void);
void allocate_pvpR();
// destroy the temporary <phi_0 | V | phi_R> matrix element.
void destroy_pvpR(void);
void destroy_pvpR();

// allocate the <phi_0 | V | dphi_R> matrix element.
void allocate_pvdpR(void);
void allocate_pvdpR();
// destroy the temporary <phi_0 | V | dphi_R> matrix element.
void destroy_pvdpR(void);
void destroy_pvdpR();

// folding the < phi_0 | V | phi_R> matrix to
// folding the < phi_0 | V | phi_R> matrix to
// <phi_0i | V | phi_0j>
// V is (Vl + Vh + Vxc) if no Vna is used,
// and is (Vna + delta_Vh + Vxc) if Vna is used.
void folding_vl_k(const int &ik, LCAO_Matrix* LM, Parallel_Orbitals *pv,
const std::vector<ModuleBase::Vector3<double>>& kvec_d,
const UnitCell& ucell,Grid_Driver& gd);
void folding_vl_k(const int& ik,
LCAO_Matrix* LM,
Parallel_Orbitals* pv,
const std::vector<ModuleBase::Vector3<double>>& kvec_d,
const UnitCell& ucell,
Grid_Driver& gd);

/**
* @brief transfer pvpR to this->hRGint
* then pass this->hRGint to Veff<OperatorLCAO>::hR
*/
void transfer_pvpR(hamilt::HContainer<double> *hR,const UnitCell* ucell_in,Grid_Driver* gd);
void transfer_pvpR(hamilt::HContainer<std::complex<double>> *hR,const UnitCell* ucell_in,Grid_Driver* gd);
*/
void transfer_pvpR(hamilt::HContainer<double>* hR, const UnitCell* ucell_in, Grid_Driver* gd);
void transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR, const UnitCell* ucell_in, Grid_Driver* gd);

//------------------------------------------------------
// in gint_k_env.cpp
// in gint_k_env.cpp
//------------------------------------------------------
// calculate the envelope function via grid integrals
void cal_env_k(int ik,
const std::complex<double>* psi_k,
double* rho,
const std::vector<ModuleBase::Vector3<double>>& kvec_c,
const std::vector<ModuleBase::Vector3<double>>& kvec_d,
UnitCell &ucell);
UnitCell& ucell);

//------------------------------------------------------
// in gint_k_sparse.cpp
//------------------------------------------------------
// in gint_k_sparse.cpp
//------------------------------------------------------
// related to sparse matrix
// jingan add 2021-6-4, modify 2021-12-2
void distribute_pvpR_sparseMatrix(
const int current_spin,
const double &sparse_threshold,
const std::map<Abfs::Vector3_Order<int>,
std::map<size_t, std::map<size_t, double>>> &pvpR_sparseMatrix,
LCAO_Matrix *LM,
Parallel_Orbitals *pv);
const int current_spin,
const double& sparse_threshold,
const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvpR_sparseMatrix,
LCAO_Matrix* LM,
Parallel_Orbitals* pv);

void distribute_pvpR_soc_sparseMatrix(
const double &sparse_threshold,
const std::map<Abfs::Vector3_Order<int>,
std::map<size_t,
std::map<size_t, std::complex<double>>>> &pvpR_soc_sparseMatrix,
LCAO_Matrix *LM,
Parallel_Orbitals *pv
);

void cal_vlocal_R_sparseMatrix(
const int &current_spin,
const double &sparse_threshold,
LCAO_Matrix *LM,
Parallel_Orbitals *pv,
UnitCell &ucell,
Grid_Driver &gdriver);
const double& sparse_threshold,
const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
pvpR_soc_sparseMatrix,
LCAO_Matrix* LM,
Parallel_Orbitals* pv);

void cal_vlocal_R_sparseMatrix(const int& current_spin,
const double& sparse_threshold,
LCAO_Matrix* LM,
Parallel_Orbitals* pv,
UnitCell& ucell,
Grid_Driver& gdriver);

//------------------------------------------------------
// in gint_k_sparse1.cpp
//------------------------------------------------------
// in gint_k_sparse1.cpp
//------------------------------------------------------
// similar to the above 3, just for the derivative
void distribute_pvdpR_sparseMatrix(
const int current_spin,
const int current_spin,
const int dim,
const double &sparse_threshold,
const std::map<Abfs::Vector3_Order<int>,
std::map<size_t, std::map<size_t, double>>> &pvdpR_sparseMatrix,
LCAO_Matrix *LM,
Parallel_Orbitals *pv);
const double& sparse_threshold,
const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvdpR_sparseMatrix,
LCAO_Matrix* LM,
Parallel_Orbitals* pv);

void distribute_pvdpR_soc_sparseMatrix(
const int dim,
const double &sparse_threshold,
const std::map<Abfs::Vector3_Order<int>,
std::map<size_t, std::map<size_t, std::complex<double>>>> &pvdpR_soc_sparseMatrix,
LCAO_Matrix *LM,
Parallel_Orbitals *pv);

void cal_dvlocal_R_sparseMatrix(
const int &current_spin,
const double &sparse_threshold,
LCAO_Matrix *LM,
Parallel_Orbitals *pv,
UnitCell &ucell,
Grid_Driver &gdriver);

private:

const double& sparse_threshold,
const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
pvdpR_soc_sparseMatrix,
LCAO_Matrix* LM,
Parallel_Orbitals* pv);

void cal_dvlocal_R_sparseMatrix(const int& current_spin,
const double& sparse_threshold,
LCAO_Matrix* LM,
Parallel_Orbitals* pv,
UnitCell& ucell,
Grid_Driver& gdriver);

private:
//----------------------------
// key variable
//----------------------------
// key variable
//----------------------------

// used only in vlocal.
int spin_now = -1;

};

#endif
Loading

0 comments on commit 74c1664

Please sign in to comment.