From 74c1664aa5a866f0cbcdf3ed217c678bbf10f93e Mon Sep 17 00:00:00 2001
From: liiutao <74701833+A-006@users.noreply.github.com>
Date: Sat, 22 Jun 2024 21:17:47 +0800
Subject: [PATCH] delete sparse file and Optimize global memory access (#4467)

* delete sparse1

* Optimize global variable memory access

* [pre-commit.ci lite] apply automatic fixes

* Revert "delete sparse1"

This reverts commit dadce23cbf39562ac23dcd43df7a86f93bf64a07.

* fix bug in compute

* [pre-commit.ci lite] apply automatic fixes

---------

Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
---
 .../module_gint/gint_force_gpu.cu             |  44 +----
 .../module_hamilt_lcao/module_gint/gint_k.h   | 153 +++++++++---------
 .../module_gint/kernels/cuda/gint_force.cu    |  62 +++----
 .../module_gint/kernels/cuda/gint_force.cuh   |  25 +--
 .../module_gint/kernels/cuda/interp.cuh       |  64 ++++----
 5 files changed, 147 insertions(+), 201 deletions(-)
diff --git a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
index f07462172c..3ed5a5a80c 100644
--- a/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
+++ b/source/module_hamilt_lcao/module_gint/gint_force_gpu.cu
@@ -65,15 +65,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 
     Cuda_Mem_Wrapper<double> psi(max_phi_per_z, num_streams, false);
     Cuda_Mem_Wrapper<double> psi_dm(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi_dx(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi_dy(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> dpsi_dz(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dxx(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dxy(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dxz(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dyy(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dyz(max_phi_per_z, num_streams, false);
-    Cuda_Mem_Wrapper<double> d2psi_dzz(max_phi_per_z, num_streams, false);
+    Cuda_Mem_Wrapper<double> dpsi(3 * max_phi_per_z, num_streams, false);
+    Cuda_Mem_Wrapper<double> d2psi(6 * max_phi_per_z, num_streams, false);
 
     Cuda_Mem_Wrapper<double> gemm_alpha(max_atompair_per_z, num_streams, true);
     Cuda_Mem_Wrapper<int> gemm_m(max_atompair_per_z, num_streams, true);
@@ -193,15 +186,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
 
             psi.memset_device_async(streams[sid], sid, 0);
             psi_dm.memset_device_async(streams[sid], sid, 0);
-            dpsi_dx.memset_device_async(streams[sid], sid, 0);
-            dpsi_dy.memset_device_async(streams[sid], sid, 0);
-            dpsi_dz.memset_device_async(streams[sid], sid, 0);
-            d2psi_dxx.memset_device_async(streams[sid], sid, 0);
-            d2psi_dxy.memset_device_async(streams[sid], sid, 0);
-            d2psi_dxz.memset_device_async(streams[sid], sid, 0);
-            d2psi_dyy.memset_device_async(streams[sid], sid, 0);
-            d2psi_dyz.memset_device_async(streams[sid], sid, 0);
-            d2psi_dzz.memset_device_async(streams[sid], sid, 0);
+            dpsi.memset_device_async(streams[sid], sid, 0);
+            d2psi.memset_device_async(streams[sid], sid, 0);
 
             dim3 grid_psi(nbzp, 32);
             dim3 block_psi(64);
@@ -225,15 +211,8 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
                 gridt.nr_max,
                 gridt.psi_u_g,
                 psi.get_device_pointer(sid),
-                dpsi_dx.get_device_pointer(sid),
-                dpsi_dy.get_device_pointer(sid),
-                dpsi_dz.get_device_pointer(sid),
-                d2psi_dxx.get_device_pointer(sid),
-                d2psi_dxy.get_device_pointer(sid),
-                d2psi_dxz.get_device_pointer(sid),
-                d2psi_dyy.get_device_pointer(sid),
-                d2psi_dyz.get_device_pointer(sid),
-                d2psi_dzz.get_device_pointer(sid));
+                dpsi.get_device_pointer(sid),
+                d2psi.get_device_pointer(sid));
             checkCudaLastError();
 
             gridt.fastest_matrix_mul(max_m,
@@ -259,9 +238,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
                                 block_force,
                                 block_size * 3 * sizeof(double),
                                 streams[sid]>>>(
-                                    dpsi_dx.get_device_pointer(sid),
-                                    dpsi_dy.get_device_pointer(sid),
-                                    dpsi_dz.get_device_pointer(sid),
+                                    dpsi.get_device_pointer(sid),
                                     psi_dm.get_device_pointer(sid),
                                     force.get_device_pointer(sid),
                                     iat_per_z.get_device_pointer(sid),
@@ -276,12 +253,7 @@ void gint_fvl_gamma_gpu(hamilt::HContainer<double>* dm,
                                  block_stress,
                                  0,
                                  streams[sid]>>>(
-                                d2psi_dxx.get_device_pointer(sid),
-                                d2psi_dxy.get_device_pointer(sid),
-                                d2psi_dxz.get_device_pointer(sid),
-                                d2psi_dyy.get_device_pointer(sid),
-                                d2psi_dyz.get_device_pointer(sid),
-                                d2psi_dzz.get_device_pointer(sid),
+                                d2psi.get_device_pointer(sid),
                                 psi_dm.get_device_pointer(sid),
                                 stress.get_device_pointer(sid),
                                 max_phi_per_z);
diff --git a/source/module_hamilt_lcao/module_gint/gint_k.h b/source/module_hamilt_lcao/module_gint/gint_k.h
index 8f6fddc1d7..c3d7f08194 100644
--- a/source/module_hamilt_lcao/module_gint/gint_k.h
+++ b/source/module_hamilt_lcao/module_gint/gint_k.h
@@ -1,38 +1,44 @@
-#ifndef GINT_K_H
-#define GINT_K_H
+#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
+#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_HAMILT_LCAO_MODULE_GINT_GINT_K_H
 
-#include "module_basis/module_ao/ORB_atomic_lm.h"
+#include "gint.h"
 #include "grid_technique.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
+#include "module_basis/module_ao/ORB_atomic_lm.h"
 #include "module_elecstate/module_charge/charge.h"
-#include "gint.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
 
 // add by jingan for map<> in 2021-12-2, will be deleted in the future
 #include "module_base/abfs-vector3_order.h"
 
 class Gint_k : public Gint
 {
-    public:
+  public:
     ~Gint_k()
     {
         destroy_pvpR();
     }
     //------------------------------------------------------
-    // in gint_k_pvpr.cpp 
+    // in gint_k_pvpr.cpp
     //------------------------------------------------------
     // pvpR and reset_spin/get_spin : auxilliary methods
     // for calculating hamiltonian
 
     // reset the spin.
-    void reset_spin(const int &spin_now_in){this->spin_now = spin_now_in;};
+    void reset_spin(const int& spin_now_in)
+    {
+        this->spin_now = spin_now_in;
+    };
     // get the spin.
-    int get_spin(void)const{return spin_now;}
+    int get_spin() const
+    {
+        return spin_now;
+    }
 
-    //renew gint index for new iteration
+    // renew gint index for new iteration
     void renew(const bool& soft = false)
     {
-        if(soft && this->spin_now == 0) 
-        {//in this case, gint will not be recalculated
+        if (soft && this->spin_now == 0)
+        { // in this case, gint will not be recalculated
             return;
         }
         else if (this->spin_now != -1)
@@ -44,34 +50,37 @@ class Gint_k : public Gint
         }
         return;
     }
- 
+
     // allocate the <phi_0 | V | phi_R> matrix element.
-    void allocate_pvpR(void);
+    void allocate_pvpR();
     // destroy the temporary <phi_0 | V | phi_R> matrix element.
-    void destroy_pvpR(void);
+    void destroy_pvpR();
 
     // allocate the <phi_0 | V | dphi_R> matrix element.
-    void allocate_pvdpR(void);
+    void allocate_pvdpR();
     // destroy the temporary <phi_0 | V | dphi_R> matrix element.
-    void destroy_pvdpR(void);
+    void destroy_pvdpR();
 
-    // folding the < phi_0 | V | phi_R> matrix to 
+    // folding the < phi_0 | V | phi_R> matrix to
     // <phi_0i | V | phi_0j>
     // V is (Vl + Vh + Vxc) if no Vna is used,
     // and is (Vna + delta_Vh + Vxc) if Vna is used.
-    void folding_vl_k(const int &ik, LCAO_Matrix* LM, Parallel_Orbitals *pv,
-                    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                    const UnitCell& ucell,Grid_Driver& gd);
+    void folding_vl_k(const int& ik,
+                      LCAO_Matrix* LM,
+                      Parallel_Orbitals* pv,
+                      const std::vector<ModuleBase::Vector3<double>>& kvec_d,
+                      const UnitCell& ucell,
+                      Grid_Driver& gd);
 
     /**
      * @brief transfer pvpR to this->hRGint
      * then pass this->hRGint to Veff<OperatorLCAO>::hR
-    */
-    void transfer_pvpR(hamilt::HContainer<double> *hR,const UnitCell* ucell_in,Grid_Driver* gd);
-    void transfer_pvpR(hamilt::HContainer<std::complex<double>> *hR,const UnitCell* ucell_in,Grid_Driver* gd);
+     */
+    void transfer_pvpR(hamilt::HContainer<double>* hR, const UnitCell* ucell_in, Grid_Driver* gd);
+    void transfer_pvpR(hamilt::HContainer<std::complex<double>>* hR, const UnitCell* ucell_in, Grid_Driver* gd);
 
     //------------------------------------------------------
-    // in gint_k_env.cpp 
+    // in gint_k_env.cpp
     //------------------------------------------------------
     // calculate the envelop function via grid integrals
     void cal_env_k(int ik,
@@ -79,76 +88,68 @@ class Gint_k : public Gint
                    double* rho,
                    const std::vector<ModuleBase::Vector3<double>>& kvec_c,
                    const std::vector<ModuleBase::Vector3<double>>& kvec_d,
-                   UnitCell &ucell);
+                   UnitCell& ucell);
 
     //------------------------------------------------------
-    // in gint_k_sparse.cpp 
-    //------------------------------------------------------    
+    // in gint_k_sparse.cpp
+    //------------------------------------------------------
     // related to sparse matrix
     // jingan add 2021-6-4, modify 2021-12-2
     void distribute_pvpR_sparseMatrix(
-        const int current_spin, 
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t, std::map<size_t, double>>> &pvpR_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv);
+        const int current_spin,
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvpR_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
 
     void distribute_pvpR_soc_sparseMatrix(
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t,
-        std::map<size_t, std::complex<double>>>> &pvpR_soc_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv
-        );
-
-    void cal_vlocal_R_sparseMatrix(
-        const int &current_spin,
-        const double &sparse_threshold,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv,
-        UnitCell &ucell,
-        Grid_Driver &gdriver);
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
+            pvpR_soc_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
+
+    void cal_vlocal_R_sparseMatrix(const int& current_spin,
+                                   const double& sparse_threshold,
+                                   LCAO_Matrix* LM,
+                                   Parallel_Orbitals* pv,
+                                   UnitCell& ucell,
+                                   Grid_Driver& gdriver);
 
     //------------------------------------------------------
-    // in gint_k_sparse1.cpp 
-    //------------------------------------------------------  
+    // in gint_k_sparse1.cpp
+    //------------------------------------------------------
     // similar to the above 3, just for the derivative
     void distribute_pvdpR_sparseMatrix(
-        const int current_spin, 
+        const int current_spin,
         const int dim,
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t, std::map<size_t, double>>> &pvdpR_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv);
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, double>>>& pvdpR_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
 
     void distribute_pvdpR_soc_sparseMatrix(
         const int dim,
-        const double &sparse_threshold, 
-        const std::map<Abfs::Vector3_Order<int>,
-        std::map<size_t, std::map<size_t, std::complex<double>>>> &pvdpR_soc_sparseMatrix,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv);
-
-    void cal_dvlocal_R_sparseMatrix(
-        const int &current_spin,
-        const double &sparse_threshold,
-        LCAO_Matrix *LM,
-        Parallel_Orbitals *pv,
-        UnitCell &ucell,
-        Grid_Driver &gdriver);
-
-    private:
-
+        const double& sparse_threshold,
+        const std::map<Abfs::Vector3_Order<int>, std::map<size_t, std::map<size_t, std::complex<double>>>>&
+            pvdpR_soc_sparseMatrix,
+        LCAO_Matrix* LM,
+        Parallel_Orbitals* pv);
+
+    void cal_dvlocal_R_sparseMatrix(const int& current_spin,
+                                    const double& sparse_threshold,
+                                    LCAO_Matrix* LM,
+                                    Parallel_Orbitals* pv,
+                                    UnitCell& ucell,
+                                    Grid_Driver& gdriver);
+
+  private:
+    //----------------------------
+    // key variable
     //----------------------------
-    // key variable 
-    //----------------------------  
 
     // used only in vlocal.
     int spin_now = -1;
-    
 };
 
 #endif
diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu
index 1442964d55..526da78247 100644
--- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu
+++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cu
@@ -1,9 +1,7 @@
-#include "gint_force.cuh"
+#include "sph.cuh"
 #include "interp.cuh"
-#include "module_hamilt_lcao/module_gint/gint_force_gpu.h"
-#include "cuda_tools.cuh"
 #include "gint_force.cuh"
-#include "sph.cuh"
+#include "cuda_tools.cuh"
 #include "cuda_runtime.h"
 // CUDA kernel to calculate psi and force
 namespace GintKernel
@@ -11,7 +9,7 @@ namespace GintKernel
 __global__ void get_psi_force(double* ylmcoef,
                               double delta_r_g,
                               int bxyz_g,
-                              double nwmax_g,
+                              const int nwmax_g,
                               double* __restrict__ psi_input_double,
                               int* __restrict__ psi_input_int,
                               int* __restrict__ atom_num_per_bcell,
@@ -24,15 +22,8 @@ __global__ void get_psi_force(double* ylmcoef,
                               int nr_max,
                               const double* __restrict__ psi_u,
                               double* psi,
-                              double* dpsi_dx,
-                              double* dpsi_dy,
-                              double* dpsi_dz,
-                              double* d2psi_dxx,
-                              double* d2psi_dxy,
-                              double* d2psi_dxz,
-                              double* d2psi_dyy,
-                              double* d2psi_dyz,
-                              double* d2psi_dzz)
+                              double* dpsi,
+                              double* d2psi)
 {
     const int size = atom_num_per_bcell[blockIdx.x];
     const int bcell_start = start_idx_per_bcell[blockIdx.x];
@@ -57,7 +48,6 @@ __global__ void get_psi_force(double* ylmcoef,
 
         const int nwl = ucell_atom_nwl[it];
         spherical_harmonics_d(dr, distance*distance, grly, nwl, ylma, ylmcoef);
-
         interpolate_f(distance,
                       delta_r_g,
                       it,
@@ -72,26 +62,14 @@ __global__ void get_psi_force(double* ylmcoef,
                       dist_tmp,
                       ylma,
                       vlbr3_value,
-                      dpsi_dx,
+                      dpsi,
                       dr,
                       grly,
-                      dpsi_dy,
-                      dpsi_dz,
-                      d2psi_dxx,
-                      d2psi_dxy,
-                      d2psi_dxz,
-                      d2psi_dyy,
-                      d2psi_dyz,
-                      d2psi_dzz);
+                      d2psi);
     }
 }
 
-__global__ void dot_product_stress(double* d2psi_dxx,
-                                   double* d2psi_dxy,
-                                   double* d2psi_dxz,
-                                   double* d2psi_dyy,
-                                   double* d2psi_dyz,
-                                   double* d2psi_dzz,
+__global__ void dot_product_stress(double* d2psi,
                                    double* psir_ylm_dm,
                                    double* stress_dot,
                                    int elements_num)
@@ -103,12 +81,13 @@ __global__ void dot_product_stress(double* d2psi_dxx,
     while (tid < elements_num)
     {   
         double psi_dm_2 = psir_ylm_dm[tid] * 2;
-        tmp[0] += d2psi_dxx[tid] * psi_dm_2;
-        tmp[1] += d2psi_dxy[tid] * psi_dm_2;
-        tmp[2] += d2psi_dxz[tid] * psi_dm_2;
-        tmp[3] += d2psi_dyy[tid] * psi_dm_2;
-        tmp[4] += d2psi_dyz[tid] * psi_dm_2;
-        tmp[5] += d2psi_dzz[tid] * psi_dm_2;
+        const int tid_stress = tid * 6;
+        tmp[0] += d2psi[tid_stress] * psi_dm_2;
+        tmp[1] += d2psi[tid_stress + 1] * psi_dm_2;
+        tmp[2] += d2psi[tid_stress + 2] * psi_dm_2;
+        tmp[3] += d2psi[tid_stress + 3] * psi_dm_2;
+        tmp[4] += d2psi[tid_stress + 4] * psi_dm_2;
+        tmp[5] += d2psi[tid_stress + 5] * psi_dm_2;
         tid += blockDim.x * gridDim.x;
     }
 
@@ -141,9 +120,7 @@ __global__ void dot_product_stress(double* d2psi_dxx,
     }
 }
 
-__global__ void dot_product_force(double* __restrict__ dpsi_dx,
-                                  double* __restrict__ dpsi_dy,
-                                  double* __restrict__ dpsi_dz,
+__global__ void dot_product_force(double* __restrict__ dpsi,
                                   double* __restrict__ psir_ylm_dm,
                                   double* force_dot,
                                   int* iat,
@@ -167,10 +144,11 @@ __global__ void dot_product_force(double* __restrict__ dpsi_dx,
     {
         int ls_offset = tid * 3;
         int psi_offset = offset + i;
+        int psi_offset_force = psi_offset * 3;
         double psi_dm_2 = psir_ylm_dm[psi_offset] * 2;
-        localsum[ls_offset] += dpsi_dx[psi_offset] * psi_dm_2;
-        localsum[ls_offset + 1] += dpsi_dy[psi_offset] * psi_dm_2;
-        localsum[ls_offset + 2] += dpsi_dz[psi_offset] * psi_dm_2;
+        localsum[ls_offset] += dpsi[psi_offset_force] * psi_dm_2;
+        localsum[ls_offset + 1] += dpsi[psi_offset_force + 1] * psi_dm_2;
+        localsum[ls_offset + 2] += dpsi[psi_offset_force + 2] * psi_dm_2;
     }
     __syncthreads();
     
diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh
index afe9dcb851..3cea097d2b 100644
--- a/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh
+++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/gint_force.cuh
@@ -8,7 +8,7 @@ namespace GintKernel
 __global__ void get_psi_force(double* ylmcoef,
                               double delta_r_g,
                               int bxyz_g,
-                              double nwmax_g,
+                              const int nwmax_g,
                               double* __restrict__ psi_input_double,
                               int* __restrict__ psi_input_int,
                               int* __restrict__ atom_num_per_bcell,
@@ -21,29 +21,16 @@ __global__ void get_psi_force(double* ylmcoef,
                               int nr_max,
                               const double* __restrict__ psi_u,
                               double* psi,
-                              double* dpsi_dx,
-                              double* dpsi_dy,
-                              double* dpsi_dz,
-                              double* d2psi_dxx,
-                              double* d2psi_dxy,
-                              double* d2psi_dxz,
-                              double* d2psi_dyy,
-                              double* d2psi_dyz,
-                              double* d2psi_dzz);
+                              double* dpsi,
+                              double* d2psi);
 
-__global__ void dot_product_stress(double* d2psi_dxx,
-                                   double* d2psi_dxy,
-                                   double* d2psi_dxz,
-                                   double* d2psi_dyy,
-                                   double* d2psi_dyz,
-                                   double* d2psi_dzz,
+
+__global__ void dot_product_stress(double* d2psi,
                                    double* psir_ylm_dm,
                                    double* stress_dot,
                                    int elements_num);
 
-__global__ void dot_product_force(double* __restrict__ dpsi_dx,
-                                  double* __restrict__ dpsi_dy,
-                                  double* __restrict__ dpsi_dz,
+__global__ void dot_product_force(double* __restrict__ dpsi,
                                   double* __restrict__ psir_ylm_dm,
                                   double* force_dot,
                                   int* iat,
diff --git a/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh b/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh
index 4522f7f7c9..ade96c6fa9 100644
--- a/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh
+++ b/source/module_hamilt_lcao/module_gint/kernels/cuda/interp.cuh
@@ -52,28 +52,21 @@ static __device__ void interpolate(const double dist,
 static __device__ void interpolate_f(const double distance,
                                      const double delta_r_g,
                                      const int it,
-                                     const double nwmax_g,
+                                     const int nwmax_g,
                                      const int nr_max,
                                      const int* __restrict__ atom_nw,
                                      const bool* __restrict__ atom_iw2_new,
                                      const double* __restrict__ psi_u,
                                      const int* __restrict__ atom_iw2_l,
                                      const int* __restrict__ atom_iw2_ylm,
-                                     double* psir_r,
+                                     double* psi,
                                      int dist_tmp,
                                      const double ylma[49],
                                      const double vlbr3_value,
-                                     double* psir_lx,
+                                     double* dpsi,
                                      const double * __restrict__ dr,
                                      const double grly[49][3],
-                                     double* psir_ly,
-                                     double* psir_lz,
-                                     double* psir_lxx,
-                                     double* psir_lxy,
-                                     double* psir_lxz,
-                                     double* psir_lyy,
-                                     double* psir_lyz,
-                                     double* psir_lzz)
+                                     double* d2psi)
 {
     // Calculate normalized position for interpolation
     const double postion = distance / delta_r_g;
@@ -92,6 +85,8 @@ static __device__ void interpolate_f(const double distance,
     const int it_nw = it * nwmax_g;
     int iw_nr = (it_nw * nr_max + ip) * 2;
     int it_nw_iw = it_nw;
+    double dpsir[150][4]={0.0};
+    int dist_tmp_clac=dist_tmp;
     for (int iw = 0; iw < atom_nw[it]; ++iw)
     {
         if (atom_iw2_new[it_nw_iw])
@@ -110,35 +105,48 @@ static __device__ void interpolate_f(const double distance,
         const double rl = pow(distance, ll);
         const double rl_r = 1.0 / rl;
         const double dist_r = 1 / distance;
-
+        const int dist_tmp_force = dist_tmp_clac * 3;
+        const int dist_tmp_stress = dist_tmp_clac * 6;
         // Compute right-hand side of the equation
-        psir_r[dist_tmp] = tmp * ylma[idx_lm] * rl_r * vlbr3_value;
+        dpsir[iw][3] = tmp * ylma[idx_lm] * rl_r * vlbr3_value;
         // Compute derivatives with respect to spatial
         // coordinates
         const double tmpdphi_rly
             = (dtmp - tmp * ll * dist_r) * rl_r * ylma[idx_lm] * dist_r;
         const double tmprl = tmp * rl_r;
-        psir_lx[dist_tmp]
-            = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0];
-
-        psir_ly[dist_tmp]
-            = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1];
-        psir_lz[dist_tmp]
-            = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2];
-
-        psir_lxx[dist_tmp] = psir_lx[dist_tmp] * dr[0];
-        psir_lxy[dist_tmp] = psir_lx[dist_tmp] * dr[1];
-        psir_lxz[dist_tmp] = psir_lx[dist_tmp] * dr[2];
-        psir_lyy[dist_tmp] = psir_ly[dist_tmp] * dr[1];
-        psir_lyz[dist_tmp] = psir_ly[dist_tmp] * dr[2];
-        psir_lzz[dist_tmp] = psir_lz[dist_tmp] * dr[2];
+        const double dpsirx = tmpdphi_rly * dr[0] + tmprl * grly[idx_lm][0];
+        const double dpsiry = tmpdphi_rly * dr[1] + tmprl * grly[idx_lm][1];
+        const double dpsirz = tmpdphi_rly * dr[2] + tmprl * grly[idx_lm][2];
+        dpsir[iw][0] = dpsirx;
+        dpsir[iw][1] = dpsiry;
+        dpsir[iw][2] = dpsirz;
 
         // Update loop counters and indices
-        dist_tmp += 1;
+        dist_tmp_clac += 1;
         iw_nr += nr_max;
         iw_nr += nr_max;
         it_nw_iw++;
     }
+
+    #pragma unroll
+    int dist_tmp_trans = dist_tmp;
+    for (int iw=0;iw<atom_nw[it];++iw)
+    {
+        const int dist_tmp_force = dist_tmp_trans * 3;
+        const int dist_tmp_stress = dist_tmp_trans * 6;
+        psi[dist_tmp_trans] = dpsir[iw][3];
+        dpsi[dist_tmp_force] = dpsir[iw][0];
+        dpsi[dist_tmp_force + 1] = dpsir[iw][1];
+        dpsi[dist_tmp_force + 2] = dpsir[iw][2];
+
+        d2psi[dist_tmp_stress] = dpsir[iw][0] * dr[0];
+        d2psi[dist_tmp_stress + 1] = dpsir[iw][0] * dr[1];
+        d2psi[dist_tmp_stress + 2] = dpsir[iw][0] * dr[2];
+        d2psi[dist_tmp_stress + 3] = dpsir[iw][1] * dr[1];
+        d2psi[dist_tmp_stress + 4] = dpsir[iw][1] * dr[2];
+        d2psi[dist_tmp_stress + 5] = dpsir[iw][2] * dr[2];
+        dist_tmp_trans += 1;
+    }
 }
 } // namespace GintKernel