diff --git a/CMakeLists.txt b/CMakeLists.txt
index b53098a001..9086f0b6c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -711,7 +711,9 @@ target_link_libraries(
   esolver
   vdw
   device
-  container)
+  container
+  dftu
+  deltaspin)
 if(ENABLE_LCAO)
   target_link_libraries(
     ${ABACUS_BIN_NAME}
@@ -719,9 +721,7 @@ if(ENABLE_LCAO)
     tddft
     orb
     gint
-    dftu
     hcontainer
-    deltaspin
     numerical_atomic_orbitals
     lr
     rdmft)
diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md
index ced6c56c54..4b9790fc27 100644
--- a/docs/advanced/input_files/input-main.md
+++ b/docs/advanced/input_files/input-main.md
@@ -1235,6 +1235,12 @@ Note: In new angle mixing, you should set `mixing_beta_mag >> mixing_beta`. The
 - **Description**: To determine the number of old iterations' `drho` used in slope calculations.
 - **Default**: `mixing_ndim`
 
+### sc_os_ndim
+
+- **Type**: int
+- **Description**: The number of previous iterations used to judge whether the SCF is oscillating. If oscillation is detected, a more accurate lambda is calculated with the DeltaSpin method. Only for the PW basis.
+- **Default**: 5
+
 ### chg_extrap
 
 - **Type**: String
diff --git a/python/pyabacus/src/ModuleNAO/CMakeLists.txt b/python/pyabacus/src/ModuleNAO/CMakeLists.txt
index 53600a08f3..c5eb016903 100644
--- a/python/pyabacus/src/ModuleNAO/CMakeLists.txt
+++ b/python/pyabacus/src/ModuleNAO/CMakeLists.txt
@@ -12,7 +12,6 @@ list(APPEND _naos
     ${NAO_PATH}/two_center_bundle.cpp
     ${NAO_PATH}/two_center_integrator.cpp
     ${NAO_PATH}/two_center_table.cpp
-    ${NAO_PATH}/projgen.cpp
     # dependency
     ${ABACUS_SOURCE_DIR}/module_base/kernels/math_op.cpp
     # ${ABACUS_SOURCE_DIR}/module_psi/kernels/psi_memory_op.cpp
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index e8af89216b..1f4d4a8370 100644
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -47,6 +47,7 @@ list(APPEND device_srcs
 
   module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
   module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+  module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp
   module_hamilt_pw/hamilt_pwdft/kernels/wf_op.cpp
   module_hamilt_pw/hamilt_pwdft/kernels/vnl_op.cpp
   module_base/kernels/math_op.cpp
@@ -60,6 +61,7 @@ if(USE_CUDA)
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/ekinetic_op.cu
     module_hamilt_pw/hamilt_pwdft/kernels/cuda/meta_op.cu
     module_hamilt_pw/hamilt_stodft/kernels/cuda/hpsi_norm_op.cu
+    module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu
     module_basis/module_pw/kernels/cuda/pw_op.cu
     module_hsolver/kernels/cuda/dngvd_op.cu
     module_hsolver/kernels/cuda/math_kernel_op.cu
@@ -83,6 +85,7 @@ if(USE_ROCM)
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/veff_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/ekinetic_op.hip.cu
     module_hamilt_pw/hamilt_pwdft/kernels/rocm/meta_op.hip.cu
+    module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu
     module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu
     module_basis/module_pw/kernels/rocm/pw_op.hip.cu
     module_hsolver/kernels/rocm/dngvd_op.hip.cu
diff --git a/source/Makefile.Objects b/source/Makefile.Objects
index 79c24632a7..661db25611 100644
--- a/source/Makefile.Objects
+++ b/source/Makefile.Objects
@@ -635,10 +635,13 @@ OBJS_SRCPW=H_Ewald_pw.o\
     forces_nl.o\
     forces_cc.o\
     forces_scc.o\
+    forces_onsite.o\
+    onsite_proj_pw.o\
     fs_nonlocal_tools.o\
     fs_kin_tools.o\
     force_op.o\
     stress_op.o\
+    onsite_op.o\
     wf_op.o\
     vnl_op.o\
     global.o\
@@ -663,6 +666,7 @@ OBJS_SRCPW=H_Ewald_pw.o\
     stress_func_loc.o\
     stress_func_nl.o\
     stress_func_us.o\
+    stress_func_onsite.o\
     stress_pw.o\
     of_stress_pw.o\
     symmetry_rho.o\
@@ -673,7 +677,9 @@ OBJS_SRCPW=H_Ewald_pw.o\
     elecond.o\
     sto_tool.o\
     sto_elecond.o\
-    sto_dos.o
+    sto_dos.o\
+    onsite_projector.o\
+    onsite_proj_tools.o
 
 OBJS_VDW=vdw.o\
     vdwd2_parameters.o\
@@ -691,7 +697,8 @@ OBJS_DFTU=dftu.o\
       dftu_io.o\
       dftu_tools.o\
       dftu_occup.o\
-      dftu_hamilt.o
+      dftu_hamilt.o\
+      dftu_pw.o
 
 OBJS_DELTASPIN=basic_funcs.o\
       cal_mw_from_lambda.o\
diff --git a/source/module_base/CMakeLists.txt b/source/module_base/CMakeLists.txt
index 5335be34ac..38c466a2c1 100644
--- a/source/module_base/CMakeLists.txt
+++ b/source/module_base/CMakeLists.txt
@@ -54,6 +54,7 @@ add_library(
     spherical_bessel_transformer.cpp
     cubic_spline.cpp
     parallel_2d.cpp
+    projgen.cpp
     module_mixing/mixing_data.cpp
     module_mixing/mixing.cpp
     module_mixing/plain_mixing.cpp
diff --git a/source/module_basis/module_nao/projgen.cpp b/source/module_base/projgen.cpp
similarity index 100%
rename from source/module_basis/module_nao/projgen.cpp
rename to source/module_base/projgen.cpp
diff --git a/source/module_basis/module_nao/projgen.h b/source/module_base/projgen.h
similarity index 100%
rename from source/module_basis/module_nao/projgen.h
rename to source/module_base/projgen.h
diff --git a/source/module_basis/module_nao/CMakeLists.txt b/source/module_basis/module_nao/CMakeLists.txt
index 8e54af0778..29e091510f 100644
--- a/source/module_basis/module_nao/CMakeLists.txt
+++ b/source/module_basis/module_nao/CMakeLists.txt
@@ -14,7 +14,6 @@ if(ENABLE_LCAO)
     two_center_table.cpp
     two_center_integrator.cpp
     two_center_bundle.cpp
-    projgen.cpp
   )
 
   if(ENABLE_COVERAGE)
diff --git a/source/module_basis/module_nao/atomic_radials.cpp b/source/module_basis/module_nao/atomic_radials.cpp
index 7d095e70cd..e2461a3970 100644
--- a/source/module_basis/module_nao/atomic_radials.cpp
+++ b/source/module_basis/module_nao/atomic_radials.cpp
@@ -7,7 +7,7 @@
 // FIXME: should update with pyabacus
 // #include "module_io/orb_io.h"
 
-#include "projgen.h"
+#include "module_base/projgen.h"
 
 #include <fstream>
 #include <iostream>
diff --git a/source/module_basis/module_nao/test/CMakeLists.txt b/source/module_basis/module_nao/test/CMakeLists.txt
index 0759f33435..0e4f063be6 100644
--- a/source/module_basis/module_nao/test/CMakeLists.txt
+++ b/source/module_basis/module_nao/test/CMakeLists.txt
@@ -14,7 +14,6 @@ AddTest(
     ../atomic_radials.cpp
     ../radial_set.cpp
     ../numerical_radial.cpp
-    ../projgen.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
     ../../../module_io/orb_io.cpp
@@ -84,7 +83,6 @@ AddTest(
     ../pswfc_radials.cpp
     ../radial_set.cpp
     ../numerical_radial.cpp
-    ../projgen.cpp
     ../sphbes_radials.cpp
     ../../module_ao/ORB_atomic_lm.cpp
     ../../module_ao/ORB_atomic.cpp
@@ -104,7 +102,6 @@ AddTest(
     ../pswfc_radials.cpp
     ../sphbes_radials.cpp
     ../radial_set.cpp
-    ../projgen.cpp
     ../numerical_radial.cpp
     ../two_center_bundle.cpp
     ../two_center_integrator.cpp
@@ -131,7 +128,6 @@ AddTest(
     ../real_gaunt_table.cpp
     ../radial_collection.cpp
     ../atomic_radials.cpp
-    ../projgen.cpp
     ../beta_radials.cpp
     ../hydrogen_radials.cpp
     ../pswfc_radials.cpp
@@ -158,7 +154,6 @@ AddTest(
     ../pswfc_radials.cpp
     ../sphbes_radials.cpp
     ../radial_set.cpp
-    ../projgen.cpp
     ../numerical_radial.cpp
     ../../../module_io/orb_io.cpp
   LIBS parameter ${math_libs} device base container orb 
@@ -179,7 +174,6 @@ AddTest(
     ../pswfc_radials.cpp
     ../sphbes_radials.cpp
     ../radial_set.cpp
-    ../projgen.cpp
     ../numerical_radial.cpp
     ../../../module_io/orb_io.cpp
   LIBS parameter ${math_libs} device base container orb 
diff --git a/source/module_basis/module_nao/test/projgen_test.cpp b/source/module_basis/module_nao/test/projgen_test.cpp
index aaea89f5d0..2feaadfb7a 100644
--- a/source/module_basis/module_nao/test/projgen_test.cpp
+++ b/source/module_basis/module_nao/test/projgen_test.cpp
@@ -1,4 +1,4 @@
-#include "module_basis/module_nao/projgen.h"
+#include "module_base/projgen.h"
 #include "gtest/gtest.h"
 
 #include "module_base/math_integral.h"
diff --git a/source/module_cell/read_atoms.cpp b/source/module_cell/read_atoms.cpp
index a17ab81ae5..81608ce609 100644
--- a/source/module_cell/read_atoms.cpp
+++ b/source/module_cell/read_atoms.cpp
@@ -455,7 +455,7 @@ bool UnitCell::read_atom_positions(std::ifstream &ifpos, std::ofstream &ofs_runn
             }
             else if(PARAM.inp.basis_type == "pw")
             {
-                if ((PARAM.inp.psi_initializer)&&(PARAM.inp.init_wfc.substr(0, 3) == "nao"))
+                if ((PARAM.inp.psi_initializer)&&(PARAM.inp.init_wfc.substr(0, 3) == "nao") || PARAM.inp.onsite_radius > 0.0)
                 {
                     std::string orbital_file = PARAM.inp.orbital_dir + orbital_fn[it];
                     this->read_orb_file(it, orbital_file, ofs_running, &(atoms[it]));
diff --git a/source/module_cell/unitcell.cpp b/source/module_cell/unitcell.cpp
index cc7140eb45..9f2b8bdbca 100755
--- a/source/module_cell/unitcell.cpp
+++ b/source/module_cell/unitcell.cpp
@@ -64,7 +64,6 @@ UnitCell::UnitCell() {
     atom_mass = nullptr;
     pseudo_fn = new std::string[1];
     pseudo_type = new std::string[1];
-    orbital_fn = new std::string[1];
 
     set_atom_flag = false;
 }
@@ -114,6 +113,15 @@ void UnitCell::bcast_unitcell() {
     Parallel_Common::bcast_int(lc[1]);
     Parallel_Common::bcast_int(lc[2]);
 
+    if(this->orbital_fn == nullptr)
+    {
+        this->orbital_fn = new std::string[ntype];
+    }
+    for (int i = 0; i < ntype; i++)
+    {
+        Parallel_Common::bcast_string(orbital_fn[i]);
+    }
+
     // distribute lattice vectors.
     Parallel_Common::bcast_double(a1.x);
     Parallel_Common::bcast_double(a1.y);
diff --git a/source/module_cell/unitcell.h b/source/module_cell/unitcell.h
index 1933a95c2f..af0d79a5c1 100644
--- a/source/module_cell/unitcell.h
+++ b/source/module_cell/unitcell.h
@@ -216,7 +216,7 @@ class UnitCell {
     std::string* pseudo_fn;
     std::string* pseudo_type; // pseudopotential types for each elements,
                               // sunliang added 2022-09-15.
-    std::string* orbital_fn;  // filenames of orbitals, liuyu add 2022-10-19
+    std::string* orbital_fn = nullptr;  // filenames of orbitals, liuyu add 2022-10-19
     std::string
         descriptor_file; // filenames of descriptor_file, liuyu add 2023-04-06
 
diff --git a/source/module_elecstate/elecstate.h b/source/module_elecstate/elecstate.h
index a90555a249..7640d43d0f 100644
--- a/source/module_elecstate/elecstate.h
+++ b/source/module_elecstate/elecstate.h
@@ -151,9 +151,7 @@ class ElecState
         return 0.0;
     }
 
-#ifdef __LCAO
     double get_dftu_energy();
-#endif
 
 #ifdef __DEEPKS
     double get_deepks_E_delta();
diff --git a/source/module_elecstate/elecstate_energy.cpp b/source/module_elecstate/elecstate_energy.cpp
index 86a02d7364..f016da85a5 100644
--- a/source/module_elecstate/elecstate_energy.cpp
+++ b/source/module_elecstate/elecstate_energy.cpp
@@ -287,7 +287,6 @@ void ElecState::cal_energies(const int type)
     }
 
     //! spin constrained energy
-#ifdef __LCAO
     if (PARAM.inp.sc_mag_switch)
     {
         this->f_en.escon = get_spin_constrain_energy();
@@ -298,7 +297,6 @@ void ElecState::cal_energies(const int type)
     {
         this->f_en.edftu = get_dftu_energy();
     }
-#endif
 
 #ifdef __DEEPKS
     // energy from deepks
diff --git a/source/module_elecstate/elecstate_energy_terms.cpp b/source/module_elecstate/elecstate_energy_terms.cpp
index a4d7d98cf3..d820ba064e 100644
--- a/source/module_elecstate/elecstate_energy_terms.cpp
+++ b/source/module_elecstate/elecstate_energy_terms.cpp
@@ -34,12 +34,10 @@ double ElecState::get_solvent_model_Acav()
     return GlobalC::solvent_model.Acav;
 }
 
-#ifdef __LCAO
 double ElecState::get_dftu_energy()
 {
     return GlobalC::dftu.get_energy();
 }
-#endif
 
 #ifdef __DEEPKS
 double ElecState::get_deepks_E_delta()
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 3ffeac7712..63a1201bef 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -193,7 +193,8 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(UnitCell& ucell, const Input_pa
     // 7) initialize DFT+U
     if (PARAM.inp.dft_plus_u)
     {
-        GlobalC::dftu.init(ucell, &this->pv, this->kv.get_nks(), orb_);
+        auto* dftu = ModuleDFTU::DFTU::get_instance();
+        dftu->init(ucell, &this->pv, this->kv.get_nks(), &orb_);
     }
 
     // 8) initialize ppcell
@@ -1140,7 +1141,7 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(UnitCell& ucell, const int istep)
         //! Perform Mulliken charge analysis
         if (PARAM.inp.out_mul)
         {
-            ModuleIO::cal_mag(&(this->pv), this->p_hamilt, this->kv, this->pelec, ucell, istep, true);
+            ModuleIO::cal_mag(&(this->pv), this->p_hamilt, this->kv, this->pelec, this->two_center_bundle_, this->orb_, ucell, istep, true);
         }
     }
 
diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp
index 33be890791..d57c4f5ffc 100644
--- a/source/module_esolver/esolver_ks_pw.cpp
+++ b/source/module_esolver/esolver_ks_pw.cpp
@@ -54,6 +54,10 @@
 #include "module_base/kernels/dsp/dsp_connector.h"
 #endif
 
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+
 namespace ModuleESolver
 {
 
@@ -359,6 +363,46 @@ void ESolver_KS_PW<T, Device>::before_scf(UnitCell& ucell, const int istep)
 
     this->ppcell.cal_effective_D(veff, this->pw_rhod, ucell);
 
+    if(PARAM.inp.onsite_radius > 0) 
+    {
+        auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+        onsite_p->init(PARAM.inp.orbital_dir,
+                       &ucell,
+                       *(this->kspw_psi),
+                       this->kv,
+                       *(this->pw_wfc), 
+                       this->sf,
+                       PARAM.inp.onsite_radius,
+                       PARAM.globalv.nqx,
+                       PARAM.globalv.dq,
+                       this->pelec->wg,
+                       this->pelec->ekb);
+    }
+
+    if (PARAM.inp.sc_mag_switch)
+    {
+        spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+        sc.init_sc(PARAM.inp.sc_thr,
+                   PARAM.inp.nsc,
+                   PARAM.inp.nsc_min,
+                   PARAM.inp.alpha_trial,
+                   PARAM.inp.sccut,
+                   PARAM.inp.sc_drop_thr,
+                   ucell,
+                   nullptr,
+                   PARAM.inp.nspin,
+                   this->kv,
+                   this->p_hamilt,
+                   this->kspw_psi,
+                   this->pelec,
+                   this->pw_wfc);
+    }
+
+    if(PARAM.inp.dft_plus_u)
+    {
+        auto* dftu = ModuleDFTU::DFTU::get_instance();
+        dftu->init(ucell, nullptr, this->kv.get_nks());
+    }
     // after init_rho (in pelec->init_scf), we have rho now.
     // before hamilt2density, we update Hk and initialize psi
 
@@ -400,10 +444,55 @@ void ESolver_KS_PW<T, Device>::iter_init(UnitCell& ucell, const int istep, const
     if (iter == this->p_chgmix->mixing_restart_step && PARAM.inp.mixing_restart > 0.0)
     {
         this->p_chgmix->init_mixing();
+        this->p_chgmix->mixing_restart_count++;
+        if (PARAM.inp.dft_plus_u)
+        {
+            auto* dftu = ModuleDFTU::DFTU::get_instance();
+            if (dftu->uramping > 0.01 && !dftu->u_converged())
+            {
+                this->p_chgmix->mixing_restart_step = PARAM.inp.scf_nmax + 1;
+            }
+            if (dftu->uramping > 0.01)
+            {
+                bool do_uramping = true;
+                if (PARAM.inp.sc_mag_switch)
+                {
+                    spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+                    if(!sc.mag_converged())// skip uramping if mag not converged
+                    {
+                        do_uramping = false;
+                    }
+                }
+                if(do_uramping)
+                {
+                    dftu->uramping_update(); // update U by uramping if uramping > 0.01
+                    std::cout << " U-Ramping! Current U = ";
+                    for (int i = 0; i < dftu->U0.size(); i++)
+                    {
+                        std::cout << dftu->U[i] * ModuleBase::Ry_to_eV << " ";
+                    }
+                    std::cout << " eV " << std::endl;
+                }
+            }
+        }
     }
     // mohan move harris functional to here, 2012-06-05
     // use 'rho(in)' and 'v_h and v_xc'(in)
     this->pelec->f_en.deband_harris = this->pelec->cal_delta_eband();
+
+    // update local occupations for DFT+U
+    // should before lambda loop in DeltaSpin
+    if (PARAM.inp.dft_plus_u && (iter != 1 || istep != 0))
+    {
+        auto* dftu = ModuleDFTU::DFTU::get_instance();
+        // only the old DFT+U method calculates the energy correction in the esolver;
+        // the new DFT+U method calculates the energy while calculating the Hamiltonian
+        if (dftu->omc != 2)
+        {
+            dftu->cal_occ_pw(iter, this->kspw_psi, this->pelec->wg, ucell, PARAM.inp.mixing_beta);
+        }
+        dftu->output(ucell);
+    }
 }
 
 // Temporary, it should be replaced by hsolver later.
@@ -431,27 +520,49 @@ void ESolver_KS_PW<T, Device>::hamilt2density_single(UnitCell& ucell,
     }
     bool skip_charge = PARAM.inp.calculation == "nscf" ? true : false;
 
-    hsolver::HSolverPW<T, Device> hsolver_pw_obj(this->pw_wfc,
-                                                 PARAM.inp.calculation,
-                                                 PARAM.inp.basis_type,
-                                                 PARAM.inp.ks_solver,
-                                                 PARAM.inp.use_paw,
-                                                 PARAM.globalv.use_uspp,
-                                                 PARAM.inp.nspin,
-                                                 hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
-                                                 hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
-                                                 hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
-                                                 hsolver::DiagoIterAssist<T, Device>::need_subspace);
-
-    hsolver_pw_obj.solve(this->p_hamilt,
-                         this->kspw_psi[0],
-                         this->pelec,
-                         this->pelec->ekb.c,
-                         GlobalV::RANK_IN_POOL,
-                         GlobalV::NPROC_IN_POOL,
-                         skip_charge,
-                         ucell.tpiba,
-                         ucell.nat);
+    // run the inner lambda loop to constrain atomic moments with the DeltaSpin method
+    bool skip_solve = false;
+    if (PARAM.inp.sc_mag_switch)
+    {
+        spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+        if(!sc.mag_converged() && this->drho>0 && this->drho < PARAM.inp.sc_scf_thr)
+        {
+            // optimize lambda to get target magnetic moments, but the lambda is not near target
+            sc.run_lambda_loop(iter-1);
+            sc.set_mag_converged(true);
+            skip_solve = true;
+        }
+        else if(sc.mag_converged())
+        {
+            // mag_converged is already set: run the lambda loop again instead of the regular solve
+            sc.run_lambda_loop(iter-1);
+            skip_solve = true;
+        }
+    }
+    if(!skip_solve)
+    {
+        hsolver::HSolverPW<T, Device> hsolver_pw_obj(this->pw_wfc,
+                                                    PARAM.inp.calculation,
+                                                    PARAM.inp.basis_type,
+                                                    PARAM.inp.ks_solver,
+                                                    PARAM.inp.use_paw,
+                                                    PARAM.globalv.use_uspp,
+                                                    PARAM.inp.nspin,
+                                                    hsolver::DiagoIterAssist<T, Device>::SCF_ITER,
+                                                    hsolver::DiagoIterAssist<T, Device>::PW_DIAG_NMAX,
+                                                    hsolver::DiagoIterAssist<T, Device>::PW_DIAG_THR,
+                                                    hsolver::DiagoIterAssist<T, Device>::need_subspace);
+
+        hsolver_pw_obj.solve(this->p_hamilt,
+                            this->kspw_psi[0],
+                            this->pelec,
+                            this->pelec->ekb.c,
+                            GlobalV::RANK_IN_POOL,
+                            GlobalV::NPROC_IN_POOL,
+                            skip_charge,
+                            ucell.tpiba,
+                            ucell.nat);
+    }
 
     Symmetry_rho srho;
     for (int is = 0; is < PARAM.inp.nspin; is++)
@@ -517,6 +628,20 @@ void ESolver_KS_PW<T, Device>::iter_finish(UnitCell& ucell, const int istep, int
             // functions into file WAVEFUNC.dat");
         }
     }
+    // 4) check if oscillate for delta_spin method
+    if(PARAM.inp.sc_mag_switch)
+    {
+        spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+        if(!sc.higher_mag_prec)
+        {
+            sc.higher_mag_prec = 
+                this->p_chgmix->if_scf_oscillate(iter, this->drho, PARAM.inp.sc_os_ndim, PARAM.inp.scf_os_thr);
+            if(sc.higher_mag_prec)
+            { // if oscillate, increase the precision of magnetization and do mixing_restart in next iteration
+                this->p_chgmix->mixing_restart_step = iter + 1;
+            }
+        }
+    }
 }
 
 template <typename T, typename Device>
@@ -600,6 +725,22 @@ void ESolver_KS_PW<T, Device>::after_scf(UnitCell& ucell, const int istep)
         bp.Macroscopic_polarization(ucell,this->pw_wfc->npwk_max, this->psi, this->pw_rho, this->pw_wfc, this->kv);
         std::cout << FmtCore::format(" >> Finish %s.\n * * * * * *\n", "Berry phase polarization");
     }
+
+    // 8) write spin constrain results
+    // spin constrain calculations, write atomic magnetization and magnetic force.
+    if (PARAM.inp.sc_mag_switch) {
+        spinconstrain::SpinConstrain<std::complex<double>>& sc
+            = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+        sc.cal_mi_pw();
+        sc.print_Mag_Force(GlobalV::ofs_running);
+    }
+
+    // 9) write onsite occupations for charge and magnetizations
+    if(PARAM.inp.onsite_radius > 0)
+    { // float type has not been implemented
+        auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+        onsite_p->cal_occupations(reinterpret_cast<psi::Psi<std::complex<double>, Device>*>(this->kspw_psi), this->pelec->wg);
+    }
 }
 
 template <typename T, typename Device>
diff --git a/source/module_esolver/lcao_before_scf.cpp b/source/module_esolver/lcao_before_scf.cpp
index b02c92729d..2066b5069b 100644
--- a/source/module_esolver/lcao_before_scf.cpp
+++ b/source/module_esolver/lcao_before_scf.cpp
@@ -238,7 +238,6 @@ void ESolver_KS_LCAO<TK, TR>::before_scf(UnitCell& ucell, const int istep)
                    &(this->pv),
                    PARAM.inp.nspin,
                    this->kv,
-                   PARAM.inp.ks_solver,
                    this->p_hamilt,
                    this->psi,
                    this->pelec);
diff --git a/source/module_esolver/lcao_others.cpp b/source/module_esolver/lcao_others.cpp
index 50012599c1..fa82e38ff6 100644
--- a/source/module_esolver/lcao_others.cpp
+++ b/source/module_esolver/lcao_others.cpp
@@ -242,7 +242,6 @@ void ESolver_KS_LCAO<TK, TR>::others(UnitCell& ucell, const int istep)
                    &(this->pv),
                    PARAM.inp.nspin,
                    this->kv,
-                   PARAM.inp.ks_solver,
                    this->p_hamilt,
                    this->psi,
                    this->pelec);
diff --git a/source/module_hamilt_general/operator.h b/source/module_hamilt_general/operator.h
index f040efc710..6cf29122fe 100644
--- a/source/module_hamilt_general/operator.h
+++ b/source/module_hamilt_general/operator.h
@@ -17,6 +17,7 @@ enum class calculation_type
     pw_nonlocal,
     pw_veff,
     pw_meta,
+    pw_onsite,
     lcao_overlap,
     lcao_fixed,
     lcao_gint,
diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp
index 9da6f58564..94c5c74db7 100644
--- a/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp
@@ -5,7 +5,7 @@
 #include "module_base/scalapack_connector.h"
 #include "module_base/tool_title.h"
 #include "module_base/timer.h"
-//#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
 #include "spin_constrain.h"
 #include "module_parameter/parameter.h"
 #ifdef __LCAO
@@ -51,7 +51,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_lcao(const int&
 
 #endif
 
-/*template <>
+template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_pw()
 {
     ModuleBase::TITLE("module_deltaspin", "cal_mi_pw");
@@ -154,7 +154,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mi_pw()
     Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, GlobalV::NPROC_IN_POOL, &(this->Mi_[0][0]), 3 * this->Mi_.size());
     
     ModuleBase::timer::tick("spinconstrain::SpinConstrain", "cal_mi_pw");
-}*/
+}
 
 template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::set_operator(
diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp
index 4ce31dfeda..87a2fa41cc 100644
--- a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp
@@ -3,10 +3,12 @@
 #include "module_hsolver/diago_iter_assist.h"
 #include "module_parameter/parameter.h"
 #include "spin_constrain.h"
-//#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
 #include "module_base/parallel_reduce.h"
 #include "module_hsolver/kernels/math_kernel_op.h"
 #include "module_hsolver/hsolver_lcao.h"
+#include "module_hsolver/hsolver_pw.h"
+#include "module_elecstate/elecstate_pw.h"
 
 #ifdef __LCAO
 #include "module_elecstate/elecstate_lcao.h"
@@ -18,7 +20,23 @@ template <>
 void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std::complex<double>* h_tmp, const std::complex<double>* becp_k, const ModuleBase::Vector3<double>* delta_lambda, const int nbands, const int nkb, const int* nh_iat)
 {
     int sum = 0;
-    std::vector<std::complex<double>> ps(nkb * 2 * nbands, 0.0);
+    int size_ps = nkb * 2 * nbands;
+    std::complex<double>* becp_cpu = nullptr;
+    if(PARAM.inp.device == "gpu")
+    {
+#if ((defined __CUDA) || (defined __ROCM))
+        base_device::DEVICE_GPU* ctx = {};
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx, becp_cpu, size_ps);
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, becp_cpu, becp_k, size_ps);   
+#endif
+    }
+    else if (PARAM.inp.device == "cpu")
+    {
+        becp_cpu = const_cast<std::complex<double>*>(becp_k);
+    }
+
+    std::vector<std::complex<double>> ps(size_ps, 0.0);
     for (int iat = 0; iat < this->Mi_.size(); iat++)
     {
         const int nproj = nh_iat[iat];
@@ -34,8 +52,8 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
             for (int ip = 0; ip < nproj; ip++)
             {
                 const int becpind = ib * nkb + sum + ip;
-                const std::complex<double> becp1 = becp_k[becpind];
-                const std::complex<double> becp2 = becp_k[becpind + nkb];
+                const std::complex<double> becp1 = becp_cpu[becpind];
+                const std::complex<double> becp2 = becp_cpu[becpind + nkb];
                 ps[becpind] += coefficients0 * becp1
                                 + coefficients2 * becp2;
                 ps[becpind + nkb] += coefficients1 * becp1
@@ -44,27 +62,69 @@ void spinconstrain::SpinConstrain<std::complex<double>>::calculate_delta_hcc(std
         } // end ib
         sum += nproj;
     } // end iat
+    std::complex<double>* ps_pointer = nullptr;
+    if(PARAM.inp.device == "gpu")
+    {
+#if ((defined __CUDA) || (defined __ROCM))
+        base_device::DEVICE_GPU* ctx = {};
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, ps_pointer, size_ps);
+        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ctx, cpu_ctx, ps_pointer, ps.data(), size_ps);   
+#endif
+    }
+    else if (PARAM.inp.device == "cpu")
+    {
+        ps_pointer = ps.data();
+    }
     // update h_tmp by becp_k * ps
     char transa = 'C';
     char transb = 'N';
     const int npm = nkb * 2;
-    base_device::DEVICE_CPU* ctx = {};
-    hsolver::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
-        ctx,
-        transa,
-        transb,
-        nbands,
-        nbands,
-        npm,
-        &ModuleBase::ONE,
-        becp_k,
-        npm,
-        ps.data(),
-        npm,
-        &ModuleBase::ONE,
-        h_tmp,
-        nbands
-    );
+    if (PARAM.inp.device == "gpu")
+    {
+#if ((defined __CUDA) || (defined __ROCM))
+        base_device::DEVICE_GPU* ctx = {};
+        base_device::DEVICE_CPU* cpu_ctx = {}; // only needed to release becp_cpu with the op paired to its allocation
+        hsolver::gemm_op<std::complex<double>, base_device::DEVICE_GPU>()(
+            ctx,
+            transa,
+            transb,
+            nbands,
+            nbands,
+            npm,
+            &ModuleBase::ONE,
+            becp_k,
+            npm,
+            ps_pointer,
+            npm,
+            &ModuleBase::ONE,
+            h_tmp,
+            nbands
+        );
+        base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, ps_pointer); // device copy of ps
+        base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_CPU>()(cpu_ctx, becp_cpu); // host copy of becp_k; obtained from resize_memory_op, so not delete[]
+#endif
+    }
+    else if (PARAM.inp.device == "cpu")
+    {
+        base_device::DEVICE_CPU* ctx = {};
+        hsolver::gemm_op<std::complex<double>, base_device::DEVICE_CPU>()(
+            ctx,
+            transa,
+            transb,
+            nbands,
+            nbands,
+            npm,
+            &ModuleBase::ONE,
+            becp_k,
+            npm,
+            ps_pointer,
+            npm,
+            &ModuleBase::ONE,
+            h_tmp,
+            nbands
+        );
+    }
 }
 
 template <>
@@ -103,149 +163,199 @@ void spinconstrain::SpinConstrain<std::complex<double>>::cal_mw_from_lambda(int
     else
 #endif
     {
-        /*this->zero_Mi();
-        int size_becp = 0;
-        std::vector<std::complex<double>> becp_tmp;
-        int nk = 0;
-        int nkb = 0;
-        int nbands = 0;
-        int npol = 0;
-        const int* nh_iat = nullptr;
-        if (PARAM.inp.device == "cpu")
+        /*if (i_step == -1 && this->higher_mag_prec)
         {
-            psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
-            hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>*>(this->p_hamilt);
-            auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
-            nbands = psi_t->get_nbands();
-            npol = psi_t->npol;
-            nkb = onsite_p->get_tot_nproj();
-            nk = psi_t->get_nk();
-            nh_iat = &onsite_p->get_nh(0);
-            size_becp = nbands * nkb * npol;
-            becp_tmp.resize(size_becp * nk);
-            std::vector<std::complex<double>> h_tmp(nbands * nbands), s_tmp(nbands * nbands);
-            int initial_hs = 0;
-            if(this->sub_h_save == nullptr)
+            // std::cout<<__FILE__<<__LINE__<<"istep == 0"<<std::endl;
+            if (PARAM.inp.device == "cpu")
             {
-                initial_hs = 1;
-                this->sub_h_save = new std::complex<double>[nbands * nbands * nk];
-                this->sub_s_save = new std::complex<double>[nbands * nbands * nk];
-                this->becp_save = new std::complex<double>[size_becp * nk];
+                psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
+                hamilt::Hamilt<std::complex<double>>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>>*>(this->p_hamilt);
+                hsolver::HSolver<std::complex<double>, base_device::DEVICE_CPU>* hsolver_t = static_cast<hsolver::HSolver<std::complex<double>, base_device::DEVICE_CPU>*>(this->phsol);
+                hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, true);
             }
-            for (int ik = 0; ik < nk; ++ik)
+            else
             {
-
-                psi_t->fix_k(ik);
-
-                std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
-                std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
-                std::complex<double>* becp_k = this->becp_save + ik * size_becp;
-                if(initial_hs)
+                psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
+                hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
+                hsolver::HSolver<std::complex<double>, base_device::DEVICE_GPU>* hsolver_t = static_cast<hsolver::HSolver<std::complex<double>, base_device::DEVICE_GPU>*>(this->phsol);
+                hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, true);
+            }
+            this->pelec->calculate_weights();
+            this->cal_Mi_pw();
+        }
+        else*/
+        {
+            this->zero_Mi();
+            int size_becp = 0;
+            std::vector<std::complex<double>> becp_tmp;
+            int nk = 0;
+            int nkb = 0;
+            int nbands = 0;
+            int npol = 0;
+            const int* nh_iat = nullptr;
+            if (PARAM.inp.device == "cpu")
+            {
+                psi::Psi<std::complex<double>>* psi_t = static_cast<psi::Psi<std::complex<double>>*>(this->psi);
+                hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_CPU>*>(this->p_hamilt);
+                auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_CPU>::get_instance();
+                nbands = psi_t->get_nbands();
+                npol = psi_t->npol;
+                nkb = onsite_p->get_tot_nproj();
+                nk = psi_t->get_nk();
+                nh_iat = &onsite_p->get_nh(0);
+                size_becp = nbands * nkb * npol;
+                becp_tmp.resize(size_becp * nk);
+                std::vector<std::complex<double>> h_tmp(nbands * nbands), s_tmp(nbands * nbands);
+                int initial_hs = 0;
+                if(this->sub_h_save == nullptr) // first call: allocate the per-k caches of H, S and becp
                 {
-                    /// update H(k) for each k point
-                    hamilt_t->updateHk(ik);
-                    hsolver::DiagoIterAssist<std::complex<double>>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k);
-                    memcpy(becp_k, onsite_p->get_becp(), sizeof(std::complex<double>) * size_becp);
+                    initial_hs = 1;
+                    this->sub_h_save = new std::complex<double>[nbands * nbands * nk];
+                    this->sub_s_save = new std::complex<double>[nbands * nbands * nk];
+                    this->becp_save = new std::complex<double>[size_becp * nk];
                 }
-                memcpy(h_tmp.data(), h_k, sizeof(std::complex<double>) * nbands * nbands);
-                memcpy(s_tmp.data(), s_k, sizeof(std::complex<double>) * nbands * nbands);
-                // update h_tmp by delta_lambda
-                if (i_step != -1) this->calculate_delta_hcc(h_tmp.data(), becp_k, delta_lambda, nbands, nkb, nh_iat);
+                for (int ik = 0; ik < nk; ++ik)
+                {
 
-                hsolver::DiagoIterAssist<std::complex<double>>::diag_responce(h_tmp.data(),
-                                                                                s_tmp.data(),
-                                                                                nbands,
-                                                                                becp_k,
-                                                                                &becp_tmp[ik * size_becp],
-                                                                                nkb * 2,
-                                                                                &this->pelec->ekb(ik, 0));
+                    psi_t->fix_k(ik);
+
+                    std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
+                    std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
+                    std::complex<double>* becp_k = this->becp_save + ik * size_becp;
+                    if(initial_hs)
+                    {
+                        /// update H(k) for each k point
+                        hamilt_t->updateHk(ik);
+                        hsolver::DiagoIterAssist<std::complex<double>>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k);
+                        memcpy(becp_k, onsite_p->get_becp(), sizeof(std::complex<double>) * size_becp);
+                    }
+                    memcpy(h_tmp.data(), h_k, sizeof(std::complex<double>) * nbands * nbands);
+                    memcpy(s_tmp.data(), s_k, sizeof(std::complex<double>) * nbands * nbands);
+                    // update h_tmp by delta_lambda
+                    if (i_step != -1) this->calculate_delta_hcc(h_tmp.data(), becp_k, delta_lambda, nbands, nkb, nh_iat);
+
+                    hsolver::DiagoIterAssist<std::complex<double>>::diag_responce(h_tmp.data(),
+                                                                                  s_tmp.data(),
+                                                                                  nbands,
+                                                                                  becp_k,
+                                                                                  &becp_tmp[ik * size_becp],
+                                                                                  nkb * 2,
+                                                                                  &this->pelec->ekb(ik, 0));
+                }
             }
-        }
 #if ((defined __CUDA) || (defined __ROCM))
-        else
-        {
-            base_device::DEVICE_GPU* ctx = {};
-            base_device::DEVICE_CPU* cpu_ctx = {};
-            psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
-            hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
-            auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
-            nbands = psi_t->get_nbands();
-            npol = psi_t->npol;
-            nkb = onsite_p->get_tot_nproj();
-            nk = psi_t->get_nk();
-            nh_iat = &onsite_p->get_nh(0);
-            size_becp = nbands * nkb * npol;
-            becp_tmp.resize(size_becp * nk);
-            std::complex<double>* becp_pointer = nullptr;
-            // allocate memory for becp_pointer in GPU device
-            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp);
-            for (int ik = 0; ik < nk; ++ik)
+            else
             {
-                /// update H(k) for each k point
-                hamilt_t->updateHk(ik);
+                base_device::DEVICE_GPU* ctx = {};
+                base_device::DEVICE_CPU* cpu_ctx = {};
+                psi::Psi<std::complex<double>, base_device::DEVICE_GPU>* psi_t = static_cast<psi::Psi<std::complex<double>, base_device::DEVICE_GPU>*>(this->psi);
+                hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>* hamilt_t = static_cast<hamilt::Hamilt<std::complex<double>, base_device::DEVICE_GPU>*>(this->p_hamilt);
+                auto* onsite_p = projectors::OnsiteProjector<double, base_device::DEVICE_GPU>::get_instance();
+                nbands = psi_t->get_nbands();
+                npol = psi_t->npol;
+                nkb = onsite_p->get_tot_nproj();
+                nk = psi_t->get_nk();
+                nh_iat = &onsite_p->get_nh(0);
+                size_becp = nbands * nkb * npol;
+                becp_tmp.resize(size_becp * nk);
+                std::complex<double>* h_tmp = nullptr;
+                std::complex<double>* s_tmp = nullptr;
+                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands);
+                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands);
+                int initial_hs = 0;
+                if(this->sub_h_save == nullptr) // first call: allocate the per-k caches of H, S and becp on the device
+                {
+                    initial_hs = 1;
+                    
+                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, this->sub_h_save, nbands * nbands * nk);
+                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, this->sub_s_save, nbands * nbands * nk);
+                    base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, this->becp_save, size_becp * nk);
+                }
+                std::complex<double>* becp_pointer = nullptr;
+                // allocate memory for becp_pointer in GPU device
+                base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp);
+                for (int ik = 0; ik < nk; ++ik)
+                {
+                    psi_t->fix_k(ik);
 
-                psi_t->fix_k(ik);
+                    std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
+                    std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
+                    std::complex<double>* becp_k = this->becp_save + ik * size_becp;
+                    if(initial_hs)
+                    {
+                        /// update H(k) for each k point
+                        hamilt_t->updateHk(ik);
+                        hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k);
+                        base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, becp_k, onsite_p->get_becp(), size_becp);
+                    }
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands);
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands);
+                    // update h_tmp by delta_lambda
+                    if (i_step != -1) this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat);
 
-                const std::complex<double>* becp_new = onsite_p->get_becp();
-                hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_responce(hamilt_t,
-                                                                                psi_t[0],
-                                                                                becp_new,
-                                                                                becp_pointer,
-                                                                                nkb * npol,
-                                                                                &this->pelec->ekb(ik, 0));
-                // copy becp_pointer from GPU to CPU
-                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp);   
-            }
+                    hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_responce(h_tmp,
+                                                                                  s_tmp,
+                                                                                  nbands,
+                                                                                  becp_k,
+                                                                                  becp_pointer,
+                                                                                  nkb * npol,
+                                                                                  &this->pelec->ekb(ik, 0));
+                    // copy becp_pointer from GPU to CPU
+                    base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp);   
+                }
 
-            // free memory for becp_pointer in GPU device
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer);
-        }
+                // free memory for becp_pointer in GPU device
+                base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer);
+                // free the per-call scratch copies of H and S on the GPU device (otherwise leaked on every call)
+                base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, h_tmp);
+                base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, s_tmp);
+            }
 #endif
-        // calculate weights from ekb to update wg
-        this->pelec->calculate_weights();
-        // calculate Mi from existed becp
-        for (int ik = 0; ik < nk; ik++)
-        {
-            const std::complex<double>* becp = &becp_tmp[ik * size_becp];
-            // becp(nbands*npol , nkb)
-            // mag = wg * \sum_{nh}becp * becp
-            for (int ib = 0; ib < nbands; ib++)
+            // calculate weights from ekb to update wg
+            this->pelec->calculate_weights();
+            // calculate Mi from the existing becp
+            for (int ik = 0; ik < nk; ik++)
             {
-                const double weight = this->pelec->wg(ik, ib);
-                int begin_ih = 0;
-                for (int iat = 0; iat < this->Mi_.size(); iat++)
+                const std::complex<double>* becp = &becp_tmp[ik * size_becp];
+                // becp(nbands*npol , nkb)
+                // mag = wg * \sum_{nh}becp * becp
+                for (int ib = 0; ib < nbands; ib++)
                 {
-                    const int nh = nh_iat[iat];
-                    std::complex<double> occ[4]
-                        = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO};
-                    for (int ih = 0; ih < nh; ih++)
+                    const double weight = this->pelec->wg(ik, ib);
+                    int begin_ih = 0;
+                    for (int iat = 0; iat < this->Mi_.size(); iat++)
                     {
-                        const int index = ib * npol * nkb + begin_ih + ih;
-                        occ[0] += conj(becp[index]) * becp[index];
-                        occ[1] += conj(becp[index]) * becp[index + nkb];
-                        occ[2] += conj(becp[index + nkb]) * becp[index];
-                        occ[3] += conj(becp[index + nkb]) * becp[index + nkb];
+                        const int nh = nh_iat[iat];
+                        std::complex<double> occ[4]
+                            = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO};
+                        for (int ih = 0; ih < nh; ih++)
+                        {
+                            const int index = ib * npol * nkb + begin_ih + ih;
+                            occ[0] += conj(becp[index]) * becp[index];
+                            occ[1] += conj(becp[index]) * becp[index + nkb];
+                            occ[2] += conj(becp[index + nkb]) * becp[index];
+                            occ[3] += conj(becp[index + nkb]) * becp[index + nkb];
+                        }
+                        // occ has been reduced and calculate mag
+                        this->Mi_[iat].x += weight * (occ[1] + occ[2]).real();
+                        this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag();
+                        this->Mi_[iat].z += weight * (occ[0] - occ[3]).real();
+                        begin_ih += nh;
                     }
-                    // occ has been reduced and calculate mag
-                    this->Mi_[iat].x += weight * (occ[1] + occ[2]).real();
-                    this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag();
-                    this->Mi_[iat].z += weight * (occ[0] - occ[3]).real();
-                    begin_ih += nh;
                 }
             }
+            Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar,
+                                                   GlobalV::NPROC_IN_POOL,
+                                                   &(this->Mi_[0][0]),
+                                                   3 * this->Mi_.size());
+            // for(int i = 0; i < this->Mi_.size(); i++)
+            //{
+            //     std::cout<<"atom"<<i<<": "<<" mag: "<<this->Mi_[i].x<<" "<<this->Mi_[i].y<<" "<<this->Mi_[i].z<<"
+            //     "<<this->lambda_[i].x<<" "<<this->lambda_[i].y<<" "<<this->lambda_[i].z<<std::endl;
+            // }
         }
-        Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar,
-                                                GlobalV::NPROC_IN_POOL,
-                                                &(this->Mi_[0][0]),
-                                                3 * this->Mi_.size());
-        // for(int i = 0; i < this->Mi_.size(); i++)
-        //{
-        //     std::cout<<"atom"<<i<<": "<<" mag: "<<this->Mi_[i].x<<" "<<this->Mi_[i].y<<" "<<this->Mi_[i].z<<"
-        //     "<<this->lambda_[i].x<<" "<<this->lambda_[i].y<<" "<<this->lambda_[i].z<<std::endl;
-        // }
-        */
     }
     ModuleBase::timer::tick("spinconstrain::SpinConstrain", "cal_mw_from_lambda");
 }
 
 template <>
@@ -262,7 +369,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
     else
 #endif
     {
-        /*int size_becp = 0;
+        int size_becp = 0; // assigned nbands * nkb * npol further down in this function
         std::vector<std::complex<double>> becp_tmp;
         int nk = 0;
         int nkb = 0;
@@ -311,8 +418,27 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
 
             if(pw_solve)
             {
-                hsolver::HSolver<std::complex<double>, base_device::DEVICE_CPU>* hsolver_t = static_cast<hsolver::HSolver<std::complex<double>, base_device::DEVICE_CPU>*>(this->phsol);
-                hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, false);
+                hsolver::HSolverPW<std::complex<double>, base_device::DEVICE_CPU> hsolver_pw_obj(this->pw_wfc_, // local solver built per call from PARAM inputs and DiagoIterAssist static settings
+                                                 PARAM.inp.calculation,
+                                                 PARAM.inp.basis_type,
+                                                 PARAM.inp.ks_solver,
+                                                 PARAM.inp.use_paw,
+                                                 PARAM.globalv.use_uspp,
+                                                 PARAM.inp.nspin,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::SCF_ITER,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::PW_DIAG_NMAX,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::PW_DIAG_THR,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_CPU>::need_subspace);
+
+                hsolver_pw_obj.solve(hamilt_t,
+                         psi_t[0],
+                         this->pelec,
+                         this->pelec->ekb.c, // presumably the raw buffer behind pelec->ekb — TODO confirm
+                         GlobalV::RANK_IN_POOL,
+                         GlobalV::NPROC_IN_POOL,
+                         false, // NOTE(review): flag meaning is not visible here; the replaced solve() call also passed false last
+                         this->tpiba,
+                         this->get_nat());
             }
             else
             {// update charge density only
@@ -333,36 +459,72 @@ void spinconstrain::SpinConstrain<std::complex<double>>::update_psi_charge(const
             nk = psi_t->get_nk();
             nh_iat = &onsite_p->get_nh(0);
             size_becp = nbands * nkb * npol;
-            becp_tmp.resize(size_becp * nk);
-            std::complex<double>* becp_pointer = nullptr;
-            // allocate memory for becp_pointer in GPU device
-            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp);
+
+            std::complex<double>* h_tmp = nullptr;
+            std::complex<double>* s_tmp = nullptr;
+            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands);
+            base_device::memory::resize_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands);
+            assert(this->sub_h_save != nullptr); // the caches are allocated in cal_mw_from_lambda; they must exist here
+            assert(this->sub_s_save != nullptr);
+            assert(this->becp_save != nullptr);
             for (int ik = 0; ik < nk; ++ik)
             {
-                /// update H(k) for each k point
-                hamilt_t->updateHk(ik);
+                std::complex<double>* h_k = this->sub_h_save + ik * nbands * nbands;
+                std::complex<double>* s_k = this->sub_s_save + ik * nbands * nbands;
+                std::complex<double>* becp_k = this->becp_save + ik * size_becp;
 
                 psi_t->fix_k(ik);
-
-                const std::complex<double>* becp_new = onsite_p->get_becp();
-                hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_responce(hamilt_t,
+                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands);
+                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands);
+                this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat);
+                hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::diag_subspace_psi(h_tmp,
+                                                                                s_tmp,
+                                                                                nbands,
                                                                                 psi_t[0],
-                                                                                becp_new,
-                                                                                becp_pointer,
-                                                                                nkb * npol,
                                                                                 &this->pelec->ekb(ik, 0));
-                // copy becp_pointer from GPU to CPU
-                base_device::memory::synchronize_memory_op<std::complex<double>, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp);   
             }
 
-            // free memory for becp_pointer in GPU device
-            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_pointer);
+            // free the per-call scratch copies of H and S on the GPU device (otherwise leaked on every call)
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, h_tmp);
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, s_tmp);
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, sub_h_save); // release the per-k caches; cal_mw_from_lambda re-allocates them when sub_h_save is nullptr
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, sub_s_save);
+            base_device::memory::delete_memory_op<std::complex<double>, base_device::DEVICE_GPU>()(ctx, becp_save);
+            this->sub_h_save = nullptr;
+            this->sub_s_save = nullptr;
+            this->becp_save = nullptr;
+
+            if(pw_solve)
+            {
+                hsolver::HSolverPW<std::complex<double>, base_device::DEVICE_GPU> hsolver_pw_obj(this->pw_wfc_,
+                                                 PARAM.inp.calculation,
+                                                 PARAM.inp.basis_type,
+                                                 PARAM.inp.ks_solver,
+                                                 PARAM.inp.use_paw,
+                                                 PARAM.globalv.use_uspp,
+                                                 PARAM.inp.nspin,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::SCF_ITER,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::PW_DIAG_NMAX,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::PW_DIAG_THR,
+                                                 hsolver::DiagoIterAssist<std::complex<double>, base_device::DEVICE_GPU>::need_subspace);
 
-            hsolver::HSolver<std::complex<double>, base_device::DEVICE_GPU>* hsolver_t = static_cast<hsolver::HSolver<std::complex<double>, base_device::DEVICE_GPU>*>(this->phsol);
-            hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, false);
+                hsolver_pw_obj.solve(hamilt_t,
+                         psi_t[0],
+                         this->pelec,
+                         this->pelec->ekb.c,
+                         GlobalV::RANK_IN_POOL,
+                         GlobalV::NPROC_IN_POOL,
+                         false,
+                         this->tpiba,
+                         this->get_nat());
+            }
+            else
+            {// update charge density only
+                reinterpret_cast<elecstate::ElecStatePW<std::complex<double>, base_device::DEVICE_GPU>*>(this->pelec)->psiToRho(*psi_t);
+            }
+            
         }
-#endif     
-        */  
+#endif       
     }
     ModuleBase::timer::tick("spinconstrain::SpinConstrain", "update_psi_charge");
 }
diff --git a/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp b/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp
index 45cb492780..fbba82a839 100644
--- a/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp
@@ -12,10 +12,10 @@ void spinconstrain::SpinConstrain<FPTYPE>::init_sc(double sc_thr_in,
                                             Parallel_Orbitals* ParaV_in,
                                             int nspin_in,
                                             K_Vectors& kv_in,
-                                            std::string KS_SOLVER_in,
                                             void* p_hamilt_in,
                                             void* psi_in,
-                                            elecstate::ElecState* pelec_in)
+                                            elecstate::ElecState* pelec_in,
+                                            ModulePW::PW_Basis_K* pw_wfc_in) // stored in this->pw_wfc_ in the body
 {
     this->set_input_parameters(sc_thr_in, nsc_in, nsc_min_in, alpha_trial_in, sccut_in, sc_drop_thr_in);
     this->set_atomCounts(ucell.get_atom_Counts());
@@ -26,9 +26,11 @@ void spinconstrain::SpinConstrain<FPTYPE>::init_sc(double sc_thr_in,
     this->lambda_ = ucell.get_lambda();
     this->constrain_ = ucell.get_constrain();
     this->atomLabels_ = ucell.get_atomLabels();
+    this->tpiba = ucell.tpiba; // copied from the unit cell; later passed to HSolverPW::solve in the PW path
+    this->pw_wfc_ = pw_wfc_in; // may be nullptr: the declaration defaults this argument to nullptr
     this->set_decay_grad();
     if(ParaV_in != nullptr) this->set_ParaV(ParaV_in);
-    this->set_solver_parameters(kv_in, p_hamilt_in, psi_in, pelec_in, KS_SOLVER_in);
+    this->set_solver_parameters(kv_in, p_hamilt_in, psi_in, pelec_in); // the solver-name string is no longer forwarded
 }
 
 template class spinconstrain::SpinConstrain<std::complex<double>>;
diff --git a/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp b/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp
index f4eb1f7edc..cad7b64c7c 100644
--- a/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp
@@ -202,7 +202,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(int out
         {
             //add_scalar_multiply_2d(initial_lambda, dnu_last_step, 1.0, this->lambda_);
             this->update_psi_charge(dnu_last_step.data(), rerun);
-            /*if(PARAM.inp.basis_type == "pw")
+            if(PARAM.inp.basis_type == "pw") // PW basis only: re-check the atomic moments after update_psi_charge
             {
                 //double check Atomic spin moment
                 this->cal_mi_pw();
@@ -224,7 +224,7 @@ void spinconstrain::SpinConstrain<std::complex<double>>::run_lambda_loop(int out
                     std::cout<<"Error: RMS error is too large, rerun the loop"<<std::endl;
                     this->run_lambda_loop(outer_step, false);
                 }
-            }*/
+            } // end of the PW-only re-check
             break;
         }
 #ifdef __MPI
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
index 30ac4d7dfd..1339fc4601 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp
@@ -490,14 +490,12 @@ template <typename FPTYPE>
 void SpinConstrain<FPTYPE>::set_solver_parameters(K_Vectors& kv_in,
                                                   void* p_hamilt_in,
                                                   void* psi_in,
-                                                  elecstate::ElecState* pelec_in,
-                                                  std::string KS_SOLVER_in)
+                                                  elecstate::ElecState* pelec_in)
 {
     this->kv_ = kv_in;
     this->p_hamilt = p_hamilt_in;
     this->psi = psi_in;
     this->pelec = pelec_in;
-    this->KS_SOLVER = KS_SOLVER_in;
 }
 
 /// @brief  set ParaV
diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
index 866f2373e0..2e7cf6c8db 100644
--- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
+++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h
@@ -37,17 +37,17 @@ class SpinConstrain
                Parallel_Orbitals* ParaV_in,
                int nspin_in,
                K_Vectors& kv_in,
-               std::string KS_SOLVER_in,
                void* p_hamilt_in,
                void* psi_in,
-               elecstate::ElecState* pelec_in);
+               elecstate::ElecState* pelec_in,
+               ModulePW::PW_Basis_K* pw_wfc_in = nullptr);
 
   /// @brief calculate the magnetization of each atom with real space projection method for LCAO base
   /// @param step : the step number of the SCF calculation
   /// @param print : print the magnetization of each atom if true
   void cal_mi_lcao(const int& step, bool print = false);
 
-  //void cal_mi_pw();
+  void cal_mi_pw();
 
   void cal_mw_from_lambda(int i_step, const ModuleBase::Vector3<double>* delta_lambda = nullptr);
 
@@ -108,7 +108,8 @@ class SpinConstrain
     void* p_hamilt = nullptr;
     void* psi = nullptr;
     elecstate::ElecState* pelec = nullptr;
-    std::string KS_SOLVER;
+    ModulePW::PW_Basis_K* pw_wfc_ = nullptr; /// plane-wave basis handed to init_sc (pw_wfc_in); stays nullptr when none is given
+    double tpiba = 0.0; /// save ucell.tpiba
     const double meV_to_Ry = 7.349864435130999e-05;
     K_Vectors kv_;
     //--------------------------------------------------------------------------------
@@ -203,8 +204,7 @@ class SpinConstrain
     void set_solver_parameters(K_Vectors& kv_in,
                                void* p_hamilt_in,
                                void* psi_in,
-                               elecstate::ElecState* pelec_in,
-                               std::string KS_SOLVER_in);
+                               elecstate::ElecState* pelec_in);
 
   private:
     SpinConstrain(){};                               // Private constructor
diff --git a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
index 72f2941a75..1fd36524e3 100644
--- a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
+++ b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp
@@ -149,12 +149,11 @@ TYPED_TEST(SpinConstrainTest, SetSolverParameters)
 {
     K_Vectors kv;
     this->sc.set_nspin(4);
-    this->sc.set_solver_parameters(kv, nullptr, nullptr, nullptr, "genelpa");
+    this->sc.set_solver_parameters(kv, nullptr, nullptr, nullptr);
     EXPECT_EQ(this->sc.get_nspin(), 4);
     EXPECT_EQ(this->sc.p_hamilt, nullptr);
     EXPECT_EQ(this->sc.psi, nullptr);
     EXPECT_EQ(this->sc.pelec, nullptr);
-    EXPECT_EQ(this->sc.KS_SOLVER, "genelpa");
 }
 
 TYPED_TEST(SpinConstrainTest, SetParaV)
diff --git a/source/module_hamilt_lcao/module_dftu/CMakeLists.txt b/source/module_hamilt_lcao/module_dftu/CMakeLists.txt
index dd7197dbfd..d412154970 100644
--- a/source/module_hamilt_lcao/module_dftu/CMakeLists.txt
+++ b/source/module_hamilt_lcao/module_dftu/CMakeLists.txt
@@ -1,22 +1,23 @@
+# DFT+U library. Built for every configuration because dftu_pw.cpp serves the
+# plane-wave basis; LCAO-only code in the sources is expected to be guarded by #ifdef __LCAO.
-if(ENABLE_LCAO)
-  list(APPEND objects
-        dftu.cpp
-        dftu_force.cpp
-        dftu_yukawa.cpp
-        dftu_folding.cpp
-        dftu_io.cpp
-        dftu_tools.cpp
-        dftu_occup.cpp
-        dftu_hamilt.cpp
-  )
+list(APPEND objects
+      dftu.cpp
+      dftu_force.cpp
+      dftu_yukawa.cpp
+      dftu_folding.cpp
+      dftu_io.cpp
+      dftu_tools.cpp
+      dftu_occup.cpp
+      dftu_hamilt.cpp
+      dftu_pw.cpp
+)
 
-  add_library(
-      dftu
-      OBJECT
-      ${objects}
-  )
+add_library(
+    dftu
+    OBJECT
+    ${objects}
+)
 
-  if(ENABLE_COVERAGE)
-    add_coverage(dftu)
-  endif()
+if(ENABLE_COVERAGE)
+  add_coverage(dftu)
-endif()
\ No newline at end of file
+endif()
diff --git a/source/module_hamilt_lcao/module_dftu/dftu.cpp b/source/module_hamilt_lcao/module_dftu/dftu.cpp
index 9f5eb09a3a..2dd705a03c 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu.cpp
@@ -38,8 +38,10 @@ DFTU::~DFTU()
 
 void DFTU::init(UnitCell& cell, // unitcell class
                 const Parallel_Orbitals* pv,
-                const int& nks,
-                const LCAO_Orbitals& orb
+                const int nks
+#ifdef __LCAO
+                , const LCAO_Orbitals* orb
+#endif
                 )
 {
     ModuleBase::TITLE("DFTU", "init");
@@ -50,9 +52,14 @@ void DFTU::init(UnitCell& cell, // unitcell class
 #endif
 
     this->paraV = pv;
-    
-    ptr_orb_ = &orb;
-    orb_cutoff_ = orb.cutoffs();
+
+#ifdef __LCAO    
+    ptr_orb_ = orb;
+    if(ptr_orb_ != nullptr)
+    {
+        orb_cutoff_ = orb->cutoffs();
+    }
+#endif
 
     // needs reconstructions in future
     // global parameters, need to be removed in future
@@ -64,6 +71,9 @@ void DFTU::init(UnitCell& cell, // unitcell class
 
     this->locale.resize(cell.nat);
     this->locale_save.resize(cell.nat);
+    // only for PW base
+    this->eff_pot_pw_index.resize(cell.nat);
+    int pot_index = 0;
 
     this->iatlnmipol2iwt.resize(cell.nat);
 
@@ -80,6 +90,10 @@ void DFTU::init(UnitCell& cell, // unitcell class
             locale[iat].resize(cell.atoms[it].nwl + 1);
             locale_save[iat].resize(cell.atoms[it].nwl + 1);
 
+            const int tlp1_npol = (this->orbital_corr[it]*2+1)*npol;
+            this->eff_pot_pw_index[iat] = pot_index;
+            pot_index += tlp1_npol * tlp1_npol;
+
             for (int l = 0; l <= cell.atoms[it].nwl; l++)
             {
                 const int N = cell.atoms[it].l_nchi[l];
@@ -143,6 +157,8 @@ void DFTU::init(UnitCell& cell, // unitcell class
             }
         }
     }
+    // allocate memory for eff_pot_pw
+    this->eff_pot_pw.resize(pot_index, 0.0);
 
     if (Yukawa)
     {
@@ -209,6 +225,8 @@ void DFTU::init(UnitCell& cell, // unitcell class
     return;
 }
 
+#ifdef __LCAO
+
 void DFTU::cal_energy_correction(const UnitCell& ucell,
                                  const int istep)
 {
@@ -360,6 +378,8 @@ void DFTU::cal_energy_correction(const UnitCell& ucell,
     return;
 }
 
+#endif
+
 void DFTU::uramping_update()
 {
     // if uramping < 0.1, use the original U
@@ -392,6 +412,8 @@ bool DFTU::u_converged()
     return true;
 }
 
+#ifdef __LCAO
+
 void DFTU::set_dmr(const elecstate::DensityMatrix<std::complex<double>, double>* dmr)
 {
     this->dm_in_dftu_cd = dmr;
@@ -443,4 +465,7 @@ void dftu_cal_occup_m(const int iter,
 {
     GlobalC::dftu.cal_occup_m_k(iter,ucell, dm, kv, mixing_beta, p_ham);
 }
+
+#endif
+
 } // namespace ModuleDFTU
diff --git a/source/module_hamilt_lcao/module_dftu/dftu.h b/source/module_hamilt_lcao/module_dftu/dftu.h
index 9543ae6e55..68aae44516 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu.h
+++ b/source/module_hamilt_lcao/module_dftu/dftu.h
@@ -8,12 +8,14 @@
 #include "module_cell/klist.h"
 #include "module_cell/unitcell.h"
 #include "module_basis/module_ao/parallel_orbitals.h"
+#ifdef __LCAO
 #include "module_elecstate/module_charge/charge_mixing.h"
 #include "module_hamilt_general/hamilt.h"
 #include "module_elecstate/elecstate.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
 #include "module_elecstate/module_dm/density_matrix.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/force_stress_arrays.h" // mohan add 2024-06-15
+#endif
 
 #include <string>
 #include <vector>
@@ -40,9 +42,13 @@ class DFTU
     // allocate relevant data strcutures
     void init(UnitCell& cell, // unitcell class
               const Parallel_Orbitals* pv,
-              const int& nks,
-              const LCAO_Orbitals& orb
+              const int nks
+#ifdef __LCAO
+              , const LCAO_Orbitals* orb = nullptr
+#endif
               );
+    
+    static DFTU* get_instance();
 
     // calculate the energy correction
     void cal_energy_correction(const UnitCell& ucell, const int istep);
@@ -65,13 +71,16 @@ class DFTU
     // FIXME: the following variable does not have static lifetime;
     // while the present class is used via a global variable. This has
     // potential to cause dangling pointer issues.
+#ifdef __LCAO
     const LCAO_Orbitals* ptr_orb_ = nullptr;
     std::vector<double> orb_cutoff_;
+#endif
     
     // transform between iwt index and it, ia, L, N and m index
     std::vector<std::vector<std::vector<std::vector<std::vector<int>>>>>
         iatlnmipol2iwt; // iatlnm2iwt[iat][l][n][m][ipol]
 
+#ifdef __LCAO
     //=============================================================
     // In dftu_hamilt.cpp
     // For calculating contribution to Hamiltonian matrices
@@ -81,6 +90,7 @@ class DFTU
     void cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector<int>& isk, const double* sk);
     void cal_eff_pot_mat_R_double(const int ispin, double* SR, double* HR);
     void cal_eff_pot_mat_R_complex_double(const int ispin, std::complex<double>* SR, std::complex<double>* HR);
+#endif
 
     //=============================================================
     // In dftu_occup.cpp
@@ -88,6 +98,16 @@ class DFTU
     // and other operations of locale: copy,zero out,mix
     //=============================================================
   public:
+    /// interface for PW base
+    /// calculate the local occupation number matrix for PW based wave functions; psi_in is a type-erased psi::Psi pointer, and eff_pot_pw and EU are refreshed as well
+    void cal_occ_pw(const int iter, const void* psi_in, const ModuleBase::matrix& wg_in, const UnitCell& cell, const double& mixing_beta);
+    /// calculate the local DFT+U effective potential matrix for PW base (its definition in dftu_pw.cpp is currently empty)
+    void cal_VU_pot_pw(const int spin);
+    /// get effective potential matrix for PW base: pointer to the block of atom iat, which starts at eff_pot_pw[eff_pot_pw_index[iat]]
+    const std::complex<double>* get_eff_pot_pw(const int iat) const { return &(eff_pot_pw[this->eff_pot_pw_index[iat]]); }
+    int get_size_eff_pot_pw() const { return eff_pot_pw.size(); } // total number of elements stored in eff_pot_pw
+
+#ifdef __LCAO
     // calculate the local occupation number matrix
     void cal_occup_m_k(const int iter, 
                        const UnitCell& ucell,
@@ -100,6 +120,7 @@ class DFTU
                            const std::vector<std::vector<double>>& dm_gamma, 
                            const double& mixing_beta, 
                            hamilt::Hamilt<double>* p_ham);
+#endif
 
     // dftu can be calculated only after locale has been initialed
     bool initialed_locale = false;
@@ -109,12 +130,16 @@ class DFTU
     void zero_locale(const UnitCell& ucell);
     void mix_locale(const UnitCell& ucell,const double& mixing_beta);
 
+    std::vector<std::complex<double>> eff_pot_pw; // PW effective potential of all atoms in one flat array, sized in init()
+    std::vector<int> eff_pot_pw_index; // eff_pot_pw_index[iat]: offset of the block of atom iat inside eff_pot_pw
+
 public:
     // local occupancy matrix of the correlated subspace
     // locale: the out put local occupation number matrix of correlated electrons in the current electronic step
     // locale_save: the input local occupation number matrix of correlated electrons in the current electronic step
     std::vector<std::vector<std::vector<std::vector<ModuleBase::matrix>>>> locale; // locale[iat][l][n][spin](m1,m2)
     std::vector<std::vector<std::vector<std::vector<ModuleBase::matrix>>>> locale_save; // locale_save[iat][l][n][spin](m1,m2)
+#ifdef __LCAO
 private:
     //=============================================================
     // In dftu_tools.cpp
@@ -224,6 +249,7 @@ class DFTU
 			double* dh_r,
 			const double* rho_VU, 
 			ModuleBase::matrix& stress_dftu);
+#endif
 
     //=============================================================
     // In dftu_io.cpp
@@ -261,6 +287,7 @@ class DFTU
     double spherical_Bessel(const int k, const double r, const double lambda);
     double spherical_Hankel(const int k, const double r, const double lambda);
 
+#ifdef __LCAO
   public:
     /**
      * @brief get the density matrix of target spin
@@ -278,8 +305,10 @@ class DFTU
   private:
     const elecstate::DensityMatrix<double, double>* dm_in_dftu_d = nullptr;
     const elecstate::DensityMatrix<std::complex<double>, double>* dm_in_dftu_cd = nullptr;
+#endif
 };
 
+#ifdef __LCAO
 template <typename T>
 void dftu_cal_occup_m(const int iter,
                       const UnitCell& ucell,
@@ -287,6 +316,7 @@ void dftu_cal_occup_m(const int iter,
                       const K_Vectors& kv,
                       const double& mixing_beta,
                       hamilt::Hamilt<T>* p_ham);
+#endif
 
 } // namespace ModuleDFTU
 
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp b/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp
index 84ed194008..aac95478d7 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp
@@ -1,3 +1,4 @@
+#ifdef __LCAO
 #include "dftu.h"
 #include "module_base/timer.h"
 #include "module_parameter/parameter.h"
@@ -305,3 +306,4 @@ void DFTU::folding_matrix_k_new(const int ik,
     
 
 } // namespace ModuleDFTU
+#endif // __LCAO
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_force.cpp b/source/module_hamilt_lcao/module_dftu/dftu_force.cpp
index b24aa09865..3ab4ef2496 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_force.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_force.cpp
@@ -3,6 +3,7 @@
 #include "module_parameter/parameter.h"
 // DATE : 2019-12-10
 //==========================================================
+#ifdef __LCAO
 #include "dftu.h"
 #include "module_base/constants.h"
 #include "module_base/global_function.h"
@@ -665,3 +666,4 @@ void DFTU::cal_stress_gamma(const UnitCell& ucell,
     return;
 }
 } // namespace ModuleDFTU
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp b/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp
index c350aef529..90d781ae9c 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp
@@ -7,6 +7,7 @@
 namespace ModuleDFTU
 {
 
+#ifdef __LCAO
 void DFTU::cal_eff_pot_mat_complex(const int ik, std::complex<double>* eff_pot, const std::vector<int>& isk, const std::complex<double>* sk)
 {
     ModuleBase::TITLE("DFTU", "cal_eff_pot_mat");
@@ -167,4 +168,5 @@ void DFTU::cal_eff_pot_mat_R_complex_double(const int ispin, std::complex<double
     return;
 }
 
+#endif
 }
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp b/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp
index c51fbfbf70..d4efeed426 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_occup.cpp
@@ -2,7 +2,9 @@
 #include "module_base/timer.h"
 #include "module_parameter/parameter.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
+#ifdef __LCAO
 #include "module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.h"
+#endif
 
 extern "C"
 {
@@ -142,6 +144,8 @@ void DFTU::mix_locale(const UnitCell& ucell,
     ModuleBase::timer::tick("DFTU", "mix_locale");
 }
 
+#ifdef __LCAO
+
 void DFTU::cal_occup_m_k(const int iter, 
                          const UnitCell& ucell,
                          const std::vector<std::vector<std::complex<double>>>& dm_k,
@@ -519,4 +523,5 @@ void DFTU::cal_occup_m_gamma(const int iter,
     ModuleBase::timer::tick("DFTU", "cal_occup_m_gamma");
     return;
 }
+#endif
 } // namespace ModuleDFTU
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_pw.cpp b/source/module_hamilt_lcao/module_dftu/dftu_pw.cpp
new file mode 100644
index 0000000000..cc0c3a6c30
--- /dev/null
+++ b/source/module_hamilt_lcao/module_dftu/dftu_pw.cpp
@@ -0,0 +1,167 @@
+#include "dftu.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_base/parallel_reduce.h"
+#include "module_parameter/parameter.h"
+#include "module_base/timer.h"
+
+namespace ModuleDFTU
+{
+namespace
+{
+/// Accumulate the on-site occupation of the correlated shell of every atom into `locale`.
+/// For each k-point and band the projections (becp) of both spinor components are
+/// combined into four (2l+1)x(2l+1) blocks stored back to back in locale[iat][l][0][0]:
+/// (uu+dd).real, (ud+du).real, (ud-du).imag and (uu-dd).real, i.e. the Pauli representation.
+/// The CPU and GPU paths share this code and differ only in the `Device` template argument.
+/// NOTE(review): the spinor stride is hard-coded to 2, so this presumably expects npol == 2.
+template <typename Device, typename OrbitalCorr, typename Locale>
+void accumulate_occ_pw(const void* psi_in,
+                       const ModuleBase::matrix& wg_in,
+                       const UnitCell& cell,
+                       const OrbitalCorr& orbital_corr,
+                       Locale& locale)
+{
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    const auto* psi_p = static_cast<const psi::Psi<std::complex<double>, Device>*>(psi_in);
+    const int nbands = psi_p->get_nbands();
+    for (int ik = 0; ik < psi_p->get_nk(); ik++)
+    {
+        psi_p->fix_k(ik);
+        onsite_p->tabulate_atomic(ik);
+
+        onsite_p->overlap_proj_psi(nbands * psi_p->npol, psi_p->get_pointer());
+        const std::complex<double>* becp = onsite_p->get_h_becp();
+        // becp is indexed as (nbands * npol, nkb)
+        const int nkb = onsite_p->get_size_becp() / nbands / psi_p->npol;
+        int begin_ih = 0; // offset of the projectors of atom iat inside one row of becp
+        for (int iat = 0; iat < cell.nat; iat++)
+        {
+            const int nh = onsite_p->get_nh(iat);
+            const int target_l = orbital_corr[cell.iat2it[iat]];
+            if (target_l != -1)
+            {
+                // the projectors of angular momentum l sit at m = l^2, ..., (l+1)^2 - 1
+                const int m_begin = target_l * target_l;
+                const int tlp1 = 2 * target_l + 1;
+                const int tlp1_2 = tlp1 * tlp1;
+                double* const occ_mat = locale[iat][target_l][0][0].c;
+                for (int ib = 0; ib < nbands; ib++)
+                {
+                    const double weight = wg_in(ik, ib);
+                    const std::complex<double>* up = becp + ib * 2 * nkb + begin_ih + m_begin;
+                    const std::complex<double>* dn = up + nkb;
+                    int ind_m1m2 = 0;
+                    for (int m1 = 0; m1 < tlp1; m1++)
+                    {
+                        for (int m2 = 0; m2 < tlp1; m2++)
+                        {
+                            const std::complex<double> uu = weight * std::conj(up[m1]) * up[m2];
+                            const std::complex<double> ud = weight * std::conj(up[m1]) * dn[m2];
+                            const std::complex<double> du = weight * std::conj(dn[m1]) * up[m2];
+                            const std::complex<double> dd = weight * std::conj(dn[m1]) * dn[m2];
+                            occ_mat[ind_m1m2] += (uu + dd).real();
+                            occ_mat[ind_m1m2 + tlp1_2] += (ud + du).real();
+                            occ_mat[ind_m1m2 + 2 * tlp1_2] += (ud - du).imag();
+                            occ_mat[ind_m1m2 + 3 * tlp1_2] += (uu - dd).real();
+                            ind_m1m2++;
+                        }
+                    }
+                }
+            }
+            begin_ih += nh;
+        }
+    }
+}
+} // namespace
+
+/// Return the address of the global DFT+U object GlobalC::dftu.
+DFTU* DFTU::get_instance()
+{
+    return &GlobalC::dftu;
+}
+
+/// Calculate the occupation matrices for DFT+U in the plane-wave basis.
+/// Calls copy_locale and zero_locale, accumulates the projections of psi_in weighted by wg_in,
+/// sums them over all k-pools, rebuilds eff_pot_pw and EU, and finally mixes locale if requested.
+/// @param iter , the iteration number (not used inside this function)
+/// @param psi_in , psi::Psi<std::complex<double>, Device>*; CPU if PARAM.inp.device is "cpu", else GPU (CUDA/ROCm builds only)
+/// @param wg_in , the weights, indexed as wg_in(ik, ib)
+/// @param cell , the unit cell
+/// @param mixing_beta , the mixing parameter passed on to mix_locale
+void DFTU::cal_occ_pw(const int iter, const void* psi_in, const ModuleBase::matrix& wg_in, const UnitCell& cell, const double& mixing_beta)
+{
+    ModuleBase::timer::tick("DFTU", "cal_occ_pw");
+    this->copy_locale(cell);
+    this->zero_locale(cell);
+
+    if (PARAM.inp.device == "cpu")
+    {
+        accumulate_occ_pw<base_device::DEVICE_CPU>(psi_in, wg_in, cell, this->orbital_corr, this->locale);
+    }
+#if defined(__CUDA) || defined(__ROCM)
+    else
+    {
+        accumulate_occ_pw<base_device::DEVICE_GPU>(psi_in, wg_in, cell, this->orbital_corr, this->locale);
+    }
+#endif
+
+    this->EU = 0.0;
+    for (int iat = 0; iat < cell.nat; iat++)
+    {
+        const int it = cell.iat2it[iat];
+        const int target_l = this->orbital_corr[it];
+        if (target_l == -1)
+        {
+            continue;
+        }
+        const int m_size = 2 * target_l + 1;
+        const int size = m_size * m_size;
+        double* const occ = this->locale[iat][target_l][0][0].c;
+        // sum the occupation matrix of this atom over all k-pools
+        Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, PARAM.globalv.nproc_in_pool, occ, size * PARAM.inp.nspin);
+
+        // effective potential U * (delta - n^T) per Pauli component; delta only enters component 0
+        const double u_value = this->U[it];
+        std::complex<double>* vu_iat = &(this->eff_pot_pw[this->eff_pot_pw_index[iat]]);
+        for (int is = 0; is < 4; ++is)
+        {
+            const int start = is * size;
+            for (int m1 = 0; m1 < m_size; m1++)
+            {
+                for (int m2 = 0; m2 < m_size; m2++)
+                {
+                    const double delta = (is == 0 && m1 == m2) ? 1.0 : 0.0;
+                    vu_iat[start + m1 * m_size + m2] = u_value * (delta - occ[start + m2 * m_size + m1]);
+                    this->EU += u_value * 0.25 * occ[start + m2 * m_size + m1] * occ[start + m1 * m_size + m2];
+                }
+            }
+        }
+        // transfer from Pauli matrix representation to spin representation
+        for (int m1m2 = 0; m1m2 < size; m1m2++)
+        {
+            const std::complex<double> v0 = vu_iat[m1m2];
+            const std::complex<double> v1 = vu_iat[m1m2 + size];
+            const std::complex<double> v2 = vu_iat[m1m2 + size * 2];
+            const std::complex<double> v3 = vu_iat[m1m2 + size * 3];
+            const std::complex<double> iv2 = std::complex<double>(0.0, 1.0) * v2;
+            vu_iat[m1m2] = 0.5 * (v0 + v3);
+            vu_iat[m1m2 + size * 3] = 0.5 * (v0 - v3);
+            vu_iat[m1m2 + size] = 0.5 * (v1 + iv2);
+            vu_iat[m1m2 + size * 2] = 0.5 * (v1 - iv2);
+        }
+    }
+
+    if (mixing_dftu && initialed_locale)
+    {
+        this->mix_locale(cell, mixing_beta);
+    }
+    ModuleBase::timer::tick("DFTU", "cal_occ_pw");
+}
+
+/// calculate the local DFT+U effective potential matrix for PW base.
+/// NOTE: empty placeholder; `spin` is unused and eff_pot_pw is filled by cal_occ_pw instead.
+void DFTU::cal_VU_pot_pw(const int spin)
+{
+}
+
+} // namespace ModuleDFTU
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp b/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp
index 96582ee6aa..363c84da89 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp
@@ -6,6 +6,7 @@
 namespace ModuleDFTU
 {
 
+#ifdef __LCAO
 void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::complex<double>* VU)
 {
     ModuleBase::TITLE("DFTU", "cal_VU_pot_mat_complex");
@@ -203,4 +204,5 @@ double DFTU::get_onebody_eff_pot(const int T,
 
     return VU;
 }
+#endif
 } // namespace ModuleDFTU
\ No newline at end of file
diff --git a/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp b/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp
index cdc83dc867..a2c3dd2973 100644
--- a/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp
+++ b/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp
@@ -1,5 +1,6 @@
 //==========================================================
 // Author:Xin Qu
+#ifdef __LCAO
 #include "module_parameter/parameter.h"
 // DATE : 2019-12-10
 //==========================================================
@@ -281,3 +282,5 @@ double DFTU::spherical_Hankel(const int k, const double r, const double lambda)
 }
 
 } // namespace ModuleDFTU
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
index 18e6518a8d..9e797f3744 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
+++ b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt
@@ -8,11 +8,13 @@ list(APPEND objects
     operator_pw/meta_pw.cpp
     operator_pw/velocity_pw.cpp
     operator_pw/operator_pw.cpp
+    operator_pw/onsite_proj_pw.cpp
     forces_nl.cpp
     forces_cc.cpp
     forces_scc.cpp
     forces.cpp
     forces_us.cpp
+    forces_onsite.cpp
     stress_func_cc.cpp
     stress_func_ewa.cpp
     stress_func_gga.cpp
@@ -22,6 +24,7 @@ list(APPEND objects
     stress_func_loc.cpp
     stress_func_nl.cpp
     stress_func_us.cpp
+    stress_func_onsite.cpp
     stress_pw.cpp
     VL_in_pw.cpp
     VNL_in_pw.cpp
@@ -35,6 +38,8 @@ list(APPEND objects
     fs_nonlocal_tools.cpp
     fs_kin_tools.cpp
     radial_proj.cpp
+    onsite_projector.cpp 
+    onsite_proj_tools.cpp
 )
 
 add_library(
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
index f9c6a63556..c1fcd2299c 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp
@@ -50,6 +50,7 @@ void Forces<FPTYPE, Device>::cal_force(const UnitCell& ucell,
     ModuleBase::matrix forcenl(nat, 3);
     ModuleBase::matrix forcescc(nat, 3);
     ModuleBase::matrix forcepaw(nat, 3);
+    ModuleBase::matrix forceonsite(nat, 3);
 
     // Force due to local ionic potential
     // For PAW, calculated together in paw_cell.calculate_force
@@ -156,6 +157,11 @@ void Forces<FPTYPE, Device>::cal_force(const UnitCell& ucell,
             }
 #endif
         }
+        // DFT+U and DeltaSpin: on-site projector force, using the ucell passed to cal_force
+        if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+        {
+            this->cal_force_onsite(forceonsite, wg, wfc_basis, ucell, psi_in);
+        }
     }
 
     // non-linear core correction
@@ -317,6 +323,11 @@ void Forces<FPTYPE, Device>::cal_force(const UnitCell& ucell,
                     force(iat, ipol) = force(iat, ipol) + forcesol(iat, ipol);
                 }
 
+                if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+                {
+                    force(iat, ipol) += forceonsite(iat, ipol);
+                }
+
                 sum += force(iat, ipol);
 
                 iat++;
@@ -457,6 +468,14 @@ void Forces<FPTYPE, Device>::cal_force(const UnitCell& ucell,
                                   forcesol,
                                   false);
         }
+        if (PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+        {
+            ModuleIO::print_force(GlobalV::ofs_running,
+                                  ucell,
+                                  "ONSITE_PROJ    FORCE (eV/Angstrom)",
+                                  forceonsite,
+                                  false);
+        }
     }
     ModuleIO::print_force(GlobalV::ofs_running, ucell, "TOTAL-FORCE (eV/Angstrom)", force, false);
     ModuleBase::timer::tick("Forces", "cal_force");
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h
index c23f24f53e..90b419199d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/forces.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h
@@ -82,6 +82,17 @@ class Forces
                       const pseudopot_cell_vnl& nlpp_in,
                       const UnitCell& ucell_in,
                       const psi::Psi<std::complex<FPTYPE>, Device>* psi_in = nullptr);
+    /// @brief atomic force for DFT+U and DeltaSpin
+    /// @param force_onsite , the output atomic force
+    /// @param wg , the weights, indexed as wg(ik, iband)
+    /// @param wfc_basis , the plane wave basis; nothing is computed if it is nullptr
+    /// @param ucell_in , the unit cell
+    /// @param psi_in , the wave function; nothing is computed if it is nullptr
+    void cal_force_onsite(ModuleBase::matrix& force_onsite,
+                      const ModuleBase::matrix& wg,
+                      const ModulePW::PW_Basis_K* wfc_basis,
+                      const UnitCell& ucell_in,
+                      const psi::Psi<std::complex<FPTYPE>, Device>* psi_in = nullptr);
     void cal_force_scc(ModuleBase::matrix& forcescc,
                        ModulePW::PW_Basis* rho_basis,
                        const ModuleBase::matrix& v_current,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
new file mode 100644
index 0000000000..240187b3ba
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp
@@ -0,0 +1,79 @@
+#include "forces.h"
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "module_parameter/parameter.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
+
+template <typename FPTYPE, typename Device>
+void Forces<FPTYPE, Device>::cal_force_onsite(ModuleBase::matrix& force_onsite,
+                                              const ModuleBase::matrix& wg,
+                                              const ModulePW::PW_Basis_K* wfc_basis,
+                                              const UnitCell& ucell_in,
+                                              const psi::Psi<std::complex<FPTYPE>, Device>* psi_in)
+{
+    // Accumulate the DFT+U and/or DeltaSpin onsite-projector force over all k points into force_onsite.
+    ModuleBase::TITLE("Forces", "cal_force_onsite");
+    // nothing is computed without a basis and wave functions; psi_in is only checked here, never read
+    if(psi_in == nullptr || wfc_basis == nullptr)
+    {
+        return;
+    }
+    ModuleBase::timer::tick("Forces", "cal_force_onsite");
+
+    // allocate a zero-initialized buffer on this->ctx holding 3 force components per atom
+    FPTYPE* force = nullptr;
+    resmem_var_op()(this->ctx, force, ucell_in.nat * 3);
+    base_device::memory::set_memory_op<FPTYPE, Device>()(this->ctx, force, 0.0, ucell_in.nat * 3);
+
+    auto* onsite_p = projectors::OnsiteProjector<FPTYPE, Device>::get_instance();
+
+    const int nks = wfc_basis->nks;
+    for (int ik = 0; ik < nks; ik++) // loop k points
+    {
+        // skip trailing bands with zero weight to speed up
+        int nbands_occ = wg.nc;
+        // test the bound first so that wg(ik, -1) is never read, e.g. when wg.nc == 0
+        while (nbands_occ > 0 && wg(ik, nbands_occ - 1) == 0.0)
+        {
+            nbands_occ--;
+        }
+        const int npm = nbands_occ;
+        // calculate becp = <psi|beta> for all beta functions
+        onsite_p->get_fs_tools()->cal_becp(ik, npm);
+        for (int ipol = 0; ipol < 3; ipol++)
+        {
+            // calculate dbecp = <psi|\nabla beta> for all beta functions
+            onsite_p->get_fs_tools()->cal_dbecp_f(ik, npm, ipol);
+        }
+        // calculate the force_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i
+        // force for DFT+U
+        if(PARAM.inp.dft_plus_u)
+        {
+            auto* dftu = ModuleDFTU::DFTU::get_instance();
+            onsite_p->get_fs_tools()->cal_force_dftu(ik, npm, force, dftu->orbital_corr.data(), dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw(), wg.c);
+        }
+        // force for DeltaSpin, driven by the current constraint multipliers lambda
+        if(PARAM.inp.sc_mag_switch)
+        {
+            spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+            const std::vector<ModuleBase::Vector3<double>>& lambda = sc.get_sc_lambda();
+            onsite_p->get_fs_tools()->cal_force_dspin(ik, npm, force, lambda.data(), wg.c);
+        }
+    } // end ik
+
+    // copy the buffer into force_onsite (assumed to be sized nat x 3 by the caller - TODO confirm), then free it
+    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_onsite.c, force, force_onsite.nr * force_onsite.nc);
+    delmem_var_op()(this->ctx, force);
+    // sum up force_onsite from all processors
+    Parallel_Reduce::reduce_all(force_onsite.c, force_onsite.nr * force_onsite.nc);
+
+    ModuleBase::timer::tick("Forces", "cal_force_onsite");
+}
+
+template class Forces<double, base_device::DEVICE_CPU>; // explicit instantiation for the CPU device
+#if ((defined __CUDA) || (defined __ROCM))
+template class Forces<double, base_device::DEVICE_GPU>; // explicit instantiation for the GPU device (CUDA/ROCm builds only)
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
index b219678f4c..810b313292 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp
@@ -216,8 +216,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb(const int& ik, const int& nbdall
                     hd_vq);
 
         // prepare（-i）^l, size: nh
-        std::vector<std::complex<double>> pref = maths.cal_pref(it);
-        const int nh = pref.size();
+        const int nh = this->ucell_->atoms[it].ncpp.nh;
+        std::vector<std::complex<double>> pref = maths.cal_pref(it, nh);
         this->dvkb_indexes.resize(nh * 4);
         maths.cal_dvkb_index(this->ucell_->atoms[it].ncpp.nbeta,
                              this->nlpp_->nhtol.c,
@@ -369,8 +369,8 @@ void FS_Nonlocal_tools<FPTYPE, Device>::cal_vkb_deri_s(const int& ik,
                          hd_vq_deri);
 
         // prepare（-i）^l, size: nh
-        std::vector<std::complex<double>> pref = maths.cal_pref(it);
-        int nh = pref.size();
+        const int nh = this->ucell_->atoms[it].ncpp.nh;
+        std::vector<std::complex<double>> pref = maths.cal_pref(it, nh);
         // prepare indexes for calculate vkb_deri
         this->dvkb_indexes.resize(nh * 4);
         maths.cal_dvkb_index(this->ucell_->atoms[it].ncpp.nbeta,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp
index 6272675398..7fe256b23d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp
@@ -10,6 +10,7 @@
 #include "operator_pw/ekinetic_pw.h"
 #include "operator_pw/meta_pw.h"
 #include "operator_pw/nonlocal_pw.h"
+#include "operator_pw/onsite_proj_pw.h"
 
 #ifdef USE_PAW
 #include "module_cell/module_paw/paw_cell.h"
@@ -114,6 +115,12 @@ HamiltPW<T, Device>::HamiltPW(elecstate::Potential* pot_in,
             this->ops->add(nonlocal);
         }
     }
+    if(PARAM.inp.sc_mag_switch || PARAM.inp.dft_plus_u)
+    {
+        Operator<T, Device>* onsite_proj
+            = new OnsiteProj<OperatorPW<T, Device>>(isk, &GlobalC::ucell, PARAM.inp.sc_mag_switch, (PARAM.inp.dft_plus_u>0));
+        this->ops->add(onsite_proj);
+    }
     return;
 }
 
@@ -192,6 +199,17 @@ HamiltPW<T, Device>::HamiltPW(const HamiltPW<T_in, Device_in> *hamilt)
                 this->ops->add(meta);
             }
         }
+        else if (node->classname == "OnsiteProj") {
+            Operator<T, Device>* onsite_proj =
+                    new OnsiteProj<OperatorPW<T, Device>>(
+                            reinterpret_cast<const OnsiteProj<OperatorPW<T_in, Device_in>>*>(node));
+            if(this->ops == nullptr) {
+                this->ops = onsite_proj;
+            }
+            else {
+                this->ops->add(onsite_proj);
+            }
+        }
         else {
             ModuleBase::WARNING_QUIT("HamiltPW", "Unrecognized Operator type!");
         }
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
index 991a81e746..5d0656d105 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu
@@ -306,6 +306,209 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     cudaCheckOnDebug();
 }
 
+template <typename FPTYPE> // DFT+U onsite force kernel: each block handles one (band, atom type) pair
+__global__ void cal_force_onsite(int wg_nc,      // row length of d_wg
+                                  int ntype,      // number of atom types; blockIdx.x = ib * ntype + it
+                                  int forcenl_nc, // row length of force (entries per atom)
+                                  int nbands,     // sets the stride between directions in dbecp
+                                  int ik,         // row of d_wg to read
+                                  int nkb,        // row length of becp/dbecp; each band owns two consecutive rows
+                                  const int* atom_nh, // projectors per atom, by type
+                                  const int* atom_na, // atoms per type
+                                  FPTYPE tpiba,       // real-valued prefactor entering fac; must not be an integer type
+                                  const FPTYPE* d_wg,
+                                  const thrust::complex<FPTYPE>* vu, // 4 blocks of tlp1 * tlp1 entries per atom
+                                  const int* orbital_corr,           // per-type l; -1 makes the block return at once
+                                  const thrust::complex<FPTYPE>* becp,
+                                  const thrust::complex<FPTYPE>* dbecp,
+                                  FPTYPE* force) // output, accumulated with atomicAdd
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+    if (orbital_corr[it] == -1)
+        return;
+    const int orbital_l = orbital_corr[it];
+    const int ip_begin = orbital_l * orbital_l;
+    const int tlp1 = 2 * orbital_l + 1;
+    const int tlp1_2 = tlp1 * tlp1;
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++)
+    {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+        vu += 4 * tlp1_2 * atom_na[ii]; // NOTE(review): steps by this type's tlp1_2 for every earlier type - confirm vu layout
+    }
+
+    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x)
+        {
+            const int m1 = mm / tlp1;
+            const int m2 = mm % tlp1;
+            const int ip1 = ip_begin + m1;
+            const int ip2 = ip_begin + m2;
+            const int inkb1 = sum + ip1 + ib2 * nkb;
+            const int inkb2 = sum + ip2 + ib2 * nkb;
+            thrust::complex<FPTYPE> ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]};
+            // one contribution per direction ipol for this (m1, m2) pair
+            for (int ipol = 0; ipol < 3; ipol++)
+            {
+                const int inkb0 = ipol * nbands * 2 * nkb + inkb1;
+                const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb0]) * becp[inkb2];
+                const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb0]) * becp[inkb2 + nkb];
+                const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb2];
+                const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb2 + nkb];
+                const FPTYPE tmp = -fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+            }
+        }
+        ++iat;
+        sum += nprojs;
+        vu += 4 * tlp1_2;
+    } // ia
+}
+
+template <typename FPTYPE> // DeltaSpin onsite force kernel: each block handles one (band, atom type) pair
+__global__ void cal_force_onsite(int wg_nc,      // row length of d_wg
+                                 int ntype,      // number of atom types; blockIdx.x = ib * ntype + it
+                                 int forcenl_nc, // row length of force (entries per atom)
+                                 int nbands,     // sets the stride between directions in dbecp
+                                 int ik,         // row of d_wg to read
+                                 int nkb,        // row length of becp/dbecp; each band owns two consecutive rows
+                                 const int* atom_nh, // projectors per atom, by type
+                                 const int* atom_na, // atoms per type
+                                 FPTYPE tpiba,       // real-valued prefactor entering fac; must not be an integer type
+                                 const FPTYPE* d_wg,
+                                 const FPTYPE* lambda, // 3 values per atom, read as lambda[iat * 3 + {0, 1, 2}]
+                                 const thrust::complex<FPTYPE>* becp,
+                                 const thrust::complex<FPTYPE>* dbecp,
+                                 FPTYPE* force) // output, accumulated with atomicAdd
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++)
+    {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        const thrust::complex<FPTYPE> coefficients0(lambda[iat * 3 + 2], 0.0);
+        const thrust::complex<FPTYPE> coefficients1(lambda[iat * 3], lambda[iat * 3 + 1]);
+        const thrust::complex<FPTYPE> coefficients2(lambda[iat * 3], -1 * lambda[iat * 3 + 1]);
+        const thrust::complex<FPTYPE> coefficients3(-1 * lambda[iat * 3 + 2], 0.0);
+        for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x)
+        {
+            const int inkb = sum + ip + ib2 * nkb;
+            // one contribution per direction ipol for projector ip of this atom
+            for (int ipol = 0; ipol < 3; ipol++)
+            {
+                const int inkb0 = ipol * nbands * 2 * nkb + inkb;
+                const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb0]) * becp[inkb];
+                const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb0]) * becp[inkb + nkb];
+                const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb];
+                const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb + nkb];
+                const FPTYPE tmp
+                    = -fac
+                      * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3)
+                            .real();
+                atomicAdd(force + iat * forcenl_nc + ipol, tmp);
+            }
+        }
+        ++iat;
+        sum += nprojs;
+    } // ia
+}
+
+// host-side launcher of the DFT+U cal_force_onsite kernel; the grid has nbands_occ * ntype blocks
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                  const int& nbands_occ,
+                                                                  const int& wg_nc,
+                                                                  const int& ntype,
+                                                                  const int& forcenl_nc,
+                                                                  const int& nbands,
+                                                                  const int& ik,
+                                                                  const int& nkb,
+                                                                  const int* atom_nh,
+                                                                  const int* atom_na,
+                                                                  const FPTYPE& tpiba,
+                                                                  const FPTYPE* d_wg,
+                                                                  const std::complex<FPTYPE>* vu,
+                                                                  const int* orbital_corr,
+                                                                  const std::complex<FPTYPE>* becp,
+                                                                  const std::complex<FPTYPE>* dbecp,
+                                                                  FPTYPE* force)
+{
+    cal_force_onsite<FPTYPE>
+        <<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(wg_nc,
+                                                    ntype,
+                                                    forcenl_nc,
+                                                    nbands,
+                                                    ik,
+                                                    nkb,
+                                                    atom_nh,
+                                                    atom_na,
+                                                    tpiba,
+                                                    d_wg,
+                                                    reinterpret_cast<const thrust::complex<FPTYPE>*>(vu), // std::complex viewed as thrust::complex
+                                                    orbital_corr,
+                                                    reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+                                                    reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+                                                    force); // output, accumulated in place by the kernel
+
+    cudaCheckOnDebug();
+}
+// host-side launcher of the DeltaSpin cal_force_onsite kernel; the grid has nbands_occ * ntype blocks
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                                                                  const int& nbands_occ,
+                                                                  const int& wg_nc,
+                                                                  const int& ntype,
+                                                                  const int& forcenl_nc,
+                                                                  const int& nbands,
+                                                                  const int& ik,
+                                                                  const int& nkb,
+                                                                  const int* atom_nh,
+                                                                  const int* atom_na,
+                                                                  const FPTYPE& tpiba,
+                                                                  const FPTYPE* d_wg,
+                                                                  const FPTYPE* lambda,
+                                                                  const std::complex<FPTYPE>* becp,
+                                                                  const std::complex<FPTYPE>* dbecp,
+                                                                  FPTYPE* force)
+{
+    cal_force_onsite<FPTYPE>
+        <<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(wg_nc,
+                                                    ntype,
+                                                    forcenl_nc,
+                                                    nbands,
+                                                    ik,
+                                                    nkb,
+                                                    atom_nh,
+                                                    atom_na,
+                                                    tpiba,
+                                                    d_wg,
+                                                    lambda,
+                                                    reinterpret_cast<const thrust::complex<FPTYPE>*>(becp), // std::complex viewed as thrust::complex
+                                                    reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+                                                    force); // output, accumulated in place by the kernel
+
+    cudaCheckOnDebug();
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu
new file mode 100644
index 0000000000..ef54ff0605
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu
@@ -0,0 +1,134 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h"
+
+#include <base/macros/macros.h>
+#include <complex>
+#include <cuda_runtime.h>
+#include <thrust/complex.h>
+
+namespace hamilt
+{
+
+#define THREADS_PER_BLOCK 256
+
+template <typename FPTYPE> // lambda_coeff variant: one block per projector ip, threads stride over bands
+__global__ void onsite_op(const int npm,     // row length of ps
+                          const int npol,    // entries per band; the body writes two per band, i.e. assumes npol == 2
+                          const int* ip_iat, // atom index of each projector
+                          const int tnp,     // row length of becp
+                          const thrust::complex<FPTYPE>* lambda_coeff, // 4 coefficients per atom
+                          thrust::complex<FPTYPE>* ps,                 // output, updated with +=
+                          const thrust::complex<FPTYPE>* becp)
+{
+    const int ip = blockIdx.x;
+    const int nbands = npm / npol;
+    for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+    {
+        int ib2 = ib * npol;
+        int iat = ip_iat[ip];
+        const int psind = ip * npm + ib2;
+        const int becpind = ib2 * tnp + ip; // the second component of the band sits tnp entries further
+        ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp];
+        ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp];
+    }
+}
+
+template <typename FPTYPE> // vu variant: one block per projector ip, threads stride over bands
+__global__ void onsite_op(const int npm,           // row length of ps
+                          const int npol,          // entries per band; the body writes two per band, i.e. assumes npol == 2
+                          const int* orb_l_iat,    // l value used for each atom
+                          const int* ip_iat,       // atom index of each projector
+                          const int* ip_m,         // m index of each projector; negative entries are skipped
+                          const int* vu_begin_iat, // offset of each atom's data inside vu
+                          const int tnp,           // row length of becp
+                          const thrust::complex<FPTYPE>* vu, // per atom: 4 blocks of tlp1 * tlp1 entries
+                          thrust::complex<FPTYPE>* ps,       // output, updated with +=
+                          const thrust::complex<FPTYPE>* becp)
+{
+    const int ip = blockIdx.x;
+    int m1 = ip_m[ip];
+    if (m1 >= 0)
+    {
+        const int nbands = npm / npol;
+        for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+        {
+            int ib2 = ib * npol;
+            int iat = ip_iat[ip];
+            const thrust::complex<FPTYPE>* vu_iat = vu + vu_begin_iat[iat];
+            int orb_l = orb_l_iat[iat];
+            int tlp1 = 2 * orb_l + 1;
+            int tlp1_2 = tlp1 * tlp1;
+            int ip2_begin = ip - m1; // first projector of the tlp1-wide window that contains ip
+            int ip2_end = ip - m1 + tlp1;
+            const int psind = ip * npm + ib2;
+            for (int ip2 = ip2_begin; ip2 < ip2_end; ip2++)
+            {
+                const int becpind = ib2 * tnp + ip2;
+                int m2 = ip_m[ip2];
+                const int index_mm = m1 * tlp1 + m2;
+                ps[psind] += vu_iat[index_mm] * becp[becpind] + vu_iat[index_mm + tlp1_2 * 2] * becp[becpind + tnp];
+                ps[psind + 1] += vu_iat[index_mm + tlp1_2 * 1] * becp[becpind]
+                                 + vu_iat[index_mm + tlp1_2 * 3] * becp[becpind + tnp];
+            }
+        }
+    }
+}
+
+template <typename FPTYPE>
+void hamilt::onsite_ps_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
+                                                                       const int& npm,
+                                                                       const int npol,
+                                                                       const int* ip_iat,
+                                                                       const int& tnp,
+                                                                       const std::complex<FPTYPE>* lambda_coeff,
+                                                                       std::complex<FPTYPE>* ps,
+                                                                       const std::complex<FPTYPE>* becp)
+{
+    // denghui implement 20221019
+    // host-side launcher of the lambda_coeff onsite_op kernel: tnp blocks of THREADS_PER_BLOCK threads
+    onsite_op<FPTYPE>
+        <<<tnp, THREADS_PER_BLOCK>>>(npm,
+                                     npol,
+                                     ip_iat,
+                                     tnp,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(lambda_coeff),
+                                     reinterpret_cast<thrust::complex<FPTYPE>*>(ps),          // output, updated in place
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(becp)); // input, read only
+
+    cudaCheckOnDebug();
+}
+
+template <typename FPTYPE>
+void hamilt::onsite_ps_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev,
+                                                                       const int& npm,
+                                                                       const int npol,
+                                                                       const int* orb_l_iat,
+                                                                       const int* ip_iat,
+                                                                       const int* ip_m,
+                                                                       const int* vu_begin_iat,
+                                                                       const int& tnp,
+                                                                       const std::complex<FPTYPE>* vu,
+                                                                       std::complex<FPTYPE>* ps,
+                                                                       const std::complex<FPTYPE>* becp)
+{
+    // denghui implement 20221109
+    // host-side launcher of the vu onsite_op kernel: tnp blocks of THREADS_PER_BLOCK threads
+    onsite_op<FPTYPE>
+        <<<tnp, THREADS_PER_BLOCK>>>(npm,
+                                     npol,
+                                     orb_l_iat,
+                                     ip_iat,
+                                     ip_m,
+                                     vu_begin_iat,
+                                     tnp,
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(vu),
+                                     reinterpret_cast<thrust::complex<FPTYPE>*>(ps),          // output, updated in place
+                                     reinterpret_cast<const thrust::complex<FPTYPE>*>(becp)); // input, read only
+
+    cudaCheckOnDebug();
+    // NOTE(review): cudaCheckOnDebug() presumably reports launch errors in debug builds only - confirm
+}
+
+template struct onsite_ps_op<float, base_device::DEVICE_GPU>;  // explicit instantiation, single precision
+template struct onsite_ps_op<double, base_device::DEVICE_GPU>; // explicit instantiation, double precision
+
+} // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
index 997827d669..b18e5c5160 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu
@@ -922,6 +922,187 @@ void synchronize_ptrs<base_device::DEVICE_GPU>::operator()(
     cudaMemcpy(ptr_out, ptr_in, sizeof(void*) * size, cudaMemcpyHostToDevice);
 }
 
+template <typename FPTYPE> // DFT+U onsite stress kernel: each block handles one (band, atom type) pair
+__global__ void cal_stress_onsite(
+        const int nkb,          // row length of becp/dbecp; each band owns two consecutive rows
+        const int ntype,        // number of atom types; blockIdx.x = ib * ntype + it
+        const int wg_nc,        // row length of d_wg
+        const int ik,           // row of d_wg to read
+        const int *atom_nh,     // projectors per atom, by type
+        const int *atom_na,     // atoms per type
+        const FPTYPE *d_wg,
+        const thrust::complex<FPTYPE> *vu, // 4 blocks of tlp1 * tlp1 entries per atom
+        const int* orbital_corr,           // per-type l; -1 makes the block return at once
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)         // single output value, accumulated with atomicAdd
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2  = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+    if(orbital_corr[it] == -1) return;
+    const int orbital_l = orbital_corr[it];
+    const int ip_begin = orbital_l * orbital_l;
+    const int tlp1 = 2 * orbital_l + 1;
+    const int tlp1_2 = tlp1 * tlp1;
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+        vu += 4 * tlp1_2 * atom_na[ii];// NOTE(review): steps by this type's tlp1_2 for every earlier type - confirm vu layout
+    }
+
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ik * wg_nc + ib];
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x) {
+            const int m1 = mm / tlp1;
+            const int m2 = mm % tlp1;
+            const int ip1 = ip_begin + m1;
+            const int ip2 = ip_begin + m2;
+            const int inkb1 = sum + ip1 + ib2 * nkb;
+            const int inkb2 = sum + ip2 + ib2 * nkb;
+            thrust::complex<FPTYPE> ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]};
+            // each thread sums its (m1, m2) contributions privately in stress_var
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[inkb2 + nkb];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb1 + nkb]) * becp[inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb1 + nkb]) * becp[inkb2 + nkb];
+            stress_var -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+        }
+        ++iat;
+        sum+=nprojs;
+        vu += 4 * tlp1_2;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) { // only the first lane of each warp adds to the global result
+        atomicAdd(stress, stress_var);
+    }
+}
+
+template <typename FPTYPE> // DeltaSpin onsite stress kernel: each block handles one (band, atom type) pair
+__global__ void cal_stress_onsite(
+        const int nkb,          // row length of becp/dbecp; each band owns two consecutive rows
+        const int ntype,        // number of atom types; blockIdx.x = ib * ntype + it
+        const int wg_nc,        // row length of d_wg
+        const int ik,           // row of d_wg to read
+        const int *atom_nh,     // projectors per atom, by type
+        const int *atom_na,     // atoms per type
+        const FPTYPE *d_wg,
+        const double* lambda,   // 3 values per atom, read as lambda[iat*3 + {0, 1, 2}]
+        const thrust::complex<FPTYPE> *becp,
+        const thrust::complex<FPTYPE> *dbecp,
+        FPTYPE *stress)         // single output value, accumulated with atomicAdd
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2  = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ik * wg_nc + ib];
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        const thrust::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0);
+        const thrust::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]);
+        const thrust::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]);
+        const thrust::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0);
+        for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x) {
+            const int inkb = sum + ip + ib2 * nkb;
+            // each thread sums its projector contributions privately in stress_var
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb]) * becp[inkb];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb]) * becp[inkb + nkb];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb + nkb]) * becp[inkb];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb + nkb]) * becp[inkb + nkb];
+            stress_var -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
+        }
+        ++iat;
+        sum+=nprojs;
+    }//ia
+    __syncwarp();
+    warp_reduce(stress_var);
+    if (threadIdx.x % WARP_SIZE == 0) { // only the first lane of each warp adds to the global result
+        atomicAdd(stress, stress_var);
+    }
+}
+
+// host-side launcher of the DFT+U cal_stress_onsite kernel; the grid has nbands_occ * ntype blocks
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const std::complex<FPTYPE>* vu,
+                    const int* orbital_corr,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+{
+    cal_stress_onsite<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             atom_nh,
+             atom_na,
+             d_wg,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(vu), // std::complex viewed as thrust::complex
+             orbital_corr,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// output, accumulated in place by the kernel
+
+    cudaCheckOnDebug();
+}
+// host-side launcher of the DeltaSpin cal_stress_onsite kernel; the grid has nbands_occ * ntype blocks
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const double* lambda,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+{
+    cal_stress_onsite<FPTYPE><<<nbands_occ * ntype, THREADS_PER_BLOCK>>>(
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             atom_nh,
+             atom_na,
+             d_wg,
+             lambda,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp), // std::complex viewed as thrust::complex
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// output, accumulated in place by the kernel
+
+    cudaCheckOnDebug();
+}
+
 template struct synchronize_ptrs<base_device::DEVICE_GPU>;
 
 template struct cal_stress_mgga_op<std::complex<float>, base_device::DEVICE_GPU>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
index 261c510efc..6d797e147d 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp
@@ -278,6 +278,149 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_CPU>
 #ifdef _OPENMP
         }
 #endif
+    };
+    // DFT+U force: force[iat] -= 2 * tpiba * wg * Re( sum_{m,m2} vu * conj(dbecp) * becp ) for every +U-corrected atom.
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,  // number of bands summed over
+                    const int& wg_nc,       // row length of d_wg
+                    const int& ntype,       // number of atom types
+                    const int& forcenl_nc,  // row length of force
+                    const int& nbands,      // band count entering the dbecp slab stride
+                    const int& ik,          // row of d_wg to read
+                    const int& nkb,         // stride between consecutive rows of becp / dbecp
+                    const int* atom_nh,     // projectors per atom, indexed by type
+                    const int* atom_na,     // atoms per type
+                    const FPTYPE& tpiba,    // scalar folded into the prefactor
+                    const FPTYPE* d_wg,     // weights, read as d_wg[ik * wg_nc + ib]
+                    const std::complex<FPTYPE>* vu,     // per corrected atom: 4 blocks of (2l+1)^2 entries
+                    const int* orbital_corr,            // per type: l of the corrected shell, or -1 for none
+                    const std::complex<FPTYPE>* becp,   // two rows (ib2, ib2 + 1) of nkb entries per band
+                    const std::complex<FPTYPE>* dbecp,  // three slabs (ipol) of nbands * 2 * nkb entries
+                    FPTYPE* force)                      // accumulated into; not zeroed here
+    {
+        int iat0 = 0; // first atom index of the current type
+        int sum0 = 0; // first projector index of the current type
+        for (int it = 0; it < ntype; it++)
+        {
+            const int orbital_l = orbital_corr[it];
+            const int nproj = atom_nh[it];
+            if(orbital_l == -1) // uncorrected type: nothing to add, but both offsets must still advance
+            {
+                sum0 += nproj * atom_na[it];
+                iat0 += atom_na[it]; // was missing: later types then wrote into the wrong rows of force
+                continue;
+            }
+            const int ip_begin = orbital_l * orbital_l; // projector range [l^2, (l+1)^2) of the corrected shell
+            const int ip_end = (orbital_l + 1) * (orbital_l + 1);
+            const int tlp1 = 2 * orbital_l + 1;
+            const int tlp1_2 = tlp1 * tlp1;
+            for (int ia = 0; ia < atom_na[it]; ia++)
+            {
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    const int ib2 = ib*2;
+                    FPTYPE local_force[3] = {0, 0, 0};
+                    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+                    int iat = iat0 + ia;
+                    int sum = sum0 + ia * nproj;
+                    for (int ip = ip_begin; ip < ip_end; ip++)
+                    {
+                        const int inkb = sum + ip;
+                        const int m = ip - ip_begin;
+                        for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
+                        {
+                            const int jnkb = sum + ip2;
+                            const int m2 = ip2 - ip_begin;
+                            std::complex<FPTYPE> ps[4]; // the (m, m2) element of each of the four vu blocks
+                            for(int i = 0; i < 4; i++)
+                            {
+                                ps[i] = vu[(i * tlp1_2 + m * tlp1 + m2)];
+                            }
+                            for (int ipol = 0; ipol < 3; ipol++)
+                            {
+                                const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                                const int index1 = ib2 * nkb + jnkb;
+                                const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                                const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                                const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                                const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+
+                                local_force[ipol] -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                            }
+                        }
+                    }
+                    for (int ipol = 0; ipol < 3; ++ipol)
+                    {
+                        force[iat * forcenl_nc + ipol] += local_force[ipol];
+                    }
+                }
+                vu += 4 * tlp1_2;// step for vu
+            } // end ia
+            iat0 += atom_na[it];
+            sum0 += atom_na[it] * nproj;
+        } // end it
+    };
+
+    void operator()(const base_device::DEVICE_CPU* ctx, // DeltaSpin force: adds each atom's lambda term to force
+                    const int& nbands_occ,  // number of bands summed over
+                    const int& wg_nc,       // row length of d_wg
+                    const int& ntype,       // number of atom types
+                    const int& forcenl_nc,  // row length of force
+                    const int& nbands,      // band count entering the dbecp slab stride
+                    const int& ik,          // row of d_wg to read
+                    const int& nkb,         // stride between consecutive rows of becp / dbecp
+                    const int* atom_nh,     // projectors per atom, indexed by type
+                    const int* atom_na,     // atoms per type
+                    const FPTYPE& tpiba,    // scalar folded into the prefactor
+                    const FPTYPE* d_wg,     // weights, read as d_wg[ik * wg_nc + ib]
+                    const FPTYPE* lambda,   // three reals per atom, read as lambda[iat*3 + 0..2]
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force)          // accumulated into; not zeroed here
+    {
+        int iat0 = 0; // first atom index of the current type
+        int sum0 = 0; // first projector index of the current type
+        for (int it = 0; it < ntype; it++)
+        {
+            const int nproj = atom_nh[it];
+            for (int ia = 0; ia < atom_na[it]; ia++)
+            {
+                int iat = iat0 + ia;
+                int sum = sum0 + ia * nproj;
+                const std::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0); // ( l2, 0 ); presumably l = (x, y, z) - TODO confirm
+                const std::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]); // l0 + i * l1
+                const std::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); // l0 - i * l1
+                const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0); // ( -l2, 0 )
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    const int ib2 = ib*2; // each band occupies two consecutive rows of becp / dbecp
+                    FPTYPE local_force[3] = {0, 0, 0};
+                    FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+                    for (int ip = 0; ip < nproj; ip++)
+                    {
+                        const int inkb = sum + ip;
+                        // Unlike the DFT+U overload, the same projector index is used for dbecp and becp.
+                        for (int ipol = 0; ipol < 3; ipol++)
+                        {
+                            const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb;
+                            const int index1 = ib2 * nkb + inkb;
+                            const std::complex<FPTYPE> dbb0 = conj(dbecp[index0]) * becp[index1];
+                            const std::complex<FPTYPE> dbb1 = conj(dbecp[index0]) * becp[index1 + nkb];
+                            const std::complex<FPTYPE> dbb2 = conj(dbecp[index0 + nkb]) * becp[index1];
+                            const std::complex<FPTYPE> dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb];
+                            // Weight the four row combinations by the coefficients and keep the real part only.
+                            local_force[ipol] -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
+                        }
+                    }//ip
+                    for (int ipol = 0; ipol < 3; ++ipol)
+                    {
+                        force[iat * forcenl_nc + ipol] += local_force[ipol];
+                    }
+                } // end ib
+            } // ia
+            iat0 += atom_na[it];
+            sum0 += atom_na[it] * nproj;
+        }//it
     }
 };
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
index b9aaa6d468..3aa5d4f87e 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h
@@ -112,6 +112,41 @@ struct cal_force_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    /// DFT+U overload: accumulates the on-site +U contribution of each corrected atom into `force`.
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const std::complex<FPTYPE>* vu, ///< +U potential entries, consumed atom by atom
+                    const int* orbital_corr, ///< per atom type: l of the corrected shell, -1 if none
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force); ///< output, accumulated with +=
+    /// DeltaSpin overload: accumulates the lambda-weighted on-site contribution of every atom into `force`.
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nbands_occ,
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* lambda, ///< three reals per atom, read as lambda[iat * 3 + 0..2]
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force); ///< output, accumulated with +=
 };
 
 #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
@@ -176,6 +211,41 @@ struct cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* force);
+    /// DFT+U overload (GPU): launches the on-site +U force kernel; results are added into `force`.
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ, ///< with ntype, sizes the launch grid
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const std::complex<FPTYPE>* vu, ///< +U potential entries, consumed atom by atom
+                    const int* orbital_corr, ///< per atom type: l of the corrected shell, -1 if none
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force); ///< output, accumulated by the kernel
+    /// DeltaSpin overload (GPU): launches the lambda-weighted on-site force kernel; results are added into `force`.
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nbands_occ, ///< with ntype, sizes the launch grid
+                    const int& wg_nc,
+                    const int& ntype,
+                    const int& forcenl_nc,
+                    const int& nbands,
+                    const int& ik,
+                    const int& nkb,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE& tpiba,
+                    const FPTYPE* d_wg,
+                    const FPTYPE* lambda, ///< three reals per atom, read as lambda[iat * 3 + 0..2]
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* force); ///< output, accumulated by the kernel
 };
 
 /**
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp
new file mode 100644
index 0000000000..1528af190c
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp
@@ -0,0 +1,87 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h"
+
+namespace hamilt
+{
+
+template <typename FPTYPE>
+struct onsite_ps_op<FPTYPE, base_device::DEVICE_CPU>
+{
+    // DeltaSpin kernel: ps(ip, band) += the atom's 4-entry lambda_array block applied to the band's two becp rows.
+    void operator()(const base_device::DEVICE_CPU* /*dev*/,
+                    const int& npm,    // row length of ps; npm / npol bands are processed
+                    const int npol,    // rows per band; psind + 1 is written, so presumably always 2 here
+                    const int* ip_iat, // projector index -> atom index
+                    const int& tnp,    // number of projectors looped over; also the row stride of becp
+                    const std::complex<FPTYPE>* lambda_array, // 4 entries per atom
+                    std::complex<FPTYPE>* ps, // accumulated with +=; not zeroed here
+                    const std::complex<FPTYPE>* becp)
+    {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2)
+#endif
+        for (int ib = 0; ib < npm / npol; ib++)
+        {
+            for (int ip = 0; ip < tnp; ip++)
+            {
+                int ib2 = ib * npol; // first of the band's rows
+                int iat = ip_iat[ip];
+                const int psind = ip * npm + ib2; // each (ib, ip) pair owns its own two ps entries
+                const int becpind = ib2 * tnp + ip;
+                ps[psind] += lambda_array[iat * 4] * becp[becpind] 
+                            + lambda_array[iat * 4 + 2] * becp[becpind + tnp];
+                ps[psind + 1] += lambda_array[iat * 4 + 1] * becp[becpind] 
+                            + lambda_array[iat * 4 + 3] * becp[becpind + tnp];
+            } // end ip
+        } // end ib
+    };
+
+    // DFT+U kernel: for projectors with ip_m >= 0, ps(ip, band) += sum over the shell's m2 of vu(m1, m2) * becp(ip2).
+    void operator()(const base_device::DEVICE_CPU* dev,
+      const int& npm,          // row length of ps; npm / npol bands are processed
+      const int npol,          // rows per band
+      const int* orb_l_iat,    // atom index -> l of its corrected shell
+      const int* ip_iat,       // projector index -> atom index
+      const int* ip_m,         // projector index -> m within the corrected shell, negative if not corrected
+      const int* vu_begin_iat, // atom index -> offset of that atom's first entry in vu
+      const int& tnp,          // number of projectors looped over; also the row stride of becp
+      const std::complex<FPTYPE>* vu, // per atom: 4 blocks of (2l+1)^2 entries
+      std::complex<FPTYPE>* ps, // accumulated with +=; not zeroed here
+      const std::complex<FPTYPE>* becp)
+  {
+#ifdef _OPENMP
+#pragma omp parallel for collapse(2)
+#endif
+        for (int ib = 0; ib < npm / npol; ib++)
+        {
+            for (int ip = 0; ip < tnp; ip++)
+            {
+                int m1 = ip_m[ip];
+                if(m1 < 0) continue; // projector outside the corrected shell
+                int ib2 = ib * npol;
+                int iat = ip_iat[ip];
+                const std::complex<FPTYPE>* vu_iat = vu + vu_begin_iat[iat];
+                int orb_l = orb_l_iat[iat];
+                int tlp1 = 2 * orb_l + 1;
+                int tlp1_2 = tlp1 * tlp1;
+                int ip2_begin = ip - m1; // first projector of the shell that contains ip
+                int ip2_end = ip - m1 + tlp1;
+                const int psind = ip * npm + ib2;
+                for(int ip2 = ip2_begin;ip2<ip2_end;ip2++)
+                {
+                    const int becpind = ib2 * tnp + ip2;
+                    int m2 = ip_m[ip2];
+                    const int index_mm = m1 * tlp1 + m2;
+                    ps[psind] += vu_iat[index_mm] * becp[becpind]
+                                + vu_iat[index_mm + tlp1_2 * 2] * becp[becpind + tnp];
+                    ps[psind + 1] += vu_iat[index_mm + tlp1_2 * 1] * becp[becpind]
+                                + vu_iat[index_mm + tlp1_2 * 3] * becp[becpind + tnp];
+                }
+            } // end ip
+        } // end ib
+  }
+};
+
+template struct onsite_ps_op<float, base_device::DEVICE_CPU>;
+template struct onsite_ps_op<double, base_device::DEVICE_CPU>;
+
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h
new file mode 100644
index 0000000000..fee57fbbd3
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h
@@ -0,0 +1,63 @@
+#ifndef MODULE_HAMILT_OPERATOR_KERNELS_ONSITE_H
+#define MODULE_HAMILT_OPERATOR_KERNELS_ONSITE_H
+
+#include "module_psi/psi.h"
+#include <complex>
+
+namespace hamilt {
+template <typename FPTYPE, typename Device> 
+struct onsite_ps_op { // adds on-site (DeltaSpin or DFT+U) terms to ps; one overload per method
+  void operator() ( // DeltaSpin overload
+      const Device* dev,
+      const int& npm,
+      const int npol,
+      const int* ip_iat, // projector index -> atom index
+      const int& tnp,
+      const std::complex<FPTYPE>* lambda_coeff, // 4 entries per atom
+      std::complex<FPTYPE>* ps, // output, accumulated with +=
+      const std::complex<FPTYPE>* becp);
+  // DFT+U overload: same role, with the per-atom vu blocks located through vu_begin_iat.
+  void operator() (
+      const Device* dev,
+      const int& npm,
+      const int npol,
+      const int* orb_l_iat, // atom index -> l of its corrected shell
+      const int* ip_iat, // projector index -> atom index
+      const int* ip_m, // projector index -> m within the corrected shell, negative if not corrected
+      const int* vu_begin_iat, // atom index -> offset of that atom's first entry in vu
+      const int& tnp,
+      const std::complex<FPTYPE>* vu,
+      std::complex<FPTYPE>* ps, // output, accumulated with +=
+      const std::complex<FPTYPE>* becp);
+};
+                      
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+// Partial specialization of the functor for base_device::DEVICE_GPU; same two overloads as the primary template.
+template <typename FPTYPE>
+struct onsite_ps_op<FPTYPE, base_device::DEVICE_GPU> {
+  void operator() ( // DeltaSpin overload
+      const base_device::DEVICE_GPU* dev,
+      const int& npm,
+      const int npol,
+      const int* ip_iat, // projector index -> atom index
+      const int& tnp, // also used as the number of blocks launched
+      const std::complex<FPTYPE>* lambda_coeff, // 4 entries per atom
+      std::complex<FPTYPE>* ps, // output, accumulated with +=
+      const std::complex<FPTYPE>* becp);
+  // DFT+U overload: same role, with the per-atom vu blocks located through vu_begin_iat.
+  void operator() (
+      const base_device::DEVICE_GPU* dev,
+      const int& npm,
+      const int npol,
+      const int* orb_l_iat, // atom index -> l of its corrected shell
+      const int* ip_iat, // projector index -> atom index
+      const int* ip_m, // projector index -> m within the corrected shell, negative if not corrected
+      const int* vu_begin_iat, // atom index -> offset of that atom's first entry in vu
+      const int& tnp, // also used as the number of blocks launched
+      const std::complex<FPTYPE>* vu,
+      std::complex<FPTYPE>* ps, // output, accumulated with +=
+      const std::complex<FPTYPE>* becp);
+};
+#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+} // namespace hamilt
+#endif //MODULE_HAMILT_OPERATOR_KERNELS_ONSITE_H
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
index b89a380133..c78b333b86 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu
@@ -304,6 +304,217 @@ void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_dev
     hipCheckOnDebug();
 }
 
+// DFT+U on-site force kernel: one block per (band, atom type); threads step over the (m1, m2) pairs of the shell.
+// tpiba is FPTYPE (it was declared int, truncating the FPTYPE value the launcher passes), and vu is advanced
+// past each preceding type by that type's own block size, skipping uncorrected types, as the CPU operator does.
+template <typename FPTYPE>
+__global__ void cal_force_onsite(int wg_nc,
+                                 int ntype,
+                                 int forcenl_nc,
+                                 int nbands,
+                                 int ik,
+                                 int nkb,
+                                 const int* atom_nh,
+                                 const int* atom_na,
+                                 const FPTYPE tpiba,
+                                 const FPTYPE* d_wg,
+                                 const thrust::complex<FPTYPE>* vu,
+                                 const int* orbital_corr,
+                                 const thrust::complex<FPTYPE>* becp,
+                                 const thrust::complex<FPTYPE>* dbecp,
+                                 FPTYPE* force)
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+    const int orbital_l = orbital_corr[it];
+    if (orbital_l == -1) return; // this type carries no +U correction
+    const int ip_begin = orbital_l * orbital_l;
+    const int tlp1 = 2 * orbital_l + 1;
+    const int tlp1_2 = tlp1 * tlp1;
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++)
+    {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+        const int l_ii = orbital_corr[ii]; // -1: type assumed to own no vu entries, as in the CPU operator
+        vu += (l_ii == -1) ? 0 : 4 * (2 * l_ii + 1) * (2 * l_ii + 1) * atom_na[ii]; // step for vu
+    }
+    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x)
+        {
+            const int m1 = mm / tlp1;
+            const int m2 = mm % tlp1;
+            const int ip1 = ip_begin + m1;
+            const int ip2 = ip_begin + m2;
+            const int inkb1 = sum + ip1 + ib2 * nkb;
+            const int inkb2 = sum + ip2 + ib2 * nkb;
+            thrust::complex<FPTYPE> ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]};
+            for (int ipol = 0; ipol < 3; ipol++)
+            {
+                const int inkb0 = ipol * nbands * 2 * nkb + inkb1;
+                const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb0]) * becp[inkb2];
+                const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb0]) * becp[inkb2 + nkb];
+                const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb2];
+                const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb2 + nkb];
+                const FPTYPE tmp = -fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                atomicAdd(force + iat * forcenl_nc + ipol, tmp); // several threads and blocks add to the same atom
+            }
+        }
+        ++iat;
+        sum += nprojs;
+        vu += 4 * tlp1_2;
+    } // ia
+}
+
+// DeltaSpin on-site force kernel: one block per (band, atom type); threads step over the projectors of each atom.
+// tpiba is FPTYPE: it was declared int, which truncated the FPTYPE value the launcher passes before it
+// entered the prefactor fac.
+template <typename FPTYPE>
+__global__ void cal_force_onsite(int wg_nc,
+                                 int ntype,
+                                 int forcenl_nc,
+                                 int nbands,
+                                 int ik,
+                                 int nkb,
+                                 const int* atom_nh,
+                                 const int* atom_na,
+                                 const FPTYPE tpiba,
+                                 const FPTYPE* d_wg,
+                                 const FPTYPE* lambda,
+                                 const thrust::complex<FPTYPE>* becp,
+                                 const thrust::complex<FPTYPE>* dbecp,
+                                 FPTYPE* force)
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2 = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++)
+    {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba;
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        const thrust::complex<FPTYPE> coefficients0(lambda[iat * 3 + 2], 0.0);
+        const thrust::complex<FPTYPE> coefficients1(lambda[iat * 3], lambda[iat * 3 + 1]);
+        const thrust::complex<FPTYPE> coefficients2(lambda[iat * 3], -1 * lambda[iat * 3 + 1]);
+        const thrust::complex<FPTYPE> coefficients3(-1 * lambda[iat * 3 + 2], 0.0);
+        for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x)
+        {
+            const int inkb = sum + ip + ib2 * nkb;
+            for (int ipol = 0; ipol < 3; ipol++)
+            {
+                const int inkb0 = ipol * nbands * 2 * nkb + inkb;
+                const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb0]) * becp[inkb];
+                const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb0]) * becp[inkb + nkb];
+                const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb];
+                const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb + nkb];
+                const FPTYPE tmp = -fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2
+                                           + coefficients3 * dbb3).real();
+                atomicAdd(force + iat * forcenl_nc + ipol, tmp); // several threads and blocks add to the same atom
+            }
+        }
+        ++iat;
+        sum += nprojs;
+    } // ia
+}
+
+// DFT+U force launcher: runs the vu/orbital_corr overload of cal_force_onsite on nbands_occ * ntype blocks.
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx, // not used in this body
+                                                                  const int& nbands_occ, // only sizes the launch grid
+                                                                  const int& wg_nc,
+                                                                  const int& ntype,
+                                                                  const int& forcenl_nc,
+                                                                  const int& nbands,
+                                                                  const int& ik,
+                                                                  const int& nkb,
+                                                                  const int* atom_nh,
+                                                                  const int* atom_na,
+                                                                  const FPTYPE& tpiba,
+                                                                  const FPTYPE* d_wg,
+                                                                  const std::complex<FPTYPE>* vu,
+                                                                  const int* orbital_corr,
+                                                                  const std::complex<FPTYPE>* becp,
+                                                                  const std::complex<FPTYPE>* dbecp,
+                                                                  FPTYPE* force)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_onsite<FPTYPE>),
+                       dim3(nbands_occ * ntype), // one block per (band, atom type) pair
+                       dim3(THREADS_PER_BLOCK),
+                       0, // fourth launch argument (dynamic shared memory size in hipLaunchKernelGGL)
+                       0, // fifth launch argument (stream in hipLaunchKernelGGL)
+                       wg_nc,
+                       ntype,
+                       forcenl_nc,
+                       nbands,
+                       ik,
+                       nkb,
+                       atom_nh,
+                       atom_na,
+                       tpiba,
+                       d_wg,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(vu), // std::complex viewed as thrust::complex
+                       orbital_corr,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+                       force); // array of data
+
+    hipCheckOnDebug(); // error-check helper defined elsewhere; presumably active only in debug builds
+}
+// DeltaSpin force launcher: runs the lambda overload of cal_force_onsite on nbands_occ * ntype blocks.
+template <typename FPTYPE>
+void cal_force_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx, // not used in this body
+                                                                  const int& nbands_occ, // only sizes the launch grid
+                                                                  const int& wg_nc,
+                                                                  const int& ntype,
+                                                                  const int& forcenl_nc,
+                                                                  const int& nbands,
+                                                                  const int& ik,
+                                                                  const int& nkb,
+                                                                  const int* atom_nh,
+                                                                  const int* atom_na,
+                                                                  const FPTYPE& tpiba,
+                                                                  const FPTYPE* d_wg,
+                                                                  const FPTYPE* lambda,
+                                                                  const std::complex<FPTYPE>* becp,
+                                                                  const std::complex<FPTYPE>* dbecp,
+                                                                  FPTYPE* force)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_onsite<FPTYPE>),
+                       dim3(nbands_occ * ntype), // one block per (band, atom type) pair
+                       dim3(THREADS_PER_BLOCK),
+                       0, // fourth launch argument (dynamic shared memory size in hipLaunchKernelGGL)
+                       0, // fifth launch argument (stream in hipLaunchKernelGGL)
+                       wg_nc,
+                       ntype,
+                       forcenl_nc,
+                       nbands,
+                       ik,
+                       nkb,
+                       atom_nh,
+                       atom_na,
+                       tpiba,
+                       d_wg,
+                       lambda,
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(becp), // std::complex viewed as thrust::complex
+                       reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+                       force); // array of data
+
+    hipCheckOnDebug(); // error-check helper defined elsewhere; presumably active only in debug builds
+}
+
 template <typename FPTYPE>
 __global__ void saveVkbValues_(
     const int *gcar_zero_ptrs, 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu
new file mode 100644
index 0000000000..31ec309a28
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu
@@ -0,0 +1,134 @@
+#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h"
+
+#include <base/macros/macros.h>
+#include <complex>
+#include <hip/hip_runtime.h>
+#include <thrust/complex.h>
+
+namespace hamilt
+{
+
+#define THREADS_PER_BLOCK 256
+
+template <typename FPTYPE>
+__global__ void onsite_op(const int npm, // DeltaSpin kernel: one block per projector ip, threads step over bands
+                          const int npol, // rows per band; psind + 1 is written, so presumably always 2 here
+                          const int* ip_iat, // projector index -> atom index
+                          const int tnp, // row stride of becp
+                          const thrust::complex<FPTYPE>* lambda_coeff, // 4 entries per atom
+                          thrust::complex<FPTYPE>* ps, // accumulated with +=; not zeroed here
+                          const thrust::complex<FPTYPE>* becp)
+{
+    const int ip = blockIdx.x;
+    const int nbands = npm / npol;
+    for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+    {
+        int ib2 = ib * npol; // first of the band's rows
+        int iat = ip_iat[ip];
+        const int psind = ip * npm + ib2; // each (ip, ib) pair owns its own two ps entries
+        const int becpind = ib2 * tnp + ip;
+        ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp];
+        ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp];
+    }
+}
+
+template <typename FPTYPE>
+__global__ void onsite_op(const int npm, // DFT+U kernel: one block per projector ip, threads step over bands
+                          const int npol, // rows per band
+                          const int* orb_l_iat, // atom index -> l of its corrected shell
+                          const int* ip_iat, // projector index -> atom index
+                          const int* ip_m, // projector index -> m within the corrected shell, negative if not corrected
+                          const int* vu_begin_iat, // atom index -> offset of that atom's first entry in vu
+                          const int tnp, // row stride of becp
+                          const thrust::complex<FPTYPE>* vu, // per atom: 4 blocks of (2l+1)^2 entries
+                          thrust::complex<FPTYPE>* ps, // accumulated with +=; not zeroed here
+                          const thrust::complex<FPTYPE>* becp)
+{
+    const int ip = blockIdx.x;
+    int m1 = ip_m[ip];
+    if (m1 >= 0) // projectors outside the corrected shell contribute nothing
+    {
+        const int nbands = npm / npol;
+        for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x)
+        {
+            int ib2 = ib * npol;
+            int iat = ip_iat[ip];
+            const thrust::complex<FPTYPE>* vu_iat = vu + vu_begin_iat[iat];
+            int orb_l = orb_l_iat[iat];
+            int tlp1 = 2 * orb_l + 1;
+            int tlp1_2 = tlp1 * tlp1;
+            int ip2_begin = ip - m1; // first projector of the shell that contains ip
+            int ip2_end = ip - m1 + tlp1;
+            const int psind = ip * npm + ib2;
+            for (int ip2 = ip2_begin; ip2 < ip2_end; ip2++)
+            {
+                const int becpind = ib2 * tnp + ip2;
+                int m2 = ip_m[ip2];
+                const int index_mm = m1 * tlp1 + m2;
+                ps[psind] += vu_iat[index_mm] * becp[becpind] + vu_iat[index_mm + tlp1_2 * 2] * becp[becpind + tnp];
+                ps[psind + 1] += vu_iat[index_mm + tlp1_2 * 1] * becp[becpind]
+                                 + vu_iat[index_mm + tlp1_2 * 3] * becp[becpind + tnp];
+            }
+        }
+    }
+}
+
+template <typename FPTYPE>
+void hamilt::onsite_ps_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev, // not used in this body
+                                                                       const int& npm,
+                                                                       const int npol,
+                                                                       const int* ip_iat,
+                                                                       const int& tnp,
+                                                                       const std::complex<FPTYPE>* lambda_coeff,
+                                                                       std::complex<FPTYPE>* ps,
+                                                                       const std::complex<FPTYPE>* becp)
+{
+    // DeltaSpin launcher: tnp blocks (one per projector) of THREADS_PER_BLOCK threads.
+    // The std::complex pointers are reinterpreted as thrust::complex for the kernel.
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(onsite_op<FPTYPE>), dim3(tnp), dim3(THREADS_PER_BLOCK), 0, 0,
+        npm,
+        npol,
+        ip_iat,
+        tnp,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(lambda_coeff),
+        reinterpret_cast<thrust::complex<FPTYPE>*>(ps),          // array of data
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(becp)); // array of data
+
+    hipCheckOnDebug(); // error-check helper defined elsewhere; presumably active only in debug builds
+}
+
+template <typename FPTYPE>
+void hamilt::onsite_ps_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* dev, // not used in this body
+                                                                       const int& npm,
+                                                                       const int npol,
+                                                                       const int* orb_l_iat,
+                                                                       const int* ip_iat,
+                                                                       const int* ip_m,
+                                                                       const int* vu_begin_iat,
+                                                                       const int& tnp,
+                                                                       const std::complex<FPTYPE>* vu,
+                                                                       std::complex<FPTYPE>* ps,
+                                                                       const std::complex<FPTYPE>* becp)
+{
+    // DFT+U launcher: tnp blocks (one per projector) of THREADS_PER_BLOCK threads.
+    // The std::complex pointers are reinterpreted as thrust::complex for the kernel.
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(onsite_op<FPTYPE>), dim3(tnp), dim3(THREADS_PER_BLOCK), 0, 0,
+        npm,
+        npol,
+        orb_l_iat,
+        ip_iat,
+        ip_m,
+        vu_begin_iat,
+        tnp,
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(vu),
+        reinterpret_cast<thrust::complex<FPTYPE>*>(ps),          // array of data
+        reinterpret_cast<const thrust::complex<FPTYPE>*>(becp)); // array of data
+
+    hipCheckOnDebug(); // error-check helper defined elsewhere; presumably active only in debug builds
+    // The launch itself is not followed by any explicit synchronization in this function.
+}
+
+template struct onsite_ps_op<float, base_device::DEVICE_GPU>;
+template struct onsite_ps_op<double, base_device::DEVICE_GPU>;
+
+} // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
index a5f8e553af..ef138c04cc 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu
@@ -922,6 +922,185 @@ void synchronize_ptrs<base_device::DEVICE_GPU>::operator()(
     hipErrcheck(hipMemcpy(ptr_out, ptr_in, sizeof(void*) * size, hipMemcpyHostToDevice));
 }
 
+template <typename FPTYPE>
+__global__ void cal_stress_onsite( // DFT+U on-site stress term; one block per (band ib, atom type it)
+        const int nkb,                        // row stride of becp/dbecp
+        const int ntype,                      // number of atom types; blockIdx.x is decoded as ib * ntype + it
+        const int wg_nc,                      // row stride of d_wg
+        const int ik,                         // row of d_wg the band weights are read from
+        const int *atom_nh,                   // per type: number of projectors per atom
+        const int *atom_na,                   // per type: number of atoms
+        const FPTYPE *d_wg,                   // weights, read at [ik * wg_nc + ib]
+        const thrust::complex<FPTYPE> *vu,    // 4 * (2l+1)^2 values per atom, stored for corrected types only
+        const int* orbital_corr,              // per type: l of the corrected shell, or -1 when the type is skipped
+        const thrust::complex<FPTYPE> *becp,  // rows 2*ib and 2*ib+1 are read for band ib
+        const thrust::complex<FPTYPE> *dbecp, // indexed like becp; enters complex-conjugated
+        FPTYPE *stress)                       // single scalar, accumulated with atomicAdd
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2  = ib * 2;
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+    if(orbital_corr[it] == -1) return; // the whole block leaves together, before any warp-level reduction
+    const int orbital_l = orbital_corr[it];
+    const int ip_begin = orbital_l * orbital_l;
+    const int tlp1 = 2 * orbital_l + 1;
+    const int tlp1_2 = tlp1 * tlp1;
+
+    // move `sum` (projector offset) and `vu` (potential offset) to the first atom of type `it`
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++) {
+        const int l_ii = orbital_corr[ii]; // -1: type ii is not corrected and owns no block in vu
+        sum += atom_na[ii] * atom_nh[ii];
+        if (l_ii != -1) { vu += 4 * (2 * l_ii + 1) * (2 * l_ii + 1) * atom_na[ii]; } // step for vu, sized by type ii's own l (same packing the CPU operator walks)
+    }
+
+    FPTYPE stress_var = 0;
+    const FPTYPE fac = d_wg[ik * wg_nc + ib];
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x) {
+            const int m1 = mm / tlp1;
+            const int m2 = mm % tlp1;
+            const int ip1 = ip_begin + m1;
+            const int ip2 = ip_begin + m2;
+            const int inkb1 = sum + ip1 + ib2 * nkb;
+            const int inkb2 = sum + ip2 + ib2 * nkb;
+            thrust::complex<FPTYPE> ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]};
+            // ps[k] is the (m1, m2) entry of the k-th (2l+1)x(2l+1) block of this atom's vu
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb2];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[inkb2 + nkb];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb1 + nkb]) * becp[inkb2];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb1 + nkb]) * becp[inkb2 + nkb];
+            stress_var -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+        }
+        // advance the projector and vu offsets to the next atom of this type
+        sum+=nprojs;
+        vu += 4 * tlp1_2;
+    }//ia
+    warp_reduce(stress_var); // defined elsewhere in this file; presumably leaves each warp's sum in its lane 0 - TODO confirm
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress, stress_var);
+    }
+}
+
+template <typename FPTYPE>
+__global__ void cal_stress_onsite( // DeltaSpin on-site stress term; one block per (band ib, atom type it)
+        const int nkb,                        // row stride of becp/dbecp
+        const int ntype,                      // number of atom types; blockIdx.x is decoded as ib * ntype + it
+        const int wg_nc,                      // row stride of d_wg
+        const int ik,                         // row of d_wg the band weights are read from
+        const int *atom_nh,                   // per type: number of projectors per atom
+        const int *atom_na,                   // per type: number of atoms
+        const FPTYPE *d_wg,                   // weights, read at [ik * wg_nc + ib]
+        const double* lambda,                 // three values per atom, read at [iat*3 + 0..2]; always double
+        const thrust::complex<FPTYPE> *becp,  // rows 2*ib and 2*ib+1 are read for band ib
+        const thrust::complex<FPTYPE> *dbecp, // indexed like becp; enters complex-conjugated
+        FPTYPE *stress)                       // single scalar, accumulated with atomicAdd
+{
+    const int ib = blockIdx.x / ntype; // index of loop-nbands
+    const int ib2  = ib * 2; // first of the two becp/dbecp rows that belong to band ib
+    const int it = blockIdx.x % ntype; // index of loop-ntype
+
+    int iat = 0; // calculate the begin of atomic index
+    int sum = 0; // calculate the begin of atomic-orbital index
+    for (int ii = 0; ii < it; ii++) {
+        iat += atom_na[ii];
+        sum += atom_na[ii] * atom_nh[ii];
+    }
+
+    FPTYPE stress_var = 0; // per-thread partial sum
+    const FPTYPE fac = d_wg[ik * wg_nc + ib];
+    const int nprojs = atom_nh[it];
+    for (int ia = 0; ia < atom_na[it]; ia++)
+    {
+        const thrust::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0); // multiplies the (row ib2, row ib2) product
+        const thrust::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]); // multiplies the (row ib2, row ib2+1) product
+        const thrust::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); // multiplies the (row ib2+1, row ib2) product
+        const thrust::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0); // multiplies the (row ib2+1, row ib2+1) product
+        for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x) {
+            const int inkb = sum + ip + ib2 * nkb;
+            // only same-projector products are used here: dbecp and becp share the index inkb
+            const thrust::complex<FPTYPE> dbb0 = conj(dbecp[inkb]) * becp[inkb];
+            const thrust::complex<FPTYPE> dbb1 = conj(dbecp[inkb]) * becp[inkb + nkb];
+            const thrust::complex<FPTYPE> dbb2 = conj(dbecp[inkb + nkb]) * becp[inkb];
+            const thrust::complex<FPTYPE> dbb3 = conj(dbecp[inkb + nkb]) * becp[inkb + nkb];
+            stress_var -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
+        }
+        ++iat;
+        sum+=nprojs;
+    }//ia
+    warp_reduce(stress_var); // defined elsewhere in this file; presumably leaves each warp's sum in its lane 0 - TODO confirm
+    if (threadIdx.x % WARP_SIZE == 0) {
+        atomicAdd(stress, stress_var);
+    }
+}
+
+// host-side launcher of the DFT+U cal_stress_onsite kernel (the overload that takes vu and orbital_corr)
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx, // ctx is not referenced in the body
+                    const int& nkb,
+                    const int& nbands_occ, // only sizes the launch grid; not forwarded to the kernel
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const std::complex<FPTYPE>* vu,
+                    const int* orbital_corr,
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_onsite<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, // one block per (band, type); no dynamic shared memory, default stream
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             atom_nh,
+             atom_na,
+             d_wg,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(vu), // std::complex pointers are passed to the kernel as thrust::complex pointers
+             orbital_corr,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp),
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// the kernel adds into *stress with atomicAdd (a single accumulator, not an array)
+
+    hipCheckOnDebug(); // error-check macro defined elsewhere
+}
+// host-side launcher of the DeltaSpin cal_stress_onsite kernel (the overload that takes lambda)
+template <typename FPTYPE>
+void cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>::operator()(const base_device::DEVICE_GPU* ctx, // ctx is not referenced in the body
+                    const int& nkb,
+                    const int& nbands_occ, // only sizes the launch grid; not forwarded to the kernel
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const double* lambda, // always double, independent of FPTYPE; forwarded unchanged
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+{
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_onsite<FPTYPE>), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, // one block per (band, type); no dynamic shared memory, default stream
+             nkb,
+             ntype,
+             wg_nc,
+             ik,
+             atom_nh,
+             atom_na,
+             d_wg,
+             lambda,
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(becp), // std::complex pointers are passed to the kernel as thrust::complex pointers
+             reinterpret_cast<const thrust::complex<FPTYPE>*>(dbecp),
+             stress);// the kernel adds into *stress with atomicAdd (a single accumulator, not an array)
+
+    hipCheckOnDebug(); // error-check macro defined elsewhere
+}
+
 template struct synchronize_ptrs<base_device::DEVICE_GPU>;
 
 template struct cal_stress_mgga_op<std::complex<float>,  base_device::DEVICE_GPU>;
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
index 979955d3e8..0cd0e1ab96 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp
@@ -239,6 +239,121 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_CPU>
         }
 #endif
         stress[ipol * 3 + jpol] += local_stress;
+    };
+    // kernel for DFT+U: adds the on-site term built from vu, becp and dbecp to the scalar *stress
+    void operator()(const base_device::DEVICE_CPU* ctx, // ctx is not referenced in the body
+                    const int& nkb,           // row stride of becp/dbecp
+                    const int& nbands_occ,    // number of bands summed over
+                    const int& ntype,         // number of atom types
+                    const int& wg_nc,         // row stride of d_wg
+                    const int& ik,            // row of d_wg the band weights are read from
+                    const int* atom_nh,       // per type: number of projectors per atom
+                    const int* atom_na,       // per type: number of atoms
+                    const FPTYPE* d_wg,       // weights, read at [ik * wg_nc + ib]
+                    const std::complex<FPTYPE>* vu, // 4 * (2l+1)^2 values per atom, for corrected types only
+                    const int* orbital_corr,  // per type: l of the corrected shell, or -1 to skip the type
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+    {
+        FPTYPE local_stress = 0;
+        int iat = 0, sum = 0; // sum: projector offset of the current type; iat is only accumulated, never read
+        for (int it = 0; it < ntype; it++)
+        {
+            const int orbital_l = orbital_corr[it];
+            const int nproj = atom_nh[it];
+            if(orbital_l == -1)
+            {
+                sum += nproj * atom_na[it]; // skipped types still occupy projector slots, but vu is not advanced
+                continue;
+            }
+            const int ip_begin = orbital_l * orbital_l;
+            const int ip_end = (orbital_l + 1) * (orbital_l + 1);
+            const int tlp1 = 2 * orbital_l + 1;
+            const int tlp1_2 = tlp1 * tlp1;
+            for (int ia = 0; ia < atom_na[it]; ia++)
+            {
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    const int ib2 = ib*2; // band ib uses rows ib2 and ib2+1 of becp/dbecp
+                    FPTYPE fac = d_wg[ik * wg_nc + ib];
+                    for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
+                    {
+                        const int m1 = ip1 - ip_begin;
+                        const int inkb1 = ib2 * nkb + sum + ia * nproj + ip1;
+                        // inner loop pairs projector ip1 (from dbecp) with every ip2 of the same shell (from becp)
+                        for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
+                        {
+                            const int m2 = ip2 - ip_begin;
+                            std::complex<FPTYPE> ps[4]; // (m1, m2) entry of each of the four blocks of this atom's vu
+                            for(int i = 0; i < 4; i++)
+                            {
+                                ps[i] = vu[(i * tlp1_2 + m1 * tlp1 + m2)];
+                            }
+                            const int inkb2 = ib2 * nkb + sum + ia * nproj + ip2;
+
+                            const std::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb2];
+                            const std::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb2];
+                            const std::complex<FPTYPE> dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb2];
+                            const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb2];
+                            local_stress -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real();
+                        }
+                    } // end ip
+                }// ib
+                vu += 4 * tlp1_2;// step for vu: next atom's four blocks
+            }// ia
+            sum += atom_na[it] * nproj;
+            iat += atom_na[it];
+        } // end it
+        *stress += local_stress; // accumulate into the caller's scalar
+    };
+    // kernel for DeltaSpin: adds the on-site term built from lambda, becp and dbecp to the scalar *stress
+    void operator()(const base_device::DEVICE_CPU* ctx, // ctx is not referenced in the body
+                    const int& nkb,           // row stride of becp/dbecp
+                    const int& nbands_occ,    // number of bands summed over
+                    const int& ntype,         // number of atom types
+                    const int& wg_nc,         // row stride of d_wg
+                    const int& ik,            // row of d_wg the band weights are read from
+                    const int* atom_nh,       // per type: number of projectors per atom
+                    const int* atom_na,       // per type: number of atoms
+                    const FPTYPE* d_wg,       // weights, read at [ik * wg_nc + ib]
+                    const double* lambda,     // three values per atom; double for every FPTYPE, matching the stress_op.h declarations and the GPU overload
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress)
+    {
+        FPTYPE local_stress = 0;
+        int iat0 = 0, sum = 0; // first atom index / first projector offset of the current type
+        for (int it = 0; it < ntype; it++)
+        {
+            const int nproj = atom_nh[it];
+            for (int ia = 0; ia < atom_na[it]; ia++)
+            {
+                int iat = iat0 + ia;
+                const std::complex<FPTYPE> coefficients0(lambda[iat*3+2], 0.0); // multiplies the (row ib2, row ib2) product
+                const std::complex<FPTYPE> coefficients1(lambda[iat*3] , lambda[iat*3+1]); // multiplies the (row ib2, row ib2+1) product
+                const std::complex<FPTYPE> coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); // multiplies the (row ib2+1, row ib2) product
+                const std::complex<FPTYPE> coefficients3(-1 * lambda[iat*3+2], 0.0); // multiplies the (row ib2+1, row ib2+1) product
+                for (int ib = 0; ib < nbands_occ; ib++)
+                {
+                    const int ib2 = ib*2; // band ib uses rows ib2 and ib2+1 of becp/dbecp
+                    FPTYPE fac = d_wg[ik * wg_nc + ib];
+                    for (int ip = 0; ip < nproj; ip++)
+                    {
+                        const int inkb1 = ib2 * nkb + sum + ia * nproj + ip;
+
+                        const std::complex<FPTYPE> dbb0 = conj(dbecp[inkb1]) * becp[inkb1];
+                        const std::complex<FPTYPE> dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb1];
+                        const std::complex<FPTYPE> dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb1];
+                        const std::complex<FPTYPE> dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb1];
+                        local_stress -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real();
+                    } // end ip
+                }// ib
+            }// ia
+            sum += atom_na[it] * nproj;
+            iat0 += atom_na[it];
+        } // end it
+        *stress += local_stress; // accumulate into the caller's scalar
     }
 };
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
index af7d51523d..7fecd96d75 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h
@@ -122,6 +122,35 @@ struct cal_stress_nl_op
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // kernel for DFT+U: overload selected by the (vu, orbital_corr) pair; accumulates into *stress
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const std::complex<FPTYPE>* vu,
+                    const int* orbital_corr, // one entry per atom type
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
+    // kernel for DeltaSpin: overload selected by the lambda argument; accumulates into *stress
+    void operator()(const base_device::DEVICE_CPU* ctx,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const double* lambda, // double regardless of FPTYPE
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 template <typename T, typename Device>
@@ -298,6 +327,35 @@ struct cal_stress_nl_op<FPTYPE, base_device::DEVICE_GPU>
                     const std::complex<FPTYPE>* becp,
                     const std::complex<FPTYPE>* dbecp,
                     FPTYPE* stress);
+    // kernel for DFT+U (GPU): overload selected by the (vu, orbital_corr) pair; accumulates into *stress
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const std::complex<FPTYPE>* vu,
+                    const int* orbital_corr, // one entry per atom type
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
+    // kernel for DeltaSpin (GPU): overload selected by the lambda argument; accumulates into *stress
+    void operator()(const base_device::DEVICE_GPU* ctx,
+                    const int& nkb,
+                    const int& nbands_occ,
+                    const int& ntype,
+                    const int& wg_nc,
+                    const int& ik,
+                    const int* atom_nh,
+                    const int* atom_na,
+                    const FPTYPE* d_wg,
+                    const double* lambda, // double regardless of FPTYPE
+                    const std::complex<FPTYPE>* becp,
+                    const std::complex<FPTYPE>* dbecp,
+                    FPTYPE* stress);
 };
 
 // cpu version first, gpu version later
diff --git a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp
index 5fddaa0e84..aa28b5abe2 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp
@@ -19,12 +19,21 @@ class Nonlocal_maths
     Nonlocal_maths(const pseudopot_cell_vnl* nlpp_in, const UnitCell* ucell_in)
     {
         this->device = base_device::get_device_type<Device>(this->ctx);
-        this->nlpp_ = nlpp_in;
+        this->nhtol_ = nlpp_in->nhtol; // nhtol_ is a by-value member, so only the table is taken from nlpp_in
+        this->lmax_ = nlpp_in->lmaxkb; // lmax_ is what sizes the (lmax_+1)^2 ylm blocks in cal_vkb_deri / cal_dvkb_index
+        this->ucell_ = ucell_in;
+    }
+    Nonlocal_maths(const ModuleBase::matrix& nhtol, const int lmax, const UnitCell* ucell_in) // same state, built without a pseudopot_cell_vnl
+    {
+        this->device = base_device::get_device_type<Device>(this->ctx);
+        this->nhtol_ = nhtol; // indexed as nhtol_(it, ih) by the member functions
+        this->lmax_ = lmax;
         this->ucell_ = ucell_in;
     }
 
   private:
-    const pseudopot_cell_vnl* nlpp_;
+    ModuleBase::matrix nhtol_;
+    int lmax_;
     const UnitCell* ucell_;
 
     Device* ctx = {};
@@ -33,14 +42,31 @@ class Nonlocal_maths
 
   public:
     // functions
-    /// calculate the G+K vectors
-    std::vector<FPTYPE> cal_gk(int ik, const ModulePW::PW_Basis_K* wfc_basis);
-    /// calculate the sperical bessel function for projections
+    /**
+     * @brief this function prepares all the q (G+k) information in one contiguous memory block
+     * including the x, y and z components, its norm and the reciprocal of its norm
+     * 
+     * @param ik index of k point
+     * @param pw_basis the plane wave basis
+     * @return std::vector<FPTYPE> 1d contiguous memory block containing all the q information. The
+     * first 3*npw are data of x, y and z components, the next 2*npw are data of norm and 1/norm. 
+     * This is beneficial for GPU memory access.
+     */
+    std::vector<FPTYPE> cal_gk(int ik, const ModulePW::PW_Basis_K* pw_basis);
+    /**
+     * @brief calculate the real spherical harmonic functions on cpu (and optionally send to gpu,
+     * if gpu is available)
+     * 
+     * @param lmax [in] maximum angular momentum to calculate
+     * @param npw [in] number of G+k vectors
+     * @param gk_in [in] the G+k vectors
+     * @param ylm [out] the spherical harmonic functions
+     */
     void cal_ylm(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm);
     /// calculate the derivate of the sperical bessel function for projections
     void cal_ylm_deri(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm_deri);
     /// calculate the (-i)^l factors
-    std::vector<complex<FPTYPE>> cal_pref(int it);
+    std::vector<complex<FPTYPE>> cal_pref(int it, const int nh);
     /// calculate the vkb matrix for this atom
     /// vkb = sum_lm (-i)^l * ylm(g^) * vq(g^) * sk(g^)
     void cal_vkb(int it,
@@ -99,68 +125,81 @@ class Nonlocal_maths
                                               const FPTYPE& x);
 };
 
-// cal_gk
+// build one contiguous block describing every q = G+k of k-point ik (a clearer name would be eval_q or eval_gk)
+// the block is a std::vector filled by a plain CPU loop; nothing in this function runs on the GPU
 template <typename FPTYPE, typename Device>
-std::vector<FPTYPE> Nonlocal_maths<FPTYPE, Device>::cal_gk(int ik, const ModulePW::PW_Basis_K* wfc_basis)
+std::vector<FPTYPE> Nonlocal_maths<FPTYPE, Device>::cal_gk(int ik, const ModulePW::PW_Basis_K* pw_basis)
 {
-    int npw = wfc_basis->npwk[ik];
+    int npw = pw_basis->npwk[ik];
     std::vector<FPTYPE> gk(npw * 5);
-    ModuleBase::Vector3<FPTYPE> tmp;
+    ModuleBase::Vector3<FPTYPE> q;
     for (int ig = 0; ig < npw; ++ig)
     {
-        tmp = wfc_basis->getgpluskcar(ik, ig);
-        gk[ig * 3] = tmp.x;
-        gk[ig * 3 + 1] = tmp.y;
-        gk[ig * 3 + 2] = tmp.z;
-        FPTYPE norm = sqrt(tmp.norm2());
-        gk[3 * npw + ig] = norm * this->ucell_->tpiba;
-        gk[4 * npw + ig] = norm < 1e-8 ? 0.0 : 1.0 / norm * this->ucell_->tpiba;
+        // entries [0, 3*npw): the components of q, laid out like a matrix with npw rows and 3 columns
+        q = pw_basis->getgpluskcar(ik, ig);
+        gk[ig * 3]     = q.x;
+        gk[ig * 3 + 1] = q.y;
+        gk[ig * 3 + 2] = q.z;
+        // entries [3*npw, 5*npw): two rows of length npw derived from |q|, kept in the same vector
+        // so that the components and both norm rows travel as one contiguous block
+        FPTYPE norm = sqrt(q.norm2());
+        gk[3 * npw + ig] = norm * this->ucell_->tpiba; // first row: |q| multiplied by tpiba
+        gk[4 * npw + ig] = norm < 1e-8 ? 0.0 : 1.0 / norm * this->ucell_->tpiba; // second row: (1/|q|) * tpiba as written, 0 when |q| < 1e-8; NOTE(review): this is not 1/(|q| * tpiba) - confirm it is intended
     }
     return gk;
 }
 
-// cal_ylm
+// tabulate the real spherical harmonic functions up to lmax for the q vectors given as input
+// the values are always computed with the CPU routine; on a GPU build they are then copied to the device (hence the suggested name cal_ylm_cpu2gpu)
 template <typename FPTYPE, typename Device>
-void Nonlocal_maths<FPTYPE, Device>::cal_ylm(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm)
+void Nonlocal_maths<FPTYPE, Device>::cal_ylm(int lmax, int npw, const FPTYPE* q, FPTYPE* ylm)
 {
-
-    const int x1 = (lmax + 1) * (lmax + 1);
-
+    const int ntot_ylm = (lmax + 1) * (lmax + 1); // number of (l, m) pairs with l <= lmax
     if (this->device == base_device::GpuDevice)
     {
+        // host-to-device copy functor
         using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
-        std::vector<FPTYPE> ylm_cpu(x1 * npw);
-        ModuleBase::YlmReal::Ylm_Real(cpu_ctx, x1, npw, gk_in, ylm_cpu.data());
+        // host staging buffer, ntot_ylm * npw values
+        std::vector<FPTYPE> ylm_cpu(ntot_ylm * npw);
+        // evaluate on the host into the staging buffer
+        ModuleBase::YlmReal::Ylm_Real(cpu_ctx, ntot_ylm, npw, q, ylm_cpu.data());
+        // copy the staging buffer into the caller's device buffer ylm
         syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, ylm, ylm_cpu.data(), ylm_cpu.size());
     }
     else
     {
-        ModuleBase::YlmReal::Ylm_Real(cpu_ctx, x1, npw, gk_in, ylm);
+        // CPU build: write straight into ylm. TODO: consider hiding this device branch inside a helper
+        ModuleBase::YlmReal::Ylm_Real(cpu_ctx, ntot_ylm, npw, q, ylm);
     }
-
     return;
 }
-// cal_ylm_deri
+
+// this function calculates the numerical derivatives of the spherical harmonic functions with respect to the q (G+k) vector, one Cartesian component at a time
+// a more descriptive name might be eval_dylmdq_cpu2gpu
 template <typename FPTYPE, typename Device>
-void Nonlocal_maths<FPTYPE, Device>::cal_ylm_deri(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm_deri)
+void Nonlocal_maths<FPTYPE, Device>::cal_ylm_deri(int lmax, int npw, const FPTYPE* q, FPTYPE* out)
 {
-    const int x1 = (lmax + 1) * (lmax + 1);
+    const int ntot_ylm = (lmax + 1) * (lmax + 1);
 
     if (this->device == base_device::GpuDevice)
     {
-        std::vector<FPTYPE> dylm(3 * x1 * npw);
+        // alias
+        using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
+        // allocate
+        std::vector<FPTYPE> dylmdq_cpu(3 * ntot_ylm * npw);
+        // calculate
         for (int ipol = 0; ipol < 3; ipol++)
         {
-            Nonlocal_maths<FPTYPE, Device>::dylmr2(x1, npw, gk_in, &dylm[ipol * x1 * npw], ipol);
+            Nonlocal_maths<FPTYPE, Device>::dylmr2(ntot_ylm, npw, q, &dylmdq_cpu[ipol * ntot_ylm * npw], ipol);
         }
-        using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
-        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, ylm_deri, dylm.data(), dylm.size());
+        // send from cpu to gpu
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, out, dylmdq_cpu.data(), dylmdq_cpu.size());
     }
     else
     {
         for (int ipol = 0; ipol < 3; ipol++)
         {
-            Nonlocal_maths<FPTYPE, Device>::dylmr2(x1, npw, gk_in, &ylm_deri[ipol * x1 * npw], ipol);
+            Nonlocal_maths<FPTYPE, Device>::dylmr2(ntot_ylm, npw, q, &out[ipol * ntot_ylm * npw], ipol);
         }
     }
 
@@ -168,13 +207,16 @@ void Nonlocal_maths<FPTYPE, Device>::cal_ylm_deri(int lmax, int npw, const FPTYP
 }
 // cal_pref
 template <typename FPTYPE, typename Device>
-std::vector<std::complex<FPTYPE>> Nonlocal_maths<FPTYPE, Device>::cal_pref(int it)
+std::vector<std::complex<FPTYPE>> Nonlocal_maths<FPTYPE, Device>::cal_pref(int it, const int nh)
 {
-    const int nh = this->ucell_->atoms[it].ncpp.nh;
+    // nh is the total number of m-channels of the beta functions of atom type it; the caller now supplies it
+    // for example, if the angular momenta of the beta functions are 0, 0, 1, 1, 1, 1, then nh is
+    // 1 + 1 + 3 + 3 + 3 + 3 = 14
     std::vector<std::complex<FPTYPE>> pref(nh);
     for (int ih = 0; ih < nh; ih++)
     {
-        pref[ih] = std::pow(std::complex<FPTYPE>(0.0, -1.0), this->nlpp_->nhtol(it, ih));
+        pref[ih] = std::pow(std::complex<FPTYPE>(0.0, -1.0), this->nhtol_(it, ih));
+        // i.e. pref[ih] = (-i)^l, where l = nhtol_(it, ih) is the angular momentum of channel ih ("nh to l")
     }
     return pref;
 }
@@ -193,16 +235,16 @@ void Nonlocal_maths<FPTYPE, Device>::cal_vkb(int it,
 {
     int ih = 0;
     // loop over all beta functions
-    for (int nb = 0; nb < this->ucell_->atoms[it].ncpp.nbeta; nb++)
+    for (int ib = 0; ib < this->ucell_->atoms[it].ncpp.nbeta; ib++)
     {
-        int l = this->nlpp_->nhtol(it, ih);
+        int l = this->nhtol_(it, ih);
         // loop over all m angular momentum
         for (int m = 0; m < 2 * l + 1; m++)
         {
             int lm = l * l + m;
             std::complex<FPTYPE>* vkb_ptr = &vkb_out[ih * npw];
             const FPTYPE* ylm_ptr = &ylm_in[lm * npw];
-            const FPTYPE* vq_ptr = &vq_in[nb * npw];
+            const FPTYPE* vq_ptr = &vq_in[ib * npw];
             // loop over all G-vectors
             for (int ig = 0; ig < npw; ig++)
             {
@@ -230,12 +272,12 @@ void Nonlocal_maths<FPTYPE, Device>::cal_vkb_deri(int it,
                                                   const FPTYPE* gk_in,
                                                   std::complex<FPTYPE>* vkb_out)
 {
-    const int x1 = (this->nlpp_->lmaxkb + 1) * (this->nlpp_->lmaxkb + 1);
+    const int x1 = (this->lmax_ + 1) * (this->lmax_ + 1);
     int ih = 0;
     // loop over all beta functions
     for (int nb = 0; nb < this->ucell_->atoms[it].ncpp.nbeta; nb++)
     {
-        const int l = this->nlpp_->nhtol(it, ih);
+        const int l = this->nhtol_(it, ih);
         // loop over all m angular momentum
         for (int m = 0; m < 2 * l + 1; m++)
         {
@@ -262,7 +304,7 @@ void Nonlocal_maths<FPTYPE, Device>::cal_vkb_deri(int it,
             const FPTYPE* ylm_deri_ptr1 = &ylm_deri_in[(ipol * x1 + lm) * npw];
             const FPTYPE* ylm_deri_ptr2 = &ylm_deri_in[(jpol * x1 + lm) * npw];
             const FPTYPE* vq_deri_ptr = &vq_deri_in[nb * npw];
-            const FPTYPE* gkn = &gk_in[4 * npw];
+            const FPTYPE* qnorm = &gk_in[4 * npw];
             for (int ig = 0; ig < npw; ig++)
             {
                 vkb_ptr[ig] -= (gk_in[ig * 3 + ipol] * ylm_deri_ptr2[ig] + gk_in[ig * 3 + jpol] * ylm_deri_ptr1[ig])
@@ -273,7 +315,7 @@ void Nonlocal_maths<FPTYPE, Device>::cal_vkb_deri(int it,
             for (int ig = 0; ig < npw; ig++)
             {
                 vkb_ptr[ig] -= 2.0 * ylm_ptr[ig] * vq_deri_ptr[ig] * sk_in[ig] * pref_in[ih] * gk_in[ig * 3 + ipol]
-                               * gk_in[ig * 3 + jpol] * gkn[ig];
+                               * gk_in[ig * 3 + jpol] * qnorm[ig];
             }
             ih++;
         }
@@ -322,15 +364,16 @@ void Nonlocal_maths<FPTYPE, Device>::cal_dvkb_index(const int nbeta,
                                                     int* indexes)
 {
     int ih = 0;
-    const int x1 = (this->nlpp_->lmaxkb + 1) * (this->nlpp_->lmaxkb + 1);
+    const int x1 = (this->lmax_ + 1) * (this->lmax_ + 1);
     for (int nb = 0; nb < nbeta; nb++)
     {
         int l = nhtol[it * nhtol_nc + ih];
         for (int m = 0; m < 2 * l + 1; m++)
         {
+            //std::cout << "in function cal_dvkb_index, nhtol(" << it << ", " << ih << ") = " << l << std::endl;
             int lm = l * l + m;
-            indexes[ih * 4] = lm;
-            indexes[ih * 4 + 1] = nb;
+            indexes[ih * 4] = lm; // the index of ylm matrix, for given l and m, together with ig to get value
+            indexes[ih * 4 + 1] = nb; // the iproj of present atom type
             indexes[ih * 4 + 2] = (ipol * x1 + lm);
             indexes[ih * 4 + 3] = (jpol * x1 + lm);
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp
new file mode 100644
index 0000000000..d4b7e51b65
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp
@@ -0,0 +1,1023 @@
+#include "onsite_proj_tools.h"
+
+#include "module_base/math_polyint.h"
+#include "module_base/math_ylmreal.h"
+#include "module_base/memory.h"
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h"
+#include "nonlocal_maths.hpp"
+
+#include <numeric>
+
+namespace hamilt
+{
+template <typename FPTYPE, typename Device>
+Onsite_Proj_tools<FPTYPE, Device>::Onsite_Proj_tools(const pseudopot_cell_vnl* nlpp_in,
+                                                     const UnitCell* ucell_in,
+                                                     const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
+                                                     const K_Vectors* kv_in,
+                                                     const ModulePW::PW_Basis_K* wfc_basis_in,
+                                                     const Structure_Factor* sf_in,
+                                                     const ModuleBase::matrix& wg,
+                                                     const ModuleBase::matrix& ekb)
+    : nlpp_(nlpp_in), ucell_(ucell_in), psi_(psi_in), kv_(kv_in), wfc_basis_(wfc_basis_in), sf_(sf_in)
+{
+    // Constructor for the pseudopotential case: all projector data is unpacked from nlpp_. First get the device type.
+    this->device = base_device::get_device_type<Device>(this->ctx);
+
+    // NOTE(review): kvec_c is only stored here; it is not read anywhere in the visible part of this file.
+    this->kvec_c = this->wfc_basis_->template get_kvec_c_data<FPTYPE>();
+    // the following is important for calculating the whole contribution to
+    // Hamiltonian or force, stress: sum{nk} fnk*sum_{ij}<psi_nk|ai>Dij<aj|psi_nk>
+    // where Dij is deeq.
+    // For DFT+U and other projector-based operators, deeq is involved as well.
+    this->deeq = this->nlpp_->template get_deeq_data<FPTYPE>();
+    this->deeq_dims[0] = this->nlpp_->deeq.getBound1();
+    this->deeq_dims[1] = this->nlpp_->deeq.getBound2();
+    this->deeq_dims[2] = this->nlpp_->deeq.getBound3();
+    this->deeq_dims[3] = this->nlpp_->deeq.getBound4();
+    this->deeq_nc = this->nlpp_->template get_deeq_nc_data<FPTYPE>();
+    this->deeq_nc_dims[0] = this->nlpp_->deeq_nc.getBound1();
+    this->deeq_nc_dims[1] = this->nlpp_->deeq_nc.getBound2();
+    this->deeq_nc_dims[2] = this->nlpp_->deeq_nc.getBound3();
+    this->deeq_nc_dims[3] = this->nlpp_->deeq_nc.getBound4();
+    // ultrasoft pseudopotential
+    this->qq_nt = this->nlpp_->template get_qq_nt_data<FPTYPE>();
+    // row count of the vkb table (NOTE(review): the general constructor counts (2l+1) per projector per atom - confirm)
+    this->nkb = nlpp_->nkb;
+    // cache the sizes that dimension becp/dbecp and the work buffers
+    this->nbands = psi_->get_nbands();
+    this->max_npw = wfc_basis_->npwk_max;
+    this->ntype = ucell_->ntype;
+    // This class is meant to be reused, so the rest of the code should stay general
+    // and not depend strongly on the layout of class pseudopot_cell_vnl; therefore
+    // all needed information is unpacked here.
+    this->tabtpr = &(nlpp_->tab);
+    this->nhtol = &(nlpp_->nhtol);
+    this->lprojmax = nlpp_->lmaxkb;
+    // There is a contribution for jh != ih in the ultrasoft case or the multi-projector case.
+    // Strictly, whether the matrix is non-diagonal should be decided for every atom type.
+    this->nondiagonal = (PARAM.globalv.use_uspp || this->nlpp_->multi_proj) ? true : false;
+
+    this->nproj.resize(this->ntype);
+    std::vector<int> nch(this->ntype);
+    for (int it = 0; it < this->ntype; it++)
+    {
+        this->nproj[it] = this->ucell_->atoms[it].ncpp.nbeta;
+        nch[it] = this->ucell_->atoms[it].ncpp.nh;
+    }
+    // size the work buffers from the per-type projector counts (nproj) and channel counts (nch)
+    this->allocate_memory(wg, ekb, this->nproj, nch);
+    this->ppcell_vkb // vkb buffer that cal_becp fills and cal_dbecp_s / cal_dbecp_f overwrite
+        = (this->device == base_device::GpuDevice) ? this->nlpp_->template get_vkb_data<FPTYPE>() : this->nlpp_->vkb.c;
+}
+
+template <typename FPTYPE, typename Device>
+Onsite_Proj_tools<FPTYPE, Device>::Onsite_Proj_tools(
+    const std::vector<int>& nproj, // number of projectors for each atom type
+    const std::vector<int>& lproj, // angular momentum l of every projector, flattened in (type, projector) order
+    const ModuleBase::realArray& tab, // radials' spherical bessel transform
+    const ModuleBase::matrix& nhtol,  // (it, ich) -> l, the ich is (l, m)-distinctive index
+    std::complex<FPTYPE>* vkb_buf,    // the vkb buffer
+    const UnitCell* ucell_in,
+    const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
+    const K_Vectors* kv_in,
+    const ModulePW::PW_Basis_K* wfc_basis_in,
+    const Structure_Factor* sf_in,
+    const ModuleBase::matrix& wg,
+    const ModuleBase::matrix& ekb)
+{
+    // General-purpose constructor (the original note lists vnl, dftu, deltaspin, deepks as intended users).
+    // Unlike the pseudopot_cell_vnl overload, the projector data is passed in explicitly.
+
+    // Design notes - quantities such a constructor conceptually needs:
+    // ntype: taken here as nproj.size()
+    // nproj: number of projectors owned by each atom type
+    // projs: beta function or radial function (not passed; only its transform `tab` is)
+    // lproj: angular momentum of projectors
+    // rgrid, deeq: radial grid and Dij-like matrix (Hubbard parameters, ...) - not handled by this constructor
+
+    // Quantities the existing kernels expect:
+
+    // tab: the spherical transform of radial functions, with q = linspace(0, GlobalV::NQX, GlobalV::DQ)
+    // nhtol: the (it, ich) -> l, the ich is (l, m)-distinctive index
+    // nkb: sum over types and projectors of (2l + 1) * (number of atoms of that type), accumulated below
+    // atom_nh: # of (l, m)-distinctive projectors for each atom type
+    // h_atom_nh: counterpart of atom_nh on host
+    // max_nh: largest entry of h_atom_nh (updated in allocate_memory)
+
+    // Hence the inputs actually consumed here are:
+
+    // nproj
+    // tab (projs is not needed, should be calculated elsewhere)
+    // lproj
+    // nhtol and vkb_buf (deeq is not taken; the original note suggests passing the whole realArray)
+
+    // and the quantities derived here are:
+    // lprojmax
+    // nkb
+    // atom_nh, h_atom_nh, max_nh
+    // (deeq, deeq_dims and qq_nt are left untouched)
+
+    ucell_ = ucell_in;
+    psi_ = psi_in;
+    kv_ = kv_in;
+    wfc_basis_ = wfc_basis_in;
+    sf_ = sf_in;
+
+    this->device = base_device::get_device_type<Device>(this->ctx);
+
+    this->kvec_c = this->wfc_basis_->template get_kvec_c_data<FPTYPE>();
+    // skip deeq, qq_nt
+    this->nbands = psi_->get_nbands();
+    this->max_npw = wfc_basis_->npwk_max;
+    this->ntype = nproj.size();
+    this->tabtpr = &tab;
+
+    this->nhtol = &nhtol;
+    this->lprojmax = lproj.empty() ? 0 : *std::max_element(lproj.begin(), lproj.end()); // empty list: never deref end()
+    this->nondiagonal = false;
+
+    this->nkb = 0;
+    this->h_atom_nh.resize(this->ntype, 0);
+    int iproj = 0;
+    for (int it = 0; it < this->ntype; it++)
+    {
+        int nproj_it = nproj[it];
+        for (int ip = 0; ip < nproj_it; ip++)
+        {
+            this->h_atom_nh[it] += 2 * lproj[iproj] + 1;
+            this->nkb += (2 * lproj[iproj] + 1) * this->ucell_->atoms[it].na;
+            iproj++;
+        }
+    }
+    this->nproj = nproj;
+    this->allocate_memory(wg, ekb, nproj, this->h_atom_nh);
+    // caller-provided vkb work buffer; NOTE(review): unlike the other constructor, no device-specific pointer is chosen
+    this->ppcell_vkb = vkb_buf;
+}
+
+template <typename FPTYPE, typename Device>
+Onsite_Proj_tools<FPTYPE, Device>::~Onsite_Proj_tools()
+{
+    // every owned buffer is released in one place
+    this->delete_memory();
+}
+
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::allocate_memory(const ModuleBase::matrix& wg,
+                                                        const ModuleBase::matrix& ekb,
+                                                        const std::vector<int>& nproj,
+                                                        const std::vector<int>& nch)
+{
+    // Record per-type counts, size the work buffers and, on GPU, copy wg/ekb and the counts to the device.
+
+    // host-side copies of the per-type channel count (nch) and atom count (ucell_->atoms[it].na)
+    this->h_atom_nh.resize(this->ntype);
+    this->h_atom_na.resize(this->ntype);
+    for (int it = 0; it < this->ntype; it++)
+    {
+        h_atom_nh[it] = nch[it];
+        h_atom_na[it] = this->ucell_->atoms[it].na;
+    }
+
+    int nprojmax = 0;
+    for (int it = 0; it < this->ntype; it++) // loop all elements
+    {
+        nprojmax = std::max(nproj[it], nprojmax); // largest projector count of any type; sizes hd_vq / hd_vq_deri
+        this->max_nh = std::max(h_atom_nh[it], max_nh); // NOTE(review): reads max_nh's prior value - confirm its initial value
+    }
+
+    // index buffer for the vkb kernels: a separate allocation on GPU only (cal_becp points it at dvkb_indexes on CPU)
+    if (this->device == base_device::GpuDevice)
+    {
+        resmem_int_op()(this->ctx, this->d_dvkb_indexes, max_nh * 4);
+    }
+
+    resmem_var_op()(this->ctx, this->hd_vq, nprojmax * max_npw);
+    resmem_var_op()(this->ctx, this->hd_vq_deri, nprojmax * max_npw);
+    resmem_var_op()(this->ctx, this->hd_ylm, (lprojmax + 1) * (lprojmax + 1) * max_npw);
+    resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (lprojmax + 1) * (lprojmax + 1) * max_npw);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc);
+        resmem_var_op()(this->ctx, d_ekb, ekb.nr * ekb.nc);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, ekb.c, ekb.nr * ekb.nc);
+        resmem_int_op()(this->ctx, atom_nh, this->ntype);
+        resmem_int_op()(this->ctx, atom_na, this->ntype);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_na, h_atom_na.data(), this->ntype);
+
+        resmem_var_op()(this->ctx, d_g_plus_k, max_npw * 5);
+        resmem_var_op()(this->ctx, d_pref, max_nh);
+        resmem_var_op()(this->ctx, d_vq_tab, this->tabtpr->getSize());
+        resmem_complex_op()(this->ctx, d_pref_in, max_nh);
+    }
+    else
+    {
+        this->d_wg = wg.c; // CPU: point at the caller's arrays, nothing is copied, so wg and ekb must stay alive
+        this->d_ekb = ekb.c;
+        this->atom_nh = h_atom_nh.data();
+        this->atom_na = h_atom_na.data();
+    }
+}
+
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::delete_memory()
+{
+    // Release the buffers acquired in allocate_memory and, lazily, in cal_becp / cal_dbecp_* / transfer_gcar.
+
+    delmem_var_op()(this->ctx, hd_vq);
+    delmem_var_op()(this->ctx, hd_vq_deri);
+    delmem_var_op()(this->ctx, hd_ylm);
+    delmem_var_op()(this->ctx, hd_ylm_deri);
+
+    // these are separate allocations only on GPU; on CPU the same pointers refer to host containers and are not freed
+    if (this->device == base_device::GpuDevice)
+    {
+        delmem_var_op()(this->ctx, d_wg);
+        delmem_var_op()(this->ctx, d_ekb);
+        delmem_int_op()(this->ctx, atom_nh);
+        delmem_int_op()(this->ctx, atom_na);
+        delmem_var_op()(this->ctx, d_g_plus_k);
+        delmem_var_op()(this->ctx, d_pref);
+        delmem_var_op()(this->ctx, d_vq_tab);
+        delmem_complex_op()(this->ctx, this->d_pref_in);
+        delmem_int_op()(this->ctx, d_dvkb_indexes);
+    }
+
+    if (becp != nullptr)
+    {
+        delmem_complex_op()(this->ctx, becp);
+        delmem_complex_op()(this->ctx, hd_sk); // hd_sk is allocated in cal_becp, in the branch that also ensures becp
+    }
+    if (dbecp != nullptr)
+    {
+        delmem_complex_op()(this->ctx, dbecp);
+    }
+    if (this->pre_ik_f != -1) // cal_dbecp_f has run, so gcar, gcar_zero_indexes and vkb_save were allocated
+    {
+        delmem_int_op()(this->ctx, gcar_zero_indexes);
+        delmem_complex_op()(this->ctx, vkb_save);
+        delmem_var_op()(this->ctx, gcar);
+    }
+}
+
+// cal_becp: becp = vkb^H * psi for k-point ik, then summed with Parallel_Reduce::reduce_pool.
+// vkb is an (nkb, npw) table that is rebuilt only when ik differs from current_ik;
+// the product itself is a plain matrix multiplication (nkb, npw) * (npw, npm * npol) -> (nkb, npm * npol)
+// and does not depend on what the projectors physically are.
+// Design note: the vkb index generation should be maintained elsewhere.
+// vkb already carries the atomic position information, since it is built from vq and sk;
+// the multiplication with sk should live inside the specific operator
+// because the atom selection task is operator-specific.
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_becp(int ik,
+                                                 int npm, // number of bands multiplied (gemm uses npm * npol columns)
+                                                 std::complex<FPTYPE>* becp_in, // output; nullptr -> use this->becp
+                                                 const std::complex<FPTYPE>* ppsi_in) // nullptr -> psi_ at k-point ik
+{
+    ModuleBase::TITLE("Onsite_Proj_tools", "cal_becp");
+    ModuleBase::timer::tick("Onsite_Proj_tools", "cal_becp");
+
+    const int npol = this->ucell_->get_npol();
+    const std::complex<FPTYPE>* ppsi = ppsi_in == nullptr ? &(this->psi_[0](ik, 0, 0)) : ppsi_in;
+    const int npw = this->wfc_basis_->npwk[ik];
+    if (becp_in == nullptr && this->becp == nullptr)
+    {
+        resmem_complex_op()(this->ctx, becp, this->nbands * npol * this->nkb);
+    }
+    std::complex<FPTYPE>* becp_tmp = becp_in == nullptr ? this->becp : becp_in;
+    const int size_becp_act = npm * npol * this->nkb;
+    if (ik != this->current_ik) // different ik, need to recalculate vkb
+    {
+        const int size_becp = this->nbands * npol * this->nkb;
+        if (this->becp == nullptr)
+        {
+            resmem_complex_op()(this->ctx, becp, size_becp);
+        }
+
+        // math helper built from nhtol, lprojmax and the unit cell
+        Nonlocal_maths<FPTYPE, Device> maths(*(this->nhtol), this->lprojmax, this->ucell_);
+
+        std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
+
+        // calculate G+K (kept in the member g_plus_k; cal_dbecp_s reads it later)
+        this->g_plus_k = maths.cal_gk(ik, this->wfc_basis_);
+        FPTYPE *gk = g_plus_k.data(), *vq_tb = this->tabtpr->ptr;
+        // vq_tb has dimension (ntype, nproj, GlobalV::NQX)
+
+        // calculate the structure factor sk, one block of npw entries per atom
+        resmem_complex_op()(ctx, hd_sk, this->ucell_->nat * npw);
+        this->sf_->get_sk(ctx, ik, this->wfc_basis_, hd_sk);
+        std::complex<FPTYPE>* d_sk = this->hd_sk;
+        // prepare ylm, size: (lmax+1)^2 * this->max_npw
+        const int lmax_ = this->lprojmax;
+        maths.cal_ylm(lmax_, npw, g_plus_k.data(), hd_ylm);
+
+        // DEBUG: TO CHECK ylm VALUES, UNCOMMENT THE FOLLOWING LINES
+        // std::vector<ModuleBase::Vector3<double>> qs(npw);
+        // for (int ig = 0; ig < npw; ig++)
+        // {
+        //     qs[ig] = this->wfc_basis_->getgpluskcar(ik, ig);
+        // }
+        // const int total_lm = (lmax_ + 1) * (lmax_ + 1);
+        // ModuleBase::matrix ylmref(total_lm, npw);
+        // ModuleBase::YlmReal::Ylm_Real(total_lm, npw, qs.data(), ylmref);
+        // std::cout << "Compare the Ylm values of two methods:" << std::endl;
+        // int lm = 0;
+        // for(int l_ = 0; l_ < lmax_ + 1; l_++)
+        // {
+        //     for(int m_ = -l_; m_ <= l_; m_++)
+        //     {
+        //         std::cout << "l = " << l_ << " m = " << m_ << std::endl;
+        //         lm = l_ * l_ + l_ + m_;
+        //         for(int ig = 0; ig < npw; ig++)
+        //         {
+        //             std::cout << "[" << ylmref(lm, ig) << " " << hd_ylm[lm * npw + ig] << "]" << " ";
+        //         }
+        //         std::cout << std::endl;
+        //     }
+        //     std::cout << std::endl;
+        // }
+        // ModuleBase::WARNING_QUIT("Onsite_Proj_tools", "cal_becp");
+
+        if (this->device == base_device::GpuDevice)
+        {
+            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_g_plus_k, g_plus_k.data(), g_plus_k.size());
+            syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vq_tab, this->tabtpr->ptr, this->tabtpr->getSize());
+            gk = d_g_plus_k;
+            vq_tb = d_vq_tab;
+        }
+
+        // int vkb_size = 0;
+        for (int it = 0; it < this->ucell_->ntype; it++) // loop all elements
+        {
+            // interpolate (it, 0..nproj[it], 0..npw) to get hd_vq
+            cal_vq_op()(this->ctx,
+                        vq_tb, // its data is correct, dimension (ntype, nprojmax, GlobalV::NQX)
+                        it,    // but maybe it is (ntype, nprojmax*npol, GlobalV::NQX)
+                        gk,
+                        npw,
+                        this->tabtpr->getBound2(),
+                        this->tabtpr->getBound3(),
+                        PARAM.globalv.dq,
+                        nproj[it],
+                        hd_vq); // hd_vq has dimension (nprojmax, npwx), this size will be the largest needed.
+
+            // DEBUG: TO CHECK vq VALUES, UNCOMMENT THE FOLLOWING LINES
+            // for(int ip = 0; ip < nproj[it]; ip++)
+            // {
+            //     std::cout << "projector #" << ip << " of atom type " << it << std::endl;
+            //     for(int iq = 0; iq < npw; iq++)
+            //     {
+            //         std::cout << hd_vq[ip * npw + iq] << " ";
+            //     }
+            //     std::cout << std::endl;
+            // }
+            // std::cout << std::endl;
+
+            // prepare (-i)^l, size: nh
+            std::vector<std::complex<double>> pref = maths.cal_pref(it, h_atom_nh[it]);
+            const int nh = pref.size();
+            this->dvkb_indexes.resize(nh * 4);
+            // to print the value of nhtol:
+            // nhtol->print(std::cout); // as checked, nhtol works as expected
+            maths.cal_dvkb_index(nproj[it], this->nhtol->c, this->nhtol->nc, npw, it, 0, 0, this->dvkb_indexes.data());
+
+            if (this->device == base_device::GpuDevice)
+            {
+                syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
+                syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh);
+            }
+
+            for (int ia = 0; ia < h_atom_na[it]; ia++)
+            {
+                if (this->device == base_device::CpuDevice)
+                {
+                    d_pref_in = pref.data();
+                    d_dvkb_indexes = dvkb_indexes.data();
+                }
+                cal_vkb_op()(this->ctx, nh, npw, d_dvkb_indexes, hd_vq, hd_ylm, d_sk, d_pref_in, vkb_ptr);
+                vkb_ptr += nh * npw; // vkb_ptr has dimension (nhtot, npwx), this size will be the largest needed.
+                d_sk += npw;
+                // vkb_size += nh * npw;
+            }
+        }
+        this->current_ik = ik;
+    }
+    // DEBUG: TO CHECK vkb VALUES, UNCOMMENT THE FOLLOWING LINES
+    // for(int i = 0; i < vkb_size; i++)
+    // {
+    //     if (i % npw == 0)
+    //     {
+    //         std::cout << "The #" << i / npw << " projector" << std::endl;
+    //     }
+    //     std::cout << this->ppcell_vkb[i] << " ";
+    // }
+    // std::cout << std::endl;
+    // ModuleBase::WARNING_QUIT("Onsite_Proj_tools", "cal_becp");
+
+    // PLAN: separate the lower and upper into two parts, individually called.
+    const char transa = 'C';
+    const char transb = 'N';
+    int npm_npol = npm * npol;
+    gemm_op()(this->ctx,
+              transa,
+              transb,
+              this->nkb,
+              npm_npol, // nbands(occ)*npol
+              npw,
+              &ModuleBase::ONE,
+              this->ppcell_vkb,
+              npw,
+              ppsi,
+              this->max_npw,
+              &ModuleBase::ZERO,
+              becp_tmp,
+              this->nkb);
+
+    if (this->device == base_device::GpuDevice)
+    {
+        std::complex<FPTYPE>* h_becp = nullptr;
+        resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act);
+        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp_tmp, size_becp_act);
+        Parallel_Reduce::reduce_pool(h_becp, size_becp_act);
+        syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp_tmp, h_becp, size_becp_act);
+        delmem_complex_h_op()(this->cpu_ctx, h_becp);
+    }
+    else
+    {
+        Parallel_Reduce::reduce_pool(becp_tmp, size_becp_act);
+    }
+    // DEBUG: TO CHECK becp VALUES, UNCOMMENT THE FOLLOWING LINES
+    // std::cout << "ik: " << ik << std::endl;
+    // for (int i = 0; i < npm_npol*this->nkb; i++)
+    // {
+    //     std::cout << "becp[" << i << "]: " << becp[i] << std::endl;
+    // }
+    ModuleBase::timer::tick("Onsite_Proj_tools", "cal_becp");
+}
+
+// cal_dbecp_s: dbecp = (vkb derivative for the (ipol, jpol) component)^H * psi at k-point ik (the stress variant).
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_dbecp_s(int ik, int npm, int ipol, int jpol)
+{
+    ModuleBase::TITLE("Onsite_Proj_tools", "cal_dbecp_s");
+    ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_s");
+    this->current_ik = -1; // reset the current ik, vkb has been reused to save dvkb
+    const int npol = this->ucell_->get_npol();
+    const int size_becp = this->nbands * npol * this->nkb;
+    const int npm_npol = npm * npol;
+    if (this->dbecp == nullptr) // buffer is shared with cal_dbecp_f, which writes at dbecp + ipol * size_becp
+    {
+        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
+    }
+
+    // math helper built from nhtol, lprojmax and the unit cell
+    Nonlocal_maths<FPTYPE, Device> maths(*(this->nhtol), this->lprojmax, this->ucell_);
+
+    const std::complex<FPTYPE>* ppsi = &(this->psi_[0](ik, 0, 0));
+    const int npw = this->wfc_basis_->npwk[ik];
+    std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
+
+    if (this->pre_ik_s != ik)
+    { // k point has changed, only the ylm derivatives are refreshed here
+        // this->g_plus_k = maths.cal_gk(ik, this->wfc_basis_); //has been calculated by cal_becp
+
+        const int lmax_ = this->lprojmax;
+        // prepare ylm, size: (lmax+1)^2 * this->max_npw
+        // maths.cal_ylm(lmax_, npw, g_plus_k.data(), hd_ylm); //has been calculated by cal_becp
+        maths.cal_ylm_deri(lmax_, npw, g_plus_k.data(), hd_ylm_deri);
+        this->pre_ik_s = ik;
+    }
+    FPTYPE *gk = g_plus_k.data(), *vq_tb = this->tabtpr->ptr;
+    std::complex<FPTYPE>* d_sk = this->hd_sk; // g_plus_k, hd_ylm and hd_sk are not recomputed here: cal_becp(ik) must run first
+    if (this->device == base_device::GpuDevice)
+    {
+        gk = d_g_plus_k;
+        vq_tb = d_vq_tab;
+    }
+
+    for (int it = 0; it < this->ucell_->ntype; it++) // loop all elements
+    {
+        cal_vq_op()(this->ctx,
+                    vq_tb,
+                    it,
+                    gk,
+                    npw,
+                    this->tabtpr->getBound2(),
+                    this->tabtpr->getBound3(),
+                    PARAM.globalv.dq,
+                    this->nproj[it],
+                    hd_vq);
+        cal_vq_deri_op()(this->ctx,
+                         vq_tb,
+                         it,
+                         gk,
+                         npw,
+                         this->tabtpr->getBound2(),
+                         this->tabtpr->getBound3(),
+                         PARAM.globalv.dq,
+                         this->nproj[it],
+                         hd_vq_deri);
+
+        // prepare (-i)^l, size: nh
+        std::vector<std::complex<double>> pref = maths.cal_pref(it, h_atom_nh[it]);
+        int nh = pref.size();
+        // prepare indexes for calculate vkb_deri
+        this->dvkb_indexes.resize(nh * 4);
+        maths.cal_dvkb_index(this->nproj[it],
+                             this->nhtol->c,
+                             this->nhtol->nc,
+                             npw,
+                             it,
+                             ipol,
+                             jpol,
+                             this->dvkb_indexes.data());
+        if (this->device == base_device::GpuDevice)
+        {
+            syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4);
+            syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh);
+        }
+        for (int ia = 0; ia < h_atom_na[it]; ia++)
+        {
+            // 2. calculate dbecp:
+            // 2.a. calculate dbecp_noevc, reusing the memory of ppcell.vkb
+
+            if (this->device == base_device::CpuDevice)
+            {
+                d_dvkb_indexes = dvkb_indexes.data();
+                d_pref_in = pref.data();
+                d_g_plus_k = g_plus_k.data();
+            }
+            cal_vkb_deri_op()(this->ctx,
+                              nh,
+                              npw,
+                              ipol,
+                              jpol,
+                              d_dvkb_indexes,
+                              hd_vq,
+                              hd_vq_deri,
+                              hd_ylm,
+                              hd_ylm_deri,
+                              d_sk,
+                              d_pref_in,
+                              d_g_plus_k,
+                              vkb_deri_ptr);
+            d_sk += npw;
+            vkb_deri_ptr += nh * npw;
+        }
+    }
+    // 2.b calculate dbecp = dbecp_noevc^H * psi (only the first npm_npol * nkb entries of dbecp are written)
+    const char transa = 'C';
+    const char transb = 'N';
+
+    gemm_op()(this->ctx,
+              transa,
+              transb,
+              nkb,
+              npm_npol,
+              npw,
+              &ModuleBase::ONE,
+              ppcell_vkb,
+              npw,
+              ppsi,
+              this->max_npw,
+              &ModuleBase::ZERO,
+              dbecp,
+              nkb);
+    ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_s");
+}
+
+// cal_dbecp_f: one Cartesian component (ipol) of dbecp at k-point ik, for forces.
+// starts from the vkb (nkb, ng) table left in ppcell_vkb;
+// it is again merely a matrix multiplication (vkb, ng) * (ng, nbands) -> (vkb, nbands).
+// Selected vkb entries are backed up (save_vkb) and the buffer is reused to hold ONE COMPONENT of the derivative.
+// The direction of force is indexed by ipol (for stress, there are two, ipol and jpol).
+// the dbecp_f is simply the becp multiplied with -i(G+k)_i
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_dbecp_f(int ik, int npm, int ipol)
+{
+    ModuleBase::TITLE("Onsite_Proj_tools", "cal_dbecp_f");
+    ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_f");
+
+    this->current_ik = -1; // reset the current ik, vkb has been reused to save dvkb
+
+    const int npw = this->wfc_basis_->npwk[ik];
+
+    // STAGE1: calculate dvkb_f
+    // gcar is stored as gcarx, gcary/gcarx and gcarz/gcary (see transfer_gcar)
+    if (this->pre_ik_f == -1) // if it is the very first run, we allocate
+    {
+        resmem_var_op()(this->ctx, gcar, 3 * this->wfc_basis_->npwk_max);
+        resmem_int_op()(this->ctx, gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max);
+    }
+    // refresh gcar, gcar_zero_indexes and gcar_zero_counts when the k-point changed
+    if (this->pre_ik_f != ik)
+    { // the following lines will cause UNDEFINED BEHAVIOR because memory layout of vector3 instance
+      // is assumed to be always contiguous but it is not guaranteed.
+        this->transfer_gcar(npw,
+                            this->wfc_basis_->npwk_max,
+                            &(this->wfc_basis_->gcar[ik * this->wfc_basis_->npwk_max].x));
+    }
+
+    // back up the vkb values at the recorded zero-gcar indexes to vkb_save
+    this->save_vkb(npw, ipol);
+    // for x the coef is -i, for y and z it is 1 (NOTE(review): presumably because gcar holds ratios y/x, z/y - confirm)
+    const std::complex<double> coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE;
+
+    const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
+    std::complex<FPTYPE>* vkb_deri_ptr = this->ppcell_vkb;
+    // calculate the vkb_deri for ipol in place: input and output both point at ppcell_vkb
+    cal_vkb1_nl_op<FPTYPE, Device>()(this->ctx, nkb, npw, npw, npw, ipol, coeff, vkb_ptr, gcar, vkb_deri_ptr);
+
+    // ------------------------------------------------------------------------------->8
+
+    // STAGE2: calculate dbecp_f
+    // NPOL
+    // either 1 or 2, for NSPIN 1, 2 or 4 calculation
+    // once NSPIN 4, there are doubled number of pw in each "row" of psi
+    // on the other hand, for NSPIN 4 calculation, the number of bands is also doubled
+    const int npol = this->ucell_->get_npol();
+    const int npm_npol = npm * npol;
+    const int size_becp = this->nbands * npol * this->nkb;
+    if (this->dbecp == nullptr) // if it is the very first run, we allocate
+    {                           // room for all three components; each ipol writes its own size_becp slice
+        resmem_complex_op()(this->ctx, dbecp, 3 * size_becp);
+    }
+    // do gemm to get dbecp and revert the ppcell_vkb for next ipol
+    const std::complex<FPTYPE>* ppsi = &(this->psi_[0](ik, 0, 0));
+    // move the pointer to corresponding read&write position, according to ipol
+    std::complex<FPTYPE>* dbecp_ptr = this->dbecp + ipol * size_becp; // [out]
+    const char transa = 'C';
+    const char transb = 'N';
+    gemm_op()(this->ctx,
+              transa,
+              transb,
+              this->nkb,
+              npm_npol,
+              npw,
+              &ModuleBase::ONE,
+              vkb_deri_ptr,
+              npw,
+              ppsi,
+              this->max_npw,
+              &ModuleBase::ZERO,
+              dbecp_ptr,
+              nkb);
+    this->revert_vkb(npw, ipol);
+    this->pre_ik_f = ik;
+    ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_f");
+}
+
+// save_vkb: copy the ppcell_vkb entries at the indexes recorded for direction ipol into vkb_save.
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::save_vkb(int npw, int ipol)
+{
+    if (this->device == base_device::CpuDevice)
+    {
+        const int gcar_zero_count = this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max]; // slot 0 = count
+        const int* gcar_zero_ptrs = &this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max + 1];
+        const std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
+        std::complex<FPTYPE>* vkb_save_ptr = this->vkb_save;
+        // for every projector row, pack the values at the recorded indexes contiguously into vkb_save
+        for (int ikb = 0; ikb < this->nkb; ++ikb)
+        {
+            for (int icount = 0; icount < gcar_zero_count; ++icount)
+            {
+                *vkb_save_ptr = vkb_ptr[gcar_zero_ptrs[icount]];
+                ++vkb_save_ptr;
+            }
+            vkb_ptr += npw; // next row of the (nkb, npw) table
+        }
+    }
+    else
+    {
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+        saveVkbValues<FPTYPE>(this->gcar_zero_indexes,
+                              this->ppcell_vkb,
+                              this->vkb_save,
+                              nkb,
+                              this->gcar_zero_counts[ipol],
+                              npw,
+                              ipol,
+                              this->wfc_basis_->npwk_max);
+#endif
+    }
+}
+
+// revert_vkb: write the values kept in vkb_save, multiplied by coeff, back into ppcell_vkb for direction ipol.
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::revert_vkb(int npw, int ipol)
+{
+    const std::complex<FPTYPE> coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE;
+    if (this->device == base_device::CpuDevice)
+    {
+        const int gcar_zero_count = this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max]; // slot 0 = count
+        const int* gcar_zero_ptrs = &this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max + 1];
+        std::complex<FPTYPE>* vkb_ptr = this->ppcell_vkb;
+        const std::complex<FPTYPE>* vkb_save_ptr = this->vkb_save;
+        // for every projector row, restore the saved values (times coeff) at the recorded indexes
+        for (int ikb = 0; ikb < this->nkb; ++ikb)
+        {
+            for (int icount = 0; icount < gcar_zero_count; ++icount)
+            {
+                vkb_ptr[gcar_zero_ptrs[icount]] = *vkb_save_ptr * coeff;
+                ++vkb_save_ptr;
+            }
+            vkb_ptr += npw; // next row of the (nkb, npw) table
+        }
+    }
+    else
+    {
+#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM
+        revertVkbValues<FPTYPE>(this->gcar_zero_indexes,
+                                this->ppcell_vkb,
+                                this->vkb_save,
+                                nkb,
+                                this->gcar_zero_counts[ipol],
+                                npw,
+                                ipol,
+                                this->wfc_basis_->npwk_max,
+                                coeff);
+#endif
+    }
+}
+
+template <typename FPTYPE, typename Device> // builds gcar (x, y/x, z/y form), the zero-index lists and sizes vkb_save
+void Onsite_Proj_tools<FPTYPE, Device>::transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in)
+{
+    std::vector<FPTYPE> gcar_tmp(3 * npw_max); // [out], will overwrite this->gcar
+    gcar_tmp.assign(gcar_in,
+                    gcar_in + 3 * npw_max); // UNDEFINED BEHAVIOR!!! the memory layout of vector3 is not guaranteed
+    std::vector<int> gcar_zero_indexes_tmp(3 * npw_max); // per direction: slot 0 = count, then the zero indexes
+
+    int* gcar_zero_ptrs[3];
+    for (int i = 0; i < 3; i++)
+    {
+        gcar_zero_ptrs[i] = &gcar_zero_indexes_tmp[i * npw_max];
+        gcar_zero_ptrs[i][0] = -1; // sentinel: while the count is 0, the "latest zero == ig" tests below are false
+        this->gcar_zero_counts[i] = 0;
+    }
+    for (int ig = 0; ig < npw; ig++)
+    {
+        // calculate gcar.x , gcar.y/gcar.x, gcar.z/gcar.y
+        // if an individual gcar component is smaller than 1e-15 in magnitude, its index is recorded
+        for (int i = 0; i < 3; ++i)
+        {
+            if (std::abs(gcar_tmp[ig * 3 + i]) < 1e-15)
+            {
+                ++gcar_zero_counts[i]; // num of zeros on each direction
+                gcar_zero_ptrs[i][gcar_zero_counts[i]] = ig;
+            }
+        }
+        // four cases for y and z; "latest recorded zero index == ig" means that component is zero at this ig
+        if (gcar_zero_ptrs[0][gcar_zero_counts[0]] == ig && gcar_zero_ptrs[1][gcar_zero_counts[1]] == ig)
+        { // x == y == 0, z = z
+        }
+        else if (gcar_zero_ptrs[0][gcar_zero_counts[0]] != ig && gcar_zero_ptrs[1][gcar_zero_counts[1]] == ig)
+        { // x != 0, y == 0, z = z/x
+            gcar_tmp[ig * 3 + 2] /= gcar_tmp[ig * 3];
+        }
+        else if (gcar_zero_ptrs[0][gcar_zero_counts[0]] == ig && gcar_zero_ptrs[1][gcar_zero_counts[1]] != ig)
+        { // x == 0, y != 0, y = y, z = z/y
+            gcar_tmp[ig * 3 + 2] /= gcar_tmp[ig * 3 + 1];
+        }
+        else
+        { // x != 0, y != 0, y = y/x, z = z/y
+            gcar_tmp[ig * 3 + 2] /= gcar_tmp[ig * 3 + 1];
+            gcar_tmp[ig * 3 + 1] /= gcar_tmp[ig * 3];
+        }
+    }
+    for (int i = 0; i < 3; ++i)
+    { // record the counts to the first element
+        gcar_zero_ptrs[i][0] = gcar_zero_counts[i];
+    }
+    // size vkb_save for the direction with the most zeros: nkb rows of max_count values
+    const int max_count = std::max(gcar_zero_counts[0], std::max(gcar_zero_counts[1], gcar_zero_counts[2]));
+    resmem_complex_op()(this->ctx, this->vkb_save, this->nkb * max_count);
+    // transfer the gcar and gcar_zero_indexes to the device
+    syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar, gcar_tmp.data(), 3 * npw_max);
+    syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max);
+}
+
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dftu(int ik,
+                                                       int npm,
+                                                       FPTYPE* force,
+                                                       const int* orbital_corr,
+                                                       const std::complex<FPTYPE>* vu,
+                                                       const int size_vu,
+                                                       const FPTYPE* h_wg)
+{
+    // Forwards the DFT+U inputs (vu with size_vu entries, orbital_corr with one entry per atom type) and
+    // the band weights h_wg to cal_force_nl_op, together with the stored becp/dbecp; the operator
+    // receives `force` as its last argument.
+    // CPU: the host arrays are used in place. GPU: temporary device copies are staged and freed below.
+    const int n_axes = 3; // passed to the operator as the number of force components
+    int* d_orbital_corr = nullptr;
+    std::complex<FPTYPE>* d_vu = nullptr;
+#if defined(__CUDA) || defined(__ROCM)
+    const bool on_gpu = (this->device == base_device::GpuDevice);
+    if (on_gpu)
+    {
+        const int n_types = this->ucell_->ntype;
+        resmem_int_op()(this->ctx, d_orbital_corr, n_types);
+        syncmem_int_h2d_op()(this->ctx, cpu_ctx, d_orbital_corr, orbital_corr, n_types);
+        resmem_complex_op()(this->ctx, d_vu, size_vu);
+        syncmem_complex_h2d_op()(this->ctx, cpu_ctx, d_vu, vu, size_vu);
+        // refresh the first nbands * (ik + 1) weights held in the member buffer d_wg
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik + 1));
+    }
+    else
+#endif
+    {
+        // no copies: alias the caller's arrays (this repoints the member d_wg at h_wg)
+        d_orbital_corr = const_cast<int*>(orbital_corr);
+        d_vu = const_cast<std::complex<FPTYPE>*>(vu);
+        d_wg = const_cast<FPTYPE*>(h_wg);
+    }
+    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                      npm, this->nbands, this->ntype,
+                                      n_axes, this->nbands, ik, nkb,
+                                      atom_nh, atom_na,
+                                      this->ucell_->tpiba,
+                                      d_wg, d_vu, d_orbital_corr,
+                                      becp, dbecp,
+                                      force);
+#if defined(__CUDA) || defined(__ROCM)
+    // only the temporaries staged above are released; d_wg is a member and is not freed here
+    if (on_gpu)
+    {
+        delmem_complex_op()(this->ctx, d_vu);
+        delmem_int_op()(this->ctx, d_orbital_corr);
+    }
+#endif
+}
+
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_force_dspin(int ik,
+                                                        int npm,
+                                                        FPTYPE* force,
+                                                        const ModuleBase::Vector3<double>* lambda,
+                                                        const FPTYPE* h_wg)
+{
+    // Forwards the per-atom vectors `lambda` (flattened to x, y, z triples) and the band weights h_wg to
+    // cal_force_nl_op, together with the stored becp/dbecp; `force` is the operator's last argument.
+    // CPU: host data is used in place. GPU: a temporary device copy of lambda is staged and freed below.
+    const int n_lambda = this->ucell_->nat * 3;
+    std::vector<FPTYPE> h_lambda(n_lambda);
+    for (int iat = 0; iat < this->ucell_->nat; ++iat)
+    {
+        FPTYPE* dst = &h_lambda[iat * 3];
+        dst[0] = lambda[iat].x;
+        dst[1] = lambda[iat].y;
+        dst[2] = lambda[iat].z;
+    }
+    const int n_axes = 3; // passed to the operator as the number of force components
+    FPTYPE* d_lambda = nullptr;
+#if defined(__CUDA) || defined(__ROCM)
+    const bool on_gpu = (this->device == base_device::GpuDevice);
+    if (on_gpu)
+    {
+        // stage lambda on the device and refresh the first nbands * (ik + 1) weights in d_wg
+        resmem_var_op()(this->ctx, d_lambda, n_lambda);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_lambda, h_lambda.data(), n_lambda);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik + 1));
+    }
+    else
+#endif
+    {
+        // no copies: use the flattened host vector and alias h_wg through the member d_wg
+        d_lambda = h_lambda.data();
+        d_wg = const_cast<FPTYPE*>(h_wg);
+    }
+    cal_force_nl_op<FPTYPE, Device>()(this->ctx,
+                                      npm, this->nbands, this->ntype,
+                                      n_axes, this->nbands, ik, nkb,
+                                      atom_nh, atom_na,
+                                      this->ucell_->tpiba,
+                                      d_wg, d_lambda,
+                                      becp, dbecp,
+                                      force);
+#if defined(__CUDA) || defined(__ROCM)
+    // release the staged copy of lambda; d_wg is a member and is not freed here
+    if (on_gpu)
+    {
+        delmem_var_op()(this->ctx, d_lambda);
+    }
+#endif
+}
+
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dftu(int ik,
+                                                        int npm,
+                                                        FPTYPE* stress,
+                                                        const int* orbital_corr,
+                                                        const std::complex<FPTYPE>* vu,
+                                                        const int size_vu,
+                                                        const FPTYPE* h_wg)
+{
+    // Stress counterpart of cal_force_dftu: forwards vu (size_vu entries), orbital_corr (one entry per
+    // atom type) and the band weights h_wg to cal_stress_nl_op together with the stored becp/dbecp;
+    // `stress` is the operator's last argument.
+    int* d_orbital_corr = nullptr;
+    std::complex<FPTYPE>* d_vu = nullptr;
+#if defined(__CUDA) || defined(__ROCM)
+    const bool on_gpu = (this->device == base_device::GpuDevice);
+    if (on_gpu)
+    {
+        const int n_types = this->ucell_->ntype;
+        resmem_int_op()(this->ctx, d_orbital_corr, n_types);
+        syncmem_int_h2d_op()(this->ctx, cpu_ctx, d_orbital_corr, orbital_corr, n_types);
+        resmem_complex_op()(this->ctx, d_vu, size_vu);
+        syncmem_complex_h2d_op()(this->ctx, cpu_ctx, d_vu, vu, size_vu);
+        // refresh the first nbands * (ik + 1) weights held in the member buffer d_wg
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik + 1));
+    }
+    else
+#endif
+    {
+        // no copies: alias the caller's arrays (this repoints the member d_wg at h_wg)
+        d_orbital_corr = const_cast<int*>(orbital_corr);
+        d_vu = const_cast<std::complex<FPTYPE>*>(vu);
+        d_wg = const_cast<FPTYPE*>(h_wg);
+    }
+    cal_stress_nl_op()(this->ctx,
+                       nkb, npm, this->ntype, this->nbands, ik,
+                       atom_nh, atom_na,
+                       d_wg, d_vu, d_orbital_corr,
+                       becp, dbecp,
+                       stress);
+#if defined(__CUDA) || defined(__ROCM)
+    // only the temporaries staged above are released; d_wg is a member and is not freed here
+    if (on_gpu)
+    {
+        delmem_complex_op()(this->ctx, d_vu);
+        delmem_int_op()(this->ctx, d_orbital_corr);
+    }
+#endif
+}
+
+template <typename FPTYPE, typename Device>
+void Onsite_Proj_tools<FPTYPE, Device>::cal_stress_dspin(int ik,
+                                                         int npm,
+                                                         FPTYPE* stress,
+                                                         const ModuleBase::Vector3<double>* lambda,
+                                                         const FPTYPE* h_wg)
+{
+    // Forwards the per-atom vectors `lambda` (flattened to x, y, z triples) and the band weights h_wg to
+    // cal_stress_nl_op, together with the stored becp/dbecp; `stress` is the operator's last argument.
+    std::vector<FPTYPE> lambda_array(this->ucell_->nat * 3);
+    for (int iat = 0; iat < this->ucell_->nat; iat++)
+    {
+        lambda_array[iat * 3] = lambda[iat].x;
+        lambda_array[iat * 3 + 1] = lambda[iat].y;
+        lambda_array[iat * 3 + 2] = lambda[iat].z;
+    }
+    FPTYPE* lambda_tmp = nullptr; // what the operator reads: a device copy (GPU) or lambda_array itself (CPU)
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3);
+        syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik + 1)); // first ik + 1 k-points
+    }
+    else
+#endif
+    {
+        lambda_tmp = lambda_array.data();
+        d_wg = const_cast<FPTYPE*>(h_wg); // repoints the member d_wg at the caller's weights
+    }
+    cal_stress_nl_op()(this->ctx,
+                       nkb,
+                       npm,
+                       this->ntype,
+                       this->nbands,
+                       ik,
+                       atom_nh,
+                       atom_na,
+                       d_wg,
+                       lambda_tmp,
+                       becp,
+                       dbecp,
+                       stress);
+#if defined(__CUDA) || defined(__ROCM)
+    if (this->device == base_device::GpuDevice)
+    {
+        delmem_var_op()(this->ctx, lambda_tmp); // release the staged device copy of lambda
+    }
+#endif
+}
+
+// Explicit instantiations: only FPTYPE = double is provided; the GPU variant is built with CUDA or ROCm only.
+template class Onsite_Proj_tools<double, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class Onsite_Proj_tools<double, base_device::DEVICE_GPU>;
+#endif
+
+} // namespace hamilt
diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h
new file mode 100644
index 0000000000..17c7e06491
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h
@@ -0,0 +1,216 @@
+#ifndef MODULEHAMILTPW_ONSITEPROJTOOLS_H
+#define MODULEHAMILTPW_ONSITEPROJTOOLS_H
+
+#include "module_base/module_device/device.h"
+#include "module_basis/module_pw/pw_basis_k.h"
+#include "module_cell/klist.h"
+#include "module_cell/unitcell.h"
+#include "module_hamilt_pw/hamilt_pwdft/VNL_in_pw.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_psi/psi.h"
+
+#include <complex>
+
+namespace hamilt
+{
+
+/**
+ * @brief Tools for on-site projectors in the plane wave basis set,
+ * used for calculating the DFT+U and DeltaSpin contributions to force and stress.
+ * the main functions are:
+ * 1. cal_becp: calculate the becp = <psi|beta> for all beta functions
+ * 2. cal_dbecp_s: calculate the dbecp_{ij} = <psi|\partial beta/\partial varepsilon_{ij}> for all beta functions
+ * 3. cal_dbecp_f: calculate the dbecp_i = <psi|\partial beta/\partial \tau^I_i> for all beta functions
+ * 4. cal_force_dftu / cal_force_dspin: evaluate the force from becp and dbecp
+ * 5. cal_stress_dftu / cal_stress_dspin: evaluate the stress from becp and dbecp
+ * The class stores raw pointers to its work buffers (see the private members below).
+ */
+template <typename FPTYPE, typename Device>
+class Onsite_Proj_tools
+{
+  public:
+    Onsite_Proj_tools(const pseudopot_cell_vnl* nlpp_in,
+                      const UnitCell* ucell_in,
+                      const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
+                      const K_Vectors* kv_in,
+                      const ModulePW::PW_Basis_K* wfc_basis_in,
+                      const Structure_Factor* sf_in,
+                      const ModuleBase::matrix& wg,
+                      const ModuleBase::matrix& ekb);
+
+    // a more general constructor is in the following
+    Onsite_Proj_tools(const std::vector<int>& nproj,     // number of projectors for each atom type
+                      const std::vector<int>& lproj,
+                      const ModuleBase::realArray& tab,  // radials' spherical bessel transform
+                      const ModuleBase::matrix& nhtol,
+                      std::complex<FPTYPE>* vkb_buf,
+                      const UnitCell* ucell_in,
+                      const psi::Psi<std::complex<FPTYPE>, Device>* psi_in,
+                      const K_Vectors* kv_in,
+                      const ModulePW::PW_Basis_K* wfc_basis_in,
+                      const Structure_Factor* sf_in,
+                      const ModuleBase::matrix& wg,
+                      const ModuleBase::matrix& ekb);
+
+    ~Onsite_Proj_tools();
+
+    /**
+     * @brief calculate the becp = <psi|beta> for all beta functions
+     */
+    void cal_becp(int ik, int npm, std::complex<FPTYPE>* becp_in = nullptr, const std::complex<FPTYPE>* ppsi_in = nullptr);
+    /**
+     * @brief calculate the dbecp_{ij} = <psi|\partial beta/\partial varepsilon_{ij}> for all beta functions
+     *       stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated
+     */
+    void cal_dbecp_s(int ik, int npm, int ipol, int jpol);
+    /**
+     * @brief calculate the dbecp_i = <psi|\partial beta/\partial \tau^I_i> for all beta functions
+     */
+    void cal_dbecp_f(int ik, int npm, int ipol);
+
+    /// force/stress contributions for DFT+U (orbital_corr, vu) and DeltaSpin (lambda); h_wg: band weights on host
+    void cal_force_dftu(int ik, int npm, FPTYPE* force, const int* orbital_corr, const std::complex<FPTYPE>* vu, const int size_vu, const FPTYPE* h_wg);
+    void cal_force_dspin(int ik, int npm, FPTYPE* force, const ModuleBase::Vector3<double>* lambda, const FPTYPE* h_wg);
+    void cal_stress_dftu(int ik, int npm, FPTYPE* stress, const int* orbital_corr, const std::complex<FPTYPE>* vu, const int size_vu, const FPTYPE* h_wg);
+    void cal_stress_dspin(int ik, int npm, FPTYPE* stress, const ModuleBase::Vector3<double>* lambda, const FPTYPE* h_wg);
+
+    std::complex<FPTYPE>* get_becp() { return becp; }
+    std::complex<FPTYPE>* get_dbecp() { return dbecp; }
+
+  private:
+    /**
+     * @brief allocate the memory for the variables
+     */
+    void allocate_memory(const ModuleBase::matrix& wg,
+                         const ModuleBase::matrix& ekb,
+                         const std::vector<int>& nproj,
+                         const std::vector<int>& nch);
+    /**
+     * @brief delete the memory for the variables
+     */
+    void delete_memory();
+
+  private:
+    /// pointers to access the data without memory arrangement (null until a constructor sets them)
+    const Structure_Factor* sf_ = nullptr;
+    const pseudopot_cell_vnl* nlpp_ = nullptr; // the general constructor has no nlpp argument
+    const UnitCell* ucell_ = nullptr;
+    const psi::Psi<std::complex<FPTYPE>, Device>* psi_ = nullptr;
+    const K_Vectors* kv_ = nullptr;
+    const ModulePW::PW_Basis_K* wfc_basis_ = nullptr;
+
+    /// the following variables are used for the calculation
+    Device* ctx = {};
+    base_device::DEVICE_CPU* cpu_ctx = {};
+    base_device::AbacusDevice_t device = {};
+    int nkb = 0;
+    int nbands = 0;
+    int deeq_dims[4] = {0, 0, 0, 0};    // deeq can be something other than that in pseudopotentials
+    int deeq_nc_dims[4] = {0, 0, 0, 0};
+
+    int current_ik = -1;
+
+    int max_nh = 0;
+    int max_npw = 0;
+    int ntype = 0;
+    bool nondiagonal = false;
+    int pre_ik_s = -1;
+    int pre_ik_f = -1;
+
+    int* atom_nh = nullptr;
+    int* atom_na = nullptr;
+    std::vector<int> h_atom_nh;
+    std::vector<int> h_atom_na;
+    std::vector<int> nproj;
+
+    /// ------------------------- Key optimization -------------------------
+    /// @brief the following variables are used for transfer gcar and reuse the values of vkb for force calculation
+    int* gcar_zero_indexes = nullptr;
+    int gcar_zero_counts[3] = {0, 0, 0};
+    std::complex<FPTYPE>* vkb_save = nullptr;
+    /// @brief count zero gcar indexes and prepare zero_indexes, do gcar_y /= gcar_x, gcar_z /= gcar_y
+    void transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in);
+    /// @brief save the 0-value dvkbs for calculating the dbecp_i in the force calculation
+    void save_vkb(int npw, int ipol);
+    /// @brief revert the 0-value dvkbs for calculating the dbecp_i in the force calculation
+    void revert_vkb(int npw, int ipol);
+    /// ---------------------------------------------------------------------
+
+    /// pointers to access the data without memory arrangement
+    const ModuleBase::realArray* tabtpr = nullptr;
+    const ModuleBase::matrix* nhtol = nullptr;
+    int lprojmax = -1;
+
+    FPTYPE* d_wg = nullptr;
+    FPTYPE* d_ekb = nullptr;
+    FPTYPE* gcar = nullptr;
+
+    FPTYPE* deeq = nullptr;
+    std::complex<FPTYPE>* deeq_nc = nullptr;
+
+    FPTYPE* kvec_c = nullptr;
+    FPTYPE* qq_nt = nullptr;
+    /// --------------------- Key variable ---------------------
+    /// borrow the memory from the vkb in VNL_in_pw to calculate vkb and dvkb
+    std::complex<FPTYPE>* ppcell_vkb = nullptr;
+    /// ---------------------------------------------------------
+    /// the following variables are used for the calculation
+    /// allocate memory on CPU device only
+    std::vector<FPTYPE> g_plus_k;
+    /// allocate memory on CPU/GPU device
+    FPTYPE* hd_ylm = nullptr;              // (lmax + 1) * (lmax + 1) * npw
+    FPTYPE* hd_ylm_deri = nullptr;         // 3 * (lmax + 1) * (lmax + 1) * npw
+    FPTYPE* hd_vq = nullptr;               // this->ucell->atoms[it].ncpp.nbeta * npw
+    FPTYPE* hd_vq_deri = nullptr;          // this->ucell->atoms[it].ncpp.nbeta * npw
+    std::complex<FPTYPE>* hd_sk = nullptr; // this->ucell->nat * npw
+    /// allocate global memory on GPU device only
+    FPTYPE* d_g_plus_k = nullptr;              // npw * 5
+    FPTYPE* d_pref = nullptr;                  // this->ucell->atoms[it].ncpp.nh
+    FPTYPE* d_gk = nullptr;                    // this->ucell->atoms[it].ncpp.nh * npw
+    FPTYPE* d_vq_tab = nullptr;                // this->ucell->atoms[it].ncpp.nbeta * npw
+    std::vector<int> dvkb_indexes;             // this->ucell->atoms[it].ncpp.nh * 4
+    int* d_dvkb_indexes = nullptr;             // this->ucell->atoms[it].ncpp.nh * 4
+    std::complex<FPTYPE>* d_pref_in = nullptr; // this->ucell->atoms[it].ncpp.nh
+
+    /// becp and dbecp:
+    std::complex<FPTYPE>* dbecp = nullptr; // nbands * nkb (for stress) or nbands * nkb * 3 (for force)
+    std::complex<FPTYPE>* becp = nullptr;  // nbands * nkb
+
+    /// @brief rename the operators for CPU/GPU device
+    using gemm_op = hsolver::gemm_op<std::complex<FPTYPE>, Device>;
+    using cal_stress_nl_op = hamilt::cal_stress_nl_op<FPTYPE, Device>;
+    using cal_dbecp_noevc_nl_op = hamilt::cal_dbecp_noevc_nl_op<FPTYPE, Device>;
+
+    using resmem_complex_op = base_device::memory::resize_memory_op<std::complex<FPTYPE>, Device>;
+    using resmem_complex_h_op = base_device::memory::resize_memory_op<std::complex<FPTYPE>, base_device::DEVICE_CPU>;
+    using setmem_complex_op = base_device::memory::set_memory_op<std::complex<FPTYPE>, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<std::complex<FPTYPE>, Device>;
+    using delmem_complex_h_op = base_device::memory::delete_memory_op<std::complex<FPTYPE>, base_device::DEVICE_CPU>;
+    using syncmem_complex_h2d_op
+        = base_device::memory::synchronize_memory_op<std::complex<FPTYPE>, Device, base_device::DEVICE_CPU>;
+    using syncmem_complex_d2h_op
+        = base_device::memory::synchronize_memory_op<std::complex<FPTYPE>, base_device::DEVICE_CPU, Device>;
+
+    using resmem_var_op = base_device::memory::resize_memory_op<FPTYPE, Device>;
+    using resmem_var_h_op = base_device::memory::resize_memory_op<FPTYPE, base_device::DEVICE_CPU>;
+    using setmem_var_op = base_device::memory::set_memory_op<FPTYPE, Device>;
+    using delmem_var_op = base_device::memory::delete_memory_op<FPTYPE, Device>;
+    using delmem_var_h_op = base_device::memory::delete_memory_op<FPTYPE, base_device::DEVICE_CPU>;
+    using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<FPTYPE, Device, base_device::DEVICE_CPU>;
+    using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<FPTYPE, base_device::DEVICE_CPU, Device>;
+
+    using resmem_int_op = base_device::memory::resize_memory_op<int, Device>;
+    using delmem_int_op = base_device::memory::delete_memory_op<int, Device>;
+    using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op<int, Device, base_device::DEVICE_CPU>;
+
+    using cal_vq_op = hamilt::cal_vq_op<FPTYPE, Device>;
+    using cal_vq_deri_op = hamilt::cal_vq_deri_op<FPTYPE, Device>;
+
+    using cal_vkb_op = hamilt::cal_vkb_op<FPTYPE, Device>;
+    using cal_vkb_deri_op = hamilt::cal_vkb_deri_op<FPTYPE, Device>;
+};
+
+} // namespace hamilt
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp
new file mode 100644
index 0000000000..2bb69dc131
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp
@@ -0,0 +1,643 @@
+#include <cassert>
+#include <numeric>
+#include <fstream>
+#include <algorithm>
+#include <map>
+#include <tuple>
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+
+#include "module_base/projgen.h"
+#include "module_base/blas_connector.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+#ifdef __MPI
+#include "module_base/parallel_reduce.h"
+#include "module_base/parallel_common.h"
+#endif
+#include "module_parameter/parameter.h"
+#include "module_base/timer.h"
+#include "module_base/formatter.h"
+
+
+
+/**
+ * ===============================================================================================
+ * 
+ *                                          README
+ * 
+ * ===============================================================================================
+ * 
+ * This is a code demo for illustrating how to use unified radial projection in implementation of
+ * Operators involving local radial projectors on PW-expanded wavefunctions.
+ * 
+ * Example usage:
+ * ```c++
+ * // select the range of atoms that impose the operator in std::vector<std::vector<int>> it2ia like
+ * // it2ia[it] = {ia1, ia2, ...} for each type
+ * // if all atoms in present kind is "selected", just set it2ia[it].resize(na) and call 
+ * // std::iota(it2ia[it].begin(), it2ia[it].end(), 0)
+ * 
+ * std::vector<std::vector<int>> it2ia; // as if we have given its value...
+ * 
+ * // you should have the `orbital_dir` as the directory containing the orbital files, then those
+ * // will be read by a static function `AtomicRadials::read_abacus_orb` to get the radial orbitals
+ * 
+ * // call `init_proj` to initialize the radial projector, this function only needs to be called
+ * // once during the runtime.
+ * // its input... 
+ * // the `nproj` specifies the number of projectors of each atom type; it can be zero,
+ * // but cannot be larger than the number of zeta functions for the given angular momentum.
+ * // the `lproj` is the angular momentum of the projectors, and `iproj` is the index of zeta function
+ * // that each projector generated from.
+ * // the `lproj` along with `iproj` can enable radial projectors in any number developer wants.
+ * 
+ * // the `onsite_r` is the onsite radius of each valid projector; it is used to generate a new
+ * // radial function that is more localized than the original one, which is expected to give a
+ * // more efficient projection.
+ * 
+ * std::vector<double> rgrid;
+ * std::vector<std::vector<double>> projs;
+ * std::vector<std::vector<int>> it2iproj;
+ * init_proj(orbital_dir, ucell, nproj, lproj, iproj, onsite_r, rgrid, projs, it2iproj);
+ * 
+ * // then call the function `cal_becp` to calculate the becp. HOWEVER, there are quantities that
+ * // can be calculated in advance and reused in the following calculations. Please see the function
+ * // implementation, especially the comments about CACHE 0, CACHE 1, CACHE 2..., etc.
+ * 
+ * // the input param of `cal_becp`...
+ * // the `it2ia` has been explained above
+ * // the `it2iproj` is the output of function `init_proj`, so you do not need to worry about it
+ * // the `rgrid` and `projs` are also the output of function `init_proj`
+ * // the `lproj` is the angular momentum of each projector; it is the same `lproj` that was
+ * // passed to `init_proj`
+ * // the `nq` is the number of G+k vectors, typically it is always GlobalV::NQX
+ * // the `dq` is the step size of G+k vectors, typically it is always GlobalV::DQ
+ * // the `ik` is the k-point index
+ * // the `pw_basis` is the plane wave basis, need ik
+ * // the `omega` is the cell volume
+ * // the `tpiba` is 2*pi/lat0
+ * // the `sf` is the structure factor calculator
+ * // the `psi` is the wavefunction
+ * // the `becp` is the output of the function, it is the becp
+ * cal_becp(it2ia, it2iproj, rgrid, projs, lproj, nq, dq, ik, pw_basis, omega, tpiba, sf, psi, becp);
+ * 
+ * // About parallelization, presently, the function `AtomicRadials::read_abacus_orb` is actually parallelized
+ * // by MPI, so after the reading of orbital, actually all processors have the same data. Therefore it is not
+ * // needed to call functions like `Parallel_Reduce` or `Parallel_Bcast` to synchronize the data.
+ * // However, the table `tab_atomic_` is strikingly memory-consuming; it will need optimizing if memory runs short.
+ * ```
+ */
+
+template<typename T, typename Device>
+projectors::OnsiteProjector<T, Device>* projectors::OnsiteProjector<T, Device>::get_instance()
+{   // one function-local static object per (T, Device) instantiation; every call returns its address
+    static OnsiteProjector<T, Device> singleton;
+    return &singleton;
+}
+
+template<typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::init(const std::string& orbital_dir,
+                                                  const UnitCell* ucell_in,
+                                                  const psi::Psi<std::complex<T>, Device>& psi,
+                                                  const K_Vectors& kv,
+                                                  const ModulePW::PW_Basis_K& pw_basis,             // level1: the plane wave basis, need ik
+                                                  Structure_Factor& sf,                              // level2: the structure factor calculator
+                                                  const double onsite_radius,
+                                                  const int nq,
+                                                  const double dq,
+                                                  const ModuleBase::matrix& wg,
+                                                  const ModuleBase::matrix& ekb)
+{
+    // One-time setup: only the first call does any work, later calls merely refresh this->device.
+    // It reads the orbital file of every atom type from `orbital_dir`, builds one projector per angular
+    // momentum l = 0 .. nwl-1 of each type (zeta index 0, radius `onsite_radius`), hands them to
+    // rp_._build_sbt_tab together with nq and dq, and creates the Onsite_Proj_tools helper (fs_tools).
+    this->device = base_device::get_device_type<Device>(this->ctx);
+    if(!this->initialed)
+    {
+        this->ucell = ucell_in;
+        this->ntype = ucell_in->ntype;
+
+        this->pw_basis_ = &pw_basis;
+        this->sf_ = &sf;
+
+        std::vector<std::string> orb_files(ntype);
+        std::vector<int> nproj(ntype);
+        int sum_nproj = 0;
+        for(int it=0;it<ntype;++it)
+        {
+            orb_files[it] = ucell->orbital_fn[it];
+            nproj[it] = ucell->atoms[it].nwl;
+            sum_nproj += nproj[it];
+        }
+        // projectors are numbered type by type; within a type the il-th one has angular momentum il
+        this->lproj.resize(sum_nproj);
+        int index = 0;
+        for(int it=0;it<ntype;++it)
+        {
+            for(int il=0;il<nproj[it];++il)
+            {
+                this->lproj[index++] = il;
+            }
+        }
+        std::vector<int> iproj(sum_nproj, 0);                   // every projector is built from zeta 0
+        std::vector<double> onsite_r(sum_nproj, onsite_radius); // same radius for all projectors
+
+        // select all atoms of every type; iat_nh[iat] = nproj^2, which equals the sum of 2l+1 over l = 0 .. nproj-1
+        this->it2ia.resize(this->ntype);
+        this->iat_nh.resize(this->ucell->nat);
+        int iat = 0;
+        for(int it = 0; it < it2ia.size(); it++)
+        {
+            it2ia[it].resize(this->ucell->atoms[it].na);
+            std::iota(it2ia[it].begin(), it2ia[it].end(), 0);
+            for(int ia = 0; ia < it2ia[it].size(); ia++)
+            {
+                iat_nh[iat++] = nproj[it] * nproj[it];
+            }
+        }
+
+        // honour the directory passed in by the caller rather than reading the global input again
+        this->init_proj(orbital_dir, orb_files, nproj, lproj, iproj, onsite_r);
+
+        ModuleBase::timer::tick("OnsiteProj", "cubspl_tabulate");
+        // STAGE 0 - making the interpolation table
+        // CACHE 0 - if cache the irow2it, irow2iproj, irow2m, itiaiprojm2irow, <G+k|p> can be reused for
+        //           SCF, RELAX and CELL-RELAX calculation
+        // [in] rgrid, projs, lproj, it2ia, it2iproj, nq, dq
+        RadialProjection::RadialProjector::_build_backward_map(it2iproj, lproj, irow2it_, irow2iproj_, irow2m_);
+        RadialProjection::RadialProjector::_build_forward_map(it2ia, it2iproj, lproj, itiaiprojm2irow_);
+        rp_._build_sbt_tab(nproj, rgrid, projs, lproj, nq, dq, ucell_in->omega, psi.npol, tab, nhtol);
+        // For being compatible with present cal_force and cal_stress framework:
+        // allocate the projector buffer once (tot_nproj * npwk_max entries) and hand it to Onsite_Proj_tools
+        if(this->tab_atomic_ == nullptr)
+        {
+            this->tot_nproj = itiaiprojm2irow_.size();
+            this->npwx_ = this->pw_basis_->npwk_max;
+            this->size_vproj = this->tot_nproj * this->npwx_;
+            resmem_complex_op()(this->ctx, this->tab_atomic_, this->size_vproj, "OnsiteP::tab_atomic_");
+        }
+
+        delete this->fs_tools; // it is okay to delete nullptr
+        this->fs_tools = new hamilt::Onsite_Proj_tools<T, Device>(
+            nproj, lproj, tab, nhtol, this->tab_atomic_, ucell_in, &psi, &kv, &pw_basis, &sf, wg, ekb);
+
+        ModuleBase::timer::tick("OnsiteProj", "cubspl_tabulate");
+        this->initialed = true;
+    }
+}
+
+template<typename T, typename Device>
+projectors::OnsiteProjector<T, Device>::~OnsiteProjector()
+{
+    // Frees the Onsite_Proj_tools helper, the projector table tab_atomic_ and the becp buffers.
+    delete this->fs_tools;
+    delmem_complex_op()(this->ctx, this->tab_atomic_);
+    // h_becp is released only when this->device reports a GPU
+    if(base_device::GpuDevice == this->device)
+    {
+        delmem_complex_h_op()(this->cpu_ctx, this->h_becp);
+    }
+    delmem_complex_op()(this->ctx, this->becp);
+}
+
+
+template<typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::init_proj(const std::string& orbital_dir,
+                     const std::vector<std::string>& orb_files,
+                     const std::vector<int>& nproj,           // for each type, the number of projectors
+                     const std::vector<int>& lproj,           // angular momentum of projectors within the type (l of zeta function)
+                     const std::vector<int>& iproj,           // index of projectors within the type (izeta)
+                     const std::vector<double>& onsite_r)
+{
+    // Reads the orbital file of every type, passes the selected radial of each projector through
+    // smoothgen and fills this->projs, this->it2iproj and this->rgrid.
+    const int ntype = nproj.size();
+    assert(ntype == orb_files.size());
+    this->it2iproj.resize(ntype);
+
+    const int nproj_tot = std::accumulate(nproj.begin(), nproj.end(), 0);
+    assert(nproj_tot == lproj.size());
+    assert(nproj_tot == iproj.size());
+    assert(nproj_tot == onsite_r.size());
+    this->projs.resize(nproj_tot);
+
+    int idx = 0;      // running projector index over all types
+    int nr = -1;      // largest number of grid points met so far; -1 until a file has been read
+    double dr = -1.0; // grid spacing; -1 until a file has been read
+    for(int it = 0; it < ntype; ++it)
+    {
+        const int nproj_it = nproj[it];
+        this->it2iproj[it].resize(nproj_it);
+        if(nproj_it == 0)
+        {
+            std::cout << "BECP_PW >> No projectors defined for type " << it << std::endl;
+            continue;
+        }
+        std::ifstream ifs(orbital_dir + orb_files[it]);
+        std::string elem = "";
+        double ecut = -1.0;
+        int nr_ = -1;
+        double dr_ = -1.0;
+        std::vector<int> nzeta; // number of radials for each l
+        std::vector<std::vector<double>> radials; // radials arranged in serial
+        this->read_abacus_orb(ifs, elem, ecut, nr_, dr_, nzeta, radials);
+#ifdef __DEBUG
+        assert(elem != "");
+        assert(ecut != -1.0);
+        assert(nr_ != -1);
+        assert(dr_ != -1.0);
+#endif
+        nr = std::max(nr, nr_); // the maximal nr
+        assert(dr == -1.0 || dr == dr_); // the dr should be the same for all types
+        dr = (dr == -1.0) ? dr_ : dr;
+        // the grid depends on nr and dr only, so build it once per type instead of once per projector
+        rgrid.resize(nr);
+        std::iota(rgrid.begin(), rgrid.end(), 0);
+        std::for_each(rgrid.begin(), rgrid.end(), [dr](double& r_i) { r_i *= dr; });
+        for(int ip = 0; ip < nproj_it; ++ip)
+        {
+            const int l = lproj[idx];
+            // radials are stored l by l: skip the zetas of all lower l, then pick zeta iproj[idx]
+            const int irad = std::accumulate(nzeta.begin(), nzeta.begin() + l, 0) + iproj[idx];
+            std::vector<double> temp = radials[irad];
+            // nr is the maximum over the types read so far and may exceed the length of this radial,
+            // so never announce more samples to smoothgen than temp actually holds
+            const int nr_rad = std::min(nr, static_cast<int>(temp.size()));
+            smoothgen(nr_rad, rgrid.data(), temp.data(), onsite_r[idx], projs[idx]);
+            it2iproj[it][ip] = idx;
+            ++idx;
+        }
+    }
+    if(nr == -1)
+    { // nothing was read: resize(-1) would request a huge allocation, so leave an empty grid instead
+        this->rgrid.clear();
+        return;
+    }
+    // zero-pad every projector to the common length; rgrid was already rebuilt with the final nr and dr
+    std::for_each(projs.begin(), projs.end(), [nr](std::vector<double>& proj) { proj.resize(nr, 0.0); });
+}
+
+template<typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::tabulate_atomic(const int ik, const char grad)
+{
+    ModuleBase::timer::tick("OnsiteProj", "tabulate_atomic");
+    // Make `ik` the current k-point: cache its index and plane-wave counts for later calls.
+    // `grad` is not read by the active code; only the disabled reference code below mentions it
+    // ('n' = no gradient, 'x'/'y'/'z' = gradient along that axis).
+    const auto* basis = this->pw_basis_;
+    this->npwx_ = basis->npwk_max;
+    this->npw_ = basis->npwk[ik];
+    this->ik_ = ik;
+
+    // [disabled reference implementation] STAGE 1 - calculate the <G+k|p> for the given G+k vector
+    // std::vector<ModuleBase::Vector3<double>> q(this->npw_);
+    // for(int ig = 0; ig < this->npw_; ++ig)
+    // {
+    //     q[ig] = pw_basis_->getgpluskcar(ik, ig); // get the G+k vector, G+k will change during CELL-RELAX
+    // }
+    // const int nrow = irow2it_.size();
+    // std::vector<std::complex<double>> tab_(nrow*this->npw_);
+    // // convention used here: 'l': <p|G+k>, 'r': <G+k|p>
+    // // denote q=G+k, <r|q> = exp(iqr), the routine Fourier Transform written as F(q) = <q|f>
+    // rp_.sbtft(q, tab_, 'l', this->ucell->omega, this->ucell->tpiba);
+    // // what is calculated is <p|q> here
+
+    // STAGE 2 - make_atomic: multiply e^iqtau and extend the <G+k|p> to <G+k|pi> for each atom
+    // CACHE 2 - if cache the tab_atomic_, <G+k|p> can be reused for SCF calculation
+    // [in] it2ia, itiaiprojm2irow, tab_, npw, sf
+    // for(int irow = 0; irow < nrow; ++irow)
+    // {
+    //     const int it = irow2it_[irow];
+    //     const int iproj = irow2iproj_[irow];
+    //     const int m = irow2m_[irow];
+    //     for(int ia = 0; ia < na[it]; ++ia)
+    //     {
+    //         // why Structure_Factor needs the FULL pw_basis???
+    //         std::complex<double>* sk = this->sf_->get_sk(ik, it, ia, pw_basis_); // exp(-iqtau)
+    //         // Note: idea on extending the param list of get_sk
+    //         // the get_sk should have an extra param 'grad' to calculate the gradient of S(q), which
+    //         // is actually very simple to be
+    //         // d(S(q))/dq = -i S(q) * tau, for one direction it is just -i S(q) * tau_x (if x is the direction)
+    //         const int irow_out = itiaiprojm2irow_.at(std::make_tuple(it, ia, iproj, m));
+    //         for(int ig = 0; ig < this->npw_; ++ig)
+    //         {
+    //             std::complex<double> deriv = (grad == 'n')? 1.0: ModuleBase::NEG_IMAG_UNIT; // because sk is exp(-iqtau)
+    //             deriv = (grad == 'n')? 1.0: (grad == 'x')? deriv * q[ig].x: (grad == 'y')? deriv * q[ig].y: deriv * q[ig].z;
+    //             // there must be something twisted in ABACUS
+    //             // because the tab_ is <p|G+k>, but the sk is exp(-iqtau). How can it get the 
+    //             // correct result?
+    //             this->tab_atomic_[irow_out*this->npw_ + ig] = sk[ig] * tab_[irow*this->npw_ + ig] * deriv;
+    //         }
+    //         delete[] sk;
+    //     }
+    // }
+    // q.clear();
+    // q.shrink_to_fit();    // release memory
+    // tab_.clear();
+    // tab_.shrink_to_fit(); // release memory
+    ModuleBase::timer::tick("OnsiteProj", "tabulate_atomic");
+}
+
+template<typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::overlap_proj_psi( 
+                    const int npm,
+                    const std::complex<double>* ppsi
+                    )
+{
+    ModuleBase::timer::tick("OnsiteProj", "overlap");
+    // Compute becp from `npm` rows of `ppsi` at the k-point last selected by tabulate_atomic() (ik_).
+    // becp changes in every SCF iteration, so only the buffer is reused, never its content.
+    // [disabled] earlier gemm-based implementation, kept for reference:
+//     const char transa = 'C';
+//     const char transb = 'N';
+//     const int ldb = this->npwx_;
+//     const int ldc = this->tot_nproj;
+//     const std::complex<double> alpha = 1.0;
+//     const std::complex<double> beta = 0.0;
+//     if(this->becp == nullptr || this->size_becp < npm*ldc)
+//     {
+//         delete[] this->becp;
+//         this->becp = new std::complex<double>[npm*ldc];
+//         this->size_becp = npm*ldc;
+//     }
+//     setmem_complex_op()(ctx, this->becp, 0.0, this->size_becp);
+//     gemm_op()(
+//         this->ctx,
+//         transa,                 // const char transa
+//         transb,                 // const char transb
+//         ldc,                    // const int m
+//         npm,                    // const int n
+//         this->npw_,             // const int k
+//         &alpha,                 // const std::complex<double> alpha
+//         this->tab_atomic_,      // const std::complex<double>* a
+//         this->npw_,             // const int lda
+//         ppsi,                   // const std::complex<double>* b
+//         ldb,                    // const int ldb
+//         &beta,                  // const std::complex<double> beta
+//         becp,                   // std::complex<double>* c
+//         ldc);                   // const int ldc
+// #ifdef __MPI
+//     Parallel_Reduce::reduce_pool(becp, size_becp);
+// #endif
+
+    // notes on refactor for DCU calculation
+    // the npm here is nbands(occ) * npol, for calling cal_becp, the npol should be divided.
+    // h_becp: separate host buffer when Device is GPU (filled by the d2h copy below),
+    // otherwise simply an alias of becp.
+    const int npol = this->ucell->get_npol();
+    const int required = npm * this->tot_nproj;
+    const bool on_gpu = (this->device == base_device::GpuDevice);
+    if(this->becp == nullptr || this->size_becp < required)
+    {
+        this->size_becp = required;
+        resmem_complex_op()(this->ctx, this->becp, this->size_becp);
+        if(on_gpu)
+        {
+            resmem_complex_h_op()(this->cpu_ctx, this->h_becp, this->size_becp);
+        }
+        else
+        {
+            this->h_becp = this->becp;
+        }
+    }
+    this->fs_tools->cal_becp(ik_, npm/npol, this->becp, ppsi); // cal_becp takes the count without the npol factor
+    if(on_gpu)
+    {
+        syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, this->becp, this->size_becp);
+    }
+    ModuleBase::timer::tick("OnsiteProj", "overlap");
+}
+
+template<typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::read_abacus_orb(std::ifstream& ifs,
+                           std::string& elem,
+                           double& ecut,
+                           int& nr,
+                           double& dr,
+                           std::vector<int>& nzeta,
+                           std::vector<std::vector<double>>& radials,
+                           const int rank)
+{
+    // Parse an ABACUS numerical-orbital file. The caller-supplied rank decides who reads: rank 0 parses
+    // ifs, and in MPI builds every value is broadcast so that all ranks end up with the same data.
+    // [out] elem, ecut, nr (mesh size), dr (mesh spacing), nzeta[l], radials[ichi][ir] in (l, izeta) order
+    nr = 0; // number of grid points
+    dr = 0; // grid spacing
+    int lmax = 0, nchi = 0; // number of radial functions
+    std::vector<std::vector<int>> radial_map_; // build a map from [l][izeta] to 1-d array index
+    std::string tmp;
+    if (rank == 0)
+    {
+        if (!ifs.is_open())
+        {
+            ModuleBase::WARNING_QUIT("OnsiteProjector::read_abacus_orb", "Couldn't open orbital file.");
+        }
+        nzeta.clear(); // so that a header without "Lmax" is caught by the check below
+        while (ifs >> tmp)
+        {
+            if (tmp == "Element")
+            {
+                ifs >> elem;
+            }
+            else if (tmp == "Cutoff(Ry)")
+            {
+                ifs >> ecut;
+            }
+            else if (tmp == "Lmax")
+            {
+                ifs >> lmax;
+                nzeta.resize(lmax + 1);
+                for (int l = 0; l <= lmax; ++l)
+                {
+                    ifs >> tmp >> tmp >> tmp >> nzeta[l];
+                }
+            }
+            else if (tmp == "Mesh")
+            {
+                ifs >> nr;
+            }
+            else if (tmp == "dr")
+            {
+                ifs >> dr;
+                break; // "dr" closes the header; the radial functions follow
+            }
+        }
+        if (static_cast<int>(nzeta.size()) != lmax + 1 || nr <= 0 || dr <= 0.0) // Lmax, Mesh or dr missing
+        {
+            ModuleBase::WARNING_QUIT("OnsiteProjector::read_abacus_orb", "Incomplete header in orbital file.");
+        }
+        radial_map_.resize(lmax + 1);
+        for (int l = 0; l <= lmax; ++l)
+        {
+            radial_map_[l].resize(nzeta[l]);
+            for (int iz = 0; iz < nzeta[l]; ++iz)
+            {
+                radial_map_[l][iz] = nchi++; // consecutive numbering, l-major
+            }
+        }
+        radials.resize(nchi);
+        std::for_each(radials.begin(), radials.end(), [nr](std::vector<double>& v) { v.resize(nr); });
+    }
+    // broadcast the header information
+#ifdef __MPI
+    Parallel_Common::bcast_string(elem);
+    Parallel_Common::bcast_double(ecut);
+    Parallel_Common::bcast_int(lmax);
+    Parallel_Common::bcast_int(nchi);
+    Parallel_Common::bcast_int(nr);
+    Parallel_Common::bcast_double(dr);
+#endif
+    if (rank != 0) // receivers size their containers from the broadcast header
+    {
+        nzeta.resize(lmax + 1);
+        radials.resize(nchi);
+        std::for_each(radials.begin(), radials.end(), [nr](std::vector<double>& v) { v.resize(nr); });
+    }
+    // broadcast the number of zeta functions for each angular momentum
+#ifdef __MPI
+    Parallel_Common::bcast_int(nzeta.data(), lmax + 1);
+#endif
+    // read the radial functions by rank0, broadcasting each one right after it is read
+    int ichi = 0;
+    for (int i = 0; i != nchi; ++i)
+    {
+        if (rank == 0)
+        {
+            int l = -1, izeta = -1; // stay out of range unless the label is really read
+            ifs >> tmp >> tmp >> tmp;
+            ifs >> tmp >> l >> izeta;
+            if (!ifs || l < 0 || l > lmax || izeta < 0 || izeta >= nzeta[l])
+            {
+                ModuleBase::WARNING_QUIT("OnsiteProjector::read_abacus_orb", "Invalid radial function label in orbital file.");
+            }
+            ichi = radial_map_[l][izeta];
+            for (int ir = 0; ir != nr; ++ir)
+            {
+                ifs >> radials[ichi][ir];
+            }
+        }
+#ifdef __MPI
+        Parallel_Common::bcast_int(ichi); // let other ranks know where to store the radial function
+        Parallel_Common::bcast_double(radials[ichi].data(), nr);
+#endif
+    }
+} // end of read_abacus_orb
+
+/// Accumulate, over all k-points and bands, the weighted products of projector overlaps (becp),
+/// reduce them over all pools, and write per-atom orbital charge and magnetic moments to ofs_running.
+template<typename T, typename Device>
+void projectors::OnsiteProjector<T, Device>::cal_occupations(const psi::Psi<std::complex<T>, Device>* psi_in, const ModuleBase::matrix& wg_in)
+{
+    ModuleBase::timer::tick("OnsiteProj", "cal_occupation");
+    this->tabulate_atomic(0);
+    std::vector<std::complex<double>> occs(this->tot_nproj * 4, 0.0);
+    // occs holds 4 complex entries per projector: conj(b_a)*b_b for (a,b) = (0,0),(0,1),(1,0),(1,1),
+    // where b_0 = becp[index] and b_1 = becp[index + nkb] (presumably the two spinor components).
+    // loop over k-points to calculate Mi of \sum_{k,i,l,m}<Psi_{k,i}|alpha_{l,m}><alpha_{l,m}|Psi_{k,i}>
+    const int nbands = psi_in->get_nbands();
+    for(int ik = 0; ik < psi_in->get_nk(); ik++)
+    {
+        psi_in->fix_k(ik);
+        if(ik != 0)
+        {
+            this->tabulate_atomic(ik);
+        }
+        this->overlap_proj_psi(
+                        nbands * psi_in->npol,
+                        psi_in->get_pointer());
+        const std::complex<double>* becp_p = this->get_h_becp();
+        // becp(nbands*npol , nkb)
+        // mag = wg * \sum_{nh}becp * becp
+        int nkb = this->tot_nproj;
+        // NOTE(review): the stride ib*2*nkb below hard-codes npol == 2 -- confirm callers guarantee it
+        for(int ib = 0;ib<nbands;ib++)
+        {
+            const double weight = wg_in(ik, ib);
+            int begin_ih = 0;
+            for(int iat = 0; iat < this->iat_nh.size(); iat++)
+            {
+                const int nh = this->get_nh(iat);
+                for(int ih = 0; ih < nh; ih++)
+                {
+                    const int occ_index = (begin_ih + ih) * 4;
+                    const int index = ib*2*nkb + begin_ih + ih;
+                    occs[occ_index] += weight * conj(becp_p[index]) * becp_p[index];
+                    occs[occ_index + 1] += weight * conj(becp_p[index]) * becp_p[index + nkb];
+                    occs[occ_index + 2] += weight * conj(becp_p[index + nkb]) * becp_p[index];
+                    occs[occ_index + 3] += weight * conj(becp_p[index + nkb]) * becp_p[index + nkb];
+                }
+                begin_ih += nh;
+            }
+        }
+    }
+    // sum occs over all pools; the complex array is handed over as 2*size doubles
+    Parallel_Reduce::reduce_double_allpool(GlobalV::KPAR, GlobalV::NPROC_IN_POOL, (double*)(&(occs[0])), occs.size()*2);
+    // parameters for orbital charge output
+    FmtCore fmt_of_chg("%15.4f");
+    FmtCore fmt_of_label("%-15s");
+    GlobalV::ofs_running << std::endl;
+    GlobalV::ofs_running << "-------------------------------------------------------------------------------------------" << std::endl;
+    GlobalV::ofs_running << "Orbital Charge Analysis      Charge         Mag(x)         Mag(y)         Mag(z)" << std::endl;
+    GlobalV::ofs_running << "-------------------------------------------------------------------------------------------" << std::endl;
+    // parameters for mag output
+    std::vector<double> mag_x(this->ucell->nat, 0.0);
+    std::vector<double> mag_y(this->ucell->nat, 0.0);
+    std::vector<double> mag_z(this->ucell->nat,0.0);
+    auto atomLabels = this->ucell->get_atomLabels();
+    const std::vector<std::string> title = {"Total Magnetism (uB)", "", "", ""};
+    const std::vector<std::string> fmts = {"%-26s", "%20.10f", "%20.10f", "%20.10f"};
+    const std::vector<std::string> orb_names = {"s", "p", "d", "f", "g"};
+    FmtTable table(title, this->ucell->nat, fmts, {FmtTable::Align::RIGHT, FmtTable::Align::LEFT});
+    // occ_index walks occs in the same atom-by-atom, projector-by-projector order used when filling it
+    int occ_index = 0;
+    for(int iat=0;iat<this->ucell->nat;iat++)
+    {
+        const int it = this->ucell->iat2it[iat];
+        std::string atom_label = atomLabels[it];
+        int ia = this->ucell->iat2ia[iat];
+        GlobalV::ofs_running << FmtCore::format("%-20s", atom_label+std::to_string(ia+1)) << std::endl;
+        std::vector<double> sum(4, 0.0);
+        int current_l = 1;
+        std::vector<double> charge_mag(4, 0.0);
+        for(int ih=0;ih<this->iat_nh[iat];ih++)
+        {
+            // charge_mag = {Charge, Mag(x), Mag(y), Mag(z)} accumulated from the four products of this projector
+            charge_mag[3] += (occs[occ_index] - occs[occ_index + 3]).real();
+            charge_mag[1] += (occs[occ_index + 1] + occs[occ_index + 2]).real();
+            charge_mag[2] += (occs[occ_index + 1] - occs[occ_index + 2]).imag();
+            charge_mag[0] += (occs[occ_index] + occs[occ_index + 3]).real();
+            // ih == current_l^2 - 1 closes a shell: assumes ih runs over s, p, d, ... with 2l+1 entries each
+            // NOTE(review): orb_names has 5 entries; a sixth shell would index past its end -- confirm unreachable
+            if(ih == current_l * current_l - 1)
+            {
+                sum[0] += charge_mag[0];
+                sum[1] += charge_mag[1];
+                sum[2] += charge_mag[2];
+                sum[3] += charge_mag[3];
+                GlobalV::ofs_running << FmtCore::format("%20s", orb_names[current_l-1])
+                    << fmt_of_chg.format(charge_mag[0]) << fmt_of_chg.format(charge_mag[1])
+                    << fmt_of_chg.format(charge_mag[2]) << fmt_of_chg.format(charge_mag[3]) << std::endl;
+                current_l++;
+                charge_mag.assign(4, 0.0);
+            }
+            occ_index += 4;
+        }
+        mag_x[iat] = sum[1];
+        mag_y[iat] = sum[2];
+        mag_z[iat] = sum[3];
+        GlobalV::ofs_running << FmtCore::format("%20s", std::string("Sum")) << ""
+                    << fmt_of_chg.format(sum[0]) << fmt_of_chg.format(sum[1])
+                    << fmt_of_chg.format(sum[2]) << fmt_of_chg.format(sum[3]) << std::endl;
+    }
+    GlobalV::ofs_running << "-------------------------------------------------------------------------------------------" << std::endl;
+    GlobalV::ofs_running << std::endl;
+    // NOTE(review): the table has nat rows, yet atomLabels is indexed by type above -- confirm the lengths match
+    table << atomLabels << mag_x << mag_y << mag_z;
+    GlobalV::ofs_running << table.str() << std::endl;
+    ModuleBase::timer::tick("OnsiteProj", "cal_occupation");
+}
+
+template class projectors::OnsiteProjector<double, base_device::DEVICE_CPU>; // explicit instantiation, CPU
+#if ((defined __CUDA) || (defined __ROCM))
+template class projectors::OnsiteProjector<double, base_device::DEVICE_GPU>; // explicit instantiation, CUDA/ROCm builds only
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.h b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.h
new file mode 100644
index 0000000000..a2bb99354b
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.h
@@ -0,0 +1,159 @@
+#ifndef MODULEHAMILTPW_ONSITEPROJECTOR_H
+#define MODULEHAMILTPW_ONSITEPROJECTOR_H
+#include "module_base/module_device/device.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+#include "module_hamilt_pw/hamilt_pwdft/structure_factor.h"
+#include "module_basis/module_pw/pw_basis_k.h"
+#include "module_hamilt_pw/hamilt_pwdft/radial_proj.h"
+#include "module_psi/psi.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h"
+
+#include <string>
+#include <vector>
+#include <complex>
+namespace projectors
+{
+    template <typename T, typename Device>
+    class OnsiteProjector
+    {
+        public:
+
+        /**
+         * @brief read the orbital files and build the smoothed radial projectors
+         * 
+         * @param orbital_dir prefix prepended verbatim to each entry of orb_files
+         * @param orb_files numerical-orbital file of each atom type
+         * @param nproj # of projectors for each type defined in UnitCell, can be zero
+         * @param lproj angular momentum for each projector
+         * @param iproj index of zeta function that each projector generated from
+         * @param onsite_r onsite-radius for all valid projectors
+         * @note results are stored in members rather than returned: rgrid (the radial grid shared
+         *       by all projectors), projs (one radial function per projector) and it2iproj (for
+         *       each type, the indices of its projectors in projs)
+         */
+        void init_proj(const std::string& orbital_dir,
+                       const std::vector<std::string>& orb_files,
+                       const std::vector<int>& nproj,           // for each type, the number of projectors
+                       const std::vector<int>& lproj,           // angular momentum of projectors within the type (l of zeta function)
+                       const std::vector<int>& iproj,           // index of projectors within the type (izeta)
+                       const std::vector<double>& onsite_r);    // onsite radius of each projector
+
+        /**
+         * @brief make ik the current k-point: cache its index and plane-wave counts (npw, npwx)
+         */
+        void tabulate_atomic(const int ik, const char grad = 'n');
+        /// @brief compute becp from npm rows of ppsi at the k-point chosen by tabulate_atomic
+        void overlap_proj_psi(
+                    const int npm,
+                    const std::complex<double>* ppsi
+                    );
+        void read_abacus_orb(std::ifstream& ifs,
+                            std::string& elem,
+                            double& ecut,
+                            int& nr,
+                            double& dr,
+                            std::vector<int>& nzeta,
+                            std::vector<std::vector<double>>& radials,
+                            const int rank = 0); // only rank 0 parses ifs
+        /// @brief static access to this class instance
+        static OnsiteProjector<T, Device>* get_instance();
+        void init(const std::string& orbital_dir,
+                    const UnitCell* ucell_in,
+                    const psi::Psi<std::complex<T>, Device>& psi,
+                    const K_Vectors& kv,
+                    const ModulePW::PW_Basis_K& pw_basis,             // level1: the plane wave basis, need ik
+                    Structure_Factor& sf,                              // level2: the structure factor calculator
+                    const double onsite_radius,
+                    const int nq,
+                    const double dq,
+                    const ModuleBase::matrix& wg,
+                    const ModuleBase::matrix& ekb);
+
+        /// @brief calculate and print the occupations of all lm orbitals
+        void cal_occupations(const psi::Psi<std::complex<T>, Device>* psi, const ModuleBase::matrix& wg_in);
+
+        int get_size_becp() const { return size_becp; }
+        std::complex<double>* get_becp() const { return becp; }
+        std::complex<double>* get_h_becp() const { return h_becp; }
+        std::complex<double>* get_tab_atomic() const { return tab_atomic_; }
+        int get_tot_nproj() const { return tot_nproj; }
+        int get_npw() const { return npw_; }
+        int get_npwx() const { return npwx_; }
+        const int& get_nh(int iat) const { return iat_nh[iat]; }
+
+        hamilt::Onsite_Proj_tools<T, Device>* get_fs_tools() const { return fs_tools; }
+
+        private:
+        OnsiteProjector(){};
+        ~OnsiteProjector();
+
+        Device* ctx = {};
+        base_device::DEVICE_CPU* cpu_ctx = {};
+        base_device::AbacusDevice_t device = {};
+        static OnsiteProjector<T, Device> *instance;
+
+        hamilt::Onsite_Proj_tools<T, Device>* fs_tools = nullptr;
+
+        std::complex<double>* tab_atomic_ = nullptr;
+        std::complex<double>* becp = nullptr;  // nbands * nkb
+        // host view of becp: a separate buffer on GPU, an alias of becp on CPU; null until first overlap
+        std::complex<double>* h_becp = nullptr;
+
+        int size_becp = 0;
+        int size_vproj = 0;
+        int tot_nproj = 0;
+        int npw_ = 0;
+        int npwx_ = 0;
+        int ik_ = 0;
+        std::vector<std::vector<int>> it2ia;
+        std::vector<double> rgrid;
+        std::vector<std::vector<double>> projs;
+        std::vector<std::vector<int>> it2iproj;
+        std::vector<int> lproj;
+        std::vector<int> iat_nh;
+
+        const UnitCell* ucell = nullptr;
+
+        const ModulePW::PW_Basis_K* pw_basis_ = nullptr;             // level1: the plane wave basis, need ik
+        Structure_Factor* sf_ = nullptr;                             // level2: the structure factor calculator
+        int ntype = 0;
+
+        RadialProjection::RadialProjector rp_;
+        std::vector<int> irow2it_;
+        std::vector<int> irow2iproj_;
+        std::vector<int> irow2m_;
+        std::map<std::tuple<int, int, int, int>, int> itiaiprojm2irow_;
+
+        ModuleBase::realArray tab;
+        ModuleBase::matrix nhtol;
+
+        bool initialed = false;
+
+        /// @brief rename the operators for CPU/GPU device
+        using gemm_op = hsolver::gemm_op<std::complex<T>, Device>;
+
+        using resmem_complex_op = base_device::memory::resize_memory_op<std::complex<T>, Device>;
+        using resmem_complex_h_op = base_device::memory::resize_memory_op<std::complex<T>, base_device::DEVICE_CPU>;
+        using setmem_complex_op = base_device::memory::set_memory_op<std::complex<T>, Device>;
+        using delmem_complex_op = base_device::memory::delete_memory_op<std::complex<T>, Device>;
+        using delmem_complex_h_op = base_device::memory::delete_memory_op<std::complex<T>, base_device::DEVICE_CPU>;
+        using syncmem_complex_h2d_op
+            = base_device::memory::synchronize_memory_op<std::complex<T>, Device, base_device::DEVICE_CPU>;
+        using syncmem_complex_d2h_op
+            = base_device::memory::synchronize_memory_op<std::complex<T>, base_device::DEVICE_CPU, Device>;
+
+        using resmem_var_op = base_device::memory::resize_memory_op<T, Device>;
+        using resmem_var_h_op = base_device::memory::resize_memory_op<T, base_device::DEVICE_CPU>;
+        using setmem_var_op = base_device::memory::set_memory_op<T, Device>;
+        using delmem_var_op = base_device::memory::delete_memory_op<T, Device>;
+        using delmem_var_h_op = base_device::memory::delete_memory_op<T, base_device::DEVICE_CPU>;
+        using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
+        using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op<T, base_device::DEVICE_CPU, Device>;
+
+        using resmem_int_op = base_device::memory::resize_memory_op<int, Device>;
+        using delmem_int_op = base_device::memory::delete_memory_op<int, Device>;
+        using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op<int, Device, base_device::DEVICE_CPU>;
+    };
+}// namespace projectors
+
+#endif // MODULEHAMILTPW_ONSITEPROJECTOR_H
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt
index 83f7955dbb..57f45558fb 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt
@@ -5,6 +5,7 @@ list(APPEND operator_ks_pw_srcs
     nonlocal_pw.cpp
     meta_pw.cpp
     velocity_pw.cpp
+    onsite_proj_pw.cpp
 )
 
 # this library is included in hamilt_pwdft now
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp
new file mode 100644
index 0000000000..39f0c1458a
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp
@@ -0,0 +1,422 @@
+#include "onsite_proj_pw.h"
+
+#include "module_base/blas_connector.h"
+#include "module_base/timer.h"
+#include "module_base/parallel_reduce.h"
+#include "module_base/tool_quit.h"
+#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h"
+#ifdef USE_PAW
+#include "module_cell/module_paw/paw_cell.h"
+#endif
+
+namespace hamilt {
+
+template<typename T, typename Device>
+OnsiteProj<OperatorPW<T, Device>>::OnsiteProj(const int* isk_in,
+                                               const UnitCell* ucell_in,
+                                               const bool cal_delta_spin,
+                                               const bool cal_dftu)
+{   // Only records its arguments: both pointers are stored as given and no buffer is allocated here.
+    this->classname = "OnsiteProj";
+    this->cal_type = calculation_type::pw_onsite;
+    this->isk = isk_in;
+    this->ucell = ucell_in;
+    this->has_delta_spin = cal_delta_spin; // when false, cal_ps_delta_spin() returns immediately
+    this->has_dftu = cal_dftu;             // when false, cal_ps_dftu() returns immediately
+}
+
+template<typename T, typename Device>
+OnsiteProj<OperatorPW<T, Device>>::~OnsiteProj() {
+    delmem_complex_op()(this->ctx, this->ps); // released unconditionally, even if it was never resized
+    if(this->init_delta_spin) // true only after cal_ps_delta_spin() has allocated its buffers
+    {
+        delmem_int_op()(this->ctx, this->ip_iat);
+        delmem_complex_op()(this->ctx, this->lambda_coeff);
+    }
+    if(this->has_dftu) // keyed on the feature switch, not on init_dftu, so these may still be nullptr
+    {
+        if(!init_delta_spin) // ip_iat is shared by both paths: skip it if it was already released above
+        {
+            delmem_int_op()(this->ctx, this->ip_iat);
+        }
+        delmem_int_op()(this->ctx, this->orb_l_iat);
+        delmem_int_op()(this->ctx, this->ip_m);
+        delmem_int_op()(this->ctx, this->vu_begin_iat);
+        delmem_complex_op()(this->ctx, this->vu_device); // NOTE(review): assumes delete_memory_op accepts nullptr -- confirm
+    }
+}
+
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::init(const int ik_in)
+{
+    ModuleBase::timer::tick("OnsiteProj", "getvnl");
+    this->ik = ik_in; // remember the k-point index this operator is now set up for
+
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    onsite_p->tabulate_atomic(ik_in); // presumably rebuilds the projector table for this k-point -- verify in OnsiteProjector
+    this->tnp = onsite_p->get_tot_nproj(); // total projector count; used as a dimension by cal_ps_* and the GEMM
+
+    if(this->next_op != nullptr) // forward the same index to the next operator in the chain, if any
+    {
+        this->next_op->init(ik_in);
+    }
+
+    ModuleBase::timer::tick("OnsiteProj", "getvnl");
+}
+
+//--------------------------------------------------------------------------
+// Adds the on-site projector term to hpsi_in with one GEMM: hpsi += tab_atomic * ps^T
+//--------------------------------------------------------------------------
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::add_onsite_proj(T *hpsi_in, const int npol, const int m) const
+{
+    ModuleBase::timer::tick("OnsiteProj", "add_onsite_proj");
+
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    // Assuming gemm_op takes its arguments in BLAS order: A = tab_atomic (npw x tnp, lda = npw),
+    // B = ps used transposed (stored npm x tnp, ldb = npm), C = hpsi_in (npw x npm, ldc = npwx).
+    const std::complex<double>* tab_atomic = onsite_p->get_tab_atomic();
+    const int npw = onsite_p->get_npw();
+    const int npwx = onsite_p->get_npwx();
+    char transa = 'N';
+    char transb = 'T';
+    int npm = m; // number of columns of hpsi_in to update; npol is not used in this function
+    gemm_op()(
+        this->ctx,
+        transa,
+        transb,
+        npw,
+        npm,
+        this->tnp,
+        &this->one, // alpha = 1
+        tab_atomic,
+        npw,
+        this->ps,
+        npm,
+        &this->one, // beta = 1: the product is accumulated onto the existing hpsi_in
+        hpsi_in,
+        npwx
+    );
+    ModuleBase::timer::tick("OnsiteProj", "add_onsite_proj");
+}
+
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::update_becp(const T *psi_in, const int npol, const int m) const
+{
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    // Hand the m bands in psi_in to the shared projector to evaluate <alpha|psi>. Nothing is returned here;
+    // the cal_ps_* steps read becp through get_becp() (presumably filled by this call). npol is unused.
+    onsite_p->overlap_proj_psi(m, psi_in);
+}
+
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::cal_ps_delta_spin(const int npol, const int m) const
+{
+    if(!this->has_delta_spin) return;
+
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    const std::complex<double>* becp = onsite_p->get_becp();
+
+    spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+    auto& constrain = sc.get_constrain(); // only referenced by the disabled reference loop at the end of this function
+    auto& lambda = sc.get_sc_lambda();
+
+    // ps is a lazily grown scratch buffer of tnp * m entries; nkb_m remembers the largest size allocated
+    // so far. It is cleared on every call: act() runs this step before cal_ps_dftu(), so it writes first.
+    if (this->nkb_m < m * tnp) {
+        resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj<PW>::ps");
+        this->nkb_m = m * tnp;
+    }
+    setmem_complex_op()(this->ctx, this->ps, 0, tnp * m);
+
+    if(!this->init_delta_spin)
+    {
+        this->init_delta_spin = true;
+        // one-time setup: allocate lambda_coeff and build ip_iat, the projector-index -> atom-index map
+        resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj());
+        resmem_complex_op()(this->ctx, this->lambda_coeff, this->ucell->nat * 4);
+        std::vector<int> ip_iat0(onsite_p->get_tot_nproj());
+        int ip0 = 0;
+        for(int iat=0;iat<this->ucell->nat;iat++)
+        {
+            for(int ip=0;ip<onsite_p->get_nh(iat);ip++)
+            {
+                ip_iat0[ip0++] = iat;
+            }
+        }
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
+    }
+
+    // per atom, pack four complex coefficients {l[2], l[0] + i*l[1], l[0] - i*l[1], -l[2]} with l = lambda[iat]; redone every call
+    std::vector<std::complex<double>> tmp_lambda_coeff(this->ucell->nat * 4);
+    for(int iat=0;iat<this->ucell->nat;iat++)
+    {
+        tmp_lambda_coeff[iat * 4] = std::complex<double>(lambda[iat][2], 0.0);
+        tmp_lambda_coeff[iat * 4 + 1] = std::complex<double>(lambda[iat][0], lambda[iat][1]);
+        tmp_lambda_coeff[iat * 4 + 2] = std::complex<double>(lambda[iat][0], -1 * lambda[iat][1]);
+        tmp_lambda_coeff[iat * 4 + 3] = std::complex<double>(-1 * lambda[iat][2], 0.0);
+    }
+    syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4);
+    // TODO: code block above should be moved to the init function
+
+    hamilt::onsite_ps_op<Real, Device>()(
+        this->ctx,   // device context
+        m, 
+        npol,
+        this->ip_iat, 
+        tnp,  
+        this->lambda_coeff,
+        this->ps, becp);
+
+    /*int sum = 0;
+    if (npol == 1)
+    {
+        const int current_spin = this->isk[this->ik];
+    }
+    else
+    {
+        for (int iat = 0; iat < this->ucell->nat; iat++)
+        {
+            const int nproj = onsite_p->get_nh(iat);
+            if(constrain[iat].x == 0 && constrain[iat].y == 0 && constrain[iat].z == 0)
+            {
+                sum += nproj;
+                continue;
+            }
+            const std::complex<double> coefficients0(lambda[iat][2], 0.0);
+            const std::complex<double> coefficients1(lambda[iat][0] , lambda[iat][1]);
+            const std::complex<double> coefficients2(lambda[iat][0] , -1 * lambda[iat][1]);
+            const std::complex<double> coefficients3(-1 * lambda[iat][2], 0.0);
+            // each atom has nproj, means this is with structure factor;
+            // each projector (each atom) must multiply coefficient
+            // with all the other projectors.
+            for (int ib = 0; ib < m; ib+=2)
+            {
+                for (int ip = 0; ip < nproj; ip++)
+                {
+                    const int psind = (sum + ip) * m + ib;
+                    const int becpind = ib * tnp + sum + ip;
+                    const std::complex<double> becp1 = becp[becpind];
+                    const std::complex<double> becp2 = becp[becpind + tnp];
+                    ps[psind] += coefficients0 * becp1
+                                    + coefficients2 * becp2;
+                    ps[psind + 1] += coefficients1 * becp1
+                                        + coefficients3 * becp2;
+                } // end ip
+            } // end ib
+            sum += nproj;
+        } // end iat
+    }*/
+}
+
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::cal_ps_dftu(const int npol, const int m) const
+{
+    if(!this->has_dftu) return;
+
+    auto* onsite_p = projectors::OnsiteProjector<double, Device>::get_instance();
+    const std::complex<double>* becp = onsite_p->get_becp();
+
+    auto* dftu = ModuleDFTU::DFTU::get_instance();
+
+    // ps is the same lazily grown scratch buffer used by cal_ps_delta_spin() (tnp * m entries).
+    // It is cleared here only when DeltaSpin is off; otherwise that step has already zeroed it.
+    if (this->nkb_m < m * tnp) {
+        resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj<PW>::ps");
+        this->nkb_m = m * tnp;
+    }
+    if(!this->has_delta_spin) 
+    {
+        setmem_complex_op()(this->ctx, this->ps, 0, tnp * m);
+    }
+
+    if(!this->init_dftu)
+    {
+        this->init_dftu = true;
+        // one-time setup: build the per-atom and per-projector index tables on the host, then upload them
+        resmem_int_op()(this->ctx, this->orb_l_iat, this->ucell->nat);
+        resmem_int_op()(this->ctx, this->ip_m, onsite_p->get_tot_nproj());
+        resmem_int_op()(this->ctx, this->vu_begin_iat, this->ucell->nat);
+        // ip_iat is shared with the DeltaSpin path; it is resized and refilled here with the same projector -> atom map
+        resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj());
+        std::vector<int> ip_iat0(onsite_p->get_tot_nproj());
+        std::vector<int> ip_m0(onsite_p->get_tot_nproj());
+        std::vector<int> vu_begin_iat0(this->ucell->nat);
+        std::vector<int> orb_l_iat0(this->ucell->nat);
+        int ip0 = 0;      // running projector index over all atoms
+        int vu_begin = 0; // running offset into the effective-potential array
+        for(int iat=0;iat<this->ucell->nat;iat++)
+        {
+            const int it = this->ucell->iat2it[iat];
+            const int target_l = dftu->orbital_corr[it]; // -1 is treated below as "no projector selected on this atom"
+            orb_l_iat0[iat] = target_l;
+            const int nproj = onsite_p->get_nh(iat);
+            if(target_l == -1)
+            {
+                for(int ip=0;ip<nproj;ip++)
+                {
+                    ip_iat0[ip0] = iat;
+                    ip_m0[ip0++] = -1;
+                }
+                vu_begin_iat0[iat] = 0;
+                continue;
+            }
+            else
+            {
+                const int tlp1 = 2 * target_l + 1;
+                vu_begin_iat0[iat] = vu_begin;
+                vu_begin += tlp1 * tlp1 * 4; // each selected atom occupies four (2l+1) x (2l+1) blocks
+                const int m_begin = target_l * target_l; // NOTE(review): assumes projectors are ordered by l, 2l+1 per l -- confirm
+                const int m_end  = (target_l + 1) * (target_l + 1);
+                for(int ip=0;ip<nproj;ip++)
+                {
+                    ip_iat0[ip0] = iat;
+                    if(ip >= m_begin && ip < m_end)
+                    {
+                        ip_m0[ip0++] = ip - m_begin; // position inside the selected l block
+                    }
+                    else
+                    {
+                        ip_m0[ip0++] = -1; // projector outside the selected l block
+                    }
+                }
+            }
+        }
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat);
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj());
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj());
+        syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat);
+
+        resmem_complex_op()(this->ctx, this->vu_device, dftu->get_size_eff_pot_pw());
+    }
+
+    syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); // re-uploaded on every call
+
+    hamilt::onsite_ps_op<Real, Device>()(
+        this->ctx,   // device context
+        m, 
+        npol,
+        this->orb_l_iat,
+        this->ip_iat,
+        this->ip_m,
+        this->vu_begin_iat, 
+        tnp,  
+        this->vu_device,
+        this->ps, becp);
+
+    /*
+    int sum = 0;
+    if (npol == 1)
+    {
+        const int current_spin = this->isk[this->ik];
+    }
+    else
+    {
+        for (int iat = 0; iat < this->ucell->nat; iat++)
+        {
+            const int it = this->ucell->iat2it[iat];
+            const int target_l = dftu->orbital_corr[it];
+            const int nproj = onsite_p->get_nh(iat);
+            if(target_l == -1)
+            {
+                sum += nproj;
+                continue;
+            }
+            const int ip_begin = target_l * target_l;
+            const int ip_end = (target_l + 1) * (target_l + 1);
+            const int tlp1 = 2 * target_l + 1;
+            const int tlp1_2 = tlp1 * tlp1;
+            const std::complex<double>* vu = dftu->get_eff_pot_pw(iat);
+            // each projector (each atom) must multiply coefficient
+            // with all the other projectors.
+            for (int ib = 0; ib < m; ib+=2)
+            {
+                for (int ip2 = ip_begin; ip2 < ip_end; ip2++)
+                {
+                    const int psind = (sum + ip2) * m + ib;
+                    const int m2 = ip2 - ip_begin;
+                    for (int ip1 = ip_begin; ip1 < ip_end; ip1++)
+                    {
+                        const int becpind1 = ib * tnp + sum + ip1;
+                        const int m1 = ip1 - ip_begin;
+                        const int index_mm = m1 * tlp1 + m2;
+                        const std::complex<double> becp1 = becp[becpind1];
+                        const std::complex<double> becp2 = becp[becpind1 + tnp];
+                        ps[psind] += vu[index_mm] * becp1
+                                    + vu[index_mm + tlp1_2 * 2] * becp2;
+                        ps[psind + 1] += vu[index_mm + tlp1_2 * 1] * becp1
+                                    + vu[index_mm + tlp1_2 * 3] * becp2;
+                    } // end ip1
+                } // end ip2
+            } // end ib
+            sum += nproj;
+        } // end iat
+    }*/
+}
+
+template<> // std::complex<float> on CPU: the four steps called by act() are empty stubs, so no on-site term is applied
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>::add_onsite_proj(std::complex<float> *hpsi_in, const int npol, const int m) const
+{}
+template<>
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>::update_becp(const std::complex<float> *psi_in, const int npol, const int m) const
+{}
+template<>
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>::cal_ps_delta_spin(const int npol, const int m) const
+{}
+template<>
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>::cal_ps_dftu(const int npol, const int m) const
+{}
+
+#if ((defined __CUDA) || (defined __ROCM))
+template<> // std::complex<float> on GPU: same empty stubs as the CPU specializations above
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::add_onsite_proj(std::complex<float> *hpsi_in, const int npol, const int m) const
+{}
+template<>
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::update_becp(const std::complex<float> *psi_in, const int npol, const int m) const
+{}
+template<>
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::cal_ps_delta_spin(const int npol, const int m) const
+{}
+template<>
+void OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>::cal_ps_dftu(const int npol, const int m) const
+{}
+#endif
+
+template<typename T, typename Device>
+void OnsiteProj<OperatorPW<T, Device>>::act(
+    const int nbands,
+    const int nbasis,              // not used in this function
+    const int npol,
+    const T* tmpsi_in,
+    T* tmhpsi,
+    const int ngk_ik,              // not used in this function
+    const bool is_first_node)const // not used in this function
+{
+    ModuleBase::timer::tick("Operator", "OnsiteProjPW");
+    this->update_becp(tmpsi_in, npol, nbands);   // 1. evaluate the projector overlaps with the input bands
+    this->cal_ps_delta_spin(npol, nbands);       // 2. zero ps and fill it from the DeltaSpin coefficients (if enabled)
+    this->cal_ps_dftu(npol, nbands);             // 3. DFT+U step on ps (if enabled); zeroes ps itself only when step 2 is off
+    this->add_onsite_proj(tmhpsi, npol, nbands); // 4. accumulate the result into tmhpsi with one GEMM
+    ModuleBase::timer::tick("Operator", "OnsiteProjPW");
+}
+
+template<typename T, typename Device>
+template<typename T_in, typename Device_in>
+hamilt::OnsiteProj<OperatorPW<T, Device>>::OnsiteProj(const OnsiteProj<OperatorPW<T_in, Device_in>> *nonlocal)
+{
+    this->classname = "OnsiteProj";
+    this->cal_type = calculation_type::pw_onsite; // must match the main constructor so the operator is identified consistently
+    // FIXME: isk, ucell, has_delta_spin and has_dftu are not yet copied from the source operator.
+}
+
+template class OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_CPU>>;
+template class OnsiteProj<OperatorPW<std::complex<double>, base_device::DEVICE_CPU>>;
+
+#if ((defined __CUDA) || (defined __ROCM))
+template class OnsiteProj<OperatorPW<std::complex<float>, base_device::DEVICE_GPU>>;
+template class OnsiteProj<OperatorPW<std::complex<double>, base_device::DEVICE_GPU>>;
+#endif
+} // namespace hamilt
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.h b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.h
new file mode 100644
index 0000000000..975967d5c8
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.h
@@ -0,0 +1,98 @@
+#ifndef MODULEHAMILTPW_ONSITE_PROJ_PW_H
+#define MODULEHAMILTPW_ONSITE_PROJ_PW_H
+
+#include "operator_pw.h"
+
+#include "module_cell/unitcell.h"
+#include "module_hsolver/kernels/math_kernel_op.h"
+
+namespace hamilt {
+
+#ifndef ONSITETEMPLATE_H
+#define ONSITETEMPLATE_H
+
+template<class T> class OnsiteProj : public T {};
+// Primary template: a pass-through that only inherits from T. The plane-wave behaviour
+// lives in the partial specialization OnsiteProj<OperatorPW<T, Device>> declared below.
+
+#endif
+
+template<typename T, typename Device>
+class OnsiteProj<OperatorPW<T, Device>> : public OperatorPW<T, Device>
+{
+  private:
+    using Real = typename GetTypeReal<T>::type;
+  public:
+    OnsiteProj(const int* isk_in,
+             const UnitCell* ucell_in,
+             const bool cal_delta_spin,
+             const bool cal_dftu);
+
+    template<typename T_in, typename Device_in = Device>
+    explicit OnsiteProj(const OnsiteProj<OperatorPW<T_in, Device_in>>* onsite_proj);
+
+    virtual ~OnsiteProj();
+
+    virtual void init(const int ik_in)override;
+
+    virtual void act(const int nbands,
+        const int nbasis,
+        const int npol,
+        const T* tmpsi_in,
+        T* tmhpsi,
+        const int ngk = 0,
+        const bool is_first_node = false)const override;
+
+    const int *get_isk() const {return this->isk;}
+    const UnitCell *get_ucell() const {return this->ucell;}
+
+  private:
+    void cal_ps_delta_spin(const int npol, const int m) const;
+    void cal_ps_dftu(const int npol, const int m) const;
+    void update_becp(const T* psi_in, const int npol, const int m) const;
+    void add_onsite_proj(T *hpsi_in, const int npol, const int m) const;
+
+    const int* isk = nullptr; // stored by the constructor; not freed by this class
+
+    const UnitCell* ucell = nullptr; // stored by the constructor; not freed by this class
+
+    // The buffers below are mutable because they are built lazily inside the const cal_ps_* methods.
+    mutable int* ip_iat = nullptr;       // per projector: index of the atom it belongs to (shared by both paths)
+    mutable T* lambda_coeff = nullptr;   // DeltaSpin: four complex coefficients per atom
+    mutable int* orb_l_iat = nullptr;    // DFT+U: per atom, the orbital_corr value of its type
+    mutable int* ip_m = nullptr;         // DFT+U: per projector, position inside the selected l block or -1
+    mutable int* vu_begin_iat = nullptr; // DFT+U: per atom, start offset into vu_device
+    mutable T* vu_device = nullptr;      // DFT+U: copy of the effective potential returned by get_eff_pot_pw(0)
+
+    mutable int nkb_m = 0; // largest tnp * m that ps has been sized for so far
+
+    bool has_delta_spin = false;
+    bool has_dftu = false;
+
+    mutable bool init_dftu = false;       // set once cal_ps_dftu() has built its index tables
+    mutable bool init_delta_spin = false; // set once cal_ps_delta_spin() has built its buffers
+
+    mutable T *ps = nullptr; // scratch buffer of tnp * m entries consumed by add_onsite_proj()
+    int tnp = 0;             // total projector count, refreshed in init()
+    Device* ctx = {};
+    base_device::DEVICE_CPU* cpu_ctx = {};
+
+    using gemv_op = hsolver::gemv_op<T, Device>;
+    using gemm_op = hsolver::gemm_op<T, Device>;
+    using setmem_complex_op = base_device::memory::set_memory_op<T, Device>;
+    using resmem_complex_op = base_device::memory::resize_memory_op<T, Device>;
+    using delmem_complex_op = base_device::memory::delete_memory_op<T, Device>;
+    using syncmem_complex_h2d_op = base_device::memory::synchronize_memory_op<T, Device, base_device::DEVICE_CPU>;
+    using resmem_int_op = base_device::memory::resize_memory_op<int, Device>;
+    using resmem_real_op = base_device::memory::resize_memory_op<Real, Device>;
+    using delmem_int_op = base_device::memory::delete_memory_op<int, Device>;
+    using delmem_real_op = base_device::memory::delete_memory_op<Real, Device>;
+    using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op<int, Device, base_device::DEVICE_CPU>;
+    using syncmem_real_h2d_op = base_device::memory::synchronize_memory_op<Real, Device, base_device::DEVICE_CPU>;
+
+    T one{1, 0};  // passed as both alpha and beta of the GEMM in add_onsite_proj()
+    T zero{0, 0};
+};
+
+} // namespace hamilt
+
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/projop_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/projop_pw.cpp
deleted file mode 100644
index b419d9ce88..0000000000
--- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/projop_pw.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-#include <cassert>
-#include <numeric>
-#include "module_parameter/parameter.h"
-#include <fstream>
-#include <string>
-#include <vector>
-#include <algorithm>
-#include <map>
-#include <tuple>
-#include <complex>
-
-#include "module_hamilt_pw/hamilt_pwdft/radial_proj.h"
-#include "module_basis/module_nao/projgen.h"
-#include "module_basis/module_nao/atomic_radials.h"
-#include "module_hamilt_pw/hamilt_pwdft/structure_factor.h"
-#include "module_basis/module_pw/pw_basis_k.h"
-#include "module_cell/unitcell.h"
-#include "module_base/blas_connector.h"
-#ifdef __MPI
-#include "module_base/parallel_reduce.h"
-#endif
-#include "module_io/orb_io.h"
-/**
- * ===============================================================================================
- * 
- *                                          README
- * 
- * ===============================================================================================
- * 
- * This is a code demo for illustrating how to use unified radial projection in implementation of
- * Operators involving local radial projectors on PW-expanded wavefunctions.
- * 
- * Example usage:
- * ```c++
- * // select the range of atoms that impose the operator in std::vector<std::vector<int>> it2ia like
- * // it2ia[it] = {ia1, ia2, ...} for each type
- * // if all atoms in present kind is "selected", just set it2ia[it].resize(na) and call 
- * // std::iota(it2ia[it].begin(), it2ia[it].end(), 0)
- * 
- * std::vector<std::vector<int>> it2ia; // as if we have given its value...
- * 
- * // you should have the `orbital_dir` as the directory containing the orbital files, then those
- * // will be read by a static function `AtomicRadials::read_abacus_orb` to get the radial orbitals
- * 
- * // call `init_proj` to initialize the radial projector, this function only needs to be called
- * // once during the runtime.
- * // its input... 
- * // the `nproj`, is for specifying number of projectors of each atom type, can be zero,
- * // but cannot be the value larger than the number of zeta functions for the given angular momentum.
- * // the `lproj` is the angular momentum of the projectors, and `iproj` is the index of zeta function
- * // that each projector generated from.
- * // the `lproj` along with `iproj` can enable radial projectors in any number developer wants.
- * 
- * // the `onsite_r` is the onsite-radius for all valid projectors, it is used to generate the new
- * // radial function that more localized than the original one, which is expected to have enhanced
- * // projection efficiency.
- * 
- * std::vector<double> rgrid;
- * std::vector<std::vector<double>> projs;
- * std::vector<std::vector<int>> it2iproj;
- * init_proj(orbital_dir, ucell, nproj, lproj, iproj, onsite_r, rgrid, projs, it2iproj);
- * 
- * // then call the function `cal_becp` to calculate the becp. HOWEVER, there are quantities that
- * // can be calculated in advance and reused in the following calculations. Please see the function
- * // implementation, especially the comments about CACHE 0, CACHE 1, CACHE 2..., etc.
- * 
- * // the input param of `cal_becp`...
- * // the `it2ia` has been explained above
- * // the `it2iproj` is the output of function `init_proj`, so you do not need to worry about it
- * // the `rgrid` and `projs` are also the output of function `init_proj`
- * // the `iproj2l` is the angular momentum for each projector, actually you have used it in `init_proj`, it
- * // is the same as `lproj`
- * // the `nq` is the number of G+k vectors, typically it is always PARAM.globalv.nqx
- * // the `dq` is the step size of G+k vectors, typically it is always PARAM.globalv.dq
- * // the `ik` is the k-point index
- * // the `pw_basis` is the plane wave basis, need ik
- * // the `omega` is the cell volume
- * // the `tpiba` is 2*pi/lat0
- * // the `sf` is the structure factor calculator
- * // the `psi` is the wavefunction
- * // the `becp` is the output of the function, it is the becp
- * cal_becp(it2ia, it2iproj, rgrid, projs, iproj2l, nq, dq, ik, pw_basis, omega, tpiba, sf, psi, becp);
- * 
- * // About parallelization, presently, the function `AtomicRadials::read_abacus_orb` is actually parallelized
- * // by MPI, so after the reading of orbital, actually all processors have the same data. Therefore it is not
- * // needed to call functions like `Parallel_Reduce` or `Parallel_Bcast` to synchronize the data.
- * // However, what is strikingly memory-consuming is the table `tab_atomic_`. Performance optimization will
- * // be needed if the memory is not enough.
- */
-
-
-/**
- * @brief initialize the radial projector for real-space projection involving operators
- * 
- * @param orbital_dir You know what it is
- * @param orb_files You know what it is
- * @param nproj # of projectors for each type defined in UnitCell, can be zero
- * @param lproj angular momentum for each projector
- * @param iproj index of zeta function that each projector generated from
- * @param onsite_r onsite-radius for all valid projectors
- * @param rgrid [out] the radial grid shared by all projectors
- * @param projs [out] projectors indexed by `iproj`
- * @param it2iproj [out] for each type, the projector index (across all types)
- */
-void init_proj(const std::string& orbital_dir,
-               const std::vector<std::string>& orb_files,
-               const std::vector<int>& nproj,           // for each type, the number of projectors
-               const std::vector<int>& lproj,           // angular momentum of projectors within the type (l of zeta function)
-               const std::vector<int>& iproj,           // index of projectors within the type (izeta)
-               const std::vector<double>& onsite_r,     // for each projector, the "onsite_radius"
-               std::vector<double>& rgrid,              // the radial grid shared by all projectors
-               std::vector<std::vector<double>>& projs, // projectors indexed by `iproj`
-               std::vector<std::vector<int>>& it2iproj) // for each type, the projector index (across all types)
-{
-    // extract the information from ucell
-    const int ntype = nproj.size();
-    assert(ntype == orb_files.size());
-    int nproj_tot = 0;
-    std::accumulate(nproj.begin(), nproj.end(), nproj_tot);
-    assert(nproj_tot == lproj.size());
-    assert(nproj_tot == iproj.size());
-    assert(nproj_tot == onsite_r.size());
-    projs.resize(nproj_tot);
-
-    int idx = 0;
-    int nr = -1;
-    double dr = -1.0;
-    for(int it = 0; it < ntype; ++it)
-    {
-        const int nproj_it = nproj[it];
-        it2iproj[it].resize(nproj_it);
-        if(nproj_it == 0) { continue; }
-        std::ifstream ifs(orbital_dir + "/" + orb_files[it]);
-        std::string elem = "";
-        double ecut = -1.0;
-        int nr_ = -1;
-        double dr_ = -1.0;
-        std::vector<int> nzeta; // number of radials for each l
-        std::vector<std::vector<double>> radials; // radials arranged in serial
-        ModuleIO::read_abacus_orb(ifs, elem, ecut, nr_, dr_, nzeta, radials);
-#ifdef __DEBUG
-        assert(elem != "");
-        assert(ecut != -1.0);
-        assert(nr_ != -1);
-        assert(dr_ != -1.0);
-#endif
-        nr = std::max(nr, nr_); // the maximal nr
-        assert(dr == -1.0 || dr == dr_); // the dr should be the same for all types
-        dr = (dr == -1.0) ? dr_ : dr;
-        for(int ip = 0; ip < nproj_it; ++ip)
-        {
-            int l = lproj[idx];
-            int izeta = iproj[idx];
-            int irad = 0;
-            std::accumulate(nzeta.begin(), nzeta.begin() + l, irad);
-            irad += izeta;
-            std::vector<double> temp = radials[irad];
-            smoothgen(nr, rgrid.data(), temp.data(), onsite_r[idx], projs[idx]);
-            it2iproj[it][ip] = idx;
-            ++idx;
-        }
-    }
-    // do zero padding
-    if(nr != -1)
-    {
-        std::for_each(projs.begin(), projs.end(), [nr](std::vector<double>& proj) { proj.resize(nr, 0.0); });
-    }
-    // generate the rgrid
-    rgrid.resize(nr);
-    std::iota(rgrid.begin(), rgrid.end(), 0);
-    std::for_each(rgrid.begin(), rgrid.end(), [dr](double& r_i) { r_i *= dr; });
-}
-
-// I am sorry but what does becp mean?...
-void cal_becp(const std::vector<std::vector<int>>& it2ia,       // level0: for given type `it`, the atom indices `ia`
-              const std::vector<std::vector<int>>& it2iproj,    // level0: for given type `it`, the proj indices `iproj`
-              const std::vector<double>& rgrid,                 // level0: the radial grid shared by all projectors
-              const std::vector<std::vector<double>>& projs,    // level0: projectors indexed by `iproj`
-              const std::vector<int>& iproj2l,                  // level0: for given proj index `iproj`, the angular momentum `l`
-              const int nq,                                     // level0: PARAM.globalv.nqx
-              const double& dq,                                 // level0: PARAM.globalv.dq
-              const int ik,                                     // level1: the k-point index
-              const ModulePW::PW_Basis_K& pw_basis,             // level1: the plane wave basis, need ik
-              const double& omega,                              // level1: the cell volume
-              const double& tpiba,                              // level1: 2*pi/lat0
-              Structure_Factor& sf,                             // level2: the structure factor calculator
-              const psi::Psi<std::complex<double>, base_device::DEVICE_CPU>& psi,
-              std::vector<std::complex<double>>& becp
-              )
-{
-    // STAGE 0 - making the interpolation table
-    // CACHE 0 - if cache the irow2it, irow2iproj, irow2m, itiaiprojm2irow, <G+k|p> can be reused for 
-    //           SCF, RELAX and CELL-RELAX calculation
-    // [in] rgrid, projs, iproj2l, it2ia, it2iproj, nq, dq
-    RadialProjection::RadialProjector rp;
-    std::vector<int> irow2it;
-    std::vector<int> irow2iproj;
-    std::vector<int> irow2m;
-    std::map<std::tuple<int, int, int, int>, int> itiaiprojm2irow;
-    RadialProjection::RadialProjector::_build_backward_map(it2iproj, iproj2l, irow2it, irow2iproj, irow2m);
-    RadialProjection::RadialProjector::_build_forward_map(it2ia, it2iproj, iproj2l, itiaiprojm2irow);
-    rp._build_sbt_tab(rgrid, projs, iproj2l, nq, dq);
-
-
-    // STAGE 1 - calculate the <G+k|p> for the given G+k vector
-    // CACHE 1 - if cache the tab_, <G+k|p> can be reused for SCF and RELAX calculation
-    // [in] pw_basis, ik, omega, tpiba, irow2it
-    const int npw = pw_basis.npwk[ik];
-    std::vector<ModuleBase::Vector3<double>> q(npw);
-    for(int ig = 0; ig < npw; ++ig)
-    {
-        q[ig] = pw_basis.getgpluskcar(ik, ig); // get the G+k vector, G+k will change during CELL-RELAX
-    }
-    const int nrow = irow2it.size();
-    std::vector<std::complex<double>> tab_(nrow*npw);
-    rp.sbtft(q, tab_, 'l', omega, tpiba); // l: <p|G+k>, r: <G+k|p>
-    q.clear();
-    q.shrink_to_fit(); // release memory
-
-
-    // STAGE 2 - make_atomic: multiply e^iqtau and extend the <G+k|p> to <G+k|pi> for each atom
-    // CACHE 2 - if cache the tab_atomic_, <G+k|p> can be reused for SCF calculation
-    // [in] it2ia, itiaiprojm2irow, tab_, npw, sf
-    std::vector<int> na(it2ia.size());
-    for(int it = 0; it < it2ia.size(); ++it)
-    {
-        na[it] = it2ia[it].size();
-    }
-    const int nrow_out = itiaiprojm2irow.size();
-    std::vector<std::complex<double>> tab_atomic_(nrow_out*npw); // memory usage peak HERE
-    for(int irow = 0; irow < nrow; ++irow)
-    {
-        const int it = irow2it[irow];
-        const int iproj = irow2iproj[irow];
-        const int m = irow2m[irow];
-        for(int ia = 0; ia < na[it]; ++ia)
-        {
-            // why Structure_Factor needs the FULL pw_basis???
-            std::complex<double>* sk = sf.get_sk(ik, it, ia, &pw_basis);
-            const int irow_out = itiaiprojm2irow.at(std::make_tuple(it, ia, iproj, m));
-            for(int ig = 0; ig < npw; ++ig)
-            {
-                tab_atomic_[irow_out*npw + ig] = sk[ig]*tab_[irow*npw + ig];
-            }
-            delete[] sk;
-        }
-    }
-    tab_.clear();
-    tab_.shrink_to_fit(); // release memory
-
-
-    // STAGE 3 - cal_becp
-    // CACHE 3 - it is no use to cache becp, it will change in each SCF iteration
-    // [in] psi, tab_atomic_, npw, becp, ik
-    const int nbands = psi.get_nbands();
-    const char transa = 'N';
-    const char transb = 'N';
-    const int one = 1;
-    const int lda = nrow_out;
-    const int ldb = npw;
-    const int ldc = nrow_out;
-    const std::complex<double> alpha = 1.0;
-    const std::complex<double> beta = 0.0;
-
-    becp.resize(nbands*nrow_out);
-    psi.fix_k(ik);
-    BlasConnector::gemm(transa,                 // const char transa
-                        transb,                 // const char transb
-                        nrow_out,               // const int m
-                        nbands,                 // const int n
-                        npw,                    // const int k
-                        alpha,                  // const std::complex<double> alpha
-                        tab_atomic_.data(),     // const std::complex<double>* a
-                        lda,                    // const int lda
-                        psi.get_pointer(),      // const std::complex<double>* b
-                        ldb,                    // const int ldb
-                        beta,                   // const std::complex<double> beta
-                        becp.data(),            // std::complex<double>* c
-                        ldc);                   // const int ldc
-#ifdef __MPI
-    Parallel_Reduce::reduce_pool(becp.data(), becp.size());
-#endif
-    tab_atomic_.clear();
-    tab_atomic_.shrink_to_fit(); // release memory
-}
diff --git a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp
index 8ac987df45..88cfeeadee 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp
@@ -8,6 +8,7 @@
 #include "module_base/matrix.h"
 #include "module_base/math_ylmreal.h"
 #include "module_base/spherical_bessel_transformer.h"
+#include "module_base/timer.h"
 
 void RadialProjection::RadialProjector::_build_backward_map(const std::vector<std::vector<int>>& it2iproj,
                                                             const std::vector<int>& iproj2l,
@@ -81,6 +82,7 @@ void RadialProjection::RadialProjector::_build_sbt_tab(const int nr,
                                                        const int nq,
                                                        const double& dq)
 {
+    ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial");
     l_ = l;
     const int nrad = radials.size();
     assert(nrad == l.size());
@@ -104,6 +106,7 @@ void RadialProjection::RadialProjector::_build_sbt_tab(const int nr,
         std::for_each(_temp.begin(), _temp.end(), [pref](double& x){x = x/pref;});
         cubspl_->add(_temp.data());
     }
+    ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial");
 }
 
 void RadialProjection::RadialProjector::_build_sbt_tab(const std::vector<double>& r,
@@ -112,20 +115,91 @@ void RadialProjection::RadialProjector::_build_sbt_tab(const std::vector<double>
                                                        const int nq,
                                                        const double& dq)
 {
+    ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial");
     const int nr = r.size();
     const int nrad = radials.size();
     for(int i = 0; i < nrad; i++) { assert(radials[i].size() == nr); }
     std::vector<double*> radptrs(radials.size());
     for(int i = 0; i < radials.size(); i++) { radptrs[i] = const_cast<double*>(radials[i].data()); }
+    ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial");
     _build_sbt_tab(nr, r.data(), radptrs, l, nq, dq);
 }
 
+// Tabulate the spherical Bessel transform of every radial projector on the uniform grid q = iq*dq, filling
+// tab(it, ip, iq) and the channel-to-l map nhtol(it, ih). Takes omega itself because sbtft is not called.
+void RadialProjection::RadialProjector::_build_sbt_tab(const std::vector<int>& nproj,
+                                                       const std::vector<double>& r,
+                                                       const std::vector<std::vector<double>>& radials,
+                                                       const std::vector<int>& l,
+                                                       const int nq,                             //< GlobalV::NQX
+                                                       const double& dq,                         //< GlobalV::DQ
+                                                       const double& omega,
+                                                       const int npol,                           // for nspin 4
+                                                       ModuleBase::realArray& tab,               // output table
+                                                       ModuleBase::matrix& nhtol)                // output: l of each channel
+{
+    assert(!nproj.empty()); // max_element on an empty range returns end(); dereferencing it is undefined
+    const int nprojmax = *std::max_element(nproj.begin(), nproj.end());
+    const int ntype = nproj.size();
+
+    tab.create(ntype, nprojmax*npol, nq);
+    tab.zero_out();
+
+    std::vector<double> qgrid(nq);
+    std::iota(qgrid.begin(), qgrid.end(), 0);
+    std::transform(qgrid.begin(), qgrid.end(), qgrid.begin(), [dq](const double& q){return q*dq;});
+
+    ModuleBase::SphericalBesselTransformer sbt_(true); // bool: enable cache
+    int iproj = 0;  // running index into l and radials, across all types
+    int nchmax = 0; // largest number of (l, m) channels found on a single type
+    const double pref = 4*M_PI/std::sqrt(omega) / std::sqrt(2.0/std::acos(-1.0));
+    for (int it = 0; it < ntype; it++)
+    {
+        int nch = 0;
+        const int nproj_it = nproj[it];
+        for (int ip = 0; ip < nproj_it; ip++)
+        {
+            const int l_proj = l[iproj]; // named l_proj so the member l_ is not shadowed
+            nch += 2*l_proj + 1;
+            std::vector<double> _temp(nq);
+            sbt_.direct(l_proj, r.size(), r.data(), radials[iproj].data(), nq, qgrid.data(), _temp.data());
+            std::for_each(_temp.begin(), _temp.end(), [pref](double& x){x = x*pref;});
+            for (int iq = 0; iq < nq; iq++)
+            {
+                tab(it, ip, iq) = _temp[iq];
+            }
+            iproj++;
+        }
+        nchmax = std::max(nchmax, nch);
+    }
+
+    nhtol.create(ntype, nchmax);
+    nhtol.zero_out();
+    iproj = 0;
+    for (int it = 0; it < ntype; it++)
+    {
+        int ih = 0; // channel index, across all projectors of present type
+        for (int ip = 0; ip < nproj[it]; ip++)
+        {
+            const int l_proj = l[iproj];
+            for (int m = -l_proj; m <= l_proj; m++)
+            {
+                nhtol(it, ih) = l_proj;
+                ih++;
+            }
+            iproj++;
+        }
+    }
+}
+
 void RadialProjection::RadialProjector::sbtft(const std::vector<ModuleBase::Vector3<double>>& qs,
                                               std::vector<std::complex<double>>& out,
                                               const char type,
                                               const double& omega,
                                               const double& tpiba)
 {
+    ModuleBase::timer::tick("RadialProjection", "interp_sphbes_ft_flzYlm");
+    assert(type == 'r' || type == 'l'); // type must be one of 'r' or 'l'
     // first cache the Ylm values
     const int lmax_ = *std::max_element(l_.begin(), l_.end());
     const int total_lm = std::pow(lmax_+1, 2);
@@ -146,7 +220,11 @@ void RadialProjection::RadialProjector::sbtft(const std::vector<ModuleBase::Vect
     for(int i = 0; i < nrad; i++)
     {
         const int l = l_[i];
-        std::complex<double> pref = (type == 'r')? std::pow(ModuleBase::IMAG_UNIT, l) : std::pow(ModuleBase::NEG_IMAG_UNIT, l);
+        // NOTE: this prefactor is bug-prone.
+        // We define 'l' as <p|G+k> and 'r' as <G+k|p>. The former is int{p(r)exp(iqr)} and the latter is
+        // int{p(r)exp(-iqr)}, where we use the notation q = G+k. Expanding exp(iqr) in Ylm therefore gives a
+        // prefactor i^l, and expanding exp(-iqr) gives (-i)^l.
+        std::complex<double> pref = (type == 'l')? std::pow(ModuleBase::IMAG_UNIT, l) : std::pow(ModuleBase::NEG_IMAG_UNIT, l);
         pref = pref * ModuleBase::FOUR_PI/std::sqrt(omega);
         cubspl_->eval(npw, qnorm.data(), Jlfq.data(), nullptr, nullptr, i);
         for(int m = -l; m <= l; m++)
@@ -159,6 +237,7 @@ void RadialProjection::RadialProjector::sbtft(const std::vector<ModuleBase::Vect
         }
     }
     assert(iproj == nchannel); // should write to inflate each radial to 2l+1 channels
+    ModuleBase::timer::tick("RadialProjection", "interp_sphbes_ft_flzYlm");
 }
 
 void RadialProjection::_mask_func(std::vector<double>& mask)
diff --git a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h
index 2867403b4b..d4a5511bac 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h
@@ -155,30 +155,41 @@ namespace RadialProjection
                                 const double* r,
                                 const std::vector<double*>& radials,
                                 const std::vector<int>& l,
-                                const int nq,                             //< PARAM.globalv.dq
-                                const double& dq);                        //< PARAM.globalv.nqx
+                                const int nq,                             //< GlobalV::NQX
+                                const double& dq);                        //< GlobalV::DQ
             void _build_sbt_tab(const std::vector<double>& r,
                                 const std::vector<std::vector<double>>& radials,
                                 const std::vector<int>& l,
-                                const int nq,                             //< PARAM.globalv.dq
-                                const double& dq);                        //< PARAM.globalv.nqx
-
+                                const int nq,                             //< GlobalV::NQX
+                                const double& dq);                        //< GlobalV::DQ
+            // compatibility concern: for FS_Nonlocal_tools. This overload does not call sbtft, so it needs omega
+            void _build_sbt_tab(const std::vector<int>& nproj,
+                                const std::vector<double>& r,
+                                const std::vector<std::vector<double>>& radials,
+                                const std::vector<int>& l,
+                                const int nq,                             //< GlobalV::NQX
+                                const double& dq,                         //< GlobalV::DQ
+                                const double& omega,
+                                const int npol,                           // for nspin 4
+                                ModuleBase::realArray& tab,               // [out] created as (ntype, nprojmax*npol, nq)
+                                ModuleBase::matrix& nhtol);               // [out] created as (ntype, nchmax), l of each channel
             /**
              * @brief perform analytical version of the Fourier transform:
              * F(q) = int(f(r)*exp(-iq.r) d^3r)
-             *      = 4*pi/sqrt(omega) * i^l * Jl[f](q) * Ylm(q)
+             *      = 4*pi/sqrt(omega) * (-i)^l * Jl[f](q) * Ylm(q)
              * , where Ylm(q) is real spherical harmonic function, and Jl[f](q) is 
              * the Spherial Bessel Transform of f(r):
              * Jl[f](q) = int(f(r)*j_l(q*r)*r^2 dr)
              * , where j_l(q*r) is the spherical Bessel function of the first kind.
-             * 
+             * In bra-ket notation this F(q) = <q|f> is selected by type "r" (ket |f>),
+             * while type "l" selects the bra counterpart <f|q>.
              */
             
             void sbtft(const std::vector<ModuleBase::Vector3<double>>& qs,
                        std::vector<std::complex<double>>& out,
-                       const char type = 'r',
+                       const char type = 'r',                                   // 'r' for ket |>, 'l' for bra <|
                        const double& omega = 1.0,
-                       const double& tpiba = 1.0); // 'r' for ket |>, 'l' for bra <|
+                       const double& tpiba = 1.0);                              // NOTE(review): presumably the 2*pi/lat0 unit of qs -- confirm
             
             void sbfft(); // interface for SBFFT
 
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
index 6d5ee5581e..a81dbc9d93 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h
@@ -162,6 +162,23 @@ class Stress_Func
                    const psi::Psi<complex<FPTYPE>, Device>* psi_in,
                    const pseudopot_cell_vnl& nlpp_in,
                    const UnitCell& ucell_in); // nonlocal part in PW basis
+    // 8) the stress from the DFT+U and DeltaSpin calculations
+    /**
+     * @brief This routine computes the stress contribution from the DFT+U and DeltaSpin calculations:
+     *    Stress^{NL}_{ij} = -1/\Omega \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}
+     *        (V^U_{lmm'\sigma\sigma'} + f(\lambda,\sigma\sigma'))
+     *        [ \sum_G \langle c_{nk}(\mathbf{G+K})|\alpha_{lm}^I(\mathbf{G+K})\rangle *
+     *          \sum_{G'}\langle \partial \alpha_{lm}^I(\mathbf{G+K})/\partial \varepsilon_{ij}|c_{nk}(\mathbf{G+K})\rangle ]
+     * The evaluation has three parts: (1) sum over becp and dbecp weighted by f(U+\lambda, \sigma\sigma', lmm')^{I}
+     * (the prefactor term); (2) calculate becp = <psi | alpha> (the first bracketed factor); (3) calculate
+     * dbecp = <psi | dalpha> (the second bracketed factor).
+     */
+    void stress_onsite(ModuleBase::matrix& sigma,
+                       const ModuleBase::matrix& wg,
+                       const ModulePW::PW_Basis_K* wfc_basis,
+                       const UnitCell& ucell_in,
+                       const psi::Psi<complex<FPTYPE>, Device>* psi_in,
+                       ModuleSymmetry::Symmetry* p_symm); // DFT+U and DeltaSpin (on-site) part in PW basis
 
     void get_dvnl1(ModuleBase::ComplexMatrix& vkb,
                    const int ik,
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
new file mode 100644
index 0000000000..8568821a10
--- /dev/null
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp
@@ -0,0 +1,113 @@
+#include "module_base/module_device/device.h"
+#include "module_base/timer.h"
+#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h"
+#include "module_parameter/parameter.h"
+#include "module_hamilt_lcao/module_dftu/dftu.h"
+#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h"
+#include "stress_func.h"
+// Calculate the on-site (DFT+U and DeltaSpin) stress contribution in the PW basis.
+// Returns without touching sigma when psi_in or wfc_basis is null; otherwise sigma is overwritten with the
+// result, which is reduced over all processors, scaled by 1/omega and (if symm_flag == 1) symmetrized.
+template <typename FPTYPE, typename Device>
+void Stress_Func<FPTYPE, Device>::stress_onsite(ModuleBase::matrix& sigma,
+                                            const ModuleBase::matrix& wg,
+                                            const ModulePW::PW_Basis_K* wfc_basis,
+                                            const UnitCell& ucell_in,
+                                            const psi::Psi<complex<FPTYPE>, Device>* psi_in,
+                                            ModuleSymmetry::Symmetry* p_symm)
+{
+    ModuleBase::TITLE("Stress_Func", "stress_onsite");
+    if(psi_in == nullptr || wfc_basis == nullptr)
+    {
+        return;
+    }
+    ModuleBase::timer::tick("Stress_Func", "stress_onsite");
+
+    // 9-element (3x3) device buffer handed to the fs_tools stress routines, zeroed first
+    FPTYPE* stress_device = nullptr;
+    resmem_var_op()(this->ctx, stress_device, 9);
+    setmem_var_op()(this->ctx, stress_device, 0, 9);
+    std::vector<FPTYPE> sigma_onsite(9, 0.0);
+
+    auto* onsite_p = projectors::OnsiteProjector<FPTYPE, Device>::get_instance();
+
+    const int nks = wfc_basis->nks;
+    for (int ik = 0; ik < nks; ik++) // loop k points
+    {
+        // skip trailing zero-weight bands; the bound is tested first so wg is never indexed at -1
+        int nbands_occ = wg.nc;
+        while (nbands_occ > 0 && wg(ik, nbands_occ - 1) == 0.0)
+        {
+            nbands_occ--;
+        }
+        const int npm = nbands_occ;
+
+        // calculate becp = <psi|beta> for all beta functions
+        onsite_p->get_fs_tools()->cal_becp(ik, npm);
+        // calculate dbecp = <psi|d(beta)/d(epsilon_ij)> for all beta functions
+        // calculate stress = \sum <psi|d(beta_j)/d(epsilon)> * <psi|beta_i> * D_{ij}
+        // only the lower triangle (jpol <= ipol) is evaluated; it is mirrored after the k loop
+        for (int ipol = 0; ipol < 3; ipol++)
+        {
+            for (int jpol = 0; jpol <= ipol; jpol++)
+            {
+                FPTYPE* stress_device_tmp = stress_device + (ipol * 3 + jpol);
+                onsite_p->get_fs_tools()->cal_dbecp_s(ik, npm, ipol, jpol);
+                if(PARAM.inp.dft_plus_u)
+                {
+                    auto* dftu = ModuleDFTU::DFTU::get_instance();
+                    onsite_p->get_fs_tools()->cal_stress_dftu(ik, npm, stress_device_tmp, dftu->orbital_corr.data(), dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw(), wg.c);
+                }
+                if(PARAM.inp.sc_mag_switch)
+                {
+                    spinconstrain::SpinConstrain<std::complex<double>>& sc = spinconstrain::SpinConstrain<std::complex<double>>::getScInstance();
+                    const std::vector<ModuleBase::Vector3<double>>& lambda = sc.get_sc_lambda();
+                    onsite_p->get_fs_tools()->cal_stress_dspin(ik, npm, stress_device_tmp, lambda.data(), wg.c);
+                }
+            }
+        }
+    }
+    // transfer stress from device to host
+    syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigma_onsite.data(), stress_device, 9);
+    delmem_var_op()(this->ctx, stress_device);
+    // mirror the lower triangle into the upper one, then sum up the on-site stress from all processors
+    for (int l = 0; l < 3; l++)
+    {
+        for (int m = 0; m < 3; m++)
+        {
+            if (m > l)
+            {
+                sigma_onsite[l * 3 + m] = sigma_onsite[m * 3 + l];
+            }
+            Parallel_Reduce::reduce_all(sigma_onsite[l * 3 + m]); // qianrui fix a bug for kpar > 1
+        }
+    }
+    // rescale the stress with 1/omega
+    for (int ipol = 0; ipol < 3; ipol++)
+    {
+        for (int jpol = 0; jpol < 3; jpol++)
+        {
+            sigma_onsite[ipol * 3 + jpol] *= 1.0 / ucell_in.omega;
+        }
+    }
+
+    for (int ipol = 0; ipol < 3; ipol++)
+    {
+        for (int jpol = 0; jpol < 3; jpol++)
+        {
+            sigma(ipol, jpol) = sigma_onsite[ipol * 3 + jpol];
+        }
+    }
+    // do symmetry
+    if (ModuleSymmetry::Symmetry::symm_flag == 1)
+    {
+        p_symm->symmetrize_mat3(sigma, ucell_in.lat);
+    } // end symmetry
+
+    ModuleBase::timer::tick("Stress_Func", "stress_onsite");
+}
+
+template class Stress_Func<double, base_device::DEVICE_CPU>;
+#if ((defined __CUDA) || (defined __ROCM))
+template class Stress_Func<double, base_device::DEVICE_GPU>;
+#endif
\ No newline at end of file
diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
index 2a6925c912..e9cc5ded2b 100644
--- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
+++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp
@@ -45,6 +45,9 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
     // vdw stress
     ModuleBase::matrix sigmavdw;
     sigmavdw.create(3, 3);
+    // DFT+U and DeltaSpin stress
+    ModuleBase::matrix sigmaonsite;
+    sigmaonsite.create(3, 3);
 
     for (int i = 0; i < 3; i++)
     {
@@ -59,6 +62,7 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
             sigmaewa(i, j) = 0.0;
             sigmaxcc(i, j) = 0.0;
             sigmavdw(i, j) = 0.0;
+            sigmaonsite(i, j) = 0.0;
         }
     }
 
@@ -107,13 +111,19 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
     // vdw term
     stress_vdw(sigmavdw, ucell);
 
+    // DFT+U and DeltaSpin stress
+    if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+    {
+        this->stress_onsite(sigmaonsite, this->pelec->wg, wfc_basis, ucell, d_psi_in, p_symm);
+    }
+
     for (int ipol = 0; ipol < 3; ipol++)
     {
         for (int jpol = 0; jpol < 3; jpol++)
         {
             sigmatot(ipol, jpol) = sigmakin(ipol, jpol) + sigmahar(ipol, jpol) + sigmanl(ipol, jpol)
                                    + sigmaxc(ipol, jpol) + sigmaxcc(ipol, jpol) + sigmaewa(ipol, jpol)
-                                   + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol);
+                                   + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol) + sigmaonsite(ipol, jpol);
         }
     }
 
@@ -138,6 +148,10 @@ void Stress_PW<FPTYPE, Device>::cal_stress(ModuleBase::matrix& sigmatot,
         ModuleIO::print_stress("XC    STRESS", sigmaxc, PARAM.inp.test_stress, ry);
         ModuleIO::print_stress("EWALD    STRESS", sigmaewa, PARAM.inp.test_stress, ry);
         ModuleIO::print_stress("NLCC    STRESS", sigmaxcc, PARAM.inp.test_stress, ry);
+        if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch)
+        {
+            ModuleIO::print_stress("ONSITE    STRESS", sigmaonsite, PARAM.inp.test_stress, ry);
+        }
         ModuleIO::print_stress("TOTAL    STRESS", sigmatot, PARAM.inp.test_stress, ry);
     }
     ModuleBase::timer::tick("Stress_PW", "cal_stress");
diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp
index a092a8260c..5ec443ab4e 100644
--- a/source/module_hsolver/diago_iter_assist.cpp
+++ b/source/module_hsolver/diago_iter_assist.cpp
@@ -412,6 +412,171 @@ void DiagoIterAssist<T, Device>::diagH_LAPACK(const int nstart,
     ModuleBase::timer::tick("DiagoIterAssist", "diagH_LAPACK");
 }
 
+template <typename T, typename Device>
+void DiagoIterAssist<T, Device>::cal_hs_subspace(const hamilt::Hamilt<T, Device>* pHamilt, // hamiltonian operator carrier
+                                                const psi::Psi<T, Device>& psi,     // [in] wavefunction
+                                                T *hcc,                             // [out] <psi|H|psi>, nstart * nstart
+                                                T *scc)                             // [out] <psi|S|psi>, nstart * nstart
+{
+    const int nstart = psi.get_nbands();
+
+    setmem_complex_op()(ctx, hcc, 0, nstart * nstart);
+    setmem_complex_op()(ctx, scc, 0, nstart * nstart);
+
+    const int dmin = psi.get_current_nbas(); // rows actually used in the products
+    const int dmax = psi.get_nbasis();       // leading dimension of psi and of the scratch buffer
+
+    T* temp = nullptr; // scratch buffer, holds H|psi> first and is then reused for S|psi>
+    resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp");
+    setmem_complex_op()(ctx, temp, 0, nstart * dmax);
+
+    { // code block to calculate hcc and scc
+        // temp was zeroed right after allocation above, so it is not cleared a second time here
+
+        T* hphi = temp;
+        // do hPsi for all bands
+        psi::Range all_bands_range(1, psi.get_current_k(), 0, nstart - 1);
+        hpsi_info hpsi_in(&psi, all_bands_range, hphi);
+        pHamilt->ops->hPsi(hpsi_in);
+
+        gemm_op<T, Device>()(ctx,
+                             'C',
+                             'N',
+                             nstart,
+                             nstart,
+                             dmin,
+                             &one,
+                             psi.get_pointer(),
+                             dmax,
+                             hphi,
+                             dmax,
+                             &zero,
+                             hcc,
+                             nstart);
+
+        T* sphi = temp;
+        // do sPsi for all bands
+        pHamilt->sPsi(psi.get_pointer(), sphi, dmax, dmin, nstart);
+
+        gemm_op<T, Device>()(ctx,
+                             'C',
+                             'N',
+                             nstart,
+                             nstart,
+                             dmin,
+                             &one,
+                             psi.get_pointer(),
+                             dmax,
+                             sphi,
+                             dmax,
+                             &zero,
+                             scc,
+                             nstart);
+    }
+
+    if (GlobalV::NPROC_IN_POOL > 1)
+    {
+        Parallel_Reduce::reduce_pool(hcc, nstart * nstart);
+        Parallel_Reduce::reduce_pool(scc, nstart * nstart);
+    }
+
+    delmem_complex_op()(ctx, temp);
+}
+
+template <typename T, typename Device>
+void DiagoIterAssist<T, Device>::diag_responce( const T* hcc,
+                                                const T* scc,
+                                                const int nbands,
+                                                const T* mat_in,           // [in] matrix to be multiplied by the eigenvectors
+                                                T* mat_out,
+                                                int mat_col,          // [in] gemm dimension m and leading dimension of mat_in/mat_out
+                                                Real* en                           // [out] eigenvalues
+)
+{
+    ModuleBase::TITLE("DiagoIterAssist", "diag_responce");
+    ModuleBase::timer::tick("DiagoIterAssist", "diag_responce");
+
+    const int nstart = nbands;
+
+    T *vcc = nullptr;
+    resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc");
+    setmem_complex_op()(ctx, vcc, 0, nstart * nstart);
+
+    // after generation of H and S matrix, diag them
+    DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc);
+
+    { // code block to calculate mat_out = mat_in * vcc
+        gemm_op<T, Device>()(ctx,
+                             'N',
+                             'N',
+                             mat_col,
+                             nstart,
+                             nstart,
+                             &one,
+                             mat_in, // mat_col * nstart
+                             mat_col,
+                             vcc, // nstart * nstart
+                             nstart,
+                             &zero,
+                             mat_out,
+                             mat_col);
+    }
+
+    delmem_complex_op()(ctx, vcc);
+
+    ModuleBase::timer::tick("DiagoIterAssist", "diag_responce");
+}
+
+template <typename T, typename Device>
+void DiagoIterAssist<T, Device>::diag_subspace_psi(const T* hcc,
+                              const T* scc,
+                              const int dim_subspace,
+                              psi::Psi<T, Device>& evc,
+                              Real* en
+)
+{
+    ModuleBase::TITLE("DiagoIterAssist", "diag_subspace_psi");
+    ModuleBase::timer::tick("DiagoIterAssist", "diag_subspace_psi");
+
+    const int nstart = dim_subspace;
+    const int n_band = evc.get_nbands(); // NOTE(review): the gemm below reads nstart columns of evc and n_band columns of vcc, so n_band == nstart appears to be assumed -- confirm
+
+    T *vcc = nullptr;
+    resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc");
+    setmem_complex_op()(ctx, vcc, 0, nstart * nstart);
+
+    // after generation of H and S matrix, diag them
+    DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc);
+
+    { // code block to rotate evc in place: evc <- evc * vcc, via the scratch buffer temp
+        const int dmin = evc.get_current_nbas();
+        const int dmax = evc.get_nbasis();
+        T* temp = nullptr;
+        resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp");
+        setmem_complex_op()(ctx, temp, 0, nstart * dmax);
+        gemm_op<T, Device>()(ctx,
+                             'N',
+                             'N',
+                             dmin,
+                             n_band,
+                             nstart,
+                             &one,
+                             evc.get_pointer(), // dmin * nstart, leading dimension dmax
+                             dmax,
+                             vcc, // nstart * n_band is read from the nstart * nstart allocation
+                             nstart,
+                             &zero,
+                             temp,
+                             dmin);
+        matrixSetToAnother<T, Device>()(ctx, n_band, temp, dmin, evc.get_pointer(), dmax);
+        delmem_complex_op()(ctx, temp);
+    }
+
+    delmem_complex_op()(ctx, vcc);
+
+    ModuleBase::timer::tick("DiagoIterAssist", "diag_subspace_psi");
+}
+
 template <typename T, typename Device>
 bool DiagoIterAssist<T, Device>::test_exit_cond(const int& ntry, const int& notconv)
 {
diff --git a/source/module_hsolver/diago_iter_assist.h b/source/module_hsolver/diago_iter_assist.h
index 560b37d682..c5c4e9cfaa 100644
--- a/source/module_hsolver/diago_iter_assist.h
+++ b/source/module_hsolver/diago_iter_assist.h
@@ -62,6 +62,39 @@ class DiagoIterAssist
                              Real* e,
                              T* vcc);
 
+    /// @brief calculate the Hamiltonian and overlap matrices in the subspace spanned by the psi.get_nbands() states of psi
+    /// @param pHamilt : hamiltonian operator carrier
+    /// @param psi : [in] wavefunction
+    /// @param hcc : [out] subspace Hamiltonian matrix, nbands * nbands, zeroed before being filled
+    /// @param scc : [out] subspace overlap matrix, nbands * nbands, zeroed before being filled
+    static void cal_hs_subspace(const hamilt::Hamilt<T, Device>* pHamilt, // hamiltonian operator carrier
+                                                const psi::Psi<T, Device>& psi,     // [in] wavefunction
+                                                T *hcc, 
+                                                T *scc);
+
+    /// @brief calculate the response matrix mat_out = mat_in * vcc, with vcc the rotation matrix from diagonalizing H and S
+    /// @param hcc : [in] Hamiltonian matrix
+    /// @param scc : [in] overlap matrix
+    /// @param nbands : [in] number of bands, i.e. the dimension of hcc and scc
+    /// @param mat_in : [in] input matrix to be rotated
+    /// @param mat_out : [out] rotated output matrix
+    /// @param mat_col : [in] gemm dimension m and leading dimension of mat_in and mat_out
+    /// @param en : [out] eigenvalues
+    static void diag_responce(const T* hcc,
+                              const T* scc,
+                              const int nbands,
+                              const T* mat_in, 
+                              T* mat_out, 
+                              int mat_col, 
+                              Real* en);
+    
+    /// @brief rotate the wavefunction evc in place by the rotation matrix obtained from diagonalizing the H and S matrices
+    static void diag_subspace_psi(const T* hcc,           // [in] subspace Hamiltonian matrix
+                              const T* scc,               // [in] subspace overlap matrix
+                              const int dim_subspace,     // [in] dimension of hcc and scc
+                              psi::Psi<T, Device>& evc,   // [in,out] wavefunction, overwritten with the rotated one
+                              Real* en);                  // [out] eigenvalues
+
     static bool test_exit_cond(const int& ntry, const int& notconv);
 
   private:
diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp
index ebcf40a6c6..7ce7f0d764 100644
--- a/source/module_io/input_conv.cpp
+++ b/source/module_io/input_conv.cpp
@@ -18,11 +18,11 @@
 #include "module_ri/exx_abfs-jle.h"
 #endif
 
+#include "module_hamilt_lcao/module_dftu/dftu.h"
 #ifdef __LCAO
 #include "module_basis/module_ao/ORB_read.h"
 #include "module_elecstate/potentials/H_TDDFT_pw.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h"
-#include "module_hamilt_lcao/module_dftu/dftu.h"
 #include "module_hamilt_lcao/module_tddft/evolve_elec.h"
 #include "module_hamilt_lcao/module_tddft/td_velocity.h"
 #endif
@@ -243,7 +243,6 @@ void Input_Conv::Convert()
     // iteration (1/3)
     //----------------------------------------------------------
 
-#ifdef __LCAO
     if (PARAM.inp.dft_plus_u)
     {
         GlobalC::dftu.Yukawa = PARAM.inp.yukawa_potential;
@@ -258,7 +257,6 @@ void Input_Conv::Convert()
             ModuleBase::GlobalFunc::ZEROS(GlobalC::dftu.U.data(), PARAM.inp.ntype);
         }
     }
-#endif
 
     //----------------------------------------------------------
     // Yu Liu add 2022-05-18
diff --git a/source/module_io/output_mulliken.h b/source/module_io/output_mulliken.h
index 560dedfeaa..2d78d2fa52 100644
--- a/source/module_io/output_mulliken.h
+++ b/source/module_io/output_mulliken.h
@@ -7,6 +7,8 @@
 #include "module_elecstate/elecstate_lcao.h"
 #include "module_io/output_dmk.h"
 #include "module_io/output_sk.h"
+#include "module_base/formatter.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dspin_lcao.h"
 
 #include <map>
 #include <vector>
@@ -88,29 +90,88 @@ void cal_mag(Parallel_Orbitals* pv,
              hamilt::Hamilt<TK>* p_ham,
              K_Vectors& kv,
              elecstate::ElecState* pelec,
+             const TwoCenterBundle& two_center_bundle,
+             const LCAO_Orbitals& orb,
              UnitCell& ucell,
              const int istep,
              const bool print)
 {
-    auto cell_index
-        = CellIndex(ucell.get_atomLabels(), ucell.get_atomCounts(), ucell.get_lnchiCounts(), PARAM.inp.nspin);
-    auto out_sk = ModuleIO::Output_Sk<TK>(p_ham, pv, PARAM.inp.nspin, kv.get_nks());
-    auto out_dmk = ModuleIO::Output_DMK<TK>(dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(pelec)->get_DM(),
-                                            pv,
-                                            PARAM.inp.nspin,
-                                            kv.get_nks());
-    auto mulp = ModuleIO::Output_Mulliken<TK>(&(out_sk), &(out_dmk), pv, &cell_index, kv.isk, PARAM.inp.nspin);
-    auto atom_chg = mulp.get_atom_chg();
-    /// used in updating mag info in STRU file
-    ucell.atom_mulliken = mulp.get_atom_mulliken(atom_chg);
-    if (print && GlobalV::MY_RANK == 0)
+    // 1) calculate and output Mulliken population charges and magnetic moments
+    if (PARAM.inp.out_mul)
     {
-        /// write the Orbital file
-        cell_index.write_orb_info(PARAM.globalv.global_out_dir);
-        /// write mulliken.txt
-        mulp.write(istep, PARAM.globalv.global_out_dir);
-        /// write atomic mag info in running log file
-        mulp.print_atom_mag(atom_chg, GlobalV::ofs_running);
+        auto cell_index
+            = CellIndex(ucell.get_atomLabels(), ucell.get_atomCounts(), ucell.get_lnchiCounts(), PARAM.inp.nspin);
+        auto out_sk = ModuleIO::Output_Sk<TK>(p_ham, pv, PARAM.inp.nspin, kv.get_nks());
+        auto out_dmk = ModuleIO::Output_DMK<TK>(dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(pelec)->get_DM(),
+                                                pv,
+                                                PARAM.inp.nspin,
+                                                kv.get_nks());
+        auto mulp = ModuleIO::Output_Mulliken<TK>(&(out_sk), &(out_dmk), pv, &cell_index, kv.isk, PARAM.inp.nspin);
+        auto atom_chg = mulp.get_atom_chg();
+        /// used in updating mag info in STRU file
+        ucell.atom_mulliken = mulp.get_atom_mulliken(atom_chg);
+        if (print && GlobalV::MY_RANK == 0)
+        {
+            /// write the Orbital file
+            cell_index.write_orb_info(PARAM.globalv.global_out_dir);
+            /// write mulliken.txt
+            mulp.write(istep, PARAM.globalv.global_out_dir);
+            /// write atomic mag info in running log file
+            mulp.print_atom_mag(atom_chg, GlobalV::ofs_running);
+        }
+    }
+    // 2) calculate the magnetic moment of each atom with the on-site projection method.
+    //    NOTE: this overwrites ucell.atom_mulliken, including any value stored by step 1.
+    if (PARAM.inp.onsite_radius > 0)
+    {
+        // atom_mag[iat] has nspin entries: entry 0 stays 0.0, the others receive the moment components
+        std::vector<std::vector<double>> atom_mag(ucell.nat, std::vector<double>(PARAM.inp.nspin, 0.0));
+        // constraint mask handed to cal_moment: (1, 1, 1) for every atom
+        std::vector<ModuleBase::Vector3<int>> constrain(ucell.nat, ModuleBase::Vector3<int>(1, 1, 1));
+        // cast once and reuse; NOTE: the cast result is dereferenced unchecked, as before
+        auto* dm = dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(pelec)->get_DM();
+        const hamilt::HContainer<double>* dmr = dm->get_DMR_pointer(1);
+        if (PARAM.inp.nspin == 2)
+        {
+            // automatic storage: the operator is destroyed on every exit path, including exceptions
+            hamilt::DeltaSpin<hamilt::OperatorLCAO<TK, double>> sc_lambda(
+                nullptr,
+                kv.kvec_d,
+                nullptr,
+                ucell,
+                &GlobalC::GridD,
+                two_center_bundle.overlap_orb_onsite.get(),
+                orb.cutoffs());
+            // presumably mode 2 exposes the spin-difference density matrix - TODO confirm; mode 0 is restored below
+            dm->switch_dmr(2);
+            const std::vector<double> moments = sc_lambda.cal_moment(dmr, constrain);
+            dm->switch_dmr(0);
+            // collinear case: one moment per atom
+            for (int iat = 0; iat < ucell.nat; iat++)
+            {
+                atom_mag[iat][1] = moments[iat];
+            }
+        }
+        else if (PARAM.inp.nspin == 4)
+        {
+            hamilt::DeltaSpin<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>> sc_lambda(
+                nullptr,
+                kv.kvec_d,
+                nullptr,
+                ucell,
+                &GlobalC::GridD,
+                two_center_bundle.overlap_orb_onsite.get(),
+                orb.cutoffs());
+            const std::vector<double> moments = sc_lambda.cal_moment(dmr, constrain);
+            // non-collinear case: moments holds three consecutive components per atom
+            for (int iat = 0; iat < ucell.nat; iat++)
+            {
+                atom_mag[iat][1] = moments[iat * 3];
+                atom_mag[iat][2] = moments[iat * 3 + 1];
+                atom_mag[iat][3] = moments[iat * 3 + 2];
+            }
+        }
+        ucell.atom_mulliken = atom_mag;
     }
 }
 
diff --git a/source/module_io/read_input_item_elec_stru.cpp b/source/module_io/read_input_item_elec_stru.cpp
index 910b59ac18..44bc76d691 100644
--- a/source/module_io/read_input_item_elec_stru.cpp
+++ b/source/module_io/read_input_item_elec_stru.cpp
@@ -577,6 +577,12 @@ void ReadInput::item_elec_stru()
         };
         this->add_item(item);
     }
+    {
+        Input_Item item("sc_os_ndim");
+        item.annotation = "number of old iterations used for oscillation detection, for Spin-Constrained DFT";
+        read_sync_int(input.sc_os_ndim);
+        this->add_item(item);
+    }
     {
         Input_Item item("scf_thr_type");
         item.annotation = "type of the criterion of scf_thr, 1: reci drho for "
diff --git a/source/module_io/read_input_item_exx_dftu.cpp b/source/module_io/read_input_item_exx_dftu.cpp
index 3cfbae13e3..dc7c6a6025 100644
--- a/source/module_io/read_input_item_exx_dftu.cpp
+++ b/source/module_io/read_input_item_exx_dftu.cpp
@@ -339,16 +339,10 @@ void ReadInput::item_dftu()
             const Input_para& input = para.input;
             if (input.dft_plus_u != 0)
             {
-                if (input.basis_type != "lcao")
+                // with basis_type "pw", DFT+U is rejected unless nspin == 4
+                if (input.basis_type == "pw" && input.nspin != 4)
                 {
-                    ModuleBase::WARNING_QUIT("ReadInput", "WRONG ARGUMENTS OF basis_type, only lcao is support");
-                }
-                if (input.ks_solver != "genelpa" && input.ks_solver != "scalapack_gvx" && input.ks_solver != "default")
-                {
-                    std::cout << " You'are using " << input.ks_solver << std::endl;
-                    ModuleBase::WARNING_QUIT("ReadInput",
-                                             "WRONG ARGUMENTS OF ks_solver in DFT+U routine, only "
-                                             "genelpa and scalapack_gvx are supported ");
+                    ModuleBase::WARNING_QUIT("ReadInput", "WRONG ARGUMENTS, DFT+U with PW base only supports nspin = 4 now");
                 }
             }
         };
diff --git a/source/module_io/test/read_input_ptest.cpp b/source/module_io/test/read_input_ptest.cpp
index 1fc3f0568d..33608f6569 100644
--- a/source/module_io/test/read_input_ptest.cpp
+++ b/source/module_io/test/read_input_ptest.cpp
@@ -167,6 +167,7 @@ TEST_F(InputParaTest, ParaRead)
     EXPECT_EQ(param.inp.scf_os_stop, 1);
     EXPECT_NEAR(param.inp.scf_os_thr, -0.02, 1.0e-15);
     EXPECT_EQ(param.inp.scf_os_ndim, 10);
+    EXPECT_EQ(param.inp.sc_os_ndim, 5);
     EXPECT_NEAR(param.inp.scf_ene_thr, 1.0e-6, 1.0e-15);
     EXPECT_EQ(param.inp.scf_nmax, 50);
     EXPECT_EQ(param.inp.relax_nmax, 1);
diff --git a/source/module_io/test_serial/read_input_item_test.cpp b/source/module_io/test_serial/read_input_item_test.cpp
index 91325b9f00..b83e2df05a 100644
--- a/source/module_io/test_serial/read_input_item_test.cpp
+++ b/source/module_io/test_serial/read_input_item_test.cpp
@@ -1428,21 +1428,6 @@ TEST_F(InputTest, Item_test2)
         param.input.orbital_corr = {-1, -1};
         it->second.reset_value(it->second, param);
         EXPECT_EQ(param.input.dft_plus_u, 0);
-
-        param.input.dft_plus_u = 1;
-        param.input.basis_type = "pw";
-        param.input.ks_solver = "genelpa";
-        testing::internal::CaptureStdout();
-        EXPECT_EXIT(it->second.check_value(it->second, param), ::testing::ExitedWithCode(1), "");
-        output = testing::internal::GetCapturedStdout();
-        EXPECT_THAT(output, testing::HasSubstr("NOTICE"));
-
-        param.input.basis_type = "lcao";
-        param.input.ks_solver = "test";
-        testing::internal::CaptureStdout();
-        EXPECT_EXIT(it->second.check_value(it->second, param), ::testing::ExitedWithCode(1), "");
-        output = testing::internal::GetCapturedStdout();
-        EXPECT_THAT(output, testing::HasSubstr("NOTICE"));
     }
     { // uramping
         auto it = find_label("uramping", readinput.input_lists);
diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h
index d421e86f2b..fe86fbfefc 100644
--- a/source/module_parameter/input_parameter.h
+++ b/source/module_parameter/input_parameter.h
@@ -118,6 +118,7 @@ struct Input_para
     bool scf_os_stop = false;  ///< whether to stop scf when oscillation is detected
     double scf_os_thr = -0.01;  ///< drho threshold for oscillation
     int scf_os_ndim = 0;       ///< number of old iterations used for oscillation detection
+    int sc_os_ndim = 5;       ///< number of old iterations used for oscillation detection in Spin-Constrained DFT
 
     bool lspinorb = false;   ///< consider the spin-orbit interaction
     bool noncolin = false;   ///< using non-collinear-spin
diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/INPUT b/tests/integrate/160_PW_DJ_PK_PU_SO/INPUT
new file mode 100644
index 0000000000..8e4f45ab0e
--- /dev/null
+++ b/tests/integrate/160_PW_DJ_PK_PU_SO/INPUT
@@ -0,0 +1,46 @@
+INPUT_PARAMETERS
+suffix    autotest
+nbands    40
+
+calculation    scf
+ecutwfc    10
+scf_thr    1.0e-4
+scf_nmax    50
+out_chg    0
+
+#init_chg    file
+#out_dos    1
+#dos_sigma    0.05
+#out_band    1
+
+smearing_method    gaussian
+smearing_sigma    0.01
+
+#force_thr_ev    0.01
+#relax_method    cg
+#relax_bfgs_init    0.5
+
+mixing_type    pulay
+mixing_beta    0.3
+mixing_restart 1e-3
+mixing_dmr     1
+mixing_gg0    1.1
+
+ks_solver    dav_subspace
+pw_diag_ndim  2
+basis_type    pw
+gamma_only    0
+noncolin    1
+lspinorb    1
+cal_force   1
+cal_stress  1
+
+#Parameter DFT+U
+dft_plus_u    1
+orbital_corr    2 
+hubbard_u    5.0 
+onsite_radius   3.0
+pseudo_dir	../../PP_ORB
+orbital_dir	../../PP_ORB
+
+pw_seed 1
diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/KPT b/tests/integrate/160_PW_DJ_PK_PU_SO/KPT
new file mode 100644
index 0000000000..e769af7638
--- /dev/null
+++ b/tests/integrate/160_PW_DJ_PK_PU_SO/KPT
@@ -0,0 +1,4 @@
+K_POINTS
+0
+Gamma
+2 1 1 0 0 0
diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/STRU b/tests/integrate/160_PW_DJ_PK_PU_SO/STRU
new file mode 100644
index 0000000000..91021e0a69
--- /dev/null
+++ b/tests/integrate/160_PW_DJ_PK_PU_SO/STRU
@@ -0,0 +1,22 @@
+ATOMIC_SPECIES
+Fe 1.000 Fe.upf
+
+NUMERICAL_ORBITAL
+Fe_gga_6au_100Ry_4s2p2d1f.orb
+
+LATTICE_CONSTANT
+8.190
+
+LATTICE_VECTORS
+ 1.00    0.50     0.50
+ 0.50    1.00     0.50
+ 0.50    0.50     1.00
+ATOMIC_POSITIONS
+Direct
+
+Fe
+0.0
+2
+0.00            0.00            0.00         mag  1.0 1.0 1.0
+0.51            0.51            0.51         mag  1.0 1.0 1.0
+
diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/jd b/tests/integrate/160_PW_DJ_PK_PU_SO/jd
new file mode 100644
index 0000000000..a93b3b217e
--- /dev/null
+++ b/tests/integrate/160_PW_DJ_PK_PU_SO/jd
@@ -0,0 +1 @@
+DFTU + NSPIN4, Fe2, multi-k case
diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/result.ref b/tests/integrate/160_PW_DJ_PK_PU_SO/result.ref
new file mode 100644
index 0000000000..e6b1657fb7
--- /dev/null
+++ b/tests/integrate/160_PW_DJ_PK_PU_SO/result.ref
@@ -0,0 +1,5 @@
+etotref -5662.3908859903258417
+etotperatomref -2831.1954429952
+totalforceref 17.965510
+totalstressref 100582.607209
+totaltimeref 1.26
diff --git a/tests/integrate/CASES_CPU.txt b/tests/integrate/CASES_CPU.txt
index 05d4ae689a..8277d51eab 100644
--- a/tests/integrate/CASES_CPU.txt
+++ b/tests/integrate/CASES_CPU.txt
@@ -105,6 +105,7 @@
 140_PW_15_SO_average
 140_PW_15_SO_wfcinit
 150_PW_15_CR_VDW3
+160_PW_DJ_PK_PU_SO
 170_PW_MD_1O
 170_PW_MD_2O
 180_PW_SDFT_10S_M
diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt
index 00de10ccb6..3490ccc9a0 100644
--- a/tests/integrate/CASES_GPU.txt
+++ b/tests/integrate/CASES_GPU.txt
@@ -22,4 +22,4 @@
 934_NO_Si2_tzdp_neq_GPU
 934_NO_Si2_tzdp_neq_ns2_GPU
 934_NO_Si2_tzdp_ns2_GPU
-935_NO_Si2_tzdp_ns2_k_GPU
\ No newline at end of file
+935_NO_Si2_tzdp_ns2_k_GPU