From b42932bc86f201bcfa93662fdef38a426ae8fc4e Mon Sep 17 00:00:00 2001
From: jinzx10 <jzx016@hotmail.com>
Date: Wed, 19 Jun 2024 17:40:14 +0800
Subject: [PATCH 1/8] unify two-center integration interface

---
 .../module_nao/two_center_bundle.cpp          | 73 ++++++++++++++++++-
 .../module_nao/two_center_bundle.h            | 10 +++
 .../module_nao/two_center_table.cpp           |  2 +
 source/module_esolver/esolver_ks_lcao.cpp     | 10 ++-
 .../hamilt_lcaodft/LCAO_nl_beta.cpp           | 18 -----
 .../hamilt_lcaodft/LCAO_nl_mu.cpp             | 17 -----
 .../hamilt_lcaodft/LCAO_set_st.cpp            | 29 --------
 .../hamilt_lcaodft/fvnl_dbeta_gamma.cpp       | 14 ----
 .../hamilt_lcaodft/fvnl_dbeta_k.cpp           | 18 +----
 .../operator_lcao/deepks_lcao.cpp             | 14 +---
 .../operator_lcao/dftu_lcao.cpp               | 35 ---------
 .../operator_lcao/ekinetic_new.cpp            | 39 ++--------
 .../operator_lcao/nonlocal_new.cpp            | 21 +-----
 .../operator_lcao/overlap_new.cpp             | 38 ++--------
 .../operator_lcao/td_ekinetic_lcao.cpp        |  8 +-
 .../module_deepks/LCAO_deepks_psialpha.cpp    | 17 -----
 16 files changed, 105 insertions(+), 258 deletions(-)
diff --git a/source/module_basis/module_nao/two_center_bundle.cpp b/source/module_basis/module_nao/two_center_bundle.cpp
index 5ffc9d6d17..e163a34f5b 100644
--- a/source/module_basis/module_nao/two_center_bundle.cpp
+++ b/source/module_basis/module_nao/two_center_bundle.cpp
@@ -106,8 +106,77 @@ void TwoCenterBundle::tabulate()
 
     ModuleBase::Memory::record("RealGauntTable", RealGauntTable::instance().memory());
 
-    // init Ylm (this shall be done by Ylm automatically! to be done later...)
-    ModuleBase::Ylm::set_coefficients();
+    sbt.clear();
+}
+
+void TwoCenterBundle::tabulate(
+    const double lcao_ecut,
+    const double lcao_dk,
+    const double lcao_dr,
+    const double lcao_rmax
+)
+{
+    ModuleBase::SphericalBesselTransformer sbt(true);
+    orb_->set_transformer(sbt);
+    beta_->set_transformer(sbt);
+    if (alpha_) alpha_->set_transformer(sbt);
+    if (orb_onsite_) orb_onsite_->set_transformer(sbt);
+
+    //================================================================
+    //              build two-center integration tables
+    //================================================================
+
+    // old formula for the number of k-space grid points
+    int nk =  static_cast<int>( sqrt(lcao_ecut) / lcao_dk )  + 4;
+    nk += 1 - nk % 2; // make nk odd
+
+    std::vector<double> kgrid(nk);
+    for (int ik = 0; ik < nk; ++ik) { kgrid[ik] = ik * lcao_dk; }
+
+    orb_->set_grid(false, nk, kgrid.data(), 't');
+    beta_->set_grid(false, nk, kgrid.data(), 't');
+    if (alpha_) { alpha_->set_grid(false, nk, kgrid.data(), 't'); }
+    if (orb_onsite_) { orb_onsite_->set_grid(false, nk, kgrid.data(), 't'); }
+
+    // "st" stands for overlap (s) and kinetic (t)
+    const double cutoff_st = std::min(lcao_rmax, 2.0 * orb_->rcut_max());
+    const int nr_st = static_cast<int>(cutoff_st/ lcao_dr) + 5;
+
+    kinetic_orb = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
+    kinetic_orb->tabulate(*orb_, *orb_, 'T', nr_st, cutoff_st);
+    ModuleBase::Memory::record("TwoCenterTable: Kinetic", kinetic_orb->table_memory());
+
+    overlap_orb = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
+    overlap_orb->tabulate(*orb_, *orb_, 'S', nr_st, cutoff_st);
+    ModuleBase::Memory::record("TwoCenterTable: Overlap", overlap_orb->table_memory());
+
+    // overlap between orbital and beta (for nonlocal potential)
+    const double cutoff_nl = std::min(lcao_rmax, orb_->rcut_max() + beta_->rcut_max());
+    const int nr_nl = static_cast<int>(cutoff_nl / lcao_dr) + 5;
+    overlap_orb_beta = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
+    overlap_orb_beta->tabulate(*orb_, *beta_, 'S', nr_nl, cutoff_nl);
+    ModuleBase::Memory::record("TwoCenterTable: Nonlocal", overlap_orb_beta->table_memory());
+
+    // overlap between orbital and deepks projector
+    if (alpha_)
+    {
+        const double cutoff_alpha = std::min(lcao_rmax, orb_->rcut_max() + alpha_->rcut_max());
+        const int nr_alpha = static_cast<int>(cutoff_alpha / lcao_dr) + 5;
+        overlap_orb_alpha = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
+        overlap_orb_alpha->tabulate(*orb_, *alpha_, 'S', nr_alpha, cutoff_alpha);
+        ModuleBase::Memory::record("TwoCenterTable: Descriptor", overlap_orb_beta->table_memory());
+    }
+
+    // overlap between orbital and "onsite orbital" (for DFT+U)
+    if (orb_onsite_)
+    {
+        const double cutoff_onsite = std::min(lcao_rmax, orb_->rcut_max() + orb_onsite_->rcut_max());
+        const int nr_onsite = static_cast<int>(cutoff_onsite / lcao_dr) + 5;
+        overlap_orb_onsite = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
+        overlap_orb_onsite->tabulate(*orb_, *orb_onsite_, 'S', nr_onsite, cutoff_onsite);
+    }
+
+    ModuleBase::Memory::record("RealGauntTable", RealGauntTable::instance().memory());
 
     sbt.clear();
 }
diff --git a/source/module_basis/module_nao/two_center_bundle.h b/source/module_basis/module_nao/two_center_bundle.h
index 5c6c7c43a7..f4a74a43be 100644
--- a/source/module_basis/module_nao/two_center_bundle.h
+++ b/source/module_basis/module_nao/two_center_bundle.h
@@ -21,6 +21,16 @@ class TwoCenterBundle
 
     void tabulate();
 
+    // Unlike the tabulate() above, this overload function computes
+    // two-center integration table by direct integration with Simpson's
+    // rule, which was the algorithm used prior to v3.3.4.
+    void tabulate(
+        const double lcao_ecut,
+        const double lcao_dk,
+        const double lcao_dr,
+        const double lcao_rmax
+    );
+
     /**
      * @brief Overwrites the content of a LCAO_Orbitals object (e.g. GlobalC::ORB)
      * with the current object.
diff --git a/source/module_basis/module_nao/two_center_table.cpp b/source/module_basis/module_nao/two_center_table.cpp
index 969e118699..0c9b5bf5f9 100644
--- a/source/module_basis/module_nao/two_center_table.cpp
+++ b/source/module_basis/module_nao/two_center_table.cpp
@@ -100,7 +100,9 @@ int& TwoCenterTable::table_index(const NumericalRadial* it1, const NumericalRadi
 void TwoCenterTable::cleanup()
 {
     op_ = '\0';
+    ntab_ = 0;
     nr_ = 0;
+    rmax_ = 0.0;
     delete[] rgrid_;
     rgrid_ = nullptr;
 
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 914781f2ce..0f57a10b2e 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -84,9 +84,12 @@ ESolver_KS_LCAO<TK, TR>::ESolver_KS_LCAO()
 template <typename TK, typename TR>
 ESolver_KS_LCAO<TK, TR>::~ESolver_KS_LCAO()
 {
+<<<<<<< Updated upstream
 #ifndef USE_NEW_TWO_CENTER
     this->orb_con.clear_after_ions(*uot_, GlobalC::ORB, GlobalV::deepks_setorb, GlobalC::ucell.infoNL.nproj);
 #endif
+=======
+>>>>>>> Stashed changes
     delete uot_;
 }
 
@@ -556,11 +559,10 @@ void ESolver_KS_LCAO<TK, TR>::init_basis_lcao(ORB_control& orb_con, Input& inp,
     Lmax = GlobalC::exx_info.info_ri.abfs_Lmax;
 #endif
 
-#ifndef USE_NEW_TWO_CENTER
-    this->orb_con.set_orb_tables(GlobalV::ofs_running, *uot_, GlobalC::ORB, ucell.lat0, GlobalV::deepks_setorb, Lmax,
-                                 ucell.infoNL.nprojmax, ucell.infoNL.nproj, ucell.infoNL.Beta);
-#else
+#ifdef USE_NEW_TWO_CENTER
     two_center_bundle->tabulate();
+#else
+    two_center_bundle->tabulate(inp.lcao_ecut, inp.lcao_dk, inp.lcao_dr, inp.lcao_rmax);
 #endif
 
     if (this->orb_con.setup_2d)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
index 6f94f92572..a93f1c95e3 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
@@ -99,11 +99,6 @@ void build_Nonlocal_beta_new(
                     //and the first dimension is then 3
                     //inner loop : all projectors (L0,M0)
 
-
-#ifdef USE_NEW_TWO_CENTER
-                    //=================================================================
-                    //          new two-center integral (temporary)
-                    //=================================================================
                     int L1 = atom1->iw2l[ iw1_0 ];
                     int N1 = atom1->iw2n[ iw1_0 ];
                     int m1 = atom1->iw2m[ iw1_0 ];
@@ -114,19 +109,6 @@ void build_Nonlocal_beta_new(
                     ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
                     uot.two_center_bundle->overlap_orb_beta->snap(
                             T1, L1, N1, M1, T0, dtau * ucell.lat0, false, nlm);
-#else
-                    uot.snap_psibeta_half(
-                        orb,
-                        ucell.infoNL,
-                        nlm, tau1, T1,
-                        atom1->iw2l[ iw1_0 ], // L1
-                        atom1->iw2m[ iw1_0 ], // m1
-                        atom1->iw2n[ iw1_0 ], // N1
-                        ucell.atoms[T0].tau[I0], T0, 0); //R0,T0
-#endif
-                    //=================================================================
-                    //          end of new two-center integral (temporary)
-                    //=================================================================
 
                     #ifdef _OPENMP
                         nlm_tot_thread[ad_count].insert({iw1_all,nlm[0]});
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp
index 8b6f103eb8..7d44122bfd 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp
@@ -119,10 +119,6 @@ void build_Nonlocal_mu_new(
 				//and size of outer vector is then 4
 				//inner loop : all projectors (L0,M0)
 
-#ifdef USE_NEW_TWO_CENTER
-                //=================================================================
-                //          new two-center integral (temporary)
-                //=================================================================
                 int L1 = atom1->iw2l[ iw1_0 ];
                 int N1 = atom1->iw2n[ iw1_0 ];
                 int m1 = atom1->iw2m[ iw1_0 ];
@@ -133,19 +129,6 @@ void build_Nonlocal_mu_new(
                 ModuleBase::Vector3<double> dtau = tau - tau1;
                 uot.two_center_bundle->overlap_orb_beta->snap(
                         T1, L1, N1, M1, it, dtau * ucell.lat0, calc_deri, nlm);
-#else
-				uot.snap_psibeta_half(
-					orb,
-					ucell.infoNL,
-					nlm, tau1, T1,
-					atom1->iw2l[ iw1_0 ], // L1
-					atom1->iw2m[ iw1_0 ], // m1
-					atom1->iw2n[ iw1_0 ], // N1
-					tau, it, calc_deri); //R0,T0
-#endif
-                //=================================================================
-                //          end of new two-center integral (temporary)
-                //=================================================================
 
 				if(!calc_deri)
 				{
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp
index 9cc944125e..60897d844d 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp
@@ -40,10 +40,6 @@ void single_derivative(
 
     const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
 
-#ifdef USE_NEW_TWO_CENTER
-	//=================================================================
-	//          new two-center integral (temporary)
-	//=================================================================
 	// convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
 	const int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
 	const int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
@@ -80,16 +76,6 @@ void single_derivative(
 		default:  // not supposed to happen
 			ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","dtype must be S or T");
 	}
-#else
-	uot.snap_psipsi( orb, olm, 1, dtype,
-			tau1, T1, L1, m1, N1,
-			tau2, T2, L2, m2, N2
-			);
-#endif
-
-	//=================================================================
-	//          end of new two-center integral (temporary)
-	//=================================================================
 
 	// condition 7: gamma only or multiple k
 	if(gamma_only_local)
@@ -277,10 +263,6 @@ void single_overlap(
 {
     const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
 
-#ifdef USE_NEW_TWO_CENTER
-	//=================================================================
-	//          new two-center integral (temporary)
-	//=================================================================
 	// convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
 	const int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
 	const int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
@@ -298,17 +280,6 @@ void single_overlap(
 		default:  // not supposed to happen
 			ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","dtype must be S or T");
 	}
-#else
-	uot.snap_psipsi( orb, olm, 0, dtype,
-			tau1, T1, L1, m1, N1,                  // info of atom1
-			adjs.adjacent_tau[ad], T2, L2, m2, N2, // info of atom2
-			cal_syns,
-			dmax);
-#endif
-
-	//=================================================================
-	//          end of new two-center integral (temporary)
-	//=================================================================
 
 	// When NSPIN == 4 , only diagonal term is calculated for T or S Operators
 	// use olm1 to store the diagonal term with complex data type.
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp
index 646074d2c2..ea68de9b72 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp
@@ -70,10 +70,6 @@ void Force_LCAO<double>::cal_fvnl_dbeta(
                 
                 std::vector<std::vector<double>> nlm;
 
-#ifdef USE_NEW_TWO_CENTER
-                //=================================================================
-                //          new two-center integral (temporary)
-                //=================================================================
                 int L1 = atom1->iw2l[ iw1 ];
                 int N1 = atom1->iw2n[ iw1 ];
                 int m1 = atom1->iw2m[ iw1 ];
@@ -85,16 +81,6 @@ void Force_LCAO<double>::cal_fvnl_dbeta(
 
                 uot.two_center_bundle->overlap_orb_beta->snap(
                     T1, L1, N1, M1, T0, dtau * ucell.lat0, true, nlm);
-#else
-                uot.snap_psibeta_half(
-                    orb,
-                    ucell.infoNL,
-                    nlm, tau1, T1,
-                    atom1->iw2l[ iw1 ], // L1
-                    atom1->iw2m[ iw1 ], // m1
-                    atom1->iw2n[ iw1 ], // N1
-                    ucell.atoms[T0].tau[I0], T0, 1); //R0,T0
-#endif
 
                 assert(nlm.size()==4);
                 nlm_tot[ad1].insert({iw1,nlm});
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp
index a7f5f1e37c..8a9501959e 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp
@@ -104,10 +104,6 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
 				}
                 const int iw1_0 = iw1 / npol;
                 std::vector<std::vector<double>> nlm;
-#ifdef USE_NEW_TWO_CENTER
-                //=================================================================
-                //          new two-center integral (temporary)
-                //=================================================================
                 int L1 = atom1->iw2l[ iw1_0 ];
                 int N1 = atom1->iw2n[ iw1_0 ];
                 int m1 = atom1->iw2m[ iw1_0 ];
@@ -118,19 +114,7 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                 ModuleBase::Vector3<double> dtau = tau - tau1;
                 uot.two_center_bundle->overlap_orb_beta->snap(
                         T1, L1, N1, M1, it, dtau * ucell.lat0, true, nlm);
-#else
-                uot.snap_psibeta_half(orb,
-                                    ucell.infoNL,
-                                    nlm,
-                                    tau1,
-                                    T1,
-                                    atom1->iw2l[iw1_0], // L1
-                                    atom1->iw2m[iw1_0], // m1
-                                    atom1->iw2n[iw1_0], // N1
-                                    tau,
-                                    it,
-                                    1); // R0,T0
-#endif
+
                 nlm_cur.insert({iw1_all, nlm});
             } // end iw
             const int iat1 = ucell.itia2iat(T1, I1);
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index 77ccb0e9f7..d5c0702516 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -281,10 +281,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(const int i
             // If we are calculating force, we need also to store the gradient
             // and size of outer vector is then 4
             // inner loop : all projectors (L0,M0)
-#ifdef USE_NEW_TWO_CENTER
-            //=================================================================
-            //          new two-center integral (temporary)
-            //=================================================================
+
             int L1 = atom1->iw2l[ iw1 ];
             int N1 = atom1->iw2n[ iw1 ];
             int m1 = atom1->iw2m[ iw1 ];
@@ -295,15 +292,6 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(const int i
             ModuleBase::Vector3<double> dtau = tau0 - tau1;
             uot_->two_center_bundle->overlap_orb_alpha->snap(
                     T1, L1, N1, M1, 0, dtau * ucell->lat0, 0 /*calc_deri*/, nlm);
-#else
-            uot_->snap_psialpha_half(
-                    orb,
-                    nlm, 0, tau1, T1,
-                    atom1->iw2l[ iw1 ], // L1
-                    atom1->iw2m[ iw1 ], // m1
-                    atom1->iw2n[ iw1 ], // N1
-                    tau0, T0, I0);
-#endif
             nlm_in[ad].insert({all_indexes[iw1l], nlm[0]});
             if(npol == 2) nlm_in[ad].insert({all_indexes[iw1l+1], nlm[0]});
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
index 3a95ccd516..da64876d07 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
@@ -133,15 +133,11 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
                 const int L1 = atom1->iw2l[ iw1 ];
                 const int N1 = atom1->iw2n[ iw1 ];
                 const int m1 = atom1->iw2m[ iw1 ];
-#ifdef USE_NEW_TWO_CENTER
                 std::vector<std::vector<double>> nlm;
                 // nlm is a vector of vectors, but size of outer vector is only 1 here
                 // If we are calculating force, we need also to store the gradient
                 // and size of outer vector is then 4
                 // inner loop : all projectors (L0,M0)
-                //=================================================================
-                //          new two-center integral (temporary)
-                //=================================================================
 
                 // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
                 const int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
@@ -162,37 +158,6 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
                         break;
                     }
                 }
-#else
-                ModuleBase::WARNING("DFTU", "autoset onsite_radius to rcut for old two-center integral");
-                double olm[3] = {0, 0, 0};
-                for(int iw =0;iw < this->ucell->atoms[T0].nw; iw++)
-                {
-                    const int L0 = this->ucell->atoms[T0].iw2l[iw];
-                    if(L0 == target_L)
-                    {
-                        for(int m = 0; m < tlp1; m++)
-                        {
-                            uot_->snap_psipsi(orb, // orbitals
-                                olm,
-                                0,
-                                'S', // olm, job of derivation, dtype of Operator
-                                tau1,
-                                T1,
-                                L1,
-                                m1,
-                                N1, // all zeta of atom1
-                                tau0,
-                                T0,
-                                L0,
-                                m,
-                                0 // choose only the first zeta
-                            );
-                            nlm_target[m] = olm[0];
-                        }
-                        break;
-                    }
-                }
-#endif
                 nlm_tot[iat0][ad].insert({all_indexes[iw1l], nlm_target});
             }
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
index 111a3206df..456f81c8ca 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
@@ -145,7 +145,6 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& ia
                                                                    const ModuleBase::Vector3<double>& dtau,
                                                                    TR* data_pointer)
 {
-    const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
     // ---------------------------------------------
     // get info of orbitals of atom1 and atom2 from ucell
     // ---------------------------------------------
@@ -167,15 +166,7 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& ia
     const int* iw2l2 = atom2.iw2l;
     const int* iw2n2 = atom2.iw2n;
     const int* iw2m2 = atom2.iw2m;
-#ifndef USE_NEW_TWO_CENTER
-    // ---------------------------------------------
-    // get tau1 (in cell <0,0,0>) and tau2 (in cell R)
-    // in principle, only dtau is needed in this function
-    // snap_psipsi should be refactored to use dtau directly
-    // ---------------------------------------------
-    const ModuleBase::Vector3<double>& tau1 = this->ucell->get_tau(iat1);
-    const ModuleBase::Vector3<double> tau2 = tau1 + dtau;
-#endif
+
     // ---------------------------------------------
     // calculate the Ekinetic matrix for each pair of orbitals
     // ---------------------------------------------
@@ -189,40 +180,22 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& ia
         const int L1 = iw2l1[iw1];
         const int N1 = iw2n1[iw1];
         const int m1 = iw2m1[iw1];
-#ifdef USE_NEW_TWO_CENTER
+
+        // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
         int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
-#endif
+
         for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
         {
             const int iw2 = col_indexes[iw2l] / npol;
             const int L2 = iw2l2[iw2];
             const int N2 = iw2n2[iw2];
             const int m2 = iw2m2[iw2];
-#ifdef USE_NEW_TWO_CENTER
-            //=================================================================
-            //          new two-center integral (temporary)
-            //=================================================================
+
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
             int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
+
             uot_->two_center_bundle->kinetic_orb->calculate(T1, L1, N1, M1,
                     T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
-#else
-            uot_->snap_psipsi(orb, // orbitals
-                            olm,
-                            0,
-                            'T', // olm, job of derivation, dtype of Operator
-                            tau1,
-                            T1,
-                            L1,
-                            m1,
-                            N1, // info of atom1
-                            tau2,
-                            T2,
-                            L2,
-                            m2,
-                            N2 // info of atom2
-            );
-#endif
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 data_pointer[ipol * step_trace] += olm[0];
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
index 90ebe4d215..9a567bf6df 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
@@ -148,7 +148,6 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
             const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
             const Atom* atom1 = &ucell->atoms[T1];
 
-            const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
             auto all_indexes = paraV->get_indexes_row(iat1);
 #ifdef _OPENMP
             if(atom_row_list.find(iat1) == atom_row_list.end())
@@ -169,10 +168,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
                 // If we are calculating force, we need also to store the gradient
                 // and size of outer vector is then 4
                 // inner loop : all projectors (L0,M0)
-#ifdef USE_NEW_TWO_CENTER
-                //=================================================================
-                //          new two-center integral (temporary)
-                //=================================================================
+
                 int L1 = atom1->iw2l[ iw1 ];
                 int N1 = atom1->iw2n[ iw1 ];
                 int m1 = atom1->iw2m[ iw1 ];
@@ -182,20 +178,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
 
                 ModuleBase::Vector3<double> dtau = tau0 - tau1;
                 uot_->two_center_bundle->overlap_orb_beta->snap(
-                        T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
-#else
-                uot_->snap_psibeta_half(orb,
-                                      this->ucell->infoNL,
-                                      nlm,
-                                      tau1,
-                                      T1,
-                                      atom1->iw2l[iw1], // L1
-                                      atom1->iw2m[iw1], // m1
-                                      atom1->iw2n[iw1], // N1
-                                      tau0,
-                                      T0,
-                                      0 /*cal_deri*/); // R0,T0
-#endif
+                        T1, L1, N1, M1, T0, dtau * ucell->lat0, 0 /*cal_deri*/, nlm);
                 nlm_tot[ad].insert({all_indexes[iw1l], nlm[0]});
             }
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
index 255d809278..dcdfc2de2a 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
@@ -115,7 +115,6 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
                                                                   const ModuleBase::Vector3<double>& dtau,
                                                                   TR* data_pointer)
 {
-    const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
     // ---------------------------------------------
     // get info of orbitals of atom1 and atom2 from ucell
     // ---------------------------------------------
@@ -137,15 +136,7 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
     const int* iw2l2 = atom2.iw2l;
     const int* iw2n2 = atom2.iw2n;
     const int* iw2m2 = atom2.iw2m;
-#ifndef USE_NEW_TWO_CENTER
-    // ---------------------------------------------
-    // get tau1 (in cell <0,0,0>) and tau2 (in cell R)
-    // in principle, only dtau is needed in this function
-    // snap_psipsi should be refactored to use dtau directly
-    // ---------------------------------------------
-    const ModuleBase::Vector3<double>& tau1 = this->ucell->get_tau(iat1);
-    const ModuleBase::Vector3<double> tau2 = tau1 + dtau;
-#endif
+
     // ---------------------------------------------
     // calculate the overlap matrix for each pair of orbitals
     // ---------------------------------------------
@@ -159,40 +150,21 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
         const int L1 = iw2l1[iw1];
         const int N1 = iw2n1[iw1];
         const int m1 = iw2m1[iw1];
-#ifdef USE_NEW_TWO_CENTER
+
+        // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
         int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
-#endif
+
         for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
         {
             const int iw2 = col_indexes[iw2l] / npol;
             const int L2 = iw2l2[iw2];
             const int N2 = iw2n2[iw2];
             const int m2 = iw2m2[iw2];
-#ifdef USE_NEW_TWO_CENTER
-            //=================================================================
-            //          new two-center integral (temporary)
-            //=================================================================
+
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
             int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
             uot_->two_center_bundle->overlap_orb->calculate(T1, L1, N1, M1,
                     T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
-#else
-            uot_->snap_psipsi(orb, // orbitals
-                            olm,
-                            0,
-                            'S', // olm, job of derivation, dtype of Operator
-                            tau1,
-                            T1,
-                            L1,
-                            m1,
-                            N1, // info of atom1
-                            tau2,
-                            T2,
-                            L2,
-                            m2,
-                            N2 // info of atom2
-            );
-#endif
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 data_pointer[ipol * step_trace] += olm[0];
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
index 97c06578cf..681af45ecf 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
@@ -158,21 +158,15 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& iat1,
         const int L1 = iw2l1[iw1];
         const int N1 = iw2n1[iw1];
         const int m1 = iw2m1[iw1];
-#ifdef USE_NEW_TWO_CENTER
-        int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
-#endif
+
         for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
         {
             const int iw2 = col_indexes[iw2l] / npol;
             const int L2 = iw2l2[iw2];
             const int N2 = iw2n2[iw2];
             const int m2 = iw2m2[iw2];
-#ifdef USE_NEW_TWO_CENTER
             // center2_orb11_s are used to calculate <psi|∇|psi> no matter whether to use new two-center or not for now.
             ModuleBase::Vector3<double> grad_overlap = center2_orb11_s.at(T1).at(T2).at(L1).at(N1).at(L2).at(N2).cal_grad_overlap(tau1 * ucell->lat0, tau2 * ucell->lat0, m1, m2);
-#else
-            ModuleBase::Vector3<double> grad_overlap = center2_orb11_s.at(T1).at(T2).at(L1).at(N1).at(L2).at(N2).cal_grad_overlap(tau1 * ucell->lat0, tau2 * ucell->lat0, m1, m2);
-#endif
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 //key change
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
index bb058dc79d..fd8d4ce07e 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
@@ -91,10 +91,6 @@ void LCAO_Deepks::build_psialpha(const bool& calc_deri,
 					//2D, dim 0 contains the overlap <psi|alpha>
                     //dim 1-3 contains the gradient of overlap
 
-#ifdef USE_NEW_TWO_CENTER
-                    //=================================================================
-                    //          new two-center integral (temporary)
-                    //=================================================================
                     int L1 = atom1->iw2l[ iw1 ];
                     int N1 = atom1->iw2n[ iw1 ];
                     int m1 = atom1->iw2m[ iw1 ];
@@ -105,19 +101,6 @@ void LCAO_Deepks::build_psialpha(const bool& calc_deri,
                     ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
                     UOT.two_center_bundle->overlap_orb_alpha->snap(
                             T1, L1, N1, M1, 0, dtau * ucell.lat0, calc_deri, nlm);
-#else
-					//inner loop : all projectors (N,L,M)
-					UOT.snap_psialpha_half(
-                        orb,
-						nlm, job, tau1, T1,
-						atom1->iw2l[ iw1 ], // L1
-						atom1->iw2m[ iw1 ], // m1
-						atom1->iw2n[ iw1 ], // N1
-						ucell.atoms[T0].tau[I0], T0, I0); //R0,T0
-#endif
-                    //=================================================================
-                    //          end of new two-center integral (temporary)
-                    //=================================================================
 
                     if(GlobalV::GAMMA_ONLY_LOCAL)
                     {

From 370870ca5f7c03e555e276967b3736146c180b88 Mon Sep 17 00:00:00 2001
From: jinzx10 <jzx016@hotmail.com>
Date: Wed, 19 Jun 2024 17:49:46 +0800
Subject: [PATCH 2/8] fix version control conflict marker

---
 source/module_esolver/esolver_ks_lcao.cpp | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index 0f57a10b2e..ce958c250b 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -84,12 +84,6 @@ ESolver_KS_LCAO<TK, TR>::ESolver_KS_LCAO()
 template <typename TK, typename TR>
 ESolver_KS_LCAO<TK, TR>::~ESolver_KS_LCAO()
 {
-<<<<<<< Updated upstream
-#ifndef USE_NEW_TWO_CENTER
-    this->orb_con.clear_after_ions(*uot_, GlobalC::ORB, GlobalV::deepks_setorb, GlobalC::ucell.infoNL.nproj);
-#endif
-=======
->>>>>>> Stashed changes
     delete uot_;
 }
 

From 1445f2509b1a495aca4115448cadd9bdbfe51917 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Wed, 19 Jun 2024 11:59:06 +0000
Subject: [PATCH 3/8] [pre-commit.ci lite] apply automatic fixes

---
 .../module_nao/two_center_bundle.cpp          |   88 +-
 .../module_nao/two_center_bundle.h            |   12 +-
 .../module_nao/two_center_table.cpp           |   67 +-
 source/module_esolver/esolver_ks_lcao.cpp     |  102 +-
 .../hamilt_lcaodft/LCAO_nl_beta.cpp           |  241 ++--
 .../hamilt_lcaodft/LCAO_nl_mu.cpp             | 1103 +++++++++--------
 .../hamilt_lcaodft/LCAO_set_st.cpp            | 1070 ++++++++--------
 .../hamilt_lcaodft/fvnl_dbeta_gamma.cpp       |  220 ++--
 .../hamilt_lcaodft/fvnl_dbeta_k.cpp           |  223 ++--
 .../operator_lcao/deepks_lcao.cpp             |  214 ++--
 .../operator_lcao/dftu_lcao.cpp               |  220 ++--
 .../operator_lcao/ekinetic_new.cpp            |   22 +-
 .../operator_lcao/nonlocal_new.cpp            |  163 +--
 .../operator_lcao/overlap_new.cpp             |   22 +-
 .../operator_lcao/td_ekinetic_lcao.cpp        |  253 ++--
 .../module_deepks/LCAO_deepks_psialpha.cpp    |  262 ++--
 16 files changed, 2166 insertions(+), 2116 deletions(-)

diff --git a/source/module_basis/module_nao/two_center_bundle.cpp b/source/module_basis/module_nao/two_center_bundle.cpp
index e163a34f5b..b6eadc930c 100644
--- a/source/module_basis/module_nao/two_center_bundle.cpp
+++ b/source/module_basis/module_nao/two_center_bundle.cpp
@@ -1,18 +1,21 @@
 #include "module_basis/module_nao/two_center_bundle.h"
+
+#include "module_base/global_variable.h"
 #include "module_base/memory.h"
+#include "module_base/parallel_common.h"
 #include "module_base/ylm.h"
 #include "module_basis/module_nao/real_gaunt_table.h"
+
 #include <memory>
-#include "module_base/parallel_common.h"
-#include "module_base/global_variable.h"
 
 void TwoCenterBundle::build_orb(int ntype, const std::string* file_orb0)
 {
     std::vector<std::string> file_orb(ntype);
     if (GlobalV::MY_RANK == 0)
     {
-        std::transform(file_orb0, file_orb0 + ntype, file_orb.begin(),
-                [](const std::string& file) { return GlobalV::global_orbital_dir + file; });
+        std::transform(file_orb0, file_orb0 + ntype, file_orb.begin(), [](const std::string& file) {
+            return GlobalV::global_orbital_dir + file;
+        });
     }
 #ifdef __MPI
     Parallel_Common::bcast_string(file_orb.data(), ntype);
@@ -48,7 +51,7 @@ void TwoCenterBundle::build_alpha(int ndesc, std::string* file_desc0)
 
 void TwoCenterBundle::build_orb_onsite(int ntype, double radius)
 {
-    if(GlobalV::onsite_radius > 0)
+    if (GlobalV::onsite_radius > 0)
     {
         orb_onsite_ = std::unique_ptr<RadialCollection>(new RadialCollection);
         orb_onsite_->build(orb_.get(), GlobalV::onsite_radius);
@@ -60,23 +63,28 @@ void TwoCenterBundle::tabulate()
     ModuleBase::SphericalBesselTransformer sbt(true);
     orb_->set_transformer(sbt);
     beta_->set_transformer(sbt);
-    if (alpha_) alpha_->set_transformer(sbt);
-    if (orb_onsite_) orb_onsite_->set_transformer(sbt);
+    if (alpha_)
+        alpha_->set_transformer(sbt);
+    if (orb_onsite_)
+        orb_onsite_->set_transformer(sbt);
 
     //================================================================
     //              build two-center integration tables
     //================================================================
     // set up a universal radial grid
     double rmax = std::max(orb_->rcut_max(), beta_->rcut_max());
-    if (alpha_) rmax = std::max(rmax, alpha_->rcut_max());
+    if (alpha_)
+        rmax = std::max(rmax, alpha_->rcut_max());
     double dr = 0.01;
     double cutoff = 2.0 * rmax;
     int nr = static_cast<int>(rmax / dr) + 1;
 
     orb_->set_uniform_grid(true, nr, cutoff, 'i', true);
     beta_->set_uniform_grid(true, nr, cutoff, 'i', true);
-    if (alpha_) alpha_->set_uniform_grid(true, nr, cutoff, 'i', true);
-    if (orb_onsite_) orb_onsite_->set_uniform_grid(true, nr, cutoff, 'i', true);
+    if (alpha_)
+        alpha_->set_uniform_grid(true, nr, cutoff, 'i', true);
+    if (orb_onsite_)
+        orb_onsite_->set_uniform_grid(true, nr, cutoff, 'i', true);
 
     // build TwoCenterIntegrator objects
     kinetic_orb = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
@@ -109,38 +117,47 @@ void TwoCenterBundle::tabulate()
     sbt.clear();
 }
 
-void TwoCenterBundle::tabulate(
-    const double lcao_ecut,
-    const double lcao_dk,
-    const double lcao_dr,
-    const double lcao_rmax
-)
+void TwoCenterBundle::tabulate(const double lcao_ecut,
+                               const double lcao_dk,
+                               const double lcao_dr,
+                               const double lcao_rmax)
 {
     ModuleBase::SphericalBesselTransformer sbt(true);
     orb_->set_transformer(sbt);
     beta_->set_transformer(sbt);
-    if (alpha_) alpha_->set_transformer(sbt);
-    if (orb_onsite_) orb_onsite_->set_transformer(sbt);
+    if (alpha_)
+        alpha_->set_transformer(sbt);
+    if (orb_onsite_)
+        orb_onsite_->set_transformer(sbt);
 
     //================================================================
     //              build two-center integration tables
     //================================================================
 
     // old formula for the number of k-space grid points
-    int nk =  static_cast<int>( sqrt(lcao_ecut) / lcao_dk )  + 4;
+    int nk = static_cast<int>(sqrt(lcao_ecut) / lcao_dk) + 4;
     nk += 1 - nk % 2; // make nk odd
 
     std::vector<double> kgrid(nk);
-    for (int ik = 0; ik < nk; ++ik) { kgrid[ik] = ik * lcao_dk; }
+    for (int ik = 0; ik < nk; ++ik)
+    {
+        kgrid[ik] = ik * lcao_dk;
+    }
 
     orb_->set_grid(false, nk, kgrid.data(), 't');
     beta_->set_grid(false, nk, kgrid.data(), 't');
-    if (alpha_) { alpha_->set_grid(false, nk, kgrid.data(), 't'); }
-    if (orb_onsite_) { orb_onsite_->set_grid(false, nk, kgrid.data(), 't'); }
+    if (alpha_)
+    {
+        alpha_->set_grid(false, nk, kgrid.data(), 't');
+    }
+    if (orb_onsite_)
+    {
+        orb_onsite_->set_grid(false, nk, kgrid.data(), 't');
+    }
 
     // "st" stands for overlap (s) and kinetic (t)
     const double cutoff_st = std::min(lcao_rmax, 2.0 * orb_->rcut_max());
-    const int nr_st = static_cast<int>(cutoff_st/ lcao_dr) + 5;
+    const int nr_st = static_cast<int>(cutoff_st / lcao_dr) + 5;
 
     kinetic_orb = std::unique_ptr<TwoCenterIntegrator>(new TwoCenterIntegrator);
     kinetic_orb->tabulate(*orb_, *orb_, 'T', nr_st, cutoff_st);
@@ -181,7 +198,11 @@ void TwoCenterBundle::tabulate(
     sbt.clear();
 }
 
-void TwoCenterBundle::to_LCAO_Orbitals(LCAO_Orbitals& ORB, const double lcao_ecut, const double lcao_dk, const double lcao_dr, const double lcao_rmax) const
+void TwoCenterBundle::to_LCAO_Orbitals(LCAO_Orbitals& ORB,
+                                       const double lcao_ecut,
+                                       const double lcao_dk,
+                                       const double lcao_dr,
+                                       const double lcao_rmax) const
 {
     ORB.ntype = orb_->ntype();
     ORB.lmax = orb_->lmax();
@@ -190,7 +211,7 @@ void TwoCenterBundle::to_LCAO_Orbitals(LCAO_Orbitals& ORB, const double lcao_ecu
     ORB.dR = lcao_dr;
     ORB.Rmax = lcao_rmax; // lcao_rmax, see ORB_control.cpp
     ORB.dr_uniform = 0.001;
-    
+
     // Due to algorithmic difference in the spherical Bessel transform
     // (FFT vs. Simpson's integration), k grid of FFT is not appropriate
     // for Simpson's integration. The k grid for Simpson's integration is
@@ -199,17 +220,16 @@ void TwoCenterBundle::to_LCAO_Orbitals(LCAO_Orbitals& ORB, const double lcao_ecu
     ORB.ecutwfc = lcao_ecut;
     ORB.dk = lcao_dk;
 
-	if(ORB.ecutwfc < 20)
-	{
-		ORB.kmesh = static_cast<int>( 2 * sqrt(ORB.ecutwfc) / ORB.dk )  + 4;
-	}
-	else
-	{
-		ORB.kmesh = static_cast<int>( sqrt(ORB.ecutwfc) / ORB.dk )  + 4;
-	}
+    if (ORB.ecutwfc < 20)
+    {
+        ORB.kmesh = static_cast<int>(2 * sqrt(ORB.ecutwfc) / ORB.dk) + 4;
+    }
+    else
+    {
+        ORB.kmesh = static_cast<int>(sqrt(ORB.ecutwfc) / ORB.dk) + 4;
+    }
     ORB.kmesh += 1 - ORB.kmesh % 2;
 
-
     delete[] ORB.Phi;
     ORB.Phi = new Numerical_Orbital[orb_->ntype()];
     for (int itype = 0; itype < orb_->ntype(); ++itype)
diff --git a/source/module_basis/module_nao/two_center_bundle.h b/source/module_basis/module_nao/two_center_bundle.h
index f4a74a43be..9f58014a0b 100644
--- a/source/module_basis/module_nao/two_center_bundle.h
+++ b/source/module_basis/module_nao/two_center_bundle.h
@@ -1,8 +1,8 @@
 #ifndef TWO_CENTER_BUNDLE_H
 #define TWO_CENTER_BUNDLE_H
 
-#include "module_basis/module_nao/two_center_integrator.h"
 #include "module_basis/module_ao/ORB_read.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 
 #include <memory>
 #include <string>
@@ -24,12 +24,7 @@ class TwoCenterBundle
     // Unlike the tabulate() above, this overload function computes
     // two-center integration table by direct integration with Simpson's
     // rule, which was the algorithm used prior to v3.3.4.
-    void tabulate(
-        const double lcao_ecut,
-        const double lcao_dk,
-        const double lcao_dr,
-        const double lcao_rmax
-    );
+    void tabulate(const double lcao_ecut, const double lcao_dk, const double lcao_dr, const double lcao_rmax);
 
     /**
      * @brief Overwrites the content of a LCAO_Orbitals object (e.g. GlobalC::ORB)
@@ -41,8 +36,7 @@ class TwoCenterBundle
                           const double lcao_ecut,
                           const double lcao_dk,
                           const double lcao_dr,
-                          const double lcao_rmax
-                          ) const;
+                          const double lcao_rmax) const;
 
     std::unique_ptr<TwoCenterIntegrator> kinetic_orb;
     std::unique_ptr<TwoCenterIntegrator> overlap_orb;
diff --git a/source/module_basis/module_nao/two_center_table.cpp b/source/module_basis/module_nao/two_center_table.cpp
index 0c9b5bf5f9..f5f6e07193 100644
--- a/source/module_basis/module_nao/two_center_table.cpp
+++ b/source/module_basis/module_nao/two_center_table.cpp
@@ -1,15 +1,15 @@
 #include "module_basis/module_nao/two_center_table.h"
 
+#include "module_base/constants.h"
+#include "module_base/cubic_spline.h"
+#include "module_base/math_integral.h"
+
 #include <algorithm>
 #include <cstring>
 #include <iostream>
 #include <limits>
 #include <numeric>
 
-#include "module_base/constants.h"
-#include "module_base/math_integral.h"
-#include "module_base/cubic_spline.h"
-
 void TwoCenterTable::build(const RadialCollection& bra,
                            const RadialCollection& ket,
                            const char op,
@@ -37,8 +37,13 @@ void TwoCenterTable::build(const RadialCollection& bra,
     std::for_each(rgrid_, rgrid_ + nr_, [this, dr](double& r) { r = (&r - rgrid_) * dr; });
 
     // index the table by generating a map from (itype1, l1, izeta1, itype2, l2, izeta2, l) to a row index
-    index_map_.resize({bra.ntype(), bra.lmax() + 1, bra.nzeta_max(),
-                       ket.ntype(), ket.lmax() + 1, ket.nzeta_max(), bra.lmax() + ket.lmax() + 1});
+    index_map_.resize({bra.ntype(),
+                       bra.lmax() + 1,
+                       bra.nzeta_max(),
+                       ket.ntype(),
+                       ket.lmax() + 1,
+                       ket.nzeta_max(),
+                       bra.lmax() + ket.lmax() + 1});
     std::fill(index_map_.data<int>(), index_map_.data<int>() + index_map_.NumElements(), -1);
 
     ntab_ = 0;
@@ -61,8 +66,8 @@ const double* TwoCenterTable::table(const int itype1,
 #ifdef __DEBUG
     assert(is_present(itype1, l1, izeta1, itype2, l2, izeta2, l));
 #endif
-    return deriv ? dtable_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l)):
-                    table_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l));
+    return deriv ? dtable_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l))
+                 : table_.inner_most_ptr<double>(index_map_.get_value<int>(itype1, l1, izeta1, itype2, l2, izeta2, l));
 }
 
 void TwoCenterTable::lookup(const int itype1,
@@ -82,12 +87,14 @@ void TwoCenterTable::lookup(const int itype1,
 
     if (R > rmax())
     {
-        if (val) *val = 0.0;
-        if (dval) *dval = 0.0;
+        if (val)
+            *val = 0.0;
+        if (dval)
+            *dval = 0.0;
         return;
     }
 
-    const double*  tab = table(itype1, l1, izeta1, itype2, l2, izeta2, l, false);
+    const double* tab = table(itype1, l1, izeta1, itype2, l2, izeta2, l, false);
     const double* dtab = table(itype1, l1, izeta1, itype2, l2, izeta2, l, true);
     ModuleBase::CubicSpline::eval(nr_, rgrid_, tab, dtab, 1, &R, val, dval);
 }
@@ -139,9 +146,7 @@ double TwoCenterTable::dfact(int l) const
     return result;
 }
 
-void TwoCenterTable::two_center_loop(const RadialCollection& bra, 
-                                     const RadialCollection& ket, 
-                                     looped_func f)
+void TwoCenterTable::two_center_loop(const RadialCollection& bra, const RadialCollection& ket, looped_func f)
 {
     for (int l = 0; l <= bra.lmax() + ket.lmax(); ++l)
         for (int l1 = 0; l1 <= bra.lmax(); ++l1)
@@ -181,7 +186,7 @@ void TwoCenterTable::_tabulate(const NumericalRadial* it1, const NumericalRadial
     //
     // See the developer's document for more details.
     double dr = rmax_ / (nr_ - 1);
-    if ( l > 0 )
+    if (l > 0)
     {
         // divide S(R) by R^l (except the R=0 point)
         std::for_each(&tab[1], tab + nr_, [&](double& val) { val /= std::pow(dr * (&val - tab), l); });
@@ -197,37 +202,37 @@ void TwoCenterTable::_tabulate(const NumericalRadial* it1, const NumericalRadial
         int op_exp = l;
         switch (op_)
         {
-        case 'S': op_exp += 2;
-                  break;
-        case 'T': op_exp += 4;
-                  break;
-        default: ; // currently not supposed to happen
+        case 'S':
+            op_exp += 2;
+            break;
+        case 'T':
+            op_exp += 4;
+            break;
+        default:; // currently not supposed to happen
         }
 
         for (int ik = 0; ik != nk; ++ik)
         {
-            fk[ik] = it1->kvalue(ik) * it2->kvalue(ik) 
-                    * std::pow(kgrid[ik], op_exp);
+            fk[ik] = it1->kvalue(ik) * it2->kvalue(ik) * std::pow(kgrid[ik], op_exp);
         }
 
-        tab[0] = ModuleBase::Integral::simpson(nk, fk, &h[1]) 
-                * ModuleBase::FOUR_PI / dfact(2 * l + 1);
+        tab[0] = ModuleBase::Integral::simpson(nk, fk, &h[1]) * ModuleBase::FOUR_PI / dfact(2 * l + 1);
 
         delete[] fk;
         delete[] h;
     }
 
-    // The derivative table stores the derivative of S(R)/R^l or T(R)/R^l 
+    // The derivative table stores the derivative of S(R)/R^l or T(R)/R^l
     // instead of bare dS(R)/dR or dT(R)/dR, which simplifies further calculation.
     //
     // The derivatives are computed from a cubic spline interpolation rather
     // than two spherical Bessel transforms. By doing so, we achieve a good
     // consistency between the table and its derivative during interpolation.
     using ModuleBase::CubicSpline;
-    CubicSpline::build(
-            nr_, rgrid_, table_.inner_most_ptr<double>(itab),
-            {CubicSpline::BoundaryType::first_deriv, 0.0},
-            {CubicSpline::BoundaryType::first_deriv, 0.0},
-            dtable_.inner_most_ptr<double>(itab));
+    CubicSpline::build(nr_,
+                       rgrid_,
+                       table_.inner_most_ptr<double>(itab),
+                       {CubicSpline::BoundaryType::first_deriv, 0.0},
+                       {CubicSpline::BoundaryType::first_deriv, 0.0},
+                       dtable_.inner_most_ptr<double>(itab));
 }
-
diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp
index ce958c250b..2355bd306a 100644
--- a/source/module_esolver/esolver_ks_lcao.cpp
+++ b/source/module_esolver/esolver_ks_lcao.cpp
@@ -251,9 +251,13 @@ void ESolver_KS_LCAO<TK, TR>::before_all_runners(Input& inp, UnitCell& ucell)
     // 12) initialize the potential
     if (this->pelec->pot == nullptr)
     {
-        this->pelec->pot
-            = new elecstate::Potential(this->pw_rhod, this->pw_rho, &GlobalC::ucell, &(GlobalC::ppcell.vloc),
-                                       &(this->sf), &(this->pelec->f_en.etxc), &(this->pelec->f_en.vtxc));
+        this->pelec->pot = new elecstate::Potential(this->pw_rhod,
+                                                    this->pw_rho,
+                                                    &GlobalC::ucell,
+                                                    &(GlobalC::ppcell.vloc),
+                                                    &(this->sf),
+                                                    &(this->pelec->f_en.etxc),
+                                                    &(this->pelec->f_en.vtxc));
     }
 
 #ifdef __DEEPKS
@@ -318,9 +322,13 @@ void ESolver_KS_LCAO<TK, TR>::init_after_vc(Input& inp, UnitCell& ucell)
         // Initialize the potential.
         if (this->pelec->pot == nullptr)
         {
-            this->pelec->pot
-                = new elecstate::Potential(this->pw_rhod, this->pw_rho, &GlobalC::ucell, &(GlobalC::ppcell.vloc),
-                                           &(this->sf), &(this->pelec->f_en.etxc), &(this->pelec->f_en.vtxc));
+            this->pelec->pot = new elecstate::Potential(this->pw_rhod,
+                                                        this->pw_rho,
+                                                        &GlobalC::ucell,
+                                                        &(GlobalC::ppcell.vloc),
+                                                        &(this->sf),
+                                                        &(this->pelec->f_en.etxc),
+                                                        &(this->pelec->f_en.vtxc));
         }
     }
 
@@ -350,25 +358,24 @@ void ESolver_KS_LCAO<TK, TR>::cal_force(ModuleBase::matrix& force)
     ModuleBase::TITLE("ESolver_KS_LCAO", "cal_force");
     ModuleBase::timer::tick("ESolver_KS_LCAO", "cal_force");
 
-	Force_Stress_LCAO<TK> fsl(this->RA, GlobalC::ucell.nat);
-
-	fsl.getForceStress(
-            GlobalV::CAL_FORCE,
-			GlobalV::CAL_STRESS,
-			GlobalV::TEST_FORCE,
-			GlobalV::TEST_STRESS,
-            this->orb_con.ParaV, 
-			this->pelec,
-			this->psi,
-            this->LM,
-            this->GG, // mohan add 2024-04-01
-            this->GK, // mohan add 2024-04-01
-            uot_,
-			force,
-			this->scs,
-			this->sf,
-			this->kv,
-			this->pw_rho,
+    Force_Stress_LCAO<TK> fsl(this->RA, GlobalC::ucell.nat);
+
+    fsl.getForceStress(GlobalV::CAL_FORCE,
+                       GlobalV::CAL_STRESS,
+                       GlobalV::TEST_FORCE,
+                       GlobalV::TEST_STRESS,
+                       this->orb_con.ParaV,
+                       this->pelec,
+                       this->psi,
+                       this->LM,
+                       this->GG, // mohan add 2024-04-01
+                       this->GK, // mohan add 2024-04-01
+                       uot_,
+                       force,
+                       this->scs,
+                       this->sf,
+                       this->kv,
+                       this->pw_rho,
 #ifdef __EXX
                        *this->exx_lri_double,
                        *this->exx_lri_complex,
@@ -854,8 +861,10 @@ void ESolver_KS_LCAO<TK, TR>::hamilt2density(int istep, int iter, double ethr)
     }
 
     // 11) compute magnetization, only for spin==2
-    GlobalC::ucell.magnet.compute_magnetization(this->pelec->charge->nrxx, this->pelec->charge->nxyz,
-                                                this->pelec->charge->rho, this->pelec->nelec_spin.data());
+    GlobalC::ucell.magnet.compute_magnetization(this->pelec->charge->nrxx,
+                                                this->pelec->charge->nxyz,
+                                                this->pelec->charge->rho,
+                                                this->pelec->nelec_spin.data());
 
     // 12) calculate delta energy
     this->pelec->f_en.deband = this->pelec->cal_delta_eband();
@@ -1077,16 +1086,22 @@ void ESolver_KS_LCAO<TK, TR>::after_scf(const int istep)
     // 1) write charge difference into files for charge extrapolation
     if (GlobalV::CALCULATION != "scf")
     {
-        this->CE.save_files(istep, GlobalC::ucell,
+        this->CE.save_files(istep,
+                            GlobalC::ucell,
 #ifdef __MPI
                             this->pw_big,
 #endif
-                            this->pelec->charge, &this->sf);
+                            this->pelec->charge,
+                            &this->sf);
     }
 
     // 2) write density matrix for sparse matrix
     ModuleIO::write_dmr(dynamic_cast<const elecstate::ElecStateLCAO<TK>*>(this->pelec)->get_DM()->get_DMR_vector(),
-                        GlobalV::NLOCAL, INPUT.out_dm1, false, GlobalV::out_app_flag, istep);
+                        GlobalV::NLOCAL,
+                        INPUT.out_dm1,
+                        false,
+                        GlobalV::out_app_flag,
+                        istep);
 
     // 3) write charge density
     if (GlobalV::out_chg)
@@ -1346,20 +1361,19 @@ ModuleIO::Output_DM ESolver_KS_LCAO<TK, TR>::create_Output_DM(int is, int iter)
 template <typename TK, typename TR>
 ModuleIO::Output_Mat_Sparse<TK> ESolver_KS_LCAO<TK, TR>::create_Output_Mat_Sparse(int istep)
 {
-	return ModuleIO::Output_Mat_Sparse<TK>(
-            hsolver::HSolverLCAO<TK>::out_mat_hsR,
-			hsolver::HSolverLCAO<TK>::out_mat_dh,
-			hsolver::HSolverLCAO<TK>::out_mat_t,
-			INPUT.out_mat_r,
-			istep,
-			this->pelec->pot->get_effective_v(),
-			this->orb_con.ParaV,
-            this->GK, // mohan add 2024-04-01
-            uot_,
-			this->LM,
-            GlobalC::GridD, // mohan add 2024-04-06
-			this->kv,
-			this->p_hamilt);
+    return ModuleIO::Output_Mat_Sparse<TK>(hsolver::HSolverLCAO<TK>::out_mat_hsR,
+                                           hsolver::HSolverLCAO<TK>::out_mat_dh,
+                                           hsolver::HSolverLCAO<TK>::out_mat_t,
+                                           INPUT.out_mat_r,
+                                           istep,
+                                           this->pelec->pot->get_effective_v(),
+                                           this->orb_con.ParaV,
+                                           this->GK, // mohan add 2024-04-01
+                                           uot_,
+                                           this->LM,
+                                           GlobalC::GridD, // mohan add 2024-04-06
+                                           this->kv,
+                                           this->p_hamilt);
 }
 
 //------------------------------------------------------------------------------
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
index a93f1c95e3..a1cbfa1367 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_beta.cpp
@@ -1,27 +1,25 @@
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 #include "module_base/timer.h"
-
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 
 #ifdef __MKL
-#include <mkl_service.h>  // mkl_get_max_threads
+#include <mkl_service.h> // mkl_get_max_threads
 #endif
 
 namespace LCAO_domain
 {
 
-void build_Nonlocal_beta_new(
-    LCAO_Matrix &lm,
-    double* HSloc,
-	const UnitCell &ucell,
-	const LCAO_Orbitals& orb,
-	const ORB_gen_tables& uot,
-	Grid_Driver* GridD) //update by liuyu 2021-04-07
+void build_Nonlocal_beta_new(LCAO_Matrix& lm,
+                             double* HSloc,
+                             const UnitCell& ucell,
+                             const LCAO_Orbitals& orb,
+                             const ORB_gen_tables& uot,
+                             Grid_Driver* GridD) // update by liuyu 2021-04-07
 {
-    ModuleBase::TITLE("LCAO_domain","b_NL_beta_new");
-    ModuleBase::timer::tick ("LCAO_domain","b_NL_beta_new");
+    ModuleBase::TITLE("LCAO_domain", "b_NL_beta_new");
+    ModuleBase::timer::tick("LCAO_domain", "b_NL_beta_new");
 
-	const Parallel_Orbitals* pv = lm.ParaV;
-	const int npol = GlobalV::NPOL;
+    const Parallel_Orbitals* pv = lm.ParaV;
+    const int npol = GlobalV::NPOL;
 
 #ifdef __MKL
     const int mkl_threads = mkl_get_max_threads();
@@ -31,37 +29,37 @@ void build_Nonlocal_beta_new(
     const std::vector<AdjacentAtomInfo> adjs_all = GridD->get_adjs(ucell);
 
 #ifdef _OPENMP
-    #pragma omp parallel
+#pragma omp parallel
     {
         double* Nonlocal_thread;
         Nonlocal_thread = new double[pv->nloc];
         ModuleBase::GlobalFunc::ZEROS(Nonlocal_thread, pv->nloc);
-        #pragma omp for schedule(dynamic)
+#pragma omp for schedule(dynamic)
 #endif
-        for(int iat=0; iat<ucell.nat; iat++)
+        for (int iat = 0; iat < ucell.nat; iat++)
         {
             const int T0 = ucell.iat2it[iat];
             const int I0 = ucell.iat2ia[iat];
             Atom* atom0 = &ucell.atoms[T0];
 
-            //=======================================================
-            //Step1:
-            //saves <beta|psi>, where beta runs over L0,M0 on atom I0
-            //and psi runs over atomic basis sets on the current core
-            //=======================================================
-            #ifdef _OPENMP
-                std::vector<std::unordered_map<int,std::vector<double>>> nlm_tot_thread;
-                nlm_tot_thread.resize(adjs_all[iat].adj_num + 1);
-            #else 
-                std::vector<std::unordered_map<int,std::vector<double>>> nlm_tot;
-                nlm_tot.resize(adjs_all[iat].adj_num + 1);
-            #endif 
+//=======================================================
+// Step1:
+// saves <beta|psi>, where beta runs over L0,M0 on atom I0
+// and psi runs over atomic basis sets on the current core
+//=======================================================
+#ifdef _OPENMP
+            std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot_thread;
+            nlm_tot_thread.resize(adjs_all[iat].adj_num + 1);
+#else
+        std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
+        nlm_tot.resize(adjs_all[iat].adj_num + 1);
+#endif
 
             const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
             const double Rcut_Beta = ucell.infoNL.Beta[T0].get_rcut_max();
 
-            //outermost loop : all adjacent atoms
-            for(int ad_count=0; ad_count < adjs_all[iat].adj_num + 1; ad_count++)
+            // outermost loop : all adjacent atoms
+            for (int ad_count = 0; ad_count < adjs_all[iat].adj_num + 1; ad_count++)
             {
                 const int T1 = adjs_all[iat].ntype[ad_count];
                 const int I1 = adjs_all[iat].natom[ad_count];
@@ -69,163 +67,164 @@ void build_Nonlocal_beta_new(
                 const double Rcut_AO1 = orb.Phi[T1].getRcut();
                 const ModuleBase::Vector3<double> tau1 = adjs_all[iat].adjacent_tau[ad_count];
                 const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw*npol;
+                const int nw1_tot = atom1->nw * npol;
 
-                #ifdef _OPENMP
-                    nlm_tot_thread[ad_count].clear();
-                #else 
-                    nlm_tot[ad_count].clear();
-                #endif 
+#ifdef _OPENMP
+                nlm_tot_thread[ad_count].clear();
+#else
+            nlm_tot[ad_count].clear();
+#endif
 
-                //middle loop : atomic basis on current processor (either row or column)
-                const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                // middle loop : atomic basis on current processor (either row or column)
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
                 if (dist1 > Rcut_Beta + Rcut_AO1)
                 {
                     continue;
                 }
 
-                for(int iw1=0; iw1<nw1_tot; ++iw1)
+                for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
                 {
                     const int iw1_all = start1 + iw1;
                     const int iw1_local = pv->global2local_row(iw1_all);
                     const int iw2_local = pv->global2local_col(iw1_all);
 
-                    if(iw1_local < 0 && iw2_local < 0) continue;
+                    if (iw1_local < 0 && iw2_local < 0)
+                        continue;
 
-                    const int iw1_0 = iw1/npol;
+                    const int iw1_0 = iw1 / npol;
                     std::vector<std::vector<double>> nlm;
-                    //2D, but first dimension is only 1 here
-                    //for force, the right hand side is the gradient
-                    //and the first dimension is then 3
-                    //inner loop : all projectors (L0,M0)
+                    // 2D, but first dimension is only 1 here
+                    // for force, the right hand side is the gradient
+                    // and the first dimension is then 3
+                    // inner loop : all projectors (L0,M0)
 
-                    int L1 = atom1->iw2l[ iw1_0 ];
-                    int N1 = atom1->iw2n[ iw1_0 ];
-                    int m1 = atom1->iw2m[ iw1_0 ];
+                    int L1 = atom1->iw2l[iw1_0];
+                    int N1 = atom1->iw2n[iw1_0];
+                    int m1 = atom1->iw2m[iw1_0];
 
                     // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                    int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                    int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                     ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
-                    uot.two_center_bundle->overlap_orb_beta->snap(
-                            T1, L1, N1, M1, T0, dtau * ucell.lat0, false, nlm);
+                    uot.two_center_bundle->overlap_orb_beta->snap(T1, L1, N1, M1, T0, dtau * ucell.lat0, false, nlm);
 
-                    #ifdef _OPENMP
-                        nlm_tot_thread[ad_count].insert({iw1_all,nlm[0]});
-                    #else 
-                        nlm_tot[ad_count].insert({iw1_all,nlm[0]});
-                    #endif 
-                }//end iw
-            }//end ad
+#ifdef _OPENMP
+                    nlm_tot_thread[ad_count].insert({iw1_all, nlm[0]});
+#else
+                nlm_tot[ad_count].insert({iw1_all, nlm[0]});
+#endif
+                } // end iw
+            }     // end ad
 
             //=======================================================
-            //Step2:
-            //calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j>
-            //and accumulate the value to Hloc_fixed(i,j)
+            // Step2:
+            // calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j>
+            // and accumulate the value to Hloc_fixed(i,j)
             //=======================================================
-            for(int ad1_count=0; ad1_count < adjs_all[iat].adj_num + 1; ad1_count++)
+            for (int ad1_count = 0; ad1_count < adjs_all[iat].adj_num + 1; ad1_count++)
             {
                 const int T1 = adjs_all[iat].ntype[ad1_count];
                 const int I1 = adjs_all[iat].natom[ad1_count];
                 const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
                 const ModuleBase::Vector3<double> tau1 = adjs_all[iat].adjacent_tau[ad1_count];
                 const Atom* atom1 = &ucell.atoms[T1];
-                const int nw1_tot = atom1->nw*npol;
+                const int nw1_tot = atom1->nw * npol;
                 const double Rcut_AO1 = orb.Phi[T1].getRcut();
 
-                for (int ad2_count=0; ad2_count < adjs_all[iat].adj_num + 1; ad2_count++)
+                for (int ad2_count = 0; ad2_count < adjs_all[iat].adj_num + 1; ad2_count++)
                 {
                     const int T2 = adjs_all[iat].ntype[ad2_count];
                     const int I2 = adjs_all[iat].natom[ad2_count];
                     const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
                     const ModuleBase::Vector3<double> tau2 = adjs_all[iat].adjacent_tau[ad2_count];
                     const Atom* atom2 = &ucell.atoms[T2];
-                    const int nw2_tot = atom2->nw*npol;
+                    const int nw2_tot = atom2->nw * npol;
                     const double Rcut_AO2 = orb.Phi[T2].getRcut();
-                    const double dist1 = (tau1-tau0).norm() * ucell.lat0;
-                    const double dist2 = (tau2-tau0).norm() * ucell.lat0;
+                    const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
+                    const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
 
-                    if (dist1 > Rcut_Beta + Rcut_AO1
-                            || dist2 > Rcut_Beta + Rcut_AO2)
+                    if (dist1 > Rcut_Beta + Rcut_AO1 || dist2 > Rcut_Beta + Rcut_AO2)
                     {
                         continue;
                     }
 
-                    for(int iw1=0; iw1<nw1_tot; ++iw1)
+                    for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
                     {
                         const int iw1_all = start1 + iw1;
                         const int iw1_local = pv->global2local_row(iw1_all);
-                        if(iw1_local < 0) continue;
-                        const int iw1_0 = iw1/npol;
-                        for(int iw2=0; iw2<nw2_tot; ++iw2)
+                        if (iw1_local < 0)
+                            continue;
+                        const int iw1_0 = iw1 / npol;
+                        for (int iw2 = 0; iw2 < nw2_tot; ++iw2)
                         {
                             const int iw2_all = start2 + iw2;
                             const int iw2_local = pv->global2local_col(iw2_all);
-                            if(iw2_local < 0) continue;
-                            const int iw2_0 = iw2/npol;
-                            #ifdef _OPENMP
-                                std::vector<double> nlm1 = nlm_tot_thread[ad1_count][iw1_all];
-                                std::vector<double> nlm2 = nlm_tot_thread[ad2_count][iw2_all];
-                            #else 
-                                std::vector<double> nlm1 = nlm_tot[ad1_count][iw1_all];
-                                std::vector<double> nlm2 = nlm_tot[ad2_count][iw2_all];
-                            #endif 
-
-                            assert(nlm1.size()==nlm2.size());
-                            #ifdef _OPENMP
-                                double nlm_thread=0.0;
-                            #else 
-                                double nlm=0.0;
-                            #endif
+                            if (iw2_local < 0)
+                                continue;
+                            const int iw2_0 = iw2 / npol;
+#ifdef _OPENMP
+                            std::vector<double> nlm1 = nlm_tot_thread[ad1_count][iw1_all];
+                            std::vector<double> nlm2 = nlm_tot_thread[ad2_count][iw2_all];
+#else
+                        std::vector<double> nlm1 = nlm_tot[ad1_count][iw1_all];
+                        std::vector<double> nlm2 = nlm_tot[ad2_count][iw2_all];
+#endif
+
+                            assert(nlm1.size() == nlm2.size());
+#ifdef _OPENMP
+                            double nlm_thread = 0.0;
+#else
+                        double nlm = 0.0;
+#endif
                             const int nproj = ucell.infoNL.nproj[T0];
                             int ib = 0;
-                            for(int nb = 0; nb < nproj; nb++)
+                            for (int nb = 0; nb < nproj; nb++)
                             {
                                 const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-                                for(int m=0;m<2*L0+1;m++)
+                                for (int m = 0; m < 2 * L0 + 1; m++)
                                 {
-                                    #ifdef _OPENMP
-                                        nlm_thread += nlm1[ib]*nlm2[ib]*ucell.atoms[T0].ncpp.dion(nb,nb);
-                                    #else 
-                                        nlm += nlm1[ib]*nlm2[ib]*ucell.atoms[T0].ncpp.dion(nb,nb);
-                                    #endif
-                                    ib+=1;
+#ifdef _OPENMP
+                                    nlm_thread += nlm1[ib] * nlm2[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
+#else
+                                nlm += nlm1[ib] * nlm2[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
+#endif
+                                    ib += 1;
                                 }
                             }
-                            assert(ib==nlm1.size());
+                            assert(ib == nlm1.size());
 
                             const int ir = pv->global2local_row(iw1_all);
                             const int ic = pv->global2local_col(iw2_all);
-                            long index=0;
+                            long index = 0;
                             if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
                             {
-                                index=ic*pv->nrow+ir;
+                                index = ic * pv->nrow + ir;
                             }
                             else
                             {
-                                index=ir*pv->ncol+ic;
+                                index = ir * pv->ncol + ic;
                             }
 #ifdef _OPENMP
                             Nonlocal_thread[index] += nlm_thread;
 #else
-                            lm.set_HSgamma(iw1_all, iw2_all, nlm, HSloc);
+                        lm.set_HSgamma(iw1_all, iw2_all, nlm, HSloc);
 #endif
-                        }//iw2
-                    }//iw1
-                }//ad2
-            }//ad1
-        }//end iat
-
-        #ifdef _OPENMP
-            #pragma omp critical(cal_nonlocal)
+                        } // iw2
+                    }     // iw1
+                }         // ad2
+            }             // ad1
+        }                 // end iat
+
+#ifdef _OPENMP
+#pragma omp critical(cal_nonlocal)
+        {
+            for (int i = 0; i < pv->nloc; i++)
             {
-                for(int i=0; i<pv->nloc; i++)
-                {
-                    lm.Hloc_fixed[i] += Nonlocal_thread[i];
-                }
+                lm.Hloc_fixed[i] += Nonlocal_thread[i];
             }
-            delete[] Nonlocal_thread;
-        #endif
+        }
+        delete[] Nonlocal_thread;
+#endif
 #ifdef _OPENMP
     }
 #endif
@@ -233,9 +232,9 @@ void build_Nonlocal_beta_new(
 #ifdef __MKL
     mkl_set_num_threads(mkl_threads);
 #endif
-	
-    ModuleBase::timer::tick ("LCAO_domain","b_NL_beta_new");
-	return;
-}
 
+    ModuleBase::timer::tick("LCAO_domain", "b_NL_beta_new");
+    return;
 }
+
+} // namespace LCAO_domain
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp
index 7d44122bfd..b975083027 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_nl_mu.cpp
@@ -1,574 +1,599 @@
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 #include "module_base/timer.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 
 namespace LCAO_domain
 {
 
-
-typedef std::tuple<int,int,int,int> key_tuple;
+typedef std::tuple<int, int, int, int> key_tuple;
 
 #include "record_adj.h" //mohan add 2012-07-06
 
-void build_Nonlocal_mu_new(
-    LCAO_Matrix &lm,
-    ForceStressArrays &fsr,
-    double* NLloc,
-	const bool &calc_deri,
-	const UnitCell &ucell,
-	const LCAO_Orbitals &orb,
-	const ORB_gen_tables &uot,
-	Grid_Driver* GridD)
+void build_Nonlocal_mu_new(LCAO_Matrix& lm,
+                           ForceStressArrays& fsr,
+                           double* NLloc,
+                           const bool& calc_deri,
+                           const UnitCell& ucell,
+                           const LCAO_Orbitals& orb,
+                           const ORB_gen_tables& uot,
+                           Grid_Driver* GridD)
 {
-    ModuleBase::TITLE("LCAO_domain","vnl_mu_new");
+    ModuleBase::TITLE("LCAO_domain", "vnl_mu_new");
     ModuleBase::timer::tick("LCAO_domain", "vnl_mu_new");
     const Parallel_Orbitals* pv = lm.ParaV;
 
-	const int nspin = GlobalV::NSPIN;
-	const int npol = GlobalV::NPOL;
-	const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
-
-	// < phi1 | beta > < beta | phi2 >
-	// phi1 is within the unitcell.
-	// while beta is in the supercell.
-	// while phi2 is in the supercell.
-
-
-	//Step 1 : generate <psi|beta>
-
-	//This is the data structure for storing <psi|beta>
-	//It is a 4 layer data structure
-	//The outmost layer is std::vector with size being number of atoms in unit cell
-	//The second layer is a map, the key being a combination of 4 number (iat, dRx, dRy, dRz)
-	//which identifies a unique adjacent atom of the first atom
-	//The third layer is an unordered map, with key being the index of atomic basis |psi>
-	//The inner layer is a vector, each element representing a projector |beta>
-	//It then either stores the number <psi|beta> (nlm_tot)
-	//or a vector of 4, storing additionally <d/dx_i psi|beta> (nlm_tot1) x_i=x,y,z
-	std::vector<std::map<key_tuple,std::unordered_map<int,std::vector<double>>>> nlm_tot;
-	std::vector<std::map<key_tuple,std::unordered_map<int,std::vector<std::vector<double>>>>> nlm_tot1;
-
-	if(!calc_deri)
-	{
-		nlm_tot.resize(ucell.nat);
-	}
-	else
-	{
-		nlm_tot1.resize(ucell.nat);
-	}
+    const int nspin = GlobalV::NSPIN;
+    const int npol = GlobalV::NPOL;
+    const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
+
+    // < phi1 | beta > < beta | phi2 >
+    // phi1 is within the unitcell.
+    // while beta is in the supercell.
+    // while phi2 is in the supercell.
+
+    // Step 1 : generate <psi|beta>
+
+    // This is the data structure for storing <psi|beta>
+    // It is a 4 layer data structure
+    // The outmost layer is std::vector with size being number of atoms in unit cell
+    // The second layer is a map, the key being a combination of 4 number (iat, dRx, dRy, dRz)
+    // which identifies a unique adjacent atom of the first atom
+    // The third layer is an unordered map, with key being the index of atomic basis |psi>
+    // The inner layer is a vector, each element representing a projector |beta>
+    // It then either stores the number <psi|beta> (nlm_tot)
+    // or a vector of 4, storing additionally <d/dx_i psi|beta> (nlm_tot1) x_i=x,y,z
+    std::vector<std::map<key_tuple, std::unordered_map<int, std::vector<double>>>> nlm_tot;
+    std::vector<std::map<key_tuple, std::unordered_map<int, std::vector<std::vector<double>>>>> nlm_tot1;
+
+    if (!calc_deri)
+    {
+        nlm_tot.resize(ucell.nat);
+    }
+    else
+    {
+        nlm_tot1.resize(ucell.nat);
+    }
 #ifdef _OPENMP
 #pragma omp parallel for schedule(dynamic)
 #endif
-	for(int iat=0;iat<ucell.nat;iat++)
-	{
-		const int it = ucell.iat2it[iat];
-		const int ia = ucell.iat2ia[iat];
-
-		const double Rcut_Beta = ucell.infoNL.Beta[it].get_rcut_max();
-		const ModuleBase::Vector3<double> tau = ucell.atoms[it].tau[ia];
-		AdjacentAtomInfo adjs;
-        GridD->Find_atom(ucell, tau ,it, ia, &adjs);
-
-		if(!calc_deri)
-		{
-			nlm_tot[iat].clear();
-		}
-		else
-		{
-			nlm_tot1[iat].clear();
-		}
-
-		for (int ad=0; ad<adjs.adj_num+1 ; ++ad)
-		{
-			const int T1 = adjs.ntype[ad];
-			const int I1 = adjs.natom[ad];
-			const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-			const double Rcut_AO1 = orb.Phi[T1].getRcut();
-
-			const ModuleBase::Vector3<double> &tau1 = adjs.adjacent_tau[ad];
-			const Atom* atom1 = &ucell.atoms[T1];
-			const int nw1_tot = atom1->nw*npol;
-
-			const ModuleBase::Vector3<double> dtau = tau1-tau;
-			const double dist1 = dtau.norm2() * pow(ucell.lat0,2);
-			
-			if (dist1 > pow(Rcut_Beta + Rcut_AO1,2))
-			{
-				continue;
-			}
-			std::unordered_map<int,std::vector<double>> nlm_cur;
-			std::unordered_map<int,std::vector<std::vector<double>>> nlm_cur1;
-			
-			if(!calc_deri)
-			{
-				nlm_cur.clear();
-			}
-			else
-			{
-				nlm_cur1.clear();
-			}
-			for (int iw1=0; iw1<nw1_tot; ++iw1)
-			{
-				const int iw1_all = start1 + iw1;
+    for (int iat = 0; iat < ucell.nat; iat++)
+    {
+        const int it = ucell.iat2it[iat];
+        const int ia = ucell.iat2ia[iat];
+
+        const double Rcut_Beta = ucell.infoNL.Beta[it].get_rcut_max();
+        const ModuleBase::Vector3<double> tau = ucell.atoms[it].tau[ia];
+        AdjacentAtomInfo adjs;
+        GridD->Find_atom(ucell, tau, it, ia, &adjs);
+
+        if (!calc_deri)
+        {
+            nlm_tot[iat].clear();
+        }
+        else
+        {
+            nlm_tot1[iat].clear();
+        }
+
+        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+        {
+            const int T1 = adjs.ntype[ad];
+            const int I1 = adjs.natom[ad];
+            const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+            const double Rcut_AO1 = orb.Phi[T1].getRcut();
+
+            const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
+            const Atom* atom1 = &ucell.atoms[T1];
+            const int nw1_tot = atom1->nw * npol;
+
+            const ModuleBase::Vector3<double> dtau = tau1 - tau;
+            const double dist1 = dtau.norm2() * pow(ucell.lat0, 2);
+
+            if (dist1 > pow(Rcut_Beta + Rcut_AO1, 2))
+            {
+                continue;
+            }
+            std::unordered_map<int, std::vector<double>> nlm_cur;
+            std::unordered_map<int, std::vector<std::vector<double>>> nlm_cur1;
+
+            if (!calc_deri)
+            {
+                nlm_cur.clear();
+            }
+            else
+            {
+                nlm_cur1.clear();
+            }
+            for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
+            {
+                const int iw1_all = start1 + iw1;
                 const int iw1_local = pv->global2local_row(iw1_all);
                 const int iw2_local = pv->global2local_col(iw1_all);
-				if(iw1_local < 0 && iw2_local < 0)continue;
-				const int iw1_0 = iw1/npol;
-				std::vector<std::vector<double>> nlm;
-				//nlm is a vector of vectors, but size of outer vector is only 1 here
-				//If we are calculating force, we need also to store the gradient
-				//and size of outer vector is then 4
-				//inner loop : all projectors (L0,M0)
-
-                int L1 = atom1->iw2l[ iw1_0 ];
-                int N1 = atom1->iw2n[ iw1_0 ];
-                int m1 = atom1->iw2m[ iw1_0 ];
+                if (iw1_local < 0 && iw2_local < 0)
+                    continue;
+                const int iw1_0 = iw1 / npol;
+                std::vector<std::vector<double>> nlm;
+                // nlm is a vector of vectors, but size of outer vector is only 1 here
+                // If we are calculating force, we need also to store the gradient
+                // and size of outer vector is then 4
+                // inner loop : all projectors (L0,M0)
+
+                int L1 = atom1->iw2l[iw1_0];
+                int N1 = atom1->iw2n[iw1_0];
+                int m1 = atom1->iw2m[iw1_0];
 
                 // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                 ModuleBase::Vector3<double> dtau = tau - tau1;
-                uot.two_center_bundle->overlap_orb_beta->snap(
-                        T1, L1, N1, M1, it, dtau * ucell.lat0, calc_deri, nlm);
-
-				if(!calc_deri)
-				{
-					nlm_cur.insert({iw1_all,nlm[0]});
-				}
-				else
-				{
-					nlm_cur1.insert({iw1_all,nlm});
-				}
-			}//end iw
-
-			const int iat1=ucell.itia2iat(T1, I1);
-			const int rx1=adjs.box[ad].x;
-			const int ry1=adjs.box[ad].y;
-			const int rz1=adjs.box[ad].z;
-			key_tuple key_1(iat1,rx1,ry1,rz1);
-
-			if(!calc_deri)
-			{
-				nlm_tot[iat][key_1]=nlm_cur;
-			}
-			else
-			{
-				nlm_tot1[iat][key_1]=nlm_cur1;
-			}
-		}//end ad
-	}
-
-
-	//=======================================================
-	//Step2:	
-	//calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j>
-	//and accumulate the value to Hloc_fixedR(i,j)
-	//=======================================================
-	int total_nnr = 0;
+                uot.two_center_bundle->overlap_orb_beta->snap(T1, L1, N1, M1, it, dtau * ucell.lat0, calc_deri, nlm);
+
+                if (!calc_deri)
+                {
+                    nlm_cur.insert({iw1_all, nlm[0]});
+                }
+                else
+                {
+                    nlm_cur1.insert({iw1_all, nlm});
+                }
+            } // end iw
+
+            const int iat1 = ucell.itia2iat(T1, I1);
+            const int rx1 = adjs.box[ad].x;
+            const int ry1 = adjs.box[ad].y;
+            const int rz1 = adjs.box[ad].z;
+            key_tuple key_1(iat1, rx1, ry1, rz1);
+
+            if (!calc_deri)
+            {
+                nlm_tot[iat][key_1] = nlm_cur;
+            }
+            else
+            {
+                nlm_tot1[iat][key_1] = nlm_cur1;
+            }
+        } // end ad
+    }
+
+    //=======================================================
+    // Step2:
+    // calculate sum_(L0,M0) beta<psi_i|beta><beta|psi_j>
+    // and accumulate the value to Hloc_fixedR(i,j)
+    //=======================================================
+    int total_nnr = 0;
 #ifdef _OPENMP
-#pragma omp parallel reduction(+:total_nnr)
-{
+#pragma omp parallel reduction(+ : total_nnr)
+    {
 #endif
-	ModuleBase::Vector3<double> tau1, tau2, dtau;
-	ModuleBase::Vector3<double> dtau1, dtau2, tau0;
-	double distance = 0.0;
-	double rcut = 0.0;
-	double rcut1 = 0.0;
-    double rcut2 = 0.0;
-		
-	//	Record_adj RA;
-	//	RA.for_2d();
-
-	// psi1
+        ModuleBase::Vector3<double> tau1, tau2, dtau;
+        ModuleBase::Vector3<double> dtau1, dtau2, tau0;
+        double distance = 0.0;
+        double rcut = 0.0;
+        double rcut1 = 0.0;
+        double rcut2 = 0.0;
+
+        //	Record_adj RA;
+        //	RA.for_2d();
+
+        // psi1
 #ifdef _OPENMP
 // use schedule(dynamic) for load balancing because adj_num is various
 #pragma omp for schedule(dynamic)
 #endif
-    for (int iat1 = 0; iat1 < ucell.nat; iat1++)
-    {
-		const int T1 = ucell.iat2it[iat1];
-		const Atom* atom1 = &ucell.atoms[T1];
-		const int I1 = ucell.iat2ia[iat1];
-		{
-			//GridD->Find_atom( atom1->tau[I1] );
-			AdjacentAtomInfo adjs;
-			GridD->Find_atom(ucell, atom1->tau[I1] ,T1, I1, &adjs);
-			const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-			// Record_adj.for_2d() may not called in some case
-			int nnr = pv->nlocstart ? pv->nlocstart[iat1] : 0;
-			tau1 = atom1->tau[I1];
-
-			// psi2
-			for (int ad2=0; ad2<adjs.adj_num+1; ++ad2)
-			{
-				const int T2 = adjs.ntype[ad2];
-				const Atom* atom2 = &ucell.atoms[T2];
-				
-				const int I2 = adjs.natom[ad2];
-				const int iat2 = ucell.itia2iat(T2, I2);
-				const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
-				tau2 = adjs.adjacent_tau[ad2];
-
-				bool is_adj = false;
-
-				const int rx2=adjs.box[ad2].x;
-				const int ry2=adjs.box[ad2].y;
-				const int rz2=adjs.box[ad2].z;
-
-					
-				dtau = tau2 - tau1;
-				distance = dtau.norm2() * pow(ucell.lat0,2);
-				// this rcut is in order to make nnr consistent 
-				// with other matrix.
-				rcut = pow(orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut(),2);
-				if(distance < rcut) is_adj = true;
-				else if(distance >= rcut)
-				{
-					for (int ad0 = 0; ad0 < adjs.adj_num+1; ++ad0)
-					{
-						const int T0 = adjs.ntype[ad0];
-
-						tau0 = adjs.adjacent_tau[ad0];
-						dtau1 = tau0 - tau1;
-						dtau2 = tau0 - tau2;
-
-						const double distance1 = dtau1.norm2() * pow(ucell.lat0,2);
-						const double distance2 = dtau2.norm2() * pow(ucell.lat0,2);
-
-						rcut1 = pow(orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(),2);
-						rcut2 = pow(orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(),2);
-
-						if( distance1 < rcut1 && distance2 < rcut2 )
-						{
-							is_adj = true;
-							break;
-						}
-					}
-				}
-
-
-				if(is_adj)
-				{
-					// < psi1 | all projectors | psi2 >
-					// ----------------------------- enter the nnr increaing zone -------------------------
-					for (int ad0=0; ad0 < adjs.adj_num+1 ; ++ad0)
-					{
-						const int T0 = adjs.ntype[ad0];
-						const int I0 = adjs.natom[ad0];
-						const int iat = ucell.itia2iat(T0,I0);
-
-						// mohan add 2010-12-19
-						if( ucell.infoNL.nproj[T0] == 0) 
-						{
-							continue;
-						}
-
-						tau0 = adjs.adjacent_tau[ad0];
-
-						dtau1 = tau0 - tau1;
-						dtau2 = tau0 - tau2;
-						const double distance1 = dtau1.norm2() * pow(ucell.lat0,2);
-						const double distance2 = dtau2.norm2() * pow(ucell.lat0,2);
-
-						// seems a bug here!! mohan 2011-06-17
-						rcut1 = pow(orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(),2);
-						rcut2 = pow(orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(),2);
-
-						if(distance1 >= rcut1 || distance2 >= rcut2)
-						{
-							continue;
-						}
-						//const Atom* atom0 = &ucell.atoms[T0];
-						const int rx0=adjs.box[ad0].x;
-						const int ry0=adjs.box[ad0].y;
-						const int rz0=adjs.box[ad0].z;
-						key_tuple key1(iat1,-rx0,-ry0,-rz0);
-						key_tuple key2(iat2,rx2-rx0,ry2-ry0,rz2-rz0);
-						
-						std::unordered_map<int,std::vector<double>> *nlm_cur1_e; //left hand side, for energy
-						std::unordered_map<int,std::vector<std::vector<double>>> *nlm_cur1_f; //lhs, for force
-						std::unordered_map<int,std::vector<double>> *nlm_cur2_e; //rhs, for energy
-						std::unordered_map<int,std::vector<std::vector<double>>> *nlm_cur2_f; //rhs, for force
-
-						if(!calc_deri)
-						{
-							nlm_cur1_e = &nlm_tot[iat][key1];
-							nlm_cur2_e = &nlm_tot[iat][key2];
-						}
-						else
-						{
-							nlm_cur1_f = &nlm_tot1[iat][key1];
-							nlm_cur2_f = &nlm_tot1[iat][key2];
-						}
-				
-						int nnr_inner = 0;
-						
-						for (int j=0; j<atom1->nw*npol; j++)
-						{
-							const int j0 = j/npol;//added by zhengdy-soc
-							const int iw1_all = start1 + j;
+        for (int iat1 = 0; iat1 < ucell.nat; iat1++)
+        {
+            const int T1 = ucell.iat2it[iat1];
+            const Atom* atom1 = &ucell.atoms[T1];
+            const int I1 = ucell.iat2ia[iat1];
+            {
+                // GridD->Find_atom( atom1->tau[I1] );
+                AdjacentAtomInfo adjs;
+                GridD->Find_atom(ucell, atom1->tau[I1], T1, I1, &adjs);
+                const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+                // Record_adj.for_2d() may not called in some case
+                int nnr = pv->nlocstart ? pv->nlocstart[iat1] : 0;
+                tau1 = atom1->tau[I1];
+
+                // psi2
+                for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
+                {
+                    const int T2 = adjs.ntype[ad2];
+                    const Atom* atom2 = &ucell.atoms[T2];
+
+                    const int I2 = adjs.natom[ad2];
+                    const int iat2 = ucell.itia2iat(T2, I2);
+                    const int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+                    tau2 = adjs.adjacent_tau[ad2];
+
+                    bool is_adj = false;
+
+                    const int rx2 = adjs.box[ad2].x;
+                    const int ry2 = adjs.box[ad2].y;
+                    const int rz2 = adjs.box[ad2].z;
+
+                    dtau = tau2 - tau1;
+                    distance = dtau.norm2() * pow(ucell.lat0, 2);
+                    // this rcut is in order to make nnr consistent
+                    // with other matrix.
+                    rcut = pow(orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut(), 2);
+                    if (distance < rcut)
+                        is_adj = true;
+                    else if (distance >= rcut)
+                    {
+                        for (int ad0 = 0; ad0 < adjs.adj_num + 1; ++ad0)
+                        {
+                            const int T0 = adjs.ntype[ad0];
+
+                            tau0 = adjs.adjacent_tau[ad0];
+                            dtau1 = tau0 - tau1;
+                            dtau2 = tau0 - tau2;
+
+                            const double distance1 = dtau1.norm2() * pow(ucell.lat0, 2);
+                            const double distance2 = dtau2.norm2() * pow(ucell.lat0, 2);
+
+                            rcut1 = pow(orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(), 2);
+                            rcut2 = pow(orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(), 2);
+
+                            if (distance1 < rcut1 && distance2 < rcut2)
+                            {
+                                is_adj = true;
+                                break;
+                            }
+                        }
+                    }
+
+                    if (is_adj)
+                    {
+                        // < psi1 | all projectors | psi2 >
+                        // ----------------------------- enter the nnr increaing zone -------------------------
+                        for (int ad0 = 0; ad0 < adjs.adj_num + 1; ++ad0)
+                        {
+                            const int T0 = adjs.ntype[ad0];
+                            const int I0 = adjs.natom[ad0];
+                            const int iat = ucell.itia2iat(T0, I0);
+
+                            // mohan add 2010-12-19
+                            if (ucell.infoNL.nproj[T0] == 0)
+                            {
+                                continue;
+                            }
+
+                            tau0 = adjs.adjacent_tau[ad0];
+
+                            dtau1 = tau0 - tau1;
+                            dtau2 = tau0 - tau2;
+                            const double distance1 = dtau1.norm2() * pow(ucell.lat0, 2);
+                            const double distance2 = dtau2.norm2() * pow(ucell.lat0, 2);
+
+                            // seems a bug here!! mohan 2011-06-17
+                            rcut1 = pow(orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(), 2);
+                            rcut2 = pow(orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(), 2);
+
+                            if (distance1 >= rcut1 || distance2 >= rcut2)
+                            {
+                                continue;
+                            }
+                            // const Atom* atom0 = &ucell.atoms[T0];
+                            const int rx0 = adjs.box[ad0].x;
+                            const int ry0 = adjs.box[ad0].y;
+                            const int rz0 = adjs.box[ad0].z;
+                            key_tuple key1(iat1, -rx0, -ry0, -rz0);
+                            key_tuple key2(iat2, rx2 - rx0, ry2 - ry0, rz2 - rz0);
+
+                            std::unordered_map<int, std::vector<double>>* nlm_cur1_e; // left hand side, for energy
+                            std::unordered_map<int, std::vector<std::vector<double>>>* nlm_cur1_f; // lhs, for force
+                            std::unordered_map<int, std::vector<double>>* nlm_cur2_e;              // rhs, for energy
+                            std::unordered_map<int, std::vector<std::vector<double>>>* nlm_cur2_f; // rhs, for force
+
+                            if (!calc_deri)
+                            {
+                                nlm_cur1_e = &nlm_tot[iat][key1];
+                                nlm_cur2_e = &nlm_tot[iat][key2];
+                            }
+                            else
+                            {
+                                nlm_cur1_f = &nlm_tot1[iat][key1];
+                                nlm_cur2_f = &nlm_tot1[iat][key2];
+                            }
+
+                            int nnr_inner = 0;
+
+                            for (int j = 0; j < atom1->nw * npol; j++)
+                            {
+                                const int j0 = j / npol; // added by zhengdy-soc
+                                const int iw1_all = start1 + j;
+                                const int mu = pv->global2local_row(iw1_all);
+                                if (mu < 0)
+                                    continue;
+
+                                // fix a serious bug: atom2[T2] -> atom2
+                                // mohan 2010-12-20
+                                for (int k = 0; k < atom2->nw * npol; k++)
+                                {
+                                    const int k0 = k / npol;
+                                    const int iw2_all = start2 + k;
+                                    const int nu = pv->global2local_col(iw2_all);
+                                    if (nu < 0)
+                                        continue;
+
+                                    if (!calc_deri)
+                                    {
+                                        std::vector<double> nlm_1 = (*nlm_cur1_e)[iw1_all];
+                                        std::vector<double> nlm_2 = (*nlm_cur2_e)[iw2_all];
+                                        if (nspin == 2 || nspin == 1)
+                                        {
+                                            double nlm_tmp = 0.0;
+                                            const int nproj = ucell.infoNL.nproj[T0];
+                                            int ib = 0;
+                                            for (int nb = 0; nb < nproj; nb++)
+                                            {
+                                                const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
+                                                for (int m = 0; m < 2 * L0 + 1; m++)
+                                                {
+                                                    if (nlm_1[ib] != 0.0 && nlm_2[ib] != 0.0)
+                                                    {
+                                                        nlm_tmp += nlm_1[ib] * nlm_2[ib]
+                                                                   * ucell.atoms[T0].ncpp.dion(nb, nb);
+                                                    }
+                                                    ib += 1;
+                                                }
+                                            }
+                                            assert(ib == nlm_1.size());
+
+                                            if (gamma_only_local)
+                                            {
+                                                // mohan add 2010-12-20
+                                                if (nlm_tmp != 0.0)
+                                                {
+                                                    lm.set_HSgamma(iw1_all,
+                                                                   iw2_all,
+                                                                   nlm_tmp,
+                                                                   NLloc); // N stands for nonlocal.
+                                                }
+                                            }
+                                            else
+                                            {
+                                                if (nlm_tmp != 0.0)
+                                                {
+                                                    NLloc[nnr + nnr_inner] += nlm_tmp;
+                                                }
+                                            }
+                                        } // end nspin
+                                    }     // calc_deri
+                                    else  // calculate the derivative
+                                    {
+                                        if (nspin == 4)
+                                        {
+                                            std::vector<double> nlm_1 = (*nlm_cur2_f)[iw2_all][0];
+                                            std::vector<std::vector<double>> nlm_2;
+                                            nlm_2.resize(3);
+                                            for (int i = 0; i < 3; i++)
+                                            {
+                                                nlm_2[i] = (*nlm_cur1_f)[iw1_all][i + 1];
+                                            }
+                                            std::complex<double> nlm[4][3] = {ModuleBase::ZERO};
+                                            int is0 = (j - j0 * npol) + (k - k0 * npol) * 2;
+                                            for (int no = 0; no < ucell.atoms[T0].ncpp.non_zero_count_soc[is0]; no++)
+                                            {
+                                                const int p1 = ucell.atoms[T0].ncpp.index1_soc[is0][no];
+                                                const int p2 = ucell.atoms[T0].ncpp.index2_soc[is0][no];
+                                                if (is0 == 0)
+                                                {
+                                                    fsr.DHloc_fixedR_x[nnr + nnr_inner]
+                                                        += nlm_2[0][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_y[nnr + nnr_inner]
+                                                        += nlm_2[1][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_z[nnr + nnr_inner]
+                                                        += nlm_2[2][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
+                                                }
+                                                else if (is0 == 1)
+                                                {
+                                                    fsr.DHloc_fixedR_x[nnr + nnr_inner]
+                                                        += nlm_2[0][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_y[nnr + nnr_inner]
+                                                        += nlm_2[1][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_z[nnr + nnr_inner]
+                                                        += nlm_2[2][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())
+                                                           * 0.5;
+                                                }
+                                                else if (is0 == 2)
+                                                {
+                                                    fsr.DHloc_fixedR_x[nnr + nnr_inner]
+                                                        += nlm_2[0][p1] * nlm_1[p2]
+                                                           * (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_y[nnr + nnr_inner]
+                                                        += nlm_2[1][p1] * nlm_1[p2]
+                                                           * (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_z[nnr + nnr_inner]
+                                                        += nlm_2[2][p1] * nlm_1[p2]
+                                                           * (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())
+                                                           * 0.5;
+                                                }
+                                                else if (is0 == 3)
+                                                {
+                                                    fsr.DHloc_fixedR_x[nnr + nnr_inner]
+                                                        += nlm_2[0][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_y[nnr + nnr_inner]
+                                                        += nlm_2[1][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
+                                                    fsr.DHloc_fixedR_z[nnr + nnr_inner]
+                                                        += nlm_2[2][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
+                                                }
+                                            }
+                                        }
+                                        else if (nspin == 1 || nspin == 2)
+                                        {
+                                            if (gamma_only_local)
+                                            {
+                                                double nlm[3] = {0, 0, 0};
+
+                                                // sum all projectors for one atom.
+                                                std::vector<double> nlm_1 = (*nlm_cur1_f)[iw1_all][0];
+                                                std::vector<std::vector<double>> nlm_2;
+                                                nlm_2.resize(3);
+                                                for (int i = 0; i < 3; i++)
+                                                {
+                                                    nlm_2[i] = (*nlm_cur2_f)[iw2_all][i + 1];
+                                                }
+
+                                                assert(nlm_1.size() == nlm_2[0].size());
+
+                                                const int nproj = ucell.infoNL.nproj[T0];
+                                                int ib = 0;
+                                                for (int nb = 0; nb < nproj; nb++)
+                                                {
+                                                    const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
+                                                    for (int m = 0; m < 2 * L0 + 1; m++)
+                                                    {
+                                                        for (int ir = 0; ir < 3; ir++)
+                                                        {
+                                                            nlm[ir] += nlm_2[ir][ib] * nlm_1[ib]
+                                                                       * ucell.atoms[T0].ncpp.dion(nb, nb);
+                                                        }
+                                                        ib += 1;
+                                                    }
+                                                }
+                                                assert(ib == nlm_1.size());
+
+                                                LCAO_domain::set_force(*lm.ParaV,
+                                                                       iw1_all,
+                                                                       iw2_all,
+                                                                       nlm[0],
+                                                                       nlm[1],
+                                                                       nlm[2],
+                                                                       'N',
+                                                                       fsr.DSloc_x,
+                                                                       fsr.DSloc_y,
+                                                                       fsr.DSloc_z,
+                                                                       fsr.DHloc_fixed_x,
+                                                                       fsr.DHloc_fixed_y,
+                                                                       fsr.DHloc_fixed_z);
+                                            }
+                                            else
+                                            {
+                                                // mohan change the order on 2011-06-17
+                                                // origin: < psi1 | beta > < beta | dpsi2/dtau >
+                                                // now: < psi1/dtau | beta > < beta | psi2 >
+                                                double nlm[3] = {0, 0, 0};
+
+                                                // sum all projectors for one atom.
+                                                std::vector<double> nlm_1 = (*nlm_cur2_f)[iw2_all][0];
+                                                std::vector<std::vector<double>> nlm_2;
+                                                nlm_2.resize(3);
+                                                for (int i = 0; i < 3; i++)
+                                                {
+                                                    nlm_2[i] = (*nlm_cur1_f)[iw1_all][i + 1];
+                                                }
+
+                                                assert(nlm_1.size() == nlm_2[0].size());
+
+                                                const int nproj = ucell.infoNL.nproj[T0];
+                                                int ib = 0;
+                                                for (int nb = 0; nb < nproj; nb++)
+                                                {
+                                                    const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
+                                                    for (int m = 0; m < 2 * L0 + 1; m++)
+                                                    {
+                                                        for (int ir = 0; ir < 3; ir++)
+                                                        {
+                                                            nlm[ir] += nlm_2[ir][ib] * nlm_1[ib]
+                                                                       * ucell.atoms[T0].ncpp.dion(nb, nb);
+                                                        }
+                                                        ib += 1;
+                                                    }
+                                                }
+                                                assert(ib == nlm_1.size());
+
+                                                fsr.DHloc_fixedR_x[nnr + nnr_inner] += nlm[0];
+                                                fsr.DHloc_fixedR_y[nnr + nnr_inner] += nlm[1];
+                                                fsr.DHloc_fixedR_z[nnr + nnr_inner] += nlm[2];
+                                            }
+                                        }
+                                        else
+                                        {
+                                            ModuleBase::WARNING_QUIT("LCAO_domain::build_Nonlocal_mu_new",
+                                                                     "nspin must be 1, 2 or 4");
+                                        }
+                                    } //! calc_deri
+                                    nnr_inner++;
+                                } // k
+                            }     // j
+                        }         // ad0
+
+                        // outer circle : accumulate nnr
+                        for (int j = 0; j < atom1->nw * npol; j++)
+                        {
+                            const int j0 = j / npol; // added by zhengdy-soc
+                            const int iw1_all = start1 + j;
                             const int mu = pv->global2local_row(iw1_all);
-							if(mu < 0)continue; 
-
-							// fix a serious bug: atom2[T2] -> atom2
-							// mohan 2010-12-20
-							for (int k=0; k<atom2->nw*npol; k++)
-							{
-								const int k0 = k/npol;
-								const int iw2_all = start2 + k;
+                            if (mu < 0)
+                            {
+                                continue;
+                            }
+
+                            // fix a serious bug: atom2[T2] -> atom2
+                            // mohan 2010-12-20
+                            for (int k = 0; k < atom2->nw * npol; k++)
+                            {
+                                const int k0 = k / npol;
+                                const int iw2_all = start2 + k;
                                 const int nu = pv->global2local_col(iw2_all);
-								if(nu < 0)continue;
-
-								if(!calc_deri)
-								{
-									std::vector<double> nlm_1=(*nlm_cur1_e)[iw1_all];
-									std::vector<double> nlm_2=(*nlm_cur2_e)[iw2_all];
-									if(nspin == 2 || nspin == 1)
-									{
-										double nlm_tmp = 0.0;
-										const int nproj = ucell.infoNL.nproj[T0];
-										int ib = 0;
-										for (int nb = 0; nb < nproj; nb++)
-										{
-											const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-											for(int m=0;m<2*L0+1;m++)
-											{
-												if(nlm_1[ib]!=0.0 && nlm_2[ib]!=0.0)
-												{
-													nlm_tmp += nlm_1[ib]*nlm_2[ib]*ucell.atoms[T0].ncpp.dion(nb,nb);
-												}
-												ib+=1;
-											}
-										}
-										assert(ib==nlm_1.size());
-
-										if(gamma_only_local)
-										{
-											// mohan add 2010-12-20
-											if( nlm_tmp!=0.0 )
-											{
-                                                lm.set_HSgamma(iw1_all, iw2_all, nlm_tmp, NLloc);//N stands for nonlocal.
-											}
-										}
-										else
-										{
-											if( nlm_tmp!=0.0 )
-											{
-												NLloc[nnr+nnr_inner] += nlm_tmp;
-											}
-										}
-									}// end nspin
-								}// calc_deri
-								else // calculate the derivative
-								{
-									if (nspin == 4)
-									{
-										std::vector<double> nlm_1 = (*nlm_cur2_f)[iw2_all][0];
-										std::vector<std::vector<double>> nlm_2;
-										nlm_2.resize(3);
-										for (int i=0; i< 3; i++)
-										{
-											nlm_2[i] = (*nlm_cur1_f)[iw1_all][i+1];
-										}
-										std::complex<double> nlm[4][3] = {ModuleBase::ZERO};
-										int is0 = (j-j0*npol) + (k-k0*npol)*2;
-										for (int no=0; no < ucell.atoms[T0].ncpp.non_zero_count_soc[is0]; no++)
-										{
-											const int p1 = ucell.atoms[T0].ncpp.index1_soc[is0][no];
-											const int p2 = ucell.atoms[T0].ncpp.index2_soc[is0][no];
-											if (is0 == 0)
-											{
-												fsr.DHloc_fixedR_x[nnr+nnr_inner] += nlm_2[0][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-														+ ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
-												fsr.DHloc_fixedR_y[nnr+nnr_inner] += nlm_2[1][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-														+ ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
-												fsr.DHloc_fixedR_z[nnr+nnr_inner] += nlm_2[2][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-														+ ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
-											}
-											else if (is0 == 1)
-											{
-												fsr.DHloc_fixedR_x[nnr+nnr_inner] += nlm_2[0][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
-														+ ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())*0.5;
-												fsr.DHloc_fixedR_y[nnr+nnr_inner] += nlm_2[1][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
-														+ ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())*0.5;
-												fsr.DHloc_fixedR_z[nnr+nnr_inner] += nlm_2[2][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
-														+ ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())*0.5;
-											}
-											else if (is0 == 2)
-											{
-												fsr.DHloc_fixedR_x[nnr+nnr_inner] += nlm_2[0][p1]*nlm_1[p2]*
-														(-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
-														+ ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())*0.5;
-												fsr.DHloc_fixedR_y[nnr+nnr_inner] += nlm_2[1][p1]*nlm_1[p2]*
-														(-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
-														+ ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())*0.5;
-												fsr.DHloc_fixedR_z[nnr+nnr_inner] += nlm_2[2][p1]*nlm_1[p2]*
-														(-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
-														+ ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())*0.5;
-											}
-											else if (is0 == 3)
-											{
-												fsr.DHloc_fixedR_x[nnr+nnr_inner] += nlm_2[0][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-														- ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
-												fsr.DHloc_fixedR_y[nnr+nnr_inner] += nlm_2[1][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-														- ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
-												fsr.DHloc_fixedR_z[nnr+nnr_inner] += nlm_2[2][p1]*nlm_1[p2]*
-														(ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-														- ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
-											}
-										}
-									}
-									else if (nspin == 1 || nspin == 2)
-									{
-										if(gamma_only_local)
-										{
-											double nlm[3]={0,0,0};
-
-											// sum all projectors for one atom.
-											std::vector<double> nlm_1 = (*nlm_cur1_f)[iw1_all][0];
-											std::vector<std::vector<double>> nlm_2;
-											nlm_2.resize(3);
-											for(int i=0;i<3;i++)
-											{
-												nlm_2[i] = (*nlm_cur2_f)[iw2_all][i+1];
-											}
-
-											assert(nlm_1.size()==nlm_2[0].size());
-
-											const int nproj = ucell.infoNL.nproj[T0];
-											int ib = 0;
-											for (int nb = 0; nb < nproj; nb++)
-											{
-												const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-												for(int m=0;m<2*L0+1;m++)
-												{
-													for(int ir=0;ir<3;ir++)
-													{
-														nlm[ir] += nlm_2[ir][ib]*nlm_1[ib]*ucell.atoms[T0].ncpp.dion(nb,nb);
-													}
-													ib+=1;
-												}
-											}
-											assert(ib==nlm_1.size());
-
-											LCAO_domain::set_force(
-													*lm.ParaV,
-													iw1_all,
-													iw2_all,
-													nlm[0],
-													nlm[1],
-													nlm[2],
-													'N',
-													fsr.DSloc_x,
-													fsr.DSloc_y,
-													fsr.DSloc_z,
-													fsr.DHloc_fixed_x,
-													fsr.DHloc_fixed_y,
-													fsr.DHloc_fixed_z);
-
-										}
-										else
-										{
-											// mohan change the order on 2011-06-17
-											// origin: < psi1 | beta > < beta | dpsi2/dtau >
-											//now: < psi1/dtau | beta > < beta | psi2 >
-											double nlm[3]={0,0,0};
-
-											// sum all projectors for one atom.
-											std::vector<double> nlm_1 = (*nlm_cur2_f)[iw2_all][0];
-											std::vector<std::vector<double>> nlm_2;
-											nlm_2.resize(3);
-											for(int i=0;i<3;i++)
-											{
-												nlm_2[i] = (*nlm_cur1_f)[iw1_all][i+1];
-											}
-
-											assert(nlm_1.size()==nlm_2[0].size());
-
-											const int nproj = ucell.infoNL.nproj[T0];
-											int ib = 0;
-											for (int nb = 0; nb < nproj; nb++)
-											{
-												const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-												for(int m=0;m<2*L0+1;m++)
-												{
-													for(int ir=0;ir<3;ir++)
-													{
-														nlm[ir] += nlm_2[ir][ib]*nlm_1[ib]*ucell.atoms[T0].ncpp.dion(nb,nb);
-													}
-													ib+=1;
-												}
-											}
-											assert(ib==nlm_1.size());
-
-											fsr.DHloc_fixedR_x[nnr+nnr_inner] += nlm[0];
-											fsr.DHloc_fixedR_y[nnr+nnr_inner] += nlm[1];
-											fsr.DHloc_fixedR_z[nnr+nnr_inner] += nlm[2];
-										}
-									}
-									else
-									{
-										ModuleBase::WARNING_QUIT("LCAO_domain::build_Nonlocal_mu_new","nspin must be 1, 2 or 4");
-									}
-								}//!calc_deri
-								nnr_inner++;
-							}// k
-						} // j 
-					} // ad0
-
-					//outer circle : accumulate nnr
-					for (int j=0; j<atom1->nw*npol; j++)
-					{
-						const int j0 = j/npol;//added by zhengdy-soc
-						const int iw1_all = start1 + j;
-						const int mu = pv->global2local_row(iw1_all);
-						if(mu < 0)
-						{
-							continue; 
-						}
-
-						// fix a serious bug: atom2[T2] -> atom2
-						// mohan 2010-12-20
-						for (int k=0; k<atom2->nw*npol; k++)
-						{
-							const int k0 = k/npol;
-							const int iw2_all = start2 + k;
-                            const int nu = pv->global2local_col(iw2_all);
-							if(nu < 0)
-							{
-								continue;
-							}
-							total_nnr++;
-							nnr++;
-						}
-					}
-				}// end is_adj
-			} // ad2
-		} // I1
-	} // T1
+                                if (nu < 0)
+                                {
+                                    continue;
+                                }
+                                total_nnr++;
+                                nnr++;
+                            }
+                        }
+                    } // end is_adj
+                }     // ad2
+            }         // I1
+        }             // T1
 #ifdef _OPENMP
-}
+    }
 #endif
-	if(!gamma_only_local)
-	{
-		if( total_nnr!=pv->nnr)
-		{
-			GlobalV::ofs_running << " nr="  << total_nnr << std::endl;
-			GlobalV::ofs_running << " pv->nnr=" << pv->nnr << std::endl;
-			ModuleBase::WARNING_QUIT("LCAO_domain::build_Nonlocal_mu_new","nnr!=LNNR.nnr");
-		}
-	}
+    if (!gamma_only_local)
+    {
+        if (total_nnr != pv->nnr)
+        {
+            GlobalV::ofs_running << " nr=" << total_nnr << std::endl;
+            GlobalV::ofs_running << " pv->nnr=" << pv->nnr << std::endl;
+            ModuleBase::WARNING_QUIT("LCAO_domain::build_Nonlocal_mu_new", "nnr!=LNNR.nnr");
+        }
+    }
 
     ModuleBase::timer::tick("LCAO_domain", "vnl_mu_new");
-	return;
+    return;
 }
 
-
-}
+} // namespace LCAO_domain
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp
index 60897d844d..103102c4f5 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/LCAO_set_st.cpp
@@ -1,589 +1,555 @@
-#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 #include "module_base/timer.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/LCAO_domain.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h" // only for INPUT
 
 namespace LCAO_domain
 {
 
-void single_derivative(
-    ForceStressArrays& fsr,
-    const LCAO_Orbitals& orb,
-	const ORB_gen_tables& uot,
-    const Parallel_Orbitals& pv,
-    const UnitCell& ucell,
-    const int nspin,
-    const bool cal_stress,
-    const int iw1_all,
-    const int iw2_all,
-    const int m1, 
-    const int m2,
-    const char &dtype,
-    const int T1,
-    const int L1,
-    const int N1,
-    const int T2,
-    const int L2,
-    const int N2,
-    const ModuleBase::Vector3<double> &dtau,
-    const ModuleBase::Vector3<double> &tau1,
-    const ModuleBase::Vector3<double> &tau2,
-    const int npol,
-    const int jj,
-    const int jj0,
-    const int kk,
-    const int kk0,
-    int& nnr,
-    int& total_nnr,
-    double *olm // output value 
+void single_derivative(ForceStressArrays& fsr,
+                       const LCAO_Orbitals& orb,
+                       const ORB_gen_tables& uot,
+                       const Parallel_Orbitals& pv,
+                       const UnitCell& ucell,
+                       const int nspin,
+                       const bool cal_stress,
+                       const int iw1_all,
+                       const int iw2_all,
+                       const int m1,
+                       const int m2,
+                       const char& dtype,
+                       const int T1,
+                       const int L1,
+                       const int N1,
+                       const int T2,
+                       const int L2,
+                       const int N2,
+                       const ModuleBase::Vector3<double>& dtau,
+                       const ModuleBase::Vector3<double>& tau1,
+                       const ModuleBase::Vector3<double>& tau2,
+                       const int npol,
+                       const int jj,
+                       const int jj0,
+                       const int kk,
+                       const int kk0,
+                       int& nnr,
+                       int& total_nnr,
+                       double* olm // output value
 )
 {
 
     const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
 
-	// convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-	const int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
-	const int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
-	switch (dtype)
-	{
-		case 'S':
-			uot.two_center_bundle->overlap_orb->calculate(
-					T1,
-					L1,
-					N1,
-					M1,
-					T2,
-					L2,
-					N2,
-					M2,
-					dtau * ucell.lat0,
-					nullptr,
-					olm);
-			break;
-		case 'T':
-			uot.two_center_bundle->kinetic_orb->calculate(
-					T1,
-					L1,
-					N1,
-					M1,
-					T2,
-					L2,
-					N2,
-					M2,
-					dtau * ucell.lat0,
-					nullptr,
-					olm);
-			break;
-		default:  // not supposed to happen
-			ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","dtype must be S or T");
-	}
-
-	// condition 7: gamma only or multiple k
-	if(gamma_only_local)
-	{
-		LCAO_domain::set_force(
-				pv,
-				iw1_all,
-				iw2_all,
-				olm[0],
-				olm[1],
-				olm[2],
-				dtype,
-				fsr.DSloc_x,
-				fsr.DSloc_y,
-				fsr.DSloc_z,
-				fsr.DHloc_fixed_x,
-				fsr.DHloc_fixed_y,
-				fsr.DHloc_fixed_z);
-
-		if(cal_stress)
-		{
-			LCAO_domain::set_stress(
-					pv,
-					iw1_all,
-					iw2_all,
-					olm[0],
-					olm[1],
-					olm[2],
-					dtype,
-					dtau,
-					fsr.DSloc_11,
-					fsr.DSloc_12,
-					fsr.DSloc_13,
-					fsr.DSloc_22,
-					fsr.DSloc_23,
-					fsr.DSloc_33,
-					fsr.DHloc_fixed_11,
-					fsr.DHloc_fixed_12,
-					fsr.DHloc_fixed_13,
-					fsr.DHloc_fixed_22,
-					fsr.DHloc_fixed_23,
-					fsr.DHloc_fixed_33);
-		}// end stress
-	}// end gamma_only
-	else // condition 7, multiple k-points algorithm
-	{
-		// condition 8, S or T
-		if(dtype=='S')
-		{
-			// condition 9, nspin
-			if (nspin == 1 || nspin ==2)
-			{
-				fsr.DSloc_Rx[nnr] = olm[0];
-				fsr.DSloc_Ry[nnr] = olm[1];
-				fsr.DSloc_Rz[nnr] = olm[2];
-			}
-			else if (nspin == 4)
-			{
-				int is = (jj-jj0*npol) + (kk-kk0*npol)*2;
-				if (is == 0) // is==3 is not needed in force calculation
-				{
-					fsr.DSloc_Rx[nnr] = olm[0];
-					fsr.DSloc_Ry[nnr] = olm[1];
-					fsr.DSloc_Rz[nnr] = olm[2];
-				}
-				else
-				{
-					fsr.DSloc_Rx[nnr] = 0.0;
-					fsr.DSloc_Ry[nnr] = 0.0;
-					fsr.DSloc_Rz[nnr] = 0.0;
-				}
-			}
-			else
-			{
-				ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","nspin must be 1, 2 or 4");
-			}// end condition 9, nspin
-
-			if(cal_stress)
-			{
-				fsr.DH_r[nnr*3] = dtau.x;
-				fsr.DH_r[nnr*3 + 1] = dtau.y;
-				fsr.DH_r[nnr*3 + 2] = dtau.z;
-			}
-		}
-		else if(dtype=='T') // condition 8, S or T
-		{
-			// condtion 9, nspin
-			if (nspin == 1 || nspin ==2)
-			{
-				fsr.DHloc_fixedR_x[nnr] = olm[0];
-				fsr.DHloc_fixedR_y[nnr] = olm[1];
-				fsr.DHloc_fixedR_z[nnr] = olm[2];
-				if(cal_stress)
-				{
-					fsr.stvnl11[nnr] = olm[0] * dtau.x;
-					fsr.stvnl12[nnr] = olm[0] * dtau.y;
-					fsr.stvnl13[nnr] = olm[0] * dtau.z;
-					fsr.stvnl22[nnr] = olm[1] * dtau.y;
-					fsr.stvnl23[nnr] = olm[1] * dtau.z;
-					fsr.stvnl33[nnr] = olm[2] * dtau.z;
-				}
-			}
-			else if (nspin == 4)// condition 9
-			{
-				const int is = (jj-jj0*npol) + (kk-kk0*npol)*2;
-				// condition 10, details of nspin 4
-				if (is == 0) // is==3 is not needed in force calculation
-				{
-					fsr.DHloc_fixedR_x[nnr] = olm[0];
-					fsr.DHloc_fixedR_y[nnr] = olm[1];
-					fsr.DHloc_fixedR_z[nnr] = olm[2];
-					if(cal_stress)
-					{
-						fsr.stvnl11[nnr] = olm[0] * dtau.x;
-						fsr.stvnl12[nnr] = olm[0] * dtau.y;
-						fsr.stvnl13[nnr] = olm[0] * dtau.z;
-						fsr.stvnl22[nnr] = olm[1] * dtau.y;
-						fsr.stvnl23[nnr] = olm[1] * dtau.z;
-						fsr.stvnl33[nnr] = olm[2] * dtau.z;
-					}
-				}
-				else if (is == 1 || is == 2 || is == 3)
-				{
-					fsr.DHloc_fixedR_x[nnr] = 0.0;
-					fsr.DHloc_fixedR_y[nnr] = 0.0;
-					fsr.DHloc_fixedR_z[nnr] = 0.0;
-					if(cal_stress)
-					{
-						fsr.stvnl11[nnr] = 0.0;
-						fsr.stvnl12[nnr] = 0.0;
-						fsr.stvnl13[nnr] = 0.0;
-						fsr.stvnl22[nnr] = 0.0;
-						fsr.stvnl23[nnr] = 0.0;
-						fsr.stvnl33[nnr] = 0.0;
-					}
-				}
-				else
-				{
-					ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","is must be 0, 1, 2, 3");
-				}// end condition 10, details of spin 4
-			}
-			else
-			{
-				ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","nspin must be 1, 2 or 4");
-			}// end condition 9, nspin
-		}// end condition 8, S or T
-		++total_nnr;
-		++nnr;
-	}// end condition 7, gamma or multiple k
+    // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+    const int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+    const int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+    switch (dtype)
+    {
+    case 'S':
+        uot.two_center_bundle->overlap_orb->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell.lat0, nullptr, olm);
+        break;
+    case 'T':
+        uot.two_center_bundle->kinetic_orb->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell.lat0, nullptr, olm);
+        break;
+    default: // not supposed to happen
+        ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "dtype must be S or T");
+    }
+
+    // condition 7: gamma only or multiple k
+    if (gamma_only_local)
+    {
+        LCAO_domain::set_force(pv,
+                               iw1_all,
+                               iw2_all,
+                               olm[0],
+                               olm[1],
+                               olm[2],
+                               dtype,
+                               fsr.DSloc_x,
+                               fsr.DSloc_y,
+                               fsr.DSloc_z,
+                               fsr.DHloc_fixed_x,
+                               fsr.DHloc_fixed_y,
+                               fsr.DHloc_fixed_z);
+
+        if (cal_stress)
+        {
+            LCAO_domain::set_stress(pv,
+                                    iw1_all,
+                                    iw2_all,
+                                    olm[0],
+                                    olm[1],
+                                    olm[2],
+                                    dtype,
+                                    dtau,
+                                    fsr.DSloc_11,
+                                    fsr.DSloc_12,
+                                    fsr.DSloc_13,
+                                    fsr.DSloc_22,
+                                    fsr.DSloc_23,
+                                    fsr.DSloc_33,
+                                    fsr.DHloc_fixed_11,
+                                    fsr.DHloc_fixed_12,
+                                    fsr.DHloc_fixed_13,
+                                    fsr.DHloc_fixed_22,
+                                    fsr.DHloc_fixed_23,
+                                    fsr.DHloc_fixed_33);
+        } // end stress
+    }     // end gamma_only
+    else  // condition 7, multiple k-points algorithm
+    {
+        // condition 8, S or T
+        if (dtype == 'S')
+        {
+            // condition 9, nspin
+            if (nspin == 1 || nspin == 2)
+            {
+                fsr.DSloc_Rx[nnr] = olm[0];
+                fsr.DSloc_Ry[nnr] = olm[1];
+                fsr.DSloc_Rz[nnr] = olm[2];
+            }
+            else if (nspin == 4)
+            {
+                int is = (jj - jj0 * npol) + (kk - kk0 * npol) * 2;
+                if (is == 0) // is==3 is not needed in force calculation
+                {
+                    fsr.DSloc_Rx[nnr] = olm[0];
+                    fsr.DSloc_Ry[nnr] = olm[1];
+                    fsr.DSloc_Rz[nnr] = olm[2];
+                }
+                else
+                {
+                    fsr.DSloc_Rx[nnr] = 0.0;
+                    fsr.DSloc_Ry[nnr] = 0.0;
+                    fsr.DSloc_Rz[nnr] = 0.0;
+                }
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "nspin must be 1, 2 or 4");
+            } // end condition 9, nspin
+
+            if (cal_stress)
+            {
+                fsr.DH_r[nnr * 3] = dtau.x;
+                fsr.DH_r[nnr * 3 + 1] = dtau.y;
+                fsr.DH_r[nnr * 3 + 2] = dtau.z;
+            }
+        }
+        else if (dtype == 'T') // condition 8, S or T
+        {
+            // condtion 9, nspin
+            if (nspin == 1 || nspin == 2)
+            {
+                fsr.DHloc_fixedR_x[nnr] = olm[0];
+                fsr.DHloc_fixedR_y[nnr] = olm[1];
+                fsr.DHloc_fixedR_z[nnr] = olm[2];
+                if (cal_stress)
+                {
+                    fsr.stvnl11[nnr] = olm[0] * dtau.x;
+                    fsr.stvnl12[nnr] = olm[0] * dtau.y;
+                    fsr.stvnl13[nnr] = olm[0] * dtau.z;
+                    fsr.stvnl22[nnr] = olm[1] * dtau.y;
+                    fsr.stvnl23[nnr] = olm[1] * dtau.z;
+                    fsr.stvnl33[nnr] = olm[2] * dtau.z;
+                }
+            }
+            else if (nspin == 4) // condition 9
+            {
+                const int is = (jj - jj0 * npol) + (kk - kk0 * npol) * 2;
+                // condition 10, details of nspin 4
+                if (is == 0) // is==3 is not needed in force calculation
+                {
+                    fsr.DHloc_fixedR_x[nnr] = olm[0];
+                    fsr.DHloc_fixedR_y[nnr] = olm[1];
+                    fsr.DHloc_fixedR_z[nnr] = olm[2];
+                    if (cal_stress)
+                    {
+                        fsr.stvnl11[nnr] = olm[0] * dtau.x;
+                        fsr.stvnl12[nnr] = olm[0] * dtau.y;
+                        fsr.stvnl13[nnr] = olm[0] * dtau.z;
+                        fsr.stvnl22[nnr] = olm[1] * dtau.y;
+                        fsr.stvnl23[nnr] = olm[1] * dtau.z;
+                        fsr.stvnl33[nnr] = olm[2] * dtau.z;
+                    }
+                }
+                else if (is == 1 || is == 2 || is == 3)
+                {
+                    fsr.DHloc_fixedR_x[nnr] = 0.0;
+                    fsr.DHloc_fixedR_y[nnr] = 0.0;
+                    fsr.DHloc_fixedR_z[nnr] = 0.0;
+                    if (cal_stress)
+                    {
+                        fsr.stvnl11[nnr] = 0.0;
+                        fsr.stvnl12[nnr] = 0.0;
+                        fsr.stvnl13[nnr] = 0.0;
+                        fsr.stvnl22[nnr] = 0.0;
+                        fsr.stvnl23[nnr] = 0.0;
+                        fsr.stvnl33[nnr] = 0.0;
+                    }
+                }
+                else
+                {
+                    ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "is must be 0, 1, 2, 3");
+                } // end condition 10, details of spin 4
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "nspin must be 1, 2 or 4");
+            } // end condition 9, nspin
+        }     // end condition 8, S or T
+        ++total_nnr;
+        ++nnr;
+    } // end condition 7, gamma or multiple k
 }
 
-
-void single_overlap(
-    LCAO_Matrix& lm,
-    const LCAO_Orbitals& orb,
-	const ORB_gen_tables& uot,
-    const Parallel_Orbitals& pv,
-    const UnitCell& ucell,
-    const int nspin,
-    const bool cal_stress,
-    const int iw1_all,
-    const int iw2_all,
-    const int m1, 
-    const int m2,
-    const char &dtype,
-    const int T1,
-    const int L1,
-    const int N1,
-    const int T2,
-    const int L2,
-    const int N2,
-    const ModuleBase::Vector3<double> &dtau,
-    const ModuleBase::Vector3<double> &tau1,
-    const ModuleBase::Vector3<double> &tau2,
-    const int npol,
-    const int jj,
-    const int jj0,
-    const int kk,
-    const int kk0,
-    int& nnr, // output value
-    int& total_nnr, // output value
-    double *olm, // output value 
-	double *HSloc // output value
+void single_overlap(LCAO_Matrix& lm,
+                    const LCAO_Orbitals& orb,
+                    const ORB_gen_tables& uot,
+                    const Parallel_Orbitals& pv,
+                    const UnitCell& ucell,
+                    const int nspin,
+                    const bool cal_stress,
+                    const int iw1_all,
+                    const int iw2_all,
+                    const int m1,
+                    const int m2,
+                    const char& dtype,
+                    const int T1,
+                    const int L1,
+                    const int N1,
+                    const int T2,
+                    const int L2,
+                    const int N2,
+                    const ModuleBase::Vector3<double>& dtau,
+                    const ModuleBase::Vector3<double>& tau1,
+                    const ModuleBase::Vector3<double>& tau2,
+                    const int npol,
+                    const int jj,
+                    const int jj0,
+                    const int kk,
+                    const int kk0,
+                    int& nnr,       // output value
+                    int& total_nnr, // output value
+                    double* olm,    // output value
+                    double* HSloc   // output value
 )
 {
     const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
 
-	// convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-	const int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
-	const int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
-
-	switch (dtype)
-	{
-		case 'S':
-			uot.two_center_bundle->overlap_orb->calculate(T1, L1, N1, M1,
-					T2, L2, N2, M2, dtau * ucell.lat0, olm);
-			break;
-		case 'T':
-			uot.two_center_bundle->kinetic_orb->calculate(T1, L1, N1, M1,
-					T2, L2, N2, M2, dtau * ucell.lat0, olm);
-			break;
-		default:  // not supposed to happen
-			ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","dtype must be S or T");
-	}
-
-	// When NSPIN == 4 , only diagonal term is calculated for T or S Operators
-	// use olm1 to store the diagonal term with complex data type.
-	std::complex<double> olm1[4];
-
-	if(nspin == 4)
-	{
-		olm1[0] = std::complex<double>(olm[0], 0.0);
-		olm1[1] = ModuleBase::ZERO;
-		olm1[2] = ModuleBase::ZERO;
-		olm1[3] = std::complex<double>(olm[0], 0.0);
-	}
-
-
-	// condition 7, gamma only or multiple k-points
-	if(gamma_only_local)
-	{
-		// mohan add 2010-06-29
-		// set the value in Hloc and Sloc
-		// according to global2local_row and global2local_col
-		// the last paramete: 1 for Sloc, 2 for Hloc
-		// and 3 for Hloc_fixed.
-		lm.set_HSgamma(iw1_all, iw2_all, olm[0], HSloc);
-	}
-	else // condition 7, multiple k-points algorithm
-	{
-		// condition 8, S or T
-		if(dtype=='S')
-		{
-			// condition 9, nspin
-			if (nspin == 1 || nspin ==2)
-			{
-				HSloc[nnr] = olm[0];
-			}
-			else
-			{
-				ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","nspin must be 1, 2 or 4");
-			}
-		}
-		else if(dtype=='T') // condition 8, S or T
-		{
-			// condition 9, nspin
-			if(nspin == 1 || nspin ==2)
-			{
-				HSloc[nnr] = olm[0];// <phi|kin|d phi>
-			}
-			else if (nspin == 4)
-			{//only has diagonal term here.
-			}
-			else
-			{
-				ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","nspin must be 1, 2 or 4");
-			}
-		}// end condition 8, S or T
-		++total_nnr;
-		++nnr;
-	}// end condition 7, gamma point or multiple k-points
-}
+    // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+    const int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
+    const int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
 
+    switch (dtype)
+    {
+    case 'S':
+        uot.two_center_bundle->overlap_orb->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell.lat0, olm);
+        break;
+    case 'T':
+        uot.two_center_bundle->kinetic_orb->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * ucell.lat0, olm);
+        break;
+    default: // not supposed to happen
+        ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "dtype must be S or T");
+    }
+
+    // When NSPIN == 4 , only diagonal term is calculated for T or S Operators
+    // use olm1 to store the diagonal term with complex data type.
+    std::complex<double> olm1[4];
+
+    if (nspin == 4)
+    {
+        olm1[0] = std::complex<double>(olm[0], 0.0);
+        olm1[1] = ModuleBase::ZERO;
+        olm1[2] = ModuleBase::ZERO;
+        olm1[3] = std::complex<double>(olm[0], 0.0);
+    }
+
+    // condition 7, gamma only or multiple k-points
+    if (gamma_only_local)
+    {
+        // mohan add 2010-06-29
+        // set the value in Hloc and Sloc
+        // according to global2local_row and global2local_col
+        // the last paramete: 1 for Sloc, 2 for Hloc
+        // and 3 for Hloc_fixed.
+        lm.set_HSgamma(iw1_all, iw2_all, olm[0], HSloc);
+    }
+    else // condition 7, multiple k-points algorithm
+    {
+        // condition 8, S or T
+        if (dtype == 'S')
+        {
+            // condition 9, nspin
+            if (nspin == 1 || nspin == 2)
+            {
+                HSloc[nnr] = olm[0];
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "nspin must be 1, 2 or 4");
+            }
+        }
+        else if (dtype == 'T') // condition 8, S or T
+        {
+            // condition 9, nspin
+            if (nspin == 1 || nspin == 2)
+            {
+                HSloc[nnr] = olm[0]; // <phi|kin|d phi>
+            }
+            else if (nspin == 4)
+            { // only has diagonal term here.
+            }
+            else
+            {
+                ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "nspin must be 1, 2 or 4");
+            }
+        } // end condition 8, S or T
+        ++total_nnr;
+        ++nnr;
+    } // end condition 7, gamma point or multiple k-points
+}
 
-void build_ST_new(
-    LCAO_Matrix& lm,
-    ForceStressArrays& fsr,
-    const char& dtype,
-	const bool& calc_deri,
-	const UnitCell &ucell,
-	const LCAO_Orbitals& orb,
-    const Parallel_Orbitals& pv,
-	const ORB_gen_tables& uot,
-	Grid_Driver* GridD,
-	double* HSloc,
-	bool cal_syns,
-	double dmax)
+void build_ST_new(LCAO_Matrix& lm,
+                  ForceStressArrays& fsr,
+                  const char& dtype,
+                  const bool& calc_deri,
+                  const UnitCell& ucell,
+                  const LCAO_Orbitals& orb,
+                  const Parallel_Orbitals& pv,
+                  const ORB_gen_tables& uot,
+                  Grid_Driver* GridD,
+                  double* HSloc,
+                  bool cal_syns,
+                  double dmax)
 {
-    ModuleBase::TITLE("LCAO_domain","build_ST_new");
-    ModuleBase::timer::tick("LCAO_domain","build_ST_new");
+    ModuleBase::TITLE("LCAO_domain", "build_ST_new");
+    ModuleBase::timer::tick("LCAO_domain", "build_ST_new");
 
-	const int nspin = GlobalV::NSPIN;
-	const int npol = GlobalV::NPOL;
-	const bool cal_stress = GlobalV::CAL_STRESS;
-	const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
+    const int nspin = GlobalV::NSPIN;
+    const int npol = GlobalV::NPOL;
+    const bool cal_stress = GlobalV::CAL_STRESS;
+    const bool gamma_only_local = GlobalV::GAMMA_ONLY_LOCAL;
 
-	int total_nnr = 0;
+    int total_nnr = 0;
 #ifdef _OPENMP
-#pragma omp parallel reduction(+:total_nnr)
-{
+#pragma omp parallel reduction(+ : total_nnr)
+    {
 #endif
-    //array to store data
-    double olm[3]={0.0,0.0,0.0};
+        // array to store data
+        double olm[3] = {0.0, 0.0, 0.0};
 
-    //\sum{T} e**{ikT} <\phi_{ia}|d\phi_{k\beta}(T)>
-	ModuleBase::Vector3<double> tau1, tau2, dtau;
-	ModuleBase::Vector3<double> dtau1, dtau2, tau0;
+        //\sum{T} e**{ikT} <\phi_{ia}|d\phi_{k\beta}(T)>
+        ModuleBase::Vector3<double> tau1, tau2, dtau;
+        ModuleBase::Vector3<double> dtau1, dtau2, tau0;
 
 #ifdef _OPENMP
 // use schedule(dynamic) for load balancing because adj_num is various
 #pragma omp for schedule(dynamic)
 #endif
-    for (int iat1 = 0; iat1 < ucell.nat; iat1++) // loop 1, iat1
-    {
-		const int T1 = ucell.iat2it[iat1];
-		const Atom* atom1 = &ucell.atoms[T1];
-		const int I1 = ucell.iat2ia[iat1];
- 
-		tau1 = atom1->tau[I1];
-
-		//GridD->Find_atom(tau1);
-		AdjacentAtomInfo adjs;
-		GridD->Find_atom(ucell, tau1, T1, I1, &adjs);
-		// Record_adj.for_2d() may not called in some case
-		int nnr = pv.nlocstart ? pv.nlocstart[iat1] : 0;
-
-		if (cal_syns)
-		{
-			for (int k = 0; k < 3; k++)
-			{
-				tau1[k] = tau1[k] - atom1->vel[I1][k] * INPUT.mdp.md_dt / ucell.lat0 ;
-			}
-		}
-
-        // loop 2, ad
-		for (int ad = 0; ad < adjs.adj_num+1; ++ad)
-		{
-			const int T2 = adjs.ntype[ad];
-			const int I2 = adjs.natom[ad];
-			Atom* atom2 = &ucell.atoms[T2];
-			tau2 = adjs.adjacent_tau[ad];
-			dtau = tau2 - tau1;
-			double distance = dtau.norm() * ucell.lat0;
-			double rcut = orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut();
-
-            // condition 3, distance
-			if(distance < rcut)
-			{
-				int iw1_all = ucell.itiaiw2iwt( T1, I1, 0) ; //iw1_all = combined index (it, ia, iw)
-
-                // loop 4, jj
-				for(int jj=0; jj<atom1->nw*npol; ++jj)
-				{
-					const int jj0 = jj/npol;
-					const int L1 = atom1->iw2l[jj0];
-					const int N1 = atom1->iw2n[jj0];
-					const int m1 = atom1->iw2m[jj0];
-
-					int iw2_all = ucell.itiaiw2iwt( T2, I2, 0);//zhengdy-soc
-
-                    // loop 5, kk
-					for(int kk=0; kk<atom2->nw*npol; ++kk)
-					{
-						const int kk0 = kk/npol;
-						const int L2 = atom2->iw2l[kk0];
-						const int N2 = atom2->iw2n[kk0];
-						const int m2 = atom2->iw2m[kk0];
-
-						// mohan add 2010-06-29
-						// this is in fact the same as in build_Nonlocal_mu,
-						// the difference is that here we use {L,N,m} for ccycle,
-						// build_Nonlocal_mu use atom.nw for cycle.
-						// so, here we use ParaO::in_this_processor,
-						// in build_Non... use global2local_row
-						// and global2local_col directly,
-						if (!pv.in_this_processor(iw1_all, iw2_all))
-						{
-							++iw2_all;
-							continue;
-						}
-
-						olm[0] = 0.0;
-                        olm[1] = 0.0;
-                        olm[2] = 0.0;
-
-                        // condition 6, not calculate the derivative
-						if(!calc_deri)
-						{
-                            single_overlap(
-									lm,
-									orb,
-									uot,
-									pv,
-									ucell,
-									nspin,
-									cal_stress,
-									iw1_all,
-									iw2_all,
-									m1,
-									m2,
-									dtype,
-									T1,
-									L1,
-									N1,
-									T2,
-									L2,
-									N2,
-									dtau,
-									tau1,
-									tau2,
-									npol,
-									jj,
-									jj0,
-									kk,
-									kk0,
-									nnr,
-									total_nnr,
-									olm,
-									HSloc);
-						}
-						else // condition 6, calculate the derivative
-						{
-							single_derivative(
-									fsr,
-									orb,
-									uot,
-									pv,
-									ucell,
-									nspin,
-									cal_stress,
-									iw1_all,
-									iw2_all,
-									m1,
-									m2,
-									dtype,
-									T1,
-									L1,
-									N1,
-									T2,
-									L2,
-									N2,
-									dtau,
-									tau1,
-									tau2,
-									npol,
-									jj,
-									jj0,
-									kk,
-									kk0,
-									nnr,
-									total_nnr,
-									olm);
-						}// end condition 6, calc_deri
-						++iw2_all;
-					}// end loop 5, kk 
-					++iw1_all;
-				}// end loop 4, jj
-			}// condition 3, distance
-			else if(distance>=rcut && (!gamma_only_local))
-			{
-				int start1 = ucell.itiaiw2iwt( T1, I1, 0);
-				int start2 = ucell.itiaiw2iwt( T2, I2, 0);
-
-				bool is_adj = false;
-				for (int ad0=0; ad0 < adjs.adj_num+1; ++ad0)
-				{
-					const int T0 = adjs.ntype[ad0];
-					tau0 = adjs.adjacent_tau[ad0];
-					dtau1 = tau0 - tau1;
-					double distance1 = dtau1.norm() * ucell.lat0;
-					double rcut1 = orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max();
-					dtau2 = tau0 - tau2;
-					double distance2 = dtau2.norm() * ucell.lat0;
-					double rcut2 = orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max();
-					if( distance1 < rcut1 && distance2 < rcut2 )
-					{
-						is_adj = true;
-						break;
-					}
-				}//ad0
-
-				if( is_adj )
-				{
-					for(int jj=0; jj<atom1->nw * npol; ++jj)
-					{
-						const int mu = pv.global2local_row(start1 + jj);
-						if(mu<0)continue; 
-						for(int kk=0; kk<atom2->nw * npol; ++kk)
-						{
-							const int nu = pv.global2local_col(start2 + kk);
-							if(nu<0)continue;
-							++total_nnr;
-							++nnr;
-						}//kk
-					}//jj
-				} // is_adj
-			}//distance, end condition 3
-		}// end loop 2, ad
-	}// end loop 1, iat1
-
+        for (int iat1 = 0; iat1 < ucell.nat; iat1++) // loop 1, iat1
+        {
+            const int T1 = ucell.iat2it[iat1];
+            const Atom* atom1 = &ucell.atoms[T1];
+            const int I1 = ucell.iat2ia[iat1];
+
+            tau1 = atom1->tau[I1];
+
+            // GridD->Find_atom(tau1);
+            AdjacentAtomInfo adjs;
+            GridD->Find_atom(ucell, tau1, T1, I1, &adjs);
+            // Record_adj.for_2d() may not called in some case
+            int nnr = pv.nlocstart ? pv.nlocstart[iat1] : 0;
+
+            if (cal_syns)
+            {
+                for (int k = 0; k < 3; k++)
+                {
+                    tau1[k] = tau1[k] - atom1->vel[I1][k] * INPUT.mdp.md_dt / ucell.lat0;
+                }
+            }
+
+            // loop 2, ad
+            for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+            {
+                const int T2 = adjs.ntype[ad];
+                const int I2 = adjs.natom[ad];
+                Atom* atom2 = &ucell.atoms[T2];
+                tau2 = adjs.adjacent_tau[ad];
+                dtau = tau2 - tau1;
+                double distance = dtau.norm() * ucell.lat0;
+                double rcut = orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut();
+
+                // condition 3, distance
+                if (distance < rcut)
+                {
+                    int iw1_all = ucell.itiaiw2iwt(T1, I1, 0); // iw1_all = combined index (it, ia, iw)
+
+                    // loop 4, jj
+                    for (int jj = 0; jj < atom1->nw * npol; ++jj)
+                    {
+                        const int jj0 = jj / npol;
+                        const int L1 = atom1->iw2l[jj0];
+                        const int N1 = atom1->iw2n[jj0];
+                        const int m1 = atom1->iw2m[jj0];
+
+                        int iw2_all = ucell.itiaiw2iwt(T2, I2, 0); // zhengdy-soc
+
+                        // loop 5, kk
+                        for (int kk = 0; kk < atom2->nw * npol; ++kk)
+                        {
+                            const int kk0 = kk / npol;
+                            const int L2 = atom2->iw2l[kk0];
+                            const int N2 = atom2->iw2n[kk0];
+                            const int m2 = atom2->iw2m[kk0];
+
+                            // mohan add 2010-06-29
+                            // this is in fact the same as in build_Nonlocal_mu,
+                            // the difference is that here we use {L,N,m} for ccycle,
+                            // build_Nonlocal_mu use atom.nw for cycle.
+                            // so, here we use ParaO::in_this_processor,
+                            // in build_Non... use global2local_row
+                            // and global2local_col directly,
+                            if (!pv.in_this_processor(iw1_all, iw2_all))
+                            {
+                                ++iw2_all;
+                                continue;
+                            }
+
+                            olm[0] = 0.0;
+                            olm[1] = 0.0;
+                            olm[2] = 0.0;
+
+                            // condition 6, not calculate the derivative
+                            if (!calc_deri)
+                            {
+                                single_overlap(lm,
+                                               orb,
+                                               uot,
+                                               pv,
+                                               ucell,
+                                               nspin,
+                                               cal_stress,
+                                               iw1_all,
+                                               iw2_all,
+                                               m1,
+                                               m2,
+                                               dtype,
+                                               T1,
+                                               L1,
+                                               N1,
+                                               T2,
+                                               L2,
+                                               N2,
+                                               dtau,
+                                               tau1,
+                                               tau2,
+                                               npol,
+                                               jj,
+                                               jj0,
+                                               kk,
+                                               kk0,
+                                               nnr,
+                                               total_nnr,
+                                               olm,
+                                               HSloc);
+                            }
+                            else // condition 6, calculate the derivative
+                            {
+                                single_derivative(fsr,
+                                                  orb,
+                                                  uot,
+                                                  pv,
+                                                  ucell,
+                                                  nspin,
+                                                  cal_stress,
+                                                  iw1_all,
+                                                  iw2_all,
+                                                  m1,
+                                                  m2,
+                                                  dtype,
+                                                  T1,
+                                                  L1,
+                                                  N1,
+                                                  T2,
+                                                  L2,
+                                                  N2,
+                                                  dtau,
+                                                  tau1,
+                                                  tau2,
+                                                  npol,
+                                                  jj,
+                                                  jj0,
+                                                  kk,
+                                                  kk0,
+                                                  nnr,
+                                                  total_nnr,
+                                                  olm);
+                            } // end condition 6, calc_deri
+                            ++iw2_all;
+                        } // end loop 5, kk
+                        ++iw1_all;
+                    } // end loop 4, jj
+                }     // condition 3, distance
+                else if (distance >= rcut && (!gamma_only_local))
+                {
+                    int start1 = ucell.itiaiw2iwt(T1, I1, 0);
+                    int start2 = ucell.itiaiw2iwt(T2, I2, 0);
+
+                    bool is_adj = false;
+                    for (int ad0 = 0; ad0 < adjs.adj_num + 1; ++ad0)
+                    {
+                        const int T0 = adjs.ntype[ad0];
+                        tau0 = adjs.adjacent_tau[ad0];
+                        dtau1 = tau0 - tau1;
+                        double distance1 = dtau1.norm() * ucell.lat0;
+                        double rcut1 = orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max();
+                        dtau2 = tau0 - tau2;
+                        double distance2 = dtau2.norm() * ucell.lat0;
+                        double rcut2 = orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max();
+                        if (distance1 < rcut1 && distance2 < rcut2)
+                        {
+                            is_adj = true;
+                            break;
+                        }
+                    } // ad0
+
+                    if (is_adj)
+                    {
+                        for (int jj = 0; jj < atom1->nw * npol; ++jj)
+                        {
+                            const int mu = pv.global2local_row(start1 + jj);
+                            if (mu < 0)
+                                continue;
+                            for (int kk = 0; kk < atom2->nw * npol; ++kk)
+                            {
+                                const int nu = pv.global2local_col(start2 + kk);
+                                if (nu < 0)
+                                    continue;
+                                ++total_nnr;
+                                ++nnr;
+                            } // kk
+                        }     // jj
+                    }         // is_adj
+                }             // distance, end condition 3
+            }                 // end loop 2, ad
+        }                     // end loop 1, iat1
 
 #ifdef _OPENMP
-}
+    }
 #endif
 
-
-	if(!gamma_only_local)
-	{
-		if(total_nnr != pv.nnr)
-		{
-			std::cout << " nnr=" << total_nnr << " LNNR.nnr=" << pv.nnr << std::endl;
-			GlobalV::ofs_running << " nnr=" << total_nnr << " LNNR.nnr=" << pv.nnr << std::endl;
-			ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new","nnr != LNNR.nnr");
-		}
-	}
-
-    ModuleBase::timer::tick("LCAO_domain","build_ST_new");
+    if (!gamma_only_local)
+    {
+        if (total_nnr != pv.nnr)
+        {
+            std::cout << " nnr=" << total_nnr << " LNNR.nnr=" << pv.nnr << std::endl;
+            GlobalV::ofs_running << " nnr=" << total_nnr << " LNNR.nnr=" << pv.nnr << std::endl;
+            ModuleBase::WARNING_QUIT("LCAO_domain::build_ST_new", "nnr != LNNR.nnr");
+        }
+    }
+
+    ModuleBase::timer::tick("LCAO_domain", "build_ST_new");
     return;
 }
 
-}
+} // namespace LCAO_domain
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp
index ea68de9b72..f3048110e8 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_gamma.cpp
@@ -1,27 +1,27 @@
 #include "FORCE.h"
-#include "module_hamilt_pw/hamilt_pwdft/global.h"
+#include "module_base/timer.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
+
 #include <unordered_map>
-#include "module_base/timer.h"
 
-template<>
-void Force_LCAO<double>::cal_fvnl_dbeta(
-    const elecstate::DensityMatrix<double, double>* dm,
-    const Parallel_Orbitals& pv,
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const ORB_gen_tables& uot,
-    Grid_Driver& gd,
-	const bool isforce, 
-	const bool isstress, 
-	ModuleBase::matrix& fvnl_dbeta, 
-	ModuleBase::matrix& svnl_dbeta)
+template <>
+void Force_LCAO<double>::cal_fvnl_dbeta(const elecstate::DensityMatrix<double, double>* dm,
+                                        const Parallel_Orbitals& pv,
+                                        const UnitCell& ucell,
+                                        const LCAO_Orbitals& orb,
+                                        const ORB_gen_tables& uot,
+                                        Grid_Driver& gd,
+                                        const bool isforce,
+                                        const bool isstress,
+                                        ModuleBase::matrix& fvnl_dbeta,
+                                        ModuleBase::matrix& svnl_dbeta)
 {
-    ModuleBase::TITLE("Force_LCAO","cal_fvnl_dbeta");
-    ModuleBase::timer::tick("Force_LCAO","cal_fvnl_dbeta");
+    ModuleBase::TITLE("Force_LCAO", "cal_fvnl_dbeta");
+    ModuleBase::timer::tick("Force_LCAO", "cal_fvnl_dbeta");
 
     double r0[3];
-	double r1[3];
+    double r1[3];
 
     for (int iat = 0; iat < ucell.nat; iat++)
     {
@@ -29,17 +29,17 @@ void Force_LCAO<double>::cal_fvnl_dbeta(
         const int ia = ucell.iat2ia[iat];
         const int T0 = it;
         const int I0 = ia;
-        
+
         const ModuleBase::Vector3<double> tau0 = ucell.atoms[it].tau[ia];
-        //find ajacent atom of atom ia
-        //gd.Find_atom( ucell.atoms[it].tau[ia] );
+        // find ajacent atom of atom ia
+        // gd.Find_atom( ucell.atoms[it].tau[ia] );
         gd.Find_atom(ucell, tau0, it, ia);
         const double Rcut_Beta = ucell.infoNL.Beta[it].get_rcut_max();
 
-        std::vector<std::unordered_map<int,std::vector<std::vector<double>>>> nlm_tot;
-        nlm_tot.resize(gd.getAdjacentNum() + 1); //this saves <psi_i|beta> and <psi_i|\nabla|beta>
+        std::vector<std::unordered_map<int, std::vector<std::vector<double>>>> nlm_tot;
+        nlm_tot.resize(gd.getAdjacentNum() + 1); // this saves <psi_i|beta> and <psi_i|\nabla|beta>
 
-        //Step 1 : Calculate and save <psi_i|beta> and <psi_i|\nabla|beta>
+        // Step 1 : Calculate and save <psi_i|beta> and <psi_i|\nabla|beta>
         for (int ad1 = 0; ad1 < gd.getAdjacentNum() + 1; ad1++)
         {
             const int T1 = gd.getType(ad1);
@@ -58,36 +58,35 @@ void Force_LCAO<double>::cal_fvnl_dbeta(
             }
 
             for (int iw1 = 0; iw1 < ucell.atoms[T1].nw; ++iw1)
-			{
-				const int iw1_all = start1 + iw1;
+            {
+                const int iw1_all = start1 + iw1;
                 const int iw1_local = pv.global2local_row(iw1_all);
                 const int iw2_local = pv.global2local_col(iw1_all);
 
-				if(iw1_local < 0 && iw2_local < 0) 
-				{
-					continue;
-				}
-                
+                if (iw1_local < 0 && iw2_local < 0)
+                {
+                    continue;
+                }
+
                 std::vector<std::vector<double>> nlm;
 
-                int L1 = atom1->iw2l[ iw1 ];
-                int N1 = atom1->iw2n[ iw1 ];
-                int m1 = atom1->iw2m[ iw1 ];
+                int L1 = atom1->iw2l[iw1];
+                int N1 = atom1->iw2n[iw1];
+                int m1 = atom1->iw2m[iw1];
 
                 // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                 ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
 
-                uot.two_center_bundle->overlap_orb_beta->snap(
-                    T1, L1, N1, M1, T0, dtau * ucell.lat0, true, nlm);
+                uot.two_center_bundle->overlap_orb_beta->snap(T1, L1, N1, M1, T0, dtau * ucell.lat0, true, nlm);
 
-                assert(nlm.size()==4);
-                nlm_tot[ad1].insert({iw1,nlm});
+                assert(nlm.size() == 4);
+                nlm_tot[ad1].insert({iw1, nlm});
             }
-        }//ad
+        } // ad
 
-        //Step 2 : sum to get beta<psi_i|beta><beta|\nabla|psi_j>
+        // Step 2 : sum to get beta<psi_i|beta><beta|\nabla|psi_j>
         for (int ad1 = 0; ad1 < gd.getAdjacentNum() + 1; ++ad1)
         {
             const int T1 = gd.getType(ad1);
@@ -108,135 +107,134 @@ void Force_LCAO<double>::cal_fvnl_dbeta(
 
                 const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
                 const double dist2 = (tau2 - tau0).norm() * ucell.lat0;
-				if(isstress)
+                if (isstress)
                 {
-                    r1[0] = ( tau1.x - tau0.x) ;
-                    r1[1] = ( tau1.y - tau0.y) ;
-                    r1[2] = ( tau1.z - tau0.z) ;
-                    r0[0] = ( tau2.x - tau0.x) ;
-                    r0[1] = ( tau2.y - tau0.y) ;
-                    r0[2] = ( tau2.z - tau0.z) ;
+                    r1[0] = (tau1.x - tau0.x);
+                    r1[1] = (tau1.y - tau0.y);
+                    r1[2] = (tau1.z - tau0.z);
+                    r0[0] = (tau2.x - tau0.x);
+                    r0[1] = (tau2.y - tau0.y);
+                    r0[2] = (tau2.z - tau0.z);
                 }
 
-                if (dist1 > Rcut_Beta + Rcut_AO1
-                        || dist2 > Rcut_Beta + Rcut_AO2)
+                if (dist1 > Rcut_Beta + Rcut_AO1 || dist2 > Rcut_Beta + Rcut_AO2)
                 {
                     continue;
                 }
-                
+
                 for (int iw1 = 0; iw1 < ucell.atoms[T1].nw; ++iw1)
                 {
                     const int iw1_all = start1 + iw1;
                     const int iw1_local = pv.global2local_row(iw1_all);
-					if(iw1_local < 0)
-					{
-						continue;
-					}
+                    if (iw1_local < 0)
+                    {
+                        continue;
+                    }
                     for (int iw2 = 0; iw2 < ucell.atoms[T2].nw; ++iw2)
 
                     {
                         const int iw2_all = start2 + iw2;
                         const int iw2_local = pv.global2local_col(iw2_all);
 
-						if(iw2_local < 0)
-						{
-							continue;
-						}
+                        if (iw2_local < 0)
+                        {
+                            continue;
+                        }
 
-                        double nlm[3] = {0,0,0};
-                        double nlm_t[3] = {0,0,0}; //transpose
+                        double nlm[3] = {0, 0, 0};
+                        double nlm_t[3] = {0, 0, 0}; // transpose
 
                         std::vector<double> nlm1 = nlm_tot[ad1][iw1][0];
                         std::vector<std::vector<double>> nlm2;
                         nlm2.resize(3);
-                        for(int i=0;i<3;i++)
+                        for (int i = 0; i < 3; i++)
                         {
-                            nlm2[i] = nlm_tot[ad2][iw2][i+1];
+                            nlm2[i] = nlm_tot[ad2][iw2][i + 1];
                         }
 
-                        assert(nlm1.size()==nlm2[0].size());
+                        assert(nlm1.size() == nlm2[0].size());
 
                         const int nproj = ucell.infoNL.nproj[T0];
-						int ib = 0;
-						for (int nb = 0; nb < nproj; nb++)
-						{
+                        int ib = 0;
+                        for (int nb = 0; nb < nproj; nb++)
+                        {
                             const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-							for(int m=0;m<2*L0+1;m++)
-							{
-                                for(int ir=0;ir<3;ir++)
+                            for (int m = 0; m < 2 * L0 + 1; m++)
+                            {
+                                for (int ir = 0; ir < 3; ir++)
                                 {
                                     nlm[ir] += nlm2[ir][ib] * nlm1[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
                                 }
-								ib+=1;
-							}
-						}
-						assert(ib==nlm1.size());
+                                ib += 1;
+                            }
+                        }
+                        assert(ib == nlm1.size());
 
-                        if(isstress)
+                        if (isstress)
                         {
                             nlm1 = nlm_tot[ad2][iw2][0];
-                            for(int i=0;i<3;i++)
+                            for (int i = 0; i < 3; i++)
                             {
-                                nlm2[i] = nlm_tot[ad1][iw1][i+1];
+                                nlm2[i] = nlm_tot[ad1][iw1][i + 1];
                             }
 
-                            assert(nlm1.size()==nlm2[0].size());
-                            
+                            assert(nlm1.size() == nlm2[0].size());
+
                             const int nproj = ucell.infoNL.nproj[T0];
-	    					int ib = 0;
-		    				for (int nb = 0; nb < nproj; nb++)
-				    		{
+                            int ib = 0;
+                            for (int nb = 0; nb < nproj; nb++)
+                            {
                                 const int L0 = ucell.infoNL.Beta[T0].Proj[nb].getL();
-		    					for(int m=0;m<2*L0+1;m++)
-				    			{
-                                   for(int ir=0;ir<3;ir++)
+                                for (int m = 0; m < 2 * L0 + 1; m++)
+                                {
+                                    for (int ir = 0; ir < 3; ir++)
                                     {
-                                       nlm_t[ir] += nlm2[ir][ib] * nlm1[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
+                                        nlm_t[ir] += nlm2[ir][ib] * nlm1[ib] * ucell.atoms[T0].ncpp.dion(nb, nb);
                                     }
-					    			ib+=1;
-					    		}
-					    	}
-						    assert(ib==nlm1.size());
+                                    ib += 1;
+                                }
+                            }
+                            assert(ib == nlm1.size());
                         }
                         // dbeta is minus, that's consistent.
                         // only one projector for each atom force.
 
                         double sum = 0.0;
-                        for(int is=0; is<GlobalV::NSPIN; ++is)
+                        for (int is = 0; is < GlobalV::NSPIN; ++is)
                         {
-                            //sum += dm2d[is](iw2_local, iw1_local);
-                            sum += dm->get_DMK(is+1, 0, iw2_local, iw1_local);
+                            // sum += dm2d[is](iw2_local, iw1_local);
+                            sum += dm->get_DMK(is + 1, 0, iw2_local, iw1_local);
                         }
                         sum *= 2.0;
 
-                        if(isforce)
-						{
-							fvnl_dbeta(iat,0) -= sum * nlm[0];
-							fvnl_dbeta(iat,1) -= sum * nlm[1];
-							fvnl_dbeta(iat,2) -= sum * nlm[2];
-						}
+                        if (isforce)
+                        {
+                            fvnl_dbeta(iat, 0) -= sum * nlm[0];
+                            fvnl_dbeta(iat, 1) -= sum * nlm[1];
+                            fvnl_dbeta(iat, 2) -= sum * nlm[2];
+                        }
 
-                        if(isstress) 
+                        if (isstress)
                         {
-                            for(int ipol=0;ipol<3;ipol++)
-							{
-                                for(int jpol=ipol;jpol<3;jpol++)
+                            for (int ipol = 0; ipol < 3; ipol++)
+                            {
+                                for (int jpol = ipol; jpol < 3; jpol++)
                                 {
-                                    svnl_dbeta(ipol, jpol) += sum/2.0 * (nlm[ipol] * r0[jpol] + nlm_t[ipol] * r1[jpol]);
+                                    svnl_dbeta(ipol, jpol)
+                                        += sum / 2.0 * (nlm[ipol] * r0[jpol] + nlm_t[ipol] * r1[jpol]);
                                 }
                             }
                         }
-                    }//iw2
-                }//iw1
-            }//ad2
-        }//ad1
-    }//iat
+                    } // iw2
+                }     // iw1
+            }         // ad2
+        }             // ad1
+    }                 // iat
 
-    if(isstress)
+    if (isstress)
     {
         StressTools::stress_fill(ucell.lat0, ucell.omega, svnl_dbeta);
     }
 
-    ModuleBase::timer::tick("Force_LCAO","cal_fvnl_dbeta");
+    ModuleBase::timer::tick("Force_LCAO", "cal_fvnl_dbeta");
 }
-
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp
index 8a9501959e..cee03dfa4a 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/fvnl_dbeta_k.cpp
@@ -1,19 +1,18 @@
 #include "FORCE.h"
-
-#include <map>
-#include <unordered_map>
-
 #include "module_base/memory.h"
 #include "module_base/parallel_reduce.h"
 #include "module_base/timer.h"
 #include "module_base/tool_threading.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_elecstate/cal_dm.h"
-#include "module_elecstate/module_dm/cal_dm_psi.h"
 #include "module_elecstate/elecstate_lcao.h"
+#include "module_elecstate/module_dm/cal_dm_psi.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #include "module_io/write_HS.h"
 
+#include <map>
+#include <unordered_map>
+
 #ifdef __DEEPKS
 #include "module_hamilt_lcao/module_deepks/LCAO_deepks.h"
 #endif
@@ -22,25 +21,23 @@
 #include <omp.h>
 #endif
 
-
 typedef std::tuple<int, int, int, int> key_tuple;
 
 // must consider three-center H matrix.
-template<>
-void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
-    const elecstate::DensityMatrix<std::complex<double>, double>* dm,
-    const Parallel_Orbitals& pv,
-    const UnitCell& ucell,
-    const LCAO_Orbitals& orb,
-    const ORB_gen_tables& uot,
-    Grid_Driver& gd,
-    const bool isforce,
-    const bool isstress,
-    ModuleBase::matrix& fvnl_dbeta,
-    ModuleBase::matrix& svnl_dbeta)
+template <>
+void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(const elecstate::DensityMatrix<std::complex<double>, double>* dm,
+                                                      const Parallel_Orbitals& pv,
+                                                      const UnitCell& ucell,
+                                                      const LCAO_Orbitals& orb,
+                                                      const ORB_gen_tables& uot,
+                                                      Grid_Driver& gd,
+                                                      const bool isforce,
+                                                      const bool isstress,
+                                                      ModuleBase::matrix& fvnl_dbeta,
+                                                      ModuleBase::matrix& svnl_dbeta)
 {
-    ModuleBase::TITLE("Force_LCAO","cal_fvnl_dbeta");
-    ModuleBase::timer::tick("Force_LCAO","cal_fvnl_dbeta");
+    ModuleBase::TITLE("Force_LCAO", "cal_fvnl_dbeta");
+    ModuleBase::timer::tick("Force_LCAO", "cal_fvnl_dbeta");
 
     const int nspin = GlobalV::NSPIN;
     const int nspin_DMR = (nspin == 2) ? 2 : 1;
@@ -98,22 +95,21 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                 const int iw1_all = start1 + iw1;
                 const int iw1_local = pv.global2local_row(iw1_all);
                 const int iw2_local = pv.global2local_col(iw1_all);
-				if (iw1_local < 0 && iw2_local < 0)
-				{
-					continue;
-				}
+                if (iw1_local < 0 && iw2_local < 0)
+                {
+                    continue;
+                }
                 const int iw1_0 = iw1 / npol;
                 std::vector<std::vector<double>> nlm;
-                int L1 = atom1->iw2l[ iw1_0 ];
-                int N1 = atom1->iw2n[ iw1_0 ];
-                int m1 = atom1->iw2m[ iw1_0 ];
+                int L1 = atom1->iw2l[iw1_0];
+                int N1 = atom1->iw2n[iw1_0];
+                int m1 = atom1->iw2m[iw1_0];
 
                 // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                 ModuleBase::Vector3<double> dtau = tau - tau1;
-                uot.two_center_bundle->overlap_orb_beta->snap(
-                        T1, L1, N1, M1, it, dtau * ucell.lat0, true, nlm);
+                uot.two_center_bundle->overlap_orb_beta->snap(T1, L1, N1, M1, it, dtau * ucell.lat0, true, nlm);
 
                 nlm_cur.insert({iw1_all, nlm});
             } // end iw
@@ -135,10 +131,10 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
 #ifdef _OPENMP
 #pragma omp parallel reduction(+ : total_nnr)
     {
-		ModuleBase::matrix local_svnl_dbeta(3, 3);
-		const int num_threads = omp_get_num_threads();
+        ModuleBase::matrix local_svnl_dbeta(3, 3);
+        const int num_threads = omp_get_num_threads();
 #else
-		ModuleBase::matrix& local_svnl_dbeta = svnl_dbeta;
+    ModuleBase::matrix& local_svnl_dbeta = svnl_dbeta;
 #endif
 
         ModuleBase::Vector3<double> tau1;
@@ -212,19 +208,19 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
 
                     // check if this a adjacent atoms.
                     bool is_adj = false;
-					if (distance < rcut)
-					{
-						is_adj = true;
-					}
+                    if (distance < rcut)
+                    {
+                        is_adj = true;
+                    }
                     else if (distance >= rcut)
                     {
                         for (int ad0 = 0; ad0 < adjs.adj_num + 1; ++ad0)
                         {
                             const int T0 = adjs.ntype[ad0];
-							if (ucell.infoNL.nproj[T0] == 0)
-							{
-								continue;
-							}
+                            if (ucell.infoNL.nproj[T0] == 0)
+                            {
+                                continue;
+                            }
                             const int I0 = adjs.natom[ad0];
                             // const int iat0 = ucell.itia2iat(T0, I0);
                             // const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
@@ -256,7 +252,7 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                         std::vector<double*> tmp_matrix_ptr;
                         for (int is = 0; is < nspin_DMR; ++is)
                         {
-                            auto* tmp_base_matrix = dm->get_DMR_pointer(is+1)->find_matrix(iat1, iat2, rx2, ry2, rz2);
+                            auto* tmp_base_matrix = dm->get_DMR_pointer(is + 1)->find_matrix(iat1, iat2, rx2, ry2, rz2);
                             tmp_matrix_ptr.push_back(tmp_base_matrix->get_pointer());
                         }
 
@@ -265,16 +261,16 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                             const int T0 = adjs.ntype[ad0];
                             const int I0 = adjs.natom[ad0];
                             const int iat = ucell.itia2iat(T0, I0);
-							if (!iat_recorded && isforce)
-							{
-								adj_iat[ad0] = iat;
-							}
+                            if (!iat_recorded && isforce)
+                            {
+                                adj_iat[ad0] = iat;
+                            }
 
                             // mohan add 2010-12-19
-							if (ucell.infoNL.nproj[T0] == 0)
-							{
-								continue;
-							}
+                            if (ucell.infoNL.nproj[T0] == 0)
+                            {
+                                continue;
+                            }
 
                             // const int I0 = gd.getNatom(ad0);
                             // const int start0 = ucell.itiaiw2iwt(T0, I0, 0);
@@ -286,10 +282,8 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                             const double distance2 = dtau2.norm2() * pow(ucell.lat0, 2);
 
                             // seems a bug here!! mohan 2011-06-17
-                            rcut1 = pow(orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(),
-                                        2);
-                            rcut2 = pow(orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(),
-                                        2);
+                            rcut1 = pow(orb.Phi[T1].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(), 2);
+                            rcut2 = pow(orb.Phi[T2].getRcut() + ucell.infoNL.Beta[T0].get_rcut_max(), 2);
 
                             double r0[3];
                             double r1[3];
@@ -312,7 +306,7 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                             key_tuple key2(iat2, rx2 - rx0, ry2 - ry0, rz2 - rz0);
 
                             int nnr_inner = 0;
-                            int dhsize = pv.get_row_size(iat1)*pv.get_col_size(iat2);
+                            int dhsize = pv.get_row_size(iat1) * pv.get_col_size(iat2);
                             std::vector<std::vector<double>> dhvnl(3, std::vector<double>(dhsize, 0.0));
                             std::vector<std::vector<double>> dhvnl1(3, std::vector<double>(dhsize, 0.0));
                             for (int j = 0; j < atom1->nw * npol; j++)
@@ -320,22 +314,22 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                 const int j0 = j / npol; // added by zhengdy-soc
                                 const int iw1_all = start1 + j;
                                 const int mu = pv.global2local_row(iw1_all);
-								if (mu < 0)
-								{
-									continue;
-								}
+                                if (mu < 0)
+                                {
+                                    continue;
+                                }
 
                                 for (int k = 0; k < atom2->nw * npol; k++)
                                 {
                                     const int k0 = k / npol;
                                     const int iw2_all = start2 + k;
                                     const int nu = pv.global2local_col(iw2_all);
-									if (nu < 0)
-									{
-										continue;
-									}
+                                    if (nu < 0)
+                                    {
+                                        continue;
+                                    }
 
-                                    if (nspin==4)
+                                    if (nspin == 4)
                                     {
                                         std::vector<double> nlm_1 = nlm_tot[iat][key2][iw2_all][0];
                                         std::vector<std::vector<double>> nlm_2;
@@ -344,7 +338,7 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                         {
                                             nlm_2[i] = nlm_tot[iat][key1][iw1_all][i + 1];
                                         }
-                                        int is0 = (j-j0*npol) + (k-k0*npol) * 2;
+                                        int is0 = (j - j0 * npol) + (k - k0 * npol) * 2;
                                         for (int no = 0; no < ucell.atoms[T0].ncpp.non_zero_count_soc[is0]; no++)
                                         {
                                             const int p1 = ucell.atoms[T0].ncpp.index1_soc[is0][no];
@@ -353,27 +347,35 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                             {
                                                 if (is0 == 0)
                                                 {
-                                                    dhvnl[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                            (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-                                                            + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
+                                                    dhvnl[ir][nnr_inner]
+                                                        += nlm_2[ir][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
                                                 }
                                                 else if (is0 == 1)
                                                 {
-                                                    dhvnl[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                            (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
-                                                            + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())*0.5;
+                                                    dhvnl[ir][nnr_inner]
+                                                        += nlm_2[ir][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())
+                                                           * 0.5;
                                                 }
                                                 else if (is0 == 2)
                                                 {
-                                                    dhvnl[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                            (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
-                                                            +ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())*0.5;
+                                                    dhvnl[ir][nnr_inner]
+                                                        += nlm_2[ir][p1] * nlm_1[p2]
+                                                           * (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
+                                                              + ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())
+                                                           * 0.5;
                                                 }
                                                 else if (is0 == 3)
                                                 {
-                                                    dhvnl[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                            (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-                                                            - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
+                                                    dhvnl[ir][nnr_inner]
+                                                        += nlm_2[ir][p1] * nlm_1[p2]
+                                                           * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                              - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                           * 0.5;
                                                 }
                                             }
                                         }
@@ -386,7 +388,7 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                             {
                                                 nlm_2[i] = nlm_tot[iat][key2][iw2_all][i + 1];
                                             }
-                                            int is0 = (j-j0*npol) + (k-k0*npol) * 2;
+                                            int is0 = (j - j0 * npol) + (k - k0 * npol) * 2;
                                             for (int no = 0; no < ucell.atoms[T0].ncpp.non_zero_count_soc[is0]; no++)
                                             {
                                                 const int p1 = ucell.atoms[T0].ncpp.index1_soc[is0][no];
@@ -395,27 +397,35 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                                 {
                                                     if (is0 == 0)
                                                     {
-                                                        dhvnl1[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                                (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-                                                                + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
+                                                        dhvnl1[ir][nnr_inner]
+                                                            += nlm_2[ir][p1] * nlm_1[p2]
+                                                               * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                                  + ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                               * 0.5;
                                                     }
                                                     else if (is0 == 1)
                                                     {
-                                                        dhvnl1[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                                (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
-                                                                + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())*0.5;
+                                                        dhvnl1[ir][nnr_inner]
+                                                            += nlm_2[ir][p1] * nlm_1[p2]
+                                                               * (ucell.atoms[T0].ncpp.d_so(1, p2, p1).real()
+                                                                  + ucell.atoms[T0].ncpp.d_so(2, p2, p1).real())
+                                                               * 0.5;
                                                     }
                                                     else if (is0 == 2)
                                                     {
-                                                        dhvnl1[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                                (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
-                                                                +ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())*0.5;
+                                                        dhvnl1[ir][nnr_inner]
+                                                            += nlm_2[ir][p1] * nlm_1[p2]
+                                                               * (-ucell.atoms[T0].ncpp.d_so(1, p2, p1).imag()
+                                                                  + ucell.atoms[T0].ncpp.d_so(2, p2, p1).imag())
+                                                               * 0.5;
                                                     }
                                                     else if (is0 == 3)
                                                     {
-                                                        dhvnl1[ir][nnr_inner] += nlm_2[ir][p1]*nlm_1[p2]*
-                                                                (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
-                                                                - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())*0.5;
+                                                        dhvnl1[ir][nnr_inner]
+                                                            += nlm_2[ir][p1] * nlm_1[p2]
+                                                               * (ucell.atoms[T0].ncpp.d_so(0, p2, p1).real()
+                                                                  - ucell.atoms[T0].ncpp.d_so(3, p2, p1).real())
+                                                               * 0.5;
                                                     }
                                                 }
                                             }
@@ -504,15 +514,15 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                                 for (int ipol = jpol; ipol < 3; ipol++)
                                                 {
                                                     local_svnl_dbeta(jpol, ipol)
-                                                        += dm2d1
-                                                            * (nlm[jpol] * r1[ipol] + nlm1[jpol] * r0[ipol]);
+                                                        += dm2d1 * (nlm[jpol] * r1[ipol] + nlm1[jpol] * r0[ipol]);
                                                 }
                                             }
                                         }
                                     }
                                     else
                                     {
-                                        ModuleBase::WARNING_QUIT("Force_LCAO_k::cal_fvnl_dbeta_k", "nspin must be 1, 2 or 4");
+                                        ModuleBase::WARNING_QUIT("Force_LCAO_k::cal_fvnl_dbeta_k",
+                                                                 "nspin must be 1, 2 or 4");
                                     }
                                     //}
                                     nnr_inner++;
@@ -542,14 +552,14 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                                 {
                                                     local_svnl_dbeta(jpol, ipol)
                                                         += tmp_matrix_ptr[0][ir]
-                                                            * (dhvnl[jpol][ir] * r1[ipol] + dhvnl1[jpol][ir] * r0[ipol]);
+                                                           * (dhvnl[jpol][ir] * r1[ipol] + dhvnl1[jpol][ir] * r0[ipol]);
                                                 }
                                             }
                                         }
                                     }
                                 }
                             }
-                        }         // ad0
+                        } // ad0
 
                         // outer circle : accumulate nnr
                         for (int j = 0; j < atom1->nw * npol; j++)
@@ -557,10 +567,10 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                             const int j0 = j / npol; // added by zhengdy-soc
                             const int iw1_all = start1 + j;
                             const int mu = pv.global2local_row(iw1_all);
-							if (mu < 0)
-							{
-								continue;
-							}
+                            if (mu < 0)
+                            {
+                                continue;
+                            }
 
                             // fix a serious bug: atom2[T2] -> atom2
                             // mohan 2010-12-20
@@ -569,11 +579,11 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                                 const int k0 = k / npol;
                                 const int iw2_all = start2 + k;
                                 const int nu = pv.global2local_col(iw2_all);
-								if (nu < 0)
-								{
-									continue;
-								}
-								total_nnr++;
+                                if (nu < 0)
+                                {
+                                    continue;
+                                }
+                                total_nnr++;
                                 nnr++;
                             }
                         }
@@ -610,7 +620,7 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
                 }
             } // I1
         }     // T1
-     
+
 #ifdef _OPENMP
         if (isstress)
         {
@@ -635,7 +645,6 @@ void Force_LCAO<std::complex<double>>::cal_fvnl_dbeta(
         StressTools::stress_fill(ucell.lat0, ucell.omega, svnl_dbeta);
     }
 
-    ModuleBase::timer::tick("Force_LCAO","cal_fvnl_dbeta");
+    ModuleBase::timer::tick("Force_LCAO", "cal_fvnl_dbeta");
     return;
 }
-
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index d5c0702516..4fcad5b6db 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -1,4 +1,5 @@
 #include "deepks_lcao.h"
+
 #include "module_base/timer.h"
 #include "module_base/tool_title.h"
 #ifdef __DEEPKS
@@ -15,15 +16,17 @@ namespace hamilt
 
 template <typename TK, typename TR>
 DeePKS<OperatorLCAO<TK, TR>>::DeePKS(Local_Orbital_Charge* loc_in,
-    LCAO_Matrix* LM_in,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-    HContainer<TR>* hR_in,
-    std::vector<TK>* hK_in,
-    const UnitCell* ucell_in,
-    Grid_Driver* GridD_in,
-    const ORB_gen_tables* uot,
-    const int& nks_in,
-    elecstate::DensityMatrix<TK,double>* DM_in) : loc(loc_in), nks(nks_in), ucell(ucell_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), DM(DM_in), uot_(uot)
+                                     LCAO_Matrix* LM_in,
+                                     const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
+                                     HContainer<TR>* hR_in,
+                                     std::vector<TK>* hK_in,
+                                     const UnitCell* ucell_in,
+                                     Grid_Driver* GridD_in,
+                                     const ORB_gen_tables* uot,
+                                     const int& nks_in,
+                                     elecstate::DensityMatrix<TK, double>* DM_in)
+    : loc(loc_in), nks(nks_in), ucell(ucell_in), OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), DM(DM_in),
+      uot_(uot)
 {
     this->cal_type = calculation_type::lcao_deepks;
 #ifdef __DEEPKS
@@ -43,14 +46,13 @@ DeePKS<OperatorLCAO<TK, TR>>::~DeePKS()
 #ifdef __DEEPKS
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
-                                                                      const Parallel_Orbitals* paraV)
+void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
 {
     ModuleBase::TITLE("DeePKS", "initialize_HR");
     ModuleBase::timer::tick("DeePKS", "initialize_HR");
 
-    //this->H_V_delta = new HContainer<TR>(paraV);
-    if(std::is_same<TK, double>::value)
+    // this->H_V_delta = new HContainer<TR>(paraV);
+    if (std::is_same<TK, double>::value)
     {
         this->H_V_delta = new HContainer<TR>(paraV);
         this->H_V_delta->fix_gamma();
@@ -59,8 +61,8 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* Gr
     this->adjs_all.clear();
     this->adjs_all.reserve(this->ucell->nat);
     bool pre_cal_nlm = false;
-    if(ucell->nat<100) // less than 100 atom , cost memory for high performance
-    { // pre calculate nlm in initialization
+    if (ucell->nat < 100) // less than 100 atom , cost memory for high performance
+    {                     // pre calculate nlm in initialization
         this->nlm_tot.resize(ucell->nat);
         pre_cal_nlm = true;
     }
@@ -85,8 +87,8 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* Gr
             const ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat0, iat1, R_index1).norm() * this->ucell->lat0
                 < orb.Phi[T1].getRcut() + orb.Alpha[0].getRcut())
@@ -118,21 +120,21 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* Gr
                                          R_index2.y - R_index1.y,
                                          R_index2.z - R_index1.z,
                                          paraV);
-                if(std::is_same<TK, double>::value)
+                if (std::is_same<TK, double>::value)
                 {
                     this->H_V_delta->insert_pair(tmp);
                 }
             }
         }
-        if(pre_cal_nlm)
+        if (pre_cal_nlm)
         {
             this->pre_calculate_nlm(iat0, nlm_tot[iat0]);
         }
     }
     // allocate the memory of BaseMatrix in HR, and set the new values to zero
-    if(std::is_same<TK, double>::value)
+    if (std::is_same<TK, double>::value)
     {
-        // only gamma-only has full size of Hamiltonian of DeePKS now, 
+        // only gamma-only has full size of Hamiltonian of DeePKS now,
         // multi-k keep same size of nonlocal operator, H_V_delta will be allocated by hR
         this->H_V_delta->allocate(nullptr, true);
         // expand hR with H_V_delta, only gamma-only case now
@@ -144,20 +146,17 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* Gr
 }
 #endif
 
-template<>
+template <>
 void DeePKS<OperatorLCAO<double, double>>::contributeHR()
 {
     ModuleBase::TITLE("DeePKS", "contributeHR");
 #ifdef __DEEPKS
-    if(GlobalC::ld.get_hr_cal())
+    if (GlobalC::ld.get_hr_cal())
     {
         ModuleBase::timer::tick("DeePKS", "contributeHR");
         const Parallel_Orbitals* pv = this->LM->ParaV;
-        GlobalC::ld.cal_projected_DM(this->DM,
-            *this->ucell,
-            GlobalC::ORB,
-            GlobalC::GridD);
-        GlobalC::ld.cal_descriptor(this->ucell->nat);        
+        GlobalC::ld.cal_projected_DM(this->DM, *this->ucell, GlobalC::ORB, GlobalC::GridD);
+        GlobalC::ld.cal_descriptor(this->ucell->nat);
         GlobalC::ld.cal_gedm(this->ucell->nat);
         // recalculate the H_V_delta
         this->H_V_delta->set_zero();
@@ -172,7 +171,7 @@ void DeePKS<OperatorLCAO<double, double>>::contributeHR()
 #endif
 }
 
-template<>
+template <>
 void DeePKS<OperatorLCAO<std::complex<double>, double>>::contributeHR()
 {
 #ifdef __DEEPKS
@@ -180,20 +179,17 @@ void DeePKS<OperatorLCAO<std::complex<double>, double>>::contributeHR()
     // if DM_K changed, HR of DeePKS need to refresh.
     // the judgement is based on the status of HR in GlobalC::ld
     // this operator should be informed that DM_K has changed and HR need to recalculate.
-    if(GlobalC::ld.get_hr_cal())
+    if (GlobalC::ld.get_hr_cal())
     {
         ModuleBase::timer::tick("DeePKS", "contributeHR");
 
-        GlobalC::ld.cal_projected_DM_k(this->DM,
-            *this->ucell,
-            GlobalC::ORB,
-            GlobalC::GridD);
+        GlobalC::ld.cal_projected_DM_k(this->DM, *this->ucell, GlobalC::ORB, GlobalC::GridD);
         GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
-        
+
         // recalculate the H_V_delta
-        if(this->H_V_delta == nullptr)
+        if (this->H_V_delta == nullptr)
         {
             this->H_V_delta = new hamilt::HContainer<double>(*this->hR);
         }
@@ -201,14 +197,14 @@ void DeePKS<OperatorLCAO<std::complex<double>, double>>::contributeHR()
         this->calculate_HR();
 
         GlobalC::ld.set_hr_cal(false);
-        
+
         ModuleBase::timer::tick("DeePKS", "contributeHR");
-    } 
+    }
     // save H_V_delta to hR
     this->hR->add(*this->H_V_delta);
 #endif
 }
-template<>
+template <>
 void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contributeHR()
 {
 #ifdef __DEEPKS
@@ -216,20 +212,17 @@ void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contribut
     // if DM_K changed, HR of DeePKS need to refresh.
     // the judgement is based on the status of HR in GlobalC::ld
     // this operator should be informed that DM_K has changed and HR need to recalculate.
-    if(GlobalC::ld.get_hr_cal())
+    if (GlobalC::ld.get_hr_cal())
     {
         ModuleBase::timer::tick("DeePKS", "contributeHR");
 
-        GlobalC::ld.cal_projected_DM_k(this->DM,
-            *this->ucell,
-            GlobalC::ORB,
-            GlobalC::GridD);
+        GlobalC::ld.cal_projected_DM_k(this->DM, *this->ucell, GlobalC::ORB, GlobalC::GridD);
         GlobalC::ld.cal_descriptor(this->ucell->nat);
         // calculate dE/dD
         GlobalC::ld.cal_gedm(this->ucell->nat);
 
         // recalculate the H_V_delta
-        if(this->H_V_delta == nullptr)
+        if (this->H_V_delta == nullptr)
         {
             this->H_V_delta = new hamilt::HContainer<std::complex<double>>(*this->hR);
         }
@@ -237,9 +230,9 @@ void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contribut
         this->calculate_HR();
 
         GlobalC::ld.set_hr_cal(false);
-        
+
         ModuleBase::timer::tick("DeePKS", "contributeHR");
-    } 
+    }
     // save H_V_delta to hR
     this->hR->add(*this->H_V_delta);
 
@@ -249,7 +242,9 @@ void DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>::contribut
 #ifdef __DEEPKS
 
 template <typename TK, typename TR>
-void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(const int iat0, std::vector<std::unordered_map<int, std::vector<double>>>& nlm_in)
+void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(
+    const int iat0,
+    std::vector<std::unordered_map<int, std::vector<double>>>& nlm_in)
 {
     const Parallel_Orbitals* paraV = this->LM->ParaV;
     const int npol = this->ucell->get_npol();
@@ -282,18 +277,19 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(const int i
             // and size of outer vector is then 4
             // inner loop : all projectors (L0,M0)
 
-            int L1 = atom1->iw2l[ iw1 ];
-            int N1 = atom1->iw2n[ iw1 ];
-            int m1 = atom1->iw2m[ iw1 ];
+            int L1 = atom1->iw2l[iw1];
+            int N1 = atom1->iw2n[iw1];
+            int m1 = atom1->iw2m[iw1];
 
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-            int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+            int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
             ModuleBase::Vector3<double> dtau = tau0 - tau1;
-            uot_->two_center_bundle->overlap_orb_alpha->snap(
-                    T1, L1, N1, M1, 0, dtau * ucell->lat0, 0 /*calc_deri*/, nlm);
+            uot_->two_center_bundle->overlap_orb_alpha
+                ->snap(T1, L1, N1, M1, 0, dtau * ucell->lat0, 0 /*calc_deri*/, nlm);
             nlm_in[ad].insert({all_indexes[iw1l], nlm[0]});
-            if(npol == 2) nlm_in[ad].insert({all_indexes[iw1l+1], nlm[0]});
+            if (npol == 2)
+                nlm_in[ad].insert({all_indexes[iw1l + 1], nlm[0]});
         }
     }
 }
@@ -302,7 +298,7 @@ template <typename TK, typename TR>
 void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
 {
     ModuleBase::TITLE("DeePKS", "calculate_HR");
-    if(this->H_V_delta->size_atom_pairs() == 0)
+    if (this->H_V_delta->size_atom_pairs() == 0)
     {
         return;
     }
@@ -321,49 +317,49 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
         ucell->iat2iait(iat0, &I0, &T0);
         AdjacentAtomInfo& adjs = this->adjs_all[iat0];
 
-        //trace alpha orbital
+        // trace alpha orbital
         std::vector<int> trace_alpha_row;
         std::vector<int> trace_alpha_col;
         std::vector<double> gedms;
-        if(!GlobalV::deepks_equiv)
+        if (!GlobalV::deepks_equiv)
         {
-            int ib=0;
-            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax();++L0)
+            int ib = 0;
+            for (int L0 = 0; L0 <= orb.Alpha[0].getLmax(); ++L0)
             {
-                for (int N0 = 0;N0 < orb.Alpha[0].getNchi(L0);++N0)
+                for (int N0 = 0; N0 < orb.Alpha[0].getNchi(L0); ++N0)
                 {
                     const int inl = GlobalC::ld.get_inl(T0, I0, L0, N0);
                     const double* pgedm = GlobalC::ld.get_gedms(inl);
-                    const int nm = 2*L0+1;
-            
-                    for (int m1=0; m1<nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
+                    const int nm = 2 * L0 + 1;
+
+                    for (int m1 = 0; m1 < nm; ++m1) // m1 = 1 for s, 3 for p, 5 for d
                     {
-                        for (int m2=0; m2<nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
+                        for (int m2 = 0; m2 < nm; ++m2) // m1 = 1 for s, 3 for p, 5 for d
                         {
-                            trace_alpha_row.push_back(ib+m1);
-                            trace_alpha_col.push_back(ib+m2);
-                            gedms.push_back(pgedm[m1*nm+m2]);
+                            trace_alpha_row.push_back(ib + m1);
+                            trace_alpha_col.push_back(ib + m2);
+                            gedms.push_back(pgedm[m1 * nm + m2]);
                         }
                     }
-                    ib+=nm;
+                    ib += nm;
                 }
             }
         }
         else
         {
-            const double * pgedm = GlobalC::ld.get_gedms(iat0);
+            const double* pgedm = GlobalC::ld.get_gedms(iat0);
             int nproj = 0;
-            for(int il = 0; il < GlobalC::ld.get_lmaxd() + 1; il++)
+            for (int il = 0; il < GlobalC::ld.get_lmaxd() + 1; il++)
             {
                 nproj += (2 * il + 1) * orb.Alpha[0].getNchi(il);
             }
-            for(int iproj = 0; iproj < nproj; iproj ++)
+            for (int iproj = 0; iproj < nproj; iproj++)
             {
-                for(int jproj = 0; jproj < nproj; jproj ++)
+                for (int jproj = 0; jproj < nproj; jproj++)
                 {
                     trace_alpha_row.push_back(iproj);
                     trace_alpha_col.push_back(jproj);
-                    gedms.push_back(pgedm[iproj*nproj+jproj]);
+                    gedms.push_back(pgedm[iproj * nproj + jproj]);
                 }
             }
         }
@@ -372,7 +368,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
 
         // if nlm_tot is not calculated already, calculate it on the fly now
         int iat00 = iat0;
-        if(nlm_tot.size() != this->ucell->nat)
+        if (nlm_tot.size() != this->ucell->nat)
         {
             iat00 = 0;
             nlm_tot[iat00].clear();
@@ -380,7 +376,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
         }
         std::vector<std::unordered_map<int, std::vector<double>>>& nlm_iat = nlm_tot[iat00];
 
-// 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
+        // 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
         for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
         {
             const int T1 = adjs.ntype[ad1];
@@ -389,14 +385,15 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
             ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
             auto row_indexes = paraV->get_indexes_row(iat1);
             const int row_size = row_indexes.size();
-            if(row_size == 0) continue;
+            if (row_size == 0)
+                continue;
 
             std::vector<double> s_1t(trace_alpha_size * row_size);
-            for(int irow=0;irow<row_size;irow++)
+            for (int irow = 0; irow < row_size; irow++)
             {
                 const double* row_ptr = nlm_iat[ad1][row_indexes[irow]].data();
                 double* ps1t = &s_1t[irow * trace_alpha_size];
-                for(int i=0;i<trace_alpha_size;i++)
+                for (int i = 0; i < trace_alpha_size; i++)
                 {
                     ps1t[i] = row_ptr[trace_alpha_row[i]] * gedms[i];
                 }
@@ -410,18 +407,20 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
                 ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
                                                   R_index2[1] - R_index1[1],
                                                   R_index2[2] - R_index1[2]);
-                hamilt::BaseMatrix<TR>* tmp = this->H_V_delta->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
+                hamilt::BaseMatrix<TR>* tmp
+                    = this->H_V_delta->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
                 // if not found , skip this pair of atoms
-                if (tmp == nullptr) continue;
+                if (tmp == nullptr)
+                    continue;
                 auto col_indexes = paraV->get_indexes_col(iat2);
                 const int col_size = col_indexes.size();
                 std::vector<double> hr_current(row_size * col_size, 0);
                 std::vector<double> s_2t(trace_alpha_size * col_size);
-                for(int icol=0;icol<col_size;icol++)
+                for (int icol = 0; icol < col_size; icol++)
                 {
                     const double* col_ptr = nlm_iat[ad2][col_indexes[icol]].data();
                     double* ps2t = &s_2t[icol * trace_alpha_size];
-                    for(int i=0;i<trace_alpha_size;i++)
+                    for (int i = 0; i < trace_alpha_size; i++)
                     {
                         ps2t[i] = col_ptr[trace_alpha_col[i]];
                     }
@@ -432,27 +431,27 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
                     {
                         for(int ialpha=0;ialpha<trace_alpha_size;ialpha++)
                         {
-                            tmp->get_pointer()[irow*col_size+icol] += 
+                            tmp->get_pointer()[irow*col_size+icol] +=
                                 s_1t[irow*trace_alpha_size+ialpha] * s_2t[icol*trace_alpha_size+ialpha];
                         }
                     }
                 }*/
-                //dgemm for s_2t and s_1t to get HR_12
-                constexpr char transa='T', transb='N';
+                // dgemm for s_2t and s_1t to get HR_12
+                constexpr char transa = 'T', transb = 'N';
                 const double gemm_alpha = 1.0, gemm_beta = 1.0;
-                dgemm_(
-                    &transa, &transb, 
-                    &col_size, 
-                    &row_size,
-                    &trace_alpha_size, 
-                    &gemm_alpha, 
-                    s_2t.data(), 
-                    &trace_alpha_size,
-                    s_1t.data(),  
-                    &trace_alpha_size,     
-                    &gemm_beta,      
-                    hr_current.data(),    
-                    &col_size);
+                dgemm_(&transa,
+                       &transb,
+                       &col_size,
+                       &row_size,
+                       &trace_alpha_size,
+                       &gemm_alpha,
+                       s_2t.data(),
+                       &trace_alpha_size,
+                       s_1t.data(),
+                       &trace_alpha_size,
+                       &gemm_beta,
+                       hr_current.data(),
+                       &col_size);
                 // add data of HR to target BaseMatrix
                 this->cal_HR_IJR(hr_current.data(), row_size, col_size, tmp->get_pointer());
             }
@@ -463,11 +462,10 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
 
 // cal_HR_IJR()
 template <typename TK, typename TR>
-void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(
-    const double* hr_in,
-    const int& row_size,
-    const int& col_size,
-    TR* data_pointer)
+void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const double* hr_in,
+                                                              const int& row_size,
+                                                              const int& col_size,
+                                                              TR* data_pointer)
 {
 
     // npol is the number of polarizations,
@@ -506,18 +504,18 @@ inline void get_h_delta_k(int ik, std::complex<double>*& h_delta_k)
 }
 
 // contributeHk()
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
 {
     ModuleBase::TITLE("DeePKS", "contributeHk");
     ModuleBase::timer::tick("DeePKS", "contributeHk");
-    
+
     TK* h_delta_k = nullptr;
     get_h_delta_k(ik, h_delta_k);
     // set SK to zero and then calculate SK for each k vector
     ModuleBase::GlobalFunc::ZEROS(h_delta_k, this->hK->size());
 
-    if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
+    if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
     {
         const int nrow = this->LM->ParaV->get_row_size();
         hamilt::folding_HR(*this->H_V_delta, h_delta_k, this->kvec_d[ik], nrow, 1);
@@ -538,4 +536,4 @@ template class DeePKS<OperatorLCAO<std::complex<double>, double>>;
 
 template class DeePKS<OperatorLCAO<std::complex<double>, std::complex<double>>>;
 
-}
+} // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
index da64876d07..97f4191ae7 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
@@ -1,27 +1,26 @@
 #include "dftu_lcao.h"
 
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
 #include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_base/timer.h"
-#include "module_base/tool_title.h"
 #ifdef _OPENMP
 #include <unordered_set>
 #endif
 #include "module_base/parallel_reduce.h"
 
 template <typename TK, typename TR>
-hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(
-    LCAO_Matrix* LM_in,
-    const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-    hamilt::HContainer<TR>* hR_in,
-    std::vector<TK>* hK_in,
-    const UnitCell& ucell_in,
-    Grid_Driver* GridD_in,
-    const ORB_gen_tables* uot,
-    ModuleDFTU::DFTU* dftu_in,
-    const Parallel_Orbitals& paraV)
+hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(LCAO_Matrix* LM_in,
+                                                 const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
+                                                 hamilt::HContainer<TR>* hR_in,
+                                                 std::vector<TK>* hK_in,
+                                                 const UnitCell& ucell_in,
+                                                 Grid_Driver* GridD_in,
+                                                 const ORB_gen_tables* uot,
+                                                 ModuleDFTU::DFTU* dftu_in,
+                                                 const Parallel_Orbitals& paraV)
     : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), uot_(uot)
 {
     this->cal_type = calculation_type::lcao_dftu;
@@ -32,7 +31,7 @@ hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(
 #endif
     // initialize HR to allocate sparse Nonlocal matrix memory
     this->initialize_HR(GridD_in, &paraV);
-    //set nspin
+    // set nspin
     this->nspin = GlobalV::NSPIN;
 }
 
@@ -44,8 +43,7 @@ hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::~DFTU()
 
 // initialize_HR()
 template <typename TK, typename TR>
-void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
-                                                                      const Parallel_Orbitals* paraV)
+void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
 {
     ModuleBase::TITLE("DFTU", "initialize_HR");
     ModuleBase::timer::tick("DFTU", "initialize_HR");
@@ -58,7 +56,8 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* Grid
         int T0, I0;
         ucell->iat2iait(iat0, &I0, &T0);
         const int target_L = this->dftu->orbital_corr[T0];
-        if(target_L == -1) continue;
+        if (target_L == -1)
+            continue;
 
         AdjacentAtomInfo adjs;
         GridD->Find_atom(*ucell, tau0, T0, I0, &adjs);
@@ -72,8 +71,8 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* Grid
             const ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat0, iat1, R_index1).norm() * this->ucell->lat0
                 < orb.Phi[T1].getRcut() + GlobalV::onsite_radius)
@@ -92,7 +91,8 @@ template <typename TK, typename TR>
 void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbitals* paraV)
 {
     ModuleBase::TITLE("DFTU", "cal_nlm_all");
-    if(this->precal_nlm_done) return;
+    if (this->precal_nlm_done)
+        return;
     ModuleBase::timer::tick("DFTU", "cal_nlm_all");
     nlm_tot.resize(this->ucell->nat);
     const int npol = this->ucell->get_npol();
@@ -103,8 +103,9 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
         int T0, I0;
         ucell->iat2iait(iat0, &I0, &T0);
         const int target_L = this->dftu->orbital_corr[T0];
-        if(target_L == -1) continue;
-        const int tlp1 = 2*target_L+1;
+        if (target_L == -1)
+            continue;
+        const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
 
         // calculate and save the table of two-center integrals
@@ -130,9 +131,9 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
                 const int iw1 = all_indexes[iw1l] / npol;
                 // only first zeta orbitals in target L of atom iat0 are needed
                 std::vector<double> nlm_target(tlp1);
-                const int L1 = atom1->iw2l[ iw1 ];
-                const int N1 = atom1->iw2n[ iw1 ];
-                const int m1 = atom1->iw2m[ iw1 ];
+                const int L1 = atom1->iw2l[iw1];
+                const int N1 = atom1->iw2n[iw1];
+                const int m1 = atom1->iw2m[iw1];
                 std::vector<std::vector<double>> nlm;
                 // nlm is a vector of vectors, but size of outer vector is only 1 here
                 // If we are calculating force, we need also to store the gradient
@@ -140,20 +141,20 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
                 // inner loop : all projectors (L0,M0)
 
                 // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                const int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                const int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                 ModuleBase::Vector3<double> dtau = tau0 - tau1;
-                uot_->two_center_bundle->overlap_orb_onsite->snap(
-                        T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
+                uot_->two_center_bundle->overlap_orb_onsite
+                    ->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
                 // select the elements of nlm with target_L
-                for(int iw =0;iw < this->ucell->atoms[T0].nw; iw++)
+                for (int iw = 0; iw < this->ucell->atoms[T0].nw; iw++)
                 {
                     const int L0 = this->ucell->atoms[T0].iw2l[iw];
-                    if(L0 == target_L)
+                    if (L0 == target_L)
                     {
-                        for(int m = 0; m < 2*L0+1; m++)
+                        for (int m = 0; m < 2 * L0 + 1; m++)
                         {
-                            nlm_target[m] = nlm[0][iw+m];
+                            nlm_target[m] = nlm[0][iw + m];
                         }
                         break;
                     }
@@ -171,14 +172,15 @@ template <typename TK, typename TR>
 void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
 {
     ModuleBase::TITLE("DFTU", "contributeHR");
-    if(this->dftu->get_dmr(0) == nullptr && this->dftu->initialed_locale == false)
-    {// skip the calculation if dm_in_dftu is nullptr
+    if (this->dftu->get_dmr(0) == nullptr && this->dftu->initialed_locale == false)
+    { // skip the calculation if dm_in_dftu is nullptr
         return;
     }
     else
     {
-        //will update this->dftu->locale and this->dftu->EU
-        if(this->current_spin == 0) this->dftu->EU = 0.0;
+        // will update this->dftu->locale and this->dftu->EU
+        if (this->current_spin == 0)
+            this->dftu->EU = 0.0;
     }
     ModuleBase::timer::tick("DFTU", "contributeHR");
 
@@ -195,15 +197,16 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         int T0, I0;
         ucell->iat2iait(iat0, &I0, &T0);
         const int target_L = this->dftu->orbital_corr[T0];
-        if(target_L == -1) continue;
-        const int tlp1 = 2*target_L+1;
+        if (target_L == -1)
+            continue;
+        const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
 
         ModuleBase::timer::tick("DFTU", "cal_occ");
-        //first iteration to calculate occupation matrix
+        // first iteration to calculate occupation matrix
         const int spin_fold = (this->nspin == 4) ? 4 : 1;
         std::vector<double> occ(tlp1 * tlp1 * spin_fold, 0.0);
-        if(this->dftu->initialed_locale == false)
+        if (this->dftu->initialed_locale == false)
         {
             const hamilt::HContainer<double>* dmR_current = this->dftu->get_dmr(this->current_spin);
             for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
@@ -212,18 +215,19 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
                 const int I1 = adjs.natom[ad1];
                 const int iat1 = ucell->itia2iat(T1, I1);
                 ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
-                const std::unordered_map<int,std::vector<double>>& nlm1 = nlm_tot[iat0][ad1];
+                const std::unordered_map<int, std::vector<double>>& nlm1 = nlm_tot[iat0][ad1];
                 for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
                 {
                     const int T2 = adjs.ntype[ad2];
                     const int I2 = adjs.natom[ad2];
                     const int iat2 = ucell->itia2iat(T2, I2);
-                    const std::unordered_map<int,std::vector<double>>& nlm2 = nlm_tot[iat0][ad2];
+                    const std::unordered_map<int, std::vector<double>>& nlm2 = nlm_tot[iat0][ad2];
                     ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
                     ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
-                                                    R_index2[1] - R_index1[1],
-                                                    R_index2[2] - R_index1[2]);
-                    const hamilt::BaseMatrix<double>* tmp = dmR_current->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
+                                                      R_index2[1] - R_index1[1],
+                                                      R_index2[2] - R_index1[2]);
+                    const hamilt::BaseMatrix<double>* tmp
+                        = dmR_current->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
                     // if not found , skip this pair of atoms
                     if (tmp != nullptr)
                     {
@@ -231,28 +235,29 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
                     }
                 }
             }
-    #ifdef __MPI
+#ifdef __MPI
             // sum up the occupation matrix
             Parallel_Reduce::reduce_all(occ.data(), occ.size());
-    #endif
+#endif
             // save occ to dftu
-            for(int i=0;i<occ.size();i++)
+            for (int i = 0; i < occ.size(); i++)
             {
-                if(this->nspin==1) occ[i] *= 0.5;
+                if (this->nspin == 1)
+                    occ[i] *= 0.5;
                 this->dftu->locale[iat0][target_L][0][this->current_spin].c[i] = occ[i];
             }
         }
         else // use readin locale to calculate occupation matrix
         {
-            for(int i=0;i<occ.size();i++)
+            for (int i = 0; i < occ.size(); i++)
             {
                 occ[i] = this->dftu->locale[iat0][target_L][0][this->current_spin].c[i];
             }
             // set initialed_locale to false to avoid using readin locale in next iteration
         }
         ModuleBase::timer::tick("DFTU", "cal_occ");
-        
-        //calculate VU
+
+        // calculate VU
         ModuleBase::timer::tick("DFTU", "cal_vu");
         const double u_value = this->dftu->U[T0];
         std::vector<double> VU_tmp(occ.size());
@@ -270,13 +275,13 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
             const int I1 = adjs.natom[ad1];
             const int iat1 = ucell->itia2iat(T1, I1);
             ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
-            const std::unordered_map<int,std::vector<double>>& nlm1 = nlm_tot[iat0][ad1];
+            const std::unordered_map<int, std::vector<double>>& nlm1 = nlm_tot[iat0][ad1];
             for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
             {
                 const int T2 = adjs.ntype[ad2];
                 const int I2 = adjs.natom[ad2];
                 const int iat2 = ucell->itia2iat(T2, I2);
-                const std::unordered_map<int,std::vector<double>>& nlm2 = nlm_tot[iat0][ad2];
+                const std::unordered_map<int, std::vector<double>>& nlm2 = nlm_tot[iat0][ad2];
                 ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
                 ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
                                                   R_index2[1] - R_index1[1],
@@ -292,15 +297,18 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         ModuleBase::timer::tick("DFTU", "cal_vu");
     }
 
-    //energy correction for NSPIN=1
-    if(this->nspin==1) this->dftu->EU *= 2.0;
+    // energy correction for NSPIN=1
+    if (this->nspin == 1)
+        this->dftu->EU *= 2.0;
     // for readin onsite_dm, set initialed_locale to false to avoid using readin locale in next iteration
-    if(this->current_spin == this->nspin-1 || this->nspin==4) this->dftu->initialed_locale = false;
+    if (this->current_spin == this->nspin - 1 || this->nspin == 4)
+        this->dftu->initialed_locale = false;
 
     // update this->current_spin: only nspin=2 iterate change it between 0 and 1
-    // the key point is only nspin=2 calculate spin-up and spin-down separately, 
+    // the key point is only nspin=2 calculate spin-up and spin-down separately,
     // and won't calculate spin-up twice without spin-down
-    if(this->nspin == 2) this->current_spin = 1 - this->current_spin;
+    if (this->nspin == 2)
+        this->current_spin = 1 - this->current_spin;
 
     ModuleBase::timer::tick("DFTU", "contributeHR");
 }
@@ -326,9 +334,9 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(
     // ---------------------------------------------
     auto row_indexes = paraV->get_indexes_row(iat1);
     auto col_indexes = paraV->get_indexes_col(iat2);
-    const int m_size = int(sqrt(VU.size())/npol);
+    const int m_size = int(sqrt(VU.size()) / npol);
     // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
-    std::vector<int> step_trace(npol*npol, 0);
+    std::vector<int> step_trace(npol * npol, 0);
     for (int is = 0; is < npol; is++)
     {
         for (int is2 = 0; is2 < npol; is2++)
@@ -347,13 +355,13 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(
 #ifdef __DEBUG
             assert(nlm1.size() == nlm2.size());
 #endif
-            for (int is = 0; is < npol*npol; ++is)
+            for (int is = 0; is < npol * npol; ++is)
             {
                 int start = is * m_size * m_size;
                 TR nlm_tmp = TR(0);
                 for (int m1 = 0; m1 < m_size; m1++)
                 {
-                    for(int m2 = 0; m2 < m_size; m2++)
+                    for (int m2 = 0; m2 < m_size; m2++)
                     {
                         nlm_tmp += nlm1[m1] * nlm2[m2] * VU[m1 * m_size + m2 + start];
                     }
@@ -367,14 +375,13 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(
 }
 
 template <typename TK, typename TR>
-void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_occ(
-    const int& iat1,
-    const int& iat2,
-    const Parallel_Orbitals* paraV,
-    const std::unordered_map<int, std::vector<double>>& nlm1_all,
-    const std::unordered_map<int, std::vector<double>>& nlm2_all,
-    const double* dm_pointer,
-    std::vector<double>& occ)
+void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_occ(const int& iat1,
+                                                         const int& iat2,
+                                                         const Parallel_Orbitals* paraV,
+                                                         const std::unordered_map<int, std::vector<double>>& nlm1_all,
+                                                         const std::unordered_map<int, std::vector<double>>& nlm2_all,
+                                                         const double* dm_pointer,
+                                                         std::vector<double>& occ)
 {
 
     // npol is the number of polarizations,
@@ -386,7 +393,7 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_occ(
     // ---------------------------------------------
     auto row_indexes = paraV->get_indexes_row(iat1);
     auto col_indexes = paraV->get_indexes_col(iat2);
-    const int m_size = int(sqrt(occ.size())/npol);
+    const int m_size = int(sqrt(occ.size()) / npol);
     const int m_size2 = m_size * m_size;
 #ifdef __DEBUG
     assert(m_size * m_size == occ.size());
@@ -412,14 +419,14 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_occ(
 #endif
             for (int is1 = 0; is1 < npol; ++is1)
             {
-                for(int is2 = 0;is2 < npol; ++is2)
+                for (int is2 = 0; is2 < npol; ++is2)
                 {
                     for (int m1 = 0; m1 < m_size; ++m1)
                     {
-                        for(int m2 = 0; m2 < m_size; ++m2)
+                        for (int m2 = 0; m2 < m_size; ++m2)
                         {
-                            occ[m1 * m_size + m2 + (is1 * npol + is2) * m_size2] += 
-                                nlm1[m1] * nlm2[m2] * dm_pointer[step_trace[is1 * npol + is2]];
+                            occ[m1 * m_size + m2 + (is1 * npol + is2) * m_size2]
+                                += nlm1[m1] * nlm2[m2] * dm_pointer[step_trace[is1 * npol + is2]];
                         }
                     }
                 }
@@ -430,41 +437,43 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_occ(
     }
 }
 
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::transfer_vu(std::vector<double>& vu_tmp, std::vector<TR>& vu)
 {
 #ifdef __DEBUG
     assert(vu.size() == vu_tmp.size());
 #endif
-    for(int i=0;i<vu_tmp.size();i++)
+    for (int i = 0; i < vu_tmp.size(); i++)
     {
         vu[i] = vu_tmp[i];
     }
 }
 
-template<>
-void hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>::transfer_vu(std::vector<double>& vu_tmp, std::vector<std::complex<double>>& vu)
+template <>
+void hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>::transfer_vu(
+    std::vector<double>& vu_tmp,
+    std::vector<std::complex<double>>& vu)
 {
 #ifdef __DEBUG
     assert(vu.size() == vu_tmp.size());
 #endif
-    
+
     // TR == std::complex<double> transfer from double to std::complex<double>
-    const int m_size = int(sqrt(vu.size())/2);
+    const int m_size = int(sqrt(vu.size()) / 2);
     const int m_size2 = m_size * m_size;
     vu.resize(vu_tmp.size());
-    for(int m1=0;m1<m_size;m1++)
+    for (int m1 = 0; m1 < m_size; m1++)
     {
-        for(int m2=0;m2<m_size;m2++)
+        for (int m2 = 0; m2 < m_size; m2++)
         {
             int index[4];
-            index[0] = m1*m_size+m2;
-            index[1] = m1*m_size+m2 + m_size2;
-            index[2] = m2*m_size+m1 + m_size2 * 2;
-            index[3] = m2*m_size+m1 + m_size2 * 3;
+            index[0] = m1 * m_size + m2;
+            index[1] = m1 * m_size + m2 + m_size2;
+            index[2] = m2 * m_size + m1 + m_size2 * 2;
+            index[3] = m2 * m_size + m1 + m_size2 * 3;
             vu[index[0]] = 0.5 * (vu_tmp[index[0]] + vu_tmp[index[3]]);
             vu[index[3]] = 0.5 * (vu_tmp[index[0]] - vu_tmp[index[3]]);
-            //vu should be complex<double> type, but here we use double type for test
+            // vu should be complex<double> type, but here we use double type for test
             vu[index[1]] = 0.5 * (vu_tmp[index[1]] + std::complex<double>(0.0, 1.0) * vu_tmp[index[2]]);
             vu[index[2]] = 0.5 * (vu_tmp[index[1]] - std::complex<double>(0.0, 1.0) * vu_tmp[index[2]]);
         }
@@ -472,44 +481,43 @@ void hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, std::complex<double
 }
 
 template <typename TK, typename TR>
-void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_v_of_u(
-    const std::vector<double>& occ,
-    const int m_size,
-    const double u_value,
-    double* vu,
-    double& eu)
+void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_v_of_u(const std::vector<double>& occ,
+                                                            const int m_size,
+                                                            const double u_value,
+                                                            double* vu,
+                                                            double& eu)
 {
     // calculate the local matrix
     int spin_fold = occ.size() / m_size / m_size;
-    if(spin_fold < 4)
-    for(int is=0;is<spin_fold;++is)
-    {
-        int start = is * m_size * m_size;
-        for (int m1 = 0; m1 < m_size; m1++)
+    if (spin_fold < 4)
+        for (int is = 0; is < spin_fold; ++is)
         {
-            for(int m2 = 0; m2 < m_size; m2++)
+            int start = is * m_size * m_size;
+            for (int m1 = 0; m1 < m_size; m1++)
             {
-                vu[start + m1 * m_size + m2] = u_value * (0.5 * (m1 == m2) - occ[start + m2 * m_size + m1]);
-                eu += u_value * 0.5 * occ[start + m2 * m_size + m1] * occ[start + m1 * m_size + m2];
+                for (int m2 = 0; m2 < m_size; m2++)
+                {
+                    vu[start + m1 * m_size + m2] = u_value * (0.5 * (m1 == m2) - occ[start + m2 * m_size + m1]);
+                    eu += u_value * 0.5 * occ[start + m2 * m_size + m1] * occ[start + m1 * m_size + m2];
+                }
             }
         }
-    }
     else
     {
         for (int m1 = 0; m1 < m_size; m1++)
         {
-            for(int m2 = 0; m2 < m_size; m2++)
+            for (int m2 = 0; m2 < m_size; m2++)
             {
                 vu[m1 * m_size + m2] = u_value * (1.0 * (m1 == m2) - occ[m2 * m_size + m1]);
                 eu += u_value * 0.25 * occ[m2 * m_size + m1] * occ[m1 * m_size + m2];
             }
         }
-        for(int is=1;is<spin_fold;++is)
+        for (int is = 1; is < spin_fold; ++is)
         {
             int start = is * m_size * m_size;
             for (int m1 = 0; m1 < m_size; m1++)
             {
-                for(int m2 = 0; m2 < m_size; m2++)
+                for (int m2 = 0; m2 < m_size; m2++)
                 {
                     vu[start + m1 * m_size + m2] = u_value * (0 - occ[start + m2 * m_size + m1]);
                     eu += u_value * 0.25 * occ[start + m2 * m_size + m1] * occ[start + m1 * m_size + m2];
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
index 456f81c8ca..b2a0d020d1 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
@@ -1,11 +1,11 @@
 #include "ekinetic_new.h"
 
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
 #include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_base/timer.h"
-#include "module_base/tool_title.h"
 
 // Constructor
 template <typename TK, typename TR>
@@ -67,8 +67,8 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Drive
             const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad1];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat1, iat2, R_index2).norm() * this->ucell->lat0
                 < orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut())
@@ -98,7 +98,7 @@ template <typename TK, typename TR>
 void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
 {
     ModuleBase::TITLE("EkineticNew", "calculate_HR");
-    if(this->HR_fixed==nullptr || this->HR_fixed->size_atom_pairs()<=0)
+    if (this->HR_fixed == nullptr || this->HR_fixed->size_atom_pairs() <= 0)
     {
         ModuleBase::WARNING_QUIT("hamilt::EkineticNew::calculate_HR", "HR_fixed is nullptr or empty");
     }
@@ -182,7 +182,7 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& ia
         const int m1 = iw2m1[iw1];
 
         // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-        int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+        int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
         for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
         {
@@ -192,10 +192,10 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& ia
             const int m2 = iw2m2[iw2];
 
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-            int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
+            int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
 
-            uot_->two_center_bundle->kinetic_orb->calculate(T1, L1, N1, M1,
-                    T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
+            uot_->two_center_bundle->kinetic_orb
+                ->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 data_pointer[ipol * step_trace] += olm[0];
@@ -230,7 +230,7 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
             this->HR_fixed->set_zero();
             this->allocated = true;
         }
-        if(this->next_sub_op != nullptr)
+        if (this->next_sub_op != nullptr)
         {
             // pass pointer of HR_fixed to the next node
             static_cast<OperatorLCAO<TK, TR>*>(this->next_sub_op)->set_HR_fixed(this->HR_fixed);
@@ -240,7 +240,7 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         this->HR_fixed_done = true;
     }
     // last node of sub-chain, add HR_fixed into HR
-    if(this->next_sub_op == nullptr)
+    if (this->next_sub_op == nullptr)
     {
         this->hR->add(*(this->HR_fixed));
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
index 9a567bf6df..cf1e3d8931 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
@@ -1,11 +1,11 @@
 #include "nonlocal_new.h"
 
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
 #include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_base/timer.h"
-#include "module_base/tool_title.h"
 #ifdef _OPENMP
 #include <unordered_set>
 #endif
@@ -68,8 +68,8 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Drive
             const ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat0, iat1, R_index1).norm() * this->ucell->lat0
                 < orb.Phi[T1].getRcut() + this->ucell->infoNL.Beta[T0].get_rcut_max())
@@ -122,99 +122,100 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
     // 1. calculate <psi|beta> for each pair of atoms
 #ifdef _OPENMP
 #pragma omp parallel
-{
-    std::unordered_set<int> atom_row_list;
-    #pragma omp for
-    for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
     {
-        atom_row_list.insert(iat0);
-    }
+        std::unordered_set<int> atom_row_list;
+#pragma omp for
+        for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
+        {
+            atom_row_list.insert(iat0);
+        }
 #endif
-    for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
-    {
-        auto tau0 = ucell->get_tau(iat0);
-        int T0, I0;
-        ucell->iat2iait(iat0, &I0, &T0);
-        AdjacentAtomInfo& adjs = this->adjs_all[iat0];
+        for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
+        {
+            auto tau0 = ucell->get_tau(iat0);
+            int T0, I0;
+            ucell->iat2iait(iat0, &I0, &T0);
+            AdjacentAtomInfo& adjs = this->adjs_all[iat0];
 
-        std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
-        nlm_tot.resize(adjs.adj_num + 1);
+            std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
+            nlm_tot.resize(adjs.adj_num + 1);
 
-        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
-        {
-            const int T1 = adjs.ntype[ad];
-            const int I1 = adjs.natom[ad];
-            const int iat1 = ucell->itia2iat(T1, I1);
-            const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
-            const Atom* atom1 = &ucell->atoms[T1];
+            for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+            {
+                const int T1 = adjs.ntype[ad];
+                const int I1 = adjs.natom[ad];
+                const int iat1 = ucell->itia2iat(T1, I1);
+                const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
+                const Atom* atom1 = &ucell->atoms[T1];
 
-            auto all_indexes = paraV->get_indexes_row(iat1);
+                auto all_indexes = paraV->get_indexes_row(iat1);
 #ifdef _OPENMP
-            if(atom_row_list.find(iat1) == atom_row_list.end())
-            {
-                all_indexes.clear();
-            }
+                if (atom_row_list.find(iat1) == atom_row_list.end())
+                {
+                    all_indexes.clear();
+                }
 #endif
-            auto col_indexes = paraV->get_indexes_col(iat1);
-            // insert col_indexes into all_indexes to get universal set with no repeat elements
-            all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end());
-            std::sort(all_indexes.begin(), all_indexes.end());
-            all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
-            for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol)
-            {
-                const int iw1 = all_indexes[iw1l] / npol;
-                std::vector<std::vector<double>> nlm;
-                // nlm is a vector of vectors, but size of outer vector is only 1 here
-                // If we are calculating force, we need also to store the gradient
-                // and size of outer vector is then 4
-                // inner loop : all projectors (L0,M0)
+                auto col_indexes = paraV->get_indexes_col(iat1);
+                // insert col_indexes into all_indexes to get universal set with no repeat elements
+                all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end());
+                std::sort(all_indexes.begin(), all_indexes.end());
+                all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
+                for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol)
+                {
+                    const int iw1 = all_indexes[iw1l] / npol;
+                    std::vector<std::vector<double>> nlm;
+                    // nlm is a vector of vectors, but size of outer vector is only 1 here
+                    // If we are calculating force, we need also to store the gradient
+                    // and size of outer vector is then 4
+                    // inner loop : all projectors (L0,M0)
 
-                int L1 = atom1->iw2l[ iw1 ];
-                int N1 = atom1->iw2n[ iw1 ];
-                int m1 = atom1->iw2m[ iw1 ];
+                    int L1 = atom1->iw2l[iw1];
+                    int N1 = atom1->iw2n[iw1];
+                    int m1 = atom1->iw2m[iw1];
 
-                // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                    // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+                    int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
-                ModuleBase::Vector3<double> dtau = tau0 - tau1;
-                uot_->two_center_bundle->overlap_orb_beta->snap(
-                        T1, L1, N1, M1, T0, dtau * ucell->lat0, 0 /*cal_deri*/, nlm);
-                nlm_tot[ad].insert({all_indexes[iw1l], nlm[0]});
+                    ModuleBase::Vector3<double> dtau = tau0 - tau1;
+                    uot_->two_center_bundle->overlap_orb_beta
+                        ->snap(T1, L1, N1, M1, T0, dtau * ucell->lat0, 0 /*cal_deri*/, nlm);
+                    nlm_tot[ad].insert({all_indexes[iw1l], nlm[0]});
+                }
             }
-        }
-// 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
-        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
-        {
-            const int T1 = adjs.ntype[ad1];
-            const int I1 = adjs.natom[ad1];
-            const int iat1 = ucell->itia2iat(T1, I1);
-#ifdef _OPENMP
-            if(atom_row_list.find(iat1) == atom_row_list.end())
+            // 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
+            for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
             {
-                continue;
-            }
+                const int T1 = adjs.ntype[ad1];
+                const int I1 = adjs.natom[ad1];
+                const int iat1 = ucell->itia2iat(T1, I1);
+#ifdef _OPENMP
+                if (atom_row_list.find(iat1) == atom_row_list.end())
+                {
+                    continue;
+                }
 #endif
-            ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
-            for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
-            {
-                const int T2 = adjs.ntype[ad2];
-                const int I2 = adjs.natom[ad2];
-                const int iat2 = ucell->itia2iat(T2, I2);
-                ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
-                ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
-                                                  R_index2[1] - R_index1[1],
-                                                  R_index2[2] - R_index1[2]);
-                hamilt::BaseMatrix<TR>* tmp = this->HR_fixed->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
-                // if not found , skip this pair of atoms
-                if (tmp != nullptr)
+                ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
+                for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
                 {
-                    this->cal_HR_IJR(iat1, iat2, T0, paraV, nlm_tot[ad1], nlm_tot[ad2], tmp->get_pointer());
+                    const int T2 = adjs.ntype[ad2];
+                    const int I2 = adjs.natom[ad2];
+                    const int iat2 = ucell->itia2iat(T2, I2);
+                    ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
+                    ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
+                                                      R_index2[1] - R_index1[1],
+                                                      R_index2[2] - R_index1[2]);
+                    hamilt::BaseMatrix<TR>* tmp
+                        = this->HR_fixed->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
+                    // if not found , skip this pair of atoms
+                    if (tmp != nullptr)
+                    {
+                        this->cal_HR_IJR(iat1, iat2, T0, paraV, nlm_tot[ad1], nlm_tot[ad2], tmp->get_pointer());
+                    }
                 }
             }
         }
-    }
 #ifdef _OPENMP
-}
+    }
 #endif
 
     ModuleBase::timer::tick("NonlocalNew", "calculate_HR");
@@ -303,7 +304,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
             this->HR_fixed->set_zero();
             this->allocated = true;
         }
-        if(this->next_sub_op != nullptr)
+        if (this->next_sub_op != nullptr)
         {
             // pass pointer of HR_fixed to the next node
             static_cast<OperatorLCAO<TK, TR>*>(this->next_sub_op)->set_HR_fixed(this->HR_fixed);
@@ -313,7 +314,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         this->HR_fixed_done = true;
     }
     // last node of sub-chain, add HR_fixed into HR
-    if(this->next_sub_op == nullptr)
+    if (this->next_sub_op == nullptr)
     {
         this->hR->add(*(this->HR_fixed));
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
index dcdfc2de2a..ba91757a90 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
@@ -1,11 +1,11 @@
 #include "overlap_new.h"
 
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
 #include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_base/timer.h"
-#include "module_base/tool_title.h"
 
 template <typename TK, typename TR>
 hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::OverlapNew(LCAO_Matrix* LM_in,
@@ -58,8 +58,8 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::initialize_SR(Grid_Driver
             const ModuleBase::Vector3<int>& R_index = adjs.box[ad];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat1, iat2, R_index).norm() * this->ucell->lat0
                 >= orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut())
@@ -152,7 +152,7 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
         const int m1 = iw2m1[iw1];
 
         // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-        int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+        int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
         for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
         {
@@ -162,9 +162,9 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
             const int m2 = iw2m2[iw2];
 
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-            int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
-            uot_->two_center_bundle->overlap_orb->calculate(T1, L1, N1, M1,
-                    T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
+            int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+            uot_->two_center_bundle->overlap_orb
+                ->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 data_pointer[ipol * step_trace] += olm[0];
@@ -200,7 +200,7 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
     ModuleBase::timer::tick("OverlapNew", "contributeHk");
     // set SK to zero and then calculate SK for each k vector
     ModuleBase::GlobalFunc::ZEROS(this->SK_pointer->data(), this->SK_pointer->size());
-    if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
+    if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
     {
         const int nrow = this->SR->get_atom_pair(0).get_paraV()->get_row_size();
         hamilt::folding_HR(*this->SR, this->SK_pointer->data(), this->kvec_d[ik], nrow, 1);
@@ -216,10 +216,10 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::contributeHk(int ik)
     ModuleBase::timer::tick("OverlapNew", "contributeHk");
 }
 
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 TK* hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::getSk()
 {
-    if(this->SK_pointer != nullptr)
+    if (this->SK_pointer != nullptr)
     {
         return this->SK_pointer->data();
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
index 681af45ecf..38588d7280 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/td_ekinetic_lcao.cpp
@@ -1,40 +1,40 @@
 #include "td_ekinetic_lcao.h"
+
+#include "module_base/libm/libm.h"
 #include "module_base/timer.h"
 #include "module_base/tool_title.h"
-#include "module_hamilt_pw/hamilt_pwdft/global.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/center2_orb-orb11.h"
-#include "module_elecstate/potentials/H_TDDFT_pw.h"
-#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/spar_hsr.h"
-
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
-#include "module_base/libm/libm.h"
+#include "module_elecstate/potentials/H_TDDFT_pw.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/LCAO_matrix.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/center2_orb-orb11.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/spar_hsr.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
+#include "module_hamilt_pw/hamilt_pwdft/global.h"
 
 namespace hamilt
 {
 template <typename TK, typename TR>
 TDEkinetic<OperatorLCAO<TK, TR>>::TDEkinetic(LCAO_Matrix* LM_in,
-                                                   hamilt::HContainer<TR>* hR_in,
-                                                   std::vector<TK>* hK_in,
-                                                   hamilt::HContainer<TR>* SR_in,
-                                                   const K_Vectors* kv_in,
-                                                   const UnitCell* ucell_in,
-                                                   Grid_Driver* GridD_in)
+                                             hamilt::HContainer<TR>* hR_in,
+                                             std::vector<TK>* hK_in,
+                                             hamilt::HContainer<TR>* SR_in,
+                                             const K_Vectors* kv_in,
+                                             const UnitCell* ucell_in,
+                                             Grid_Driver* GridD_in)
     : SR(SR_in), kv(kv_in), OperatorLCAO<TK, TR>(LM_in, kv_in->kvec_d, hR_in, hK_in)
+{
+    this->LM = LM_in;
+    this->ucell = ucell_in;
+    this->cal_type = calculation_type::lcao_tddft_velocity;
+    this->Grid = GridD_in;
+    this->init_td();
+    // initialize HR to get adjs info.
+    this->initialize_HR(Grid, this->LM->ParaV);
+    if (TD_Velocity::out_mat_R == true)
     {
-        this->LM = LM_in;
-        this->ucell = ucell_in;
-        this->cal_type = calculation_type::lcao_tddft_velocity;
-        this->Grid = GridD_in;
-        this->init_td();
-        // initialize HR to get adjs info.
-        this->initialize_HR(Grid,this->LM->ParaV);
-        if(TD_Velocity::out_mat_R == true)
-        {
-            out_mat_R = true;
-        }
+        out_mat_R = true;
     }
+}
 template <typename TK, typename TR>
 TDEkinetic<OperatorLCAO<TK, TR>>::~TDEkinetic()
 {
@@ -44,31 +44,38 @@ TDEkinetic<OperatorLCAO<TK, TR>>::~TDEkinetic()
     }
     TD_Velocity::td_vel_op = nullptr;
 }
-//term A^2*S
+// term A^2*S
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_scalar(std::complex<double>* Hloc, TR* Sloc, int nnr){
-	return;
+void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_scalar(std::complex<double>* Hloc, TR* Sloc, int nnr)
+{
+    return;
 }
-//term A^2*S
+// term A^2*S
 template <>
-void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::td_ekinetic_scalar(std::complex<double>* Hloc, double* Sloc, int nnr){
-	std::complex<double> tmp = {cart_At.norm2()*Sloc[nnr]/4.0, 0};
-	Hloc[nnr] += tmp;
-	return;
+void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::td_ekinetic_scalar(std::complex<double>* Hloc,
+                                                                                double* Sloc,
+                                                                                int nnr)
+{
+    std::complex<double> tmp = {cart_At.norm2() * Sloc[nnr] / 4.0, 0};
+    Hloc[nnr] += tmp;
+    return;
 }
-//term A dot ∇
+// term A dot ∇
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_grad(std::complex<double>* Hloc, int nnr, ModuleBase::Vector3<double> grad_overlap){
-	std::complex<double> tmp= {0, grad_overlap*cart_At};
-	Hloc[nnr] -= tmp;
-	return;
+void TDEkinetic<OperatorLCAO<TK, TR>>::td_ekinetic_grad(std::complex<double>* Hloc,
+                                                        int nnr,
+                                                        ModuleBase::Vector3<double> grad_overlap)
+{
+    std::complex<double> tmp = {0, grad_overlap * cart_At};
+    Hloc[nnr] -= tmp;
+    return;
 }
 
 template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::calculate_HR()
 {
     ModuleBase::TITLE("TDEkinetic", "calculate_HR");
-    if(this->hR_tmp==nullptr || this->hR_tmp->size_atom_pairs()<=0)
+    if (this->hR_tmp == nullptr || this->hR_tmp->size_atom_pairs() <= 0)
     {
         ModuleBase::WARNING_QUIT("TDEkinetic::calculate_HR", "hR_tmp is nullptr or empty");
     }
@@ -96,7 +103,7 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::calculate_HR()
             hamilt::BaseMatrix<TR>* tmp1 = this->SR->find_matrix(iat1, iat2, R_index2);
             if (tmp != nullptr)
             {
-                this->cal_HR_IJR(iat1, iat2, paraV, dtau, tmp->get_pointer(),tmp1->get_pointer());
+                this->cal_HR_IJR(iat1, iat2, paraV, dtau, tmp->get_pointer(), tmp1->get_pointer());
             }
             else
             {
@@ -110,11 +117,11 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::calculate_HR()
 
 template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& iat1,
-                                                              const int& iat2,
-                                                              const Parallel_Orbitals* paraV,
-                                                              const ModuleBase::Vector3<double>& dtau,
-                                                              std::complex<double>* data_pointer,
-                                                              TR* s_pointer)
+                                                  const int& iat2,
+                                                  const Parallel_Orbitals* paraV,
+                                                  const ModuleBase::Vector3<double>& dtau,
+                                                  std::complex<double>* data_pointer,
+                                                  TR* s_pointer)
 {
     const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
     // ---------------------------------------------
@@ -166,10 +173,14 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& iat1,
             const int N2 = iw2n2[iw2];
             const int m2 = iw2m2[iw2];
             // center2_orb11_s are used to calculate <psi|∇|psi> no matter whether to use new two-center or not for now.
-            ModuleBase::Vector3<double> grad_overlap = center2_orb11_s.at(T1).at(T2).at(L1).at(N1).at(L2).at(N2).cal_grad_overlap(tau1 * ucell->lat0, tau2 * ucell->lat0, m1, m2);
+            ModuleBase::Vector3<double> grad_overlap
+                = center2_orb11_s.at(T1).at(T2).at(L1).at(N1).at(L2).at(N2).cal_grad_overlap(tau1 * ucell->lat0,
+                                                                                             tau2 * ucell->lat0,
+                                                                                             m1,
+                                                                                             m2);
             for (int ipol = 0; ipol < npol; ipol++)
             {
-                //key change
+                // key change
                 td_ekinetic_scalar(data_pointer, s_pointer, ipol * step_trace);
                 td_ekinetic_grad(data_pointer, ipol * step_trace, grad_overlap);
             }
@@ -180,59 +191,58 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& iat1,
         s_pointer += (npol - 1) * col_indexes.size();
     }
 }
-//init two center integrals and vector potential for td_ekintic term
+// init two center integrals and vector potential for td_ekintic term
 template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::init_td(void)
 {
     TD_Velocity::td_vel_op = &td_velocity;
-    //calculate At in cartesian coorinates.
+    // calculate At in cartesian coorinates.
     td_velocity.cal_cart_At(this->ucell->a1, this->ucell->a2, this->ucell->a3, elecstate::H_TDDFT_pw::At);
     this->cart_At = td_velocity.cart_At;
-    std::cout<<"cart_At: "<<cart_At[0]<< " "<<cart_At[1]<< " "<<cart_At[2]<<std::endl;
+    std::cout << "cart_At: " << cart_At[0] << " " << cart_At[1] << " " << cart_At[2] << std::endl;
 
-    //init MOT,MGT
+    // init MOT,MGT
     const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-    this->MOT.allocate(
-		orb.get_ntype(),	                                // number of atom types
-		orb.get_lmax(),	                                    // max L used to calculate overlap
-		static_cast<int>(orb.get_kmesh()) | 1,				// kpoints, for integration in k space
-		orb.get_Rmax(),				                        // max value of radial table
-		orb.get_dR(),								        // delta R, for making radial table
-		orb.get_dk());                                      // Peize Lin change 2017-04-16
-	int Lmax_used, Lmax;
-	this->MOT.init_Table_Spherical_Bessel (2, 1, Lmax_used, Lmax, 1, orb, this->ucell->infoNL.Beta);
+    this->MOT.allocate(orb.get_ntype(),                       // number of atom types
+                       orb.get_lmax(),                        // max L used to calculate overlap
+                       static_cast<int>(orb.get_kmesh()) | 1, // kpoints, for integration in k space
+                       orb.get_Rmax(),                        // max value of radial table
+                       orb.get_dR(),                          // delta R, for making radial table
+                       orb.get_dk());                         // Peize Lin change 2017-04-16
+    int Lmax_used, Lmax;
+    this->MOT.init_Table_Spherical_Bessel(2, 1, Lmax_used, Lmax, 1, orb, this->ucell->infoNL.Beta);
 
-	//=========================================
-	// (2) init Ylm Coef
-	//=========================================
-	ModuleBase::Ylm::set_coefficients ();
+    //=========================================
+    // (2) init Ylm Coef
+    //=========================================
+    ModuleBase::Ylm::set_coefficients();
 
-	//=========================================
-	// (3) make Gaunt coefficients table
-	//=========================================
-	this->MGT.init_Gaunt_CH( Lmax );
-	this->MGT.init_Gaunt( Lmax );
+    //=========================================
+    // (3) make Gaunt coefficients table
+    //=========================================
+    this->MGT.init_Gaunt_CH(Lmax);
+    this->MGT.init_Gaunt(Lmax);
 
-    //init_radial table
-    for( size_t TA=0; TA!=orb.get_ntype(); ++TA )
-		for( size_t TB=0; TB!=orb.get_ntype(); ++TB )
-			for( int LA=0; LA<=orb.Phi[TA].getLmax(); ++LA )
-				for( size_t NA=0; NA!=orb.Phi[TA].getNchi(LA); ++NA )
-					for( int LB=0; LB<=orb.Phi[TB].getLmax(); ++LB )
-						for( size_t NB=0; NB!=orb.Phi[TB].getNchi(LB); ++NB )
-							center2_orb11_s[TA][TB][LA][NA][LB].insert(
-								std::make_pair(NB, 
-                                Center2_Orb::Orb11(
-									orb.Phi[TA].PhiLN(LA,NA),
-									orb.Phi[TB].PhiLN(LB,NB),
-									this->MOT, this->MGT)));
-	for( auto &coA : center2_orb11_s )
-		for( auto &coB : coA.second )
-			for( auto &coC : coB.second )
-				for( auto &coD : coC.second )
-					for( auto &coE : coD.second )
-						for( auto &coF : coE.second )
-							coF.second.init_radial_table();
+    // init_radial table
+    for (size_t TA = 0; TA != orb.get_ntype(); ++TA)
+        for (size_t TB = 0; TB != orb.get_ntype(); ++TB)
+            for (int LA = 0; LA <= orb.Phi[TA].getLmax(); ++LA)
+                for (size_t NA = 0; NA != orb.Phi[TA].getNchi(LA); ++NA)
+                    for (int LB = 0; LB <= orb.Phi[TB].getLmax(); ++LB)
+                        for (size_t NB = 0; NB != orb.Phi[TB].getNchi(LB); ++NB)
+                            center2_orb11_s[TA][TB][LA][NA][LB].insert(
+                                std::make_pair(NB,
+                                               Center2_Orb::Orb11(orb.Phi[TA].PhiLN(LA, NA),
+                                                                  orb.Phi[TB].PhiLN(LB, NB),
+                                                                  this->MOT,
+                                                                  this->MGT)));
+    for (auto& coA: center2_orb11_s)
+        for (auto& coB: coA.second)
+            for (auto& coC: coB.second)
+                for (auto& coD: coC.second)
+                    for (auto& coE: coD.second)
+                        for (auto& coF: coE.second)
+                            coF.second.init_radial_table();
 }
 
 template <typename TK, typename TR>
@@ -242,8 +252,7 @@ void hamilt::TDEkinetic<hamilt::OperatorLCAO<TK, TR>>::set_HR_fixed(void* hR_tmp
     this->allocated = false;
 }
 template <typename TK, typename TR>
-void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
-                                                        const Parallel_Orbitals* paraV)
+void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD, const Parallel_Orbitals* paraV)
 {
     if (elecstate::H_TDDFT_pw::stype != 1)
     {
@@ -251,7 +260,7 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
     }
     ModuleBase::TITLE("TDEkinetic", "initialize_HR");
     ModuleBase::timer::tick("TDEkinetic", "initialize_HR");
-    
+
     this->adjs_all.clear();
     this->adjs_all.reserve(this->ucell->nat);
     for (int iat1 = 0; iat1 < ucell->nat; iat1++)
@@ -274,8 +283,8 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR(Grid_Driver* GridD,
             const ModuleBase::Vector3<int>& R_index2 = adjs.box[ad1];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat1, iat2, R_index2).norm() * this->ucell->lat0
                 < orb.Phi[T1].getRcut() + orb.Phi[T2].getRcut())
@@ -301,29 +310,28 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::initialize_HR_tmp(const Parallel_Orbitals
     for (int i = 0; i < this->hR->size_atom_pairs(); ++i)
     {
         hamilt::AtomPair<TR>& tmp = this->hR->get_atom_pair(i);
-        for(int ir = 0;ir < tmp.get_R_size(); ++ir )
+        for (int ir = 0; ir < tmp.get_R_size(); ++ir)
         {
             const ModuleBase::Vector3<int> R_index = tmp.get_R_index(ir);
             const int iat1 = tmp.get_atom_i();
-            const int iat2 = tmp.get_atom_j(); 
+            const int iat2 = tmp.get_atom_j();
 
             hamilt::AtomPair<std::complex<double>> tmp1(iat1, iat2, R_index, paraV);
             this->hR_tmp->insert_pair(tmp1);
         }
     }
-    this->hR_tmp->allocate(nullptr,true);
+    this->hR_tmp->allocate(nullptr, true);
 
     ModuleBase::timer::tick("TDEkinetic", "initialize_HR_tmp");
 }
 
-
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
 {
-    //const Parallel_Orbitals* paraV = this->hR->get_atom_pair(0).get_paraV();
+    // const Parallel_Orbitals* paraV = this->hR->get_atom_pair(0).get_paraV();
     ModuleBase::TITLE("TDEkinetic", "contributeHR");
     ModuleBase::timer::tick("TDEkinetic", "contributeHR");
-    //skip if not TDDFT velocity gauge
+    // skip if not TDDFT velocity gauge
     if (elecstate::H_TDDFT_pw::stype != 1)
     {
         return;
@@ -334,12 +342,12 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
         // if this Operator is the first node of the sub_chain, then hR_tmp is nullptr
         if (this->hR_tmp == nullptr)
         {
-        this->hR_tmp = new hamilt::HContainer<std::complex<double>>(this->LM->ParaV);
-        //allocate memory for hR_tmp use the same memory as hR
-        this->initialize_HR_tmp(this->LM->ParaV);
-        this->allocated = true;
+            this->hR_tmp = new hamilt::HContainer<std::complex<double>>(this->LM->ParaV);
+            // allocate memory for hR_tmp use the same memory as hR
+            this->initialize_HR_tmp(this->LM->ParaV);
+            this->allocated = true;
         }
-        if(this->next_sub_op != nullptr)
+        if (this->next_sub_op != nullptr)
         {
             // pass pointer of hR_tmp to the next node
             static_cast<OperatorLCAO<TK, TR>*>(this->next_sub_op)->set_HR_fixed(this->hR_tmp);
@@ -353,40 +361,41 @@ void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHR()
     return;
 }
 
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void TDEkinetic<OperatorLCAO<TK, TR>>::contributeHk(int ik)
 {
     return;
 }
-template<>
+template <>
 void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik)
 {
     if (TD_Velocity::tddft_velocity == false)
     {
         return;
     }
-    else{        
+    else
+    {
         ModuleBase::TITLE("TDEkinetic", "contributeHk");
         ModuleBase::timer::tick("TDEkinetic", "contributeHk");
         const Parallel_Orbitals* paraV = this->hR_tmp->get_atom_pair(0).get_paraV();
-        //save HR data for output
+        // save HR data for output
         int spin_tot = paraV->nspin;
-        if(spin_tot==4);
-        else if(!output_hR_done && out_mat_R)
+        if (spin_tot == 4)
+            ;
+        else if (!output_hR_done && out_mat_R)
         {
-            for(int spin_now = 0;spin_now < spin_tot;spin_now++)
+            for (int spin_now = 0; spin_now < spin_tot; spin_now++)
             {
-                sparse_format::cal_HContainer_cd(
-                    *(paraV),
-                    spin_now, 
-                    1e-10, 
-                    *hR_tmp, 
-                    td_velocity.HR_sparse_td_vel[spin_now]);
+                sparse_format::cal_HContainer_cd(*(paraV),
+                                                 spin_now,
+                                                 1e-10,
+                                                 *hR_tmp,
+                                                 td_velocity.HR_sparse_td_vel[spin_now]);
             }
             output_hR_done = true;
         }
-        //folding inside HR to HK
-        if(ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
+        // folding inside HR to HK
+        if (ModuleBase::GlobalFunc::IS_COLUMN_MAJOR_KS_SOLVER())
         {
             const int nrow = paraV->get_row_size();
             hamilt::folding_HR(*this->hR_tmp, this->hK->data(), this->kvec_d[ik], nrow, 1);
@@ -396,7 +405,7 @@ void TDEkinetic<OperatorLCAO<std::complex<double>, double>>::contributeHk(int ik
             const int ncol = paraV->get_col_size();
             hamilt::folding_HR(*this->hR_tmp, this->hK->data(), this->kvec_d[ik], ncol, 0);
         }
-        
+
         ModuleBase::timer::tick("TDEkinetic", "contributeHk");
     }
 }
@@ -405,4 +414,4 @@ template class TDEkinetic<hamilt::OperatorLCAO<double, double>>;
 template class TDEkinetic<hamilt::OperatorLCAO<std::complex<double>, double>>;
 template class TDEkinetic<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>;
 
-}
+} // namespace hamilt
diff --git a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
index fd8d4ce07e..5dc85e4f11 100644
--- a/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
+++ b/source/module_hamilt_lcao/module_deepks/LCAO_deepks_psialpha.cpp
@@ -1,66 +1,66 @@
-//wenfei 2022-1-11
-//This file contains 2 subroutines:
-//1. build_psialpha, which calculates the overlap
-//between atomic basis and projector alpha : <psi_mu|alpha>
-//which will be used in calculating pdm, gdmx, H_V_delta, F_delta;
-//2. check_psialpha, which prints the results into .dat files
-//for checking
+// wenfei 2022-1-11
+// This file contains 2 subroutines:
+// 1. build_psialpha, which calculates the overlap
+// between atomic basis and projector alpha : <psi_mu|alpha>
+// which will be used in calculating pdm, gdmx, H_V_delta, F_delta;
+// 2. check_psialpha, which prints the results into .dat files
+// for checking
 
 #ifdef __DEEPKS
 
 #include "LCAO_deepks.h"
-#include "module_base/vector3.h"
 #include "module_base/timer.h"
+#include "module_base/vector3.h"
 
 void LCAO_Deepks::build_psialpha(const bool& calc_deri,
-    const UnitCell &ucell,
-    const LCAO_Orbitals &orb,
-    Grid_Driver& GridD,
-    const ORB_gen_tables &UOT)
+                                 const UnitCell& ucell,
+                                 const LCAO_Orbitals& orb,
+                                 Grid_Driver& GridD,
+                                 const ORB_gen_tables& UOT)
 {
     ModuleBase::TITLE("LCAO_Deepks", "build_psialpha");
-    ModuleBase::timer::tick ("LCAO_Deepks","build_psialpha");
+    ModuleBase::timer::tick("LCAO_Deepks", "build_psialpha");
 
-    //cutoff for alpha is same for all types of atoms
+    // cutoff for alpha is same for all types of atoms
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    
+
     int job;
-    if(!calc_deri)
+    if (!calc_deri)
     {
-        job=0;
+        job = 0;
     }
     else
     {
-        job=1;
+        job = 1;
     }
 
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            //iat: atom index on which |alpha> is located
-            const int iat = ucell.itia2iat(T0,I0);
-			const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+            // iat: atom index on which |alpha> is located
+            const int iat = ucell.itia2iat(T0, I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
 
-            if(GlobalV::GAMMA_ONLY_LOCAL)
+            if (GlobalV::GAMMA_ONLY_LOCAL)
             {
-                this->nlm_save[iat].resize(GridD.getAdjacentNum()+1);
+                this->nlm_save[iat].resize(GridD.getAdjacentNum() + 1);
             }
-            //outermost loop : find all adjacent atoms
-            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ++ad)
+            // outermost loop : find all adjacent atoms
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
-                const int ibt=ucell.itia2iat(T1, I1);
-				const double Rcut_AO1 = orb.Phi[T1].getRcut();
+                const int ibt = ucell.itia2iat(T1, I1);
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
 
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
-				const Atom* atom1 = &ucell.atoms[T1];
+                const Atom* atom1 = &ucell.atoms[T1];
 
-                std::unordered_map<int,std::vector<std::vector<double>>> nlm_cur;
-                if(GlobalV::GAMMA_ONLY_LOCAL)
+                std::unordered_map<int, std::vector<std::vector<double>>> nlm_cur;
+                if (GlobalV::GAMMA_ONLY_LOCAL)
                 {
                     this->nlm_save[iat][ad].clear();
                 }
@@ -69,12 +69,12 @@ void LCAO_Deepks::build_psialpha(const bool& calc_deri,
                     nlm_cur.clear();
                 }
 
-				const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
 
-				if (dist1 > Rcut_Alpha + Rcut_AO1)
-				{
-					continue;
-				}
+                if (dist1 > Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
 
                 auto all_indexes = pv->get_indexes_row(ibt);
                 auto col_indexes = pv->get_indexes_col(ibt);
@@ -83,72 +83,72 @@ void LCAO_Deepks::build_psialpha(const bool& calc_deri,
                 std::sort(all_indexes.begin(), all_indexes.end());
                 all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
 
-                //middle loop : all atomic basis on the adjacent atom ad
-				for (int iw1l=0; iw1l<all_indexes.size(); iw1l+=GlobalV::NPOL)
-				{
+                // middle loop : all atomic basis on the adjacent atom ad
+                for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += GlobalV::NPOL)
+                {
                     const int iw1 = all_indexes[iw1l] / GlobalV::NPOL;
-					std::vector<std::vector<double>> nlm;
-					//2D, dim 0 contains the overlap <psi|alpha>
-                    //dim 1-3 contains the gradient of overlap
+                    std::vector<std::vector<double>> nlm;
+                    // 2D, dim 0 contains the overlap <psi|alpha>
+                    // dim 1-3 contains the gradient of overlap
 
-                    int L1 = atom1->iw2l[ iw1 ];
-                    int N1 = atom1->iw2n[ iw1 ];
-                    int m1 = atom1->iw2m[ iw1 ];
+                    int L1 = atom1->iw2l[iw1];
+                    int N1 = atom1->iw2n[iw1];
+                    int m1 = atom1->iw2m[iw1];
 
                     // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                    int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                    int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                     ModuleBase::Vector3<double> dtau = ucell.atoms[T0].tau[I0] - tau1;
-                    UOT.two_center_bundle->overlap_orb_alpha->snap(
-                            T1, L1, N1, M1, 0, dtau * ucell.lat0, calc_deri, nlm);
+                    UOT.two_center_bundle->overlap_orb_alpha
+                        ->snap(T1, L1, N1, M1, 0, dtau * ucell.lat0, calc_deri, nlm);
 
-                    if(GlobalV::GAMMA_ONLY_LOCAL)
+                    if (GlobalV::GAMMA_ONLY_LOCAL)
                     {
-                        this->nlm_save[iat][ad].insert({all_indexes[iw1l],nlm});
+                        this->nlm_save[iat][ad].insert({all_indexes[iw1l], nlm});
                     }
                     else
                     {
-                        nlm_cur.insert({all_indexes[iw1l],nlm});
-                        if(GlobalV::NPOL==2) nlm_cur.insert({all_indexes[iw1l+1],nlm});
+                        nlm_cur.insert({all_indexes[iw1l], nlm});
+                        if (GlobalV::NPOL == 2)
+                            nlm_cur.insert({all_indexes[iw1l + 1], nlm});
                     }
-				}//end iw
+                } // end iw
 
-                if(!GlobalV::GAMMA_ONLY_LOCAL)
+                if (!GlobalV::GAMMA_ONLY_LOCAL)
                 {
-                    const int rx=GridD.getBox(ad).x;
-                    const int ry=GridD.getBox(ad).y;
-                    const int rz=GridD.getBox(ad).z;
-                    key_tuple key_1(ibt,rx,ry,rz);
-                    this->nlm_save_k[iat][key_1]=nlm_cur;
+                    const int rx = GridD.getBox(ad).x;
+                    const int ry = GridD.getBox(ad).y;
+                    const int rz = GridD.getBox(ad).z;
+                    key_tuple key_1(ibt, rx, ry, rz);
+                    this->nlm_save_k[iat][key_1] = nlm_cur;
                 }
-			}//end ad
-		}//end I0
-	}//end T0
-
-    ModuleBase::timer::tick ("LCAO_Deepks","build_psialpha");
-	return;
+            } // end ad
+        }     // end I0
+    }         // end T0
 
+    ModuleBase::timer::tick("LCAO_Deepks", "build_psialpha");
+    return;
 }
 
 void LCAO_Deepks::check_psialpha(const bool& calc_deri,
-    const UnitCell &ucell,
-    const LCAO_Orbitals &orb,
-    Grid_Driver& GridD,
-    const ORB_gen_tables &UOT)
+                                 const UnitCell& ucell,
+                                 const LCAO_Orbitals& orb,
+                                 Grid_Driver& GridD,
+                                 const ORB_gen_tables& UOT)
 {
     ModuleBase::TITLE("LCAO_Deepks", "check_psialpha");
-    ModuleBase::timer::tick ("LCAO_Deepks","check_psialpha");
+    ModuleBase::timer::tick("LCAO_Deepks", "check_psialpha");
 
     const double Rcut_Alpha = orb.Alpha[0].getRcut();
-    //same for all types of atoms
+    // same for all types of atoms
     int job;
-    if(!calc_deri)
+    if (!calc_deri)
     {
-        job=0;
+        job = 0;
     }
     else
     {
-        job=1;
+        job = 1;
     }
 
     std::ofstream ofs("psialpha.dat");
@@ -156,51 +156,51 @@ void LCAO_Deepks::check_psialpha(const bool& calc_deri,
     std::ofstream ofs_y("dpsialpha_y.dat");
     std::ofstream ofs_z("dpsialpha_z.dat");
 
-    ofs<<std::setprecision(10);
-    ofs_x<<std::setprecision(10);
-    ofs_y<<std::setprecision(10);
-    ofs_z<<std::setprecision(10);
+    ofs << std::setprecision(10);
+    ofs_x << std::setprecision(10);
+    ofs_y << std::setprecision(10);
+    ofs_z << std::setprecision(10);
 
     for (int T0 = 0; T0 < ucell.ntype; T0++)
     {
-		Atom* atom0 = &ucell.atoms[T0]; 
-        for (int I0 =0; I0< atom0->na; I0++)
+        Atom* atom0 = &ucell.atoms[T0];
+        for (int I0 = 0; I0 < atom0->na; I0++)
         {
-            const int iat = ucell.itia2iat(T0,I0);
-			//=======================================================
-            //Step 1 : 
-			//saves <alpha|psi>, where alpha runs over all projectors
-			//and psi runs over atomic basis sets on the current core
-			//=======================================================
+            const int iat = ucell.itia2iat(T0, I0);
+            //=======================================================
+            // Step 1 :
+            // saves <alpha|psi>, where alpha runs over all projectors
+            // and psi runs over atomic basis sets on the current core
+            //=======================================================
 
-			const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
-            GridD.Find_atom(ucell, atom0->tau[I0] ,T0, I0);
+            const ModuleBase::Vector3<double> tau0 = atom0->tau[I0];
+            GridD.Find_atom(ucell, atom0->tau[I0], T0, I0);
 
             ofs << "iat : " << iat << std::endl;
             ofs_x << "iat : " << iat << std::endl;
             ofs_y << "iat : " << iat << std::endl;
             ofs_z << "iat : " << iat << std::endl;
 
-            for (int ad=0; ad<GridD.getAdjacentNum()+1 ; ++ad)
+            for (int ad = 0; ad < GridD.getAdjacentNum() + 1; ++ad)
             {
                 const int T1 = GridD.getType(ad);
                 const int I1 = GridD.getNatom(ad);
                 const int start1 = ucell.itiaiw2iwt(T1, I1, 0);
-				const double Rcut_AO1 = orb.Phi[T1].getRcut();
+                const double Rcut_AO1 = orb.Phi[T1].getRcut();
 
                 const ModuleBase::Vector3<double> tau1 = GridD.getAdjacentTau(ad);
-				const Atom* atom1 = &ucell.atoms[T1];
-				const int nw1_tot = atom1->nw*GlobalV::NPOL;
+                const Atom* atom1 = &ucell.atoms[T1];
+                const int nw1_tot = atom1->nw * GlobalV::NPOL;
 
-				const double dist1 = (tau1-tau0).norm() * ucell.lat0;
+                const double dist1 = (tau1 - tau0).norm() * ucell.lat0;
 
-				if (dist1 > Rcut_Alpha + Rcut_AO1)
-				{
-					continue;
-				}
+                if (dist1 > Rcut_Alpha + Rcut_AO1)
+                {
+                    continue;
+                }
 
                 int ibt, rx, ry, rz;
-                if(GlobalV::GAMMA_ONLY_LOCAL)
+                if (GlobalV::GAMMA_ONLY_LOCAL)
                 {
                     ofs << "ad : " << ad << " " << dist1 << std::endl;
                     ofs_x << "ad : " << ad << " " << dist1 << std::endl;
@@ -209,68 +209,72 @@ void LCAO_Deepks::check_psialpha(const bool& calc_deri,
                 }
                 else
                 {
-                    ibt=ucell.itia2iat(T1, I1);
-                    rx=GridD.getBox(ad).x;
-                    ry=GridD.getBox(ad).y;
-                    rz=GridD.getBox(ad).z;
+                    ibt = ucell.itia2iat(T1, I1);
+                    rx = GridD.getBox(ad).x;
+                    ry = GridD.getBox(ad).y;
+                    rz = GridD.getBox(ad).z;
                     ofs << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
-                    ofs_x << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl; 
-                    ofs_y << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl; 
-                    ofs_z << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl; 
+                    ofs_x << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
+                    ofs_y << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
+                    ofs_z << "key : " << ibt << " " << rx << " " << ry << " " << rz << std::endl;
                 }
 
-				for (int iw1=0; iw1<nw1_tot; ++iw1)
-				{
-					const int iw1_all = start1 + iw1;
+                for (int iw1 = 0; iw1 < nw1_tot; ++iw1)
+                {
+                    const int iw1_all = start1 + iw1;
                     ofs << "iw : " << iw1_all << std::endl;
                     ofs_x << "iw : " << iw1_all << std::endl;
                     ofs_y << "iw : " << iw1_all << std::endl;
                     ofs_z << "iw : " << iw1_all << std::endl;
                     const int iw1_local = pv->global2local_row(iw1_all);
                     const int iw2_local = pv->global2local_col(iw1_all);
-					if(iw1_local < 0 && iw2_local < 0)continue;
-					const int iw1_0 = iw1/GlobalV::NPOL;
+                    if (iw1_local < 0 && iw2_local < 0)
+                        continue;
+                    const int iw1_0 = iw1 / GlobalV::NPOL;
 
                     std::vector<std::vector<double>> nlm;
 
-                    if(GlobalV::GAMMA_ONLY_LOCAL)
+                    if (GlobalV::GAMMA_ONLY_LOCAL)
                     {
                         nlm = this->nlm_save[iat][ad][iw1];
                     }
                     else
                     {
-                        key_tuple key_1(ibt,rx,ry,rz);
+                        key_tuple key_1(ibt, rx, ry, rz);
                         nlm = this->nlm_save_k[iat][key_1][iw1];
                     }
-                    
-                    for(int ind=0;ind<nlm[0].size();ind++)
+
+                    for (int ind = 0; ind < nlm[0].size(); ind++)
                     {
                         ofs << nlm[0][ind] << " ";
-                        if(ind%6 == 5) ofs << "\n";
-                        if(calc_deri)
+                        if (ind % 6 == 5)
+                            ofs << "\n";
+                        if (calc_deri)
                         {
                             ofs_x << nlm[1][ind] << " ";
-                            if(ind%6 == 5) ofs_x << "\n";
+                            if (ind % 6 == 5)
+                                ofs_x << "\n";
                             ofs_y << nlm[2][ind] << " ";
-                            if(ind%6 == 5) ofs_y << "\n";
+                            if (ind % 6 == 5)
+                                ofs_y << "\n";
                             ofs_z << nlm[3][ind] << " ";
-                            if(ind%6 == 5) ofs_z << "\n";
+                            if (ind % 6 == 5)
+                                ofs_z << "\n";
                         }
                     }
                     ofs << std::endl;
-                    if(calc_deri)
+                    if (calc_deri)
                     {
                         ofs_x << std::endl;
                         ofs_y << std::endl;
                         ofs_z << std::endl;
                     }
-				}//end iw
-			}//end ad
-		}//end I0
-	}//end T0
-
-    ModuleBase::timer::tick ("LCAO_Deepks","check_psialpha");
-	return;
+                } // end iw
+            }     // end ad
+        }         // end I0
+    }             // end T0
 
+    ModuleBase::timer::tick("LCAO_Deepks", "check_psialpha");
+    return;
 }
 #endif

From ac3cb1bee8a3a44912ef129721561cfc60f2a091 Mon Sep 17 00:00:00 2001
From: jinzx10 <jzx016@hotmail.com>
Date: Wed, 19 Jun 2024 22:19:45 +0800
Subject: [PATCH 4/8] replace some uot_ with intor_

---
 .../hamilt_lcaodft/hamilt_lcao.cpp                 | 14 +++++++-------
 .../hamilt_lcaodft/operator_lcao/ekinetic_new.cpp  |  7 +++----
 .../hamilt_lcaodft/operator_lcao/ekinetic_new.h    |  7 +++----
 .../hamilt_lcaodft/operator_lcao/nonlocal_new.cpp  |  6 +++---
 .../hamilt_lcaodft/operator_lcao/nonlocal_new.h    |  6 +++---
 .../hamilt_lcaodft/operator_lcao/overlap_new.cpp   |  6 +++---
 .../hamilt_lcaodft/operator_lcao/overlap_new.h     |  5 ++---
 .../operator_lcao/test/test_T_NL_cd.cpp            |  6 +++---
 .../operator_lcao/test/test_ekineticnew.cpp        |  6 +++---
 .../operator_lcao/test/test_nonlocalnew.cpp        |  6 +++---
 .../operator_lcao/test/test_overlapnew.cpp         |  6 +++---
 .../operator_lcao/test/test_overlapnew_cd.cpp      |  4 ++--
 .../operator_lcao/test/tmp_mocks.cpp               |  2 +-
 13 files changed, 39 insertions(+), 42 deletions(-)

diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
index bb586192fd..fda99cf3e3 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
@@ -55,7 +55,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(LCAO_Matrix* LM_in, const K_Vectors& kv_in, const
         &(this->getSk(LM_in)),
         &GlobalC::ucell,
         &GlobalC::GridD,
-        uot,
+        uot->two_center_bundle->overlap_orb.get(),
         LM_in->ParaV
     );
 }
@@ -133,7 +133,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
             &(this->getSk(LM_in)),
             &GlobalC::ucell,
             &GlobalC::GridD,
-            uot,
+            uot->two_center_bundle->overlap_orb.get(),
             LM_in->ParaV
         );
 
@@ -149,7 +149,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
                 &(this->getHk(LM_in)),
                 &GlobalC::ucell, 
                 &GlobalC::GridD,
-                uot,
+                uot->two_center_bundle->kinetic_orb.get(),
                 LM_in->ParaV
             );
             this->getOperator()->add(ekinetic);
@@ -166,7 +166,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
                 &(this->getHk(LM_in)),
                 &GlobalC::ucell, 
                 &GlobalC::GridD,
-                uot,
+                uot->two_center_bundle->overlap_orb_beta.get(),
                 LM_in->ParaV
             );
             this->getOperator()->add(nonlocal);
@@ -291,7 +291,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
             &(this->getSk(LM_in)),
             &GlobalC::ucell,
             &GlobalC::GridD,
-            uot,
+            uot->two_center_bundle->overlap_orb.get(),
             LM_in->ParaV
         );
         if(this->getOperator() == nullptr)
@@ -314,7 +314,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
                 &(this->getHk(LM_in)),
                 &GlobalC::ucell,
                 &GlobalC::GridD,
-                uot,
+                uot->two_center_bundle->kinetic_orb.get(),
                 LM_in->ParaV
             );
             this->getOperator()->add(ekinetic);
@@ -331,7 +331,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
                 &(this->getHk(LM_in)),
                 &GlobalC::ucell,
                 &GlobalC::GridD,
-                uot,
+                uot->two_center_bundle->overlap_orb_beta.get(),
                 LM_in->ParaV
             );
             //TDDFT velocity gague will calculate full non-local potential including the original one and the correction on its own.
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
index 456f81c8ca..aa39d1355d 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.cpp
@@ -16,9 +16,9 @@ hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::EkineticNew(
     std::vector<TK>* hK_in,
     const UnitCell* ucell_in,
     Grid_Driver* GridD_in,
-    const ORB_gen_tables* uot,
+    const TwoCenterIntegrator* intor,
     const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), uot_(uot)
+    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_fixed;
     this->ucell = ucell_in;
@@ -194,8 +194,7 @@ void hamilt::EkineticNew<hamilt::OperatorLCAO<TK, TR>>::cal_HR_IJR(const int& ia
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
             int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
 
-            uot_->two_center_bundle->kinetic_orb->calculate(T1, L1, N1, M1,
-                    T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
+            intor_->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 data_pointer[ipol * step_trace] += olm[0];
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
index 4e288a1943..8bcfa53ef4 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
@@ -5,7 +5,7 @@
 #include "module_cell/unitcell.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
-#include "module_basis/module_ao/ORB_gen_tables.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 
 namespace hamilt
 {
@@ -45,7 +45,7 @@ class EkineticNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
                                       std::vector<TK>* hK_in,
                                       const UnitCell* ucell_in,
                                       Grid_Driver* GridD_in,
-                                      const ORB_gen_tables* uot,
+                                      const TwoCenterIntegrator* intor,
                                       const Parallel_Orbitals* paraV);
 
     /**
@@ -66,8 +66,7 @@ class EkineticNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     hamilt::HContainer<TR>* HR_fixed = nullptr;
 
-    // the following variable is introduced temporarily during LCAO refactoring
-    const ORB_gen_tables* uot_ = nullptr;
+    const TwoCenterIntegrator* intor_ = nullptr;
 
     bool allocated = false;
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
index 9a567bf6df..7aa0195973 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
@@ -18,9 +18,9 @@ hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::NonlocalNew(
     std::vector<TK>* hK_in,
     const UnitCell* ucell_in,
     Grid_Driver* GridD_in,
-    const ORB_gen_tables* uot,
+    const TwoCenterIntegrator* intor,
     const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), uot_(uot)
+    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_fixed;
     this->ucell = ucell_in;
@@ -177,7 +177,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
                 int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
 
                 ModuleBase::Vector3<double> dtau = tau0 - tau1;
-                uot_->two_center_bundle->overlap_orb_beta->snap(
+                intor_->snap(
                         T1, L1, N1, M1, T0, dtau * ucell->lat0, 0 /*cal_deri*/, nlm);
                 nlm_tot[ad].insert({all_indexes[iw1l], nlm[0]});
             }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
index 4f1781ee7a..c860de3351 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
@@ -7,7 +7,7 @@
 #include "module_cell/unitcell.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
-#include "module_basis/module_ao/ORB_gen_tables.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 
 namespace hamilt
 {
@@ -44,7 +44,7 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
                                       std::vector<TK>* hK_in,
                                       const UnitCell* ucell_in,
                                       Grid_Driver* GridD_in,
-                                      const ORB_gen_tables* uot,
+                                      const TwoCenterIntegrator* intor,
                                       const Parallel_Orbitals* paraV);
     ~NonlocalNew<OperatorLCAO<TK, TR>>();
 
@@ -64,7 +64,7 @@ class NonlocalNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     hamilt::HContainer<TR>* HR_fixed = nullptr;
 
     // the following variable is introduced temporarily during LCAO refactoring
-    const ORB_gen_tables* uot_ = nullptr;
+    const TwoCenterIntegrator* intor_ = nullptr;
 
     bool allocated = false;
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
index dcdfc2de2a..16e91a5f34 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
@@ -16,9 +16,9 @@ hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::OverlapNew(LCAO_Matrix* LM_in,
                                                              std::vector<TK>* SK_pointer_in,
                                                              const UnitCell* ucell_in,
                                                              Grid_Driver* GridD_in,
-                                                             const ORB_gen_tables* uot,
+                                                             const TwoCenterIntegrator* intor,
                                                              const Parallel_Orbitals* paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), uot_(uot)
+    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_overlap;
     this->ucell = ucell_in;
@@ -163,7 +163,7 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
 
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
             int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
-            uot_->two_center_bundle->overlap_orb->calculate(T1, L1, N1, M1,
+            intor_->calculate(T1, L1, N1, M1,
                     T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
             for (int ipol = 0; ipol < npol; ipol++)
             {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
index 602e8c07ed..4de2da23d3 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
@@ -44,7 +44,7 @@ class OverlapNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
                                      std::vector<TK>* SK_pointer_in,
                                      const UnitCell* ucell_in,
                                      Grid_Driver* GridD_in,
-                                     const ORB_gen_tables* uot,
+                                     const TwoCenterIntegrator* intor,
                                      const Parallel_Orbitals* paraV);
 
     virtual void contributeHR() override;
@@ -60,8 +60,7 @@ class OverlapNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     std::vector<TK>* SK_pointer = nullptr;
 
-    // the following variable is introduced temporarily during LCAO refactoring
-    const ORB_gen_tables* uot_ = nullptr;
+    const TwoCenterIntegrator* intor_ = nullptr;
 
     bool SR_fixed_done = false;
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
index c5e3279fc4..893f585540 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
@@ -124,7 +124,7 @@ class TNLTest : public ::testing::Test
     UnitCell ucell;
     hamilt::HContainer<std::complex<double>>* HR;
     Parallel_Orbitals *paraV;
-    ORB_gen_tables uot_;
+    TwoCenterIntegrator intor_;
 
     int dsize;
     int my_rank = 0;
@@ -145,7 +145,7 @@ TEST_F(TNLTest, testTVNLcd2cd)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     hamilt::Operator<std::complex<double>> *op1 = new hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(
@@ -155,7 +155,7 @@ TEST_F(TNLTest, testTVNLcd2cd)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     // merge two Operators to a chain
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
index 79e2953ab7..32d7230672 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
@@ -93,7 +93,7 @@ class EkineticNewTest : public ::testing::Test
     UnitCell ucell;
     hamilt::HContainer<double>* HR;
     Parallel_Orbitals *paraV;
-    ORB_gen_tables uot_;
+    TwoCenterIntegrator intor_;
 
     int dsize;
     int my_rank = 0;
@@ -112,7 +112,7 @@ TEST_F(EkineticNewTest, constructHRd2d)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     op.contributeHR();
@@ -168,7 +168,7 @@ TEST_F(EkineticNewTest, constructHRd2cd)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     op.contributeHR();
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
index 514ba38f7b..8cbc2f3e55 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
@@ -122,7 +122,7 @@ class NonlocalNewTest : public ::testing::Test
     UnitCell ucell;
     hamilt::HContainer<double>* HR;
     Parallel_Orbitals *paraV;
-    ORB_gen_tables uot_;
+    TwoCenterIntegrator intor_;
 
     int dsize;
     int my_rank = 0;
@@ -145,7 +145,7 @@ TEST_F(NonlocalNewTest, constructHRd2d)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
@@ -214,7 +214,7 @@ TEST_F(NonlocalNewTest, constructHRd2cd)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     op.contributeHR();
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
index 106119d801..332a33be96 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
@@ -93,7 +93,7 @@ class OverlapNewTest : public ::testing::Test
     UnitCell ucell;
     hamilt::HContainer<double>* SR;
     Parallel_Orbitals *paraV;
-    ORB_gen_tables uot_;
+    TwoCenterIntegrator intor_;
 
     int dsize;
     int my_rank = 0;
@@ -114,7 +114,7 @@ TEST_F(OverlapNewTest, constructHRd2d)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     op.contributeHR();
@@ -156,7 +156,7 @@ TEST_F(OverlapNewTest, constructHRd2cd)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     op.contributeHR();
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
index cc5c28b1cd..c4505dbfa6 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
@@ -92,7 +92,7 @@ class OverlapNewTest : public ::testing::Test
     UnitCell ucell;
     hamilt::HContainer<std::complex<double>>* SR;
     Parallel_Orbitals *paraV;
-    ORB_gen_tables uot_;
+    TwoCenterIntegrator intor_;
 
     int dsize;
     int my_rank = 0;
@@ -114,7 +114,7 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
         &hk, 
         &ucell, 
         &gd,
-        &uot_,
+        &intor_,
         paraV
     );
     op.contributeHR();
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
index f64cf9f530..0e3f21e771 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
@@ -141,7 +141,7 @@ template class hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>;
 
 // mock of ORB_gen_tables and LCAO_Orbitals
 #include "module_basis/module_ao/ORB_gen_tables.h"
-ORB_gen_tables::ORB_gen_tables() {}
+ORB_gen_tables::ORB_gen_tables() : two_center_bundle(new TwoCenterBundle) {}
 ORB_gen_tables::~ORB_gen_tables() {}
 ORB_gaunt_table::ORB_gaunt_table() {}
 ORB_gaunt_table::~ORB_gaunt_table() {}

From 1c13bea88791d721850d120e15cb62190d992714 Mon Sep 17 00:00:00 2001
From: jinzx10 <jzx016@hotmail.com>
Date: Wed, 19 Jun 2024 23:17:52 +0800
Subject: [PATCH 5/8] fix unit test

---
 .../hamilt_lcaodft/FORCE_STRESS.cpp           |   2 +-
 .../hamilt_lcaodft/hamilt_lcao.cpp            |   4 +-
 .../operator_lcao/dftu_lcao.cpp               |   7 +-
 .../hamilt_lcaodft/operator_lcao/dftu_lcao.h  |   6 +-
 .../operator_lcao/test/CMakeLists.txt         |  12 +-
 .../operator_lcao/test/test_T_NL_cd.cpp       |   8 +-
 .../operator_lcao/test/test_dftu.cpp          |   6 +-
 .../operator_lcao/test/test_ekineticnew.cpp   |  14 +-
 .../operator_lcao/test/tmp_mocks.cpp          | 158 +++++++++++-------
 9 files changed, 131 insertions(+), 86 deletions(-)

diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 8983e266e0..0ca36ad054 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -258,7 +258,7 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
 					nullptr,
 					GlobalC::ucell,
 					&GlobalC::GridD,
-                    uot,
+                    uot->two_center_bundle->overlap_orb_onsite.get(),
 					&GlobalC::dftu,
 					*(lm.ParaV));
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
index fda99cf3e3..2f8caad081 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
@@ -238,7 +238,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
                     &(this->getHk(LM_in)),
                     GlobalC::ucell,
                     &GlobalC::GridD,
-                    uot,
+                    uot->two_center_bundle->overlap_orb_onsite.get(),
                     &GlobalC::dftu,
                     *(LM_in->ParaV)
                 );
@@ -415,7 +415,7 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
                     &(this->getHk(LM_in)),
                     GlobalC::ucell,
                     &GlobalC::GridD,
-                    uot,
+                    uot->two_center_bundle->overlap_orb_onsite.get(),
                     &GlobalC::dftu,
                     *(LM_in->ParaV)
                 );
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
index 97f4191ae7..2ea680c852 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.cpp
@@ -18,10 +18,10 @@ hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::DFTU(LCAO_Matrix* LM_in,
                                                  std::vector<TK>* hK_in,
                                                  const UnitCell& ucell_in,
                                                  Grid_Driver* GridD_in,
-                                                 const ORB_gen_tables* uot,
+                                                 const TwoCenterIntegrator* intor,
                                                  ModuleDFTU::DFTU* dftu_in,
                                                  const Parallel_Orbitals& paraV)
-    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), uot_(uot)
+    : hamilt::OperatorLCAO<TK, TR>(LM_in, kvec_d_in, hR_in, hK_in), intor_(intor)
 {
     this->cal_type = calculation_type::lcao_dftu;
     this->ucell = &ucell_in;
@@ -144,8 +144,7 @@ void hamilt::DFTU<hamilt::OperatorLCAO<TK, TR>>::cal_nlm_all(const Parallel_Orbi
                 const int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                 ModuleBase::Vector3<double> dtau = tau0 - tau1;
-                uot_->two_center_bundle->overlap_orb_onsite
-                    ->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
+                intor_->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
                 // select the elements of nlm with target_L
                 for (int iw = 0; iw < this->ucell->atoms[T0].nw; iw++)
                 {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
index bd89dbc322..21be7c0a50 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
@@ -10,7 +10,7 @@
 #include "module_elecstate/module_dm/density_matrix.h"
 #include "module_hamilt_lcao/module_dftu/dftu.h"
 #include "dftu.hpp"
-#include "module_basis/module_ao/ORB_gen_tables.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 
 namespace hamilt
 {
@@ -32,7 +32,7 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
                                       std::vector<TK>* hK_in,
                                       const UnitCell& ucell_in,
                                       Grid_Driver* gridD_in,
-                                      const ORB_gen_tables* uot,
+                                      const TwoCenterIntegrator* intor,
                                       ModuleDFTU::DFTU* dftu_in,
                                       const Parallel_Orbitals& paraV);
     ~DFTU<OperatorLCAO<TK, TR>>();
@@ -59,7 +59,7 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     TK* HK_pointer = nullptr;
 
-    const ORB_gen_tables* uot_ = nullptr;
+    const TwoCenterIntegrator* intor_ = nullptr;
 
     /// @brief the number of spin components, 1 for no-spin, 2 for collinear spin case and 4 for non-collinear spin case
     int nspin = 0;
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/CMakeLists.txt b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/CMakeLists.txt
index 798e0caff0..080e338fec 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/CMakeLists.txt
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/CMakeLists.txt
@@ -3,7 +3,7 @@ remove_definitions(-DUSE_NEW_TWO_CENTER)
 
 AddTest(
   TARGET operator_overlap_test
-  LIBS ${math_libs} psi base device numerical_atomic_orbitals container
+  LIBS ${math_libs} psi base device container
   SOURCES test_overlapnew.cpp ../overlap_new.cpp ../../../module_hcontainer/func_folding.cpp 
   ../../../module_hcontainer/base_matrix.cpp ../../../module_hcontainer/hcontainer.cpp ../../../module_hcontainer/atom_pair.cpp  
   ../../../../module_basis/module_ao/parallel_2d.cpp ../../../../module_basis/module_ao/parallel_orbitals.cpp 
@@ -13,7 +13,7 @@ AddTest(
 
 AddTest(
   TARGET operator_overlap_cd_test
-  LIBS ${math_libs} psi base device numerical_atomic_orbitals container
+  LIBS ${math_libs} psi base device container
   SOURCES test_overlapnew_cd.cpp ../overlap_new.cpp ../../../module_hcontainer/func_folding.cpp 
   ../../../module_hcontainer/base_matrix.cpp ../../../module_hcontainer/hcontainer.cpp ../../../module_hcontainer/atom_pair.cpp  
   ../../../../module_basis/module_ao/parallel_2d.cpp ../../../../module_basis/module_ao/parallel_orbitals.cpp 
@@ -23,7 +23,7 @@ AddTest(
 
 AddTest(
   TARGET operator_ekinetic_test
-  LIBS ${math_libs} psi base device numerical_atomic_orbitals container
+  LIBS ${math_libs} psi base device container
   SOURCES test_ekineticnew.cpp ../ekinetic_new.cpp ../../../module_hcontainer/func_folding.cpp 
   ../../../module_hcontainer/base_matrix.cpp ../../../module_hcontainer/hcontainer.cpp ../../../module_hcontainer/atom_pair.cpp  
   ../../../../module_basis/module_ao/parallel_2d.cpp ../../../../module_basis/module_ao/parallel_orbitals.cpp 
@@ -33,7 +33,7 @@ AddTest(
 
 AddTest(
   TARGET operator_nonlocal_test
-  LIBS ${math_libs} psi base device numerical_atomic_orbitals container
+  LIBS ${math_libs} psi base device container
   SOURCES test_nonlocalnew.cpp ../nonlocal_new.cpp ../../../module_hcontainer/func_folding.cpp 
   ../../../module_hcontainer/base_matrix.cpp ../../../module_hcontainer/hcontainer.cpp ../../../module_hcontainer/atom_pair.cpp  
   ../../../../module_basis/module_ao/parallel_2d.cpp ../../../../module_basis/module_ao/parallel_orbitals.cpp 
@@ -43,7 +43,7 @@ AddTest(
 
 AddTest(
   TARGET operator_T_NL_cd_test
-  LIBS ${math_libs} psi base device numerical_atomic_orbitals container 
+  LIBS ${math_libs} psi base device container 
   SOURCES test_T_NL_cd.cpp ../nonlocal_new.cpp ../ekinetic_new.cpp ../../../module_hcontainer/func_folding.cpp 
   ../../../module_hcontainer/base_matrix.cpp ../../../module_hcontainer/hcontainer.cpp ../../../module_hcontainer/atom_pair.cpp  
   ../../../../module_basis/module_ao/parallel_2d.cpp ../../../../module_basis/module_ao/parallel_orbitals.cpp 
@@ -53,7 +53,7 @@ AddTest(
 
 AddTest(
   TARGET operator_dftu_test
-  LIBS ${math_libs} psi base device numerical_atomic_orbitals container 
+  LIBS ${math_libs} psi base device container 
   SOURCES test_dftu.cpp ../dftu_lcao.cpp ../../../module_hcontainer/func_folding.cpp 
   ../../../module_hcontainer/base_matrix.cpp ../../../module_hcontainer/hcontainer.cpp ../../../module_hcontainer/atom_pair.cpp  
   ../../../../module_basis/module_ao/parallel_2d.cpp ../../../../module_basis/module_ao/parallel_orbitals.cpp 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
index 893f585540..cb31484a9b 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
@@ -184,7 +184,7 @@ TEST_F(TNLTest, testTVNLcd2cd)
             {
                 if(mu % npol == nu % npol)
                 {
-                    EXPECT_EQ(tmp.get_pointer(0)[i].real(), 2.0);
+                    EXPECT_EQ(tmp.get_pointer(0)[i].real(), 1.0);
                     EXPECT_EQ(tmp.get_pointer(0)[i].imag(), 0.0);
                     EXPECT_EQ(tmp.get_pointer(1)[i].real(), result_ref);
                     EXPECT_EQ(tmp.get_pointer(1)[i].imag(), 0.0);
@@ -201,7 +201,7 @@ TEST_F(TNLTest, testTVNLcd2cd)
         }
     }
     // check the value of HK of gamma point
-    result_ref += 2.0;
+    result_ref += 1.0;
     int i = 0;
     for ( int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
@@ -229,8 +229,8 @@ TEST_F(TNLTest, testTVNLcd2cd)
     std::cout << "Test terms:   " <<std::setw(15)<< "constructor" <<std::setw(15)<< "init(HR+HK)" <<std::setw(15)<< "2nd-init(HK)" << std::endl;
     std::cout << "Elapsed time: " <<std::setw(15)<< elapsed_time0.count()<<std::setw(15)<<elapsed_time1.count()<<std::setw(15)<<elapsed_time2.count() << " seconds." << std::endl;
     // check the value of HK
-    double result_ref1 = -1.6180339887498931 + test_size * 10;
-    double result_ref2 = -1.1755705045849467;
+    double result_ref1 = -1.6180339887498931/2 + test_size * 10;
+    double result_ref2 = -1.1755705045849467/2;
     i = 0;
     for ( int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
index 9c91878c90..2a53e0feda 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
@@ -134,7 +134,7 @@ class DFTUTest : public ::testing::Test
     hamilt::HContainer<double>* HR;
     hamilt::HContainer<double>* DMR;
     Parallel_Orbitals *paraV;
-    ORB_gen_tables uot_;
+    TwoCenterIntegrator intor_;
 
     int dsize;
     int my_rank = 0;
@@ -167,7 +167,7 @@ TEST_F(DFTUTest, constructHRd2d)
         &hk, 
         ucell, 
         &gd,
-        &uot_,
+        &intor_,
         &GlobalC::dftu,
         *paraV
     );
@@ -235,7 +235,7 @@ TEST_F(DFTUTest, constructHRd2cd)
         &hk, 
         ucell, 
         &gd,
-        &uot_,
+        &intor_,
         &GlobalC::dftu,
         *paraV
     );
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
index 32d7230672..35fbc19850 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
@@ -127,7 +127,7 @@ TEST_F(EkineticNewTest, constructHRd2d)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_EQ(tmp.get_pointer(0)[i], 2.0);
+            EXPECT_EQ(tmp.get_pointer(0)[i], 1.0);
         }
     }
     // calculate SK
@@ -135,7 +135,7 @@ TEST_F(EkineticNewTest, constructHRd2d)
     // check the value of SK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_EQ(hk[i], 2.0);
+        EXPECT_EQ(hk[i], 1.0);
     }
     // calculate HR again
     op.contributeHR();
@@ -150,7 +150,7 @@ TEST_F(EkineticNewTest, constructHRd2d)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_EQ(tmp.get_pointer(0)[i], 4.0);
+            EXPECT_EQ(tmp.get_pointer(0)[i], 2.0);
         }
     }
 }
@@ -183,7 +183,7 @@ TEST_F(EkineticNewTest, constructHRd2cd)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_EQ(tmp.get_pointer(0)[i], 2.0);
+            EXPECT_EQ(tmp.get_pointer(0)[i], 1.0);
         }
     }
     // calculate SK for gamma point
@@ -191,7 +191,7 @@ TEST_F(EkineticNewTest, constructHRd2cd)
     // check the value of SK of gamma point
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_EQ(hk[i].real(), 2.0);
+        EXPECT_EQ(hk[i].real(), 1.0);
         EXPECT_EQ(hk[i].imag(), 0.0);
     }
     // calculate HK for k point
@@ -200,8 +200,8 @@ TEST_F(EkineticNewTest, constructHRd2cd)
     // check the value of HK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_NEAR(hk[i].real(), -1.6180339887498945, 1e-10);
-        EXPECT_NEAR(hk[i].imag(), -1.1755705045849467, 1e-10);
+        EXPECT_NEAR(hk[i].real(), -1.6180339887498945/2, 1e-10);
+        EXPECT_NEAR(hk[i].imag(), -1.1755705045849467/2, 1e-10);
     }
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
index 0e3f21e771..6d9dfa383f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
@@ -140,70 +140,116 @@ template class hamilt::OperatorLCAO<std::complex<double>, double>;
 template class hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>;
 
 // mock of ORB_gen_tables and LCAO_Orbitals
-#include "module_basis/module_ao/ORB_gen_tables.h"
-ORB_gen_tables::ORB_gen_tables() : two_center_bundle(new TwoCenterBundle) {}
-ORB_gen_tables::~ORB_gen_tables() {}
-ORB_gaunt_table::ORB_gaunt_table() {}
-ORB_gaunt_table::~ORB_gaunt_table() {}
-ORB_table_phi::ORB_table_phi() {}
-ORB_table_phi::~ORB_table_phi() {}
-ORB_table_alpha::ORB_table_alpha() {}
-ORB_table_alpha::~ORB_table_alpha() {}
-ORB_table_beta::ORB_table_beta() {}
-ORB_table_beta::~ORB_table_beta() {}
-// mock of snap_psipsi
-void ORB_gen_tables::snap_psipsi(
-    const LCAO_Orbitals &orb,
-    double olm[],
-    const int &job, ///<[in]0 for matrix element of either S or T, 1 for its derivatives
-    const char &dtype, ///<[in] derivative type, 'S' for overlap, 'T' for kinetic energy, 'D' for descriptor in deepks
-    const ModuleBase::Vector3<double> &R1,
-    const int &I1,
-    const int &l1,
-    const int &m1,
-    const int &n1,
-    const ModuleBase::Vector3<double> &R2,
-    const int &I2,
-    const int &l2,
-    const int &m2,
-    const int &n2,
-    bool cal_syns,
-    double dmax)const
+//#include "module_basis/module_ao/ORB_gen_tables.h"
+#include "module_basis/module_nao/two_center_integrator.h"
+TwoCenterIntegrator::TwoCenterIntegrator() {}
+
+void TwoCenterIntegrator::tabulate(
+    const RadialCollection& bra,
+    const RadialCollection& ket,
+    const char op,
+    const int nr,
+    const double cutoff
+) {}
+
+void TwoCenterIntegrator::calculate(
+    const int itype1, 
+    const int l1, 
+    const int izeta1, 
+    const int m1, 
+    const int itype2,
+    const int l2,
+    const int izeta2,
+    const int m2,
+    const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
+    double* out,
+    double* grad_out
+) const
 {
-    if(dtype == 'S')
-    {
-        olm[0] = 1.0;
-    }
-    else if(dtype == 'T')
-    {
-        olm[0] = 2.0;
-    }
-    else if(dtype == 'D')
-    {
-        olm[0] = 3.0;
-    }
+    out[0] = 1.0;
 }
 
-void ORB_gen_tables::snap_psibeta_half(
-	const LCAO_Orbitals &orb,
-	const InfoNonlocal &infoNL_,
-	std::vector<std::vector<double>> &nlm,
-	const ModuleBase::Vector3<double> &R1,
-	const int &T1,
-	const int &L1,
-	const int &m1,
-	const int &N1,
-	const ModuleBase::Vector3<double> &R0, // The projector.
-	const int &T0,
-	const bool &calc_deri)const // mohan add 2021-04-25)
+void TwoCenterIntegrator::snap(
+    const int itype1, 
+    const int l1, 
+    const int izeta1, 
+    const int m1, 
+    const int itype2,
+    const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
+    const bool deriv,
+    std::vector<std::vector<double>>& out
+) const
 {
-    nlm.resize(1);
-    for(int i = 0; i < nlm.size(); ++i)
+    out.resize(1);
+    for(int i = 0; i < out.size(); ++i)
     {
-        nlm[i].resize(5, 1.0);
+        out[i].resize(5, 1.0);
     }
 }
 
+//ORB_gen_tables::ORB_gen_tables() : two_center_bundle(new TwoCenterBundle) {}
+//ORB_gen_tables::~ORB_gen_tables() {}
+//ORB_gaunt_table::ORB_gaunt_table() {}
+//ORB_gaunt_table::~ORB_gaunt_table() {}
+//ORB_table_phi::ORB_table_phi() {}
+//ORB_table_phi::~ORB_table_phi() {}
+//ORB_table_alpha::ORB_table_alpha() {}
+//ORB_table_alpha::~ORB_table_alpha() {}
+//ORB_table_beta::ORB_table_beta() {}
+//ORB_table_beta::~ORB_table_beta() {}
+// mock of snap_psipsi
+//void ORB_gen_tables::snap_psipsi(
+//    const LCAO_Orbitals &orb,
+//    double olm[],
+//    const int &job, ///<[in]0 for matrix element of either S or T, 1 for its derivatives
+//    const char &dtype, ///<[in] derivative type, 'S' for overlap, 'T' for kinetic energy, 'D' for descriptor in deepks
+//    const ModuleBase::Vector3<double> &R1,
+//    const int &I1,
+//    const int &l1,
+//    const int &m1,
+//    const int &n1,
+//    const ModuleBase::Vector3<double> &R2,
+//    const int &I2,
+//    const int &l2,
+//    const int &m2,
+//    const int &n2,
+//    bool cal_syns,
+//    double dmax)const
+//{
+//    if(dtype == 'S')
+//    {
+//        olm[0] = 1.0;
+//    }
+//    else if(dtype == 'T')
+//    {
+//        olm[0] = 2.0;
+//    }
+//    else if(dtype == 'D')
+//    {
+//        olm[0] = 3.0;
+//    }
+//}
+
+//void ORB_gen_tables::snap_psibeta_half(
+//	const LCAO_Orbitals &orb,
+//	const InfoNonlocal &infoNL_,
+//	std::vector<std::vector<double>> &nlm,
+//	const ModuleBase::Vector3<double> &R1,
+//	const int &T1,
+//	const int &L1,
+//	const int &m1,
+//	const int &N1,
+//	const ModuleBase::Vector3<double> &R0, // The projector.
+//	const int &T0,
+//	const bool &calc_deri)const // mohan add 2021-04-25)
+//{
+//    nlm.resize(1);
+//    for(int i = 0; i < nlm.size(); ++i)
+//    {
+//        nlm[i].resize(5, 1.0);
+//    }
+//}
+
 #include "module_basis/module_ao/ORB_read.h"
 const LCAO_Orbitals& LCAO_Orbitals::get_const_instance()
 {

From 9b5323387ab53b172a22ae28125e65d85fea03a7 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Wed, 19 Jun 2024 15:21:00 +0000
Subject: [PATCH 6/8] [pre-commit.ci lite] apply automatic fixes

---
 .../module_nao/two_center_bundle.h            |   4 +-
 .../hamilt_lcaodft/FORCE_STRESS.cpp           | 332 ++++++------
 .../hamilt_lcaodft/hamilt_lcao.cpp            | 495 ++++++++----------
 .../hamilt_lcaodft/operator_lcao/dftu_lcao.h  |  64 +--
 .../operator_lcao/ekinetic_new.h              |   2 +-
 .../operator_lcao/nonlocal_new.cpp            | 161 +++---
 .../operator_lcao/nonlocal_new.h              |   6 +-
 .../operator_lcao/overlap_new.cpp             |   5 +-
 .../operator_lcao/overlap_new.h               |   4 +-
 .../operator_lcao/test/test_T_NL_cd.cpp       |  88 ++--
 .../operator_lcao/test/test_dftu.cpp          | 100 ++--
 .../operator_lcao/test/test_ekineticnew.cpp   |  41 +-
 .../operator_lcao/test/test_nonlocalnew.cpp   |  72 ++-
 .../operator_lcao/test/test_overlapnew.cpp    |  41 +-
 .../operator_lcao/test/test_overlapnew_cd.cpp |  41 +-
 .../operator_lcao/test/tmp_mocks.cpp          | 243 ++++-----
 16 files changed, 802 insertions(+), 897 deletions(-)

diff --git a/source/module_basis/module_nao/two_center_bundle.h b/source/module_basis/module_nao/two_center_bundle.h
index 9f58014a0b..be4f39089a 100644
--- a/source/module_basis/module_nao/two_center_bundle.h
+++ b/source/module_basis/module_nao/two_center_bundle.h
@@ -1,5 +1,5 @@
-#ifndef TWO_CENTER_BUNDLE_H
-#define TWO_CENTER_BUNDLE_H
+#ifndef W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_BASIS_MODULE_NAO_TWO_CENTER_BUNDLE_H
+#define W_ABACUS_DEVELOP_ABACUS_DEVELOP_SOURCE_MODULE_BASIS_MODULE_NAO_TWO_CENTER_BUNDLE_H
 
 #include "module_basis/module_ao/ORB_read.h"
 #include "module_basis/module_nao/two_center_integrator.h"
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
index 0ca36ad054..900be874c2 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.cpp
@@ -11,8 +11,8 @@
 #include "module_hamilt_general/module_surchem/surchem.h" //sunml add 2022-08-10
 #include "module_hamilt_general/module_vdw/vdw.h"
 #ifdef __DEEPKS
-#include "module_hamilt_lcao/module_deepks/LCAO_deepks.h" //caoyu add for deepks 2021-06-03
 #include "module_elecstate/elecstate_lcao.h"
+#include "module_hamilt_lcao/module_deepks/LCAO_deepks.h" //caoyu add for deepks 2021-06-03
 #endif
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h"
 
@@ -29,14 +29,14 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
                                           const bool isstress,
                                           const bool istestf,
                                           const bool istests,
-										  Parallel_Orbitals &pv,
+                                          Parallel_Orbitals& pv,
                                           const elecstate::ElecState* pelec,
                                           const psi::Psi<T>* psi,
-										  LCAO_Matrix& lm,
-										  Gint_Gamma &gint_gamma, // mohan add 2024-04-01
-										  Gint_k &gint_k, // mohan add 2024-04-01
+                                          LCAO_Matrix& lm,
+                                          Gint_Gamma& gint_gamma, // mohan add 2024-04-01
+                                          Gint_k& gint_k,         // mohan add 2024-04-01
                                           const ORB_gen_tables* uot,
-										  ModuleBase::matrix& fcs,
+                                          ModuleBase::matrix& fcs,
                                           ModuleBase::matrix& scs,
                                           const Structure_Factor& sf,
                                           const K_Vectors& kv,
@@ -142,32 +142,31 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
     }
 
     //! atomic forces from integration (4 terms)
-    this->integral_part(
-        GlobalV::GAMMA_ONLY_LOCAL,
-        isforce,
-        isstress,
-        fsr,
-        pelec,
-        psi,
-        foverlap,
-        ftvnl_dphi,
-        fvnl_dbeta,
-        fvl_dphi,
-        soverlap,
-        stvnl_dphi,
-        svnl_dbeta,
-        svl_dphi,
+    this->integral_part(GlobalV::GAMMA_ONLY_LOCAL,
+                        isforce,
+                        isstress,
+                        fsr,
+                        pelec,
+                        psi,
+                        foverlap,
+                        ftvnl_dphi,
+                        fvnl_dbeta,
+                        fvl_dphi,
+                        soverlap,
+                        stvnl_dphi,
+                        svnl_dbeta,
+                        svl_dphi,
 #ifdef __DEEPKS
-        svnl_dalpha,
+                        svnl_dalpha,
 #endif
-        gint_gamma,
-        gint_k,
-        uot,
-        pv,
-        lm,
-        kv);
-
-    //! forces and stress from vdw 
+                        gint_gamma,
+                        gint_k,
+                        uot,
+                        pv,
+                        lm,
+                        kv);
+
+    //! forces and stress from vdw
     //  Peize Lin add 2014-04-04, update 2021-03-09
     //  jiyy add 2019-05-18, update 2021-05-02
     ModuleBase::matrix force_vdw;
@@ -238,29 +237,27 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
         {
             stress_dftu.create(3, 3);
         }
-        if(GlobalV::dft_plus_u == 2)
+        if (GlobalV::dft_plus_u == 2)
         {
-            GlobalC::dftu.force_stress(
-					pelec, 
-					lm, 
-					pv,
-					fsr, //mohan 2024-06-16
-					force_dftu, 
-					stress_dftu, 
-					kv);
+            GlobalC::dftu.force_stress(pelec,
+                                       lm,
+                                       pv,
+                                       fsr, // mohan 2024-06-16
+                                       force_dftu,
+                                       stress_dftu,
+                                       kv);
         }
         else
         {
-			hamilt::DFTU<hamilt::OperatorLCAO<T, double>> tmp_dftu(
-					&lm,
-					kv.kvec_d,
-					nullptr,
-					nullptr,
-					GlobalC::ucell,
-					&GlobalC::GridD,
-                    uot->two_center_bundle->overlap_orb_onsite.get(),
-					&GlobalC::dftu,
-					*(lm.ParaV));
+            hamilt::DFTU<hamilt::OperatorLCAO<T, double>> tmp_dftu(&lm,
+                                                                   kv.kvec_d,
+                                                                   nullptr,
+                                                                   nullptr,
+                                                                   GlobalC::ucell,
+                                                                   &GlobalC::GridD,
+                                                                   uot->two_center_bundle->overlap_orb_onsite.get(),
+                                                                   &GlobalC::dftu,
+                                                                   *(lm.ParaV));
 
             tmp_dftu.cal_force_stress(isforce, isstress, force_dftu, stress_dftu);
         }
@@ -409,7 +406,7 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
             {
                 GlobalC::ld.save_npy_f(fcs - GlobalC::ld.F_delta, "f_base.npy", GlobalC::ucell.nat); // Ry/Bohr, F_base
 
-                if(!GlobalV::deepks_equiv) //training with force label not supported by equivariant version now
+                if (!GlobalV::deepks_equiv) // training with force label not supported by equivariant version now
                 {
                     if (GlobalV::GAMMA_ONLY_LOCAL)
                     {
@@ -421,15 +418,26 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
                     {
                         const std::vector<std::vector<std::complex<double>>>& dm_k
                             = dynamic_cast<const elecstate::ElecStateLCAO<std::complex<double>>*>(pelec)
-                                ->get_DM()
-                                ->get_DMK_vector();
-                        GlobalC::ld
-                            .cal_gdmx_k(dm_k, GlobalC::ucell, GlobalC::ORB, GlobalC::GridD, kv.get_nks(), kv.kvec_d, isstress);
+                                  ->get_DM()
+                                  ->get_DMK_vector();
+                        GlobalC::ld.cal_gdmx_k(dm_k,
+                                               GlobalC::ucell,
+                                               GlobalC::ORB,
+                                               GlobalC::GridD,
+                                               kv.get_nks(),
+                                               kv.kvec_d,
+                                               isstress);
+                    }
+                    if (GlobalV::deepks_out_unittest)
+                    {
+                        GlobalC::ld.check_gdmx(GlobalC::ucell.nat);
                     }
-                    if (GlobalV::deepks_out_unittest) { GlobalC::ld.check_gdmx(GlobalC::ucell.nat); }
                     GlobalC::ld.cal_gvx(GlobalC::ucell.nat);
 
-                    if (GlobalV::deepks_out_unittest) { GlobalC::ld.check_gvx(GlobalC::ucell.nat); }
+                    if (GlobalV::deepks_out_unittest)
+                    {
+                        GlobalC::ld.check_gvx(GlobalC::ucell.nat);
+                    }
                     GlobalC::ld.save_npy_gvx(GlobalC::ucell.nat); //  /Bohr, grad_vx
                 }
             }
@@ -620,7 +628,7 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
                 GlobalC::ld.save_npy_s(scs,
                                        "s_tot.npy",
                                        GlobalC::ucell.omega); // change to energy unit Ry when printing, S_tot, w/ model
-                if(!GlobalV::deepks_equiv) //training with stress label not supported by equivariant version now
+                if (!GlobalV::deepks_equiv) // training with stress label not supported by equivariant version now
                 {
                     GlobalC::ld.cal_gvepsl(GlobalC::ucell.nat);
                     GlobalC::ld.save_npy_gvepsl(GlobalC::ucell.nat); //  unitless, grad_vepsl
@@ -705,17 +713,17 @@ void Force_Stress_LCAO<T>::getForceStress(const bool isforce,
 }
 
 // local pseudopotential, ewald, core correction, scc terms in force
-template<typename T>
+template <typename T>
 void Force_Stress_LCAO<T>::calForcePwPart(ModuleBase::matrix& fvl_dvl,
-                                       ModuleBase::matrix& fewalds,
-                                       ModuleBase::matrix& fcc,
-                                       ModuleBase::matrix& fscc,
-                                       const double& etxc,
-                                       const ModuleBase::matrix& vnew,
-                                       const bool vnew_exist,
-                                       const Charge* const chr,
-                                       ModulePW::PW_Basis* rhopw,
-                                       const Structure_Factor& sf)
+                                          ModuleBase::matrix& fewalds,
+                                          ModuleBase::matrix& fcc,
+                                          ModuleBase::matrix& fscc,
+                                          const double& etxc,
+                                          const ModuleBase::matrix& vnew,
+                                          const bool vnew_exist,
+                                          const Charge* const chr,
+                                          ModulePW::PW_Basis* rhopw,
+                                          const Structure_Factor& sf)
 {
     ModuleBase::TITLE("Force_Stress_LCAO", "calForcePwPart");
     //--------------------------------------------------------
@@ -740,122 +748,118 @@ void Force_Stress_LCAO<T>::calForcePwPart(ModuleBase::matrix& fvl_dvl,
 }
 
 // overlap, kinetic, nonlocal pseudopotential, Local potential terms in force and stress
-template<>
-void Force_Stress_LCAO<double>::integral_part(
-    const bool isGammaOnly,
-    const bool isforce,
-    const bool isstress,
-    ForceStressArrays &fsr, // mohan add 2024-06-15
-    const elecstate::ElecState* pelec,
-    const psi::Psi<double>* psi,
-    ModuleBase::matrix& foverlap,
-    ModuleBase::matrix& ftvnl_dphi,
-    ModuleBase::matrix& fvnl_dbeta,
-    ModuleBase::matrix& fvl_dphi,
-    ModuleBase::matrix& soverlap,
-    ModuleBase::matrix& stvnl_dphi,
-    ModuleBase::matrix& svnl_dbeta,
-    ModuleBase::matrix& svl_dphi,
+template <>
+void Force_Stress_LCAO<double>::integral_part(const bool isGammaOnly,
+                                              const bool isforce,
+                                              const bool isstress,
+                                              ForceStressArrays& fsr, // mohan add 2024-06-15
+                                              const elecstate::ElecState* pelec,
+                                              const psi::Psi<double>* psi,
+                                              ModuleBase::matrix& foverlap,
+                                              ModuleBase::matrix& ftvnl_dphi,
+                                              ModuleBase::matrix& fvnl_dbeta,
+                                              ModuleBase::matrix& fvl_dphi,
+                                              ModuleBase::matrix& soverlap,
+                                              ModuleBase::matrix& stvnl_dphi,
+                                              ModuleBase::matrix& svnl_dbeta,
+                                              ModuleBase::matrix& svl_dphi,
 #if __DEEPKS
-    ModuleBase::matrix& svnl_dalpha,
+                                              ModuleBase::matrix& svnl_dalpha,
 #endif
-	Gint_Gamma &gint_gamma, // mohan add 2024-04-01
-	Gint_k &gint_k, // mohan add 2024-04-01
-    const ORB_gen_tables* uot,
-    const Parallel_Orbitals& pv,
-    LCAO_Matrix &lm,
-    const K_Vectors& kv)
+                                              Gint_Gamma& gint_gamma, // mohan add 2024-04-01
+                                              Gint_k& gint_k,         // mohan add 2024-04-01
+                                              const ORB_gen_tables* uot,
+                                              const Parallel_Orbitals& pv,
+                                              LCAO_Matrix& lm,
+                                              const K_Vectors& kv)
 {
 
-    flk.ftable(
-        isforce,
-        isstress,
-        fsr, // mohan add 2024-06-15
-        GlobalC::ucell,
-        psi,
-        pelec,
-        foverlap,
-        ftvnl_dphi,
-        fvnl_dbeta,
-        fvl_dphi,
-        soverlap,
-        stvnl_dphi,
-        svnl_dbeta,
-        svl_dphi,
+    flk.ftable(isforce,
+               isstress,
+               fsr, // mohan add 2024-06-15
+               GlobalC::ucell,
+               psi,
+               pelec,
+               foverlap,
+               ftvnl_dphi,
+               fvnl_dbeta,
+               fvl_dphi,
+               soverlap,
+               stvnl_dphi,
+               svnl_dbeta,
+               svl_dphi,
 #if __DEEPKS
-        svnl_dalpha,
+               svnl_dalpha,
 #endif
-        gint_gamma,
-        uot,
-        pv,
-        lm);
+               gint_gamma,
+               uot,
+               pv,
+               lm);
     return;
 }
 
-
-template<>
-void Force_Stress_LCAO<std::complex<double>>::integral_part(
-    const bool isGammaOnly,
-    const bool isforce,
-    const bool isstress,
-    ForceStressArrays &fsr, // mohan add 2024-06-15
-    const elecstate::ElecState* pelec,
-    const psi::Psi<std::complex<double>>* psi,
-    ModuleBase::matrix& foverlap,
-    ModuleBase::matrix& ftvnl_dphi,
-    ModuleBase::matrix& fvnl_dbeta,
-    ModuleBase::matrix& fvl_dphi,
-    ModuleBase::matrix& soverlap,
-    ModuleBase::matrix& stvnl_dphi,
-    ModuleBase::matrix& svnl_dbeta,
-    ModuleBase::matrix& svl_dphi,
+template <>
+void Force_Stress_LCAO<std::complex<double>>::integral_part(const bool isGammaOnly,
+                                                            const bool isforce,
+                                                            const bool isstress,
+                                                            ForceStressArrays& fsr, // mohan add 2024-06-15
+                                                            const elecstate::ElecState* pelec,
+                                                            const psi::Psi<std::complex<double>>* psi,
+                                                            ModuleBase::matrix& foverlap,
+                                                            ModuleBase::matrix& ftvnl_dphi,
+                                                            ModuleBase::matrix& fvnl_dbeta,
+                                                            ModuleBase::matrix& fvl_dphi,
+                                                            ModuleBase::matrix& soverlap,
+                                                            ModuleBase::matrix& stvnl_dphi,
+                                                            ModuleBase::matrix& svnl_dbeta,
+                                                            ModuleBase::matrix& svl_dphi,
 #if __DEEPKS
-    ModuleBase::matrix& svnl_dalpha,
+                                                            ModuleBase::matrix& svnl_dalpha,
 #endif
-	Gint_Gamma &gint_gamma,
-	Gint_k &gint_k,
-    const ORB_gen_tables* uot,
-    const Parallel_Orbitals& pv,
-	LCAO_Matrix &lm,
-	const K_Vectors& kv)
+                                                            Gint_Gamma& gint_gamma,
+                                                            Gint_k& gint_k,
+                                                            const ORB_gen_tables* uot,
+                                                            const Parallel_Orbitals& pv,
+                                                            LCAO_Matrix& lm,
+                                                            const K_Vectors& kv)
 {
     flk.ftable(isforce,
-        isstress,
-        fsr, // mohan add 2024-06-16
-        GlobalC::ucell,
-        psi,
-        pelec,
-        foverlap,
-        ftvnl_dphi,
-        fvnl_dbeta,
-        fvl_dphi,
-        soverlap,
-        stvnl_dphi,
-        svnl_dbeta,
-        svl_dphi,
+               isstress,
+               fsr, // mohan add 2024-06-16
+               GlobalC::ucell,
+               psi,
+               pelec,
+               foverlap,
+               ftvnl_dphi,
+               fvnl_dbeta,
+               fvl_dphi,
+               soverlap,
+               stvnl_dphi,
+               svnl_dbeta,
+               svl_dphi,
 #if __DEEPKS
-        svnl_dalpha,
+               svnl_dalpha,
 #endif
-        gint_k,
-        uot,
-        pv,
-        lm,
-        & kv,
-        this->RA);
+               gint_k,
+               uot,
+               pv,
+               lm,
+               &kv,
+               this->RA);
     return;
 }
 
 // vlocal, hartree, ewald, core correction, exchange-correlation terms in stress
-template<typename T>
+template <typename T>
 void Force_Stress_LCAO<T>::calStressPwPart(ModuleBase::matrix& sigmadvl,
-                                        ModuleBase::matrix& sigmahar,
-                                        ModuleBase::matrix& sigmaewa,
-                                        ModuleBase::matrix& sigmacc,
-                                        ModuleBase::matrix& sigmaxc,
-                                        const double& etxc,
-                                        const Charge* const chr,
-                                        ModulePW::PW_Basis* rhopw,
-                                        const Structure_Factor& sf)
+                                           ModuleBase::matrix& sigmahar,
+                                           ModuleBase::matrix& sigmaewa,
+                                           ModuleBase::matrix& sigmacc,
+                                           ModuleBase::matrix& sigmaxc,
+                                           const double& etxc,
+                                           const Charge* const chr,
+                                           ModulePW::PW_Basis* rhopw,
+                                           const Structure_Factor& sf)
 {
     ModuleBase::TITLE("Force_Stress_LCAO", "calStressPwPart");
     //--------------------------------------------------------
@@ -894,7 +898,7 @@ void Force_Stress_LCAO<T>::calStressPwPart(ModuleBase::matrix& sigmadvl,
 
 #include "module_base/mathzone.h"
 // do symmetry for total force
-template<typename T>
+template <typename T>
 void Force_Stress_LCAO<T>::forceSymmetry(ModuleBase::matrix& fcs, ModuleSymmetry::Symmetry* symm)
 {
     double d1, d2, d3;
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
index 2f8caad081..c855cc735a 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/hamilt_lcao.cpp
@@ -1,8 +1,8 @@
 #include "hamilt_lcao.h"
 
 #include "module_base/global_variable.h"
-#include "module_base/timer.h"
 #include "module_base/memory.h"
+#include "module_base/timer.h"
 #include "module_hamilt_lcao/module_dftu/dftu.h"
 #include "module_hamilt_pw/hamilt_pwdft/global.h"
 #ifdef __DEEPKS
@@ -10,32 +10,32 @@
 #include "operator_lcao/deepks_lcao.h"
 #endif
 #ifdef __EXX
-#include "operator_lcao/op_exx_lcao.h"
 #include "module_ri/Exx_LRI_interface.h"
+#include "operator_lcao/op_exx_lcao.h"
 #endif
 #ifdef __ELPA
 #include "module_hsolver/diago_elpa.h"
 #endif
-#include "operator_lcao/op_dftu_lcao.h"
+#include "module_elecstate/potentials/H_TDDFT_pw.h"
+#include "module_hamilt_general/module_xc/xc_functional.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
+#include "module_hsolver/hsolver_lcao.h"
 #include "operator_lcao/dftu_lcao.h"
+#include "operator_lcao/ekinetic_new.h"
 #include "operator_lcao/meta_lcao.h"
+#include "operator_lcao/nonlocal_new.h"
+#include "operator_lcao/op_dftu_lcao.h"
 #include "operator_lcao/op_exx_lcao.h"
+#include "operator_lcao/overlap_new.h"
+#include "operator_lcao/sc_lambda_lcao.h"
 #include "operator_lcao/td_ekinetic_lcao.h"
 #include "operator_lcao/td_nonlocal_lcao.h"
-#include "operator_lcao/overlap_new.h"
-#include "operator_lcao/ekinetic_new.h"
-#include "operator_lcao/nonlocal_new.h"
 #include "operator_lcao/veff_lcao.h"
-#include "operator_lcao/sc_lambda_lcao.h"
-#include "module_hsolver/hsolver_lcao.h"
-#include "module_hamilt_general/module_xc/xc_functional.h"
-#include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_elecstate/potentials/H_TDDFT_pw.h"
 
 namespace hamilt
 {
 
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 HamiltLCAO<TK, TR>::HamiltLCAO(LCAO_Matrix* LM_in, const K_Vectors& kv_in, const ORB_gen_tables* uot)
 {
     this->classname = "HamiltLCAO";
@@ -46,31 +46,28 @@ HamiltLCAO<TK, TR>::HamiltLCAO(LCAO_Matrix* LM_in, const K_Vectors& kv_in, const
     this->hR = new HContainer<TR>(LM_in->ParaV);
     this->sR = new HContainer<TR>(LM_in->ParaV);
 
-    this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(
-        LM_in,
-        this->kv->kvec_d,
-        this->hR,
-        &(this->getHk(LM_in)),
-        this->sR,
-        &(this->getSk(LM_in)),
-        &GlobalC::ucell,
-        &GlobalC::GridD,
-        uot->two_center_bundle->overlap_orb.get(),
-        LM_in->ParaV
-    );
+    this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                               this->kv->kvec_d,
+                                                               this->hR,
+                                                               &(this->getHk(LM_in)),
+                                                               this->sR,
+                                                               &(this->getSk(LM_in)),
+                                                               &GlobalC::ucell,
+                                                               &GlobalC::GridD,
+                                                               uot->two_center_bundle->overlap_orb.get(),
+                                                               LM_in->ParaV);
 }
 
-template<typename TK, typename TR>
-HamiltLCAO<TK, TR>::HamiltLCAO(
-    Gint_Gamma* GG_in,
-    Gint_k* GK_in,
-    LCAO_Matrix* LM_in,
-    Local_Orbital_Charge* loc_in,
-    elecstate::Potential* pot_in,
-    const K_Vectors& kv_in,
-    const ORB_gen_tables* uot,
-    elecstate::DensityMatrix<TK, double>* DM_in,
-    int* exx_two_level_step)
+template <typename TK, typename TR>
+HamiltLCAO<TK, TR>::HamiltLCAO(Gint_Gamma* GG_in,
+                               Gint_k* GK_in,
+                               LCAO_Matrix* LM_in,
+                               Local_Orbital_Charge* loc_in,
+                               elecstate::Potential* pot_in,
+                               const K_Vectors& kv_in,
+                               const ORB_gen_tables* uot,
+                               elecstate::DensityMatrix<TK, double>* DM_in,
+                               int* exx_two_level_step)
 {
     this->kv = &kv_in;
     this->classname = "HamiltLCAO";
@@ -79,13 +76,14 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
     this->hR = new HContainer<TR>(LM_in->ParaV);
     this->sR = new HContainer<TR>(LM_in->ParaV);
 
-    const std::size_t row_col_size = static_cast<std::size_t>(LM_in->ParaV->get_row_size()) * LM_in->ParaV->get_col_size();
+    const std::size_t row_col_size
+        = static_cast<std::size_t>(LM_in->ParaV->get_row_size()) * LM_in->ParaV->get_col_size();
     this->getSk(LM_in).resize(row_col_size);
     this->getHk(LM_in).resize(row_col_size);
 
     // Effective potential term (\sum_r <psi(r)|Veff(r)|psi(r)>) is registered without template
     std::vector<std::string> pot_register_in;
-    if(GlobalV::VL_IN_H)
+    if (GlobalV::VL_IN_H)
     {
         if (GlobalV::VION_IN_H)
         {
@@ -117,162 +115,150 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
     // Gamma_only case to initialize HamiltLCAO
     //
     // code block to construct Operator Chains
-    if(std::is_same<TK, double>::value)
+    if (std::is_same<TK, double>::value)
     {
         // fix HR to gamma case, where SR will be fixed in Overlap Operator
         this->hR->fix_gamma();
         // initial operator for Gamma_only case
         // overlap term (<psi|psi>) is indispensable
         // in Gamma_only case, target SR is LCAO_Matrix::Sloc, which is same as SK
-        this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(
-            LM_in,
-            this->kv->kvec_d,
-            this->hR,
-            &(this->getHk(LM_in)),
-            this->sR,
-            &(this->getSk(LM_in)),
-            &GlobalC::ucell,
-            &GlobalC::GridD,
-            uot->two_center_bundle->overlap_orb.get(),
-            LM_in->ParaV
-        );
+        this->getOperator() = new OverlapNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                                   this->kv->kvec_d,
+                                                                   this->hR,
+                                                                   &(this->getHk(LM_in)),
+                                                                   this->sR,
+                                                                   &(this->getSk(LM_in)),
+                                                                   &GlobalC::ucell,
+                                                                   &GlobalC::GridD,
+                                                                   uot->two_center_bundle->overlap_orb.get(),
+                                                                   LM_in->ParaV);
 
         // kinetic term (<psi|T|psi>),
         // in Gamma_only case, target HR is LCAO_Matrix::Hloc_fixed, while target HK is LCAO_Matrix::Hloc
         // LCAO_Matrix::Hloc_fixed2 is used for storing
-        if(GlobalV::T_IN_H)
+        if (GlobalV::T_IN_H)
         {
-            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(
-                LM_in, 
-                this->kv->kvec_d, 
-                this->hR, 
-                &(this->getHk(LM_in)),
-                &GlobalC::ucell, 
-                &GlobalC::GridD,
-                uot->two_center_bundle->kinetic_orb.get(),
-                LM_in->ParaV
-            );
+            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                                           this->kv->kvec_d,
+                                                                           this->hR,
+                                                                           &(this->getHk(LM_in)),
+                                                                           &GlobalC::ucell,
+                                                                           &GlobalC::GridD,
+                                                                           uot->two_center_bundle->kinetic_orb.get(),
+                                                                           LM_in->ParaV);
             this->getOperator()->add(ekinetic);
         }
 
         // nonlocal term (<psi|beta>D<beta|psi>)
         // in general case, target HR is this->hR, while target HK is LCAO_Matrix::Hloc
-        if(GlobalV::VNL_IN_H)
+        if (GlobalV::VNL_IN_H)
         {
-            Operator<TK>* nonlocal = new NonlocalNew<OperatorLCAO<TK, TR>>(
-                LM_in, 
-                this->kv->kvec_d, 
-                this->hR, 
-                &(this->getHk(LM_in)),
-                &GlobalC::ucell, 
-                &GlobalC::GridD,
-                uot->two_center_bundle->overlap_orb_beta.get(),
-                LM_in->ParaV
-            );
+            Operator<TK>* nonlocal
+                = new NonlocalNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                        this->kv->kvec_d,
+                                                        this->hR,
+                                                        &(this->getHk(LM_in)),
+                                                        &GlobalC::ucell,
+                                                        &GlobalC::GridD,
+                                                        uot->two_center_bundle->overlap_orb_beta.get(),
+                                                        LM_in->ParaV);
             this->getOperator()->add(nonlocal);
         }
 
         // Effective potential term (\sum_r <psi(r)|Veff(r)|psi(r)>)
         // in general case, target HR is Gint::hRGint, while target HK is LCAO_Matrix::Hloc
-        if(GlobalV::VL_IN_H)
+        if (GlobalV::VL_IN_H)
         {
-            //only Potential is not empty, Veff and Meta are available
-            if(pot_register_in.size()>0)
+            // only Potential is not empty, Veff and Meta are available
+            if (pot_register_in.size() > 0)
             {
-                //register Potential by gathered operator
+                // register Potential by gathered operator
                 pot_in->pot_register(pot_register_in);
-                //effective potential term
-                Operator<TK>* veff = new Veff<OperatorLCAO<TK, TR>>(
-                    GG_in,
-                    loc_in,
-                    LM_in,
-                    this->kv->kvec_d,
-                    pot_in,
-                    this->hR, // no explicit call yet
-                    &(this->getHk(LM_in)),
-                    &GlobalC::ucell,
-                    &GlobalC::GridD,
-                    LM_in->ParaV // no explicit call yet
+                // effective potential term
+                Operator<TK>* veff = new Veff<OperatorLCAO<TK, TR>>(GG_in,
+                                                                    loc_in,
+                                                                    LM_in,
+                                                                    this->kv->kvec_d,
+                                                                    pot_in,
+                                                                    this->hR, // no explicit call yet
+                                                                    &(this->getHk(LM_in)),
+                                                                    &GlobalC::ucell,
+                                                                    &GlobalC::GridD,
+                                                                    LM_in->ParaV // no explicit call yet
                 );
                 this->getOperator()->add(veff);
             }
         }
 
-    #ifdef __DEEPKS
+#ifdef __DEEPKS
         if (GlobalV::deepks_scf)
         {
             Operator<TK>* deepks = new DeePKS<OperatorLCAO<TK, TR>>(loc_in,
-                                                                        LM_in,
-                                                                        this->kv->kvec_d,
-                                                                        this->hR, // no explicit call yet
-                                                                        &(this->getHk(LM_in)),
-                                                                        &GlobalC::ucell,
-                                                                        &GlobalC::GridD,
-                                                                        uot,
-                                                                        this->kv->get_nks(),
-                                                                        DM_in);
+                                                                    LM_in,
+                                                                    this->kv->kvec_d,
+                                                                    this->hR, // no explicit call yet
+                                                                    &(this->getHk(LM_in)),
+                                                                    &GlobalC::ucell,
+                                                                    &GlobalC::GridD,
+                                                                    uot,
+                                                                    this->kv->get_nks(),
+                                                                    DM_in);
             this->getOperator()->add(deepks);
         }
-    #endif
+#endif
 
-        //end node should be OperatorDFTU
+        // end node should be OperatorDFTU
         if (GlobalV::dft_plus_u)
         {
             Operator<TK>* dftu = nullptr;
-            if(GlobalV::dft_plus_u == 2) 
+            if (GlobalV::dft_plus_u == 2)
             {
-                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(
-                    LM_in,
-                    kv->kvec_d,
-                    this->hR,// no explicit call yet
-                    &(this->getHk(LM_in)),
-                    this->kv->isk
-                );
+                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(LM_in,
+                                                              kv->kvec_d,
+                                                              this->hR, // no explicit call yet
+                                                              &(this->getHk(LM_in)),
+                                                              this->kv->isk);
             }
             else
             {
-                dftu = new DFTU<OperatorLCAO<TK, TR>>(
-                    LM_in,
-                    this->kv->kvec_d,
-                    this->hR,
-                    &(this->getHk(LM_in)),
-                    GlobalC::ucell,
-                    &GlobalC::GridD,
-                    uot->two_center_bundle->overlap_orb_onsite.get(),
-                    &GlobalC::dftu,
-                    *(LM_in->ParaV)
-                );
+                dftu = new DFTU<OperatorLCAO<TK, TR>>(LM_in,
+                                                      this->kv->kvec_d,
+                                                      this->hR,
+                                                      &(this->getHk(LM_in)),
+                                                      GlobalC::ucell,
+                                                      &GlobalC::GridD,
+                                                      uot->two_center_bundle->overlap_orb_onsite.get(),
+                                                      &GlobalC::dftu,
+                                                      *(LM_in->ParaV));
             }
             this->getOperator()->add(dftu);
         }
     }
     // multi-k-points case to initialize HamiltLCAO, ops will be used
-    else if(std::is_same<TK, std::complex<double>>::value)
+    else if (std::is_same<TK, std::complex<double>>::value)
     {
         // Effective potential term (\sum_r <psi(r)|Veff(r)|psi(r)>)
         // Meta potential term (\sum_r <psi(r)|tau(r)|psi(r)>)
         // in general case, target HR is Gint::pvpR_reduced, while target HK is LCAO_Matrix::Hloc2
-        if(GlobalV::VL_IN_H)
+        if (GlobalV::VL_IN_H)
         {
-            //only Potential is not empty, Veff and Meta are available
-            if(pot_register_in.size()>0)
+            // only Potential is not empty, Veff and Meta are available
+            if (pot_register_in.size() > 0)
             {
-                //register Potential by gathered operator
+                // register Potential by gathered operator
                 pot_in->pot_register(pot_register_in);
-                //Veff term
-                this->getOperator() = new Veff<OperatorLCAO<TK, TR>>(
-                    GK_in,
-                    loc_in,
-                    LM_in,
-                    kv->kvec_d,
-                    pot_in,
-                    this->hR, 
-                    &(this->getHk(LM_in)),
-                    &GlobalC::ucell,
-                    &GlobalC::GridD,
-                    LM_in->ParaV
-                );
-                //reset spin index and real space Hamiltonian matrix
+                // Veff term
+                this->getOperator() = new Veff<OperatorLCAO<TK, TR>>(GK_in,
+                                                                     loc_in,
+                                                                     LM_in,
+                                                                     kv->kvec_d,
+                                                                     pot_in,
+                                                                     this->hR,
+                                                                     &(this->getHk(LM_in)),
+                                                                     &GlobalC::ucell,
+                                                                     &GlobalC::GridD,
+                                                                     LM_in->ParaV);
+                // reset spin index and real space Hamiltonian matrix
                 int start_spin = -1;
                 GK_in->reset_spin(start_spin);
                 GK_in->destroy_pvpR();
@@ -282,19 +268,17 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
 
         // initial operator for multi-k case
         // overlap term is indispensable
-        Operator<TK>* overlap = new OverlapNew<OperatorLCAO<TK, TR>>(
-            LM_in,
-            this->kv->kvec_d,
-            this->hR,
-            &(this->getHk(LM_in)),
-            this->sR,
-            &(this->getSk(LM_in)),
-            &GlobalC::ucell,
-            &GlobalC::GridD,
-            uot->two_center_bundle->overlap_orb.get(),
-            LM_in->ParaV
-        );
-        if(this->getOperator() == nullptr)
+        Operator<TK>* overlap = new OverlapNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                                     this->kv->kvec_d,
+                                                                     this->hR,
+                                                                     &(this->getHk(LM_in)),
+                                                                     this->sR,
+                                                                     &(this->getSk(LM_in)),
+                                                                     &GlobalC::ucell,
+                                                                     &GlobalC::GridD,
+                                                                     uot->two_center_bundle->overlap_orb.get(),
+                                                                     LM_in->ParaV);
+        if (this->getOperator() == nullptr)
         {
             this->getOperator() = overlap;
         }
@@ -305,38 +289,35 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
 
         // kinetic term (<psi|T|psi>),
         // in general case, target HR is this->hR, while target HK is LCAO_Matrix::Hloc2
-        if(GlobalV::T_IN_H)
+        if (GlobalV::T_IN_H)
         {
-            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(
-                LM_in,
-                this->kv->kvec_d,
-                this->hR,
-                &(this->getHk(LM_in)),
-                &GlobalC::ucell,
-                &GlobalC::GridD,
-                uot->two_center_bundle->kinetic_orb.get(),
-                LM_in->ParaV
-            );
+            Operator<TK>* ekinetic = new EkineticNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                                           this->kv->kvec_d,
+                                                                           this->hR,
+                                                                           &(this->getHk(LM_in)),
+                                                                           &GlobalC::ucell,
+                                                                           &GlobalC::GridD,
+                                                                           uot->two_center_bundle->kinetic_orb.get(),
+                                                                           LM_in->ParaV);
             this->getOperator()->add(ekinetic);
         }
 
         // nonlocal term (<psi|beta>D<beta|psi>)
         // in general case, target HR is this->hR, while target HK is LCAO_Matrix::Hloc2
-        if(GlobalV::VNL_IN_H)
+        if (GlobalV::VNL_IN_H)
         {
-            Operator<TK>* nonlocal = new NonlocalNew<OperatorLCAO<TK, TR>>(
-                LM_in,
-                this->kv->kvec_d,
-                this->hR,
-                &(this->getHk(LM_in)),
-                &GlobalC::ucell,
-                &GlobalC::GridD,
-                uot->two_center_bundle->overlap_orb_beta.get(),
-                LM_in->ParaV
-            );
-            //TDDFT velocity gague will calculate full non-local potential including the original one and the correction on its own.
-            //So the original non-local potential term should be skipped
-            if(GlobalV::ESOLVER_TYPE != "tddft" || elecstate::H_TDDFT_pw::stype !=1)
+            Operator<TK>* nonlocal
+                = new NonlocalNew<OperatorLCAO<TK, TR>>(LM_in,
+                                                        this->kv->kvec_d,
+                                                        this->hR,
+                                                        &(this->getHk(LM_in)),
+                                                        &GlobalC::ucell,
+                                                        &GlobalC::GridD,
+                                                        uot->two_center_bundle->overlap_orb_beta.get(),
+                                                        LM_in->ParaV);
+            // TDDFT velocity gague will calculate full non-local potential including the original one and the
+            // correction on its own. So the original non-local potential term should be skipped
+            if (GlobalV::ESOLVER_TYPE != "tddft" || elecstate::H_TDDFT_pw::stype != 1)
             {
                 this->getOperator()->add(nonlocal);
             }
@@ -344,92 +325,79 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
             {
                 delete nonlocal;
             }
-            
         }
 
-    #ifdef __DEEPKS
+#ifdef __DEEPKS
         if (GlobalV::deepks_scf)
         {
-            Operator<TK>* deepks
-                = new DeePKS<OperatorLCAO<TK, TR>>(loc_in,
-                                                    LM_in,
-                                                    this->kv->kvec_d,
-                                                    hR,
-                                                    &(this->getHk(LM_in)),
-                                                    &GlobalC::ucell,
-                                                    &GlobalC::GridD,
-                                                    uot,
-                                                    this->kv->get_nks(),
-                                                    DM_in);
+            Operator<TK>* deepks = new DeePKS<OperatorLCAO<TK, TR>>(loc_in,
+                                                                    LM_in,
+                                                                    this->kv->kvec_d,
+                                                                    hR,
+                                                                    &(this->getHk(LM_in)),
+                                                                    &GlobalC::ucell,
+                                                                    &GlobalC::GridD,
+                                                                    uot,
+                                                                    this->kv->get_nks(),
+                                                                    DM_in);
             this->getOperator()->add(deepks);
         }
-    #endif
-    //TDDFT_velocity_gague
-        if(GlobalV::ESOLVER_TYPE == "tddft" && elecstate::H_TDDFT_pw::stype ==1)
+#endif
+        // TDDFT_velocity_gague
+        if (GlobalV::ESOLVER_TYPE == "tddft" && elecstate::H_TDDFT_pw::stype == 1)
         {
             elecstate::H_TDDFT_pw::update_At();
-            Operator<TK>* td_ekinetic
-                = new TDEkinetic<OperatorLCAO<TK, TR>>(
-                    LM_in,
-                    this->hR,
-                    &(this->getHk(LM_in)),
-                    this->sR,
-                    kv,
-                    &GlobalC::ucell,
-                    &GlobalC::GridD
-                );
-                this->getOperator()->add(td_ekinetic);
-            
-            Operator<TK>* td_nonlocal
-                = new TDNonlocal<OperatorLCAO<TK, TR>>(
-                    LM_in, 
-                    this->kv->kvec_d, 
-                    this->hR, 
-                    &(this->getHk(LM_in)),
-                    &GlobalC::ucell, 
-                    &GlobalC::GridD,
-                    uot,
-                    LM_in->ParaV
-                );
+            Operator<TK>* td_ekinetic = new TDEkinetic<OperatorLCAO<TK, TR>>(LM_in,
+                                                                             this->hR,
+                                                                             &(this->getHk(LM_in)),
+                                                                             this->sR,
+                                                                             kv,
+                                                                             &GlobalC::ucell,
+                                                                             &GlobalC::GridD);
+            this->getOperator()->add(td_ekinetic);
+
+            Operator<TK>* td_nonlocal = new TDNonlocal<OperatorLCAO<TK, TR>>(LM_in,
+                                                                             this->kv->kvec_d,
+                                                                             this->hR,
+                                                                             &(this->getHk(LM_in)),
+                                                                             &GlobalC::ucell,
+                                                                             &GlobalC::GridD,
+                                                                             uot,
+                                                                             LM_in->ParaV);
             this->getOperator()->add(td_nonlocal);
         }
         if (GlobalV::dft_plus_u)
         {
             Operator<TK>* dftu = nullptr;
-            if(GlobalV::dft_plus_u == 2) 
+            if (GlobalV::dft_plus_u == 2)
             {
-                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(
-                    LM_in,
-                    kv->kvec_d,
-                    this->hR,// no explicit call yet
-                    &(this->getHk(LM_in)),
-                    this->kv->isk
-                );
+                dftu = new OperatorDFTU<OperatorLCAO<TK, TR>>(LM_in,
+                                                              kv->kvec_d,
+                                                              this->hR, // no explicit call yet
+                                                              &(this->getHk(LM_in)),
+                                                              this->kv->isk);
             }
             else
             {
-                dftu = new DFTU<OperatorLCAO<TK, TR>>(
-                    LM_in,
-                    this->kv->kvec_d,
-                    this->hR,
-                    &(this->getHk(LM_in)),
-                    GlobalC::ucell,
-                    &GlobalC::GridD,
-                    uot->two_center_bundle->overlap_orb_onsite.get(),
-                    &GlobalC::dftu,
-                    *(LM_in->ParaV)
-                );
+                dftu = new DFTU<OperatorLCAO<TK, TR>>(LM_in,
+                                                      this->kv->kvec_d,
+                                                      this->hR,
+                                                      &(this->getHk(LM_in)),
+                                                      GlobalC::ucell,
+                                                      &GlobalC::GridD,
+                                                      uot->two_center_bundle->overlap_orb_onsite.get(),
+                                                      &GlobalC::dftu,
+                                                      *(LM_in->ParaV));
             }
             this->getOperator()->add(dftu);
         }
         if (GlobalV::sc_mag_switch)
         {
-            Operator<TK>* sc_lambda = new OperatorScLambda<OperatorLCAO<TK, TR>>(
-                LM_in,
-                kv->kvec_d,
-                this->hR,// no explicit call yet
-                &(this->getHk(LM_in)),
-                this->kv->isk);
+            Operator<TK>* sc_lambda = new OperatorScLambda<OperatorLCAO<TK, TR>>(LM_in,
+                                                                                 kv->kvec_d,
+                                                                                 this->hR, // no explicit call yet
+                                                                                 &(this->getHk(LM_in)),
+                                                                                 this->kv->isk);
             this->getOperator()->add(sc_lambda);
         }
     }
@@ -437,21 +405,21 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
 #ifdef __EXX
     if (GlobalC::exx_info.info_global.cal_exx)
     {
-        Operator<TK>* exx
-            = new OperatorEXX<OperatorLCAO<TK, TR>>(LM_in,
-                this->hR,
-                &(this->getHk(LM_in)),
-                *this->kv,
-                LM_in->Hexxd,
-                LM_in->Hexxc,
-                exx_two_level_step,
-                !GlobalC::restart.info_load.restart_exx&& GlobalC::restart.info_load.load_H);
+        Operator<TK>* exx = new OperatorEXX<OperatorLCAO<TK, TR>>(LM_in,
+                                                                  this->hR,
+                                                                  &(this->getHk(LM_in)),
+                                                                  *this->kv,
+                                                                  LM_in->Hexxd,
+                                                                  LM_in->Hexxc,
+                                                                  exx_two_level_step,
+                                                                  !GlobalC::restart.info_load.restart_exx
+                                                                      && GlobalC::restart.info_load.load_H);
         this->getOperator()->add(exx);
     }
 #endif
     // if NSPIN==2, HR should be separated into two parts, save HR into this->hRS2
     int memory_fold = 1;
-    if(GlobalV::NSPIN == 2)
+    if (GlobalV::NSPIN == 2)
     {
         this->hRS2.resize(this->hR->get_nnr() * 2);
         this->hR->allocate(this->hRS2.data(), 0);
@@ -460,34 +428,33 @@ HamiltLCAO<TK, TR>::HamiltLCAO(
 
     ModuleBase::Memory::record("HamiltLCAO::hR", this->hR->get_memory_size() * memory_fold);
     ModuleBase::Memory::record("HamiltLCAO::sR", this->sR->get_memory_size());
-    
+
     return;
 }
 
 // case for multi-k-points
 template <typename TK, typename TR>
-void HamiltLCAO<TK, TR>::matrix(MatrixBlock<TK> &hk_in,
-                                              MatrixBlock<TK> &sk_in)
+void HamiltLCAO<TK, TR>::matrix(MatrixBlock<TK>& hk_in, MatrixBlock<TK>& sk_in)
 {
     auto op = dynamic_cast<OperatorLCAO<TK, TR>*>(this->getOperator());
     assert(op != nullptr);
     op->matrixHk(hk_in, sk_in);
 }
 
-template <typename TK, typename TR> 
+template <typename TK, typename TR>
 void HamiltLCAO<TK, TR>::updateHk(const int ik)
 {
     ModuleBase::TITLE("HamiltLCAO", "updateHk");
     ModuleBase::timer::tick("HamiltLCAO", "updateHk");
-    //update global spin index
+    // update global spin index
     if (GlobalV::NSPIN == 2)
     {
         // if Veff is added and current_spin is changed, refresh HR
-        if(GlobalV::VL_IN_H && this->kv->isk[ik] != this->current_spin)
+        if (GlobalV::VL_IN_H && this->kv->isk[ik] != this->current_spin)
         {
             // change data pointer of HR
-            this->hR->allocate(this->hRS2.data()+this->hRS2.size()/2*this->kv->isk[ik], 0);
-            if(this->refresh_times > 0)
+            this->hR->allocate(this->hRS2.data() + this->hRS2.size() / 2 * this->kv->isk[ik], 0);
+            if (this->refresh_times > 0)
             {
                 this->refresh_times--;
                 dynamic_cast<hamilt::OperatorLCAO<TK, TR>*>(this->ops)->set_hr_done(false);
@@ -504,14 +471,14 @@ void HamiltLCAO<TK, TR>::refresh()
 {
     ModuleBase::TITLE("HamiltLCAO", "refresh");
     dynamic_cast<hamilt::OperatorLCAO<TK, TR>*>(this->ops)->set_hr_done(false);
-    if(GlobalV::NSPIN == 2)
+    if (GlobalV::NSPIN == 2)
     {
         this->refresh_times = 1;
         this->current_spin = 0;
-        if(this->hR->get_nnr() != this->hRS2.size()/2)
+        if (this->hR->get_nnr() != this->hRS2.size() / 2)
         {
             // operator has changed, resize hRS2
-            this->hRS2.resize(this->hR->get_nnr() * 2); 
+            this->hRS2.resize(this->hR->get_nnr() * 2);
         }
         this->hR->allocate(this->hRS2.data(), 0);
     }
@@ -573,18 +540,18 @@ HContainer<TR>*& HamiltLCAO<TK, TR>::getSR()
     return this->sR;
 }
 
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void HamiltLCAO<TK, TR>::updateSk(const int ik, LCAO_Matrix* LM_in, const int hk_type)
 {
     ModuleBase::TITLE("HamiltLCAO", "updateSk");
     ModuleBase::timer::tick("HamiltLCAO", "updateSk");
     ModuleBase::GlobalFunc::ZEROS(this->getSk(LM_in).data(), this->getSk(LM_in).size());
-    if(hk_type == 1)// collumn-major matrix for SK
+    if (hk_type == 1) // collumn-major matrix for SK
     {
         const int nrow = LM_in->ParaV->get_row_size();
         hamilt::folding_HR(*this->sR, this->getSk(LM_in).data(), this->kv->kvec_d[ik], nrow, 1);
     }
-    else if(hk_type == 0) // row-major matrix for SK
+    else if (hk_type == 0) // row-major matrix for SK
     {
         const int ncol = LM_in->ParaV->get_col_size();
         hamilt::folding_HR(*this->sR, this->getSk(LM_in).data(), this->kv->kvec_d[ik], ncol, 0);
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
index 21be7c0a50..64f64a6d5b 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_lcao.h
@@ -1,16 +1,16 @@
 #ifndef DFTPLUSU_H
 #define DFTPLUSU_H
-#include <unordered_map>
-
+#include "dftu.hpp"
 #include "module_basis/module_ao/parallel_orbitals.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_cell/unitcell.h"
-#include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
-#include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
 #include "module_elecstate/module_dm/density_matrix.h"
+#include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_dftu/dftu.h"
-#include "dftu.hpp"
-#include "module_basis/module_nao/two_center_integrator.h"
+#include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
+
+#include <unordered_map>
 
 namespace hamilt
 {
@@ -27,14 +27,14 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 {
   public:
     DFTU<OperatorLCAO<TK, TR>>(LCAO_Matrix* lm_in,
-                                      const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
-                                      hamilt::HContainer<TR>* hR_in,
-                                      std::vector<TK>* hK_in,
-                                      const UnitCell& ucell_in,
-                                      Grid_Driver* gridD_in,
-                                      const TwoCenterIntegrator* intor,
-                                      ModuleDFTU::DFTU* dftu_in,
-                                      const Parallel_Orbitals& paraV);
+                               const std::vector<ModuleBase::Vector3<double>>& kvec_d_in,
+                               hamilt::HContainer<TR>* hR_in,
+                               std::vector<TK>* hK_in,
+                               const UnitCell& ucell_in,
+                               Grid_Driver* gridD_in,
+                               const TwoCenterIntegrator* intor,
+                               ModuleDFTU::DFTU* dftu_in,
+                               const Parallel_Orbitals& paraV);
     ~DFTU<OperatorLCAO<TK, TR>>();
 
     /**
@@ -43,12 +43,11 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
      */
     virtual void contributeHR() override;
 
-    /// calculate force and stress for DFT+U 
-    void cal_force_stress(
-      const bool cal_force, 
-      const bool cal_stress, 
-      ModuleBase::matrix& force, 
-      ModuleBase::matrix& stress);
+    /// calculate force and stress for DFT+U
+    void cal_force_stress(const bool cal_force,
+                          const bool cal_stress,
+                          ModuleBase::matrix& force,
+                          ModuleBase::matrix& stress);
 
   private:
     const UnitCell* ucell = nullptr;
@@ -68,7 +67,7 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     /**
      * @brief search the nearest neighbor atoms and save them into this->adjs_all
-     * the size of HR will not change in DFTU, 
+     * the size of HR will not change in DFTU,
      * because I don't want to expand HR larger than Nonlocal operator caused by DFTU
      */
     void initialize_HR(Grid_Driver* gridD_in, const Parallel_Orbitals* paraV);
@@ -76,30 +75,25 @@ class DFTU<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
     /**
      * @brief calculate the <phi|alpha^I> overlap values and save them in this->nlm_tot
      * it will be reused in the calculation of calculate_HR()
-    */
+     */
     void cal_nlm_all(const Parallel_Orbitals* paraV);
 
     /**
      * @brief calculate the occ_mm' = \sum_R DMR*<phi_0|alpha^I_m'><alpha^I_m'|phi_R> matrix for each atom to add U
-    */
+     */
     void cal_occ(const int& iat1,
-                const int& iat2,
-                const Parallel_Orbitals* paraV,
-                const std::unordered_map<int, std::vector<double>>& nlm1_all,
-                const std::unordered_map<int, std::vector<double>>& nlm2_all,
-                const double* data_pointer,
-                std::vector<double>& occupations);
+                 const int& iat2,
+                 const Parallel_Orbitals* paraV,
+                 const std::unordered_map<int, std::vector<double>>& nlm1_all,
+                 const std::unordered_map<int, std::vector<double>>& nlm2_all,
+                 const double* data_pointer,
+                 std::vector<double>& occupations);
 
     /// transfer VU format from pauli matrix to normal for non-collinear spin case
     void transfer_vu(std::vector<double>& vu_tmp, std::vector<TR>& vu);
     /// VU_{m, m'} = sum_{m,m'} (1/2*delta_{m, m'} - occ_{m, m'}) * U
     /// EU = sum_{m,m'} 1/2 * U * occ_{m, m'} * occ_{m', m}
-    void cal_v_of_u(
-      const std::vector<double>& occ,
-      const int m_size,
-      const double u_value,
-      double* vu,
-      double& eu);
+    void cal_v_of_u(const std::vector<double>& occ, const int m_size, const double u_value, double* vu, double& eu);
 
     /**
      * @brief calculate the HR local matrix of <I,J,R> atom pair
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
index 8bcfa53ef4..35e827fc3d 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/ekinetic_new.h
@@ -1,11 +1,11 @@
 #ifndef EKINETICNEW_H
 #define EKINETICNEW_H
 #include "module_basis/module_ao/parallel_orbitals.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_cell/unitcell.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
-#include "module_basis/module_nao/two_center_integrator.h"
 
 namespace hamilt
 {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
index a7635ca076..0b320b54ad 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.cpp
@@ -1,11 +1,11 @@
 #include "nonlocal_new.h"
 
+#include "module_base/timer.h"
+#include "module_base/tool_title.h"
 #include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer_funcs.h"
-#include "module_base/timer.h"
-#include "module_base/tool_title.h"
 #ifdef _OPENMP
 #include <unordered_set>
 #endif
@@ -68,8 +68,8 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::initialize_HR(Grid_Drive
             const ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
             // choose the real adjacent atoms
             const LCAO_Orbitals& orb = LCAO_Orbitals::get_const_instance();
-            // Note: the distance of atoms should less than the cutoff radius, 
-            // When equal, the theoretical value of matrix element is zero, 
+            // Note: the distance of atoms should less than the cutoff radius,
+            // When equal, the theoretical value of matrix element is zero,
             // but the calculated value is not zero due to the numerical error, which would lead to result changes.
             if (this->ucell->cal_dtau(iat0, iat1, R_index1).norm() * this->ucell->lat0
                 < orb.Phi[T1].getRcut() + this->ucell->infoNL.Beta[T0].get_rcut_max())
@@ -122,97 +122,98 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::calculate_HR()
     // 1. calculate <psi|beta> for each pair of atoms
 #ifdef _OPENMP
 #pragma omp parallel
-{
-    std::unordered_set<int> atom_row_list;
-    #pragma omp for
-    for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
     {
-        atom_row_list.insert(iat0);
-    }
+        std::unordered_set<int> atom_row_list;
+#pragma omp for
+        for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
+        {
+            atom_row_list.insert(iat0);
+        }
 #endif
-    for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
-    {
-        auto tau0 = ucell->get_tau(iat0);
-        int T0, I0;
-        ucell->iat2iait(iat0, &I0, &T0);
-        AdjacentAtomInfo& adjs = this->adjs_all[iat0];
+        for (int iat0 = 0; iat0 < this->ucell->nat; iat0++)
+        {
+            auto tau0 = ucell->get_tau(iat0);
+            int T0, I0;
+            ucell->iat2iait(iat0, &I0, &T0);
+            AdjacentAtomInfo& adjs = this->adjs_all[iat0];
 
-        std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
-        nlm_tot.resize(adjs.adj_num + 1);
+            std::vector<std::unordered_map<int, std::vector<double>>> nlm_tot;
+            nlm_tot.resize(adjs.adj_num + 1);
 
-        for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
-        {
-            const int T1 = adjs.ntype[ad];
-            const int I1 = adjs.natom[ad];
-            const int iat1 = ucell->itia2iat(T1, I1);
-            const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
-            const Atom* atom1 = &ucell->atoms[T1];
+            for (int ad = 0; ad < adjs.adj_num + 1; ++ad)
+            {
+                const int T1 = adjs.ntype[ad];
+                const int I1 = adjs.natom[ad];
+                const int iat1 = ucell->itia2iat(T1, I1);
+                const ModuleBase::Vector3<double>& tau1 = adjs.adjacent_tau[ad];
+                const Atom* atom1 = &ucell->atoms[T1];
 
-            auto all_indexes = paraV->get_indexes_row(iat1);
+                auto all_indexes = paraV->get_indexes_row(iat1);
 #ifdef _OPENMP
-            if(atom_row_list.find(iat1) == atom_row_list.end())
-            {
-                all_indexes.clear();
-            }
+                if (atom_row_list.find(iat1) == atom_row_list.end())
+                {
+                    all_indexes.clear();
+                }
 #endif
-            auto col_indexes = paraV->get_indexes_col(iat1);
-            // insert col_indexes into all_indexes to get universal set with no repeat elements
-            all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end());
-            std::sort(all_indexes.begin(), all_indexes.end());
-            all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
-            for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol)
-            {
-                const int iw1 = all_indexes[iw1l] / npol;
-                std::vector<std::vector<double>> nlm;
-                // nlm is a vector of vectors, but size of outer vector is only 1 here
-                // If we are calculating force, we need also to store the gradient
-                // and size of outer vector is then 4
-                // inner loop : all projectors (L0,M0)
-                int L1 = atom1->iw2l[ iw1 ];
-                int N1 = atom1->iw2n[ iw1 ];
-                int m1 = atom1->iw2m[ iw1 ];
+                auto col_indexes = paraV->get_indexes_col(iat1);
+                // insert col_indexes into all_indexes to get universal set with no repeat elements
+                all_indexes.insert(all_indexes.end(), col_indexes.begin(), col_indexes.end());
+                std::sort(all_indexes.begin(), all_indexes.end());
+                all_indexes.erase(std::unique(all_indexes.begin(), all_indexes.end()), all_indexes.end());
+                for (int iw1l = 0; iw1l < all_indexes.size(); iw1l += npol)
+                {
+                    const int iw1 = all_indexes[iw1l] / npol;
+                    std::vector<std::vector<double>> nlm;
+                    // nlm is a vector of vectors, but size of outer vector is only 1 here
+                    // If we are calculating force, we need also to store the gradient
+                    // and size of outer vector is then 4
+                    // inner loop : all projectors (L0,M0)
+                    int L1 = atom1->iw2l[iw1];
+                    int N1 = atom1->iw2n[iw1];
+                    int m1 = atom1->iw2m[iw1];
 
-                // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                    // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
+                    int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
-                ModuleBase::Vector3<double> dtau = tau0 - tau1;
-                intor_->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
-                nlm_tot[ad].insert({all_indexes[iw1l], nlm[0]});
+                    ModuleBase::Vector3<double> dtau = tau0 - tau1;
+                    intor_->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 0 /*cal_deri*/, nlm);
+                    nlm_tot[ad].insert({all_indexes[iw1l], nlm[0]});
+                }
             }
-        }
-// 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
-        for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
-        {
-            const int T1 = adjs.ntype[ad1];
-            const int I1 = adjs.natom[ad1];
-            const int iat1 = ucell->itia2iat(T1, I1);
-#ifdef _OPENMP
-            if(atom_row_list.find(iat1) == atom_row_list.end())
+            // 2. calculate <psi_I|beta>D<beta|psi_{J,R}> for each pair of <IJR> atoms
+            for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
             {
-                continue;
-            }
+                const int T1 = adjs.ntype[ad1];
+                const int I1 = adjs.natom[ad1];
+                const int iat1 = ucell->itia2iat(T1, I1);
+#ifdef _OPENMP
+                if (atom_row_list.find(iat1) == atom_row_list.end())
+                {
+                    continue;
+                }
 #endif
-            ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
-            for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
-            {
-                const int T2 = adjs.ntype[ad2];
-                const int I2 = adjs.natom[ad2];
-                const int iat2 = ucell->itia2iat(T2, I2);
-                ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
-                ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
-                                                  R_index2[1] - R_index1[1],
-                                                  R_index2[2] - R_index1[2]);
-                hamilt::BaseMatrix<TR>* tmp = this->HR_fixed->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
-                // if not found , skip this pair of atoms
-                if (tmp != nullptr)
+                ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
+                for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
                 {
-                    this->cal_HR_IJR(iat1, iat2, T0, paraV, nlm_tot[ad1], nlm_tot[ad2], tmp->get_pointer());
+                    const int T2 = adjs.ntype[ad2];
+                    const int I2 = adjs.natom[ad2];
+                    const int iat2 = ucell->itia2iat(T2, I2);
+                    ModuleBase::Vector3<int>& R_index2 = adjs.box[ad2];
+                    ModuleBase::Vector3<int> R_vector(R_index2[0] - R_index1[0],
+                                                      R_index2[1] - R_index1[1],
+                                                      R_index2[2] - R_index1[2]);
+                    hamilt::BaseMatrix<TR>* tmp
+                        = this->HR_fixed->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
+                    // if not found , skip this pair of atoms
+                    if (tmp != nullptr)
+                    {
+                        this->cal_HR_IJR(iat1, iat2, T0, paraV, nlm_tot[ad1], nlm_tot[ad2], tmp->get_pointer());
+                    }
                 }
             }
         }
-    }
 #ifdef _OPENMP
-}
+    }
 #endif
 
     ModuleBase::timer::tick("NonlocalNew", "calculate_HR");
@@ -301,7 +302,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
             this->HR_fixed->set_zero();
             this->allocated = true;
         }
-        if(this->next_sub_op != nullptr)
+        if (this->next_sub_op != nullptr)
         {
             // pass pointer of HR_fixed to the next node
             static_cast<OperatorLCAO<TK, TR>*>(this->next_sub_op)->set_HR_fixed(this->HR_fixed);
@@ -311,7 +312,7 @@ void hamilt::NonlocalNew<hamilt::OperatorLCAO<TK, TR>>::contributeHR()
         this->HR_fixed_done = true;
     }
     // last node of sub-chain, add HR_fixed into HR
-    if(this->next_sub_op == nullptr)
+    if (this->next_sub_op == nullptr)
     {
         this->hR->add(*(this->HR_fixed));
     }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
index c860de3351..6dbcdd9ca3 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/nonlocal_new.h
@@ -1,13 +1,13 @@
 #ifndef NONLOCALNEW_H
 #define NONLOCALNEW_H
-#include <unordered_map>
-
 #include "module_basis/module_ao/parallel_orbitals.h"
+#include "module_basis/module_nao/two_center_integrator.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_cell/unitcell.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
-#include "module_basis/module_nao/two_center_integrator.h"
+
+#include <unordered_map>
 
 namespace hamilt
 {
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
index ec4c074b0c..6e149195fb 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.cpp
@@ -162,9 +162,8 @@ void hamilt::OverlapNew<hamilt::OperatorLCAO<TK, TR>>::cal_SR_IJR(const int& iat
             const int m2 = iw2m2[iw2];
 
             // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-            int M2 = (m2 % 2 == 0) ? -m2/2 : (m2+1)/2;
-            intor_->calculate(T1, L1, N1, M1,
-                    T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
+            int M2 = (m2 % 2 == 0) ? -m2 / 2 : (m2 + 1) / 2;
+            intor_->calculate(T1, L1, N1, M1, T2, L2, N2, M2, dtau * this->ucell->lat0, olm);
             for (int ipol = 0; ipol < npol; ipol++)
             {
                 data_pointer[ipol * step_trace] += olm[0];
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
index 4de2da23d3..3840c9077f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/overlap_new.h
@@ -1,11 +1,11 @@
 #ifndef OVERLAPNEW_H
 #define OVERLAPNEW_H
+#include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_basis/module_ao/parallel_orbitals.h"
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 #include "module_cell/unitcell.h"
 #include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/operator_lcao.h"
 #include "module_hamilt_lcao/module_hcontainer/hcontainer.h"
-#include "module_basis/module_ao/ORB_gen_tables.h"
 
 namespace hamilt
 {
@@ -89,7 +89,7 @@ class OverlapNew<OperatorLCAO<TK, TR>> : public OperatorLCAO<TK, TR>
 
     // if k vector is not changed, then do nothing and return
     // default of kvec_d_old is (-10,-10,-10), which is not a valid k vector
-    ModuleBase::Vector3<double> kvec_d_old = ModuleBase::Vector3<double>(-10,-10,-10);
+    ModuleBase::Vector3<double> kvec_d_old = ModuleBase::Vector3<double>(-10, -10, -10);
 };
 
 } // namespace hamilt
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
index cb31484a9b..bdb85cf947 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_T_NL_cd.cpp
@@ -1,6 +1,7 @@
-#include "gtest/gtest.h"
 #include "../ekinetic_new.h"
 #include "../nonlocal_new.h"
+
+#include "gtest/gtest.h"
 #include <chrono>
 
 //---------------------------------------
@@ -70,7 +71,7 @@ class TNLTest : public ::testing::Test
         ucell.atoms[0].ncpp.index2_soc[0] = new int[5];
         ucell.atoms[0].ncpp.index1_soc[3] = new int[5];
         ucell.atoms[0].ncpp.index2_soc[3] = new int[5];
-        for(int i = 0; i < 5; ++i)
+        for (int i = 0; i < 5; ++i)
         {
             ucell.atoms[0].ncpp.d_real(i, i) = 1.0;
             ucell.atoms[0].ncpp.d_so(0, i, i) = std::complex<double>(2.0, 0.0);
@@ -102,7 +103,6 @@ class TNLTest : public ::testing::Test
         delete[] ucell.iat2it;
         delete[] ucell.iat2ia;
         delete[] ucell.infoNL.Beta;
-
     }
 
 #ifdef __MPI
@@ -118,12 +118,13 @@ class TNLTest : public ::testing::Test
     }
 #else
     void init_parav()
-    {}
+    {
+    }
 #endif
 
     UnitCell ucell;
     hamilt::HContainer<std::complex<double>>* HR;
-    Parallel_Orbitals *paraV;
+    Parallel_Orbitals* paraV;
     TwoCenterIntegrator intor_;
 
     int dsize;
@@ -136,37 +137,37 @@ TEST_F(TNLTest, testTVNLcd2cd)
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
     std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
-    Grid_Driver gd(0,0,0);
+    Grid_Driver gd(0, 0, 0);
     std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
-    hamilt::Operator<std::complex<double>> *op = new hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
-    hamilt::Operator<std::complex<double>> *op1 = new hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    hamilt::Operator<std::complex<double>>* op
+        = new hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(nullptr,
+                                                                                                    kvec_d_in,
+                                                                                                    HR,
+                                                                                                    &hk,
+                                                                                                    &ucell,
+                                                                                                    &gd,
+                                                                                                    &intor_,
+                                                                                                    paraV);
+    hamilt::Operator<std::complex<double>>* op1
+        = new hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>(nullptr,
+                                                                                                    kvec_d_in,
+                                                                                                    HR,
+                                                                                                    &hk,
+                                                                                                    &ucell,
+                                                                                                    &gd,
+                                                                                                    &intor_,
+                                                                                                    paraV);
     // merge two Operators to a chain
     op->add(op1);
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time0 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time0
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     start_time = std::chrono::high_resolution_clock::now();
     // calculate HR and folding HK for gamma point
     op->init(0);
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time1 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time1
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
 
     // check the value of HR
     double result_ref = test_size * 10;
@@ -180,9 +181,9 @@ TEST_F(TNLTest, testTVNLcd2cd)
         int i = 0;
         for (int mu = 0; mu < indexes1.size(); ++mu)
         {
-            for(int nu = 0; nu < indexes2.size(); ++nu)
+            for (int nu = 0; nu < indexes2.size(); ++nu)
             {
-                if(mu % npol == nu % npol)
+                if (mu % npol == nu % npol)
                 {
                     EXPECT_EQ(tmp.get_pointer(0)[i].real(), 1.0);
                     EXPECT_EQ(tmp.get_pointer(0)[i].imag(), 0.0);
@@ -203,11 +204,11 @@ TEST_F(TNLTest, testTVNLcd2cd)
     // check the value of HK of gamma point
     result_ref += 1.0;
     int i = 0;
-    for ( int irow = 0; irow < paraV->get_row_size(); ++irow)
+    for (int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
-        for ( int icol = 0; icol < paraV->get_col_size(); ++icol)
+        for (int icol = 0; icol < paraV->get_col_size(); ++icol)
         {
-            if (irow%npol == icol%npol)
+            if (irow % npol == icol % npol)
             {
                 EXPECT_NEAR(hk[i].real(), result_ref, 1e-10);
                 EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
@@ -222,21 +223,24 @@ TEST_F(TNLTest, testTVNLcd2cd)
     }
     // calculate HK for k point
     start_time = std::chrono::high_resolution_clock::now();
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0) );
+    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
     op->init(1);
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time2 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
-    std::cout << "Test terms:   " <<std::setw(15)<< "constructor" <<std::setw(15)<< "init(HR+HK)" <<std::setw(15)<< "2nd-init(HK)" << std::endl;
-    std::cout << "Elapsed time: " <<std::setw(15)<< elapsed_time0.count()<<std::setw(15)<<elapsed_time1.count()<<std::setw(15)<<elapsed_time2.count() << " seconds." << std::endl;
+    std::chrono::duration<double> elapsed_time2
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::cout << "Test terms:   " << std::setw(15) << "constructor" << std::setw(15) << "init(HR+HK)" << std::setw(15)
+              << "2nd-init(HK)" << std::endl;
+    std::cout << "Elapsed time: " << std::setw(15) << elapsed_time0.count() << std::setw(15) << elapsed_time1.count()
+              << std::setw(15) << elapsed_time2.count() << " seconds." << std::endl;
     // check the value of HK
-    double result_ref1 = -1.6180339887498931/2 + test_size * 10;
-    double result_ref2 = -1.1755705045849467/2;
+    double result_ref1 = -1.6180339887498931 / 2 + test_size * 10;
+    double result_ref2 = -1.1755705045849467 / 2;
     i = 0;
-    for ( int irow = 0; irow < paraV->get_row_size(); ++irow)
+    for (int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
-        for ( int icol = 0; icol < paraV->get_col_size(); ++icol)
+        for (int icol = 0; icol < paraV->get_col_size(); ++icol)
         {
-            if (irow%npol == icol%npol)
+            if (irow % npol == icol % npol)
             {
                 EXPECT_NEAR(hk[i].real(), result_ref1, 1e-10);
                 EXPECT_NEAR(hk[i].imag(), result_ref2, 1e-10);
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
index 2a53e0feda..8f61b6919a 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_dftu.cpp
@@ -1,6 +1,7 @@
-#include "gtest/gtest.h"
 #include "../dftu_lcao.h"
-#include <chrono> 
+
+#include "gtest/gtest.h"
+#include <chrono>
 
 // mock of DFTU
 #include "module_hamilt_lcao/module_dftu/dftu.h"
@@ -8,7 +9,7 @@ ModuleDFTU::DFTU::DFTU(){};
 ModuleDFTU::DFTU::~DFTU(){};
 namespace GlobalC
 {
-    ModuleDFTU::DFTU dftu;
+ModuleDFTU::DFTU dftu;
 }
 const hamilt::HContainer<double>* tmp_DMR;
 const hamilt::HContainer<double>* ModuleDFTU::DFTU::get_dmr(int ispin) const
@@ -16,8 +17,6 @@ const hamilt::HContainer<double>* ModuleDFTU::DFTU::get_dmr(int ispin) const
     return tmp_DMR;
 }
 
-
-
 //---------------------------------------
 // Unit test of DFTU class
 // DFTU is a derivative class of Operator, it is used to calculate the kinetic matrix
@@ -32,8 +31,8 @@ const hamilt::HContainer<double>* ModuleDFTU::DFTU::get_dmr(int ispin) const
 
 // test_size is the number of atoms in the unitcell
 // modify test_size to test different size of unitcell
-int test_size = 10; 
-int test_nw = 10;   // please larger than 5
+int test_size = 10;
+int test_nw = 10; // please larger than 5
 class DFTUTest : public ::testing::Test
 {
   protected:
@@ -80,24 +79,23 @@ class DFTUTest : public ::testing::Test
         DMR = new hamilt::HContainer<double>(*HR);
         tmp_DMR = DMR;
 
-        //setting of DFTU
+        // setting of DFTU
         GlobalC::dftu.locale.resize(test_size);
-        for(int iat=0;iat<test_size;iat++)
+        for (int iat = 0; iat < test_size; iat++)
         {
             GlobalC::dftu.locale[iat].resize(3);
-            for(int l=0;l<3;l++)
+            for (int l = 0; l < 3; l++)
             {
                 GlobalC::dftu.locale[iat][l].resize(1);
                 GlobalC::dftu.locale[iat][l][0].resize(2);
-                GlobalC::dftu.locale[iat][l][0][0].create(2*l+1, 2*l+1);
-                GlobalC::dftu.locale[iat][l][0][1].create(2*l+1, 2*l+1);
+                GlobalC::dftu.locale[iat][l][0][0].create(2 * l + 1, 2 * l + 1);
+                GlobalC::dftu.locale[iat][l][0][1].create(2 * l + 1, 2 * l + 1);
             }
         }
         GlobalC::dftu.U = &U_test;
         GlobalC::dftu.orbital_corr = &orbital_c_test;
 
         GlobalV::onsite_radius = 1.0;
-
     }
 
     void TearDown() override
@@ -127,13 +125,14 @@ class DFTUTest : public ::testing::Test
     }
 #else
     void init_parav()
-    {}
+    {
+    }
 #endif
 
     UnitCell ucell;
     hamilt::HContainer<double>* HR;
     hamilt::HContainer<double>* DMR;
-    Parallel_Orbitals *paraV;
+    Parallel_Orbitals* paraV;
     TwoCenterIntegrator intor_;
 
     int dsize;
@@ -145,42 +144,35 @@ class DFTUTest : public ::testing::Test
 // using TEST_F to test DFTU
 TEST_F(DFTUTest, constructHRd2d)
 {
-    //test for nspin=1
+    // test for nspin=1
     GlobalV::NSPIN = 1;
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
-    Grid_Driver gd(0,0,0);
+    Grid_Driver gd(0, 0, 0);
     // check some input values
     EXPECT_EQ(LCAO_Orbitals::get_const_instance().Phi[0].getRcut(), 1.0);
     // reset HR and DMR
     const double factor = 1.0 / test_nw / test_nw / test_size / test_size;
-    for(int i=0;i<DMR->get_nnr();i++)
+    for (int i = 0; i < DMR->get_nnr(); i++)
     {
         DMR->get_wrapper()[i] = factor;
         HR->get_wrapper()[i] = 0.0;
     }
     std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
-    hamilt::DFTU<hamilt::OperatorLCAO<double, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        ucell, 
-        &gd,
-        &intor_,
-        &GlobalC::dftu,
-        *paraV
-    );
+    hamilt::DFTU<hamilt::OperatorLCAO<double, double>>
+        op(nullptr, kvec_d_in, HR, &hk, ucell, &gd, &intor_, &GlobalC::dftu, *paraV);
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     start_time = std::chrono::high_resolution_clock::now();
     op.contributeHR();
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time1 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time1
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     // check the occupations of dftu
-    for(int iat=0;iat<test_size;iat++)
+    for (int iat = 0; iat < test_size; iat++)
     {
-        for(int icc=0;icc<25;icc++)
+        for (int icc = 0; icc < 25; icc++)
         {
             EXPECT_NEAR(GlobalC::dftu.locale[iat][2][0][0].c[icc], 0.5, 1e-10);
         }
@@ -196,21 +188,24 @@ TEST_F(DFTUTest, constructHRd2d)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_NEAR(tmp.get_pointer(0)[i], -10.0*test_size, 1e-10);
+            EXPECT_NEAR(tmp.get_pointer(0)[i], -10.0 * test_size, 1e-10);
         }
     }
     // calculate SK
     start_time = std::chrono::high_resolution_clock::now();
     op.contributeHk(0);
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time2 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time2
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     // check the value of SK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_NEAR(hk[i], -10.0*test_size, 1e-10);
+        EXPECT_NEAR(hk[i], -10.0 * test_size, 1e-10);
     }
-    std::cout << "Test terms:   " <<std::setw(15)<< "initialize_HR" <<std::setw(15)<< "contributeHR" <<std::setw(15)<< "contributeHk" << std::endl;
-    std::cout << "Elapsed time: " <<std::setw(15)<< elapsed_time.count()<<std::setw(15)<<elapsed_time1.count()<<std::setw(15)<<elapsed_time2.count() << " seconds." << std::endl;
+    std::cout << "Test terms:   " << std::setw(15) << "initialize_HR" << std::setw(15) << "contributeHR"
+              << std::setw(15) << "contributeHk" << std::endl;
+    std::cout << "Elapsed time: " << std::setw(15) << elapsed_time.count() << std::setw(15) << elapsed_time1.count()
+              << std::setw(15) << elapsed_time2.count() << " seconds." << std::endl;
 }
 
 TEST_F(DFTUTest, constructHRd2cd)
@@ -219,31 +214,22 @@ TEST_F(DFTUTest, constructHRd2cd)
     GlobalV::NSPIN = 2;
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
-    Grid_Driver gd(0,0,0);
+    Grid_Driver gd(0, 0, 0);
     EXPECT_EQ(LCAO_Orbitals::get_const_instance().Phi[0].getRcut(), 1.0);
     // reset HR and DMR
     const double factor = 0.5 / test_nw / test_nw / test_size / test_size;
-    for(int i=0;i<DMR->get_nnr();i++)
+    for (int i = 0; i < DMR->get_nnr(); i++)
     {
         DMR->get_wrapper()[i] = factor;
         HR->get_wrapper()[i] = 0.0;
     }
-    hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        ucell, 
-        &gd,
-        &intor_,
-        &GlobalC::dftu,
-        *paraV
-    );
+    hamilt::DFTU<hamilt::OperatorLCAO<std::complex<double>, double>>
+        op(nullptr, kvec_d_in, HR, &hk, ucell, &gd, &intor_, &GlobalC::dftu, *paraV);
     op.contributeHR();
     // check the occupations of dftu for spin-up
-    for(int iat=0;iat<test_size;iat++)
+    for (int iat = 0; iat < test_size; iat++)
     {
-        for(int icc=0;icc<25;icc++)
+        for (int icc = 0; icc < 25; icc++)
         {
             EXPECT_NEAR(GlobalC::dftu.locale[iat][2][0][0].c[icc], 0.5, 1e-10);
         }
@@ -259,7 +245,7 @@ TEST_F(DFTUTest, constructHRd2cd)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_NEAR(tmp.get_pointer(0)[i], -10.0*test_size, 1e-10);
+            EXPECT_NEAR(tmp.get_pointer(0)[i], -10.0 * test_size, 1e-10);
         }
     }
     // calculate HK for gamma point
@@ -267,15 +253,15 @@ TEST_F(DFTUTest, constructHRd2cd)
     // check the value of HK of gamma point
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_NEAR(hk[i].real(), -10.0*test_size, 1e-10);
+        EXPECT_NEAR(hk[i].real(), -10.0 * test_size, 1e-10);
         EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
     }
     // calculate spin-down hamiltonian
     op.contributeHR();
     // check the occupations of dftu for spin-down
-    for(int iat=0;iat<test_size;iat++)
+    for (int iat = 0; iat < test_size; iat++)
     {
-        for(int icc=0;icc<25;icc++)
+        for (int icc = 0; icc < 25; icc++)
         {
             EXPECT_NEAR(GlobalC::dftu.locale[iat][2][0][1].c[icc], 0.5, 1e-10);
         }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
index 35fbc19850..2985317391 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_ekineticnew.cpp
@@ -1,6 +1,6 @@
-#include "gtest/gtest.h"
 #include "../ekinetic_new.h"
 
+#include "gtest/gtest.h"
 
 //---------------------------------------
 // Unit test of EkineticNew class
@@ -87,12 +87,13 @@ class EkineticNewTest : public ::testing::Test
     }
 #else
     void init_parav()
-    {}
+    {
+    }
 #endif
 
     UnitCell ucell;
     hamilt::HContainer<double>* HR;
-    Parallel_Orbitals *paraV;
+    Parallel_Orbitals* paraV;
     TwoCenterIntegrator intor_;
 
     int dsize;
@@ -104,17 +105,9 @@ TEST_F(EkineticNewTest, constructHRd2d)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
-    Grid_Driver gd(0,0,0);
-    hamilt::EkineticNew<hamilt::OperatorLCAO<double, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    Grid_Driver gd(0, 0, 0);
+    hamilt::EkineticNew<hamilt::OperatorLCAO<double, double>>
+        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
     op.contributeHR();
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
@@ -160,17 +153,9 @@ TEST_F(EkineticNewTest, constructHRd2cd)
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
     std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
-    Grid_Driver gd(0,0,0);
-    hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    Grid_Driver gd(0, 0, 0);
+    hamilt::EkineticNew<hamilt::OperatorLCAO<std::complex<double>, double>>
+        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
     op.contributeHR();
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
@@ -195,13 +180,13 @@ TEST_F(EkineticNewTest, constructHRd2cd)
         EXPECT_EQ(hk[i].imag(), 0.0);
     }
     // calculate HK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0) );
+    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
     op.contributeHk(1);
     // check the value of HK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_NEAR(hk[i].real(), -1.6180339887498945/2, 1e-10);
-        EXPECT_NEAR(hk[i].imag(), -1.1755705045849467/2, 1e-10);
+        EXPECT_NEAR(hk[i].real(), -1.6180339887498945 / 2, 1e-10);
+        EXPECT_NEAR(hk[i].imag(), -1.1755705045849467 / 2, 1e-10);
     }
 }
 
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
index 8cbc2f3e55..6aafc318c5 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_nonlocalnew.cpp
@@ -1,7 +1,7 @@
-#include "gtest/gtest.h"
 #include "../nonlocal_new.h"
-#include <chrono>
 
+#include "gtest/gtest.h"
+#include <chrono>
 
 //---------------------------------------
 // Unit test of NonlocalNew class
@@ -69,7 +69,7 @@ class NonlocalNewTest : public ::testing::Test
         ucell.atoms[0].ncpp.index2_soc[0] = new int[5];
         ucell.atoms[0].ncpp.index1_soc[3] = new int[5];
         ucell.atoms[0].ncpp.index2_soc[3] = new int[5];
-        for(int i = 0; i < 5; ++i)
+        for (int i = 0; i < 5; ++i)
         {
             ucell.atoms[0].ncpp.d_real(i, i) = 1.0;
             ucell.atoms[0].ncpp.d_so(0, i, i) = std::complex<double>(2.0, 0.0);
@@ -116,12 +116,13 @@ class NonlocalNewTest : public ::testing::Test
     }
 #else
     void init_parav()
-    {}
+    {
+    }
 #endif
 
     UnitCell ucell;
     hamilt::HContainer<double>* HR;
-    Parallel_Orbitals *paraV;
+    Parallel_Orbitals* paraV;
     TwoCenterIntegrator intor_;
 
     int dsize;
@@ -133,27 +134,21 @@ TEST_F(NonlocalNewTest, constructHRd2d)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
-    Grid_Driver gd(0,0,0);
+    Grid_Driver gd(0, 0, 0);
     // check some input values
     EXPECT_EQ(ucell.infoNL.Beta[0].get_rcut_max(), 1.0);
     EXPECT_EQ(LCAO_Orbitals::get_const_instance().Phi[0].getRcut(), 1.0);
     std::chrono::high_resolution_clock::time_point start_time = std::chrono::high_resolution_clock::now();
-    hamilt::NonlocalNew<hamilt::OperatorLCAO<double, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    hamilt::NonlocalNew<hamilt::OperatorLCAO<double, double>>
+        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
     std::chrono::high_resolution_clock::time_point end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     start_time = std::chrono::high_resolution_clock::now();
     op.contributeHR();
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time1 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time1
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
     {
@@ -165,24 +160,26 @@ TEST_F(NonlocalNewTest, constructHRd2d)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_EQ(tmp.get_pointer(0)[i], 5.0*test_size);
+            EXPECT_EQ(tmp.get_pointer(0)[i], 5.0 * test_size);
         }
     }
     // calculate SK
     start_time = std::chrono::high_resolution_clock::now();
     op.contributeHk(0);
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time2 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time2
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     // check the value of SK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_EQ(hk[i], 5.0*test_size);
+        EXPECT_EQ(hk[i], 5.0 * test_size);
     }
     // calculate HR again
     start_time = std::chrono::high_resolution_clock::now();
     op.contributeHR();
     end_time = std::chrono::high_resolution_clock::now();
-    std::chrono::duration<double> elapsed_time3 = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
+    std::chrono::duration<double> elapsed_time3
+        = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time);
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
     {
@@ -194,11 +191,14 @@ TEST_F(NonlocalNewTest, constructHRd2d)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_EQ(tmp.get_pointer(0)[i], 10.0*test_size);
+            EXPECT_EQ(tmp.get_pointer(0)[i], 10.0 * test_size);
         }
     }
-    std::cout << "Test terms:   " <<std::setw(15)<< "initialize_HR" <<std::setw(15)<< "contributeHR" <<std::setw(15)<< "contributeHk"<<std::setw(15) << "2nd-calHR" << std::endl;
-    std::cout << "Elapsed time: " <<std::setw(15)<< elapsed_time.count()<<std::setw(15)<<elapsed_time1.count()<<std::setw(15)<<elapsed_time2.count() <<std::setw(15)<<elapsed_time3.count()<< " seconds." << std::endl;
+    std::cout << "Test terms:   " << std::setw(15) << "initialize_HR" << std::setw(15) << "contributeHR"
+              << std::setw(15) << "contributeHk" << std::setw(15) << "2nd-calHR" << std::endl;
+    std::cout << "Elapsed time: " << std::setw(15) << elapsed_time.count() << std::setw(15) << elapsed_time1.count()
+              << std::setw(15) << elapsed_time2.count() << std::setw(15) << elapsed_time3.count() << " seconds."
+              << std::endl;
 }
 
 TEST_F(NonlocalNewTest, constructHRd2cd)
@@ -206,17 +206,9 @@ TEST_F(NonlocalNewTest, constructHRd2cd)
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
     std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
-    Grid_Driver gd(0,0,0);
-    hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        HR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    Grid_Driver gd(0, 0, 0);
+    hamilt::NonlocalNew<hamilt::OperatorLCAO<std::complex<double>, double>>
+        op(nullptr, kvec_d_in, HR, &hk, &ucell, &gd, &intor_, paraV);
     op.contributeHR();
     // check the value of HR
     for (int iap = 0; iap < HR->size_atom_pairs(); ++iap)
@@ -229,7 +221,7 @@ TEST_F(NonlocalNewTest, constructHRd2cd)
         int nwt = indexes1.size() * indexes2.size();
         for (int i = 0; i < nwt; ++i)
         {
-            EXPECT_EQ(tmp.get_pointer(0)[i], 5.0*test_size);
+            EXPECT_EQ(tmp.get_pointer(0)[i], 5.0 * test_size);
         }
     }
     // calculate SK for gamma point
@@ -237,16 +229,16 @@ TEST_F(NonlocalNewTest, constructHRd2cd)
     // check the value of SK of gamma point
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_EQ(hk[i].real(), 5.0*test_size);
+        EXPECT_EQ(hk[i].real(), 5.0 * test_size);
         EXPECT_EQ(hk[i].imag(), 0.0);
     }
     // calculate HK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0) );
+    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
     op.contributeHk(1);
     // check the value of HK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
     {
-        EXPECT_NEAR(hk[i].real(), 5.0*test_size, 1e-10);
+        EXPECT_NEAR(hk[i].real(), 5.0 * test_size, 1e-10);
         EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
     }
 }
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
index 332a33be96..a7d637371d 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew.cpp
@@ -1,6 +1,6 @@
-#include "gtest/gtest.h"
 #include "../overlap_new.h"
 
+#include "gtest/gtest.h"
 
 //---------------------------------------
 // Unit test of OverlapNew class
@@ -87,12 +87,13 @@ class OverlapNewTest : public ::testing::Test
     }
 #else
     void init_parav()
-    {}
+    {
+    }
 #endif
 
     UnitCell ucell;
     hamilt::HContainer<double>* SR;
-    Parallel_Orbitals *paraV;
+    Parallel_Orbitals* paraV;
     TwoCenterIntegrator intor_;
 
     int dsize;
@@ -104,19 +105,9 @@ TEST_F(OverlapNewTest, constructHRd2d)
 {
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(1, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     std::vector<double> hk(paraV->get_row_size() * paraV->get_col_size(), 0.0);
-    Grid_Driver gd(0,0,0);
-    hamilt::OverlapNew<hamilt::OperatorLCAO<double, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        nullptr,
-        nullptr,
-        SR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    Grid_Driver gd(0, 0, 0);
+    hamilt::OverlapNew<hamilt::OperatorLCAO<double, double>>
+        op(nullptr, kvec_d_in, nullptr, nullptr, SR, &hk, &ucell, &gd, &intor_, paraV);
     op.contributeHR();
     // check the value of SR
     for (int iap = 0; iap < SR->size_atom_pairs(); ++iap)
@@ -146,19 +137,9 @@ TEST_F(OverlapNewTest, constructHRd2cd)
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
     std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
-    Grid_Driver gd(0,0,0);
-    hamilt::OverlapNew<hamilt::OperatorLCAO<std::complex<double>, double>> op(
-        nullptr, 
-        kvec_d_in, 
-        nullptr,
-        nullptr,
-        SR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    Grid_Driver gd(0, 0, 0);
+    hamilt::OverlapNew<hamilt::OperatorLCAO<std::complex<double>, double>>
+        op(nullptr, kvec_d_in, nullptr, nullptr, SR, &hk, &ucell, &gd, &intor_, paraV);
     op.contributeHR();
     // check the value of SR
     for (int iap = 0; iap < SR->size_atom_pairs(); ++iap)
@@ -183,7 +164,7 @@ TEST_F(OverlapNewTest, constructHRd2cd)
         EXPECT_EQ(hk[i].imag(), 0.0);
     }
     // calculate SK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0) );
+    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
     op.contributeHk(1);
     // check the value of SK
     for (int i = 0; i < paraV->get_row_size() * paraV->get_col_size(); ++i)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
index c4505dbfa6..171f34c2cd 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/test_overlapnew_cd.cpp
@@ -1,6 +1,6 @@
-#include "gtest/gtest.h"
 #include "../overlap_new.h"
 
+#include "gtest/gtest.h"
 
 //---------------------------------------
 // Unit test of OverlapNew class
@@ -86,12 +86,13 @@ class OverlapNewTest : public ::testing::Test
     }
 #else
     void init_parav()
-    {}
+    {
+    }
 #endif
 
     UnitCell ucell;
     hamilt::HContainer<std::complex<double>>* SR;
-    Parallel_Orbitals *paraV;
+    Parallel_Orbitals* paraV;
     TwoCenterIntegrator intor_;
 
     int dsize;
@@ -104,19 +105,9 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
     std::vector<ModuleBase::Vector3<double>> kvec_d_in(2, ModuleBase::Vector3<double>(0.0, 0.0, 0.0));
     kvec_d_in[1] = ModuleBase::Vector3<double>(0.1, 0.2, 0.3);
     std::vector<std::complex<double>> hk(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
-    Grid_Driver gd(0,0,0);
-    hamilt::OverlapNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>> op(
-        nullptr, 
-        kvec_d_in, 
-        nullptr,
-        nullptr,
-        SR, 
-        &hk, 
-        &ucell, 
-        &gd,
-        &intor_,
-        paraV
-    );
+    Grid_Driver gd(0, 0, 0);
+    hamilt::OverlapNew<hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>>
+        op(nullptr, kvec_d_in, nullptr, nullptr, SR, &hk, &ucell, &gd, &intor_, paraV);
     op.contributeHR();
     // check the value of SR
     for (int iap = 0; iap < SR->size_atom_pairs(); ++iap)
@@ -129,9 +120,9 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
         int i = 0;
         for (int mu = 0; mu < indexes1.size(); ++mu)
         {
-            for(int nu = 0; nu < indexes2.size(); ++nu)
+            for (int nu = 0; nu < indexes2.size(); ++nu)
             {
-                if(mu % npol == nu % npol)
+                if (mu % npol == nu % npol)
                 {
                     EXPECT_EQ(tmp.get_pointer(0)[i].real(), 1.0);
                     EXPECT_EQ(tmp.get_pointer(0)[i].imag(), 0.0);
@@ -149,11 +140,11 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
     op.contributeHk(0);
     // check the value of SK of gamma point
     int i = 0;
-    for ( int irow = 0; irow < paraV->get_row_size(); ++irow)
+    for (int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
-        for ( int icol = 0; icol < paraV->get_col_size(); ++icol)
+        for (int icol = 0; icol < paraV->get_col_size(); ++icol)
         {
-            if (irow%npol == icol%npol)
+            if (irow % npol == icol % npol)
             {
                 EXPECT_NEAR(hk[i].real(), 1.0, 1e-10);
                 EXPECT_NEAR(hk[i].imag(), 0.0, 1e-10);
@@ -167,15 +158,15 @@ TEST_F(OverlapNewTest, constructHRcd2cd)
         }
     }
     // calculate SK for k point
-    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0) );
+    hk.assign(paraV->get_row_size() * paraV->get_col_size(), std::complex<double>(0.0, 0.0));
     op.contributeHk(1);
     // check the value of SK
     i = 0;
-    for ( int irow = 0; irow < paraV->get_row_size(); ++irow)
+    for (int irow = 0; irow < paraV->get_row_size(); ++irow)
     {
-        for ( int icol = 0; icol < paraV->get_col_size(); ++icol)
+        for (int icol = 0; icol < paraV->get_col_size(); ++icol)
         {
-            if (irow%npol == icol%npol)
+            if (irow % npol == icol % npol)
             {
                 EXPECT_NEAR(hk[i].real(), -0.80901699437494723, 1e-10);
                 EXPECT_NEAR(hk[i].imag(), -0.58778525229247336, 1e-10);
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
index 6d9dfa383f..5fc699728f 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
@@ -47,20 +47,20 @@ UnitCell::~UnitCell()
 
 void UnitCell::set_iat2iwt(const int& npol_in)
 {
-	this->iat2iwt.resize(this->nat);
-	this->npol = npol_in;
-	int iat=0;
-	int iwt=0;
-	for(int it = 0;it < this->ntype; it++)
-	{
-		for(int ia=0; ia<atoms[it].na; ia++)
-		{
-			this->iat2iwt[iat] = iwt;
-			iwt += atoms[it].nw * this->npol;
-			++iat;
-		}	
-	}
-	return;
+    this->iat2iwt.resize(this->nat);
+    this->npol = npol_in;
+    int iat = 0;
+    int iwt = 0;
+    for (int it = 0; it < this->ntype; it++)
+    {
+        for (int ia = 0; ia < atoms[it].na; ia++)
+        {
+            this->iat2iwt[iat] = iwt;
+            iwt += atoms[it].nw * this->npol;
+            ++iat;
+        }
+    }
+    return;
 }
 
 // mock of OperatorLCAO
@@ -101,13 +101,13 @@ template class hamilt::Operator<double, base_device::DEVICE_CPU>;
 template class hamilt::Operator<std::complex<double>, base_device::DEVICE_CPU>;*/
 
 // mock of OperatorLCAO
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void hamilt::OperatorLCAO<TK, TR>::init(const int ik_in)
 {
-    if(!this->hr_done)
+    if (!this->hr_done)
     {
         OperatorLCAO<TK, TR>* last = this;
-        while(last != nullptr)
+        while (last != nullptr)
         {
             last->contributeHR();
             last = dynamic_cast<OperatorLCAO<TK, TR>*>(last->next_sub_op);
@@ -130,7 +130,7 @@ void hamilt::OperatorLCAO<TK, TR>::contributeHk(int ik)
         hamilt::folding_HR(*this->hR, this->hK->data(), this->kvec_d[ik], ncol, 0);
     }
 }
-template<typename TK, typename TR>
+template <typename TK, typename TR>
 void hamilt::OperatorLCAO<TK, TR>::get_hs_pointers()
 {
     return;
@@ -142,95 +142,84 @@ template class hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>;
 // mock of ORB_gen_tables and LCAO_Orbitals
 //#include "module_basis/module_ao/ORB_gen_tables.h"
 #include "module_basis/module_nao/two_center_integrator.h"
-TwoCenterIntegrator::TwoCenterIntegrator() {}
+TwoCenterIntegrator::TwoCenterIntegrator()
+{
+}
 
-void TwoCenterIntegrator::tabulate(
-    const RadialCollection& bra,
-    const RadialCollection& ket,
-    const char op,
-    const int nr,
-    const double cutoff
-) {}
+void TwoCenterIntegrator::tabulate(const RadialCollection& bra,
+                                   const RadialCollection& ket,
+                                   const char op,
+                                   const int nr,
+                                   const double cutoff)
+{
+}
 
-void TwoCenterIntegrator::calculate(
-    const int itype1, 
-    const int l1, 
-    const int izeta1, 
-    const int m1, 
-    const int itype2,
-    const int l2,
-    const int izeta2,
-    const int m2,
-    const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
-    double* out,
-    double* grad_out
-) const
+void TwoCenterIntegrator::calculate(const int itype1,
+                                    const int l1,
+                                    const int izeta1,
+                                    const int m1,
+                                    const int itype2,
+                                    const int l2,
+                                    const int izeta2,
+                                    const int m2,
+                                    const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
+                                    double* out,
+                                    double* grad_out) const
 {
     out[0] = 1.0;
 }
 
-void TwoCenterIntegrator::snap(
-    const int itype1, 
-    const int l1, 
-    const int izeta1, 
-    const int m1, 
-    const int itype2,
-    const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
-    const bool deriv,
-    std::vector<std::vector<double>>& out
-) const
+void TwoCenterIntegrator::snap(const int itype1,
+                               const int l1,
+                               const int izeta1,
+                               const int m1,
+                               const int itype2,
+                               const ModuleBase::Vector3<double>& vR, // vR = R2 - R1
+                               const bool deriv,
+                               std::vector<std::vector<double>>& out) const
 {
     out.resize(1);
-    for(int i = 0; i < out.size(); ++i)
+    for (int i = 0; i < out.size(); ++i)
     {
         out[i].resize(5, 1.0);
     }
 }
 
-//ORB_gen_tables::ORB_gen_tables() : two_center_bundle(new TwoCenterBundle) {}
-//ORB_gen_tables::~ORB_gen_tables() {}
-//ORB_gaunt_table::ORB_gaunt_table() {}
-//ORB_gaunt_table::~ORB_gaunt_table() {}
-//ORB_table_phi::ORB_table_phi() {}
-//ORB_table_phi::~ORB_table_phi() {}
-//ORB_table_alpha::ORB_table_alpha() {}
-//ORB_table_alpha::~ORB_table_alpha() {}
-//ORB_table_beta::ORB_table_beta() {}
-//ORB_table_beta::~ORB_table_beta() {}
-// mock of snap_psipsi
-//void ORB_gen_tables::snap_psipsi(
-//    const LCAO_Orbitals &orb,
-//    double olm[],
-//    const int &job, ///<[in]0 for matrix element of either S or T, 1 for its derivatives
-//    const char &dtype, ///<[in] derivative type, 'S' for overlap, 'T' for kinetic energy, 'D' for descriptor in deepks
-//    const ModuleBase::Vector3<double> &R1,
-//    const int &I1,
-//    const int &l1,
-//    const int &m1,
-//    const int &n1,
-//    const ModuleBase::Vector3<double> &R2,
-//    const int &I2,
-//    const int &l2,
-//    const int &m2,
-//    const int &n2,
-//    bool cal_syns,
-//    double dmax)const
+// ORB_gen_tables::ORB_gen_tables() : two_center_bundle(new TwoCenterBundle) {}
+// ORB_gen_tables::~ORB_gen_tables() {}
+// ORB_gaunt_table::ORB_gaunt_table() {}
+// ORB_gaunt_table::~ORB_gaunt_table() {}
+// ORB_table_phi::ORB_table_phi() {}
+// ORB_table_phi::~ORB_table_phi() {}
+// ORB_table_alpha::ORB_table_alpha() {}
+// ORB_table_alpha::~ORB_table_alpha() {}
+// ORB_table_beta::ORB_table_beta() {}
+// ORB_table_beta::~ORB_table_beta() {}
+//  mock of snap_psipsi
+// void ORB_gen_tables::snap_psipsi(
+//     const LCAO_Orbitals &orb,
+//     double olm[],
+//     const int &job, ///<[in]0 for matrix element of either S or T, 1 for its derivatives
+//     const char &dtype, ///<[in] derivative type, 'S' for overlap, 'T' for kinetic energy, 'D' for descriptor in
+//     deepks const ModuleBase::Vector3<double> &R1, const int &I1, const int &l1, const int &m1, const int &n1, const
+//     ModuleBase::Vector3<double> &R2, const int &I2, const int &l2, const int &m2, const int &n2, bool cal_syns,
+//     double dmax)const
 //{
-//    if(dtype == 'S')
-//    {
-//        olm[0] = 1.0;
-//    }
-//    else if(dtype == 'T')
-//    {
-//        olm[0] = 2.0;
-//    }
-//    else if(dtype == 'D')
-//    {
-//        olm[0] = 3.0;
-//    }
-//}
+//     if(dtype == 'S')
+//     {
+//         olm[0] = 1.0;
+//     }
+//     else if(dtype == 'T')
+//     {
+//         olm[0] = 2.0;
+//     }
+//     else if(dtype == 'D')
+//     {
+//         olm[0] = 3.0;
+//     }
+// }
 
-//void ORB_gen_tables::snap_psibeta_half(
+// void ORB_gen_tables::snap_psibeta_half(
 //	const LCAO_Orbitals &orb,
 //	const InfoNonlocal &infoNL_,
 //	std::vector<std::vector<double>> &nlm,
@@ -243,12 +232,12 @@ void TwoCenterIntegrator::snap(
 //	const int &T0,
 //	const bool &calc_deri)const // mohan add 2021-04-25)
 //{
-//    nlm.resize(1);
-//    for(int i = 0; i < nlm.size(); ++i)
-//    {
-//        nlm[i].resize(5, 1.0);
-//    }
-//}
+//     nlm.resize(1);
+//     for(int i = 0; i < nlm.size(); ++i)
+//     {
+//         nlm[i].resize(5, 1.0);
+//     }
+// }
 
 #include "module_basis/module_ao/ORB_read.h"
 const LCAO_Orbitals& LCAO_Orbitals::get_const_instance()
@@ -256,8 +245,14 @@ const LCAO_Orbitals& LCAO_Orbitals::get_const_instance()
     static LCAO_Orbitals instance;
     return instance;
 }
-LCAO_Orbitals::LCAO_Orbitals() {this->Phi = new Numerical_Orbital[1];}
-LCAO_Orbitals::~LCAO_Orbitals() { delete[] Phi; }
+LCAO_Orbitals::LCAO_Orbitals()
+{
+    this->Phi = new Numerical_Orbital[1];
+}
+LCAO_Orbitals::~LCAO_Orbitals()
+{
+    delete[] Phi;
+}
 
 #include "module_cell/module_neighbor/sltk_grid_driver.h"
 // mock find_atom() function
@@ -267,12 +262,12 @@ void Grid_Driver::Find_atom(const UnitCell& ucell,
                             const int& I,
                             AdjacentAtomInfo* adjs)
 {
-    adjs->adj_num = ucell.nat - 1 ;
+    adjs->adj_num = ucell.nat - 1;
     adjs->adjacent_tau.resize(ucell.nat);
     adjs->ntype.resize(ucell.nat, 0);
     adjs->natom.resize(ucell.nat);
     adjs->box.resize(ucell.nat);
-    for(int iat = 0;iat<ucell.nat;iat++)
+    for (int iat = 0; iat < ucell.nat; iat++)
     {
         adjs->natom[iat] = iat;
         adjs->box[iat].x = 1;
@@ -281,29 +276,35 @@ void Grid_Driver::Find_atom(const UnitCell& ucell,
         adjs->adjacent_tau[iat] = ucell.get_tau(iat);
     }
 }
-Grid::Grid(const int &test_grid_in):test_grid(test_grid_in)
-{}
-Grid::~Grid(){}
-Grid_Driver::Grid_Driver(const int &test_d_in, 
-		const int &test_gd_in, 
-		const int &test_grid_in) :Grid(test_grid_in), test_deconstructor(test_d_in), test_grid_driver(test_gd_in) {}
-Grid_Driver::~Grid_Driver() {}
+Grid::Grid(const int& test_grid_in) : test_grid(test_grid_in)
+{
+}
+Grid::~Grid()
+{
+}
+Grid_Driver::Grid_Driver(const int& test_d_in, const int& test_gd_in, const int& test_grid_in)
+    : Grid(test_grid_in), test_deconstructor(test_d_in), test_grid_driver(test_gd_in)
+{
+}
+Grid_Driver::~Grid_Driver()
+{
+}
 
 // filter_adjs delete not adjacent atoms in adjs
 void filter_adjs(const std::vector<bool>& is_adj, AdjacentAtomInfo& adjs)
 {
-	const int size = adjs.adj_num+1;
-	for(int i = size-1; i >= 0; --i)
-	{
-		if(!is_adj[i])
-		{
-			adjs.adj_num--;
-			adjs.ntype.erase(adjs.ntype.begin()+i);
-			adjs.natom.erase(adjs.natom.begin()+i);
-			adjs.adjacent_tau.erase(adjs.adjacent_tau.begin()+i);
-			adjs.box.erase(adjs.box.begin()+i);
-		}
-	}
+    const int size = adjs.adj_num + 1;
+    for (int i = size - 1; i >= 0; --i)
+    {
+        if (!is_adj[i])
+        {
+            adjs.adj_num--;
+            adjs.ntype.erase(adjs.ntype.begin() + i);
+            adjs.natom.erase(adjs.natom.begin() + i);
+            adjs.adjacent_tau.erase(adjs.adjacent_tau.begin() + i);
+            adjs.box.erase(adjs.box.begin() + i);
+        }
+    }
 }
 
 Numerical_Nonlocal::Numerical_Nonlocal()

From 5e1dc82581a5b19e1e7302d1ff3a170adc2b1491 Mon Sep 17 00:00:00 2001
From: jinzx10 <jzx016@hotmail.com>
Date: Wed, 19 Jun 2024 23:27:29 +0800
Subject: [PATCH 7/8] fix bug

---
 .../operator_lcao/dftu_force_stress.hpp       | 10 +--
 .../operator_lcao/test/tmp_mocks.cpp          | 66 +------------------
 2 files changed, 2 insertions(+), 74 deletions(-)

diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
index e1f46a83c7..287ce920b4 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
@@ -75,10 +75,6 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
                 // If we are calculating force, we need also to store the gradient
                 // and size of outer vector is then 4
                 // inner loop : all projectors (L0,M0)
-#ifdef USE_NEW_TWO_CENTER
-                //=================================================================
-                //          new two-center integral (temporary)
-                //=================================================================
                 int L1 = atom1->iw2l[ iw1 ];
                 int N1 = atom1->iw2n[ iw1 ];
                 int m1 = atom1->iw2m[ iw1 ];
@@ -87,11 +83,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
                 int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
 
                 ModuleBase::Vector3<double> dtau = tau0 - tau1;
-                uot_->two_center_bundle->overlap_orb_onsite->snap(
-                        T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 1 /*cal_deri*/, nlm);
-#else
-                ModuleBase::WARNING_QUIT("DFTU", "old two center integral method not implemented");
-#endif
+                intor_->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 1 /*cal_deri*/, nlm);
                 // select the elements of nlm with target_L
                 std::vector<double> nlm_target(tlp1 * 4);
                 for(int iw =0;iw < this->ucell->atoms[T0].nw; iw++)
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
index 6d9dfa383f..90fcf023f1 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/test/tmp_mocks.cpp
@@ -139,8 +139,7 @@ template class hamilt::OperatorLCAO<double, double>;
 template class hamilt::OperatorLCAO<std::complex<double>, double>;
 template class hamilt::OperatorLCAO<std::complex<double>, std::complex<double>>;
 
-// mock of ORB_gen_tables and LCAO_Orbitals
-//#include "module_basis/module_ao/ORB_gen_tables.h"
+// mock of TwoCenterIntegrator and LCAO_Orbitals
 #include "module_basis/module_nao/two_center_integrator.h"
 TwoCenterIntegrator::TwoCenterIntegrator() {}
 
@@ -187,69 +186,6 @@ void TwoCenterIntegrator::snap(
     }
 }
 
-//ORB_gen_tables::ORB_gen_tables() : two_center_bundle(new TwoCenterBundle) {}
-//ORB_gen_tables::~ORB_gen_tables() {}
-//ORB_gaunt_table::ORB_gaunt_table() {}
-//ORB_gaunt_table::~ORB_gaunt_table() {}
-//ORB_table_phi::ORB_table_phi() {}
-//ORB_table_phi::~ORB_table_phi() {}
-//ORB_table_alpha::ORB_table_alpha() {}
-//ORB_table_alpha::~ORB_table_alpha() {}
-//ORB_table_beta::ORB_table_beta() {}
-//ORB_table_beta::~ORB_table_beta() {}
-// mock of snap_psipsi
-//void ORB_gen_tables::snap_psipsi(
-//    const LCAO_Orbitals &orb,
-//    double olm[],
-//    const int &job, ///<[in]0 for matrix element of either S or T, 1 for its derivatives
-//    const char &dtype, ///<[in] derivative type, 'S' for overlap, 'T' for kinetic energy, 'D' for descriptor in deepks
-//    const ModuleBase::Vector3<double> &R1,
-//    const int &I1,
-//    const int &l1,
-//    const int &m1,
-//    const int &n1,
-//    const ModuleBase::Vector3<double> &R2,
-//    const int &I2,
-//    const int &l2,
-//    const int &m2,
-//    const int &n2,
-//    bool cal_syns,
-//    double dmax)const
-//{
-//    if(dtype == 'S')
-//    {
-//        olm[0] = 1.0;
-//    }
-//    else if(dtype == 'T')
-//    {
-//        olm[0] = 2.0;
-//    }
-//    else if(dtype == 'D')
-//    {
-//        olm[0] = 3.0;
-//    }
-//}
-
-//void ORB_gen_tables::snap_psibeta_half(
-//	const LCAO_Orbitals &orb,
-//	const InfoNonlocal &infoNL_,
-//	std::vector<std::vector<double>> &nlm,
-//	const ModuleBase::Vector3<double> &R1,
-//	const int &T1,
-//	const int &L1,
-//	const int &m1,
-//	const int &N1,
-//	const ModuleBase::Vector3<double> &R0, // The projector.
-//	const int &T0,
-//	const bool &calc_deri)const // mohan add 2021-04-25)
-//{
-//    nlm.resize(1);
-//    for(int i = 0; i < nlm.size(); ++i)
-//    {
-//        nlm[i].resize(5, 1.0);
-//    }
-//}
-
 #include "module_basis/module_ao/ORB_read.h"
 const LCAO_Orbitals& LCAO_Orbitals::get_const_instance()
 {

From 0cc3dd3cb14587cbc6f575e5863a06fca7867d3d Mon Sep 17 00:00:00 2001
From: "pre-commit-ci-lite[bot]"
 <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
Date: Wed, 19 Jun 2024 16:04:18 +0000
Subject: [PATCH 8/8] [pre-commit.ci lite] apply automatic fixes

---
 .../operator_lcao/deepks_lcao.cpp             |   2 +-
 .../operator_lcao/dftu_force_stress.hpp       | 202 +++++++++++-------
 2 files changed, 120 insertions(+), 84 deletions(-)

diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
index 4fcad5b6db..637eedb9c0 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/deepks_lcao.cpp
@@ -286,7 +286,7 @@ void hamilt::DeePKS<hamilt::OperatorLCAO<TK, TR>>::pre_calculate_nlm(
 
             ModuleBase::Vector3<double> dtau = tau0 - tau1;
             uot_->two_center_bundle->overlap_orb_alpha
-                ->snap(T1, L1, N1, M1, 0, dtau * ucell->lat0, 0 /*calc_deri*/, nlm);
+                ->snap(T1, L1, N1, M1, 0, dtau * ucell->lat0, false /*calc_deri*/, nlm);
             nlm_in[ad].insert({all_indexes[iw1l], nlm[0]});
             if (npol == 2)
                 nlm_in[ad].insert({all_indexes[iw1l + 1], nlm[0]});
diff --git a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
index 287ce920b4..e7fbf8c0ac 100644
--- a/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
+++ b/source/module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dftu_force_stress.hpp
@@ -6,22 +6,22 @@ namespace hamilt
 {
 
 template <typename TK, typename TR>
-void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
-  const bool cal_force, 
-  const bool cal_stress, 
-  ModuleBase::matrix& force, 
-  ModuleBase::matrix& stress)
+void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(const bool cal_force,
+                                                  const bool cal_stress,
+                                                  ModuleBase::matrix& force,
+                                                  ModuleBase::matrix& stress)
 {
-    ModuleBase::TITLE("DFTU", "cal_force_stress");    
-    if(this->dftu->get_dmr(0) == nullptr)
-    { 
+    ModuleBase::TITLE("DFTU", "cal_force_stress");
+    if (this->dftu->get_dmr(0) == nullptr)
+    {
         ModuleBase::WARNING_QUIT("DFTU", "dmr is not set");
     }
-    //try to get the density matrix, if the density matrix is empty, skip the calculation and return
+    // try to get the density matrix, if the density matrix is empty, skip the calculation and return
     const hamilt::HContainer<double>* dmR_tmp[this->nspin];
     dmR_tmp[0] = this->dftu->get_dmr(0);
-    if(this->nspin==2) dmR_tmp[1] = this->dftu->get_dmr(1);
-    if(dmR_tmp[0]->size_atom_pairs() == 0)
+    if (this->nspin == 2)
+        dmR_tmp[1] = this->dftu->get_dmr(1);
+    if (dmR_tmp[0]->size_atom_pairs() == 0)
     {
         return;
     }
@@ -45,7 +45,8 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
         int T0, I0;
         ucell->iat2iait(iat0, &I0, &T0);
         const int target_L = this->dftu->orbital_corr[T0];
-        if(target_L == -1) continue;
+        if (target_L == -1)
+            continue;
         const int tlp1 = 2 * target_L + 1;
         AdjacentAtomInfo& adjs = this->adjs_all[atom_index++];
 
@@ -75,28 +76,30 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
                 // If we are calculating force, we need also to store the gradient
                 // and size of outer vector is then 4
                 // inner loop : all projectors (L0,M0)
-                int L1 = atom1->iw2l[ iw1 ];
-                int N1 = atom1->iw2n[ iw1 ];
-                int m1 = atom1->iw2m[ iw1 ];
+                int L1 = atom1->iw2l[iw1];
+                int N1 = atom1->iw2n[iw1];
+                int m1 = atom1->iw2m[iw1];
 
                 // convert m (0,1,...2l) to M (-l, -l+1, ..., l-1, l)
-                int M1 = (m1 % 2 == 0) ? -m1/2 : (m1+1)/2;
+                int M1 = (m1 % 2 == 0) ? -m1 / 2 : (m1 + 1) / 2;
 
                 ModuleBase::Vector3<double> dtau = tau0 - tau1;
                 intor_->snap(T1, L1, N1, M1, T0, dtau * this->ucell->lat0, 1 /*cal_deri*/, nlm);
                 // select the elements of nlm with target_L
                 std::vector<double> nlm_target(tlp1 * 4);
-                for(int iw =0;iw < this->ucell->atoms[T0].nw; iw++)
+                for (int iw = 0; iw < this->ucell->atoms[T0].nw; iw++)
                 {
                     const int L0 = this->ucell->atoms[T0].iw2l[iw];
-                    if(L0 == target_L)
+                    if (L0 == target_L)
                     {
-                        for(int m = 0; m < tlp1; m++)//-l, -l+1, ..., l-1, l
+                        for (int m = 0; m < tlp1; m++) //-l, -l+1, ..., l-1, l
                         {
-                            for(int n = 0; n < 4; n++)// value, deri_x, deri_y, deri_z
+                            for (int n = 0; n < 4; n++) // value, deri_x, deri_y, deri_z
                             {
-                                nlm_target[m + n * tlp1] = nlm[n][iw+m];
-                                //if(dtau.norm2 == 0.0) std::cout<<__FILE__<<__LINE__<<" "<<m<<" "<<n<<" "<<(m+n*tlp1)<<" "<<iw+m<<" "<<nlm[n][iw+m]<<" "<<nlm_target[m + n * tlp1] << std::endl;
+                                nlm_target[m + n * tlp1] = nlm[n][iw + m];
+                                // if(dtau.norm2 == 0.0) std::cout<<__FILE__<<__LINE__<<" "<<m<<" "<<n<<"
+                                // "<<(m+n*tlp1)<<" "<<iw+m<<" "<<nlm[n][iw+m]<<" "<<nlm_target[m + n * tlp1] <<
+                                // std::endl;
                             }
                         }
                         break;
@@ -105,16 +108,16 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
                 nlm_tot[ad].insert({all_indexes[iw1l], nlm_target});
             }
         }
-        //first iteration to calculate occupation matrix
+        // first iteration to calculate occupation matrix
         std::vector<double> occ(tlp1 * tlp1 * this->nspin, 0);
-        for(int i=0;i<occ.size();i++)
+        for (int i = 0; i < occ.size(); i++)
         {
             const int is = i / (tlp1 * tlp1);
             const int ii = i % (tlp1 * tlp1);
             occ[i] = this->dftu->locale[iat0][target_L][0][is].c[ii];
         }
-        
-        //calculate VU
+
+        // calculate VU
         const double u_value = this->dftu->U[T0];
         std::vector<double> VU(occ.size());
         double eu_tmp = 0;
@@ -122,11 +125,11 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
 
         // second iteration to calculate force and stress
         // calculate Force for atom J
-        //     DMR_{I,J,R'-R} * <phi_{I,R}|chi_m> U*(1/2*delta(m, m')-occ(m, m')) 
+        //     DMR_{I,J,R'-R} * <phi_{I,R}|chi_m> U*(1/2*delta(m, m')-occ(m, m'))
         //     \frac{\partial <chi_m'|phi_{J,R'}>}{\partial \tau_J} for each pair of <IJR> atoms
         // calculate Stress for strain tensor \varepsilon_{\alpha\beta}
-        //     -1/Omega * DMR_{I,J,R'-R} * [ \frac{\partial <phi_{I,R}|chi_m>}{\partial \tau_{J,\alpha}}\tau_{J,\beta} 
-        //     U*(1/2*delta(m, m')-occ(m, m'))<chi_m'|phi_{J,R'}>  
+        //     -1/Omega * DMR_{I,J,R'-R} * [ \frac{\partial <phi_{I,R}|chi_m>}{\partial \tau_{J,\alpha}}\tau_{J,\beta}
+        //     U*(1/2*delta(m, m')-occ(m, m'))<chi_m'|phi_{J,R'}>
         //   + <phi_{I,R}|chi_m> U*(1/2*delta(m, m')-occ(m, m'))
         //     \frac{\partial <chi_m'|phi_{J,R'}>}{\partial \tau_{J,\alpha}}\tau_{J,\beta}] for each pair of <IJR> atoms
         for (int ad1 = 0; ad1 < adjs.adj_num + 1; ++ad1)
@@ -134,8 +137,8 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
             const int T1 = adjs.ntype[ad1];
             const int I1 = adjs.natom[ad1];
             const int iat1 = ucell->itia2iat(T1, I1);
-            double* force_tmp1 = (cal_force)? &force(iat1, 0) : nullptr;
-            double* force_tmp2 = (cal_force)? &force(iat0, 0) : nullptr;
+            double* force_tmp1 = (cal_force) ? &force(iat1, 0) : nullptr;
+            double* force_tmp2 = (cal_force) ? &force(iat0, 0) : nullptr;
             ModuleBase::Vector3<int>& R_index1 = adjs.box[ad1];
             ModuleBase::Vector3<double> dis1 = adjs.adjacent_tau[ad1] - tau0;
             for (int ad2 = 0; ad2 < adjs.adj_num + 1; ++ad2)
@@ -150,7 +153,7 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
                                                   R_index2[2] - R_index1[2]);
                 const hamilt::BaseMatrix<double>* tmp[this->nspin];
                 tmp[0] = dmR_tmp[0]->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
-                if(this->nspin == 2)
+                if (this->nspin == 2)
                 {
                     tmp[1] = dmR_tmp[1]->find_matrix(iat1, iat2, R_vector[0], R_vector[1], R_vector[2]);
                 }
@@ -158,36 +161,57 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
                 if (tmp[0] != nullptr)
                 {
                     // calculate force
-                    if (cal_force) this->cal_force_IJR(iat1, iat2, paraV, nlm_tot[ad1], nlm_tot[ad2], VU, tmp, this->nspin, force_tmp1, force_tmp2);
+                    if (cal_force)
+                        this->cal_force_IJR(iat1,
+                                            iat2,
+                                            paraV,
+                                            nlm_tot[ad1],
+                                            nlm_tot[ad2],
+                                            VU,
+                                            tmp,
+                                            this->nspin,
+                                            force_tmp1,
+                                            force_tmp2);
 
                     // calculate stress
-                    if (cal_stress) this->cal_stress_IJR(iat1, iat2, paraV, nlm_tot[ad1], nlm_tot[ad2], VU, tmp, this->nspin, dis1, dis2, stress_tmp.data());
+                    if (cal_stress)
+                        this->cal_stress_IJR(iat1,
+                                             iat2,
+                                             paraV,
+                                             nlm_tot[ad1],
+                                             nlm_tot[ad2],
+                                             VU,
+                                             tmp,
+                                             this->nspin,
+                                             dis1,
+                                             dis2,
+                                             stress_tmp.data());
                 }
             }
         }
     }
 
-    if(cal_force)
+    if (cal_force)
     {
 #ifdef __MPI
         // sum up the occupation matrix
-        Parallel_Reduce::reduce_all(force.c, force.nr*force.nc);
+        Parallel_Reduce::reduce_all(force.c, force.nr * force.nc);
 #endif
-        for(int i=0;i<force.nr*force.nc;i++)
+        for (int i = 0; i < force.nr * force.nc; i++)
         {
             force.c[i] *= 2.0;
         }
     }
 
     // stress renormalization
-    if(cal_stress)
+    if (cal_stress)
     {
 #ifdef __MPI
         // sum up the occupation matrix
         Parallel_Reduce::reduce_all(stress_tmp.data(), 6);
 #endif
         const double weight = this->ucell->lat0 / this->ucell->omega;
-        for(int i=0;i<6;i++)
+        for (int i = 0; i < 6; i++)
         {
             stress.c[i] = stress_tmp[i] * weight;
         }
@@ -203,17 +227,16 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_stress(
 }
 
 template <typename TK, typename TR>
-void DFTU<OperatorLCAO<TK, TR>>::cal_force_IJR(
-    const int& iat1,
-    const int& iat2,
-    const Parallel_Orbitals* paraV,
-    const std::unordered_map<int, std::vector<double>>& nlm1_all,
-    const std::unordered_map<int, std::vector<double>>& nlm2_all,
-    const std::vector<double>& vu_in,
-    const hamilt::BaseMatrix<double>** dmR_pointer,
-    const int nspin,
-    double* force1,
-    double* force2)
+void DFTU<OperatorLCAO<TK, TR>>::cal_force_IJR(const int& iat1,
+                                               const int& iat2,
+                                               const Parallel_Orbitals* paraV,
+                                               const std::unordered_map<int, std::vector<double>>& nlm1_all,
+                                               const std::unordered_map<int, std::vector<double>>& nlm2_all,
+                                               const std::vector<double>& vu_in,
+                                               const hamilt::BaseMatrix<double>** dmR_pointer,
+                                               const int nspin,
+                                               double* force1,
+                                               double* force2)
 {
     // npol is the number of polarizations,
     // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
@@ -224,11 +247,12 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_IJR(
     // ---------------------------------------------
     auto row_indexes = paraV->get_indexes_row(iat1);
     auto col_indexes = paraV->get_indexes_col(iat2);
-    const int m_size = int(sqrt(vu_in.size()/nspin));
+    const int m_size = int(sqrt(vu_in.size() / nspin));
     const int m_size2 = m_size * m_size;
     // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
     std::vector<int> step_trace(npol, 0);
-    if(npol == 2) step_trace[1] = col_indexes.size() + 1;
+    if (npol == 2)
+        step_trace[1] = col_indexes.size() + 1;
     double tmp[3];
     // calculate the local matrix
     for (int is = 0; is < nspin; is++)
@@ -240,16 +264,18 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_IJR(
             for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
             {
                 const std::vector<double>& nlm2 = nlm2_all.find(col_indexes[iw2l])->second;
-    #ifdef __DEBUG
+#ifdef __DEBUG
                 assert(nlm1.size() == nlm2.size());
-    #endif
+#endif
                 for (int m1 = 0; m1 < m_size; m1++)
                 {
-                    for(int m2 = 0; m2 < m_size; m2++)
+                    for (int m2 = 0; m2 < m_size; m2++)
                     {
-                        tmp[0] = vu_in[m1 * m_size + m2 + is*m_size2] * nlm1[m1 + m_size] * nlm2[m2] * dm_pointer[0];
-                        tmp[1] = vu_in[m1 * m_size + m2 + is*m_size2] * nlm1[m1 + m_size*2] * nlm2[m2] * dm_pointer[0];
-                        tmp[2] = vu_in[m1 * m_size + m2 + is*m_size2] * nlm1[m1 + m_size*3] * nlm2[m2] * dm_pointer[0];
+                        tmp[0] = vu_in[m1 * m_size + m2 + is * m_size2] * nlm1[m1 + m_size] * nlm2[m2] * dm_pointer[0];
+                        tmp[1]
+                            = vu_in[m1 * m_size + m2 + is * m_size2] * nlm1[m1 + m_size * 2] * nlm2[m2] * dm_pointer[0];
+                        tmp[2]
+                            = vu_in[m1 * m_size + m2 + is * m_size2] * nlm1[m1 + m_size * 3] * nlm2[m2] * dm_pointer[0];
                         // force1 = - VU * <d phi_{I,R1}/d R1|chi_m> * <chi_m'|phi_{J,R2}>
                         // force2 = - VU * <phi_{I,R1}|d chi_m/d R0> * <chi_m'|phi_{J,R2>}
                         force1[0] += tmp[0];
@@ -268,18 +294,17 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_force_IJR(
 }
 
 template <typename TK, typename TR>
-void DFTU<OperatorLCAO<TK, TR>>::cal_stress_IJR(
-    const int& iat1,
-    const int& iat2,
-    const Parallel_Orbitals* paraV,
-    const std::unordered_map<int, std::vector<double>>& nlm1_all,
-    const std::unordered_map<int, std::vector<double>>& nlm2_all,
-    const std::vector<double>& vu_in,
-    const hamilt::BaseMatrix<double>** dmR_pointer,
-    const int nspin,
-    const ModuleBase::Vector3<double>& dis1,
-    const ModuleBase::Vector3<double>& dis2,
-    double* stress)
+void DFTU<OperatorLCAO<TK, TR>>::cal_stress_IJR(const int& iat1,
+                                                const int& iat2,
+                                                const Parallel_Orbitals* paraV,
+                                                const std::unordered_map<int, std::vector<double>>& nlm1_all,
+                                                const std::unordered_map<int, std::vector<double>>& nlm2_all,
+                                                const std::vector<double>& vu_in,
+                                                const hamilt::BaseMatrix<double>** dmR_pointer,
+                                                const int nspin,
+                                                const ModuleBase::Vector3<double>& dis1,
+                                                const ModuleBase::Vector3<double>& dis2,
+                                                double* stress)
 {
     // npol is the number of polarizations,
     // 1 for non-magnetic (one Hamiltonian matrix only has spin-up or spin-down),
@@ -290,11 +315,12 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_stress_IJR(
     // ---------------------------------------------
     auto row_indexes = paraV->get_indexes_row(iat1);
     auto col_indexes = paraV->get_indexes_col(iat2);
-    const int m_size = int(sqrt(vu_in.size()/nspin));
+    const int m_size = int(sqrt(vu_in.size() / nspin));
     const int m_size2 = m_size * m_size;
     // step_trace = 0 for NSPIN=1,2; ={0, 1, local_col, local_col+1} for NSPIN=4
     std::vector<int> step_trace(npol, 0);
-    if(npol == 2) step_trace[1] = col_indexes.size() + 1;
+    if (npol == 2)
+        step_trace[1] = col_indexes.size() + 1;
     // calculate the local matrix
     for (int is = 0; is < nspin; is++)
     {
@@ -305,21 +331,31 @@ void DFTU<OperatorLCAO<TK, TR>>::cal_stress_IJR(
             for (int iw2l = 0; iw2l < col_indexes.size(); iw2l += npol)
             {
                 const std::vector<double>& nlm2 = nlm2_all.find(col_indexes[iw2l])->second;
-    #ifdef __DEBUG
+#ifdef __DEBUG
                 assert(nlm1.size() == nlm2.size());
-    #endif
+#endif
                 for (int m1 = 0; m1 < m_size; m1++)
                 {
-                    for(int m2 = 0; m2 < m_size; m2++)
+                    for (int m2 = 0; m2 < m_size; m2++)
                     {
-                        double tmp = vu_in[m1 * m_size + m2 + is*m_size2] * dm_pointer[0];
-                        //std::cout<<__FILE__<<__LINE__<<" "<<tmp<<" "<<m1<<" "<<m2<<" "<<nlm1[m1 + m_size * 2]<<" "<<nlm2[m2 + m_size * 2]<<" "<<dis1.y<<" "<<dis2.y<<std::endl;
-                        stress[0] += tmp * (nlm1[m1 + m_size] * dis1.x * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size] * dis2.x);
-                        stress[1] += tmp * (nlm1[m1 + m_size] * dis1.y * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size] * dis2.y);
-                        stress[2] += tmp * (nlm1[m1 + m_size] * dis1.z * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size] * dis2.z);
-                        stress[3] += tmp * (nlm1[m1 + m_size*2] * dis1.y * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size*2] * dis2.y);
-                        stress[4] += tmp * (nlm1[m1 + m_size*2] * dis1.z * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size*2] * dis2.z);
-                        stress[5] += tmp * (nlm1[m1 + m_size*3] * dis1.z * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size*3] * dis2.z);
+                        double tmp = vu_in[m1 * m_size + m2 + is * m_size2] * dm_pointer[0];
+                        // std::cout<<__FILE__<<__LINE__<<" "<<tmp<<" "<<m1<<" "<<m2<<" "<<nlm1[m1 + m_size * 2]<<"
+                        // "<<nlm2[m2 + m_size * 2]<<" "<<dis1.y<<" "<<dis2.y<<std::endl;
+                        stress[0]
+                            += tmp * (nlm1[m1 + m_size] * dis1.x * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size] * dis2.x);
+                        stress[1]
+                            += tmp * (nlm1[m1 + m_size] * dis1.y * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size] * dis2.y);
+                        stress[2]
+                            += tmp * (nlm1[m1 + m_size] * dis1.z * nlm2[m2] + nlm1[m1] * nlm2[m2 + m_size] * dis2.z);
+                        stress[3] += tmp
+                                     * (nlm1[m1 + m_size * 2] * dis1.y * nlm2[m2]
+                                        + nlm1[m1] * nlm2[m2 + m_size * 2] * dis2.y);
+                        stress[4] += tmp
+                                     * (nlm1[m1 + m_size * 2] * dis1.z * nlm2[m2]
+                                        + nlm1[m1] * nlm2[m2 + m_size * 2] * dis2.z);
+                        stress[5] += tmp
+                                     * (nlm1[m1 + m_size * 3] * dis1.z * nlm2[m2]
+                                        + nlm1[m1] * nlm2[m2 + m_size * 3] * dis2.z);
                     }
                 }
                 dm_pointer++;