diff --git a/CMakeLists.txt b/CMakeLists.txt index b53098a001..9086f0b6c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -711,7 +711,9 @@ target_link_libraries( esolver vdw device - container) + container + dftu + deltaspin) if(ENABLE_LCAO) target_link_libraries( ${ABACUS_BIN_NAME} @@ -719,9 +721,7 @@ if(ENABLE_LCAO) tddft orb gint - dftu hcontainer - deltaspin numerical_atomic_orbitals lr rdmft) diff --git a/docs/advanced/input_files/input-main.md b/docs/advanced/input_files/input-main.md index ced6c56c54..4b9790fc27 100644 --- a/docs/advanced/input_files/input-main.md +++ b/docs/advanced/input_files/input-main.md @@ -1235,6 +1235,12 @@ Note: In new angle mixing, you should set `mixing_beta_mag >> mixing_beta`. The - **Description**: To determine the number of old iterations' `drho` used in slope calculations. - **Default**: `mixing_ndim` +### sc_os_ndim + +- **Type**: int +- **Description**: To determine the number of old iterations used to judge whether the SCF is oscillating. If oscillation occurred, a more accurate lambda would be calculated with the DeltaSpin method. Only for the PW basis. 
+- **Default**: 5 + ### chg_extrap - **Type**: String diff --git a/python/pyabacus/src/ModuleNAO/CMakeLists.txt b/python/pyabacus/src/ModuleNAO/CMakeLists.txt index 53600a08f3..c5eb016903 100644 --- a/python/pyabacus/src/ModuleNAO/CMakeLists.txt +++ b/python/pyabacus/src/ModuleNAO/CMakeLists.txt @@ -12,7 +12,6 @@ list(APPEND _naos ${NAO_PATH}/two_center_bundle.cpp ${NAO_PATH}/two_center_integrator.cpp ${NAO_PATH}/two_center_table.cpp - ${NAO_PATH}/projgen.cpp # dependency ${ABACUS_SOURCE_DIR}/module_base/kernels/math_op.cpp # ${ABACUS_SOURCE_DIR}/module_psi/kernels/psi_memory_op.cpp diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index e8af89216b..1f4d4a8370 100644 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -47,6 +47,7 @@ list(APPEND device_srcs module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp + module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/wf_op.cpp module_hamilt_pw/hamilt_pwdft/kernels/vnl_op.cpp module_base/kernels/math_op.cpp @@ -60,6 +61,7 @@ if(USE_CUDA) module_hamilt_pw/hamilt_pwdft/kernels/cuda/ekinetic_op.cu module_hamilt_pw/hamilt_pwdft/kernels/cuda/meta_op.cu module_hamilt_pw/hamilt_stodft/kernels/cuda/hpsi_norm_op.cu + module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu module_basis/module_pw/kernels/cuda/pw_op.cu module_hsolver/kernels/cuda/dngvd_op.cu module_hsolver/kernels/cuda/math_kernel_op.cu @@ -83,6 +85,7 @@ if(USE_ROCM) module_hamilt_pw/hamilt_pwdft/kernels/rocm/veff_op.hip.cu module_hamilt_pw/hamilt_pwdft/kernels/rocm/ekinetic_op.hip.cu module_hamilt_pw/hamilt_pwdft/kernels/rocm/meta_op.hip.cu + module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu module_hamilt_pw/hamilt_stodft/kernels/rocm/hpsi_norm_op.hip.cu module_basis/module_pw/kernels/rocm/pw_op.hip.cu module_hsolver/kernels/rocm/dngvd_op.hip.cu diff --git a/source/Makefile.Objects b/source/Makefile.Objects index 79c24632a7..661db25611 100644 --- 
a/source/Makefile.Objects +++ b/source/Makefile.Objects @@ -635,10 +635,13 @@ OBJS_SRCPW=H_Ewald_pw.o\ forces_nl.o\ forces_cc.o\ forces_scc.o\ + forces_onsite.o\ + onsite_proj_pw.o\ fs_nonlocal_tools.o\ fs_kin_tools.o\ force_op.o\ stress_op.o\ + onsite_op.o\ wf_op.o\ vnl_op.o\ global.o\ @@ -663,6 +666,7 @@ OBJS_SRCPW=H_Ewald_pw.o\ stress_func_loc.o\ stress_func_nl.o\ stress_func_us.o\ + stress_func_onsite.o\ stress_pw.o\ of_stress_pw.o\ symmetry_rho.o\ @@ -673,7 +677,9 @@ OBJS_SRCPW=H_Ewald_pw.o\ elecond.o\ sto_tool.o\ sto_elecond.o\ - sto_dos.o + sto_dos.o\ + onsite_projector.o\ + onsite_proj_tools.o OBJS_VDW=vdw.o\ vdwd2_parameters.o\ @@ -691,7 +697,8 @@ OBJS_DFTU=dftu.o\ dftu_io.o\ dftu_tools.o\ dftu_occup.o\ - dftu_hamilt.o + dftu_hamilt.o\ + dftu_pw.o OBJS_DELTASPIN=basic_funcs.o\ cal_mw_from_lambda.o\ diff --git a/source/module_base/CMakeLists.txt b/source/module_base/CMakeLists.txt index 5335be34ac..38c466a2c1 100644 --- a/source/module_base/CMakeLists.txt +++ b/source/module_base/CMakeLists.txt @@ -54,6 +54,7 @@ add_library( spherical_bessel_transformer.cpp cubic_spline.cpp parallel_2d.cpp + projgen.cpp module_mixing/mixing_data.cpp module_mixing/mixing.cpp module_mixing/plain_mixing.cpp diff --git a/source/module_basis/module_nao/projgen.cpp b/source/module_base/projgen.cpp similarity index 100% rename from source/module_basis/module_nao/projgen.cpp rename to source/module_base/projgen.cpp diff --git a/source/module_basis/module_nao/projgen.h b/source/module_base/projgen.h similarity index 100% rename from source/module_basis/module_nao/projgen.h rename to source/module_base/projgen.h diff --git a/source/module_basis/module_nao/CMakeLists.txt b/source/module_basis/module_nao/CMakeLists.txt index 8e54af0778..29e091510f 100644 --- a/source/module_basis/module_nao/CMakeLists.txt +++ b/source/module_basis/module_nao/CMakeLists.txt @@ -14,7 +14,6 @@ if(ENABLE_LCAO) two_center_table.cpp two_center_integrator.cpp two_center_bundle.cpp - projgen.cpp ) 
if(ENABLE_COVERAGE) diff --git a/source/module_basis/module_nao/atomic_radials.cpp b/source/module_basis/module_nao/atomic_radials.cpp index 7d095e70cd..e2461a3970 100644 --- a/source/module_basis/module_nao/atomic_radials.cpp +++ b/source/module_basis/module_nao/atomic_radials.cpp @@ -7,7 +7,7 @@ // FIXME: should update with pyabacus // #include "module_io/orb_io.h" -#include "projgen.h" +#include "module_base/projgen.h" #include #include diff --git a/source/module_basis/module_nao/test/CMakeLists.txt b/source/module_basis/module_nao/test/CMakeLists.txt index 0759f33435..0e4f063be6 100644 --- a/source/module_basis/module_nao/test/CMakeLists.txt +++ b/source/module_basis/module_nao/test/CMakeLists.txt @@ -14,7 +14,6 @@ AddTest( ../atomic_radials.cpp ../radial_set.cpp ../numerical_radial.cpp - ../projgen.cpp ../../module_ao/ORB_atomic_lm.cpp ../../module_ao/ORB_atomic.cpp ../../../module_io/orb_io.cpp @@ -84,7 +83,6 @@ AddTest( ../pswfc_radials.cpp ../radial_set.cpp ../numerical_radial.cpp - ../projgen.cpp ../sphbes_radials.cpp ../../module_ao/ORB_atomic_lm.cpp ../../module_ao/ORB_atomic.cpp @@ -104,7 +102,6 @@ AddTest( ../pswfc_radials.cpp ../sphbes_radials.cpp ../radial_set.cpp - ../projgen.cpp ../numerical_radial.cpp ../two_center_bundle.cpp ../two_center_integrator.cpp @@ -131,7 +128,6 @@ AddTest( ../real_gaunt_table.cpp ../radial_collection.cpp ../atomic_radials.cpp - ../projgen.cpp ../beta_radials.cpp ../hydrogen_radials.cpp ../pswfc_radials.cpp @@ -158,7 +154,6 @@ AddTest( ../pswfc_radials.cpp ../sphbes_radials.cpp ../radial_set.cpp - ../projgen.cpp ../numerical_radial.cpp ../../../module_io/orb_io.cpp LIBS parameter ${math_libs} device base container orb @@ -179,7 +174,6 @@ AddTest( ../pswfc_radials.cpp ../sphbes_radials.cpp ../radial_set.cpp - ../projgen.cpp ../numerical_radial.cpp ../../../module_io/orb_io.cpp LIBS parameter ${math_libs} device base container orb diff --git a/source/module_basis/module_nao/test/projgen_test.cpp 
b/source/module_basis/module_nao/test/projgen_test.cpp index aaea89f5d0..2feaadfb7a 100644 --- a/source/module_basis/module_nao/test/projgen_test.cpp +++ b/source/module_basis/module_nao/test/projgen_test.cpp @@ -1,4 +1,4 @@ -#include "module_basis/module_nao/projgen.h" +#include "module_base/projgen.h" #include "gtest/gtest.h" #include "module_base/math_integral.h" diff --git a/source/module_cell/read_atoms.cpp b/source/module_cell/read_atoms.cpp index a17ab81ae5..81608ce609 100644 --- a/source/module_cell/read_atoms.cpp +++ b/source/module_cell/read_atoms.cpp @@ -455,7 +455,7 @@ bool UnitCell::read_atom_positions(std::ifstream &ifpos, std::ofstream &ofs_runn } else if(PARAM.inp.basis_type == "pw") { - if ((PARAM.inp.psi_initializer)&&(PARAM.inp.init_wfc.substr(0, 3) == "nao")) + if ((PARAM.inp.psi_initializer)&&(PARAM.inp.init_wfc.substr(0, 3) == "nao") || PARAM.inp.onsite_radius > 0.0) { std::string orbital_file = PARAM.inp.orbital_dir + orbital_fn[it]; this->read_orb_file(it, orbital_file, ofs_running, &(atoms[it])); diff --git a/source/module_cell/unitcell.cpp b/source/module_cell/unitcell.cpp index cc7140eb45..9f2b8bdbca 100755 --- a/source/module_cell/unitcell.cpp +++ b/source/module_cell/unitcell.cpp @@ -64,7 +64,6 @@ UnitCell::UnitCell() { atom_mass = nullptr; pseudo_fn = new std::string[1]; pseudo_type = new std::string[1]; - orbital_fn = new std::string[1]; set_atom_flag = false; } @@ -114,6 +113,15 @@ void UnitCell::bcast_unitcell() { Parallel_Common::bcast_int(lc[1]); Parallel_Common::bcast_int(lc[2]); + if(this->orbital_fn == nullptr) + { + this->orbital_fn = new std::string[ntype]; + } + for (int i = 0; i < ntype; i++) + { + Parallel_Common::bcast_string(orbital_fn[i]); + } + // distribute lattice vectors. 
Parallel_Common::bcast_double(a1.x); Parallel_Common::bcast_double(a1.y); diff --git a/source/module_cell/unitcell.h b/source/module_cell/unitcell.h index 1933a95c2f..af0d79a5c1 100644 --- a/source/module_cell/unitcell.h +++ b/source/module_cell/unitcell.h @@ -216,7 +216,7 @@ class UnitCell { std::string* pseudo_fn; std::string* pseudo_type; // pseudopotential types for each elements, // sunliang added 2022-09-15. - std::string* orbital_fn; // filenames of orbitals, liuyu add 2022-10-19 + std::string* orbital_fn = nullptr; // filenames of orbitals, liuyu add 2022-10-19 std::string descriptor_file; // filenames of descriptor_file, liuyu add 2023-04-06 diff --git a/source/module_elecstate/elecstate.h b/source/module_elecstate/elecstate.h index a90555a249..7640d43d0f 100644 --- a/source/module_elecstate/elecstate.h +++ b/source/module_elecstate/elecstate.h @@ -151,9 +151,7 @@ class ElecState return 0.0; } -#ifdef __LCAO double get_dftu_energy(); -#endif #ifdef __DEEPKS double get_deepks_E_delta(); diff --git a/source/module_elecstate/elecstate_energy.cpp b/source/module_elecstate/elecstate_energy.cpp index 86a02d7364..f016da85a5 100644 --- a/source/module_elecstate/elecstate_energy.cpp +++ b/source/module_elecstate/elecstate_energy.cpp @@ -287,7 +287,6 @@ void ElecState::cal_energies(const int type) } //! 
spin constrained energy -#ifdef __LCAO if (PARAM.inp.sc_mag_switch) { this->f_en.escon = get_spin_constrain_energy(); @@ -298,7 +297,6 @@ void ElecState::cal_energies(const int type) { this->f_en.edftu = get_dftu_energy(); } -#endif #ifdef __DEEPKS // energy from deepks diff --git a/source/module_elecstate/elecstate_energy_terms.cpp b/source/module_elecstate/elecstate_energy_terms.cpp index a4d7d98cf3..d820ba064e 100644 --- a/source/module_elecstate/elecstate_energy_terms.cpp +++ b/source/module_elecstate/elecstate_energy_terms.cpp @@ -34,12 +34,10 @@ double ElecState::get_solvent_model_Acav() return GlobalC::solvent_model.Acav; } -#ifdef __LCAO double ElecState::get_dftu_energy() { return GlobalC::dftu.get_energy(); } -#endif #ifdef __DEEPKS double ElecState::get_deepks_E_delta() diff --git a/source/module_esolver/esolver_ks_lcao.cpp b/source/module_esolver/esolver_ks_lcao.cpp index 3ffeac7712..63a1201bef 100644 --- a/source/module_esolver/esolver_ks_lcao.cpp +++ b/source/module_esolver/esolver_ks_lcao.cpp @@ -193,7 +193,8 @@ void ESolver_KS_LCAO::before_all_runners(UnitCell& ucell, const Input_pa // 7) initialize DFT+U if (PARAM.inp.dft_plus_u) { - GlobalC::dftu.init(ucell, &this->pv, this->kv.get_nks(), orb_); + auto* dftu = ModuleDFTU::DFTU::get_instance(); + dftu->init(ucell, &this->pv, this->kv.get_nks(), &orb_); } // 8) initialize ppcell @@ -1140,7 +1141,7 @@ void ESolver_KS_LCAO::after_scf(UnitCell& ucell, const int istep) //! 
Perform Mulliken charge analysis if (PARAM.inp.out_mul) { - ModuleIO::cal_mag(&(this->pv), this->p_hamilt, this->kv, this->pelec, ucell, istep, true); + ModuleIO::cal_mag(&(this->pv), this->p_hamilt, this->kv, this->pelec, this->two_center_bundle_, this->orb_, ucell, istep, true); } } diff --git a/source/module_esolver/esolver_ks_pw.cpp b/source/module_esolver/esolver_ks_pw.cpp index 33be890791..d57c4f5ffc 100644 --- a/source/module_esolver/esolver_ks_pw.cpp +++ b/source/module_esolver/esolver_ks_pw.cpp @@ -54,6 +54,10 @@ #include "module_base/kernels/dsp/dsp_connector.h" #endif +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h" +#include "module_hamilt_lcao/module_dftu/dftu.h" + namespace ModuleESolver { @@ -359,6 +363,46 @@ void ESolver_KS_PW::before_scf(UnitCell& ucell, const int istep) this->ppcell.cal_effective_D(veff, this->pw_rhod, ucell); + if(PARAM.inp.onsite_radius > 0) + { + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + onsite_p->init(PARAM.inp.orbital_dir, + &ucell, + *(this->kspw_psi), + this->kv, + *(this->pw_wfc), + this->sf, + PARAM.inp.onsite_radius, + PARAM.globalv.nqx, + PARAM.globalv.dq, + this->pelec->wg, + this->pelec->ekb); + } + + if (PARAM.inp.sc_mag_switch) + { + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + sc.init_sc(PARAM.inp.sc_thr, + PARAM.inp.nsc, + PARAM.inp.nsc_min, + PARAM.inp.alpha_trial, + PARAM.inp.sccut, + PARAM.inp.sc_drop_thr, + ucell, + nullptr, + PARAM.inp.nspin, + this->kv, + this->p_hamilt, + this->kspw_psi, + this->pelec, + this->pw_wfc); + } + + if(PARAM.inp.dft_plus_u) + { + auto* dftu = ModuleDFTU::DFTU::get_instance(); + dftu->init(ucell, nullptr, this->kv.get_nks()); + } // after init_rho (in pelec->init_scf), we have rho now. 
// before hamilt2density, we update Hk and initialize psi @@ -400,10 +444,55 @@ void ESolver_KS_PW::iter_init(UnitCell& ucell, const int istep, const if (iter == this->p_chgmix->mixing_restart_step && PARAM.inp.mixing_restart > 0.0) { this->p_chgmix->init_mixing(); + this->p_chgmix->mixing_restart_count++; + if (PARAM.inp.dft_plus_u) + { + auto* dftu = ModuleDFTU::DFTU::get_instance(); + if (dftu->uramping > 0.01 && !dftu->u_converged()) + { + this->p_chgmix->mixing_restart_step = PARAM.inp.scf_nmax + 1; + } + if (dftu->uramping > 0.01) + { + bool do_uramping = true; + if (PARAM.inp.sc_mag_switch) + { + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + if(!sc.mag_converged())// skip uramping if mag not converged + { + do_uramping = false; + } + } + if(do_uramping) + { + dftu->uramping_update(); // update U by uramping if uramping > 0.01 + std::cout << " U-Ramping! Current U = "; + for (int i = 0; i < dftu->U0.size(); i++) + { + std::cout << dftu->U[i] * ModuleBase::Ry_to_eV << " "; + } + std::cout << " eV " << std::endl; + } + } + } } // mohan move harris functional to here, 2012-06-05 // use 'rho(in)' and 'v_h and v_xc'(in) this->pelec->f_en.deband_harris = this->pelec->cal_delta_eband(); + + // update local occupations for DFT+U + // should before lambda loop in DeltaSpin + if (PARAM.inp.dft_plus_u && (iter != 1 || istep != 0)) + { + auto* dftu = ModuleDFTU::DFTU::get_instance(); + // only old DFT+U method should calculated energy correction in esolver, + // new DFT+U method will calculate energy in calculating Hamiltonian + if (dftu->omc != 2) + { + dftu->cal_occ_pw(iter, this->kspw_psi, this->pelec->wg, ucell, PARAM.inp.mixing_beta); + } + dftu->output(ucell); + } } // Temporary, it should be replaced by hsolver later. @@ -431,27 +520,49 @@ void ESolver_KS_PW::hamilt2density_single(UnitCell& ucell, } bool skip_charge = PARAM.inp.calculation == "nscf" ? 
true : false; - hsolver::HSolverPW hsolver_pw_obj(this->pw_wfc, - PARAM.inp.calculation, - PARAM.inp.basis_type, - PARAM.inp.ks_solver, - PARAM.inp.use_paw, - PARAM.globalv.use_uspp, - PARAM.inp.nspin, - hsolver::DiagoIterAssist::SCF_ITER, - hsolver::DiagoIterAssist::PW_DIAG_NMAX, - hsolver::DiagoIterAssist::PW_DIAG_THR, - hsolver::DiagoIterAssist::need_subspace); - - hsolver_pw_obj.solve(this->p_hamilt, - this->kspw_psi[0], - this->pelec, - this->pelec->ekb.c, - GlobalV::RANK_IN_POOL, - GlobalV::NPROC_IN_POOL, - skip_charge, - ucell.tpiba, - ucell.nat); + // run the inner lambda loop to contrain atomic moments with the DeltaSpin method + bool skip_solve = false; + if (PARAM.inp.sc_mag_switch) + { + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + if(!sc.mag_converged() && this->drho>0 && this->drho < PARAM.inp.sc_scf_thr) + { + // optimize lambda to get target magnetic moments, but the lambda is not near target + sc.run_lambda_loop(iter-1); + sc.set_mag_converged(true); + skip_solve = true; + } + else if(sc.mag_converged()) + { + // optimize lambda to get target magnetic moments, but the lambda is not near target + sc.run_lambda_loop(iter-1); + skip_solve = true; + } + } + if(!skip_solve) + { + hsolver::HSolverPW hsolver_pw_obj(this->pw_wfc, + PARAM.inp.calculation, + PARAM.inp.basis_type, + PARAM.inp.ks_solver, + PARAM.inp.use_paw, + PARAM.globalv.use_uspp, + PARAM.inp.nspin, + hsolver::DiagoIterAssist::SCF_ITER, + hsolver::DiagoIterAssist::PW_DIAG_NMAX, + hsolver::DiagoIterAssist::PW_DIAG_THR, + hsolver::DiagoIterAssist::need_subspace); + + hsolver_pw_obj.solve(this->p_hamilt, + this->kspw_psi[0], + this->pelec, + this->pelec->ekb.c, + GlobalV::RANK_IN_POOL, + GlobalV::NPROC_IN_POOL, + skip_charge, + ucell.tpiba, + ucell.nat); + } Symmetry_rho srho; for (int is = 0; is < PARAM.inp.nspin; is++) @@ -517,6 +628,20 @@ void ESolver_KS_PW::iter_finish(UnitCell& ucell, const int istep, int // functions into file WAVEFUNC.dat"); } } 
+ // 4) check if oscillate for delta_spin method + if(PARAM.inp.sc_mag_switch) + { + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + if(!sc.higher_mag_prec) + { + sc.higher_mag_prec = + this->p_chgmix->if_scf_oscillate(iter, this->drho, PARAM.inp.sc_os_ndim, PARAM.inp.scf_os_thr); + if(sc.higher_mag_prec) + { // if oscillate, increase the precision of magnetization and do mixing_restart in next iteration + this->p_chgmix->mixing_restart_step = iter + 1; + } + } + } } template @@ -600,6 +725,22 @@ void ESolver_KS_PW::after_scf(UnitCell& ucell, const int istep) bp.Macroscopic_polarization(ucell,this->pw_wfc->npwk_max, this->psi, this->pw_rho, this->pw_wfc, this->kv); std::cout << FmtCore::format(" >> Finish %s.\n * * * * * *\n", "Berry phase polarization"); } + + // 8) write spin constrian results + // spin constrain calculations, write atomic magnetization and magnetic force. + if (PARAM.inp.sc_mag_switch) { + spinconstrain::SpinConstrain>& sc + = spinconstrain::SpinConstrain>::getScInstance(); + sc.cal_mi_pw(); + sc.print_Mag_Force(GlobalV::ofs_running); + } + + // 9) write onsite occupations for charge and magnetizations + if(PARAM.inp.onsite_radius > 0) + { // float type has not been implemented + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + onsite_p->cal_occupations(reinterpret_cast, Device>*>(this->kspw_psi), this->pelec->wg); + } } template diff --git a/source/module_esolver/lcao_before_scf.cpp b/source/module_esolver/lcao_before_scf.cpp index b02c92729d..2066b5069b 100644 --- a/source/module_esolver/lcao_before_scf.cpp +++ b/source/module_esolver/lcao_before_scf.cpp @@ -238,7 +238,6 @@ void ESolver_KS_LCAO::before_scf(UnitCell& ucell, const int istep) &(this->pv), PARAM.inp.nspin, this->kv, - PARAM.inp.ks_solver, this->p_hamilt, this->psi, this->pelec); diff --git a/source/module_esolver/lcao_others.cpp b/source/module_esolver/lcao_others.cpp index 50012599c1..fa82e38ff6 100644 --- 
a/source/module_esolver/lcao_others.cpp +++ b/source/module_esolver/lcao_others.cpp @@ -242,7 +242,6 @@ void ESolver_KS_LCAO::others(UnitCell& ucell, const int istep) &(this->pv), PARAM.inp.nspin, this->kv, - PARAM.inp.ks_solver, this->p_hamilt, this->psi, this->pelec); diff --git a/source/module_hamilt_general/operator.h b/source/module_hamilt_general/operator.h index f040efc710..6cf29122fe 100644 --- a/source/module_hamilt_general/operator.h +++ b/source/module_hamilt_general/operator.h @@ -17,6 +17,7 @@ enum class calculation_type pw_nonlocal, pw_veff, pw_meta, + pw_onsite, lcao_overlap, lcao_fixed, lcao_gint, diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp index 9da6f58564..94c5c74db7 100644 --- a/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp +++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw.cpp @@ -5,7 +5,7 @@ #include "module_base/scalapack_connector.h" #include "module_base/tool_title.h" #include "module_base/timer.h" -//#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" #include "spin_constrain.h" #include "module_parameter/parameter.h" #ifdef __LCAO @@ -51,7 +51,7 @@ void spinconstrain::SpinConstrain>::cal_mi_lcao(const int& #endif -/*template <> +template <> void spinconstrain::SpinConstrain>::cal_mi_pw() { ModuleBase::TITLE("module_deltaspin", "cal_mi_pw"); @@ -154,7 +154,7 @@ void spinconstrain::SpinConstrain>::cal_mi_pw() Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, GlobalV::NPROC_IN_POOL, &(this->Mi_[0][0]), 3 * this->Mi_.size()); ModuleBase::timer::tick("spinconstrain::SpinConstrain", "cal_mi_pw"); -}*/ +} template <> void spinconstrain::SpinConstrain>::set_operator( diff --git a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp index 4ce31dfeda..87a2fa41cc 100644 --- 
a/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp +++ b/source/module_hamilt_lcao/module_deltaspin/cal_mw_from_lambda.cpp @@ -3,10 +3,12 @@ #include "module_hsolver/diago_iter_assist.h" #include "module_parameter/parameter.h" #include "spin_constrain.h" -//#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" #include "module_base/parallel_reduce.h" #include "module_hsolver/kernels/math_kernel_op.h" #include "module_hsolver/hsolver_lcao.h" +#include "module_hsolver/hsolver_pw.h" +#include "module_elecstate/elecstate_pw.h" #ifdef __LCAO #include "module_elecstate/elecstate_lcao.h" @@ -18,7 +20,23 @@ template <> void spinconstrain::SpinConstrain>::calculate_delta_hcc(std::complex* h_tmp, const std::complex* becp_k, const ModuleBase::Vector3* delta_lambda, const int nbands, const int nkb, const int* nh_iat) { int sum = 0; - std::vector> ps(nkb * 2 * nbands, 0.0); + int size_ps = nkb * 2 * nbands; + std::complex* becp_cpu = nullptr; + if(PARAM.inp.device == "gpu") + { +#if ((defined __CUDA) || (defined __ROCM)) + base_device::DEVICE_GPU* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + base_device::memory::resize_memory_op, base_device::DEVICE_CPU>()(cpu_ctx, becp_cpu, size_ps); + base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, becp_cpu, becp_k, size_ps); +#endif + } + else if (PARAM.inp.device == "cpu") + { + becp_cpu = const_cast*>(becp_k); + } + + std::vector> ps(size_ps, 0.0); for (int iat = 0; iat < this->Mi_.size(); iat++) { const int nproj = nh_iat[iat]; @@ -34,8 +52,8 @@ void spinconstrain::SpinConstrain>::calculate_delta_hcc(std for (int ip = 0; ip < nproj; ip++) { const int becpind = ib * nkb + sum + ip; - const std::complex becp1 = becp_k[becpind]; - const std::complex becp2 = becp_k[becpind + nkb]; + const std::complex becp1 = becp_cpu[becpind]; + const std::complex becp2 = becp_cpu[becpind + nkb]; 
ps[becpind] += coefficients0 * becp1 + coefficients2 * becp2; ps[becpind + nkb] += coefficients1 * becp1 @@ -44,27 +62,69 @@ void spinconstrain::SpinConstrain>::calculate_delta_hcc(std } // end ib sum += nproj; } // end iat + std::complex* ps_pointer = nullptr; + if(PARAM.inp.device == "gpu") + { +#if ((defined __CUDA) || (defined __ROCM)) + base_device::DEVICE_GPU* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, ps_pointer, size_ps); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_CPU>()(ctx, cpu_ctx, ps_pointer, ps.data(), size_ps); +#endif + } + else if (PARAM.inp.device == "cpu") + { + ps_pointer = ps.data(); + } // update h_tmp by becp_k * ps char transa = 'C'; char transb = 'N'; const int npm = nkb * 2; - base_device::DEVICE_CPU* ctx = {}; - hsolver::gemm_op, base_device::DEVICE_CPU>()( - ctx, - transa, - transb, - nbands, - nbands, - npm, - &ModuleBase::ONE, - becp_k, - npm, - ps.data(), - npm, - &ModuleBase::ONE, - h_tmp, - nbands - ); + if (PARAM.inp.device == "gpu") + { +#if ((defined __CUDA) || (defined __ROCM)) + base_device::DEVICE_GPU* ctx = {}; + hsolver::gemm_op, base_device::DEVICE_GPU>()( + ctx, + transa, + transb, + nbands, + nbands, + npm, + &ModuleBase::ONE, + becp_k, + npm, + ps_pointer, + npm, + &ModuleBase::ONE, + h_tmp, + nbands + ); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, ps_pointer); + delete[] becp_cpu; +#endif + + } + else if (PARAM.inp.device == "cpu") + { + base_device::DEVICE_CPU* ctx = {}; + hsolver::gemm_op, base_device::DEVICE_CPU>()( + ctx, + transa, + transb, + nbands, + nbands, + npm, + &ModuleBase::ONE, + becp_k, + npm, + ps_pointer, + npm, + &ModuleBase::ONE, + h_tmp, + nbands + ); + } } template <> @@ -103,149 +163,196 @@ void spinconstrain::SpinConstrain>::cal_mw_from_lambda(int else #endif { - /*this->zero_Mi(); - int size_becp = 0; - std::vector> becp_tmp; - int nk = 0; - 
int nkb = 0; - int nbands = 0; - int npol = 0; - const int* nh_iat = nullptr; - if (PARAM.inp.device == "cpu") + /*if (i_step == -1 && this->higher_mag_prec) { - psi::Psi>* psi_t = static_cast>*>(this->psi); - hamilt::Hamilt, base_device::DEVICE_CPU>* hamilt_t = static_cast, base_device::DEVICE_CPU>*>(this->p_hamilt); - auto* onsite_p = projectors::OnsiteProjector::get_instance(); - nbands = psi_t->get_nbands(); - npol = psi_t->npol; - nkb = onsite_p->get_tot_nproj(); - nk = psi_t->get_nk(); - nh_iat = &onsite_p->get_nh(0); - size_becp = nbands * nkb * npol; - becp_tmp.resize(size_becp * nk); - std::vector> h_tmp(nbands * nbands), s_tmp(nbands * nbands); - int initial_hs = 0; - if(this->sub_h_save == nullptr) + // std::cout<<__FILE__<<__LINE__<<"istep == 0"<sub_h_save = new std::complex[nbands * nbands * nk]; - this->sub_s_save = new std::complex[nbands * nbands * nk]; - this->becp_save = new std::complex[size_becp * nk]; + psi::Psi>* psi_t = static_cast>*>(this->psi); + hamilt::Hamilt>* hamilt_t = static_cast>*>(this->p_hamilt); + hsolver::HSolver, base_device::DEVICE_CPU>* hsolver_t = static_cast, base_device::DEVICE_CPU>*>(this->phsol); + hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, true); } - for (int ik = 0; ik < nk; ++ik) + else { - - psi_t->fix_k(ik); - - std::complex* h_k = this->sub_h_save + ik * nbands * nbands; - std::complex* s_k = this->sub_s_save + ik * nbands * nbands; - std::complex* becp_k = this->becp_save + ik * size_becp; - if(initial_hs) + psi::Psi, base_device::DEVICE_GPU>* psi_t = static_cast, base_device::DEVICE_GPU>*>(this->psi); + hamilt::Hamilt, base_device::DEVICE_GPU>* hamilt_t = static_cast, base_device::DEVICE_GPU>*>(this->p_hamilt); + hsolver::HSolver, base_device::DEVICE_GPU>* hsolver_t = static_cast, base_device::DEVICE_GPU>*>(this->phsol); + hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, true); + } + this->pelec->calculate_weights(); + this->cal_Mi_pw(); + } + else*/ + { + this->zero_Mi(); 
+ int size_becp = 0; + std::vector> becp_tmp; + int nk = 0; + int nkb = 0; + int nbands = 0; + int npol = 0; + const int* nh_iat = nullptr; + if (PARAM.inp.device == "cpu") + { + psi::Psi>* psi_t = static_cast>*>(this->psi); + hamilt::Hamilt, base_device::DEVICE_CPU>* hamilt_t = static_cast, base_device::DEVICE_CPU>*>(this->p_hamilt); + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + nbands = psi_t->get_nbands(); + npol = psi_t->npol; + nkb = onsite_p->get_tot_nproj(); + nk = psi_t->get_nk(); + nh_iat = &onsite_p->get_nh(0); + size_becp = nbands * nkb * npol; + becp_tmp.resize(size_becp * nk); + std::vector> h_tmp(nbands * nbands), s_tmp(nbands * nbands); + int initial_hs = 0; + if(this->sub_h_save == nullptr) { - /// update H(k) for each k point - hamilt_t->updateHk(ik); - hsolver::DiagoIterAssist>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k); - memcpy(becp_k, onsite_p->get_becp(), sizeof(std::complex) * size_becp); + initial_hs = 1; + this->sub_h_save = new std::complex[nbands * nbands * nk]; + this->sub_s_save = new std::complex[nbands * nbands * nk]; + this->becp_save = new std::complex[size_becp * nk]; } - memcpy(h_tmp.data(), h_k, sizeof(std::complex) * nbands * nbands); - memcpy(s_tmp.data(), s_k, sizeof(std::complex) * nbands * nbands); - // update h_tmp by delta_lambda - if (i_step != -1) this->calculate_delta_hcc(h_tmp.data(), becp_k, delta_lambda, nbands, nkb, nh_iat); + for (int ik = 0; ik < nk; ++ik) + { - hsolver::DiagoIterAssist>::diag_responce(h_tmp.data(), - s_tmp.data(), - nbands, - becp_k, - &becp_tmp[ik * size_becp], - nkb * 2, - &this->pelec->ekb(ik, 0)); + psi_t->fix_k(ik); + + std::complex* h_k = this->sub_h_save + ik * nbands * nbands; + std::complex* s_k = this->sub_s_save + ik * nbands * nbands; + std::complex* becp_k = this->becp_save + ik * size_becp; + if(initial_hs) + { + /// update H(k) for each k point + hamilt_t->updateHk(ik); + hsolver::DiagoIterAssist>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k); + 
memcpy(becp_k, onsite_p->get_becp(), sizeof(std::complex) * size_becp); + } + memcpy(h_tmp.data(), h_k, sizeof(std::complex) * nbands * nbands); + memcpy(s_tmp.data(), s_k, sizeof(std::complex) * nbands * nbands); + // update h_tmp by delta_lambda + if (i_step != -1) this->calculate_delta_hcc(h_tmp.data(), becp_k, delta_lambda, nbands, nkb, nh_iat); + + hsolver::DiagoIterAssist>::diag_responce(h_tmp.data(), + s_tmp.data(), + nbands, + becp_k, + &becp_tmp[ik * size_becp], + nkb * 2, + &this->pelec->ekb(ik, 0)); + } } - } #if ((defined __CUDA) || (defined __ROCM)) - else - { - base_device::DEVICE_GPU* ctx = {}; - base_device::DEVICE_CPU* cpu_ctx = {}; - psi::Psi, base_device::DEVICE_GPU>* psi_t = static_cast, base_device::DEVICE_GPU>*>(this->psi); - hamilt::Hamilt, base_device::DEVICE_GPU>* hamilt_t = static_cast, base_device::DEVICE_GPU>*>(this->p_hamilt); - auto* onsite_p = projectors::OnsiteProjector::get_instance(); - nbands = psi_t->get_nbands(); - npol = psi_t->npol; - nkb = onsite_p->get_tot_nproj(); - nk = psi_t->get_nk(); - nh_iat = &onsite_p->get_nh(0); - size_becp = nbands * nkb * npol; - becp_tmp.resize(size_becp * nk); - std::complex* becp_pointer = nullptr; - // allocate memory for becp_pointer in GPU device - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp); - for (int ik = 0; ik < nk; ++ik) + else { - /// update H(k) for each k point - hamilt_t->updateHk(ik); + base_device::DEVICE_GPU* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + psi::Psi, base_device::DEVICE_GPU>* psi_t = static_cast, base_device::DEVICE_GPU>*>(this->psi); + hamilt::Hamilt, base_device::DEVICE_GPU>* hamilt_t = static_cast, base_device::DEVICE_GPU>*>(this->p_hamilt); + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + nbands = psi_t->get_nbands(); + npol = psi_t->npol; + nkb = onsite_p->get_tot_nproj(); + nk = psi_t->get_nk(); + nh_iat = &onsite_p->get_nh(0); + size_becp = nbands * nkb * npol; + 
becp_tmp.resize(size_becp * nk); + std::complex* h_tmp = nullptr; + std::complex* s_tmp = nullptr; + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands); + int initial_hs = 0; + if(this->sub_h_save == nullptr) + { + initial_hs = 1; + + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, this->sub_h_save, nbands * nbands * nk); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, this->sub_s_save, nbands * nbands * nk); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, this->becp_save, size_becp * nk); + } + std::complex* becp_pointer = nullptr; + // allocate memory for becp_pointer in GPU device + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp); + for (int ik = 0; ik < nk; ++ik) + { + psi_t->fix_k(ik); - psi_t->fix_k(ik); + std::complex* h_k = this->sub_h_save + ik * nbands * nbands; + std::complex* s_k = this->sub_s_save + ik * nbands * nbands; + std::complex* becp_k = this->becp_save + ik * size_becp; + if(initial_hs) + { + /// update H(k) for each k point + hamilt_t->updateHk(ik); + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::cal_hs_subspace(hamilt_t, psi_t[0], h_k, s_k); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, becp_k, onsite_p->get_becp(), size_becp); + } + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands); + // update h_tmp by delta_lambda + if (i_step != -1) this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat); - const std::complex* becp_new = onsite_p->get_becp(); - hsolver::DiagoIterAssist, 
base_device::DEVICE_GPU>::diag_responce(hamilt_t, - psi_t[0], - becp_new, - becp_pointer, - nkb * npol, - &this->pelec->ekb(ik, 0)); - // copy becp_pointer from GPU to CPU - base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp); - } + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::diag_responce(h_tmp, + s_tmp, + nbands, + becp_k, + becp_pointer, + nkb * npol, + &this->pelec->ekb(ik, 0)); + // copy becp_pointer from GPU to CPU + base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp); + } - // free memory for becp_pointer in GPU device - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer); - } + // free memory for becp_pointer in GPU device + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer); + } #endif - // calculate weights from ekb to update wg - this->pelec->calculate_weights(); - // calculate Mi from existed becp - for (int ik = 0; ik < nk; ik++) - { - const std::complex* becp = &becp_tmp[ik * size_becp]; - // becp(nbands*npol , nkb) - // mag = wg * \sum_{nh}becp * becp - for (int ib = 0; ib < nbands; ib++) + // calculate weights from ekb to update wg + this->pelec->calculate_weights(); + // calculate Mi from existed becp + for (int ik = 0; ik < nk; ik++) { - const double weight = this->pelec->wg(ik, ib); - int begin_ih = 0; - for (int iat = 0; iat < this->Mi_.size(); iat++) + const std::complex* becp = &becp_tmp[ik * size_becp]; + // becp(nbands*npol , nkb) + // mag = wg * \sum_{nh}becp * becp + for (int ib = 0; ib < nbands; ib++) { - const int nh = nh_iat[iat]; - std::complex occ[4] - = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO}; - for (int ih = 0; ih < nh; ih++) + const double weight = this->pelec->wg(ik, ib); + int begin_ih = 0; + for (int iat = 
0; iat < this->Mi_.size(); iat++) { - const int index = ib * npol * nkb + begin_ih + ih; - occ[0] += conj(becp[index]) * becp[index]; - occ[1] += conj(becp[index]) * becp[index + nkb]; - occ[2] += conj(becp[index + nkb]) * becp[index]; - occ[3] += conj(becp[index + nkb]) * becp[index + nkb]; + const int nh = nh_iat[iat]; + std::complex occ[4] + = {ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO, ModuleBase::ZERO}; + for (int ih = 0; ih < nh; ih++) + { + const int index = ib * npol * nkb + begin_ih + ih; + occ[0] += conj(becp[index]) * becp[index]; + occ[1] += conj(becp[index]) * becp[index + nkb]; + occ[2] += conj(becp[index + nkb]) * becp[index]; + occ[3] += conj(becp[index + nkb]) * becp[index + nkb]; + } + // occ has been reduced and calculate mag + this->Mi_[iat].x += weight * (occ[1] + occ[2]).real(); + this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag(); + this->Mi_[iat].z += weight * (occ[0] - occ[3]).real(); + begin_ih += nh; } - // occ has been reduced and calculate mag - this->Mi_[iat].x += weight * (occ[1] + occ[2]).real(); - this->Mi_[iat].y += weight * (occ[1] - occ[2]).imag(); - this->Mi_[iat].z += weight * (occ[0] - occ[3]).real(); - begin_ih += nh; } } + Parallel_Reduce::reduce_double_allpool(GlobalV::KPAR, + GlobalV::NPROC_IN_POOL, + &(this->Mi_[0][0]), + 3 * this->Mi_.size()); + // for(int i = 0; i < this->Mi_.size(); i++) + //{ + // std::cout<<"atom"<Mi_[i].x<<" "<Mi_[i].y<<" "<Mi_[i].z<<" + // "<lambda_[i].x<<" "<lambda_[i].y<<" "<lambda_[i].z<Mi_[0][0]), - 3 * this->Mi_.size()); - // for(int i = 0; i < this->Mi_.size(); i++) - //{ - // std::cout<<"atom"<Mi_[i].x<<" "<Mi_[i].y<<" "<Mi_[i].z<<" - // "<lambda_[i].x<<" "<lambda_[i].y<<" "<lambda_[i].z< @@ -262,7 +369,7 @@ void spinconstrain::SpinConstrain>::update_psi_charge(const else #endif { - /*int size_becp = 0; + int size_becp = 0; std::vector> becp_tmp; int nk = 0; int nkb = 0; @@ -311,8 +418,27 @@ void spinconstrain::SpinConstrain>::update_psi_charge(const if(pw_solve) { - 
hsolver::HSolver, base_device::DEVICE_CPU>* hsolver_t = static_cast, base_device::DEVICE_CPU>*>(this->phsol); - hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, this->KS_SOLVER, false); + hsolver::HSolverPW, base_device::DEVICE_CPU> hsolver_pw_obj(this->pw_wfc_, + PARAM.inp.calculation, + PARAM.inp.basis_type, + PARAM.inp.ks_solver, + PARAM.inp.use_paw, + PARAM.globalv.use_uspp, + PARAM.inp.nspin, + hsolver::DiagoIterAssist, base_device::DEVICE_CPU>::SCF_ITER, + hsolver::DiagoIterAssist, base_device::DEVICE_CPU>::PW_DIAG_NMAX, + hsolver::DiagoIterAssist, base_device::DEVICE_CPU>::PW_DIAG_THR, + hsolver::DiagoIterAssist, base_device::DEVICE_CPU>::need_subspace); + + hsolver_pw_obj.solve(hamilt_t, + psi_t[0], + this->pelec, + this->pelec->ekb.c, + GlobalV::RANK_IN_POOL, + GlobalV::NPROC_IN_POOL, + false, + this->tpiba, + this->get_nat()); } else {// update charge density only @@ -333,36 +459,69 @@ void spinconstrain::SpinConstrain>::update_psi_charge(const nk = psi_t->get_nk(); nh_iat = &onsite_p->get_nh(0); size_becp = nbands * nkb * npol; - becp_tmp.resize(size_becp * nk); - std::complex* becp_pointer = nullptr; - // allocate memory for becp_pointer in GPU device - base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer, size_becp); + + std::complex* h_tmp = nullptr; + std::complex* s_tmp = nullptr; + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, h_tmp, nbands * nbands); + base_device::memory::resize_memory_op, base_device::DEVICE_GPU>()(ctx, s_tmp, nbands * nbands); + assert(this->sub_h_save != nullptr); + assert(this->sub_s_save != nullptr); + assert(this->becp_save != nullptr); for (int ik = 0; ik < nk; ++ik) { - /// update H(k) for each k point - hamilt_t->updateHk(ik); + std::complex* h_k = this->sub_h_save + ik * nbands * nbands; + std::complex* s_k = this->sub_s_save + ik * nbands * nbands; + std::complex* becp_k = this->becp_save + ik * size_becp; psi_t->fix_k(ik); - - const std::complex* becp_new = 
onsite_p->get_becp(); - hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::diag_responce(hamilt_t, + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, h_tmp, h_k, nbands * nbands); + base_device::memory::synchronize_memory_op, base_device::DEVICE_GPU, base_device::DEVICE_GPU>()(ctx, ctx, s_tmp, s_k, nbands * nbands); + this->calculate_delta_hcc(h_tmp, becp_k, delta_lambda, nbands, nkb, nh_iat); + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::diag_subspace_psi(h_tmp, + s_tmp, + nbands, psi_t[0], - becp_new, - becp_pointer, - nkb * npol, &this->pelec->ekb(ik, 0)); - // copy becp_pointer from GPU to CPU - base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, base_device::DEVICE_GPU>()(cpu_ctx, ctx, &becp_tmp[ik * size_becp], becp_pointer, size_becp); } - // free memory for becp_pointer in GPU device - base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, becp_pointer); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, sub_h_save); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, sub_s_save); + base_device::memory::delete_memory_op, base_device::DEVICE_GPU>()(ctx, becp_save); + this->sub_h_save = nullptr; + this->sub_s_save = nullptr; + this->becp_save = nullptr; + + if(pw_solve) + { + hsolver::HSolverPW, base_device::DEVICE_GPU> hsolver_pw_obj(this->pw_wfc_, + PARAM.inp.calculation, + PARAM.inp.basis_type, + PARAM.inp.ks_solver, + PARAM.inp.use_paw, + PARAM.globalv.use_uspp, + PARAM.inp.nspin, + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::SCF_ITER, + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::PW_DIAG_NMAX, + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::PW_DIAG_THR, + hsolver::DiagoIterAssist, base_device::DEVICE_GPU>::need_subspace); - hsolver::HSolver, base_device::DEVICE_GPU>* hsolver_t = static_cast, base_device::DEVICE_GPU>*>(this->phsol); - hsolver_t->solve(hamilt_t, psi_t[0], this->pelec, 
this->KS_SOLVER, false); + hsolver_pw_obj.solve(hamilt_t, + psi_t[0], + this->pelec, + this->pelec->ekb.c, + GlobalV::RANK_IN_POOL, + GlobalV::NPROC_IN_POOL, + false, + this->tpiba, + this->get_nat()); + } + else + {// update charge density only + reinterpret_cast, base_device::DEVICE_GPU>*>(this->pelec)->psiToRho(*psi_t); + } + } -#endif - */ +#endif } ModuleBase::timer::tick("spinconstrain::SpinConstrain", "update_psi_charge"); } diff --git a/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp b/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp index 45cb492780..fbba82a839 100644 --- a/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp +++ b/source/module_hamilt_lcao/module_deltaspin/init_sc.cpp @@ -12,10 +12,10 @@ void spinconstrain::SpinConstrain::init_sc(double sc_thr_in, Parallel_Orbitals* ParaV_in, int nspin_in, K_Vectors& kv_in, - std::string KS_SOLVER_in, void* p_hamilt_in, void* psi_in, - elecstate::ElecState* pelec_in) + elecstate::ElecState* pelec_in, + ModulePW::PW_Basis_K* pw_wfc_in) { this->set_input_parameters(sc_thr_in, nsc_in, nsc_min_in, alpha_trial_in, sccut_in, sc_drop_thr_in); this->set_atomCounts(ucell.get_atom_Counts()); @@ -26,9 +26,11 @@ void spinconstrain::SpinConstrain::init_sc(double sc_thr_in, this->lambda_ = ucell.get_lambda(); this->constrain_ = ucell.get_constrain(); this->atomLabels_ = ucell.get_atomLabels(); + this->tpiba = ucell.tpiba; + this->pw_wfc_ = pw_wfc_in; this->set_decay_grad(); if(ParaV_in != nullptr) this->set_ParaV(ParaV_in); - this->set_solver_parameters(kv_in, p_hamilt_in, psi_in, pelec_in, KS_SOLVER_in); + this->set_solver_parameters(kv_in, p_hamilt_in, psi_in, pelec_in); } template class spinconstrain::SpinConstrain>; diff --git a/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp b/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp index f4eb1f7edc..cad7b64c7c 100644 --- a/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp +++ 
b/source/module_hamilt_lcao/module_deltaspin/lambda_loop.cpp @@ -202,7 +202,7 @@ void spinconstrain::SpinConstrain>::run_lambda_loop(int out { //add_scalar_multiply_2d(initial_lambda, dnu_last_step, 1.0, this->lambda_); this->update_psi_charge(dnu_last_step.data(), rerun); - /*if(PARAM.inp.basis_type == "pw") + if(PARAM.inp.basis_type == "pw") { //double check Atomic spin moment this->cal_mi_pw(); @@ -224,7 +224,7 @@ void spinconstrain::SpinConstrain>::run_lambda_loop(int out std::cout<<"Error: RMS error is too large, rerun the loop"<run_lambda_loop(outer_step, false); } - }*/ + } break; } #ifdef __MPI diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp index 30ac4d7dfd..1339fc4601 100644 --- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp +++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.cpp @@ -490,14 +490,12 @@ template void SpinConstrain::set_solver_parameters(K_Vectors& kv_in, void* p_hamilt_in, void* psi_in, - elecstate::ElecState* pelec_in, - std::string KS_SOLVER_in) + elecstate::ElecState* pelec_in) { this->kv_ = kv_in; this->p_hamilt = p_hamilt_in; this->psi = psi_in; this->pelec = pelec_in; - this->KS_SOLVER = KS_SOLVER_in; } /// @brief set ParaV diff --git a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h index 866f2373e0..2e7cf6c8db 100644 --- a/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h +++ b/source/module_hamilt_lcao/module_deltaspin/spin_constrain.h @@ -37,17 +37,17 @@ class SpinConstrain Parallel_Orbitals* ParaV_in, int nspin_in, K_Vectors& kv_in, - std::string KS_SOLVER_in, void* p_hamilt_in, void* psi_in, - elecstate::ElecState* pelec_in); + elecstate::ElecState* pelec_in, + ModulePW::PW_Basis_K* pw_wfc_in = nullptr); /// @brief calculate the magnetization of each atom with real space projection method for LCAO base /// @param step : the step 
number of the SCF calculation /// @param print : print the magnetization of each atom if true void cal_mi_lcao(const int& step, bool print = false); - //void cal_mi_pw(); + void cal_mi_pw(); void cal_mw_from_lambda(int i_step, const ModuleBase::Vector3* delta_lambda = nullptr); @@ -108,7 +108,8 @@ class SpinConstrain void* p_hamilt = nullptr; void* psi = nullptr; elecstate::ElecState* pelec = nullptr; - std::string KS_SOLVER; + ModulePW::PW_Basis_K* pw_wfc_ = nullptr; + double tpiba = 0.0; /// save ucell.tpiba const double meV_to_Ry = 7.349864435130999e-05; K_Vectors kv_; //-------------------------------------------------------------------------------- @@ -203,8 +204,7 @@ class SpinConstrain void set_solver_parameters(K_Vectors& kv_in, void* p_hamilt_in, void* psi_in, - elecstate::ElecState* pelec_in, - std::string KS_SOLVER_in); + elecstate::ElecState* pelec_in); private: SpinConstrain(){}; // Private constructor diff --git a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp index 72f2941a75..1fd36524e3 100644 --- a/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp +++ b/source/module_hamilt_lcao/module_deltaspin/test/spin_constrain_test.cpp @@ -149,12 +149,11 @@ TYPED_TEST(SpinConstrainTest, SetSolverParameters) { K_Vectors kv; this->sc.set_nspin(4); - this->sc.set_solver_parameters(kv, nullptr, nullptr, nullptr, "genelpa"); + this->sc.set_solver_parameters(kv, nullptr, nullptr, nullptr); EXPECT_EQ(this->sc.get_nspin(), 4); EXPECT_EQ(this->sc.p_hamilt, nullptr); EXPECT_EQ(this->sc.psi, nullptr); EXPECT_EQ(this->sc.pelec, nullptr); - EXPECT_EQ(this->sc.KS_SOLVER, "genelpa"); } TYPED_TEST(SpinConstrainTest, SetParaV) diff --git a/source/module_hamilt_lcao/module_dftu/CMakeLists.txt b/source/module_hamilt_lcao/module_dftu/CMakeLists.txt index dd7197dbfd..d412154970 100644 --- a/source/module_hamilt_lcao/module_dftu/CMakeLists.txt +++ 
b/source/module_hamilt_lcao/module_dftu/CMakeLists.txt @@ -1,22 +1,21 @@ -if(ENABLE_LCAO) - list(APPEND objects - dftu.cpp - dftu_force.cpp - dftu_yukawa.cpp - dftu_folding.cpp - dftu_io.cpp - dftu_tools.cpp - dftu_occup.cpp - dftu_hamilt.cpp - ) +list(APPEND objects + dftu.cpp + dftu_force.cpp + dftu_yukawa.cpp + dftu_folding.cpp + dftu_io.cpp + dftu_tools.cpp + dftu_occup.cpp + dftu_hamilt.cpp + dftu_pw.cpp +) - add_library( - dftu - OBJECT - ${objects} - ) +add_library( + dftu + OBJECT + ${objects} +) - if(ENABLE_COVERAGE) - add_coverage(dftu) - endif() +if(ENABLE_COVERAGE) + add_coverage(dftu) endif() \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_dftu/dftu.cpp b/source/module_hamilt_lcao/module_dftu/dftu.cpp index 9f5eb09a3a..2dd705a03c 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu.cpp +++ b/source/module_hamilt_lcao/module_dftu/dftu.cpp @@ -38,8 +38,10 @@ DFTU::~DFTU() void DFTU::init(UnitCell& cell, // unitcell class const Parallel_Orbitals* pv, - const int& nks, - const LCAO_Orbitals& orb + const int nks +#ifdef __LCAO + , const LCAO_Orbitals* orb +#endif ) { ModuleBase::TITLE("DFTU", "init"); @@ -50,9 +52,14 @@ void DFTU::init(UnitCell& cell, // unitcell class #endif this->paraV = pv; - - ptr_orb_ = &orb; - orb_cutoff_ = orb.cutoffs(); + +#ifdef __LCAO + ptr_orb_ = orb; + if(ptr_orb_ != nullptr) + { + orb_cutoff_ = orb->cutoffs(); + } +#endif // needs reconstructions in future // global parameters, need to be removed in future @@ -64,6 +71,9 @@ void DFTU::init(UnitCell& cell, // unitcell class this->locale.resize(cell.nat); this->locale_save.resize(cell.nat); + // only for PW base + this->eff_pot_pw_index.resize(cell.nat); + int pot_index = 0; this->iatlnmipol2iwt.resize(cell.nat); @@ -80,6 +90,10 @@ void DFTU::init(UnitCell& cell, // unitcell class locale[iat].resize(cell.atoms[it].nwl + 1); locale_save[iat].resize(cell.atoms[it].nwl + 1); + const int tlp1_npol = (this->orbital_corr[it]*2+1)*npol; + 
this->eff_pot_pw_index[iat] = pot_index; + pot_index += tlp1_npol * tlp1_npol; + for (int l = 0; l <= cell.atoms[it].nwl; l++) { const int N = cell.atoms[it].l_nchi[l]; @@ -143,6 +157,8 @@ void DFTU::init(UnitCell& cell, // unitcell class } } } + // allocate memory for eff_pot_pw + this->eff_pot_pw.resize(pot_index, 0.0); if (Yukawa) { @@ -209,6 +225,8 @@ void DFTU::init(UnitCell& cell, // unitcell class return; } +#ifdef __LCAO + void DFTU::cal_energy_correction(const UnitCell& ucell, const int istep) { @@ -360,6 +378,8 @@ void DFTU::cal_energy_correction(const UnitCell& ucell, return; } +#endif + void DFTU::uramping_update() { // if uramping < 0.1, use the original U @@ -392,6 +412,8 @@ bool DFTU::u_converged() return true; } +#ifdef __LCAO + void DFTU::set_dmr(const elecstate::DensityMatrix, double>* dmr) { this->dm_in_dftu_cd = dmr; @@ -443,4 +465,7 @@ void dftu_cal_occup_m(const int iter, { GlobalC::dftu.cal_occup_m_k(iter,ucell, dm, kv, mixing_beta, p_ham); } + +#endif + } // namespace ModuleDFTU diff --git a/source/module_hamilt_lcao/module_dftu/dftu.h b/source/module_hamilt_lcao/module_dftu/dftu.h index 9543ae6e55..68aae44516 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu.h +++ b/source/module_hamilt_lcao/module_dftu/dftu.h @@ -8,12 +8,14 @@ #include "module_cell/klist.h" #include "module_cell/unitcell.h" #include "module_basis/module_ao/parallel_orbitals.h" +#ifdef __LCAO #include "module_elecstate/module_charge/charge_mixing.h" #include "module_hamilt_general/hamilt.h" #include "module_elecstate/elecstate.h" #include "module_hamilt_lcao/module_hcontainer/hcontainer.h" #include "module_elecstate/module_dm/density_matrix.h" #include "module_hamilt_lcao/hamilt_lcaodft/force_stress_arrays.h" // mohan add 2024-06-15 +#endif #include #include @@ -40,9 +42,13 @@ class DFTU // allocate relevant data strcutures void init(UnitCell& cell, // unitcell class const Parallel_Orbitals* pv, - const int& nks, - const LCAO_Orbitals& orb + const int nks +#ifdef 
__LCAO + , const LCAO_Orbitals* orb = nullptr +#endif ); + + static DFTU* get_instance(); // calculate the energy correction void cal_energy_correction(const UnitCell& ucell, const int istep); @@ -65,13 +71,16 @@ class DFTU // FIXME: the following variable does not have static lifetime; // while the present class is used via a global variable. This has // potential to cause dangling pointer issues. +#ifdef __LCAO const LCAO_Orbitals* ptr_orb_ = nullptr; std::vector orb_cutoff_; +#endif // transform between iwt index and it, ia, L, N and m index std::vector>>>> iatlnmipol2iwt; // iatlnm2iwt[iat][l][n][m][ipol] +#ifdef __LCAO //============================================================= // In dftu_hamilt.cpp // For calculating contribution to Hamiltonian matrices @@ -81,6 +90,7 @@ class DFTU void cal_eff_pot_mat_real(const int ik, double* eff_pot, const std::vector& isk, const double* sk); void cal_eff_pot_mat_R_double(const int ispin, double* SR, double* HR); void cal_eff_pot_mat_R_complex_double(const int ispin, std::complex* SR, std::complex* HR); +#endif //============================================================= // In dftu_occup.cpp @@ -88,6 +98,16 @@ class DFTU // and other operations of locale: copy,zero out,mix //============================================================= public: + /// interface for PW base + /// calculate the local occupation number matrix for PW based wave functions + void cal_occ_pw(const int iter, const void* psi_in, const ModuleBase::matrix& wg_in, const UnitCell& cell, const double& mixing_beta); + /// calculate the local DFT+U effective potential matrix for PW base. 
+ void cal_VU_pot_pw(const int spin); + /// get effective potential matrix for PW base + const std::complex* get_eff_pot_pw(const int iat) const { return &(eff_pot_pw[this->eff_pot_pw_index[iat]]); } + int get_size_eff_pot_pw() const { return eff_pot_pw.size(); } + +#ifdef __LCAO // calculate the local occupation number matrix void cal_occup_m_k(const int iter, const UnitCell& ucell, @@ -100,6 +120,7 @@ class DFTU const std::vector>& dm_gamma, const double& mixing_beta, hamilt::Hamilt* p_ham); +#endif // dftu can be calculated only after locale has been initialed bool initialed_locale = false; @@ -109,12 +130,16 @@ class DFTU void zero_locale(const UnitCell& ucell); void mix_locale(const UnitCell& ucell,const double& mixing_beta); + std::vector> eff_pot_pw; + std::vector eff_pot_pw_index; + public: // local occupancy matrix of the correlated subspace // locale: the out put local occupation number matrix of correlated electrons in the current electronic step // locale_save: the input local occupation number matrix of correlated electrons in the current electronic step std::vector>>> locale; // locale[iat][l][n][spin](m1,m2) std::vector>>> locale_save; // locale_save[iat][l][n][spin](m1,m2) +#ifdef __LCAO private: //============================================================= // In dftu_tools.cpp @@ -224,6 +249,7 @@ class DFTU double* dh_r, const double* rho_VU, ModuleBase::matrix& stress_dftu); +#endif //============================================================= // In dftu_io.cpp @@ -261,6 +287,7 @@ class DFTU double spherical_Bessel(const int k, const double r, const double lambda); double spherical_Hankel(const int k, const double r, const double lambda); +#ifdef __LCAO public: /** * @brief get the density matrix of target spin @@ -278,8 +305,10 @@ class DFTU private: const elecstate::DensityMatrix* dm_in_dftu_d = nullptr; const elecstate::DensityMatrix, double>* dm_in_dftu_cd = nullptr; +#endif }; +#ifdef __LCAO template void dftu_cal_occup_m(const int iter, 
const UnitCell& ucell, @@ -287,6 +316,7 @@ void dftu_cal_occup_m(const int iter, const K_Vectors& kv, const double& mixing_beta, hamilt::Hamilt* p_ham); +#endif } // namespace ModuleDFTU diff --git a/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp b/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp index 84ed194008..aac95478d7 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp +++ b/source/module_hamilt_lcao/module_dftu/dftu_folding.cpp @@ -1,3 +1,4 @@ +#ifdef __LCAO #include "dftu.h" #include "module_base/timer.h" #include "module_parameter/parameter.h" @@ -305,3 +306,4 @@ void DFTU::folding_matrix_k_new(const int ik, } // namespace ModuleDFTU +#endif // __LCAO \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_dftu/dftu_force.cpp b/source/module_hamilt_lcao/module_dftu/dftu_force.cpp index b24aa09865..3ab4ef2496 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu_force.cpp +++ b/source/module_hamilt_lcao/module_dftu/dftu_force.cpp @@ -3,6 +3,7 @@ #include "module_parameter/parameter.h" // DATE : 2019-12-10 //========================================================== +#ifdef __LCAO #include "dftu.h" #include "module_base/constants.h" #include "module_base/global_function.h" @@ -665,3 +666,4 @@ void DFTU::cal_stress_gamma(const UnitCell& ucell, return; } } // namespace ModuleDFTU +#endif \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp b/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp index c350aef529..90d781ae9c 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp +++ b/source/module_hamilt_lcao/module_dftu/dftu_hamilt.cpp @@ -7,6 +7,7 @@ namespace ModuleDFTU { +#ifdef __LCAO void DFTU::cal_eff_pot_mat_complex(const int ik, std::complex* eff_pot, const std::vector& isk, const std::complex* sk) { ModuleBase::TITLE("DFTU", "cal_eff_pot_mat"); @@ -167,4 +168,5 @@ void DFTU::cal_eff_pot_mat_R_complex_double(const int ispin, std::complex>>& dm_k, @@ 
-519,4 +523,5 @@ void DFTU::cal_occup_m_gamma(const int iter, ModuleBase::timer::tick("DFTU", "cal_occup_m_gamma"); return; } +#endif } // namespace ModuleDFTU \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_dftu/dftu_pw.cpp b/source/module_hamilt_lcao/module_dftu/dftu_pw.cpp new file mode 100644 index 0000000000..cc0c3a6c30 --- /dev/null +++ b/source/module_hamilt_lcao/module_dftu/dftu_pw.cpp @@ -0,0 +1,212 @@ +#include "dftu.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_base/parallel_reduce.h" +#include "module_parameter/parameter.h" +#include "module_base/timer.h" + + +namespace ModuleDFTU +{ +DFTU* DFTU::get_instance() +{ + return &GlobalC::dftu; +} +/// calculate occupation matrix for DFT+U +void DFTU::cal_occ_pw(const int iter, const void* psi_in, const ModuleBase::matrix& wg_in, const UnitCell& cell, const double& mixing_beta) +{ + ModuleBase::timer::tick("DFTU", "cal_occ_pw"); + this->copy_locale(cell); + this->zero_locale(cell); + + if(PARAM.inp.device == "cpu") + { + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + const psi::Psi>* psi_p = (const psi::Psi>*)psi_in; + // loop over k-points to calculate Mi of \sum_{k,i,l,m} + const int nbands = psi_p->get_nbands(); + for(int ik = 0; ik < psi_p->get_nk(); ik++) + { + psi_p->fix_k(ik); + onsite_p->tabulate_atomic(ik); + + onsite_p->overlap_proj_psi(nbands*psi_p->npol, psi_p->get_pointer()); + const std::complex* becp = onsite_p->get_h_becp(); + // becp(nbands*npol , nkb) + // mag = wg * \sum_{nh}becp * becp + int nkb = onsite_p->get_size_becp() / nbands / psi_p->npol; + int begin_ih = 0; + for(int iat = 0; iat < cell.nat; iat++) + { + const int it = cell.iat2it[iat]; + const int nh = onsite_p->get_nh(iat); + const int target_l = this->orbital_corr[it]; + if(target_l == -1) + { + begin_ih += nh; + continue; + } + // m = l^2, l^2+1, ..., (l+1)^2-1 + const int m_begin = target_l * target_l; + const int tlp1 = 2 * target_l + 1; + const 
int tlp1_2 = tlp1 * tlp1; + for(int ib = 0;ib occ[4]; + occ[0] = weight * conj(becp[index_m1]) * becp[index_m2]; + occ[1] = weight * conj(becp[index_m1]) * becp[index_m2 + nkb]; + occ[2] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2]; + occ[3] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb]; + this->locale[iat][target_l][0][0].c[ind_m1m2] += (occ[0] + occ[3]).real(); + this->locale[iat][target_l][0][0].c[ind_m1m2 + tlp1_2] += (occ[1] + occ[2]).real(); + this->locale[iat][target_l][0][0].c[ind_m1m2 + 2 * tlp1_2] += (occ[1] - occ[2]).imag(); + this->locale[iat][target_l][0][0].c[ind_m1m2 + 3 * tlp1_2] += (occ[0] - occ[3]).real(); + ind_m1m2++; + } + } + }// ib + begin_ih += nh; + }// iat + }// ik + } +#if defined(__CUDA) || defined(__ROCM) + else + { + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + const psi::Psi, base_device::DEVICE_GPU>* psi_p = (const psi::Psi, base_device::DEVICE_GPU>*)psi_in; + // loop over k-points to calculate Mi of \sum_{k,i,l,m} + const int nbands = psi_p->get_nbands(); + for(int ik = 0; ik < psi_p->get_nk(); ik++) + { + psi_p->fix_k(ik); + onsite_p->tabulate_atomic(ik); + + onsite_p->overlap_proj_psi(nbands*psi_p->npol, psi_p->get_pointer()); + const std::complex* becp = onsite_p->get_h_becp(); + // becp(nbands*npol , nkb) + // mag = wg * \sum_{nh}becp * becp + int nkb = onsite_p->get_size_becp() / nbands / psi_p->npol; + int begin_ih = 0; + for(int iat = 0; iat < cell.nat; iat++) + { + const int it = cell.iat2it[iat]; + const int nh = onsite_p->get_nh(iat); + const int target_l = this->orbital_corr[it]; + if(target_l == -1) + { + begin_ih += nh; + continue; + } + // m = l^2, l^2+1, ..., (l+1)^2-1 + const int m_begin = target_l * target_l; + const int tlp1 = 2 * target_l + 1; + const int tlp1_2 = tlp1 * tlp1; + for(int ib = 0;ib occ[4]; + occ[0] = weight * conj(becp[index_m1]) * becp[index_m2]; + occ[1] = weight * conj(becp[index_m1]) * becp[index_m2 + nkb]; + occ[2] = weight * conj(becp[index_m1 + 
nkb]) * becp[index_m2]; + occ[3] = weight * conj(becp[index_m1 + nkb]) * becp[index_m2 + nkb]; + this->locale[iat][target_l][0][0].c[ind_m1m2] += (occ[0] + occ[3]).real(); + this->locale[iat][target_l][0][0].c[ind_m1m2 + tlp1_2] += (occ[1] + occ[2]).real(); + this->locale[iat][target_l][0][0].c[ind_m1m2 + 2 * tlp1_2] += (occ[1] - occ[2]).imag(); + this->locale[iat][target_l][0][0].c[ind_m1m2 + 3 * tlp1_2] += (occ[0] - occ[3]).real(); + ind_m1m2++; + } + } + }// ib + begin_ih += nh; + }// iat + }// ik + } +#endif + + this->EU = 0.0; + // reduce mag from all k-pools + for(int iat = 0; iat < cell.nat; iat++) + { + const int it = cell.iat2it[iat]; + const int target_l = this->orbital_corr[it]; + if(target_l == -1) + { + continue; + } + const int size = (2 * target_l + 1) * (2 * target_l + 1); + Parallel_Reduce::reduce_double_allpool(PARAM.inp.kpar, PARAM.globalv.nproc_in_pool, this->locale[iat][target_l][0][0].c, size * PARAM.inp.nspin); + //update effective potential + const double u_value = this->U[it]; + std::complex* vu_iat = &(this->eff_pot_pw[this->eff_pot_pw_index[iat]]); + const int m_size = 2 * target_l + 1; + for (int m1 = 0; m1 < m_size; m1++) + { + for (int m2 = 0; m2 < m_size; m2++) + { + vu_iat[m1 * m_size + m2] = u_value * (1.0 * (m1 == m2) - this->locale[iat][target_l][0][0].c[m2 * m_size + m1]); + this->EU += u_value * 0.25 * this->locale[iat][target_l][0][0].c[m2 * m_size + m1] * this->locale[iat][target_l][0][0].c[m1 * m_size + m2]; + } + } + for (int is = 1; is < 4; ++is) + { + int start = is * m_size * m_size; + for (int m1 = 0; m1 < m_size; m1++) + { + for (int m2 = 0; m2 < m_size; m2++) + { + vu_iat[start + m1 * m_size + m2] = u_value * (0 - this->locale[iat][target_l][0][0].c[start + m2 * m_size + m1]); + this->EU += u_value * 0.25 * this->locale[iat][target_l][0][0].c[start + m2 * m_size + m1] * this->locale[iat][target_l][0][0].c[start + m1 * m_size + m2]; + } + } + } + // transfer from Pauli matrix representation to spin representation + for 
(int m1 = 0; m1 < m_size; m1++) + { + for (int m2 = 0; m2 < m_size; m2++) + { + int index[4]; + index[0] = m1 * m_size + m2; + index[1] = m1 * m_size + m2 + size; + index[2] = m1 * m_size + m2 + size * 2; + index[3] = m1 * m_size + m2 + size * 3; + std::complex vu_tmp[4]; + for (int i = 0; i < 4; i++) + { + vu_tmp[i] = vu_iat[index[i]]; + } + vu_iat[index[0]] = 0.5 * (vu_tmp[0] + vu_tmp[3]); + vu_iat[index[3]] = 0.5 * (vu_tmp[0] - vu_tmp[3]); + vu_iat[index[1]] = 0.5 * (vu_tmp[1] + std::complex(0.0, 1.0) * vu_tmp[2]); + vu_iat[index[2]] = 0.5 * (vu_tmp[1] - std::complex(0.0, 1.0) * vu_tmp[2]); + } + } + } + + if(mixing_dftu && initialed_locale) + { + this->mix_locale(cell, mixing_beta); + } + // update effective potential + ModuleBase::timer::tick("DFTU", "cal_occ_pw"); +} +/// calculate the local DFT+U effective potential matrix for PW base. +void DFTU::cal_VU_pot_pw(const int spin) +{ + +} + +} // namespace ModuleDFTU \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp b/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp index 96582ee6aa..363c84da89 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp +++ b/source/module_hamilt_lcao/module_dftu/dftu_tools.cpp @@ -6,6 +6,7 @@ namespace ModuleDFTU { +#ifdef __LCAO void DFTU::cal_VU_pot_mat_complex(const int spin, const bool newlocale, std::complex* VU) { ModuleBase::TITLE("DFTU", "cal_VU_pot_mat_complex"); @@ -203,4 +204,5 @@ double DFTU::get_onebody_eff_pot(const int T, return VU; } +#endif } // namespace ModuleDFTU \ No newline at end of file diff --git a/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp b/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp index cdc83dc867..a2c3dd2973 100644 --- a/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp +++ b/source/module_hamilt_lcao/module_dftu/dftu_yukawa.cpp @@ -1,5 +1,6 @@ //========================================================== // Author:Xin Qu +#ifdef __LCAO #include 
"module_parameter/parameter.h" // DATE : 2019-12-10 //========================================================== @@ -281,3 +282,5 @@ double DFTU::spherical_Hankel(const int k, const double r, const double lambda) } } // namespace ModuleDFTU + +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt index 18e6518a8d..9e797f3744 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt +++ b/source/module_hamilt_pw/hamilt_pwdft/CMakeLists.txt @@ -8,11 +8,13 @@ list(APPEND objects operator_pw/meta_pw.cpp operator_pw/velocity_pw.cpp operator_pw/operator_pw.cpp + operator_pw/onsite_proj_pw.cpp forces_nl.cpp forces_cc.cpp forces_scc.cpp forces.cpp forces_us.cpp + forces_onsite.cpp stress_func_cc.cpp stress_func_ewa.cpp stress_func_gga.cpp @@ -22,6 +24,7 @@ list(APPEND objects stress_func_loc.cpp stress_func_nl.cpp stress_func_us.cpp + stress_func_onsite.cpp stress_pw.cpp VL_in_pw.cpp VNL_in_pw.cpp @@ -35,6 +38,8 @@ list(APPEND objects fs_nonlocal_tools.cpp fs_kin_tools.cpp radial_proj.cpp + onsite_projector.cpp + onsite_proj_tools.cpp ) add_library( diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp index f9c6a63556..c1fcd2299c 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.cpp @@ -50,6 +50,7 @@ void Forces::cal_force(const UnitCell& ucell, ModuleBase::matrix forcenl(nat, 3); ModuleBase::matrix forcescc(nat, 3); ModuleBase::matrix forcepaw(nat, 3); + ModuleBase::matrix forceonsite(nat, 3); // Force due to local ionic potential // For PAW, calculated together in paw_cell.calculate_force @@ -156,6 +157,11 @@ void Forces::cal_force(const UnitCell& ucell, } #endif } + // DFT+U and DeltaSpin + if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + this->cal_force_onsite(forceonsite, wg, wfc_basis, GlobalC::ucell, psi_in); + } } // non-linear core 
correction @@ -317,6 +323,11 @@ void Forces::cal_force(const UnitCell& ucell, force(iat, ipol) = force(iat, ipol) + forcesol(iat, ipol); } + if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + force(iat, ipol) += forceonsite(iat, ipol); + } + sum += force(iat, ipol); iat++; @@ -457,6 +468,14 @@ void Forces::cal_force(const UnitCell& ucell, forcesol, false); } + if (PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + ModuleIO::print_force(GlobalV::ofs_running, + ucell, + "ONSITE_PROJ FORCE (eV/Angstrom)", + forceonsite, + false); + } } ModuleIO::print_force(GlobalV::ofs_running, ucell, "TOTAL-FORCE (eV/Angstrom)", force, false); ModuleBase::timer::tick("Forces", "cal_force"); diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces.h b/source/module_hamilt_pw/hamilt_pwdft/forces.h index c23f24f53e..90b419199d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/forces.h +++ b/source/module_hamilt_pw/hamilt_pwdft/forces.h @@ -82,6 +82,17 @@ class Forces const pseudopot_cell_vnl& nlpp_in, const UnitCell& ucell_in, const psi::Psi, Device>* psi_in = nullptr); + /// @brief atomic force for DFT+U and DeltaSpin + /// @param force_onsite , the output atomic force + /// @param wg , the weight of k points + /// @param wfc_basis , the plane wave basis + /// @param ucell_in , the unit cell + /// @param psi_in , the wave function + void cal_force_onsite(ModuleBase::matrix& force_onsite, + const ModuleBase::matrix& wg, + const ModulePW::PW_Basis_K* wfc_basis, + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in = nullptr); void cal_force_scc(ModuleBase::matrix& forcescc, ModulePW::PW_Basis* rho_basis, const ModuleBase::matrix& v_current, diff --git a/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp new file mode 100644 index 0000000000..240187b3ba --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/forces_onsite.cpp @@ -0,0 +1,79 @@ +#include "forces.h" +#include "module_base/timer.h" +#include 
"module_base/tool_title.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include "module_parameter/parameter.h" +#include "module_hamilt_lcao/module_dftu/dftu.h" +#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h" + +template +void Forces::cal_force_onsite(ModuleBase::matrix& force_onsite, + const ModuleBase::matrix& wg, + const ModulePW::PW_Basis_K* wfc_basis, + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in) +{ + ModuleBase::TITLE("Forces", "cal_force_onsite"); + if(psi_in == nullptr || wfc_basis == nullptr) + { + return; + } + ModuleBase::timer::tick("Forces", "cal_force_onsite"); + + // allocate memory for the force + FPTYPE* force = nullptr; + resmem_var_op()(this->ctx, force, ucell_in.nat * 3); + base_device::memory::set_memory_op()(this->ctx, force, 0.0, ucell_in.nat * 3); + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + + const int nks = wfc_basis->nks; + for (int ik = 0; ik < nks; ik++) // loop k points + { + // skip zero weights to speed up + int nbands_occ = wg.nc; + while (wg(ik, nbands_occ - 1) == 0.0) + { + nbands_occ--; + if (nbands_occ == 0) + { + break; + } + } + const int npm = nbands_occ; + onsite_p->get_fs_tools()->cal_becp(ik, npm); + // calculate becp = for all beta functions + for (int ipol = 0; ipol < 3; ipol++) + { + // calculate dbecp = for all beta functions + onsite_p->get_fs_tools()->cal_dbecp_f(ik, npm, ipol); + } + // calculate the force_i = \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i + // force for DFT+U + if(PARAM.inp.dft_plus_u) + { + auto* dftu = ModuleDFTU::DFTU::get_instance(); + onsite_p->get_fs_tools()->cal_force_dftu(ik, npm, force, dftu->orbital_corr.data(), dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw(), wg.c); + } + if(PARAM.inp.sc_mag_switch) + { + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + const std::vector>& lambda = 
sc.get_sc_lambda(); + onsite_p->get_fs_tools()->cal_force_dspin(ik, npm, force, lambda.data(), wg.c); + } + + } // end ik + + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, force_onsite.c, force, force_onsite.nr * force_onsite.nc); + delmem_var_op()(this->ctx, force); + // sum up force_onsite from all processors + Parallel_Reduce::reduce_all(force_onsite.c, force_onsite.nr * force_onsite.nc); + + ModuleBase::timer::tick("Forces", "cal_force_onsite"); +} + +template class Forces; +#if ((defined __CUDA) || (defined __ROCM)) +template class Forces; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp index b219678f4c..810b313292 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/fs_nonlocal_tools.cpp @@ -216,8 +216,8 @@ void FS_Nonlocal_tools::cal_vkb(const int& ik, const int& nbdall hd_vq); // prepare(-i)^l, size: nh - std::vector> pref = maths.cal_pref(it); - const int nh = pref.size(); + const int nh = this->ucell_->atoms[it].ncpp.nh; + std::vector> pref = maths.cal_pref(it, nh); this->dvkb_indexes.resize(nh * 4); maths.cal_dvkb_index(this->ucell_->atoms[it].ncpp.nbeta, this->nlpp_->nhtol.c, @@ -369,8 +369,8 @@ void FS_Nonlocal_tools::cal_vkb_deri_s(const int& ik, hd_vq_deri); // prepare(-i)^l, size: nh - std::vector> pref = maths.cal_pref(it); - int nh = pref.size(); + const int nh = this->ucell_->atoms[it].ncpp.nh; + std::vector> pref = maths.cal_pref(it, nh); // prepare indexes for calculate vkb_deri this->dvkb_indexes.resize(nh * 4); maths.cal_dvkb_index(this->ucell_->atoms[it].ncpp.nbeta, diff --git a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp index 6272675398..7fe256b23d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/hamilt_pw.cpp @@ -10,6 +10,7 @@ #include 
"operator_pw/ekinetic_pw.h" #include "operator_pw/meta_pw.h" #include "operator_pw/nonlocal_pw.h" +#include "operator_pw/onsite_proj_pw.h" #ifdef USE_PAW #include "module_cell/module_paw/paw_cell.h" @@ -114,6 +115,12 @@ HamiltPW::HamiltPW(elecstate::Potential* pot_in, this->ops->add(nonlocal); } } + if(PARAM.inp.sc_mag_switch || PARAM.inp.dft_plus_u) + { + Operator* onsite_proj + = new OnsiteProj>(isk, &GlobalC::ucell, PARAM.inp.sc_mag_switch, (PARAM.inp.dft_plus_u>0)); + this->ops->add(onsite_proj); + } return; } @@ -192,6 +199,17 @@ HamiltPW::HamiltPW(const HamiltPW *hamilt) this->ops->add(meta); } } + else if (node->classname == "OnsiteProj") { + Operator* onsite_proj = + new OnsiteProj>( + reinterpret_cast>*>(node)); + if(this->ops == nullptr) { + this->ops = onsite_proj; + } + else { + this->ops->add(onsite_proj); + } + } else { ModuleBase::WARNING_QUIT("HamiltPW", "Unrecognized Operator type!"); } diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu index 991a81e746..5d0656d105 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/force_op.cu @@ -306,6 +306,209 @@ void cal_force_nl_op::operator()(const base_dev cudaCheckOnDebug(); } +template +__global__ void cal_force_onsite(int wg_nc, + int ntype, + int forcenl_nc, + int nbands, + int ik, + int nkb, + const int* atom_nh, + const int* atom_na, + FPTYPE tpiba, // floating-point factor passed by the host launcher as FPTYPE; an int parameter truncated it + const FPTYPE* d_wg, + const thrust::complex* vu, + const int* orbital_corr, + const thrust::complex* becp, + const thrust::complex* dbecp, + FPTYPE* force) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + if (orbital_corr[it] == -1) + return; + const int orbital_l = orbital_corr[it]; + const int ip_begin = orbital_l * orbital_l; + const int tlp1 = 2 * orbital_l + 1; + const int tlp1_2 = tlp1 *
tlp1; + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) + { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + vu += (orbital_corr[ii] == -1) ? 0 : 4 * (2 * orbital_corr[ii] + 1) * (2 * orbital_corr[ii] + 1) * atom_na[ii]; // step for vu: each earlier type advances by its own 4*(2l+1)^2 per atom, uncorrelated types by nothing (matches the CPU operator) + } + + const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x) + { + const int m1 = mm / tlp1; + const int m2 = mm % tlp1; + const int ip1 = ip_begin + m1; + const int ip2 = ip_begin + m2; + const int inkb1 = sum + ip1 + ib2 * nkb; + const int inkb2 = sum + ip2 + ib2 * nkb; + thrust::complex ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]}; + // out<<"\n ps = "< dbb0 = conj(dbecp[inkb0]) * becp[inkb2]; + const thrust::complex dbb1 = conj(dbecp[inkb0]) * becp[inkb2 + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb2]; + const thrust::complex dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb2 + nkb]; + const FPTYPE tmp = -fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + ++iat; + sum += nprojs; + vu += 4 * tlp1_2; + } // ia +} + +template +__global__ void cal_force_onsite(int wg_nc, + int ntype, + int forcenl_nc, + int nbands, + int ik, + int nkb, + const int* atom_nh, + const int* atom_na, + FPTYPE tpiba, // floating-point factor passed by the host launcher as FPTYPE; an int parameter truncated it + const FPTYPE* d_wg, + const FPTYPE* lambda, + const thrust::complex* becp, + const thrust::complex* dbecp, + FPTYPE* force) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) + { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + const FPTYPE fac = d_wg[ik * wg_nc + ib] *
2.0 * tpiba; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + const thrust::complex coefficients0(lambda[iat * 3 + 2], 0.0); + const thrust::complex coefficients1(lambda[iat * 3], lambda[iat * 3 + 1]); + const thrust::complex coefficients2(lambda[iat * 3], -1 * lambda[iat * 3 + 1]); + const thrust::complex coefficients3(-1 * lambda[iat * 3 + 2], 0.0); + for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x) + { + const int inkb = sum + ip + ib2 * nkb; + // out<<"\n ps = "< dbb0 = conj(dbecp[inkb0]) * becp[inkb]; + const thrust::complex dbb1 = conj(dbecp[inkb0]) * becp[inkb + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb]; + const thrust::complex dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb + nkb]; + const FPTYPE tmp + = -fac + * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3) + .real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + ++iat; + sum += nprojs; + } // ia +} + +// kernel for DFTU force +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + cal_force_onsite + <<>>(wg_nc, + ntype, + forcenl_nc, + nbands, + ik, + nkb, + atom_nh, + atom_na, + tpiba, + d_wg, + reinterpret_cast*>(vu), + orbital_corr, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force); // array of data + + cudaCheckOnDebug(); +} +// kernel for DeltaSpin force +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& 
nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + cal_force_onsite + <<>>(wg_nc, + ntype, + forcenl_nc, + nbands, + ik, + nkb, + atom_nh, + atom_na, + tpiba, + d_wg, + lambda, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force); // array of data + + cudaCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu new file mode 100644 index 0000000000..ef54ff0605 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/onsite_op.cu @@ -0,0 +1,134 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h" + +#include +#include +#include +#include + +namespace hamilt +{ + +#define THREADS_PER_BLOCK 256 + +template +__global__ void onsite_op(const int npm, + const int npol, + const int* ip_iat, + const int tnp, + const thrust::complex* lambda_coeff, + thrust::complex* ps, + const thrust::complex* becp) +{ + const int ip = blockIdx.x; + const int nbands = npm / npol; + for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x) + { + int ib2 = ib * npol; + int iat = ip_iat[ip]; + const int psind = ip * npm + ib2; + const int becpind = ib2 * tnp + ip; + ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp]; + ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp]; + } +} + +template +__global__ void onsite_op(const int npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int tnp, + const thrust::complex* vu, + thrust::complex* ps, + const thrust::complex* becp) +{ + const int ip = blockIdx.x; + int m1 = ip_m[ip]; + if (m1 >= 0) + { + const int nbands = npm / npol; 
+ for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x) + { + int ib2 = ib * npol; + int iat = ip_iat[ip]; + const thrust::complex* vu_iat = vu + vu_begin_iat[iat]; + int orb_l = orb_l_iat[iat]; + int tlp1 = 2 * orb_l + 1; + int tlp1_2 = tlp1 * tlp1; + int ip2_begin = ip - m1; + int ip2_end = ip - m1 + tlp1; + const int psind = ip * npm + ib2; + for (int ip2 = ip2_begin; ip2 < ip2_end; ip2++) + { + const int becpind = ib2 * tnp + ip2; + int m2 = ip_m[ip2]; + const int index_mm = m1 * tlp1 + m2; + ps[psind] += vu_iat[index_mm] * becp[becpind] + vu_iat[index_mm + tlp1_2 * 2] * becp[becpind + tnp]; + ps[psind + 1] += vu_iat[index_mm + tlp1_2 * 1] * becp[becpind] + + vu_iat[index_mm + tlp1_2 * 3] * becp[becpind + tnp]; + } + } + } +} + +template +void hamilt::onsite_ps_op::operator()(const base_device::DEVICE_GPU* dev, + const int& npm, + const int npol, + const int* ip_iat, + const int& tnp, + const std::complex* lambda_coeff, + std::complex* ps, + const std::complex* becp) +{ + // denghui implement 20221019 + // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + onsite_op + <<>>(npm, + npol, + ip_iat, + tnp, + reinterpret_cast*>(lambda_coeff), + reinterpret_cast*>(ps), // array of data + reinterpret_cast*>(becp)); // array of data + + cudaCheckOnDebug(); +} + +template +void hamilt::onsite_ps_op::operator()(const base_device::DEVICE_GPU* dev, + const int& npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int& tnp, + const std::complex* vu, + std::complex* ps, + const std::complex* becp) +{ + // denghui implement 20221109 + // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + onsite_op + <<>>(npm, + npol, + orb_l_iat, + ip_iat, + ip_m, + vu_begin_iat, + tnp, + reinterpret_cast*>(vu), + reinterpret_cast*>(ps), // array of data + reinterpret_cast*>(becp)); // array of data + + cudaCheckOnDebug(); + // 
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +} + +template struct onsite_ps_op; +template struct onsite_ps_op; + +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu index 997827d669..b18e5c5160 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/cuda/stress_op.cu @@ -922,6 +922,187 @@ void synchronize_ptrs::operator()( cudaMemcpy(ptr_out, ptr_in, sizeof(void*) * size, cudaMemcpyHostToDevice); } +template +__global__ void cal_stress_onsite( + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const thrust::complex *vu, + const int* orbital_corr, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + if(orbital_corr[it] == -1) return; + const int orbital_l = orbital_corr[it]; + const int ip_begin = orbital_l * orbital_l; + const int tlp1 = 2 * orbital_l + 1; + const int tlp1_2 = tlp1 * tlp1; + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + vu += (orbital_corr[ii] == -1) ? 0 : 4 * (2 * orbital_corr[ii] + 1) * (2 * orbital_corr[ii] + 1) * atom_na[ii];// step for vu: each earlier type advances by its own 4*(2l+1)^2 per atom, uncorrelated types by nothing + } + + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib]; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x) { + const int m1 = mm / tlp1; + const int m2 = mm % tlp1; + const int ip1 = ip_begin + m1; + const int ip2 = ip_begin + m2; + const int inkb1 = sum + ip1 + ib2 * nkb; + const int inkb2 = sum + ip2 + ib2 * nkb; + thrust::complex ps[4]
= {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]}; + //out<<"\n ps = "< dbb0 = conj(dbecp[inkb1]) * becp[inkb2]; + const thrust::complex dbb1 = conj(dbecp[inkb1]) * becp[inkb2 + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb1 + nkb]) * becp[inkb2]; + const thrust::complex dbb3 = conj(dbecp[inkb1 + nkb]) * becp[inkb2 + nkb]; + stress_var -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); + } + ++iat; + sum+=nprojs; + vu += 4 * tlp1_2; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress, stress_var); + } +} + +template +__global__ void cal_stress_onsite( + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const double* lambda, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib]; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + const thrust::complex coefficients0(lambda[iat*3+2], 0.0); + const thrust::complex coefficients1(lambda[iat*3] , lambda[iat*3+1]); + const thrust::complex coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); + const thrust::complex coefficients3(-1 * lambda[iat*3+2], 0.0); + for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x) { + const int inkb = sum + ip + ib2 * nkb; + //out<<"\n ps = "< dbb0 = conj(dbecp[inkb]) * becp[inkb]; + const thrust::complex dbb1 = conj(dbecp[inkb]) * becp[inkb + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb + nkb]) * 
becp[inkb]; + const thrust::complex dbb3 = conj(dbecp[inkb + nkb]) * becp[inkb + nkb]; + stress_var -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real(); + } + ++iat; + sum+=nprojs; + }//ia + __syncwarp(); + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress, stress_var); + } +} + +//kernel for DFTU stress +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + cal_stress_onsite<<>>( + nkb, + ntype, + wg_nc, + ik, + atom_nh, + atom_na, + d_wg, + reinterpret_cast*>(vu), + orbital_corr, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + cudaCheckOnDebug(); +} +// kernel for DeltaSpin stress +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const double* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + cal_stress_onsite<<>>( + nkb, + ntype, + wg_nc, + ik, + atom_nh, + atom_na, + d_wg, + lambda, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + cudaCheckOnDebug(); +} + template struct synchronize_ptrs; template struct cal_stress_mgga_op, base_device::DEVICE_GPU>; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp index 261c510efc..6d797e147d 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.cpp @@ 
-278,6 +278,149 @@ struct cal_force_nl_op #ifdef _OPENMP } #endif + }; + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) + { + int iat0 = 0; + int sum0 = 0; + for (int it = 0; it < ntype; it++) + { + const int orbital_l = orbital_corr[it]; + const int nproj = atom_nh[it]; + if(orbital_l == -1) + { + sum0 += nproj * atom_na[it]; + continue; + } + const int ip_begin = orbital_l * orbital_l; + const int ip_end = (orbital_l + 1) * (orbital_l + 1); + const int tlp1 = 2 * orbital_l + 1; + const int tlp1_2 = tlp1 * tlp1; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE local_force[3] = {0, 0, 0}; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + int iat = iat0 + ia; + int sum = sum0 + ia * nproj; + for (int ip = ip_begin; ip < ip_end; ip++) + { + const int inkb = sum + ip; + const int m = ip - ip_begin; + // out<<"\n ps = "< ps[4]; + for(int i = 0; i < 4; i++) + { + ps[i] = vu[(i * tlp1_2 + m * tlp1 + m2)]; + } + + for (int ipol = 0; ipol < 3; ipol++) + { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + jnkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + + local_force[ipol] -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); + } + } + } + for (int ipol = 0; ipol < 3; ++ipol) + { + force[iat * forcenl_nc + ipol] += 
local_force[ipol]; + } + } + vu += 4 * tlp1_2;// step for vu + } // end ia + iat0 += atom_na[it]; + sum0 += atom_na[it] * nproj; + } // end it + }; + + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) + { + int iat0 = 0; + int sum0 = 0; + for (int it = 0; it < ntype; it++) + { + const int nproj = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + int iat = iat0 + ia; + int sum = sum0 + ia * nproj; + const std::complex coefficients0(lambda[iat*3+2], 0.0); + const std::complex coefficients1(lambda[iat*3] , lambda[iat*3+1]); + const std::complex coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); + const std::complex coefficients3(-1 * lambda[iat*3+2], 0.0); + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE local_force[3] = {0, 0, 0}; + FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + for (int ip = 0; ip < nproj; ip++) + { + const int inkb = sum + ip; + + for (int ipol = 0; ipol < 3; ipol++) + { + const int index0 = ipol * nbands * 2 * nkb + ib2 * nkb + inkb; + const int index1 = ib2 * nkb + inkb; + const std::complex dbb0 = conj(dbecp[index0]) * becp[index1]; + const std::complex dbb1 = conj(dbecp[index0]) * becp[index1 + nkb]; + const std::complex dbb2 = conj(dbecp[index0 + nkb]) * becp[index1]; + const std::complex dbb3 = conj(dbecp[index0 + nkb]) * becp[index1 + nkb]; + + local_force[ipol] -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real(); + } + }//ip + for (int ipol = 0; ipol < 3; ++ipol) + { + force[iat * forcenl_nc + ipol] += local_force[ipol]; + } + } // end ib + } // ia + iat0 += atom_na[it]; + sum0 += atom_na[it] * nproj; + 
}//it } }; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h index b9aaa6d468..3aa5d4f87e 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/force_op.h @@ -112,6 +112,41 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + /// kernel for DFT+U + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); + /// kernel for DeltaSpin + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; #if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM @@ -176,6 +211,41 @@ struct cal_force_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* force); + /// kernel for DFT+U + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); + /// kernel for DeltaSpin + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& 
wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const FPTYPE* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force); }; /** diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp new file mode 100644 index 0000000000..1528af190c --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.cpp @@ -0,0 +1,87 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h" + +namespace hamilt +{ + +template +struct onsite_ps_op +{ + // kernel for DeltaSpin calculation + void operator()(const base_device::DEVICE_CPU* /*dev*/, + const int& npm, + const int npol, + const int* ip_iat, + const int& tnp, + const std::complex* lambda_array, + std::complex* ps, + const std::complex* becp) + { +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int ib = 0; ib < npm / npol; ib++) + { + for (int ip = 0; ip < tnp; ip++) + { + int ib2 = ib * npol; + int iat = ip_iat[ip]; + const int psind = ip * npm + ib2; + const int becpind = ib2 * tnp + ip; + ps[psind] += lambda_array[iat * 4] * becp[becpind] + + lambda_array[iat * 4 + 2] * becp[becpind + tnp]; + ps[psind + 1] += lambda_array[iat * 4 + 1] * becp[becpind] + + lambda_array[iat * 4 + 3] * becp[becpind + tnp]; + } // end ip + } // end ib + }; + + // kernel for DFT+U calculation + void operator()(const base_device::DEVICE_CPU* dev, + const int& npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int& tnp, + const std::complex* vu, + std::complex* ps, + const std::complex* becp) + { +#ifdef _OPENMP +#pragma omp parallel for collapse(2) +#endif + for (int ib = 0; ib < npm / npol; ib++) + { + for (int ip = 0; ip < tnp; ip++) + { + int m1 = ip_m[ip]; + if(m1 < 0) continue; + int ib2 = 
ib * npol; + int iat = ip_iat[ip]; + const std::complex* vu_iat = vu + vu_begin_iat[iat]; + int orb_l = orb_l_iat[iat]; + int tlp1 = 2 * orb_l + 1; + int tlp1_2 = tlp1 * tlp1; + int ip2_begin = ip - m1; + int ip2_end = ip - m1 + tlp1; + const int psind = ip * npm + ib2; + for(int ip2 = ip2_begin;ip2; +template struct onsite_ps_op; + +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h new file mode 100644 index 0000000000..fee57fbbd3 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h @@ -0,0 +1,63 @@ +#ifndef MODULE_HAMILT_OPERATOR_KERNELS_ONSITE_H +#define MODULE_HAMILT_OPERATOR_KERNELS_ONSITE_H + +#include "module_psi/psi.h" +#include + +namespace hamilt { +template +struct onsite_ps_op { + void operator() ( + const Device* dev, + const int& npm, + const int npol, + const int* ip_iat, + const int& tnp, + const std::complex* lambda_coeff, + std::complex* ps, + const std::complex* becp); + + void operator() ( + const Device* dev, + const int& npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int& tnp, + const std::complex* vu, + std::complex* ps, + const std::complex* becp); +}; + +#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM +// Partially specialize functor for base_device::GpuDevice. 
+template +struct onsite_ps_op { + void operator() ( + const base_device::DEVICE_GPU* dev, + const int& npm, + const int npol, + const int* ip_iat, + const int& tnp, + const std::complex* lambda_coeff, + std::complex* ps, + const std::complex* becp); + + void operator() ( + const base_device::DEVICE_GPU* dev, + const int& npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int& tnp, + const std::complex* vu, + std::complex* ps, + const std::complex* becp); +}; +#endif // __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM +} // namespace hamilt +#endif //MODULE_HAMILT_OPERATOR_KERNELS_ONSITE_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu index b89a380133..c78b333b86 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/force_op.hip.cu @@ -304,6 +304,217 @@ void cal_force_nl_op::operator()(const base_dev hipCheckOnDebug(); } +template +__global__ void cal_force_onsite(int wg_nc, + int ntype, + int forcenl_nc, + int nbands, + int ik, + int nkb, + const int* atom_nh, + const int* atom_na, + FPTYPE tpiba, // floating-point factor passed by the host launcher as FPTYPE; an int parameter truncated it + const FPTYPE* d_wg, + const thrust::complex* vu, + const int* orbital_corr, + const thrust::complex* becp, + const thrust::complex* dbecp, + FPTYPE* force) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + if (orbital_corr[it] == -1) + return; + const int orbital_l = orbital_corr[it]; + const int ip_begin = orbital_l * orbital_l; + const int tlp1 = 2 * orbital_l + 1; + const int tlp1_2 = tlp1 * tlp1; + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) + { + iat += atom_na[ii]; + sum +=
atom_na[ii] * atom_nh[ii]; + vu += 4 * tlp1_2 * atom_na[ii]; // step for vu + } + + const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x) + { + const int m1 = mm / tlp1; + const int m2 = mm % tlp1; + const int ip1 = ip_begin + m1; + const int ip2 = ip_begin + m2; + const int inkb1 = sum + ip1 + ib2 * nkb; + const int inkb2 = sum + ip2 + ib2 * nkb; + thrust::complex ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]}; + // out<<"\n ps = "< dbb0 = conj(dbecp[inkb0]) * becp[inkb2]; + const thrust::complex dbb1 = conj(dbecp[inkb0]) * becp[inkb2 + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb2]; + const thrust::complex dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb2 + nkb]; + const FPTYPE tmp = -fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + ++iat; + sum += nprojs; + vu += 4 * tlp1_2; + } // ia +} + +template +__global__ void cal_force_onsite(int wg_nc, + int ntype, + int forcenl_nc, + int nbands, + int ik, + int nkb, + const int* atom_nh, + const int* atom_na, + FPTYPE tpiba, // FPTYPE, not int: the host wrapper passes a floating-point tpiba and an int parameter would truncate it + const FPTYPE* d_wg, + const FPTYPE* lambda, + const thrust::complex* becp, + const thrust::complex* dbecp, + FPTYPE* force) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) + { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + const FPTYPE fac = d_wg[ik * wg_nc + ib] * 2.0 * tpiba; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + const thrust::complex coefficients0(lambda[iat * 3 + 2], 0.0); + const thrust::complex 
coefficients1(lambda[iat * 3], lambda[iat * 3 + 1]); + const thrust::complex coefficients2(lambda[iat * 3], -1 * lambda[iat * 3 + 1]); + const thrust::complex coefficients3(-1 * lambda[iat * 3 + 2], 0.0); + for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x) + { + const int inkb = sum + ip + ib2 * nkb; + // out<<"\n ps = "< dbb0 = conj(dbecp[inkb0]) * becp[inkb]; + const thrust::complex dbb1 = conj(dbecp[inkb0]) * becp[inkb + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb0 + nkb]) * becp[inkb]; + const thrust::complex dbb3 = conj(dbecp[inkb0 + nkb]) * becp[inkb + nkb]; + const FPTYPE tmp + = -fac + * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3) + .real(); + atomicAdd(force + iat * forcenl_nc + ipol, tmp); + } + } + ++iat; + sum += nprojs; + } // ia +} + +// kernel for DFTU force +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_onsite), + dim3(nbands_occ * ntype), + dim3(THREADS_PER_BLOCK), + 0, + 0, + wg_nc, + ntype, + forcenl_nc, + nbands, + ik, + nkb, + atom_nh, + atom_na, + tpiba, + d_wg, + reinterpret_cast*>(vu), + orbital_corr, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force); // array of data + + hipCheckOnDebug(); +} +// kernel for DeltaSpin force +template +void cal_force_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nbands_occ, + const int& wg_nc, + const int& ntype, + const int& forcenl_nc, + const int& nbands, + const int& ik, + const int& nkb, + const int* atom_nh, + const int* atom_na, + const FPTYPE& tpiba, + const 
FPTYPE* d_wg, + const FPTYPE* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* force) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_force_onsite), + dim3(nbands_occ * ntype), + dim3(THREADS_PER_BLOCK), + 0, + 0, + wg_nc, + ntype, + forcenl_nc, + nbands, + ik, + nkb, + atom_nh, + atom_na, + tpiba, + d_wg, + lambda, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + force); // array of data + + hipCheckOnDebug(); +} + template __global__ void saveVkbValues_( const int *gcar_zero_ptrs, diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu new file mode 100644 index 0000000000..31ec309a28 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/onsite_op.hip.cu @@ -0,0 +1,134 @@ +#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h" + +#include +#include +#include +#include + +namespace hamilt +{ + +#define THREADS_PER_BLOCK 256 + +template +__global__ void onsite_op(const int npm, + const int npol, + const int* ip_iat, + const int tnp, + const thrust::complex* lambda_coeff, + thrust::complex* ps, + const thrust::complex* becp) +{ + const int ip = blockIdx.x; + const int nbands = npm / npol; + for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x) + { + int ib2 = ib * npol; + int iat = ip_iat[ip]; + const int psind = ip * npm + ib2; + const int becpind = ib2 * tnp + ip; + ps[psind] += lambda_coeff[iat * 4] * becp[becpind] + lambda_coeff[iat * 4 + 2] * becp[becpind + tnp]; + ps[psind + 1] += lambda_coeff[iat * 4 + 1] * becp[becpind] + lambda_coeff[iat * 4 + 3] * becp[becpind + tnp]; + } +} + +template +__global__ void onsite_op(const int npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int tnp, + const thrust::complex* vu, + thrust::complex* ps, + const thrust::complex* becp) +{ + const int ip = blockIdx.x; + int m1 = ip_m[ip]; + if (m1 >= 0) + { + 
const int nbands = npm / npol; + for (int ib = threadIdx.x; ib < nbands; ib += blockDim.x) + { + int ib2 = ib * npol; + int iat = ip_iat[ip]; + const thrust::complex* vu_iat = vu + vu_begin_iat[iat]; + int orb_l = orb_l_iat[iat]; + int tlp1 = 2 * orb_l + 1; + int tlp1_2 = tlp1 * tlp1; + int ip2_begin = ip - m1; + int ip2_end = ip - m1 + tlp1; + const int psind = ip * npm + ib2; + for (int ip2 = ip2_begin; ip2 < ip2_end; ip2++) + { + const int becpind = ib2 * tnp + ip2; + int m2 = ip_m[ip2]; + const int index_mm = m1 * tlp1 + m2; + ps[psind] += vu_iat[index_mm] * becp[becpind] + vu_iat[index_mm + tlp1_2 * 2] * becp[becpind + tnp]; + ps[psind + 1] += vu_iat[index_mm + tlp1_2 * 1] * becp[becpind] + + vu_iat[index_mm + tlp1_2 * 3] * becp[becpind + tnp]; + } + } + } +} + +template +void hamilt::onsite_ps_op::operator()(const base_device::DEVICE_GPU* dev, + const int& npm, + const int npol, + const int* ip_iat, + const int& tnp, + const std::complex* lambda_coeff, + std::complex* ps, + const std::complex* becp) +{ + // denghui implement 20221019 + // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + hipLaunchKernelGGL(HIP_KERNEL_NAME(onsite_op), dim3(tnp), dim3(THREADS_PER_BLOCK), 0, 0, + npm, + npol, + ip_iat, + tnp, + reinterpret_cast*>(lambda_coeff), + reinterpret_cast*>(ps), // array of data + reinterpret_cast*>(becp)); // array of data + + hipCheckOnDebug(); +} + +template +void hamilt::onsite_ps_op::operator()(const base_device::DEVICE_GPU* dev, + const int& npm, + const int npol, + const int* orb_l_iat, + const int* ip_iat, + const int* ip_m, + const int* vu_begin_iat, + const int& tnp, + const std::complex* vu, + std::complex* ps, + const std::complex* becp) +{ + // denghui implement 20221109 + // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + hipLaunchKernelGGL(HIP_KERNEL_NAME(onsite_op), dim3(tnp), dim3(THREADS_PER_BLOCK), 0, 0, + npm, + npol, + orb_l_iat, + ip_iat, + ip_m, + vu_begin_iat, + 
tnp, + reinterpret_cast*>(vu), + reinterpret_cast*>(ps), // array of data + reinterpret_cast*>(becp)); // array of data + + hipCheckOnDebug(); + // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +} + +template struct onsite_ps_op; +template struct onsite_ps_op; + +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu index a5f8e553af..ef138c04cc 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/rocm/stress_op.hip.cu @@ -922,6 +922,185 @@ void synchronize_ptrs::operator()( hipErrcheck(hipMemcpy(ptr_out, ptr_in, sizeof(void*) * size, hipMemcpyHostToDevice)); } +template +__global__ void cal_stress_onsite( + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const thrust::complex *vu, + const int* orbital_corr, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + if(orbital_corr[it] == -1) return; + const int orbital_l = orbital_corr[it]; + const int ip_begin = orbital_l * orbital_l; + const int tlp1 = 2 * orbital_l + 1; + const int tlp1_2 = tlp1 * tlp1; + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + vu += 4 * tlp1_2 * atom_na[ii];// step for vu + } + + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib]; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int mm = threadIdx.x; mm < tlp1_2; mm += blockDim.x) { + const int m1 = mm / tlp1; + const int m2 = mm % tlp1; + 
const int ip1 = ip_begin + m1; + const int ip2 = ip_begin + m2; + const int inkb1 = sum + ip1 + ib2 * nkb; + const int inkb2 = sum + ip2 + ib2 * nkb; + thrust::complex ps[4] = {vu[mm], vu[mm + tlp1_2], vu[mm + 2 * tlp1_2], vu[mm + 3 * tlp1_2]}; + //out<<"\n ps = "< dbb0 = conj(dbecp[inkb1]) * becp[inkb2]; + const thrust::complex dbb1 = conj(dbecp[inkb1]) * becp[inkb2 + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb1 + nkb]) * becp[inkb2]; + const thrust::complex dbb3 = conj(dbecp[inkb1 + nkb]) * becp[inkb2 + nkb]; + stress_var -= fac * (ps[0] * dbb0 + ps[1] * dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); + } + ++iat; + sum+=nprojs; + vu += 4 * tlp1_2; + }//ia + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress, stress_var); + } +} + +template +__global__ void cal_stress_onsite( + const int nkb, + const int ntype, + const int wg_nc, + const int ik, + const int *atom_nh, + const int *atom_na, + const FPTYPE *d_wg, + const double* lambda, + const thrust::complex *becp, + const thrust::complex *dbecp, + FPTYPE *stress) +{ + const int ib = blockIdx.x / ntype; // index of loop-nbands + const int ib2 = ib * 2; + const int it = blockIdx.x % ntype; // index of loop-ntype + + int iat = 0; // calculate the begin of atomic index + int sum = 0; // calculate the begin of atomic-orbital index + for (int ii = 0; ii < it; ii++) { + iat += atom_na[ii]; + sum += atom_na[ii] * atom_nh[ii]; + } + + FPTYPE stress_var = 0; + const FPTYPE fac = d_wg[ik * wg_nc + ib]; + const int nprojs = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + const thrust::complex coefficients0(lambda[iat*3+2], 0.0); + const thrust::complex coefficients1(lambda[iat*3] , lambda[iat*3+1]); + const thrust::complex coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); + const thrust::complex coefficients3(-1 * lambda[iat*3+2], 0.0); + for (int ip = threadIdx.x; ip < nprojs; ip += blockDim.x) { + const int inkb = sum + ip + ib2 * nkb; + //out<<"\n ps = "< dbb0 = 
conj(dbecp[inkb]) * becp[inkb]; + const thrust::complex dbb1 = conj(dbecp[inkb]) * becp[inkb + nkb]; + const thrust::complex dbb2 = conj(dbecp[inkb + nkb]) * becp[inkb]; + const thrust::complex dbb3 = conj(dbecp[inkb + nkb]) * becp[inkb + nkb]; + stress_var -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real(); + } + ++iat; + sum+=nprojs; + }//ia + warp_reduce(stress_var); + if (threadIdx.x % WARP_SIZE == 0) { + atomicAdd(stress, stress_var); + } +} + +//kernel for DFTU stress +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_onsite), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + nkb, + ntype, + wg_nc, + ik, + atom_nh, + atom_na, + d_wg, + reinterpret_cast*>(vu), + orbital_corr, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + hipCheckOnDebug(); +} +// kernel for DeltaSpin stress +template +void cal_stress_nl_op::operator()(const base_device::DEVICE_GPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const double* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) +{ + hipLaunchKernelGGL(HIP_KERNEL_NAME(cal_stress_onsite), dim3(nbands_occ * ntype), dim3(THREADS_PER_BLOCK), 0, 0, + nkb, + ntype, + wg_nc, + ik, + atom_nh, + atom_na, + d_wg, + lambda, + reinterpret_cast*>(becp), + reinterpret_cast*>(dbecp), + stress);// array of data + + hipCheckOnDebug(); +} + template struct synchronize_ptrs; template struct cal_stress_mgga_op, 
base_device::DEVICE_GPU>; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp index 979955d3e8..0cd0e1ab96 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.cpp @@ -239,6 +239,121 @@ struct cal_stress_nl_op } #endif stress[ipol * 3 + jpol] += local_stress; + }; + // kernel for DFT+U + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) + { + FPTYPE local_stress = 0; + int iat = 0, sum = 0; + for (int it = 0; it < ntype; it++) + { + const int orbital_l = orbital_corr[it]; + const int nproj = atom_nh[it]; + if(orbital_l == -1) + { + sum += nproj * atom_na[it]; + continue; + } + const int ip_begin = orbital_l * orbital_l; + const int ip_end = (orbital_l + 1) * (orbital_l + 1); + const int tlp1 = 2 * orbital_l + 1; + const int tlp1_2 = tlp1 * tlp1; + for (int ia = 0; ia < atom_na[it]; ia++) + { + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib]; + for (int ip1 = ip_begin; ip1 < ip_end; ip1++) + { + const int m1 = ip1 - ip_begin; + const int inkb1 = ib2 * nkb + sum + ia * nproj + ip1; + // out<<"\n ps = "< ps[4]; + for(int i = 0; i < 4; i++) + { + ps[i] = vu[(i * tlp1_2 + m1 * tlp1 + m2)]; + } + const int inkb2 = ib2 * nkb + sum + ia * nproj + ip2; + + const std::complex dbb0 = conj(dbecp[inkb1]) * becp[inkb2]; + const std::complex dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb2]; + const std::complex dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb2]; + const std::complex dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb2]; + local_stress -= fac * (ps[0] * dbb0 + ps[1] * 
dbb1 + ps[2] * dbb2 + ps[3] * dbb3).real(); + } + } // end ip + }// ib + vu += 4 * tlp1_2;// step for vu + }// ia + sum += atom_na[it] * nproj; + iat += atom_na[it]; + } // end it + *stress += local_stress; + }; + // kernel for DeltaSpin + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const FPTYPE* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress) + { + FPTYPE local_stress = 0; + int iat0 = 0, sum = 0; + for (int it = 0; it < ntype; it++) + { + const int nproj = atom_nh[it]; + for (int ia = 0; ia < atom_na[it]; ia++) + { + int iat = iat0 + ia; + const std::complex coefficients0(lambda[iat*3+2], 0.0); + const std::complex coefficients1(lambda[iat*3] , lambda[iat*3+1]); + const std::complex coefficients2(lambda[iat*3] , -1 * lambda[iat*3+1]); + const std::complex coefficients3(-1 * lambda[iat*3+2], 0.0); + for (int ib = 0; ib < nbands_occ; ib++) + { + const int ib2 = ib*2; + FPTYPE fac = d_wg[ik * wg_nc + ib]; + for (int ip = 0; ip < nproj; ip++) + { + const int inkb1 = ib2 * nkb + sum + ia * nproj + ip; + + const std::complex dbb0 = conj(dbecp[inkb1]) * becp[inkb1]; + const std::complex dbb1 = conj(dbecp[inkb1]) * becp[nkb + inkb1]; + const std::complex dbb2 = conj(dbecp[nkb + inkb1]) * becp[inkb1]; + const std::complex dbb3 = conj(dbecp[nkb + inkb1]) * becp[nkb + inkb1]; + local_stress -= fac * (coefficients0 * dbb0 + coefficients1 * dbb1 + coefficients2 * dbb2 + coefficients3 * dbb3).real(); + } // end ip + }// ib + }// ia + sum += atom_na[it] * nproj; + iat0 += atom_na[it]; + } // end it + *stress += local_stress; } }; diff --git a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h index af7d51523d..7fecd96d75 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h +++ 
b/source/module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h @@ -122,6 +122,35 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // kernel for DFT+U + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); + // kernel for DeltaSpin + void operator()(const base_device::DEVICE_CPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const double* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; template @@ -298,6 +327,35 @@ struct cal_stress_nl_op const std::complex* becp, const std::complex* dbecp, FPTYPE* stress); + // kernel for DFT+U + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const std::complex* vu, + const int* orbital_corr, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); + // kernel for DeltaSpin + void operator()(const base_device::DEVICE_GPU* ctx, + const int& nkb, + const int& nbands_occ, + const int& ntype, + const int& wg_nc, + const int& ik, + const int* atom_nh, + const int* atom_na, + const FPTYPE* d_wg, + const double* lambda, + const std::complex* becp, + const std::complex* dbecp, + FPTYPE* stress); }; // cpu version first, gpu version later diff --git a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp index 5fddaa0e84..aa28b5abe2 100644 --- 
a/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp +++ b/source/module_hamilt_pw/hamilt_pwdft/nonlocal_maths.hpp @@ -19,12 +19,21 @@ class Nonlocal_maths Nonlocal_maths(const pseudopot_cell_vnl* nlpp_in, const UnitCell* ucell_in) { this->device = base_device::get_device_type(this->ctx); - this->nlpp_ = nlpp_in; + this->nhtol_ = nlpp_in->nhtol; + this->lmax_ = nlpp_in->lmaxkb; + this->ucell_ = ucell_in; + } + Nonlocal_maths(const ModuleBase::matrix& nhtol, const int lmax, const UnitCell* ucell_in) + { + this->device = base_device::get_device_type(this->ctx); + this->nhtol_ = nhtol; + this->lmax_ = lmax; this->ucell_ = ucell_in; } private: - const pseudopot_cell_vnl* nlpp_; + ModuleBase::matrix nhtol_; + int lmax_; const UnitCell* ucell_; Device* ctx = {}; @@ -33,14 +42,31 @@ class Nonlocal_maths public: // functions - /// calculate the G+K vectors - std::vector cal_gk(int ik, const ModulePW::PW_Basis_K* wfc_basis); - /// calculate the sperical bessel function for projections + /** + * @brief this function prepares all the q (G+k) information in one contiguous memory block + * including the x, y and z components, its norm and the reciprocal of its norm + * + * @param ik index of k point + * @param pw_basis the plane wave basis + * @return std::vector 1d contiguous memory block containing all the q information. The + * first 3*npw are data of x, y and z components, the next 2*npw are data of norm and 1/norm. + * This is beneficial for GPU memory access. 
+ */ + std::vector cal_gk(int ik, const ModulePW::PW_Basis_K* pw_basis); + /** + * @brief calculate the real spherical harmonic functions on cpu (and optionally send to gpu, + * if gpu is available) + * + * @param lmax [in] maximum angular momentum to calculate + * @param npw [in] number of G+k vectors + * @param gk_in [in] the G+k vectors + * @param ylm [out] the spherical harmonic functions + */ void cal_ylm(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm); /// calculate the derivate of the sperical bessel function for projections void cal_ylm_deri(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm_deri); /// calculate the (-i)^l factors - std::vector> cal_pref(int it); + std::vector> cal_pref(int it, const int nh); /// calculate the vkb matrix for this atom /// vkb = sum_lm (-i)^l * ylm(g^) * vq(g^) * sk(g^) void cal_vkb(int it, @@ -99,68 +125,81 @@ class Nonlocal_maths const FPTYPE& x); }; -// cal_gk +// prepare a memory block containing information of vector G+k, this function can be named as eval_q or eval_gk +// seems this operation is not on gpu template -std::vector Nonlocal_maths::cal_gk(int ik, const ModulePW::PW_Basis_K* wfc_basis) +std::vector Nonlocal_maths::cal_gk(int ik, const ModulePW::PW_Basis_K* pw_basis) { - int npw = wfc_basis->npwk[ik]; + int npw = pw_basis->npwk[ik]; std::vector gk(npw * 5); - ModuleBase::Vector3 tmp; + ModuleBase::Vector3 q; for (int ig = 0; ig < npw; ++ig) { - tmp = wfc_basis->getgpluskcar(ik, ig); - gk[ig * 3] = tmp.x; - gk[ig * 3 + 1] = tmp.y; - gk[ig * 3 + 2] = tmp.z; - FPTYPE norm = sqrt(tmp.norm2()); - gk[3 * npw + ig] = norm * this->ucell_->tpiba; - gk[4 * npw + ig] = norm < 1e-8 ? 0.0 : 1.0 / norm * this->ucell_->tpiba; + // written in memory block from 0 to 3*npw. 
This is like a matrix with npw rows and 3 columns + q = pw_basis->getgpluskcar(ik, ig); + gk[ig * 3] = q.x; + gk[ig * 3 + 1] = q.y; + gk[ig * 3 + 2] = q.z; + // the following written in memory block from 3*npw to 5*npw, the excess 2*npw is for norm and 1/norm + // for memory consecutive consideration, there are blocks storing the norm and 1/norm. + FPTYPE norm = sqrt(q.norm2()); + gk[3 * npw + ig] = norm * this->ucell_->tpiba; // one line with length npw, storing the norm + gk[4 * npw + ig] = norm < 1e-8 ? 0.0 : 1.0 / norm * this->ucell_->tpiba; // one line with length npw, storing 1/norm } return gk; } -// cal_ylm +// tabulate the spherical haromonic functions up to lmax. The q vector is given as input. +// I would rather call this function as cal_ylm_cpu2gpu, distincting from the pure cpu implementation template -void Nonlocal_maths::cal_ylm(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm) +void Nonlocal_maths::cal_ylm(int lmax, int npw, const FPTYPE* q, FPTYPE* ylm) { - - const int x1 = (lmax + 1) * (lmax + 1); - + const int ntot_ylm = (lmax + 1) * (lmax + 1); if (this->device == base_device::GpuDevice) { + // alias using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; - std::vector ylm_cpu(x1 * npw); - ModuleBase::YlmReal::Ylm_Real(cpu_ctx, x1, npw, gk_in, ylm_cpu.data()); + // allocate + std::vector ylm_cpu(ntot_ylm * npw); + // calculate + ModuleBase::YlmReal::Ylm_Real(cpu_ctx, ntot_ylm, npw, q, ylm_cpu.data()); + // send from cpu to gpu syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, ylm, ylm_cpu.data(), ylm_cpu.size()); } else { - ModuleBase::YlmReal::Ylm_Real(cpu_ctx, x1, npw, gk_in, ylm); + // calculate. Why not implement this logic branch inside some function??? + ModuleBase::YlmReal::Ylm_Real(cpu_ctx, ntot_ylm, npw, q, ylm); } - return; } -// cal_ylm_deri + +// this function calculate the numerical derivate of the spherical harmonic functions respect to the G vector... +// maybe called eval_dylmdq_cpu2gpu? 
template -void Nonlocal_maths::cal_ylm_deri(int lmax, int npw, const FPTYPE* gk_in, FPTYPE* ylm_deri) +void Nonlocal_maths::cal_ylm_deri(int lmax, int npw, const FPTYPE* q, FPTYPE* out) { - const int x1 = (lmax + 1) * (lmax + 1); + const int ntot_ylm = (lmax + 1) * (lmax + 1); if (this->device == base_device::GpuDevice) { - std::vector dylm(3 * x1 * npw); + // alias + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + // allocate + std::vector dylmdq_cpu(3 * ntot_ylm * npw); + // calculate for (int ipol = 0; ipol < 3; ipol++) { - Nonlocal_maths::dylmr2(x1, npw, gk_in, &dylm[ipol * x1 * npw], ipol); + Nonlocal_maths::dylmr2(ntot_ylm, npw, q, &dylmdq_cpu[ipol * ntot_ylm * npw], ipol); } - using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; - syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, ylm_deri, dylm.data(), dylm.size()); + // send from cpu to gpu + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, out, dylmdq_cpu.data(), dylmdq_cpu.size()); } else { for (int ipol = 0; ipol < 3; ipol++) { - Nonlocal_maths::dylmr2(x1, npw, gk_in, &ylm_deri[ipol * x1 * npw], ipol); + Nonlocal_maths::dylmr2(ntot_ylm, npw, q, &out[ipol * ntot_ylm * npw], ipol); } } @@ -168,13 +207,16 @@ void Nonlocal_maths::cal_ylm_deri(int lmax, int npw, const FPTYP } // cal_pref template -std::vector> Nonlocal_maths::cal_pref(int it) +std::vector> Nonlocal_maths::cal_pref(int it, const int nh) { - const int nh = this->ucell_->atoms[it].ncpp.nh; + // nh is the total number of m-channels of the beta functions + // for example, if angular momentum of beta functions are 0, 0, 1, 1, 1, 1, the nh will be + // 1 + 1 + 3 + 3 + 3 + 3 = 14 std::vector> pref(nh); for (int ih = 0; ih < nh; ih++) { - pref[ih] = std::pow(std::complex(0.0, -1.0), this->nlpp_->nhtol(it, ih)); + pref[ih] = std::pow(std::complex(0.0, -1.0), this->nhtol_(it, ih)); + // it is actually nh2l, which means to get the angular momentum... 
} return pref; } @@ -193,16 +235,16 @@ void Nonlocal_maths::cal_vkb(int it, { int ih = 0; // loop over all beta functions - for (int nb = 0; nb < this->ucell_->atoms[it].ncpp.nbeta; nb++) + for (int ib = 0; ib < this->ucell_->atoms[it].ncpp.nbeta; ib++) { - int l = this->nlpp_->nhtol(it, ih); + int l = this->nhtol_(it, ih); // loop over all m angular momentum for (int m = 0; m < 2 * l + 1; m++) { int lm = l * l + m; std::complex* vkb_ptr = &vkb_out[ih * npw]; const FPTYPE* ylm_ptr = &ylm_in[lm * npw]; - const FPTYPE* vq_ptr = &vq_in[nb * npw]; + const FPTYPE* vq_ptr = &vq_in[ib * npw]; // loop over all G-vectors for (int ig = 0; ig < npw; ig++) { @@ -230,12 +272,12 @@ void Nonlocal_maths::cal_vkb_deri(int it, const FPTYPE* gk_in, std::complex* vkb_out) { - const int x1 = (this->nlpp_->lmaxkb + 1) * (this->nlpp_->lmaxkb + 1); + const int x1 = (this->lmax_ + 1) * (this->lmax_ + 1); int ih = 0; // loop over all beta functions for (int nb = 0; nb < this->ucell_->atoms[it].ncpp.nbeta; nb++) { - const int l = this->nlpp_->nhtol(it, ih); + const int l = this->nhtol_(it, ih); // loop over all m angular momentum for (int m = 0; m < 2 * l + 1; m++) { @@ -262,7 +304,7 @@ void Nonlocal_maths::cal_vkb_deri(int it, const FPTYPE* ylm_deri_ptr1 = &ylm_deri_in[(ipol * x1 + lm) * npw]; const FPTYPE* ylm_deri_ptr2 = &ylm_deri_in[(jpol * x1 + lm) * npw]; const FPTYPE* vq_deri_ptr = &vq_deri_in[nb * npw]; - const FPTYPE* gkn = &gk_in[4 * npw]; + const FPTYPE* qnorm = &gk_in[4 * npw]; for (int ig = 0; ig < npw; ig++) { vkb_ptr[ig] -= (gk_in[ig * 3 + ipol] * ylm_deri_ptr2[ig] + gk_in[ig * 3 + jpol] * ylm_deri_ptr1[ig]) @@ -273,7 +315,7 @@ void Nonlocal_maths::cal_vkb_deri(int it, for (int ig = 0; ig < npw; ig++) { vkb_ptr[ig] -= 2.0 * ylm_ptr[ig] * vq_deri_ptr[ig] * sk_in[ig] * pref_in[ih] * gk_in[ig * 3 + ipol] - * gk_in[ig * 3 + jpol] * gkn[ig]; + * gk_in[ig * 3 + jpol] * qnorm[ig]; } ih++; } @@ -322,15 +364,16 @@ void Nonlocal_maths::cal_dvkb_index(const int nbeta, int* indexes) { int 
ih = 0; - const int x1 = (this->nlpp_->lmaxkb + 1) * (this->nlpp_->lmaxkb + 1); + const int x1 = (this->lmax_ + 1) * (this->lmax_ + 1); for (int nb = 0; nb < nbeta; nb++) { int l = nhtol[it * nhtol_nc + ih]; for (int m = 0; m < 2 * l + 1; m++) { + //std::cout << "in function cal_dvkb_index, nhtol(" << it << ", " << ih << ") = " << l << std::endl; int lm = l * l + m; - indexes[ih * 4] = lm; - indexes[ih * 4 + 1] = nb; + indexes[ih * 4] = lm; // the index of ylm matrix, for given l and m, together with ig to get value + indexes[ih * 4 + 1] = nb; // the iproj of present atom type indexes[ih * 4 + 2] = (ipol * x1 + lm); indexes[ih * 4 + 3] = (jpol * x1 + lm); diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp new file mode 100644 index 0000000000..d4b7e51b65 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.cpp @@ -0,0 +1,1023 @@ +#include "onsite_proj_tools.h" + +#include "module_base/math_polyint.h" +#include "module_base/math_ylmreal.h" +#include "module_base/memory.h" +#include "module_base/timer.h" +#include "module_base/tool_title.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/force_op.h" +#include "nonlocal_maths.hpp" + +#include + +namespace hamilt +{ +template +Onsite_Proj_tools::Onsite_Proj_tools(const pseudopot_cell_vnl* nlpp_in, + const UnitCell* ucell_in, + const psi::Psi, Device>* psi_in, + const K_Vectors* kv_in, + const ModulePW::PW_Basis_K* wfc_basis_in, + const Structure_Factor* sf_in, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb) + : nlpp_(nlpp_in), ucell_(ucell_in), psi_(psi_in), kv_(kv_in), wfc_basis_(wfc_basis_in), sf_(sf_in) +{ + // get the device context + this->device = base_device::get_device_type(this->ctx); + + // seems kvec_c never used... 
+ this->kvec_c = this->wfc_basis_->template get_kvec_c_data(); + // the following is important for calculating the whole contribution to + // Hamiltonian or force, stress: sum{nk} fnk*sum_{ij}Dij + // among, Dij is deeq. + // For DFT+U and other projection involved operators, deeq also plays. + this->deeq = this->nlpp_->template get_deeq_data(); + this->deeq_dims[0] = this->nlpp_->deeq.getBound1(); + this->deeq_dims[1] = this->nlpp_->deeq.getBound2(); + this->deeq_dims[2] = this->nlpp_->deeq.getBound3(); + this->deeq_dims[3] = this->nlpp_->deeq.getBound4(); + this->deeq_nc = this->nlpp_->template get_deeq_nc_data(); + this->deeq_nc_dims[0] = this->nlpp_->deeq_nc.getBound1(); + this->deeq_nc_dims[1] = this->nlpp_->deeq_nc.getBound2(); + this->deeq_nc_dims[2] = this->nlpp_->deeq_nc.getBound3(); + this->deeq_nc_dims[3] = this->nlpp_->deeq_nc.getBound4(); + // ultrasoft pseudopotential + this->qq_nt = this->nlpp_->template get_qq_nt_data(); + // total number of projectors (all types, all atoms, not m-distinguishive) + this->nkb = nlpp_->nkb; + // not clear why do these following... + this->nbands = psi_->get_nbands(); + this->max_npw = wfc_basis_->npwk_max; + this->ntype = ucell_->ntype; + // because the code is needed to reuse, therefore all other parts should be general + // and not strongly depend on any structure of class pseudopot_cell_vnl, therefore + // here unpack all needed information. + this->tabtpr = &(nlpp_->tab); + this->nhtol = &(nlpp_->nhtol); + this->lprojmax = nlpp_->lmaxkb; + // There is a contribution for jh<>ih in US case or multi projectors case + // Actually, the judge of nondiagonal should be done on every atom type + this->nondiagonal = (PARAM.globalv.use_uspp || this->nlpp_->multi_proj) ? 
true : false; + + this->nproj.resize(this->ntype); + std::vector nch(this->ntype); + for (int it = 0; it < this->ntype; it++) + { + this->nproj[it] = this->ucell_->atoms[it].ncpp.nbeta; + nch[it] = this->ucell_->atoms[it].ncpp.nh; + } + // allocate memory + this->allocate_memory(wg, ekb, this->nproj, nch); + this->ppcell_vkb + = (this->device == base_device::GpuDevice) ? this->nlpp_->template get_vkb_data() : this->nlpp_->vkb.c; +} + +template +Onsite_Proj_tools::Onsite_Proj_tools( + const std::vector& nproj, // number of projectors for each atom type + const std::vector& lproj, + const ModuleBase::realArray& tab, // radials' spherical bessel transform + const ModuleBase::matrix& nhtol, // (it, ich) -> l, the ich is (l, m)-distinctive index + std::complex* vkb_buf, // the vkb buffer + const UnitCell* ucell_in, + const psi::Psi, Device>* psi_in, + const K_Vectors* kv_in, + const ModulePW::PW_Basis_K* wfc_basis_in, + const Structure_Factor* sf_in, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb) +{ + // this is a constructor for general case, including vnl, dftu, deltaspin, deepks, etc. + // what is needed for this kind of constructor? + + // ntype: from unitcell + // nproj: number of projectors own by each atom type + // projs: beta function or radial function + // lproj: angular momentum of projectors + // rgrid: radial grid + // deeq: the Dij matrix, Hubbard parameters or other quantities... + + // what are already programmed to be needed? 
+ + // tab: the spherical transform of radial functions, with q = linspace(0, GlobalV::NQX, GlobalV::DQ) + // nhtol: the (it, ich) -> l, the ich is (l, m)-distinctive index + // nkb: total # of projectors <- std::accumulate(nproj.begin(), nproj.end(), 0) + // atom_nh: # of (l, m)-distinctive projectors for each atom type + // h_atom_nh: counterpart of atom_nh on host + // max_nh: std::max_element(atom_nh.begin(), atom_nh.end()) + + // in conclusion, this constructor needs the following individual information: + + // nproj + // tab (projs is not needed, should be calculated elsewhere) + // lproj + // deeq, with its dims. it will be good to pass the whole realarray + + // what can be built here + // nhtol + // nkb + // atom_nh, h_atom_nh, max_nh + // deeq_dims + + ucell_ = ucell_in; + psi_ = psi_in; + kv_ = kv_in; + wfc_basis_ = wfc_basis_in; + sf_ = sf_in; + + this->device = base_device::get_device_type(this->ctx); + + this->kvec_c = this->wfc_basis_->template get_kvec_c_data(); + // skip deeq, qq_nt + this->nbands = psi_->get_nbands(); + this->max_npw = wfc_basis_->npwk_max; + this->ntype = nproj.size(); + this->tabtpr = &tab; + + this->nhtol = &nhtol; + this->lprojmax = *std::max_element(lproj.begin(), lproj.end()); + this->nondiagonal = false; + + this->nkb = 0; + this->h_atom_nh.resize(this->ntype, 0); + int iproj = 0; + for (int it = 0; it < this->ntype; it++) + { + int nproj_it = nproj[it]; + for (int ip = 0; ip < nproj_it; ip++) + { + this->h_atom_nh[it] += 2 * lproj[iproj] + 1; + this->nkb += (2 * lproj[iproj] + 1) * this->ucell_->atoms[it].na; + iproj++; + } + } + this->nproj = nproj; + this->allocate_memory(wg, ekb, nproj, this->h_atom_nh); + // what is this??? 
seems it is not on gpu + this->ppcell_vkb = vkb_buf; +} + +template +Onsite_Proj_tools::~Onsite_Proj_tools() +{ + // delete memory + delete_memory(); +} + +template +void Onsite_Proj_tools::allocate_memory(const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb, + const std::vector& nproj, + const std::vector& nch) +{ + // allocate memory + + // prepare the memory of stress and init some variables: + this->h_atom_nh.resize(this->ntype); + this->h_atom_na.resize(this->ntype); + for (int it = 0; it < this->ntype; it++) + { + h_atom_nh[it] = nch[it]; + h_atom_na[it] = this->ucell_->atoms[it].na; + } + + int nprojmax = 0; + for (int it = 0; it < this->ntype; it++) // loop all elements + { + nprojmax = std::max(nproj[it], nprojmax); // 0000000000000000000000000 + this->max_nh = std::max(h_atom_nh[it], max_nh); + } + + // allocate the memory for vkb and vkb_deri. + if (this->device == base_device::GpuDevice) + { + resmem_int_op()(this->ctx, this->d_dvkb_indexes, max_nh * 4); + } + + resmem_var_op()(this->ctx, this->hd_vq, nprojmax * max_npw); + resmem_var_op()(this->ctx, this->hd_vq_deri, nprojmax * max_npw); + resmem_var_op()(this->ctx, this->hd_ylm, (lprojmax + 1) * (lprojmax + 1) * max_npw); + resmem_var_op()(this->ctx, this->hd_ylm_deri, 3 * (lprojmax + 1) * (lprojmax + 1) * max_npw); + + if (this->device == base_device::GpuDevice) + { + resmem_var_op()(this->ctx, d_wg, wg.nr * wg.nc); + resmem_var_op()(this->ctx, d_ekb, ekb.nr * ekb.nc); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, wg.c, wg.nr * wg.nc); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_ekb, ekb.c, ekb.nr * ekb.nc); + resmem_int_op()(this->ctx, atom_nh, this->ntype); + resmem_int_op()(this->ctx, atom_na, this->ntype); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_nh, h_atom_nh.data(), this->ntype); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, atom_na, h_atom_na.data(), this->ntype); + + resmem_var_op()(this->ctx, d_g_plus_k, max_npw * 5); + resmem_var_op()(this->ctx, 
d_pref, max_nh); + resmem_var_op()(this->ctx, d_vq_tab, this->tabtpr->getSize()); + resmem_complex_op()(this->ctx, d_pref_in, max_nh); + } + else + { + this->d_wg = wg.c; + this->d_ekb = ekb.c; + this->atom_nh = h_atom_nh.data(); + this->atom_na = h_atom_na.data(); + } +} + +template +void Onsite_Proj_tools::delete_memory() +{ + // delete memory + + delmem_var_op()(this->ctx, hd_vq); + delmem_var_op()(this->ctx, hd_vq_deri); + delmem_var_op()(this->ctx, hd_ylm); + delmem_var_op()(this->ctx, hd_ylm_deri); + + // delete memory on GPU + if (this->device == base_device::GpuDevice) + { + delmem_var_op()(this->ctx, d_wg); + delmem_var_op()(this->ctx, d_ekb); + delmem_int_op()(this->ctx, atom_nh); + delmem_int_op()(this->ctx, atom_na); + delmem_var_op()(this->ctx, d_g_plus_k); + delmem_var_op()(this->ctx, d_pref); + delmem_var_op()(this->ctx, d_vq_tab); + delmem_complex_op()(this->ctx, this->d_pref_in); + delmem_int_op()(this->ctx, d_dvkb_indexes); + } + + if (becp != nullptr) + { + delmem_complex_op()(this->ctx, becp); + delmem_complex_op()(this->ctx, hd_sk); + } + if (dbecp != nullptr) + { + delmem_complex_op()(this->ctx, dbecp); + } + if (this->pre_ik_f != -1) + { + delmem_int_op()(this->ctx, gcar_zero_indexes); + delmem_complex_op()(this->ctx, vkb_save); + delmem_var_op()(this->ctx, gcar); + } +} + +// cal_becp +// starts from vkb (nkb, ng) table +// it should be merely the multiplication of matrix (vkb, ng) * (ng, nbands) -> (vkb, nbands) +// should be irrelevant with what the matrix is. +// the vkb index generation should be maintained elsewhere. +// vkb already has atomic position information, calculated from the vq and sk +// . the multiplication with sk should be within specific operator +// because the atom selection task is operator-specific. 
+template +void Onsite_Proj_tools::cal_becp(int ik, + int npm, + std::complex* becp_in, + const std::complex* ppsi_in) +{ + ModuleBase::TITLE("Onsite_Proj_tools", "cal_becp"); + ModuleBase::timer::tick("Onsite_Proj_tools", "cal_becp"); + + const int npol = this->ucell_->get_npol(); + const std::complex* ppsi = ppsi_in == nullptr ? &(this->psi_[0](ik, 0, 0)) : ppsi_in; + const int npw = this->wfc_basis_->npwk[ik]; + if (becp_in == nullptr && this->becp == nullptr) + { + resmem_complex_op()(this->ctx, becp, this->nbands * npol * this->nkb); + } + std::complex* becp_tmp = becp_in == nullptr ? this->becp : becp_in; + const int size_becp_act = npm * npol * this->nkb; + if (ik != this->current_ik) // different ik, need to recalculate vkb + { + const int size_becp = this->nbands * npol * this->nkb; + if (this->becp == nullptr) + { + resmem_complex_op()(this->ctx, becp, size_becp); + } + + // prepare math tools + Nonlocal_maths maths(*(this->nhtol), this->lprojmax, this->ucell_); + + std::complex* vkb_ptr = this->ppcell_vkb; + + // calculate G+K + this->g_plus_k = maths.cal_gk(ik, this->wfc_basis_); + FPTYPE *gk = g_plus_k.data(), *vq_tb = this->tabtpr->ptr; + // vq_tb has dimension (ntype, nproj, GlobalV::NQX) + + // calculate sk + resmem_complex_op()(ctx, hd_sk, this->ucell_->nat * npw); + this->sf_->get_sk(ctx, ik, this->wfc_basis_, hd_sk); + std::complex* d_sk = this->hd_sk; + // prepare ylm,size: (lmax+1)^2 * this->max_npw + const int lmax_ = this->lprojmax; + maths.cal_ylm(lmax_, npw, g_plus_k.data(), hd_ylm); + + // DEBUG: ONCE YOU CHECK ylm VALUES, YOU UNCOMMENT THE FOLLOW + // std::vector> qs(npw); + // for (int ig = 0; ig < npw; ig++) + // { + // qs[ig] = this->wfc_basis_->getgpluskcar(ik, ig); + // } + // const int total_lm = (lmax_ + 1) * (lmax_ + 1); + // ModuleBase::matrix ylmref(total_lm, npw); + // ModuleBase::YlmReal::Ylm_Real(total_lm, npw, qs.data(), ylmref); + // std::cout << "Compare the Ylm values of two methods:" << std::endl; + // int lm = 0; + // 
for(int l_ = 0; l_ < lmax_ + 1; l_++) + // { + // for(int m_ = -l_; m_ <= l_; m_++) + // { + // std::cout << "l = " << l_ << " m = " << m_ << std::endl; + // lm = l_ * l_ + l_ + m_; + // for(int ig = 0; ig < npw; ig++) + // { + // std::cout << "[" << ylmref(lm, ig) << " " << hd_ylm[lm * npw + ig] << "]" << " "; + // } + // std::cout << std::endl; + // } + // std::cout << std::endl; + // } + // ModuleBase::WARNING_QUIT("Onsite_Proj_tools", "cal_becp"); + + if (this->device == base_device::GpuDevice) + { + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_g_plus_k, g_plus_k.data(), g_plus_k.size()); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_vq_tab, this->tabtpr->ptr, this->tabtpr->getSize()); + gk = d_g_plus_k; + vq_tb = d_vq_tab; + } + + // int vkb_size = 0; + for (int it = 0; it < this->ucell_->ntype; it++) // loop all elements + { + // interpolate (it, 0..nproj[it], 0..npw) to get hd_vq + cal_vq_op()(this->ctx, + vq_tb, // its data is correct, dimension (ntype, nprojmax, GlobalV::NQX) + it, // but maybe it is (ntype, nprojmax*npol, GlobalV::NQX) + gk, + npw, + this->tabtpr->getBound2(), + this->tabtpr->getBound3(), + PARAM.globalv.dq, + nproj[it], + hd_vq); // hd_vq has dimension (nprojmax, npwx), this size will be the largest needed. 
+ + // DEBUG: ONCE YOU CHECK vq VALUES, YOU UNCOMMENT THE FOLLOWING LINE + // for(int ip = 0; ip < nproj[it]; ip++) + // { + // std::cout << "projector #" << ip << " of atom type " << it << std::endl; + // for(int iq = 0; iq < npw; iq++) + // { + // std::cout << hd_vq[ip * npw + iq] << " "; + // } + // std::cout << std::endl; + // } + // std::cout << std::endl; + + // prepare(-i)^l, size: nh + std::vector> pref = maths.cal_pref(it, h_atom_nh[it]); + const int nh = pref.size(); + this->dvkb_indexes.resize(nh * 4); + // print the value of nhtol + // nhtol->print(std::cout); // as checked, nhtol works as expected + maths.cal_dvkb_index(nproj[it], this->nhtol->c, this->nhtol->nc, npw, it, 0, 0, this->dvkb_indexes.data()); + + if (this->device == base_device::GpuDevice) + { + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh); + } + + for (int ia = 0; ia < h_atom_na[it]; ia++) + { + if (this->device == base_device::CpuDevice) + { + d_pref_in = pref.data(); + d_dvkb_indexes = dvkb_indexes.data(); + } + cal_vkb_op()(this->ctx, nh, npw, d_dvkb_indexes, hd_vq, hd_ylm, d_sk, d_pref_in, vkb_ptr); + vkb_ptr += nh * npw; // vkb_ptr has dimension (nhtot, npwx), this size will be the largest needed. + d_sk += npw; + // vkb_size += nh * npw; + } + } + this->current_ik = ik; + } + // DEBUG: ONCE YOU CHECK vkb VALUES, YOU UNCOMMENT THE FOLLOWING LINE + // for(int i = 0; i < vkb_size; i++) + // { + // if (i % npw == 0) + // { + // std::cout << "The #" << i / npw << " projector" << std::endl; + // } + // std::cout << this->ppcell_vkb[i] << " "; + // } + // std::cout << std::endl; + // ModuleBase::WARNING_QUIT("Onsite_Proj_tools", "cal_becp"); + + // PLAN: seperate the lower and upper into two parts, individually called. 
+ const char transa = 'C'; + const char transb = 'N'; + int npm_npol = npm * npol; + gemm_op()(this->ctx, + transa, + transb, + this->nkb, + npm_npol, // nbands(occ)*npol + npw, + &ModuleBase::ONE, + this->ppcell_vkb, + npw, + ppsi, + this->max_npw, + &ModuleBase::ZERO, + becp_tmp, + this->nkb); + + if (this->device == base_device::GpuDevice) + { + std::complex* h_becp = nullptr; + resmem_complex_h_op()(this->cpu_ctx, h_becp, size_becp_act); + syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, becp_tmp, size_becp_act); + Parallel_Reduce::reduce_pool(h_becp, size_becp_act); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, becp_tmp, h_becp, size_becp_act); + delmem_complex_h_op()(this->cpu_ctx, h_becp); + } + else + { + Parallel_Reduce::reduce_pool(becp_tmp, size_becp_act); + } + // DEBUG: ONCE YOU CHECK becp VALUES, YOU UNCOMMENT THE FOLLOWING LINE + // std::cout << "ik: " << ik << std::endl; + // for (int i = 0; i < npm_npol*this->nkb; i++) + // { + // std::cout << "becp[" << i << "]: " << becp[i] << std::endl; + // } + ModuleBase::timer::tick("Onsite_Proj_tools", "cal_becp"); +} + +// cal_dbecp +template +void Onsite_Proj_tools::cal_dbecp_s(int ik, int npm, int ipol, int jpol) +{ + ModuleBase::TITLE("Onsite_Proj_tools", "cal_dbecp_s"); + ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_s"); + this->current_ik = -1; // reset the current ik, vkb has been reused to save dvkb + const int npol = this->ucell_->get_npol(); + const int size_becp = this->nbands * npol * this->nkb; + const int npm_npol = npm * npol; + if (this->dbecp == nullptr) + { + resmem_complex_op()(this->ctx, dbecp, size_becp); + } + + // prepare math tools + Nonlocal_maths maths(*(this->nhtol), this->lprojmax, this->ucell_); + + const std::complex* ppsi = &(this->psi_[0](ik, 0, 0)); + const int npw = this->wfc_basis_->npwk[ik]; + std::complex* vkb_deri_ptr = this->ppcell_vkb; + + if (this->pre_ik_s != ik) + { // k point has changed, we need to recalculate the g_plus_k + // 
this->g_plus_k = maths.cal_gk(ik, this->wfc_basis_); //has been calculated by cal_becp + + const int lmax_ = this->lprojmax; + // prepare ylm,size: (lmax+1)^2 * this->max_npw + // maths.cal_ylm(lmax_, npw, g_plus_k.data(), hd_ylm); //has been calculated by cal_becp + maths.cal_ylm_deri(lmax_, npw, g_plus_k.data(), hd_ylm_deri); + this->pre_ik_s = ik; + } + FPTYPE *gk = g_plus_k.data(), *vq_tb = this->tabtpr->ptr; + std::complex* d_sk = this->hd_sk; + if (this->device == base_device::GpuDevice) + { + gk = d_g_plus_k; + vq_tb = d_vq_tab; + } + + for (int it = 0; it < this->ucell_->ntype; it++) // loop all elements + { + cal_vq_op()(this->ctx, + vq_tb, + it, + gk, + npw, + this->tabtpr->getBound2(), + this->tabtpr->getBound3(), + PARAM.globalv.dq, + this->nproj[it], + hd_vq); + cal_vq_deri_op()(this->ctx, + vq_tb, + it, + gk, + npw, + this->tabtpr->getBound2(), + this->tabtpr->getBound3(), + PARAM.globalv.dq, + this->nproj[it], + hd_vq_deri); + + // prepare(-i)^l, size: nh + std::vector> pref = maths.cal_pref(it, h_atom_nh[it]); + int nh = pref.size(); + // prepare indexes for calculate vkb_deri + this->dvkb_indexes.resize(nh * 4); + maths.cal_dvkb_index(this->nproj[it], + this->nhtol->c, + this->nhtol->nc, + npw, + it, + ipol, + jpol, + this->dvkb_indexes.data()); + if (this->device == base_device::GpuDevice) + { + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, d_dvkb_indexes, dvkb_indexes.data(), nh * 4); + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, d_pref_in, pref.data(), nh); + } + for (int ia = 0; ia < h_atom_na[it]; ia++) + { + // 2. calculate dbecp: + // 2.a. 
calculate dbecp_noevc, repeat use the memory of ppcell.vkb + + if (this->device == base_device::CpuDevice) + { + d_dvkb_indexes = dvkb_indexes.data(); + d_pref_in = pref.data(); + d_g_plus_k = g_plus_k.data(); + } + cal_vkb_deri_op()(this->ctx, + nh, + npw, + ipol, + jpol, + d_dvkb_indexes, + hd_vq, + hd_vq_deri, + hd_ylm, + hd_ylm_deri, + d_sk, + d_pref_in, + d_g_plus_k, + vkb_deri_ptr); + d_sk += npw; + vkb_deri_ptr += nh * npw; + } + } + // 2.b calculate dbecp = dbecp_noevc * psi + const char transa = 'C'; + const char transb = 'N'; + + gemm_op()(this->ctx, + transa, + transb, + nkb, + npm_npol, + npw, + &ModuleBase::ONE, + ppcell_vkb, + npw, + ppsi, + this->max_npw, + &ModuleBase::ZERO, + dbecp, + nkb); + ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_s"); +} + +// cal_dbecp_f +// starts from vkb (nkb, ng) table +// it should be again merely the multiplication of matrix (vkb, ng) * (ng, nbands) -> (vkb, nbands) +// the vkb is backed-up, and the memory space is reused for calculate ONE COMPONENT of dbecp +// . the direction of force is indexed by ipol (for stress, there are two, ipol and jpol). 
+// the dbecp_f is simply the becp multiplied with -i(G+k)_i +template +void Onsite_Proj_tools::cal_dbecp_f(int ik, int npm, int ipol) +{ + ModuleBase::TITLE("Onsite_Proj_tools", "cal_dbecp_f"); + ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_f"); + + this->current_ik = -1; // reset the current ik, vkb has been reused to save dvkb + + const int npw = this->wfc_basis_->npwk[ik]; + + // STAGE1: calculate dvkb_f + // calculate gcarx, gcary/gcarx and gcarz/gcary, overwrite gcar + if (this->pre_ik_f == -1) // if it is the very first run, we allocate + { + resmem_var_op()(this->ctx, gcar, 3 * this->wfc_basis_->npwk_max); + resmem_int_op()(this->ctx, gcar_zero_indexes, 3 * this->wfc_basis_->npwk_max); + } + // first refresh the value of gcar_zero_indexes, gcar_zero_counts + if (this->pre_ik_f != ik) + { // the following lines will cause UNDEFINED BEHAVIOR because memory layout of vector3 instance + // is assumed to be always contiguous but it is not guaranteed. + this->transfer_gcar(npw, + this->wfc_basis_->npwk_max, + &(this->wfc_basis_->gcar[ik * this->wfc_basis_->npwk_max].x)); + } + + // backup vkb values to vkb_save + this->save_vkb(npw, ipol); + // for x, the coef is -i, for y and z it is 1 + const std::complex coeff = ipol == 0 ? 
ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE; + + const std::complex* vkb_ptr = this->ppcell_vkb; + std::complex* vkb_deri_ptr = this->ppcell_vkb; + // calculate the vkb_deri for ipol with the memory of ppcell_vkb + cal_vkb1_nl_op()(this->ctx, nkb, npw, npw, npw, ipol, coeff, vkb_ptr, gcar, vkb_deri_ptr); + + // ------------------------------------------------------------------------------->8 + + // STAGE2: calculate dbecp_f + // NPOL + // either 1 or 2, for NSPIN 1, 2 or 4 calculation + // once NSPIN 4, there are doubled number of pw in each "row" of psi + // on the other hand, for NSPIN 4 calculation, the number of bands is also doubled + const int npol = this->ucell_->get_npol(); + const int npm_npol = npm * npol; + const int size_becp = this->nbands * npol * this->nkb; + if (this->dbecp == nullptr) // if it is the very first run, we allocate + { // why not judging whether dbecp == nullptr inside resmem_complex_op? + resmem_complex_op()(this->ctx, dbecp, 3 * size_becp); + } + // do gemm to get dbecp and revert the ppcell_vkb for next ipol + const std::complex* ppsi = &(this->psi_[0](ik, 0, 0)); + // move the pointer to corresponding read&write position, according to ipol + std::complex* dbecp_ptr = this->dbecp + ipol * size_becp; // [out] + const char transa = 'C'; + const char transb = 'N'; + gemm_op()(this->ctx, + transa, + transb, + this->nkb, + npm_npol, + npw, + &ModuleBase::ONE, + vkb_deri_ptr, + npw, + ppsi, + this->max_npw, + &ModuleBase::ZERO, + dbecp_ptr, + nkb); + this->revert_vkb(npw, ipol); + this->pre_ik_f = ik; + ModuleBase::timer::tick("Onsite_Proj_tools", "cal_dbecp_f"); +} + +// save_vkb +template +void Onsite_Proj_tools::save_vkb(int npw, int ipol) +{ + if (this->device == base_device::CpuDevice) + { + const int gcar_zero_count = this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max]; + const int* gcar_zero_ptrs = &this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max + 1]; + const std::complex* vkb_ptr = this->ppcell_vkb; + 
std::complex* vkb_save_ptr = this->vkb_save; + // find the zero indexes to save the vkb values to vkb_save + for (int ikb = 0; ikb < this->nkb; ++ikb) + { + for (int icount = 0; icount < gcar_zero_count; ++icount) + { + *vkb_save_ptr = vkb_ptr[gcar_zero_ptrs[icount]]; + ++vkb_save_ptr; + } + vkb_ptr += npw; + } + } + else + { +#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + saveVkbValues(this->gcar_zero_indexes, + this->ppcell_vkb, + this->vkb_save, + nkb, + this->gcar_zero_counts[ipol], + npw, + ipol, + this->wfc_basis_->npwk_max); +#endif + } +} + +// revert_vkb +template +void Onsite_Proj_tools::revert_vkb(int npw, int ipol) +{ + const std::complex coeff = ipol == 0 ? ModuleBase::NEG_IMAG_UNIT : ModuleBase::ONE; + if (this->device == base_device::CpuDevice) + { + const int gcar_zero_count = this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max]; + const int* gcar_zero_ptrs = &this->gcar_zero_indexes[ipol * this->wfc_basis_->npwk_max + 1]; + std::complex* vkb_ptr = this->ppcell_vkb; + const std::complex* vkb_save_ptr = this->vkb_save; + // find the zero indexes to save the vkb values to vkb_save + for (int ikb = 0; ikb < this->nkb; ++ikb) + { + for (int icount = 0; icount < gcar_zero_count; ++icount) + { + vkb_ptr[gcar_zero_ptrs[icount]] = *vkb_save_ptr * coeff; + ++vkb_save_ptr; + } + vkb_ptr += npw; + } + } + else + { +#if __CUDA || __UT_USE_CUDA || __ROCM || __UT_USE_ROCM + revertVkbValues(this->gcar_zero_indexes, + this->ppcell_vkb, + this->vkb_save, + nkb, + this->gcar_zero_counts[ipol], + npw, + ipol, + this->wfc_basis_->npwk_max, + coeff); +#endif + } +} + +template +void Onsite_Proj_tools::transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in) +{ + std::vector gcar_tmp(3 * npw_max); // [out], will overwritten this->gcar + gcar_tmp.assign(gcar_in, + gcar_in + 3 * npw_max); // UNDEFINED BEHAVIOR!!! 
nobody always knows the memory layout of vector3 + std::vector gcar_zero_indexes_tmp(3 * npw_max); // a "checklist" + + int* gcar_zero_ptrs[3]; + for (int i = 0; i < 3; i++) + { + gcar_zero_ptrs[i] = &gcar_zero_indexes_tmp[i * npw_max]; + gcar_zero_ptrs[i][0] = -1; + this->gcar_zero_counts[i] = 0; + } + for (int ig = 0; ig < npw; ig++) + { + // calculate gcar.x , gcar.y/gcar.x, gcar.z/gcar.y + // if individual gcar is less than 1e-15, we will record the index + for (int i = 0; i < 3; ++i) + { + if (std::abs(gcar_tmp[ig * 3 + i]) < 1e-15) + { + ++gcar_zero_counts[i]; // num of zeros on each direction + gcar_zero_ptrs[i][gcar_zero_counts[i]] = ig; + } + } + // four cases for the gcar of y and z + if (gcar_zero_ptrs[0][gcar_zero_counts[0]] == ig && gcar_zero_ptrs[1][gcar_zero_counts[1]] == ig) + { // x == y == 0, z = z + } + else if (gcar_zero_ptrs[0][gcar_zero_counts[0]] != ig && gcar_zero_ptrs[1][gcar_zero_counts[1]] == ig) + { // x != 0, y == 0, z = z/x + gcar_tmp[ig * 3 + 2] /= gcar_tmp[ig * 3]; + } + else if (gcar_zero_ptrs[0][gcar_zero_counts[0]] == ig && gcar_zero_ptrs[1][gcar_zero_counts[1]] != ig) + { // x == 0, y != 0, y = y, z = z/y + gcar_tmp[ig * 3 + 2] /= gcar_tmp[ig * 3 + 1]; + } + else + { // x != 0, y != 0, y = y/x, z = z/y + gcar_tmp[ig * 3 + 2] /= gcar_tmp[ig * 3 + 1]; + gcar_tmp[ig * 3 + 1] /= gcar_tmp[ig * 3]; + } + } + for (int i = 0; i < 3; ++i) + { // record the counts to the first element + gcar_zero_ptrs[i][0] = gcar_zero_counts[i]; + } + // prepare the memory for vkb_save + const int max_count = std::max(gcar_zero_counts[0], std::max(gcar_zero_counts[1], gcar_zero_counts[2])); + resmem_complex_op()(this->ctx, this->vkb_save, this->nkb * max_count); + // transfer the gcar and gcar_zero_indexes to the device + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, gcar, gcar_tmp.data(), 3 * npw_max); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, gcar_zero_indexes, gcar_zero_indexes_tmp.data(), 3 * npw_max); +} + +template +void 
Onsite_Proj_tools::cal_force_dftu(int ik, + int npm, + FPTYPE* force, + const int* orbital_corr, + const std::complex* vu, + const int size_vu, + const FPTYPE* h_wg) +{ + int* orbital_corr_tmp = nullptr; + std::complex* vu_tmp = nullptr; +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + resmem_int_op()(this->ctx, orbital_corr_tmp, this->ucell_->ntype); + syncmem_int_h2d_op()(this->ctx, cpu_ctx, orbital_corr_tmp, orbital_corr, this->ucell_->ntype); + resmem_complex_op()(this->ctx, vu_tmp, size_vu); + syncmem_complex_h2d_op()(this->ctx, cpu_ctx, vu_tmp, vu, size_vu); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + } + else +#endif + { + orbital_corr_tmp = const_cast(orbital_corr); + vu_tmp = const_cast*>(vu); + d_wg = const_cast(h_wg); + } + const int force_nc = 3; + cal_force_nl_op()(this->ctx, + npm, + this->nbands, + this->ntype, + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + vu_tmp, + orbital_corr_tmp, + becp, + dbecp, + force); +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + delmem_complex_op()(this->ctx, vu_tmp); + delmem_int_op()(this->ctx, orbital_corr_tmp); + } +#endif +} + +template +void Onsite_Proj_tools::cal_force_dspin(int ik, + int npm, + FPTYPE* force, + const ModuleBase::Vector3* lambda, + const FPTYPE* h_wg) +{ + std::vector lambda_array(this->ucell_->nat * 3); + for (int iat = 0; iat < this->ucell_->nat; iat++) + { + lambda_array[iat * 3] = lambda[iat].x; + lambda_array[iat * 3 + 1] = lambda[iat].y; + lambda_array[iat * 3 + 2] = lambda[iat].z; + } + FPTYPE* lambda_tmp = nullptr; +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3); + syncmem_var_h2d_op()(this->ctx, 
this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + } + else +#endif + { + lambda_tmp = lambda_array.data(); + d_wg = const_cast(h_wg); + } + const int force_nc = 3; + cal_force_nl_op()(this->ctx, + npm, + this->nbands, + this->ntype, + force_nc, + this->nbands, + ik, + nkb, + atom_nh, + atom_na, + this->ucell_->tpiba, + d_wg, + lambda_tmp, + becp, + dbecp, + force); + +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + delmem_var_op()(this->ctx, lambda_tmp); + } +#endif +} + +template +void Onsite_Proj_tools::cal_stress_dftu(int ik, + int npm, + FPTYPE* stress, + const int* orbital_corr, + const std::complex* vu, + const int size_vu, + const FPTYPE* h_wg) +{ + int* orbital_corr_tmp = nullptr; + std::complex* vu_tmp = nullptr; +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + resmem_int_op()(this->ctx, orbital_corr_tmp, this->ucell_->ntype); + syncmem_int_h2d_op()(this->ctx, cpu_ctx, orbital_corr_tmp, orbital_corr, this->ucell_->ntype); + resmem_complex_op()(this->ctx, vu_tmp, size_vu); + syncmem_complex_h2d_op()(this->ctx, cpu_ctx, vu_tmp, vu, size_vu); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + } + else +#endif + { + orbital_corr_tmp = const_cast(orbital_corr); + vu_tmp = const_cast*>(vu); + d_wg = const_cast(h_wg); + } + cal_stress_nl_op()(this->ctx, + nkb, + npm, + this->ntype, + this->nbands, + ik, + atom_nh, + atom_na, + d_wg, + vu_tmp, + orbital_corr_tmp, + becp, + dbecp, + stress); +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + delmem_complex_op()(this->ctx, vu_tmp); + delmem_int_op()(this->ctx, orbital_corr_tmp); + } +#endif +} + +template +void Onsite_Proj_tools::cal_stress_dspin(int ik, + int npm, + FPTYPE* stress, + const ModuleBase::Vector3* lambda, + const FPTYPE* h_wg) +{ + std::vector lambda_array(this->ucell_->nat * 3); + for (int iat = 0; iat < this->ucell_->nat; iat++) + { + 
lambda_array[iat * 3] = lambda[iat].x; + lambda_array[iat * 3 + 1] = lambda[iat].y; + lambda_array[iat * 3 + 2] = lambda[iat].z; + } + FPTYPE* lambda_tmp = nullptr; +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + resmem_var_op()(this->ctx, lambda_tmp, this->ucell_->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, lambda_tmp, lambda_array.data(), this->ucell_->nat * 3); + syncmem_var_h2d_op()(this->ctx, this->cpu_ctx, d_wg, h_wg, this->nbands * (ik+1)); + } + else +#endif + { + lambda_tmp = lambda_array.data(); + d_wg = const_cast(h_wg); + } + const int force_nc = 3; + cal_stress_nl_op()(this->ctx, + nkb, + npm, + this->ntype, + this->nbands, + ik, + atom_nh, + atom_na, + d_wg, + lambda_tmp, + becp, + dbecp, + stress); + +#if defined(__CUDA) || defined(__ROCM) + if (this->device == base_device::GpuDevice) + { + delmem_var_op()(this->ctx, lambda_tmp); + } +#endif +} + +// template instantiation +template class Onsite_Proj_tools; +#if ((defined __CUDA) || (defined __ROCM)) +template class Onsite_Proj_tools; +#endif + +} // namespace hamilt diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h new file mode 100644 index 0000000000..17c7e06491 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h @@ -0,0 +1,216 @@ +#ifndef MODULEHAMILTPW_ONSITEPROJTOOLS_H +#define MODULEHAMILTPW_ONSITEPROJTOOLS_H + +#include "module_base/module_device/device.h" +#include "module_basis/module_pw/pw_basis_k.h" +#include "module_cell/klist.h" +#include "module_cell/unitcell.h" +#include "module_hamilt_pw/hamilt_pwdft/VNL_in_pw.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/stress_op.h" +#include "module_hsolver/kernels/math_kernel_op.h" +#include "module_psi/psi.h" + +#include + +namespace hamilt +{ + +/** + * @brief Nonlocal pseudopotential tools in plane wave basis set. 
+ * used for calculating force and stress for different algorithm + * the main functions are: + * 1. cal_becp: calculate the becp = for all beta functions + * 2. cal_dbecp_s: calculate the dbecp_{ij} = for all beta functions + * stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also + * calculated + * 3. cal_dbecp_f: calculate the dbecp_i = for all beta functions + * 4. cal_force: calculate the force^I_i = - \sum_{n,k}f_{nk} \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_i + */ +template +class Onsite_Proj_tools +{ + public: + Onsite_Proj_tools(const pseudopot_cell_vnl* nlpp_in, + const UnitCell* ucell_in, + const psi::Psi, Device>* psi_in, + const K_Vectors* kv_in, + const ModulePW::PW_Basis_K* wfc_basis_in, + const Structure_Factor* sf_in, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb); + + // a more general constructor is in the following + Onsite_Proj_tools(const std::vector& nproj, // number of projectors for each atom type + const std::vector& lproj, + const ModuleBase::realArray& tab, // radials' spherical bessel transform + const ModuleBase::matrix& nhtol, + std::complex* vkb_buf, + const UnitCell* ucell_in, + const psi::Psi, Device>* psi_in, + const K_Vectors* kv_in, + const ModulePW::PW_Basis_K* wfc_basis_in, + const Structure_Factor* sf_in, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb); + + ~Onsite_Proj_tools(); + + /** + * @brief calculate the becp = for all beta functions + */ + void cal_becp(int ik, int npm, std::complex* becp_in = nullptr, const std::complex* ppsi_in = nullptr); + /** + * @brief calculate the dbecp_{ij} = for all beta functions + * stress_{ij} = -1/omega \sum_{n,k}f_{nk} \sum_I \sum_{lm,l'm'}D_{l,l'}^{I} becp * dbecp_{ij} also calculated + */ + void cal_dbecp_s(int ik, int npm, int ipol, int jpol); + /** + * @brief calculate the dbecp_i = for all beta functions + */ + void cal_dbecp_f(int ik, int npm, int ipol); + + void cal_force_dftu(int ik, int npm, FPTYPE* 
force, const int* orbital_corr, const std::complex* vu, const int size_vu, const FPTYPE* h_wg); + void cal_force_dspin(int ik, int npm, FPTYPE* force, const ModuleBase::Vector3* lambda, const FPTYPE* h_wg); + void cal_stress_dftu(int ik, int npm, FPTYPE* stress, const int* orbital_corr, const std::complex* vu, const int size_vu, const FPTYPE* h_wg); + void cal_stress_dspin(int ik, int npm, FPTYPE* stress, const ModuleBase::Vector3* lambda, const FPTYPE* h_wg); + + + std::complex* get_becp() { return becp; } + std::complex* get_dbecp() { return dbecp; } + + private: + /** + * @brief allocate the memory for the variables + */ + void allocate_memory(const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb, + const std::vector& nproj, + const std::vector& nch); + /** + * @brief delete the memory for the variables + */ + void delete_memory(); + + private: + /// pointers to access the data without memory arrangement + const Structure_Factor* sf_; + const pseudopot_cell_vnl* nlpp_; + const UnitCell* ucell_; + const psi::Psi, Device>* psi_; + const K_Vectors* kv_; + const ModulePW::PW_Basis_K* wfc_basis_; + + /// the following variables are used for the calculation + Device* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + base_device::AbacusDevice_t device = {}; + int nkb; + int nbands; + int deeq_dims[4] = {0, 0, 0, 0}; // deeq can be something other than that in pseudopotentials + int deeq_nc_dims[4] = {0, 0, 0, 0}; + + int current_ik = -1; + + int max_nh = 0; + int max_npw = 0; + int ntype; + bool nondiagonal; + int pre_ik_s = -1; + int pre_ik_f = -1; + + int* atom_nh = nullptr; + int* atom_na = nullptr; + std::vector h_atom_nh; + std::vector h_atom_na; + std::vector nproj; + + /// ------------------------- Key optimization ------------------------- + /// @brief the following variables are used for transfer gcar and reuse the values of vkb for force calculation + int* gcar_zero_indexes = nullptr; + int gcar_zero_counts[3] = {0, 0, 0}; + std::complex* vkb_save 
= nullptr; + /// @brief count zero gcar indexes and prepare zero_indexes, do gcar_y /= gcar_x, gcar_z /= gcar_y + void transfer_gcar(int npw, int npw_max, const FPTYPE* gcar_in); + /// @brief save the 0-value dvkbs for calculating the dbecp_i in the force calculation + void save_vkb(int npw, int ipol); + /// @brief revert the 0-value dvkbs for calculating the dbecp_i in the force calculation + void revert_vkb(int npw, int ipol); + /// --------------------------------------------------------------------- + + /// pointers to access the data without memory arrangement + const ModuleBase::realArray* tabtpr = nullptr; + const ModuleBase::matrix* nhtol = nullptr; + int lprojmax = -1; + + FPTYPE* d_wg = nullptr; + FPTYPE* d_ekb = nullptr; + FPTYPE* gcar = nullptr; + + FPTYPE* deeq = nullptr; + std::complex* deeq_nc = nullptr; + + FPTYPE* kvec_c = nullptr; + FPTYPE* qq_nt = nullptr; + /// --------------------- Key variable --------------------- + /// borrow the memory from the vkb in VNL_in_pw to calculate vkb and dvkb + std::complex* ppcell_vkb = nullptr; + /// --------------------------------------------------------- + /// the following variables are used for the calculation + /// allocate memory on CPU device only + std::vector g_plus_k; + /// allocate memory on CPU/GPU device + FPTYPE* hd_ylm = nullptr; // (lmax + 1) * (lmax + 1) * npw + FPTYPE* hd_ylm_deri = nullptr; // 3 * (lmax + 1) * (lmax + 1) * npw + FPTYPE* hd_vq = nullptr; // this->ucell->atoms[it].ncpp.nbeta * npw + FPTYPE* hd_vq_deri = nullptr; // this->ucell->atoms[it].ncpp.nbeta * npw + std::complex* hd_sk = nullptr; // this->ucell->nat * npw + /// allocate global memory on GPU device only + FPTYPE* d_g_plus_k = nullptr; // npw * 5 + FPTYPE* d_pref = nullptr; // this->ucell->atoms[it].ncpp.nh + FPTYPE* d_gk = nullptr; // this->ucell->atoms[it].ncpp.nh * npw + FPTYPE* d_vq_tab = nullptr; // this->ucell->atoms[it].ncpp.nbeta * npw + std::vector dvkb_indexes; // this->ucell->atoms[it].ncpp.nh * 4 + int* 
d_dvkb_indexes = nullptr; // this->ucell->atoms[it].ncpp.nh * 4 + std::complex* d_pref_in = nullptr; // this->ucell->atoms[it].ncpp.nh + + /// becp and dbecp: + std::complex* dbecp = nullptr; // nbands * nkb (for stress) or nbands * nkb * 3 (for force) + std::complex* becp = nullptr; // nbands * nkb + + /// @brief rename the operators for CPU/GPU device + using gemm_op = hsolver::gemm_op, Device>; + using cal_stress_nl_op = hamilt::cal_stress_nl_op; + using cal_dbecp_noevc_nl_op = hamilt::cal_dbecp_noevc_nl_op; + + using resmem_complex_op = base_device::memory::resize_memory_op, Device>; + using resmem_complex_h_op = base_device::memory::resize_memory_op, base_device::DEVICE_CPU>; + using setmem_complex_op = base_device::memory::set_memory_op, Device>; + using delmem_complex_op = base_device::memory::delete_memory_op, Device>; + using delmem_complex_h_op = base_device::memory::delete_memory_op, base_device::DEVICE_CPU>; + using syncmem_complex_h2d_op + = base_device::memory::synchronize_memory_op, Device, base_device::DEVICE_CPU>; + using syncmem_complex_d2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, Device>; + + using resmem_var_op = base_device::memory::resize_memory_op; + using resmem_var_h_op = base_device::memory::resize_memory_op; + using setmem_var_op = base_device::memory::set_memory_op; + using delmem_var_op = base_device::memory::delete_memory_op; + using delmem_var_h_op = base_device::memory::delete_memory_op; + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; + + using resmem_int_op = base_device::memory::resize_memory_op; + using delmem_int_op = base_device::memory::delete_memory_op; + using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op; + + using cal_vq_op = hamilt::cal_vq_op; + using cal_vq_deri_op = hamilt::cal_vq_deri_op; + + using cal_vkb_op = hamilt::cal_vkb_op; + using cal_vkb_deri_op = 
hamilt::cal_vkb_deri_op; +}; + +} // namespace hamilt + +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp new file mode 100644 index 0000000000..2bb69dc131 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.cpp @@ -0,0 +1,643 @@ +#include +#include +#include +#include +#include +#include +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" + +#include "module_base/projgen.h" +#include "module_base/blas_connector.h" +#include "module_hsolver/kernels/math_kernel_op.h" +#ifdef __MPI +#include "module_base/parallel_reduce.h" +#include "module_base/parallel_common.h" +#endif +#include "module_parameter/parameter.h" +#include "module_base/timer.h" +#include "module_base/formatter.h" + + + +/** + * =============================================================================================== + * + * README + * + * =============================================================================================== + * + * This is a code demo for illustrating how to use unified radial projection in implementation of + * Operators involving local radial projectors on PW-expanded wavefunctions. + * + * Example usage: + * ```c++ + * // select the range of atoms that impose the operator in std::vector> it2ia like + * // it2ia[it] = {ia1, ia2, ...} for each type + * // if all atoms in present kind is "selected", just set it2ia[it].resize(na) and call + * // std::iota(it2ia[it].begin(), it2ia[it].end(), 0) + * + * std::vector> it2ia; // as if we have given its value... + * + * // you should have the `orbital_dir` as the directory containing the orbital files, then those + * // will be read by a static function `AtomicRadials::read_abacus_orb` to get the radial orbitals + * + * // call `init_proj` to initialize the radial projector, this function only needs to be called + * // once during the runtime. + * // its input... 
+ * // the `nproj` specifies the number of projectors of each atom type; it can be zero, + * // but cannot be larger than the number of zeta functions for the given angular momentum. + * // the `lproj` is the angular momentum of the projectors, and `iproj` is the index of the zeta function + * // that each projector is generated from. + * // the `lproj` along with `iproj` lets the developer define any number of radial projectors. + * + * // the `onsite_r` is the onsite-radius for all valid projectors; it is used to generate a new + * // radial function that is more localized than the original one, which is expected to have enhanced + * // projection efficiency. + * + * std::vector rgrid; + * std::vector> projs; + * std::vector> it2iproj; + * init_proj(orbital_dir, ucell, nproj, lproj, iproj, onsite_r, rgrid, projs, it2iproj); + * + * // then call the function `cal_becp` to calculate the becp. HOWEVER, there are quantities that + * // can be calculated in advance and reused in the following calculations. Please see the function + * // implementation, especially the comments about CACHE 0, CACHE 1, CACHE 2..., etc. + * + * // the input param of `cal_becp`... 
+ * // the `it2ia` has been explained above + * // the `it2iproj` is the output of function `init_proj`, so you do not need to worry about it + * // the `rgrid` and `projs` are also the output of function `init_proj` + * // the `lproj` is the angular momentum for each projector; it is the same `lproj` that was + * // passed to `init_proj` + * // the `nq` is the number of G+k vectors, typically GlobalV::NQX + * // the `dq` is the step size of G+k vectors, typically GlobalV::DQ + * // the `ik` is the k-point index + * // the `pw_basis` is the plane wave basis, which needs ik + * // the `omega` is the cell volume + * // the `tpiba` is 2*pi/lat0 + * // the `sf` is the structure factor calculator + * // the `psi` is the wavefunction + * // the `becp` is the output of the function + * cal_becp(it2ia, it2iproj, rgrid, projs, lproj, nq, dq, ik, pw_basis, omega, tpiba, sf, psi, becp); + * + * // About parallelization: presently, the function `AtomicRadials::read_abacus_orb` is parallelized + * // by MPI, so after the orbitals are read, all processors have the same data. Therefore it is not + * // necessary to call functions like `Parallel_Reduce` or `Parallel_Bcast` to synchronize the data. + * // However, the table `tab_atomic_` is strikingly memory-consuming. Performance optimization will + * // be needed if the memory is not enough. 
+ */ + +template +projectors::OnsiteProjector* projectors::OnsiteProjector::get_instance() +{ + static projectors::OnsiteProjector instance; + return &instance; +} + +template +void projectors::OnsiteProjector::init(const std::string& orbital_dir, + const UnitCell* ucell_in, + const psi::Psi, Device>& psi, + const K_Vectors& kv, + const ModulePW::PW_Basis_K& pw_basis, // level1: the plane wave basis, need ik + Structure_Factor& sf, // level2: the structure factor calculator + const double onsite_radius, + const int nq, + const double dq, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb) +{ + this->device = base_device::get_device_type(this->ctx); + if(!this->initialed) + { + this->ucell = ucell_in; + this->ntype = ucell_in->ntype; + + this->pw_basis_ = &pw_basis; + this->sf_ = &sf; + + std::vector orb_files(ntype); + std::vector nproj(ntype); + int sum_nproj = 0; + for(int it=0;itorbital_fn[it]; + nproj[it] = ucell->atoms[it].nwl; + sum_nproj += nproj[it]; + } + this->lproj.resize(sum_nproj); + int index = 0; + for(int it=0;itlproj[index++] = il; + } + } + std::vector iproj(sum_nproj, 0); + std::vector onsite_r(sum_nproj, onsite_radius); + + this->it2ia.resize(this->ntype); + this->iat_nh.resize(this->ucell->nat); + int iat = 0; + for(int it = 0; it < it2ia.size(); it++) + { + it2ia[it].resize(this->ucell->atoms[it].na); + std::iota(it2ia[it].begin(), it2ia[it].end(), 0); + for(int ia = 0; ia < it2ia[it].size(); ia++) + { + iat_nh[iat++] = nproj[it] * nproj[it]; + } + } + + this->init_proj(PARAM.inp.orbital_dir, + orb_files, + nproj, + lproj, + iproj, + onsite_r); + + ModuleBase::timer::tick("OnsiteProj", "cubspl_tabulate"); + // STAGE 0 - making the interpolation table + // CACHE 0 - if cache the irow2it, irow2iproj, irow2m, itiaiprojm2irow, can be reused for + // SCF, RELAX and CELL-RELAX calculation + // [in] rgrid, projs, lproj, it2ia, it2iproj, nq, dq + RadialProjection::RadialProjector::_build_backward_map(it2iproj, lproj, irow2it_, 
irow2iproj_, irow2m_); + RadialProjection::RadialProjector::_build_forward_map(it2ia, it2iproj, lproj, itiaiprojm2irow_); + //rp_._build_sbt_tab(rgrid, projs, lproj, nq, dq); + rp_._build_sbt_tab(nproj, rgrid, projs, lproj, nq, dq, ucell_in->omega, psi.npol, tab, nhtol); + // For being compatible with present cal_force and cal_stress framework + // uncomment the following code block if you want to use the Onsite_Proj_tools + if(this->tab_atomic_ == nullptr) + { + this->tot_nproj = itiaiprojm2irow_.size(); + this->npwx_ = this->pw_basis_->npwk_max; + this->size_vproj = this->tot_nproj * this->npwx_; + resmem_complex_op()(this->ctx, this->tab_atomic_, this->size_vproj, "OnsiteP::tab_atomic_"); + } + + delete this->fs_tools; // it is okay to delete nullptr + this->fs_tools = new hamilt::Onsite_Proj_tools( + nproj, lproj, tab, nhtol, this->tab_atomic_, ucell_in, &psi, &kv, &pw_basis, &sf, wg, ekb); + + ModuleBase::timer::tick("OnsiteProj", "cubspl_tabulate"); + + this->initialed = true; + } +} + +template +projectors::OnsiteProjector::~OnsiteProjector() +{ + //delete[] becp; + delete fs_tools; + delmem_complex_op()(this->ctx, this->tab_atomic_); + if(this->device == base_device::GpuDevice) + { + delmem_complex_h_op()(this->cpu_ctx, this->h_becp); + } + delmem_complex_op()(this->ctx, this->becp); + +} + + +template +void projectors::OnsiteProjector::init_proj(const std::string& orbital_dir, + const std::vector& orb_files, + const std::vector& nproj, // for each type, the number of projectors + const std::vector& lproj, // angular momentum of projectors within the type (l of zeta function) + const std::vector& iproj, // index of projectors within the type (izeta) + const std::vector& onsite_r) +{ + // extract the information from ucell + const int ntype = nproj.size(); + assert(ntype == orb_files.size()); + this->it2iproj.resize(ntype); + + int nproj_tot = 0; + nproj_tot = std::accumulate(nproj.begin(), nproj.end(), nproj_tot, std::plus()); + assert(nproj_tot == 
lproj.size()); + assert(nproj_tot == iproj.size()); + assert(nproj_tot == onsite_r.size()); + this->projs.resize(nproj_tot); + + int idx = 0; + int nr = -1; + double dr = -1.0; + for(int it = 0; it < ntype; ++it) + { + const int nproj_it = nproj[it]; + this->it2iproj[it].resize(nproj_it); + if(nproj_it == 0) + { + std::cout << "BECP_PW >> No projectors defined for type " << it << std::endl; + continue; + } + std::ifstream ifs(orbital_dir + orb_files[it]); + std::string elem = ""; + double ecut = -1.0; + int nr_ = -1; + double dr_ = -1.0; + std::vector nzeta; // number of radials for each l + std::vector> radials; // radials arranged in serial + this->read_abacus_orb(ifs, elem, ecut, nr_, dr_, nzeta, radials); +#ifdef __DEBUG + assert(elem != ""); + assert(ecut != -1.0); + assert(nr_ != -1); + assert(dr_ != -1.0); +#endif + nr = std::max(nr, nr_); // the maximal nr + assert(dr == -1.0 || dr == dr_); // the dr should be the same for all types + dr = (dr == -1.0) ? dr_ : dr; + for(int ip = 0; ip < nproj_it; ++ip) + { + int l = lproj[idx]; + int izeta = iproj[idx]; + int irad = 0; + irad = std::accumulate(nzeta.begin(), nzeta.begin() + l, irad); + irad += izeta; + std::vector temp = radials[irad]; + rgrid.resize(nr); + std::iota(rgrid.begin(), rgrid.end(), 0); + std::for_each(rgrid.begin(), rgrid.end(), [dr](double& r_i) { r_i *= dr; }); + smoothgen(nr, rgrid.data(), temp.data(), onsite_r[idx], projs[idx]); + it2iproj[it][ip] = idx; + ++idx; + } + } + // do zero padding + if(nr != -1) + { + std::for_each(projs.begin(), projs.end(), [nr](std::vector& proj) { proj.resize(nr, 0.0); }); + } + // generate the rgrid + this->rgrid.resize(nr); + std::iota(rgrid.begin(), rgrid.end(), 0); + std::for_each(rgrid.begin(), rgrid.end(), [dr](double& r_i) { r_i *= dr; }); +} + +template +void projectors::OnsiteProjector::tabulate_atomic(const int ik, const char grad) +{ + ModuleBase::timer::tick("OnsiteProj", "tabulate_atomic"); + // assert(grad == 'n' || grad == 'x' || grad == 'y' || 
grad == 'z'); + // grad = 'n' means no gradient, grad = 'x' means gradient along x, etc. + + // STAGE 1 - calculate the for the given G+k vector + // CACHE 1 - if cache the tab_, can be reused for SCF and RELAX calculation + // [in] pw_basis, ik, omega, tpiba, irow2it + this->ik_ = ik; + this->npw_ = pw_basis_->npwk[ik]; + this->npwx_ = pw_basis_->npwk_max; + // std::vector> q(this->npw_); + // for(int ig = 0; ig < this->npw_; ++ig) + // { + // q[ig] = pw_basis_->getgpluskcar(ik, ig); // get the G+k vector, G+k will change during CELL-RELAX + // } + // const int nrow = irow2it_.size(); + // std::vector> tab_(nrow*this->npw_); + // // convention used here: 'l': , 'r': + // // denote q=G+k, = exp(iqr), the routine Fourier Transform written as F(q) = + // rp_.sbtft(q, tab_, 'l', this->ucell->omega, this->ucell->tpiba); + // // what is calculated is here + + // STAGE 2 - make_atomic: multiply e^iqtau and extend the to for each atom + // CACHE 2 - if cache the tab_atomic_, can be reused for SCF calculation + // [in] it2ia, itiaiprojm2irow, tab_, npw, sf + // for(int irow = 0; irow < nrow; ++irow) + // { + // const int it = irow2it_[irow]; + // const int iproj = irow2iproj_[irow]; + // const int m = irow2m_[irow]; + // for(int ia = 0; ia < na[it]; ++ia) + // { + // // why Structure_Factor needs the FULL pw_basis??? + // std::complex* sk = this->sf_->get_sk(ik, it, ia, pw_basis_); // exp(-iqtau) + // // Note: idea on extending the param list of get_sk + // // the get_sk should have an extra param 'grad' to calculate the gradient of S(q), which + // // is actually very simple to be + // // d(S(q))/dq = -i S(q) * tau, for one direction it is just -i S(q) * tau_x (if x is the direction) + // const int irow_out = itiaiprojm2irow_.at(std::make_tuple(it, ia, iproj, m)); + // for(int ig = 0; ig < this->npw_; ++ig) + // { + // std::complex deriv = (grad == 'n')? 1.0: ModuleBase::NEG_IMAG_UNIT; // because sk is exp(-iqtau) + // deriv = (grad == 'n')? 1.0: (grad == 'x')? 
deriv * q[ig].x: (grad == 'y')? deriv * q[ig].y: deriv * q[ig].z; + // // there must be something twisted in ABACUS + // // because the tab_ is , but the sk is exp(-iqtau). How can it get the + // // correct result? + // this->tab_atomic_[irow_out*this->npw_ + ig] = sk[ig] * tab_[irow*this->npw_ + ig] * deriv; + // } + // delete[] sk; + // } + // } + // q.clear(); + // q.shrink_to_fit(); // release memory + // tab_.clear(); + // tab_.shrink_to_fit(); // release memory + ModuleBase::timer::tick("OnsiteProj", "tabulate_atomic"); +} + +template +void projectors::OnsiteProjector::overlap_proj_psi( + const int npm, + const std::complex* ppsi + ) +{ + ModuleBase::timer::tick("OnsiteProj", "overlap"); + // STAGE 3 - cal_becp + // CACHE 3 - it is no use to cache becp, it will change in each SCF iteration + // [in] psi, tab_atomic_, npw, becp, ik +// const char transa = 'C'; +// const char transb = 'N'; +// const int ldb = this->npwx_; +// const int ldc = this->tot_nproj; +// const std::complex alpha = 1.0; +// const std::complex beta = 0.0; +// if(this->becp == nullptr || this->size_becp < npm*ldc) +// { +// delete[] this->becp; +// this->becp = new std::complex[npm*ldc]; +// this->size_becp = npm*ldc; +// } +// setmem_complex_op()(ctx, this->becp, 0.0, this->size_becp); +// gemm_op()( +// this->ctx, +// transa, // const char transa +// transb, // const char transb +// ldc, // const int m +// npm, // const int n +// this->npw_, // const int k +// &alpha, // const std::complex alpha +// this->tab_atomic_, // const std::complex* a +// this->npw_, // const int lda +// ppsi, // const std::complex* b +// ldb, // const int ldb +// &beta, // const std::complex beta +// becp, // std::complex* c +// ldc); // const int ldc +// #ifdef __MPI +// Parallel_Reduce::reduce_pool(becp, size_becp); +// #endif + + // notes on refactor for DCU calculation + // the npm here is nbands(occ) * npol, for calling cal_becp, the npol should be divided. 
+ // std::cout << "npm: " << npm << std::endl; + // std::cout << "at " << __FILE__ << ": " << __LINE__ << " output tot_nproj: " << this->tot_nproj << std::endl; + // std::cout << "at " << __FILE__ << ": " << __LINE__ << " output npm: " << npm << std::endl; + // std::cout << "at " << __FILE__ << ": " << __LINE__ << " ik_: " << ik_ << std::endl; + int npol = this->ucell->get_npol(); + if(this->becp == nullptr || this->size_becp < npm*this->tot_nproj) + { + this->size_becp = npm*this->tot_nproj; + resmem_complex_op()(this->ctx, this->becp, this->size_becp); + if(this->device == base_device::GpuDevice ) + { + resmem_complex_h_op()(this->cpu_ctx, this->h_becp, this->size_becp); + } + else + { + this->h_becp = this->becp; + } + } + this->fs_tools->cal_becp(ik_, npm/npol, this->becp, ppsi); // in cal_becp, npm should be the one not multiplied by npol + if(this->device == base_device::GpuDevice) + { + syncmem_complex_d2h_op()(this->cpu_ctx, this->ctx, h_becp, this->becp, this->size_becp); + } + ModuleBase::timer::tick("OnsiteProj", "overlap"); +} + +template +void projectors::OnsiteProjector::read_abacus_orb(std::ifstream& ifs, + std::string& elem, + double& ecut, + int& nr, + double& dr, + std::vector& nzeta, + std::vector>& radials, + const int rank) +{ + nr = 0; // number of grid points + dr = 0; // grid spacing + int lmax = 0, nchi = 0; // number of radial functions + std::vector> radial_map_; // build a map from [l][izeta] to 1-d array index + std::string tmp; + // first read the header + if (rank == 0) + { + if (!ifs.is_open()) + { + ModuleBase::WARNING_QUIT("AtomicRadials::read_abacus_orb", "Couldn't open orbital file."); + } + while (ifs >> tmp) + { + if (tmp == "Element") + { + ifs >> elem; + } + else if (tmp == "Cutoff(Ry)") + { + ifs >> ecut; + } + else if (tmp == "Lmax") + { + ifs >> lmax; + nzeta.resize(lmax + 1); + for (int l = 0; l <= lmax; ++l) + { + ifs >> tmp >> tmp >> tmp >> nzeta[l]; + } + } + else if (tmp == "Mesh") + { + ifs >> nr; + continue; + } + 
else if (tmp == "dr") + { + ifs >> dr; + break; + } + } + radial_map_.resize(lmax + 1); + for (int l = 0; l <= lmax; ++l) + { + radial_map_[l].resize(nzeta[l]); + } + int ichi = 0; + for (int l = 0; l <= lmax; ++l) + { + for (int iz = 0; iz < nzeta[l]; ++iz) + { + radial_map_[l][iz] = ichi++; // return the value of ichi, then increment + } + } + nchi = ichi; // total number of radial functions + radials.resize(nchi); + std::for_each(radials.begin(), radials.end(), [nr](std::vector& v) { v.resize(nr); }); + } + + // broadcast the header information +#ifdef __MPI + Parallel_Common::bcast_string(elem); + Parallel_Common::bcast_double(ecut); + Parallel_Common::bcast_int(lmax); + Parallel_Common::bcast_int(nchi); + Parallel_Common::bcast_int(nr); + Parallel_Common::bcast_double(dr); +#endif + + // then adjust the size of the vectors + if (rank != 0) + { + nzeta.resize(lmax + 1); + radials.resize(nchi); + std::for_each(radials.begin(), radials.end(), [nr](std::vector& v) { v.resize(nr); }); + } + // broadcast the number of zeta functions for each angular momentum +#ifdef __MPI + Parallel_Common::bcast_int(nzeta.data(), lmax + 1); +#endif + + // read the radial functions by rank0 + int ichi = 0; + for (int i = 0; i != nchi; ++i) + { + if (rank == 0) + { + int l, izeta; + ifs >> tmp >> tmp >> tmp; + ifs >> tmp >> l >> izeta; + ichi = radial_map_[l][izeta]; + for (int ir = 0; ir != nr; ++ir) + { + ifs >> radials[ichi][ir]; + } + } + // broadcast the radial functions +#ifdef __MPI + Parallel_Common::bcast_int(ichi); // let other ranks know where to store the radial function + Parallel_Common::bcast_double(radials[ichi].data(), nr); +#endif + } +} // end of read_abacus_orb + +template +void projectors::OnsiteProjector::cal_occupations(const psi::Psi, Device>* psi_in, const ModuleBase::matrix& wg_in) +{ + ModuleBase::timer::tick("OnsiteProj", "cal_occupation"); + this->tabulate_atomic(0); + std::vector> occs(this->tot_nproj * 4, 0.0); + + // loop over k-points to calculate Mi 
of \sum_{k,i,l,m} + const int nbands = psi_in->get_nbands(); + for(int ik = 0; ik < psi_in->get_nk(); ik++) + { + psi_in->fix_k(ik); + if(ik != 0) + { + this->tabulate_atomic(ik); + } + // std::cout << __FILE__ << ":" << __LINE__ << " nbands = " << nbands << std::endl; + this->overlap_proj_psi( + nbands * psi_in->npol, + psi_in->get_pointer()); + const std::complex* becp_p = this->get_h_becp(); + // becp(nbands*npol , nkb) + // mag = wg * \sum_{nh}becp * becp + int nkb = this->tot_nproj; + //nkb = 18; + //std::cout << "at " << __FILE__ << ": " << __LINE__ << " output nbands: " << nbands << std::endl; + //std::cout << "at " << __FILE__ << ": " << __LINE__ << " output nkb: " << nkb << std::endl; + for(int ib = 0;ibiat_nh.size(); iat++) + { + const int nh = this->get_nh(iat); + for(int ih = 0; ih < nh; ih++) + { + const int occ_index = (begin_ih + ih) * 4; + const int index = ib*2*nkb + begin_ih + ih; + occs[occ_index] += weight * conj(becp_p[index]) * becp_p[index]; + occs[occ_index + 1] += weight * conj(becp_p[index]) * becp_p[index + nkb]; + occs[occ_index + 2] += weight * conj(becp_p[index + nkb]) * becp_p[index]; + occs[occ_index + 3] += weight * conj(becp_p[index + nkb]) * becp_p[index + nkb]; + } + begin_ih += nh; + } + } + } + // reduce mag from all k-pools + Parallel_Reduce::reduce_double_allpool(GlobalV::KPAR, GlobalV::NPROC_IN_POOL, (double*)(&(occs[0])), occs.size()*2); + // occ has been reduced and calculate mag + // parameters for orbital charge output + FmtCore fmt_of_chg("%15.4f"); + FmtCore fmt_of_label("%-15s"); + GlobalV::ofs_running << std::endl; + GlobalV::ofs_running << "-------------------------------------------------------------------------------------------" << std::endl; + GlobalV::ofs_running << "Orbital Charge Analysis Charge Mag(x) Mag(y) Mag(z)" << std::endl; + GlobalV::ofs_running << "-------------------------------------------------------------------------------------------" << std::endl; + // parameters for orbital charge output + // 
parameters for mag output + std::vector mag_x(this->ucell->nat, 0.0); + std::vector mag_y(this->ucell->nat, 0.0); + std::vector mag_z(this->ucell->nat,0.0); + auto atomLabels = this->ucell->get_atomLabels(); + const std::vector title = {"Total Magnetism (uB)", "", "", ""}; + const std::vector fmts = {"%-26s", "%20.10f", "%20.10f", "%20.10f"}; + const std::vector orb_names = {"s", "p", "d", "f", "g"}; + FmtTable table(title, this->ucell->nat, fmts, {FmtTable::Align::RIGHT, FmtTable::Align::LEFT}); + // parameters for mag output + int occ_index = 0; + for(int iat=0;iatucell->nat;iat++) + { + const int it = this->ucell->iat2it[iat]; + std::string atom_label = atomLabels[it]; + int ia = this->ucell->iat2ia[iat]; + GlobalV::ofs_running << FmtCore::format("%-20s", atom_label+std::to_string(ia+1)) << std::endl; + std::vector sum(4, 0.0); + int current_l = 1; + std::vector charge_mag(4, 0.0); + for(int ih=0;ihiat_nh[iat];ih++) + { + charge_mag[3] += (occs[occ_index] - occs[occ_index + 3]).real(); + charge_mag[1] += (occs[occ_index + 1] + occs[occ_index + 2]).real(); + charge_mag[2] += (occs[occ_index + 1] - occs[occ_index + 2]).imag(); + charge_mag[0] += (occs[occ_index] + occs[occ_index + 3]).real(); + if(ih == current_l * current_l - 1) + { + sum[0] += charge_mag[0]; + sum[1] += charge_mag[1]; + sum[2] += charge_mag[2]; + sum[3] += charge_mag[3]; + GlobalV::ofs_running << FmtCore::format("%20s", orb_names[current_l-1]) + << fmt_of_chg.format(charge_mag[0]) << fmt_of_chg.format(charge_mag[1]) + << fmt_of_chg.format(charge_mag[2]) << fmt_of_chg.format(charge_mag[3]) << std::endl; + current_l++; + charge_mag.assign(4, 0.0); + } + occ_index += 4; + } + mag_x[iat] = sum[1]; + mag_y[iat] = sum[2]; + mag_z[iat] = sum[3]; + GlobalV::ofs_running << FmtCore::format("%20s", std::string("Sum")) << "" + << fmt_of_chg.format(sum[0]) << fmt_of_chg.format(sum[1]) + << fmt_of_chg.format(sum[2]) << fmt_of_chg.format(sum[3]) << std::endl; + } + GlobalV::ofs_running << 
"-------------------------------------------------------------------------------------------" << std::endl; + GlobalV::ofs_running << std::endl; + table << atomLabels << mag_x << mag_y << mag_z; + GlobalV::ofs_running << table.str() << std::endl; + + // print charge + ModuleBase::timer::tick("OnsiteProj", "cal_occupation"); +} + +template class projectors::OnsiteProjector; +#if ((defined __CUDA) || (defined __ROCM)) +template class projectors::OnsiteProjector; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.h b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.h new file mode 100644 index 0000000000..a2bb99354b --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/onsite_projector.h @@ -0,0 +1,159 @@ +#ifndef MODULEHAMILTPW_ONSITEPROJECTOR_H +#define MODULEHAMILTPW_ONSITEPROJECTOR_H +#include "module_base/module_device/device.h" +#include "module_hsolver/kernels/math_kernel_op.h" +#include "module_hamilt_pw/hamilt_pwdft/structure_factor.h" +#include "module_basis/module_pw/pw_basis_k.h" +#include "module_hamilt_pw/hamilt_pwdft/radial_proj.h" +#include "module_psi/psi.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_proj_tools.h" + +#include +#include +#include +namespace projectors +{ + template + class OnsiteProjector + { + public: + + /** + * @brief initialize the radial projector for real-space projection involving operators + * + * @param orbital_dir You know what it is + * @param orb_files You know what it is + * @param nproj # of projectors for each type defined in UnitCell, can be zero + * @param lproj angular momentum for each projector + * @param iproj index of zeta function that each projector generated from + * @param onsite_r onsite-radius for all valid projectors + * @param rgrid [out] the radial grid shared by all projectors + * @param projs [out] projectors indexed by `iproj` + * @param it2iproj [out] for each type, the projector index (across all types) + */ + void init_proj(const 
std::string& orbital_dir, + const std::vector& orb_files, + const std::vector& nproj, // for each type, the number of projectors + const std::vector& lproj, // angular momentum of projectors within the type (l of zeta function) + const std::vector& iproj, // index of projectors within the type (izeta) + const std::vector& onsite_r); // for each type, the projector index (across all types) + + /** + * @brief calculate the onsite projectors in reciprocal space(|G+K>) for all atoms + */ + void tabulate_atomic(const int ik, const char grad = 'n'); + + void overlap_proj_psi( + const int npm, + const std::complex* ppsi + ); + void read_abacus_orb(std::ifstream& ifs, + std::string& elem, + double& ecut, + int& nr, + double& dr, + std::vector& nzeta, + std::vector>& radials, + const int rank = 0); + /// @brief static access to this class instance + static OnsiteProjector* get_instance(); + void init(const std::string& orbital_dir, + const UnitCell* ucell_in, + const psi::Psi, Device>& psi, + const K_Vectors& kv, + const ModulePW::PW_Basis_K& pw_basis, // level1: the plane wave basis, need ik + Structure_Factor& sf, // level2: the structure factor calculator + const double onsite_radius, + const int nq, + const double dq, + const ModuleBase::matrix& wg, + const ModuleBase::matrix& ekb); + + /// @brief calculate and print the occupations of all lm orbitals + void cal_occupations(const psi::Psi, Device>* psi, const ModuleBase::matrix& wg_in); + + int get_size_becp() const { return size_becp; } + std::complex* get_becp() const { return becp; } + std::complex* get_h_becp() const { return h_becp; } + std::complex* get_tab_atomic() const { return tab_atomic_; } + int get_tot_nproj() const { return tot_nproj; } + int get_npw() const { return npw_; } + int get_npwx() const { return npwx_; } + const int& get_nh(int iat) const { return iat_nh[iat]; } + + hamilt::Onsite_Proj_tools* get_fs_tools() const { return fs_tools; } + + private: + OnsiteProjector(){}; + ~OnsiteProjector(); + + 
Device* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + base_device::AbacusDevice_t device = {}; + static OnsiteProjector *instance; + + hamilt::Onsite_Proj_tools* fs_tools = nullptr; + + std::complex* tab_atomic_ = nullptr; + std::complex* becp = nullptr; // nbands * nkb + // save becp in CPU memory, only used when Device is GPU + std::complex* h_becp; + + int size_becp = 0; + int size_vproj = 0; + int tot_nproj = 0; + int npw_ = 0; + int npwx_ = 0; + int ik_ = 0; + std::vector> it2ia; + std::vector rgrid; + std::vector> projs; + std::vector> it2iproj; + std::vector lproj; + std::vector iat_nh; + + const UnitCell* ucell = nullptr; + + const ModulePW::PW_Basis_K* pw_basis_ = nullptr; // level1: the plane wave basis, need ik + Structure_Factor* sf_ = nullptr; // level2: the structure factor calculator + int ntype = 0; + + RadialProjection::RadialProjector rp_; + std::vector irow2it_; + std::vector irow2iproj_; + std::vector irow2m_; + std::map, int> itiaiprojm2irow_; + + ModuleBase::realArray tab; + ModuleBase::matrix nhtol; + + bool initialed = false; + + /// @brief rename the operators for CPU/GPU device + using gemm_op = hsolver::gemm_op, Device>; + + using resmem_complex_op = base_device::memory::resize_memory_op, Device>; + using resmem_complex_h_op = base_device::memory::resize_memory_op, base_device::DEVICE_CPU>; + using setmem_complex_op = base_device::memory::set_memory_op, Device>; + using delmem_complex_op = base_device::memory::delete_memory_op, Device>; + using delmem_complex_h_op = base_device::memory::delete_memory_op, base_device::DEVICE_CPU>; + using syncmem_complex_h2d_op + = base_device::memory::synchronize_memory_op, Device, base_device::DEVICE_CPU>; + using syncmem_complex_d2h_op + = base_device::memory::synchronize_memory_op, base_device::DEVICE_CPU, Device>; + + using resmem_var_op = base_device::memory::resize_memory_op; + using resmem_var_h_op = base_device::memory::resize_memory_op; + using setmem_var_op = 
base_device::memory::set_memory_op; + using delmem_var_op = base_device::memory::delete_memory_op; + using delmem_var_h_op = base_device::memory::delete_memory_op; + using syncmem_var_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_var_d2h_op = base_device::memory::synchronize_memory_op; + + using resmem_int_op = base_device::memory::resize_memory_op; + using delmem_int_op = base_device::memory::delete_memory_op; + using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op; + }; +}// namespace projectors + +#endif // MODULEHAMILTPW_ONSITEPROJECTOR_H \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt index 83f7955dbb..57f45558fb 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/CMakeLists.txt @@ -5,6 +5,7 @@ list(APPEND operator_ks_pw_srcs nonlocal_pw.cpp meta_pw.cpp velocity_pw.cpp + onsite_proj_pw.cpp ) # this library is included in hamilt_pwdft now diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp new file mode 100644 index 0000000000..39f0c1458a --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.cpp @@ -0,0 +1,422 @@ +#include "onsite_proj_pw.h" + +#include "module_base/blas_connector.h" +#include "module_base/timer.h" +#include "module_base/parallel_reduce.h" +#include "module_base/tool_quit.h" +#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h" +#include "module_hamilt_lcao/module_dftu/dftu.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_hamilt_pw/hamilt_pwdft/kernels/onsite_op.h" +#ifdef USE_PAW +#include "module_cell/module_paw/paw_cell.h" +#endif + +namespace hamilt { + +template +OnsiteProj>::OnsiteProj(const int* isk_in, + const UnitCell* ucell_in, 
+ const bool cal_delta_spin, + const bool cal_dftu) +{ + this->classname = "OnsiteProj"; + this->cal_type = calculation_type::pw_onsite; + this->isk = isk_in; + this->ucell = ucell_in; + this->has_delta_spin = cal_delta_spin; + this->has_dftu = cal_dftu; +} + +template +OnsiteProj>::~OnsiteProj() { + delmem_complex_op()(this->ctx, this->ps); + if(this->init_delta_spin) + { + delmem_int_op()(this->ctx, this->ip_iat); + delmem_complex_op()(this->ctx, this->lambda_coeff); + } + if(this->has_dftu) + { + if(!init_delta_spin) + { + delmem_int_op()(this->ctx, this->ip_iat); + } + delmem_int_op()(this->ctx, this->orb_l_iat); + delmem_int_op()(this->ctx, this->ip_m); + delmem_int_op()(this->ctx, this->vu_begin_iat); + delmem_complex_op()(this->ctx, this->vu_device); + } +} + +template +void OnsiteProj>::init(const int ik_in) +{ + ModuleBase::timer::tick("OnsiteProj", "getvnl"); + this->ik = ik_in; + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + onsite_p->tabulate_atomic(ik_in); + this->tnp = onsite_p->get_tot_nproj(); + + if(this->next_op != nullptr) + { + this->next_op->init(ik_in); + } + + ModuleBase::timer::tick("OnsiteProj", "getvnl"); +} + +//-------------------------------------------------------------------------- +// this function sum up each non-local pseudopotential located on each atom, +//-------------------------------------------------------------------------- +template +void OnsiteProj>::add_onsite_proj(T *hpsi_in, const int npol, const int m) const +{ + ModuleBase::timer::tick("OnsiteProj", "add_onsite_proj"); + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + // apply the operator to the wavefunction + //std::cout << "use of tab_atomic at " << __FILE__ << ": " << __LINE__ << std::endl; + const std::complex* tab_atomic = onsite_p->get_tab_atomic(); + const int npw = onsite_p->get_npw(); + const int npwx = onsite_p->get_npwx(); + char transa = 'N'; + char transb = 'T'; + int npm = m; + gemm_op()( + this->ctx, + transa, + 
transb, + npw, + npm, + this->tnp, + &this->one, + tab_atomic, + npw, + this->ps, + npm, + &this->one, + hpsi_in, + npwx + ); + ModuleBase::timer::tick("OnsiteProj", "add_onsite_proj"); +} + +template +void OnsiteProj>::update_becp(const T *psi_in, const int npol, const int m) const +{ + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + // calculate + // std::cout << __FILE__ << ":" << __LINE__ << " nbands = " << m << std::endl; + onsite_p->overlap_proj_psi(m, psi_in); +} + +template +void OnsiteProj>::cal_ps_delta_spin(const int npol, const int m) const +{ + if(!this->has_delta_spin) return; + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + const std::complex* becp = onsite_p->get_becp(); + + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + auto& constrain = sc.get_constrain(); + auto& lambda = sc.get_sc_lambda(); + + // T *ps = new T[tnp * m]; + // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp); + if (this->nkb_m < m * tnp) { + resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj::ps"); + this->nkb_m = m * tnp; + } + setmem_complex_op()(this->ctx, this->ps, 0, tnp * m); + + if(!this->init_delta_spin) + { + this->init_delta_spin = true; + //prepare ip_iat and lambda_coeff + resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj()); + resmem_complex_op()(this->ctx, this->lambda_coeff, this->ucell->nat * 4); + std::vector ip_iat0(onsite_p->get_tot_nproj()); + int ip0 = 0; + for(int iat=0;iatucell->nat;iat++) + { + for(int ip=0;ipget_nh(iat);ip++) + { + ip_iat0[ip0++] = iat; + } + } + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj()); + } + + // prepare array of nh_iat and lambda_array to pass to the onsite_ps_op operator + std::vector> tmp_lambda_coeff(this->ucell->nat * 4); + for(int iat=0;iatucell->nat;iat++) + { + tmp_lambda_coeff[iat * 4] = std::complex(lambda[iat][2], 0.0); + tmp_lambda_coeff[iat * 4 + 1] = 
std::complex(lambda[iat][0], lambda[iat][1]); + tmp_lambda_coeff[iat * 4 + 2] = std::complex(lambda[iat][0], -1 * lambda[iat][1]); + tmp_lambda_coeff[iat * 4 + 3] = std::complex(-1 * lambda[iat][2], 0.0); + } + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->lambda_coeff, tmp_lambda_coeff.data(), this->ucell->nat * 4); + // TODO: code block above should be moved to the init function + + hamilt::onsite_ps_op()( + this->ctx, // device context + m, + npol, + this->ip_iat, + tnp, + this->lambda_coeff, + this->ps, becp); + + /*int sum = 0; + if (npol == 1) + { + const int current_spin = this->isk[this->ik]; + } + else + { + for (int iat = 0; iat < this->ucell->nat; iat++) + { + const int nproj = onsite_p->get_nh(iat); + if(constrain[iat].x == 0 && constrain[iat].y == 0 && constrain[iat].z == 0) + { + sum += nproj; + continue; + } + const std::complex coefficients0(lambda[iat][2], 0.0); + const std::complex coefficients1(lambda[iat][0] , lambda[iat][1]); + const std::complex coefficients2(lambda[iat][0] , -1 * lambda[iat][1]); + const std::complex coefficients3(-1 * lambda[iat][2], 0.0); + // each atom has nproj, means this is with structure factor; + // each projector (each atom) must multiply coefficient + // with all the other projectors. 
+ for (int ib = 0; ib < m; ib+=2) + { + for (int ip = 0; ip < nproj; ip++) + { + const int psind = (sum + ip) * m + ib; + const int becpind = ib * tnp + sum + ip; + const std::complex becp1 = becp[becpind]; + const std::complex becp2 = becp[becpind + tnp]; + ps[psind] += coefficients0 * becp1 + + coefficients2 * becp2; + ps[psind + 1] += coefficients1 * becp1 + + coefficients3 * becp2; + } // end ip + } // end ib + sum += nproj; + } // end iat + }*/ +} + +template +void OnsiteProj>::cal_ps_dftu(const int npol, const int m) const +{ + if(!this->has_dftu) return; + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + const std::complex* becp = onsite_p->get_becp(); + + auto* dftu = ModuleDFTU::DFTU::get_instance(); + + // T *ps = new T[tnp * m]; + // ModuleBase::GlobalFunc::ZEROS(ps, m * tnp); + if (this->nkb_m < m * tnp) { + resmem_complex_op()(this->ctx, this->ps, tnp * m, "OnsiteProj::ps"); + this->nkb_m = m * tnp; + } + if(!this->has_delta_spin) + { + setmem_complex_op()(this->ctx, this->ps, 0, tnp * m); + } + + if(!this->init_dftu) + { + this->init_dftu = true; + //prepare orb_l_iat, ip_m, vu_begin_iat and vu_device + resmem_int_op()(this->ctx, this->orb_l_iat, this->ucell->nat); + resmem_int_op()(this->ctx, this->ip_m, onsite_p->get_tot_nproj()); + resmem_int_op()(this->ctx, this->vu_begin_iat, this->ucell->nat); + // recal the ip_iat + resmem_int_op()(this->ctx, this->ip_iat, onsite_p->get_tot_nproj()); + std::vector ip_iat0(onsite_p->get_tot_nproj()); + std::vector ip_m0(onsite_p->get_tot_nproj()); + std::vector vu_begin_iat0(this->ucell->nat); + std::vector orb_l_iat0(this->ucell->nat); + int ip0 = 0; + int vu_begin = 0; + for(int iat=0;iatucell->nat;iat++) + { + const int it = this->ucell->iat2it[iat]; + const int target_l = dftu->orbital_corr[it]; + orb_l_iat0[iat] = target_l; + const int nproj = onsite_p->get_nh(iat); + if(target_l == -1) + { + for(int ip=0;ip= m_begin && ip < m_end) + { + ip_m0[ip0++] = ip - m_begin; + } + else + { + 
ip_m0[ip0++] = -1; + } + } + } + } + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->orb_l_iat, orb_l_iat0.data(), this->ucell->nat); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_iat, ip_iat0.data(), onsite_p->get_tot_nproj()); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->ip_m, ip_m0.data(), onsite_p->get_tot_nproj()); + syncmem_int_h2d_op()(this->ctx, this->cpu_ctx, this->vu_begin_iat, vu_begin_iat0.data(), this->ucell->nat); + + resmem_complex_op()(this->ctx, this->vu_device, dftu->get_size_eff_pot_pw()); + } + + syncmem_complex_h2d_op()(this->ctx, this->cpu_ctx, this->vu_device, dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw()); + + hamilt::onsite_ps_op()( + this->ctx, // device context + m, + npol, + this->orb_l_iat, + this->ip_iat, + this->ip_m, + this->vu_begin_iat, + tnp, + this->vu_device, + this->ps, becp); + + /* + int sum = 0; + if (npol == 1) + { + const int current_spin = this->isk[this->ik]; + } + else + { + for (int iat = 0; iat < this->ucell->nat; iat++) + { + const int it = this->ucell->iat2it[iat]; + const int target_l = dftu->orbital_corr[it]; + const int nproj = onsite_p->get_nh(iat); + if(target_l == -1) + { + sum += nproj; + continue; + } + const int ip_begin = target_l * target_l; + const int ip_end = (target_l + 1) * (target_l + 1); + const int tlp1 = 2 * target_l + 1; + const int tlp1_2 = tlp1 * tlp1; + const std::complex* vu = dftu->get_eff_pot_pw(iat); + // each projector (each atom) must multiply coefficient + // with all the other projectors. 
+ for (int ib = 0; ib < m; ib+=2) + { + for (int ip2 = ip_begin; ip2 < ip_end; ip2++) + { + const int psind = (sum + ip2) * m + ib; + const int m2 = ip2 - ip_begin; + for (int ip1 = ip_begin; ip1 < ip_end; ip1++) + { + const int becpind1 = ib * tnp + sum + ip1; + const int m1 = ip1 - ip_begin; + const int index_mm = m1 * tlp1 + m2; + const std::complex becp1 = becp[becpind1]; + const std::complex becp2 = becp[becpind1 + tnp]; + ps[psind] += vu[index_mm] * becp1 + + vu[index_mm + tlp1_2 * 2] * becp2; + ps[psind + 1] += vu[index_mm + tlp1_2 * 1] * becp1 + + vu[index_mm + tlp1_2 * 3] * becp2; + } // end ip1 + } // end ip2 + } // end ib + sum += nproj; + } // end iat + }*/ +} + +template<> +void OnsiteProj, base_device::DEVICE_CPU>>::add_onsite_proj(std::complex *hpsi_in, const int npol, const int m) const +{} +template<> +void OnsiteProj, base_device::DEVICE_CPU>>::update_becp(const std::complex *psi_in, const int npol, const int m) const +{} +template<> +void OnsiteProj, base_device::DEVICE_CPU>>::cal_ps_delta_spin(const int npol, const int m) const +{} +template<> +void OnsiteProj, base_device::DEVICE_CPU>>::cal_ps_dftu(const int npol, const int m) const +{} + +#if ((defined __CUDA) || (defined __ROCM)) +template<> +void OnsiteProj, base_device::DEVICE_GPU>>::add_onsite_proj(std::complex *hpsi_in, const int npol, const int m) const +{} +template<> +void OnsiteProj, base_device::DEVICE_GPU>>::update_becp(const std::complex *psi_in, const int npol, const int m) const +{} +template<> +void OnsiteProj, base_device::DEVICE_GPU>>::cal_ps_delta_spin(const int npol, const int m) const +{} +template<> +void OnsiteProj, base_device::DEVICE_GPU>>::cal_ps_dftu(const int npol, const int m) const +{} +#endif + +template +void OnsiteProj>::act( + const int nbands, + const int nbasis, + const int npol, + const T* tmpsi_in, + T* tmhpsi, + const int ngk_ik, + const bool is_first_node)const +{ + ModuleBase::timer::tick("Operator", "OnsiteProjPW"); + this->update_becp(tmpsi_in, npol, 
nbands); + this->cal_ps_delta_spin(npol, nbands); + this->cal_ps_dftu(npol, nbands); + this->add_onsite_proj(tmhpsi, npol, nbands); + ModuleBase::timer::tick("Operator", "OnsiteProjPW"); +} + +template +template +hamilt::OnsiteProj>::OnsiteProj(const OnsiteProj> *nonlocal) +{ + this->classname = "OnsiteProj"; + this->cal_type = calculation_type::pw_nonlocal; + // FIXME: +} + +template class OnsiteProj, base_device::DEVICE_CPU>>; +template class OnsiteProj, base_device::DEVICE_CPU>>; + +#if ((defined __CUDA) || (defined __ROCM)) +template class OnsiteProj, base_device::DEVICE_GPU>>; +template class OnsiteProj, base_device::DEVICE_GPU>>; +#endif +} // namespace hamilt \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.h b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.h new file mode 100644 index 0000000000..975967d5c8 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/onsite_proj_pw.h @@ -0,0 +1,98 @@ +#ifndef MODULEHAMILTPW_ONSITE_PROJ_PW_H +#define MODULEHAMILTPW_ONSITE_PROJ_PW_H + +#include "operator_pw.h" + +#include "module_cell/unitcell.h" +#include "module_hsolver/kernels/math_kernel_op.h" + +namespace hamilt { + +#ifndef ONSITETEMPLATE_H +#define ONSITETEMPLATE_H + +template class OnsiteProj : public T {}; +// template +// class OnsiteProj : public OperatorPW {}; + +#endif + +template +class OnsiteProj> : public OperatorPW +{ + private: + using Real = typename GetTypeReal::type; + public: + OnsiteProj(const int* isk_in, + const UnitCell* ucell_in, + const bool cal_delta_spin, + const bool cal_dftu); + + template + explicit OnsiteProj(const OnsiteProj>* onsite_proj); + + virtual ~OnsiteProj(); + + virtual void init(const int ik_in)override; + + virtual void act(const int nbands, + const int nbasis, + const int npol, + const T* tmpsi_in, + T* tmhpsi, + const int ngk = 0, + const bool is_first_node = false)const override; + + const int *get_isk() const {return this->isk;} + 
const UnitCell *get_ucell() const {return this->ucell;} + + private: + void cal_ps_delta_spin(const int npol, const int m) const; + void cal_ps_dftu(const int npol, const int m) const; + void update_becp(const T* psi_in, const int npol, const int m) const; + void add_onsite_proj(T *hpsi_in, const int npol, const int m) const; + + const int* isk = nullptr; + + const UnitCell* ucell = nullptr; + + mutable int* ip_iat = nullptr; + mutable T* lambda_coeff = nullptr; + mutable int* orb_l_iat = nullptr; + mutable int* ip_m = nullptr; + mutable int* vu_begin_iat = nullptr; + mutable T* vu_device = nullptr; + + mutable int nkb_m = 0; + + bool has_delta_spin = false; + bool has_dftu = false; + + mutable bool init_dftu = false; + mutable bool init_delta_spin = false; + + mutable T *ps = nullptr; + int tnp = 0; + Device* ctx = {}; + base_device::DEVICE_CPU* cpu_ctx = {}; + + using gemv_op = hsolver::gemv_op; + using gemm_op = hsolver::gemm_op; + using setmem_complex_op = base_device::memory::set_memory_op; + using resmem_complex_op = base_device::memory::resize_memory_op; + using delmem_complex_op = base_device::memory::delete_memory_op; + using syncmem_complex_h2d_op = base_device::memory::synchronize_memory_op; + using resmem_int_op = base_device::memory::resize_memory_op; + using resmem_real_op = base_device::memory::resize_memory_op; + using delmem_int_op = base_device::memory::delete_memory_op; + using delmem_real_op = base_device::memory::delete_memory_op; + using syncmem_int_h2d_op = base_device::memory::synchronize_memory_op; + using syncmem_real_h2d_op = base_device::memory::synchronize_memory_op; + + T one{1, 0}; + T zero{0, 0}; +}; + +} // namespace hamilt + +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/projop_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/operator_pw/projop_pw.cpp deleted file mode 100644 index b419d9ce88..0000000000 --- a/source/module_hamilt_pw/hamilt_pwdft/operator_pw/projop_pw.cpp +++ 
/dev/null @@ -1,285 +0,0 @@ -#include -#include -#include "module_parameter/parameter.h" -#include -#include -#include -#include -#include -#include -#include - -#include "module_hamilt_pw/hamilt_pwdft/radial_proj.h" -#include "module_basis/module_nao/projgen.h" -#include "module_basis/module_nao/atomic_radials.h" -#include "module_hamilt_pw/hamilt_pwdft/structure_factor.h" -#include "module_basis/module_pw/pw_basis_k.h" -#include "module_cell/unitcell.h" -#include "module_base/blas_connector.h" -#ifdef __MPI -#include "module_base/parallel_reduce.h" -#endif -#include "module_io/orb_io.h" -/** - * =============================================================================================== - * - * README - * - * =============================================================================================== - * - * This is a code demo for illustrating how to use unified radial projection in implementation of - * Operators involving local radial projectors on PW-expanded wavefunctions. - * - * Example usage: - * ```c++ - * // select the range of atoms that impose the operator in std::vector> it2ia like - * // it2ia[it] = {ia1, ia2, ...} for each type - * // if all atoms in present kind is "selected", just set it2ia[it].resize(na) and call - * // std::iota(it2ia[it].begin(), it2ia[it].end(), 0) - * - * std::vector> it2ia; // as if we have given its value... - * - * // you should have the `orbital_dir` as the directory containing the orbital files, then those - * // will be read by a static function `AtomicRadials::read_abacus_orb` to get the radial orbitals - * - * // call `init_proj` to initialize the radial projector, this function only needs to be called - * // once during the runtime. - * // its input... - * // the `nproj`, is for specifying number of projectors of each atom type, can be zero, - * // but cannot be the value larger than the number of zeta functions for the given angular momentum. 
- * // the `lproj` is the angular momentum of the projectors, and `iproj` is the index of zeta function - * // that each projector generated from. - * // the `lproj` along with `iproj` can enable radial projectors in any number developer wants. - * - * // the `onsite_r` is the onsite-radius for all valid projectors, it is used to generate the new - * // radial function that more localized than the original one, which is expected to have enhanced - * // projection efficiency. - * - * std::vector rgrid; - * std::vector> projs; - * std::vector> it2iproj; - * init_proj(orbital_dir, ucell, nproj, lproj, iproj, onsite_r, rgrid, projs, it2iproj); - * - * // then call the function `cal_becp` to calculate the becp. HOWEVER, there are quantities that - * // can be calculated in advance and reused in the following calculations. Please see the function - * // implementation, especially the comments about CACHE 0, CACHE 1, CACHE 2..., etc. - * - * // the input param of `cal_becp`... - * // the `it2ia` has been explained above - * // the `it2iproj` is the output of function `init_proj`, so you do not need to worry about it - * // the `rgrid` and `projs` are also the output of function `init_proj` - * // the `iproj2l` is the angular momentum for each projector, actually you have used it in `init_proj`, it - * // is the same as `lproj` - * // the `nq` is the number of G+k vectors, typically it is always PARAM.globalv.nqx - * // the `dq` is the step size of G+k vectors, typically it is always PARAM.globalv.dq - * // the `ik` is the k-point index - * // the `pw_basis` is the plane wave basis, need ik - * // the `omega` is the cell volume - * // the `tpiba` is 2*pi/lat0 - * // the `sf` is the structure factor calculator - * // the `psi` is the wavefunction - * // the `becp` is the output of the function, it is the becp - * cal_becp(it2ia, it2iproj, rgrid, projs, iproj2l, nq, dq, ik, pw_basis, omega, tpiba, sf, psi, becp); - * - * // About parallelization, presently, the function 
`AtomicRadials::read_abacus_orb` is actually parallelized - * // by MPI, so after the reading of orbital, actually all processors have the same data. Therefore it is not - * // needed to call functions like `Parallel_Reduce` or `Parallel_Bcast` to synchronize the data. - * // However, what is strikingly memory-consuming is the table `tab_atomic_`. Performance optimization will - * // be needed if the memory is not enough. - */ - - -/** - * @brief initialize the radial projector for real-space projection involving operators - * - * @param orbital_dir You know what it is - * @param orb_files You know what it is - * @param nproj # of projectors for each type defined in UnitCell, can be zero - * @param lproj angular momentum for each projector - * @param iproj index of zeta function that each projector generated from - * @param onsite_r onsite-radius for all valid projectors - * @param rgrid [out] the radial grid shared by all projectors - * @param projs [out] projectors indexed by `iproj` - * @param it2iproj [out] for each type, the projector index (across all types) - */ -void init_proj(const std::string& orbital_dir, - const std::vector& orb_files, - const std::vector& nproj, // for each type, the number of projectors - const std::vector& lproj, // angular momentum of projectors within the type (l of zeta function) - const std::vector& iproj, // index of projectors within the type (izeta) - const std::vector& onsite_r, // for each projector, the "onsite_radius" - std::vector& rgrid, // the radial grid shared by all projectors - std::vector>& projs, // projectors indexed by `iproj` - std::vector>& it2iproj) // for each type, the projector index (across all types) -{ - // extract the information from ucell - const int ntype = nproj.size(); - assert(ntype == orb_files.size()); - int nproj_tot = 0; - std::accumulate(nproj.begin(), nproj.end(), nproj_tot); - assert(nproj_tot == lproj.size()); - assert(nproj_tot == iproj.size()); - assert(nproj_tot == onsite_r.size()); - 
projs.resize(nproj_tot); - - int idx = 0; - int nr = -1; - double dr = -1.0; - for(int it = 0; it < ntype; ++it) - { - const int nproj_it = nproj[it]; - it2iproj[it].resize(nproj_it); - if(nproj_it == 0) { continue; } - std::ifstream ifs(orbital_dir + "/" + orb_files[it]); - std::string elem = ""; - double ecut = -1.0; - int nr_ = -1; - double dr_ = -1.0; - std::vector nzeta; // number of radials for each l - std::vector> radials; // radials arranged in serial - ModuleIO::read_abacus_orb(ifs, elem, ecut, nr_, dr_, nzeta, radials); -#ifdef __DEBUG - assert(elem != ""); - assert(ecut != -1.0); - assert(nr_ != -1); - assert(dr_ != -1.0); -#endif - nr = std::max(nr, nr_); // the maximal nr - assert(dr == -1.0 || dr == dr_); // the dr should be the same for all types - dr = (dr == -1.0) ? dr_ : dr; - for(int ip = 0; ip < nproj_it; ++ip) - { - int l = lproj[idx]; - int izeta = iproj[idx]; - int irad = 0; - std::accumulate(nzeta.begin(), nzeta.begin() + l, irad); - irad += izeta; - std::vector temp = radials[irad]; - smoothgen(nr, rgrid.data(), temp.data(), onsite_r[idx], projs[idx]); - it2iproj[it][ip] = idx; - ++idx; - } - } - // do zero padding - if(nr != -1) - { - std::for_each(projs.begin(), projs.end(), [nr](std::vector& proj) { proj.resize(nr, 0.0); }); - } - // generate the rgrid - rgrid.resize(nr); - std::iota(rgrid.begin(), rgrid.end(), 0); - std::for_each(rgrid.begin(), rgrid.end(), [dr](double& r_i) { r_i *= dr; }); -} - -// I am sorry but what does becp mean?... 
-void cal_becp(const std::vector>& it2ia, // level0: for given type `it`, the atom indices `ia` - const std::vector>& it2iproj, // level0: for given type `it`, the proj indices `iproj` - const std::vector& rgrid, // level0: the radial grid shared by all projectors - const std::vector>& projs, // level0: projectors indexed by `iproj` - const std::vector& iproj2l, // level0: for given proj index `iproj`, the angular momentum `l` - const int nq, // level0: PARAM.globalv.nqx - const double& dq, // level0: PARAM.globalv.dq - const int ik, // level1: the k-point index - const ModulePW::PW_Basis_K& pw_basis, // level1: the plane wave basis, need ik - const double& omega, // level1: the cell volume - const double& tpiba, // level1: 2*pi/lat0 - Structure_Factor& sf, // level2: the structure factor calculator - const psi::Psi, base_device::DEVICE_CPU>& psi, - std::vector>& becp - ) -{ - // STAGE 0 - making the interpolation table - // CACHE 0 - if cache the irow2it, irow2iproj, irow2m, itiaiprojm2irow, can be reused for - // SCF, RELAX and CELL-RELAX calculation - // [in] rgrid, projs, iproj2l, it2ia, it2iproj, nq, dq - RadialProjection::RadialProjector rp; - std::vector irow2it; - std::vector irow2iproj; - std::vector irow2m; - std::map, int> itiaiprojm2irow; - RadialProjection::RadialProjector::_build_backward_map(it2iproj, iproj2l, irow2it, irow2iproj, irow2m); - RadialProjection::RadialProjector::_build_forward_map(it2ia, it2iproj, iproj2l, itiaiprojm2irow); - rp._build_sbt_tab(rgrid, projs, iproj2l, nq, dq); - - - // STAGE 1 - calculate the for the given G+k vector - // CACHE 1 - if cache the tab_, can be reused for SCF and RELAX calculation - // [in] pw_basis, ik, omega, tpiba, irow2it - const int npw = pw_basis.npwk[ik]; - std::vector> q(npw); - for(int ig = 0; ig < npw; ++ig) - { - q[ig] = pw_basis.getgpluskcar(ik, ig); // get the G+k vector, G+k will change during CELL-RELAX - } - const int nrow = irow2it.size(); - std::vector> tab_(nrow*npw); - rp.sbtft(q, tab_, 
'l', omega, tpiba); // l: , r: - q.clear(); - q.shrink_to_fit(); // release memory - - - // STAGE 2 - make_atomic: multiply e^iqtau and extend the to for each atom - // CACHE 2 - if cache the tab_atomic_, can be reused for SCF calculation - // [in] it2ia, itiaiprojm2irow, tab_, npw, sf - std::vector na(it2ia.size()); - for(int it = 0; it < it2ia.size(); ++it) - { - na[it] = it2ia[it].size(); - } - const int nrow_out = itiaiprojm2irow.size(); - std::vector> tab_atomic_(nrow_out*npw); // memory usage peak HERE - for(int irow = 0; irow < nrow; ++irow) - { - const int it = irow2it[irow]; - const int iproj = irow2iproj[irow]; - const int m = irow2m[irow]; - for(int ia = 0; ia < na[it]; ++ia) - { - // why Structure_Factor needs the FULL pw_basis??? - std::complex* sk = sf.get_sk(ik, it, ia, &pw_basis); - const int irow_out = itiaiprojm2irow.at(std::make_tuple(it, ia, iproj, m)); - for(int ig = 0; ig < npw; ++ig) - { - tab_atomic_[irow_out*npw + ig] = sk[ig]*tab_[irow*npw + ig]; - } - delete[] sk; - } - } - tab_.clear(); - tab_.shrink_to_fit(); // release memory - - - // STAGE 3 - cal_becp - // CACHE 3 - it is no use to cache becp, it will change in each SCF iteration - // [in] psi, tab_atomic_, npw, becp, ik - const int nbands = psi.get_nbands(); - const char transa = 'N'; - const char transb = 'N'; - const int one = 1; - const int lda = nrow_out; - const int ldb = npw; - const int ldc = nrow_out; - const std::complex alpha = 1.0; - const std::complex beta = 0.0; - - becp.resize(nbands*nrow_out); - psi.fix_k(ik); - BlasConnector::gemm(transa, // const char transa - transb, // const char transb - nrow_out, // const int m - nbands, // const int n - npw, // const int k - alpha, // const std::complex alpha - tab_atomic_.data(), // const std::complex* a - lda, // const int lda - psi.get_pointer(), // const std::complex* b - ldb, // const int ldb - beta, // const std::complex beta - becp.data(), // std::complex* c - ldc); // const int ldc -#ifdef __MPI - 
Parallel_Reduce::reduce_pool(becp.data(), becp.size()); -#endif - tab_atomic_.clear(); - tab_atomic_.shrink_to_fit(); // release memory -} diff --git a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp index 8ac987df45..88cfeeadee 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.cpp @@ -8,6 +8,7 @@ #include "module_base/matrix.h" #include "module_base/math_ylmreal.h" #include "module_base/spherical_bessel_transformer.h" +#include "module_base/timer.h" void RadialProjection::RadialProjector::_build_backward_map(const std::vector>& it2iproj, const std::vector& iproj2l, @@ -81,6 +82,7 @@ void RadialProjection::RadialProjector::_build_sbt_tab(const int nr, const int nq, const double& dq) { + ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial"); l_ = l; const int nrad = radials.size(); assert(nrad == l.size()); @@ -104,6 +106,7 @@ void RadialProjection::RadialProjector::_build_sbt_tab(const int nr, std::for_each(_temp.begin(), _temp.end(), [pref](double& x){x = x/pref;}); cubspl_->add(_temp.data()); } + ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial"); } void RadialProjection::RadialProjector::_build_sbt_tab(const std::vector& r, @@ -112,20 +115,91 @@ void RadialProjection::RadialProjector::_build_sbt_tab(const std::vector const int nq, const double& dq) { + ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial"); const int nr = r.size(); const int nrad = radials.size(); for(int i = 0; i < nrad; i++) { assert(radials[i].size() == nr); } std::vector radptrs(radials.size()); for(int i = 0; i < radials.size(); i++) { radptrs[i] = const_cast(radials[i].data()); } + ModuleBase::timer::tick("RadialProjection", "cubspl_tabulate_vq_each_radial"); _build_sbt_tab(nr, r.data(), radptrs, l, nq, dq); } +void RadialProjection::RadialProjector::_build_sbt_tab(const std::vector& 
nproj, + const std::vector& r, + const std::vector>& radials, + const std::vector& l, + const int nq, //< GlobalV::DQ + const double& dq, //< GlobalV::NQX + const double& omega, + const int npol, // for nspin 4 + ModuleBase::realArray& tab, + ModuleBase::matrix& nhtol) // output table +{ + int nprojmax = *std::max_element(nproj.begin(), nproj.end()); + const int ntype = nproj.size(); + + tab.create(ntype, nprojmax*npol, nq); + tab.zero_out(); + + std::vector qgrid(nq); + std::iota(qgrid.begin(), qgrid.end(), 0); + std::transform(qgrid.begin(), qgrid.end(), qgrid.begin(), [dq](const double& q){return q*dq;}); + + ModuleBase::SphericalBesselTransformer sbt_(true); // bool: enable cache + int iproj = 0; + int nchmax = 0; + const double pref = 4*M_PI/std::sqrt(omega) / std::sqrt(2.0/std::acos(-1.0)); + for (int it = 0; it < ntype; it++) + { + int nch = 0; + const int nproj_it = nproj[it]; + for (int ip = 0; ip < nproj_it; ip++) + { + const int l_ = l[iproj]; + nch += 2*l_ + 1; + std::vector _temp(nq); + sbt_.direct(l_, r.size(), r.data(), radials[iproj].data(), nq, qgrid.data(), _temp.data()); + std::for_each(_temp.begin(), _temp.end(), [pref](double& x){x = x*pref;}); + for (int iq = 0; iq < nq; iq++) + { + tab(it, ip, iq) = _temp[iq]; + //std::cout << tab(it, ip, iq) << " "; + } + iproj++; + } + nchmax = std::max(nchmax, nch); + } + //std::cout << std::endl; + //ModuleBase::WARNING_QUIT("RadialProjection", "The following code is not implemented yet."); + + nhtol.create(ntype, nchmax); + nhtol.zero_out(); + iproj = 0; + for (int it = 0; it < ntype; it++) + { + int ih = 0; // channel index, across all projectors of present type + for (int ip = 0; ip < nproj[it]; ip++) + { + const int l_ = l[iproj]; + for (int m = -l_; m <= l_; m++) + { + nhtol(it, ih) = l_; + ih++; + } + iproj++; + } + } +} + void RadialProjection::RadialProjector::sbtft(const std::vector>& qs, std::vector>& out, const char type, const double& omega, const double& tpiba) { + 
ModuleBase::timer::tick("RadialProjection", "interp_sphbes_ft_flzYlm"); + assert(type == 'r' || type == 'l'); // type must be one of 'r' or 'l' // first cache the Ylm values const int lmax_ = *std::max_element(l_.begin(), l_.end()); const int total_lm = std::pow(lmax_+1, 2); @@ -146,7 +220,11 @@ void RadialProjection::RadialProjector::sbtft(const std::vector pref = (type == 'r')? std::pow(ModuleBase::IMAG_UNIT, l) : std::pow(ModuleBase::NEG_IMAG_UNIT, l); + // here is bug-prone + // we define l as and r as . The former is int{p(r)exp(iqr)} and the latter is int{p(r)exp(-iqr)} + // , in which we have use G+k=q notation. So once do Ylm expansion on exp(iqr), will get a pure imaginary + // prefactor i^l. + std::complex pref = (type == 'l')? std::pow(ModuleBase::IMAG_UNIT, l) : std::pow(ModuleBase::NEG_IMAG_UNIT, l); pref = pref * ModuleBase::FOUR_PI/std::sqrt(omega); cubspl_->eval(npw, qnorm.data(), Jlfq.data(), nullptr, nullptr, i); for(int m = -l; m <= l; m++) @@ -159,6 +237,7 @@ void RadialProjection::RadialProjector::sbtft(const std::vector& mask) diff --git a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h index 2867403b4b..d4a5511bac 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h +++ b/source/module_hamilt_pw/hamilt_pwdft/radial_proj.h @@ -155,30 +155,41 @@ namespace RadialProjection const double* r, const std::vector& radials, const std::vector& l, - const int nq, //< PARAM.globalv.dq - const double& dq); //< PARAM.globalv.nqx + const int nq, //< GlobalV::DQ + const double& dq); //< GlobalV::NQX void _build_sbt_tab(const std::vector& r, const std::vector>& radials, const std::vector& l, - const int nq, //< PARAM.globalv.dq - const double& dq); //< PARAM.globalv.nqx - + const int nq, //< GlobalV::DQ + const double& dq); //< GlobalV::NQX + // compatibility concern: for FS_Nonlocal_tools. 
sbtft will not be called on this table, so omega is needed here + void _build_sbt_tab(const std::vector& nproj, + const std::vector& r, + const std::vector>& radials, + const std::vector& l, + const int nq, //< number of q-grid points (GlobalV::NQX) + const double& dq, //< q-grid spacing (GlobalV::DQ) + const double& omega, + const int npol, + ModuleBase::realArray& tab, + ModuleBase::matrix& nhtol); /** * @brief perform analytical version of the Fourier transform: * F(q) = int(f(r)*exp(-iq.r) d^3r) - * = 4*pi/sqrt(omega) * i^l * Jl[f](q) * Ylm(q) + * = 4*pi/sqrt(omega) * (-i)^l * Jl[f](q) * Ylm(q) * , where Ylm(q) is real spherical harmonic function, and Jl[f](q) is * the Spherial Bessel Transform of f(r): * Jl[f](q) = int(f(r)*j_l(q*r)*r^2 dr) * , where j_l(q*r) is the spherical Bessel function of the first kind. - * + * . In bra-ket notation, F(q) = <q|f>; this is denoted as type + * "r" for ket |>, and "l" for bra <|. */ void sbtft(const std::vector>& qs, std::vector>& out, - const char type = 'r', + const char type = 'r', // 'r' for ket |>, 'l' for bra <| const double& omega = 1.0, - const double& tpiba = 1.0); // 'r' for ket |>, 'l' for bra <| + const double& tpiba = 1.0); // 'n' for no gradient, 'x', 'y', 'z' for gradient in x, y, z direction void sbfft(); // interface for SBFFT diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h index 6d5ee5581e..a81dbc9d93 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_func.h +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func.h @@ -162,6 +162,23 @@ class Stress_Func const psi::Psi, Device>* psi_in, const pseudopot_cell_vnl& nlpp_in, const UnitCell& ucell_in); // nonlocal part in PW basis + // 8) the stress from the DFT+U and DeltaSpin calculations + /** + * @brief This routine computes the stress contribution from the DFT+U and DeltaSpin calculations + * Stress^{NL}_{ij} = -1/\Omega \sum_{n,k}f_{nk}\sum_I \sum_{lm,l'm'}(V^U_{lmm'\sigma\sigma'} + + * f(\lambda,\sigma\sigma')) [ \sum_G \langle
c_{nk}(\mathbf{G+K})|\alpha_{lm}^I(\mathbf{G+K})\rangle * + * \sum_{G'}\langle \partial \alpha_{lm}^I(\mathbf{G+K})/\partial \varepsilon_{ij} + * |c_{nk}(\mathbf{G+K})\rangle ] there would be three parts in the above equation: (1) sum over becp and dbecp with + * f(U+\lambda, \sigma\sigma', lmm')^{I} ----- first line in the above equation (2) calculate becp = <c_{nk}|\alpha_{lm}^I> + * ----- second line in the above equation (3) calculate dbecp = <\partial \alpha_{lm}^I/\partial \varepsilon_{ij}|c_{nk}> ----- third line in the above + * equation + */ + void stress_onsite(ModuleBase::matrix& sigma, + const ModuleBase::matrix& wg, + const ModulePW::PW_Basis_K* wfc_basis, + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in, + ModuleSymmetry::Symmetry* p_symm); // onsite (DFT+U / DeltaSpin) part in PW basis void get_dvnl1(ModuleBase::ComplexMatrix& vkb, const int ik, diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp new file mode 100644 index 0000000000..8568821a10 --- /dev/null +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_func_onsite.cpp @@ -0,0 +1,113 @@ +#include "module_base/module_device/device.h" +#include "module_base/timer.h" +#include "module_hamilt_pw/hamilt_pwdft/onsite_projector.h" +#include "module_parameter/parameter.h" +#include "module_hamilt_lcao/module_dftu/dftu.h" +#include "module_hamilt_lcao/module_deltaspin/spin_constrain.h" +#include "stress_func.h" +// calculate the onsite (DFT+U and DeltaSpin) stress in PW +template +void Stress_Func::stress_onsite(ModuleBase::matrix& sigma, + const ModuleBase::matrix& wg, + const ModulePW::PW_Basis_K* wfc_basis, + const UnitCell& ucell_in, + const psi::Psi, Device>* psi_in, + ModuleSymmetry::Symmetry* p_symm) +{ + ModuleBase::TITLE("Stress_Func", "stress_onsite"); + if(psi_in == nullptr || wfc_basis == nullptr) + { + return; + } + ModuleBase::timer::tick("Stress_Func", "stress_onsite"); + + FPTYPE* stress_device = nullptr; + resmem_var_op()(this->ctx, stress_device, 9); + setmem_var_op()(this->ctx, stress_device,
0, 9); + std::vector sigma_onsite(9, 0.0); + + auto* onsite_p = projectors::OnsiteProjector::get_instance(); + + const int nks = wfc_basis->nks; + for (int ik = 0; ik < nks; ik++) // loop k points + { + // skip zero weights to speed up + int nbands_occ = wg.nc; + while (wg(ik, nbands_occ - 1) == 0.0) + { + nbands_occ--; + if (nbands_occ == 0) + { + break; + } + } + const int npm = nbands_occ; + + // calculate becp = <psi|beta> for all beta functions + onsite_p->get_fs_tools()->cal_becp(ik, npm); + // calculate dbecp = <psi|dbeta> for all beta functions + // calculate stress = \sum <psi|beta> * <psi|dbeta> * D_{ij} + for (int ipol = 0; ipol < 3; ipol++) + { + for (int jpol = 0; jpol <= ipol; jpol++) + { + FPTYPE* stress_device_tmp = stress_device + (ipol * 3 + jpol); + onsite_p->get_fs_tools()->cal_dbecp_s(ik, npm, ipol, jpol); + if(PARAM.inp.dft_plus_u) + { + auto* dftu = ModuleDFTU::DFTU::get_instance(); + onsite_p->get_fs_tools()->cal_stress_dftu(ik, npm, stress_device_tmp, dftu->orbital_corr.data(), dftu->get_eff_pot_pw(0), dftu->get_size_eff_pot_pw(), wg.c); + } + if(PARAM.inp.sc_mag_switch) + { + spinconstrain::SpinConstrain>& sc = spinconstrain::SpinConstrain>::getScInstance(); + const std::vector>& lambda = sc.get_sc_lambda(); + onsite_p->get_fs_tools()->cal_stress_dspin(ik, npm, stress_device_tmp, lambda.data(), wg.c); + } + } + } + } + // transfer stress from device to host + syncmem_var_d2h_op()(this->cpu_ctx, this->ctx, sigma_onsite.data(), stress_device, 9); + delmem_var_op()(this->ctx, stress_device); + // sum up the onsite stress from all processors + for (int l = 0; l < 3; l++) + { + for (int m = 0; m < 3; m++) + { + if (m > l) + { + sigma_onsite[l * 3 + m] = sigma_onsite[m * 3 + l]; + } + Parallel_Reduce::reduce_all(sigma_onsite[l * 3 + m]); // qianrui fix a bug for kpar > 1 + } + } + // rescale the stress with 1/omega + for (int ipol = 0; ipol < 3; ipol++) + { + for (int jpol = 0; jpol < 3; jpol++) + { + sigma_onsite[ipol * 3 + jpol] *= 1.0 / ucell_in.omega; + } + } + + for (int ipol = 0; ipol < 3;
ipol++) + { + for (int jpol = 0; jpol < 3; jpol++) + { + sigma(ipol, jpol) = sigma_onsite[ipol * 3 + jpol]; + } + } + // do symmetry + if (ModuleSymmetry::Symmetry::symm_flag == 1) + { + p_symm->symmetrize_mat3(sigma, ucell_in.lat); + } // end symmetry + + ModuleBase::timer::tick("Stress_Func", "stress_onsite"); +} + +template class Stress_Func; +#if ((defined __CUDA) || (defined __ROCM)) +template class Stress_Func; +#endif \ No newline at end of file diff --git a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp index 2a6925c912..e9cc5ded2b 100644 --- a/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp +++ b/source/module_hamilt_pw/hamilt_pwdft/stress_pw.cpp @@ -45,6 +45,9 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, // vdw stress ModuleBase::matrix sigmavdw; sigmavdw.create(3, 3); + // DFT+U and DeltaSpin stress + ModuleBase::matrix sigmaonsite; + sigmaonsite.create(3, 3); for (int i = 0; i < 3; i++) { @@ -59,6 +62,7 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, sigmaewa(i, j) = 0.0; sigmaxcc(i, j) = 0.0; sigmavdw(i, j) = 0.0; + sigmaonsite(i, j) = 0.0; } } @@ -107,13 +111,19 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, // vdw term stress_vdw(sigmavdw, ucell); + // DFT+U and DeltaSpin stress + if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + this->stress_onsite(sigmaonsite, this->pelec->wg, wfc_basis, ucell, d_psi_in, p_symm); + } + for (int ipol = 0; ipol < 3; ipol++) { for (int jpol = 0; jpol < 3; jpol++) { sigmatot(ipol, jpol) = sigmakin(ipol, jpol) + sigmahar(ipol, jpol) + sigmanl(ipol, jpol) + sigmaxc(ipol, jpol) + sigmaxcc(ipol, jpol) + sigmaewa(ipol, jpol) - + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol); + + sigmaloc(ipol, jpol) + sigmavdw(ipol, jpol) + sigmaonsite(ipol, jpol); } } @@ -138,6 +148,10 @@ void Stress_PW::cal_stress(ModuleBase::matrix& sigmatot, ModuleIO::print_stress("XC STRESS", sigmaxc, PARAM.inp.test_stress, ry); 
ModuleIO::print_stress("EWALD STRESS", sigmaewa, PARAM.inp.test_stress, ry); ModuleIO::print_stress("NLCC STRESS", sigmaxcc, PARAM.inp.test_stress, ry); + if(PARAM.inp.dft_plus_u || PARAM.inp.sc_mag_switch) + { + ModuleIO::print_stress("ONSITE STRESS", sigmaonsite, PARAM.inp.test_stress, ry); + } ModuleIO::print_stress("TOTAL STRESS", sigmatot, PARAM.inp.test_stress, ry); } ModuleBase::timer::tick("Stress_PW", "cal_stress"); diff --git a/source/module_hsolver/diago_iter_assist.cpp b/source/module_hsolver/diago_iter_assist.cpp index a092a8260c..5ec443ab4e 100644 --- a/source/module_hsolver/diago_iter_assist.cpp +++ b/source/module_hsolver/diago_iter_assist.cpp @@ -412,6 +412,171 @@ void DiagoIterAssist::diagH_LAPACK(const int nstart, ModuleBase::timer::tick("DiagoIterAssist", "diagH_LAPACK"); } +template +void DiagoIterAssist::cal_hs_subspace(const hamilt::Hamilt* pHamilt, // hamiltonian operator carrier + const psi::Psi& psi, // [in] wavefunction + T *hcc, + T *scc) +{ + const int nstart = psi.get_nbands(); + + setmem_complex_op()(ctx, hcc, 0, nstart * nstart); + setmem_complex_op()(ctx, scc, 0, nstart * nstart); + + const int dmin = psi.get_current_nbas(); + const int dmax = psi.get_nbasis(); + + T* temp = nullptr; + resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp"); + setmem_complex_op()(ctx, temp, 0, nstart * dmax); + + { // code block to calculate hcc and scc + setmem_complex_op()(ctx, temp, 0, nstart * dmax); + + T* hphi = temp; + // do hPsi for all bands + psi::Range all_bands_range(1, psi.get_current_k(), 0, nstart - 1); + hpsi_info hpsi_in(&psi, all_bands_range, hphi); + pHamilt->ops->hPsi(hpsi_in); + + gemm_op()(ctx, + 'C', + 'N', + nstart, + nstart, + dmin, + &one, + psi.get_pointer(), + dmax, + hphi, + dmax, + &zero, + hcc, + nstart); + + T* sphi = temp; + // do sPsi for all bands + pHamilt->sPsi(psi.get_pointer(), sphi, dmax, dmin, nstart); + + gemm_op()(ctx, + 'C', + 'N', + nstart, + nstart, + dmin, + &one, + psi.get_pointer(), + dmax, + 
sphi, + dmax, + &zero, + scc, + nstart); + } + + if (GlobalV::NPROC_IN_POOL > 1) + { + Parallel_Reduce::reduce_pool(hcc, nstart * nstart); + Parallel_Reduce::reduce_pool(scc, nstart * nstart); + } + + delmem_complex_op()(ctx, temp); +} + +template +void DiagoIterAssist::diag_responce( const T* hcc, + const T* scc, + const int nbands, + const T* mat_in, // [in] target matrix to be multiplied + T* mat_out, // [out] product of mat_in and the eigenvector matrix vcc + int mat_col, // [in] number of columns of target matrix + Real* en // [out] eigenvalues +) +{ + ModuleBase::TITLE("DiagoIterAssist", "diag_responce"); + ModuleBase::timer::tick("DiagoIterAssist", "diag_responce"); + + const int nstart = nbands; + + T *vcc = nullptr; + resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc"); + setmem_complex_op()(ctx, vcc, 0, nstart * nstart); + + // after generation of H and S matrix, diag them + DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc); + + { // code block to calculate tar_mat + gemm_op()(ctx, + 'N', + 'N', + mat_col, + nstart, + nstart, + &one, + mat_in, // mat_col * nstart + mat_col, + vcc, // nstart * nstart + nstart, + &zero, + mat_out, + mat_col); + } + + delmem_complex_op()(ctx, vcc); + + ModuleBase::timer::tick("DiagoIterAssist", "diag_responce"); +} + +template +void DiagoIterAssist::diag_subspace_psi(const T* hcc, + const T* scc, + const int dim_subspace, + psi::Psi& evc, + Real* en +) +{ + ModuleBase::TITLE("DiagoIterAssist", "diag_subspace_psi"); + ModuleBase::timer::tick("DiagoIterAssist", "diag_subspace_psi"); + + const int nstart = dim_subspace; + const int n_band = evc.get_nbands(); + + T *vcc = nullptr; + resmem_complex_op()(ctx, vcc, nstart * nstart, "DiagSub::vcc"); + setmem_complex_op()(ctx, vcc, 0, nstart * nstart); + + // after generation of H and S matrix, diag them + DiagoIterAssist::diagH_LAPACK(nstart, nstart, hcc, scc, nstart, en, vcc); + + { // code block to calculate tar_mat + const int dmin = evc.get_current_nbas(); + const int dmax = evc.get_nbasis(); + T*
temp = nullptr; + resmem_complex_op()(ctx, temp, nstart * dmax, "DiagSub::temp"); + setmem_complex_op()(ctx, temp, 0, nstart * dmax); + gemm_op()(ctx, + 'N', + 'N', + dmin, + n_band, + nstart, + &one, + evc.get_pointer(), // dmin * nstart + dmax, + vcc, // nstart * n_band + nstart, + &zero, + temp, + dmin); + matrixSetToAnother()(ctx, n_band, temp, dmin, evc.get_pointer(), dmax); + delmem_complex_op()(ctx, temp); + } + + delmem_complex_op()(ctx, vcc); + + ModuleBase::timer::tick("DiagoIterAssist", "diag_subspace_psi"); +} + template bool DiagoIterAssist::test_exit_cond(const int& ntry, const int& notconv) { diff --git a/source/module_hsolver/diago_iter_assist.h b/source/module_hsolver/diago_iter_assist.h index 560b37d682..c5c4e9cfaa 100644 --- a/source/module_hsolver/diago_iter_assist.h +++ b/source/module_hsolver/diago_iter_assist.h @@ -62,6 +62,39 @@ class DiagoIterAssist Real* e, T* vcc); + /// @brief calculate Hamiltonian and overlap matrix in subspace spanned by nstart states psi + /// @param pHamilt : hamiltonian operator carrier + /// @param psi : wavefunction + /// @param hcc : Hamiltonian matrix + /// @param scc : overlap matrix + static void cal_hs_subspace(const hamilt::Hamilt* pHamilt, // hamiltonian operator carrier + const psi::Psi& psi, // [in] wavefunction + T *hcc, + T *scc); + + /// @brief calculate the response matrix from rotation matrix solved by diagonalization of H and S matrix + /// @param hcc : Hamiltonian matrix + /// @param scc : overlap matrix + /// @param nbands : number of bands + /// @param mat_in : input matrix to be rotated + /// @param mat_out : output matrix to be rotated + /// @param mat_col : number of columns of target matrix + /// @param en : eigenvalues + static void diag_responce(const T* hcc, + const T* scc, + const int nbands, + const T* mat_in, + T* mat_out, + int mat_col, + Real* en); + + /// @brief calculate the response wavefunction psi from rotation matrix solved by diagonalization of H and S matrix + static void 
diag_subspace_psi(const T* hcc, + const T* scc, + const int dim_subspace, + psi::Psi& evc, + Real* en); + static bool test_exit_cond(const int& ntry, const int& notconv); private: diff --git a/source/module_io/input_conv.cpp b/source/module_io/input_conv.cpp index ebcf40a6c6..7ce7f0d764 100644 --- a/source/module_io/input_conv.cpp +++ b/source/module_io/input_conv.cpp @@ -18,11 +18,11 @@ #include "module_ri/exx_abfs-jle.h" #endif +#include "module_hamilt_lcao/module_dftu/dftu.h" #ifdef __LCAO #include "module_basis/module_ao/ORB_read.h" #include "module_elecstate/potentials/H_TDDFT_pw.h" #include "module_hamilt_lcao/hamilt_lcaodft/FORCE_STRESS.h" -#include "module_hamilt_lcao/module_dftu/dftu.h" #include "module_hamilt_lcao/module_tddft/evolve_elec.h" #include "module_hamilt_lcao/module_tddft/td_velocity.h" #endif @@ -243,7 +243,6 @@ void Input_Conv::Convert() // iteration (1/3) //---------------------------------------------------------- -#ifdef __LCAO if (PARAM.inp.dft_plus_u) { GlobalC::dftu.Yukawa = PARAM.inp.yukawa_potential; @@ -258,7 +257,6 @@ void Input_Conv::Convert() ModuleBase::GlobalFunc::ZEROS(GlobalC::dftu.U.data(), PARAM.inp.ntype); } } -#endif //---------------------------------------------------------- // Yu Liu add 2022-05-18 diff --git a/source/module_io/output_mulliken.h b/source/module_io/output_mulliken.h index 560dedfeaa..2d78d2fa52 100644 --- a/source/module_io/output_mulliken.h +++ b/source/module_io/output_mulliken.h @@ -7,6 +7,8 @@ #include "module_elecstate/elecstate_lcao.h" #include "module_io/output_dmk.h" #include "module_io/output_sk.h" +#include "module_base/formatter.h" +#include "module_hamilt_lcao/hamilt_lcaodft/operator_lcao/dspin_lcao.h" #include #include @@ -88,29 +90,105 @@ void cal_mag(Parallel_Orbitals* pv, hamilt::Hamilt* p_ham, K_Vectors& kv, elecstate::ElecState* pelec, + const TwoCenterBundle& two_center_bundle, + const LCAO_Orbitals& orb, UnitCell& ucell, const int istep, const bool print) { - auto cell_index - = 
CellIndex(ucell.get_atomLabels(), ucell.get_atomCounts(), ucell.get_lnchiCounts(), PARAM.inp.nspin); - auto out_sk = ModuleIO::Output_Sk(p_ham, pv, PARAM.inp.nspin, kv.get_nks()); - auto out_dmk = ModuleIO::Output_DMK(dynamic_cast*>(pelec)->get_DM(), - pv, - PARAM.inp.nspin, - kv.get_nks()); - auto mulp = ModuleIO::Output_Mulliken(&(out_sk), &(out_dmk), pv, &cell_index, kv.isk, PARAM.inp.nspin); - auto atom_chg = mulp.get_atom_chg(); - /// used in updating mag info in STRU file - ucell.atom_mulliken = mulp.get_atom_mulliken(atom_chg); - if (print && GlobalV::MY_RANK == 0) + // 1) calculate and output Mulliken population charges and magnetic moments + if (PARAM.inp.out_mul) { - /// write the Orbital file - cell_index.write_orb_info(PARAM.globalv.global_out_dir); - /// write mulliken.txt - mulp.write(istep, PARAM.globalv.global_out_dir); - /// write atomic mag info in running log file - mulp.print_atom_mag(atom_chg, GlobalV::ofs_running); + auto cell_index + = CellIndex(ucell.get_atomLabels(), ucell.get_atomCounts(), ucell.get_lnchiCounts(), PARAM.inp.nspin); + auto out_sk = ModuleIO::Output_Sk(p_ham, pv, PARAM.inp.nspin, kv.get_nks()); + auto out_dmk = ModuleIO::Output_DMK(dynamic_cast*>(pelec)->get_DM(), + pv, + PARAM.inp.nspin, + kv.get_nks()); + auto mulp = ModuleIO::Output_Mulliken(&(out_sk), &(out_dmk), pv, &cell_index, kv.isk, PARAM.inp.nspin); + auto atom_chg = mulp.get_atom_chg(); + /// used in updating mag info in STRU file + ucell.atom_mulliken = mulp.get_atom_mulliken(atom_chg); + if (print && GlobalV::MY_RANK == 0) + { + /// write the Orbital file + cell_index.write_orb_info(PARAM.globalv.global_out_dir); + /// write mulliken.txt + mulp.write(istep, PARAM.globalv.global_out_dir); + /// write atomic mag info in running log file + mulp.print_atom_mag(atom_chg, GlobalV::ofs_running); + } + } + // 2) calculate and output the magnetizations of each atom with projection method + if (PARAM.inp.onsite_radius > 0) + { + std::vector> atom_mag(ucell.nat, 
std::vector(PARAM.inp.nspin, 0.0)); + std::vector> constrain(ucell.nat, ModuleBase::Vector3(1, 1, 1)); + const hamilt::HContainer* dmr + = dynamic_cast*>(pelec)->get_DM()->get_DMR_pointer(1); + std::vector moments; + std::vector mag_x(ucell.nat, 0.0); + std::vector mag_y(ucell.nat, 0.0); + std::vector mag_z(ucell.nat, 0.0); + auto atomLabels = ucell.get_atomLabels(); + if(PARAM.inp.nspin == 2) + { + auto sc_lambda = new hamilt::DeltaSpin>( + nullptr, + kv.kvec_d, + nullptr, + ucell, + &GlobalC::GridD, + two_center_bundle.overlap_orb_onsite.get(), + orb.cutoffs() + ); + dynamic_cast*>(pelec)->get_DM()->switch_dmr(2); + moments = sc_lambda->cal_moment(dmr, constrain); + dynamic_cast*>(pelec)->get_DM()->switch_dmr(0); + delete sc_lambda; + //const std::vector title = {"Total Magnetism (uB)", ""}; + //const std::vector fmts = {"%-26s", "%20.10f"}; + //FmtTable table(title, ucell.nat, fmts, {FmtTable::Align::RIGHT, FmtTable::Align::LEFT}); + for(int iat=0;iat, std::complex>>( + nullptr, + kv.kvec_d, + nullptr, + ucell, + &GlobalC::GridD, + two_center_bundle.overlap_orb_onsite.get(), + orb.cutoffs() + ); + moments = sc_lambda->cal_moment(dmr, constrain); + delete sc_lambda; + //const std::vector title = {"Total Magnetism (uB)", "", "", ""}; + //const std::vector fmts = {"%-26s", "%20.10f", "%20.10f", "%20.10f"}; + //FmtTable table(title, ucell.nat, fmts, {FmtTable::Align::RIGHT, FmtTable::Align::LEFT}); + for(int iat=0;iatadd_item(item); } + { + Input_Item item("sc_os_ndim"); + item.annotation = "number of old iterations used for oscillation detection, for Spin-Constrained DFT"; + read_sync_int(input.sc_os_ndim); + this->add_item(item); + } { Input_Item item("scf_thr_type"); item.annotation = "type of the criterion of scf_thr, 1: reci drho for " diff --git a/source/module_io/read_input_item_exx_dftu.cpp b/source/module_io/read_input_item_exx_dftu.cpp index 3cfbae13e3..dc7c6a6025 100644 --- a/source/module_io/read_input_item_exx_dftu.cpp +++ 
b/source/module_io/read_input_item_exx_dftu.cpp @@ -339,16 +339,9 @@ void ReadInput::item_dftu() const Input_para& input = para.input; if (input.dft_plus_u != 0) { - if (input.basis_type != "lcao") + if (input.basis_type == "pw" && input.nspin != 4) { - ModuleBase::WARNING_QUIT("ReadInput", "WRONG ARGUMENTS OF basis_type, only lcao is support"); - } - if (input.ks_solver != "genelpa" && input.ks_solver != "scalapack_gvx" && input.ks_solver != "default") - { - std::cout << " You'are using " << input.ks_solver << std::endl; - ModuleBase::WARNING_QUIT("ReadInput", - "WRONG ARGUMENTS OF ks_solver in DFT+U routine, only " - "genelpa and scalapack_gvx are supported "); + ModuleBase::WARNING_QUIT("ReadInput", "WRONG ARGUMENTS, only nspin2 with PW base is not supported now"); } } }; diff --git a/source/module_io/test/read_input_ptest.cpp b/source/module_io/test/read_input_ptest.cpp index 1fc3f0568d..33608f6569 100644 --- a/source/module_io/test/read_input_ptest.cpp +++ b/source/module_io/test/read_input_ptest.cpp @@ -167,6 +167,7 @@ TEST_F(InputParaTest, ParaRead) EXPECT_EQ(param.inp.scf_os_stop, 1); EXPECT_NEAR(param.inp.scf_os_thr, -0.02, 1.0e-15); EXPECT_EQ(param.inp.scf_os_ndim, 10); + EXPECT_EQ(param.inp.sc_os_ndim, 5); EXPECT_NEAR(param.inp.scf_ene_thr, 1.0e-6, 1.0e-15); EXPECT_EQ(param.inp.scf_nmax, 50); EXPECT_EQ(param.inp.relax_nmax, 1); diff --git a/source/module_io/test_serial/read_input_item_test.cpp b/source/module_io/test_serial/read_input_item_test.cpp index 91325b9f00..b83e2df05a 100644 --- a/source/module_io/test_serial/read_input_item_test.cpp +++ b/source/module_io/test_serial/read_input_item_test.cpp @@ -1428,21 +1428,6 @@ TEST_F(InputTest, Item_test2) param.input.orbital_corr = {-1, -1}; it->second.reset_value(it->second, param); EXPECT_EQ(param.input.dft_plus_u, 0); - - param.input.dft_plus_u = 1; - param.input.basis_type = "pw"; - param.input.ks_solver = "genelpa"; - testing::internal::CaptureStdout(); - EXPECT_EXIT(it->second.check_value(it->second, 
param), ::testing::ExitedWithCode(1), ""); - output = testing::internal::GetCapturedStdout(); - EXPECT_THAT(output, testing::HasSubstr("NOTICE")); - - param.input.basis_type = "lcao"; - param.input.ks_solver = "test"; - testing::internal::CaptureStdout(); - EXPECT_EXIT(it->second.check_value(it->second, param), ::testing::ExitedWithCode(1), ""); - output = testing::internal::GetCapturedStdout(); - EXPECT_THAT(output, testing::HasSubstr("NOTICE")); } { // uramping auto it = find_label("uramping", readinput.input_lists); diff --git a/source/module_parameter/input_parameter.h b/source/module_parameter/input_parameter.h index d421e86f2b..fe86fbfefc 100644 --- a/source/module_parameter/input_parameter.h +++ b/source/module_parameter/input_parameter.h @@ -118,6 +118,7 @@ struct Input_para bool scf_os_stop = false; ///< whether to stop scf when oscillation is detected double scf_os_thr = -0.01; ///< drho threshold for oscillation int scf_os_ndim = 0; ///< number of old iterations used for oscillation detection + int sc_os_ndim = 5; ///< number of old iterations used for oscillation detection in Spin-Constrained DFT bool lspinorb = false; ///< consider the spin-orbit interaction bool noncolin = false; ///< using non-collinear-spin diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/INPUT b/tests/integrate/160_PW_DJ_PK_PU_SO/INPUT new file mode 100644 index 0000000000..8e4f45ab0e --- /dev/null +++ b/tests/integrate/160_PW_DJ_PK_PU_SO/INPUT @@ -0,0 +1,46 @@ +INPUT_PARAMETERS +suffix autotest +nbands 40 + +calculation scf +ecutwfc 10 +scf_thr 1.0e-4 +scf_nmax 50 +out_chg 0 + +#init_chg file +#out_dos 1 +#dos_sigma 0.05 +#out_band 1 + +smearing_method gaussian +smearing_sigma 0.01 + +#force_thr_ev 0.01 +#relax_method cg +#relax_bfgs_init 0.5 + +mixing_type pulay +mixing_beta 0.3 +mixing_restart 1e-3 +mixing_dmr 1 +mixing_gg0 1.1 + +ks_solver dav_subspace +pw_diag_ndim 2 +basis_type pw +gamma_only 0 +noncolin 1 +lspinorb 1 +cal_force 1 +cal_stress 1 + +#Parameter DFT+U +dft_plus_u 
1 +orbital_corr 2 +hubbard_u 5.0 +onsite_radius 3.0 +pseudo_dir ../../PP_ORB +orbital_dir ../../PP_ORB + +pw_seed 1 diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/KPT b/tests/integrate/160_PW_DJ_PK_PU_SO/KPT new file mode 100644 index 0000000000..e769af7638 --- /dev/null +++ b/tests/integrate/160_PW_DJ_PK_PU_SO/KPT @@ -0,0 +1,4 @@ +K_POINTS +0 +Gamma +2 1 1 0 0 0 diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/STRU b/tests/integrate/160_PW_DJ_PK_PU_SO/STRU new file mode 100644 index 0000000000..91021e0a69 --- /dev/null +++ b/tests/integrate/160_PW_DJ_PK_PU_SO/STRU @@ -0,0 +1,22 @@ +ATOMIC_SPECIES +Fe 1.000 Fe.upf + +NUMERICAL_ORBITAL +Fe_gga_6au_100Ry_4s2p2d1f.orb + +LATTICE_CONSTANT +8.190 + +LATTICE_VECTORS + 1.00 0.50 0.50 + 0.50 1.00 0.50 + 0.50 0.50 1.00 +ATOMIC_POSITIONS +Direct + +Fe +0.0 +2 +0.00 0.00 0.00 mag 1.0 1.0 1.0 +0.51 0.51 0.51 mag 1.0 1.0 1.0 + diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/jd b/tests/integrate/160_PW_DJ_PK_PU_SO/jd new file mode 100644 index 0000000000..a93b3b217e --- /dev/null +++ b/tests/integrate/160_PW_DJ_PK_PU_SO/jd @@ -0,0 +1 @@ +DFTU + NSPIN4, Fe2, multi-k case diff --git a/tests/integrate/160_PW_DJ_PK_PU_SO/result.ref b/tests/integrate/160_PW_DJ_PK_PU_SO/result.ref new file mode 100644 index 0000000000..e6b1657fb7 --- /dev/null +++ b/tests/integrate/160_PW_DJ_PK_PU_SO/result.ref @@ -0,0 +1,5 @@ +etotref -5662.3908859903258417 +etotperatomref -2831.1954429952 +totalforceref 17.965510 +totalstressref 100582.607209 +totaltimeref 1.26 diff --git a/tests/integrate/CASES_CPU.txt b/tests/integrate/CASES_CPU.txt index 05d4ae689a..8277d51eab 100644 --- a/tests/integrate/CASES_CPU.txt +++ b/tests/integrate/CASES_CPU.txt @@ -105,6 +105,7 @@ 140_PW_15_SO_average 140_PW_15_SO_wfcinit 150_PW_15_CR_VDW3 +160_PW_DJ_PK_PU_SO 170_PW_MD_1O 170_PW_MD_2O 180_PW_SDFT_10S_M diff --git a/tests/integrate/CASES_GPU.txt b/tests/integrate/CASES_GPU.txt index 00de10ccb6..3490ccc9a0 100644 --- a/tests/integrate/CASES_GPU.txt +++ 
b/tests/integrate/CASES_GPU.txt @@ -22,4 +22,4 @@ 934_NO_Si2_tzdp_neq_GPU 934_NO_Si2_tzdp_neq_ns2_GPU 934_NO_Si2_tzdp_ns2_GPU -935_NO_Si2_tzdp_ns2_k_GPU \ No newline at end of file +935_NO_Si2_tzdp_ns2_k_GPU