From 529672f687d82d1d9f62a950daeef0753137adbc Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Tue, 22 Oct 2024 10:44:26 -0400 Subject: [PATCH 1/4] add flop count functions --- src/green/gpu/gw_gpu_kernel.h | 11 ++++++++--- src/gw_gpu_kernel.cpp | 26 ++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 2627a6a..7d51096 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -80,13 +80,18 @@ namespace green::gpu { virtual void gw_innerloop(G_type& g, St_type& sigma_tau) = 0; void GW_check_devices_free_space(); - /* - * Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair + /** + * \brief Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair */ void read_next(const std::array& k); + /** + * \brief count and floating points operations per second achieved on GPU. + * This is not representative of the GPU capabilities, but instead, accounts for read/write overheads. + * The value is entirely in the context Green-MBPT solver. 
+ */ + void flops_achieved(MPI_Comm comm); - protected: double _beta; size_t _nts; size_t _nts_b; diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 0908335..c2ec122 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -99,6 +99,7 @@ namespace green::gpu { gw_innerloop(g, sigma_tau); } MPI_Barrier(utils::context.global); + if (_devices_comm != MPI_COMM_NULL) flops_achieved(_devices_comm); sigma_tau.fence(); if (!utils::context.node_rank) { if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce"); @@ -119,6 +120,29 @@ namespace green::gpu { MPI_Op_free(&matrix_sum_op); } + void gw_gpu_kernel::flops_achieved(MPI_Comm comm) { + double gpu_time, flops; + utils::event_t& cugw_event = statistics.event("Solve cuGW"); + if (!cugw_event.active) { + gpu_time = cugw_event.duration; + } else { + gpu_time = 0.; + cugw_event.active = false; + } + + if (!utils::context.global_rank) { + MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, comm); + + flops = _flop_count / gpu_time; + + if (!utils::context.global_rank && _verbose > 1) { + std::cout << "################### GPU FLOPs achieved ####################" << std::endl; + std::cout << "FLOPs achieved: " << flops / 1.0e9 << " Giga flops." 
<< std::endl; + std::cout << "###########################################################" << std::endl; + } + } + } + void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) { if (!_sp) { compute_gw_selfenergy(g, sigma_tau); @@ -303,9 +327,11 @@ namespace green::gpu { }; ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso); + statistics.start("Solve cuGW"); cugw.solve(_nts, psuedo_ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(), _Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, true, _verbose, irre_pos, mom_cons, r1, r2); + statistics.end(); // Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso) // Copy back to Sigma_tskij_local_host MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau.win()); From 79b6401a19a95250efdc2eb5bdd66e42a6ddb220 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 23 Oct 2024 01:29:53 -0400 Subject: [PATCH 2/4] clean up --- src/gw_gpu_kernel.cpp | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index c2ec122..fa9de41 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -99,8 +99,9 @@ namespace green::gpu { gw_innerloop(g, sigma_tau); } MPI_Barrier(utils::context.global); - if (_devices_comm != MPI_COMM_NULL) flops_achieved(_devices_comm); sigma_tau.fence(); + // Print effective FLOPs achieved in the calculation + flops_achieved(_devices_comm); if (!utils::context.node_rank) { if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce"); utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm); @@ -121,25 +122,30 @@ namespace green::gpu { } void gw_gpu_kernel::flops_achieved(MPI_Comm comm) { - double gpu_time, flops; - utils::event_t& cugw_event = statistics.event("Solve cuGW"); - if (!cugw_event.active) { - gpu_time = 
cugw_event.duration; - } else { - gpu_time = 0.; - cugw_event.active = false; + double gpu_time=0., flops=0.; + if (comm != MPI_COMM_NULL) { + utils::event_t& cugw_event = statistics.event("Solve cuGW"); + if (!cugw_event.active) { + gpu_time = cugw_event.duration; + } else { + gpu_time = 0.; + cugw_event.active = false; + throw std::runtime_error("'Solve cuGW' event not found"); + } } if (!utils::context.global_rank) { - MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, comm); + MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + } else { + MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); + } - flops = _flop_count / gpu_time; + flops = _flop_count / gpu_time; - if (!utils::context.global_rank && _verbose > 1) { - std::cout << "################### GPU FLOPs achieved ####################" << std::endl; - std::cout << "FLOPs achieved: " << flops / 1.0e9 << " Giga flops." << std::endl; - std::cout << "###########################################################" << std::endl; - } + if (!utils::context.global_rank && _verbose > 1) { + std::cout << "################### GPU FLOPs achieved ####################" << std::endl; + std::cout << "FLOPs achieved: " << flops / 1.0e9 << " Giga flops." 
<< std::endl; + std::cout << "###########################################################" << std::endl; } } From 48b96be06e964c34152c6030aba3a1379c18d545 Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 23 Oct 2024 10:01:18 -0400 Subject: [PATCH 3/4] separate calculation and printing of flops --- src/green/gpu/gw_gpu_kernel.h | 9 ++++++++- src/gw_gpu_kernel.cpp | 16 +++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/green/gpu/gw_gpu_kernel.h b/src/green/gpu/gw_gpu_kernel.h index 7d51096..c2cba49 100644 --- a/src/green/gpu/gw_gpu_kernel.h +++ b/src/green/gpu/gw_gpu_kernel.h @@ -86,12 +86,18 @@ namespace green::gpu { void read_next(const std::array& k); /** - * \brief count and floating points operations per second achieved on GPU. + * \brief calculate effective floating-point operations per second reached on GPU. * This is not representative of the GPU capabilities, but instead, accounts for read/write overheads. * The value is entirely in the context Green-MBPT solver. */ void flops_achieved(MPI_Comm comm); + /** + * \brief print the effective FLOPs achieved for the iteration. 
+ * + */ + void print_effective_flops(); + double _beta; size_t _nts; size_t _nts_b; @@ -110,6 +116,7 @@ namespace green::gpu { int _nqkpt{}; double _flop_count{}; + double _eff_flops{}; LinearSolverType _cuda_lin_solver; }; diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index fa9de41..7c7a9af 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -112,6 +112,7 @@ namespace green::gpu { MPI_Barrier(utils::context.global); statistics.end(); statistics.print(utils::context.global); + print_effective_flops(); clean_MPI_structure(); clean_shared_Coulomb(); @@ -122,7 +123,7 @@ namespace green::gpu { } void gw_gpu_kernel::flops_achieved(MPI_Comm comm) { - double gpu_time=0., flops=0.; + double gpu_time=0.; if (comm != MPI_COMM_NULL) { utils::event_t& cugw_event = statistics.event("Solve cuGW"); if (!cugw_event.active) { @@ -140,12 +141,17 @@ namespace green::gpu { MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global); } - flops = _flop_count / gpu_time; + _eff_flops = _flop_count / gpu_time; + } + void gw_gpu_kernel::print_effective_flops() { if (!utils::context.global_rank && _verbose > 1) { - std::cout << "################### GPU FLOPs achieved ####################" << std::endl; - std::cout << "FLOPs achieved: " << flops / 1.0e9 << " Giga flops." << std::endl; - std::cout << "###########################################################" << std::endl; + auto old_precision = std::cout.precision(); + std::cout << std::setprecision(6); + std::cout << "=================== GPU Performance ====================" << std::endl; + std::cout << "Effective FLOPs in the GW iteration: " << _eff_flops / 1.0e9 << " Giga flops." 
<< std::endl; + std::cout << "============================================================" << std::endl; + std::cout << std::setprecision(old_precision); } } From 977a924436ce084d9c2e35087c52b389c462702e Mon Sep 17 00:00:00 2001 From: Gaurav Harsha Date: Wed, 23 Oct 2024 14:19:04 -0400 Subject: [PATCH 4/4] update error handling in flops_achieved function --- src/gw_gpu_kernel.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/gw_gpu_kernel.cpp b/src/gw_gpu_kernel.cpp index 7c7a9af..4fe3b56 100644 --- a/src/gw_gpu_kernel.cpp +++ b/src/gw_gpu_kernel.cpp @@ -129,9 +129,7 @@ namespace green::gpu { if (!cugw_event.active) { gpu_time = cugw_event.duration; } else { - gpu_time = 0.; - cugw_event.active = false; - throw std::runtime_error("'Solve cuGW' event not found"); + throw std::runtime_error("'Solve cuGW' still active, but it should not be."); } }