
Commit

Merge pull request #6 from Green-Phys/flop
Effective FLOPS Definition and GPU Performance Insights
iskakoff authored Oct 31, 2024
2 parents 56ad7a9 + 977a924 commit 92ea2d5
Showing 2 changed files with 51 additions and 3 deletions.
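For context, the effective FLOPS introduced by this commit is defined as the number of floating-point operations counted for the GW iteration divided by the GPU solve time accumulated over all ranks, so read/write overheads lower the reported rate. A minimal sketch of that arithmetic, not part of the commit, with hypothetical inputs flop_count and per-rank gpu_times:

// Sketch only: effective FLOP rate as counted work over summed per-rank GPU time.
#include <numeric>
#include <vector>

double effective_flops(double flop_count, const std::vector<double>& gpu_times) {
    // Total "Solve cuGW" time across ranks, in seconds.
    const double total_time = std::accumulate(gpu_times.begin(), gpu_times.end(), 0.0);
    return flop_count / total_time;   // FLOP/s; divide by 1.0e9 for GFLOP/s
}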
18 changes: 15 additions & 3 deletions src/green/gpu/gw_gpu_kernel.h
@@ -80,13 +80,24 @@ namespace green::gpu {
virtual void gw_innerloop(G_type& g, St_type& sigma_tau) = 0;
void GW_check_devices_free_space();

/*
* Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair
/**
* \brief Read a chunk of Coulomb integral with given (k[0], k[3]) k-pair
*/
void read_next(const std::array<size_t, 4>& k);

/**
* \brief Calculate the effective floating-point operations per second achieved on the GPU.
* This rate is not representative of peak GPU capability; it also accounts for read/write overheads.
* The value is meaningful only in the context of the Green-MBPT solver.
*/
void flops_achieved(MPI_Comm comm);

/**
* \brief Print the effective FLOP rate achieved in the iteration.
*/
void print_effective_flops();

protected:
double _beta;
size_t _nts;
size_t _nts_b;
@@ -105,6 +116,7 @@
int _nqkpt{};

double _flop_count{};
double _eff_flops{};
LinearSolverType _cuda_lin_solver;
};

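The metric declared above is an end-to-end rate: counted operations divided by measured wall time, so memory traffic and transfer overheads reduce it relative to the hardware peak. A tiny standalone CPU illustration of that accounting (hypothetical example, not taken from this commit):

// Time a kernel-like loop and report its effective GFLOP/s as counted FLOPs / wall time.
#include <chrono>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::size_t n = 1 << 24;
    std::vector<double> x(n, 1.0), y(n, 2.0);
    double acc = 0.0;
    const auto t0 = std::chrono::steady_clock::now();
    for (std::size_t i = 0; i < n; ++i) acc += x[i] * y[i];   // 2 FLOPs per element
    const auto t1 = std::chrono::steady_clock::now();
    const double seconds    = std::chrono::duration<double>(t1 - t0).count();
    const double flop_count = 2.0 * static_cast<double>(n);
    std::printf("effective rate: %.3f GFLOP/s (acc=%g)\n", flop_count / seconds / 1.0e9, acc);
    return 0;
}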
36 changes: 36 additions & 0 deletions src/gw_gpu_kernel.cpp
@@ -100,6 +100,8 @@ namespace green::gpu {
}
MPI_Barrier(utils::context.global);
sigma_tau.fence();
// Compute the effective FLOP rate achieved in the calculation
flops_achieved(_devices_comm);
if (!utils::context.node_rank) {
if (_devices_comm != MPI_COMM_NULL) statistics.start("selfenergy_reduce");
utils::allreduce(MPI_IN_PLACE, sigma_tau.object().data(), sigma_tau.object().size()/(_nso*_nso), dt_matrix, matrix_sum_op, utils::context.internode_comm);
@@ -110,6 +112,7 @@
MPI_Barrier(utils::context.global);
statistics.end();
statistics.print(utils::context.global);
print_effective_flops();

clean_MPI_structure();
clean_shared_Coulomb();
@@ -119,6 +122,37 @@
MPI_Op_free(&matrix_sum_op);
}

void gw_gpu_kernel::flops_achieved(MPI_Comm comm) {
double gpu_time=0.;
if (comm != MPI_COMM_NULL) {
utils::event_t& cugw_event = statistics.event("Solve cuGW");
if (!cugw_event.active) {
gpu_time = cugw_event.duration;
} else {
throw std::runtime_error("'Solve cuGW' still active, but it should not be.");
}
}

if (!utils::context.global_rank) {
MPI_Reduce(MPI_IN_PLACE, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global);
} else {
MPI_Reduce(&gpu_time, &gpu_time, 1, MPI_DOUBLE, MPI_SUM, 0, utils::context.global);
}

_eff_flops = _flop_count / gpu_time;
}

void gw_gpu_kernel::print_effective_flops() {
if (!utils::context.global_rank && _verbose > 1) {
auto old_precision = std::cout.precision();
std::cout << std::setprecision(6);
std::cout << "=================== GPU Performance ====================" << std::endl;
std::cout << "Effective FLOPs in the GW iteration: " << _eff_flops / 1.0e9 << " Giga flops." << std::endl;
std::cout << "============================================================" << std::endl;
std::cout << std::setprecision(old_precision);
}
}

void scalar_gw_gpu_kernel::gw_innerloop(G_type& g, St_type& sigma_tau) {
if (!_sp) {
compute_gw_selfenergy<double>(g, sigma_tau);
@@ -303,9 +337,11 @@ namespace green::gpu {
};

ztensor<5> Sigma_tskij_host_local(_nts, 1, _ink, _nso, _nso);
statistics.start("Solve cuGW");
cugw.solve(_nts, psuedo_ns, _nk, _ink, _nao, _bz_utils.symmetry().reduced_to_full(), _bz_utils.symmetry().full_to_reduced(),
_Vk1k2_Qij, Sigma_tskij_host_local, _devices_rank, _devices_size, true, _verbose,
irre_pos, mom_cons, r1, r2);
statistics.end();
// Convert Sigma_tskij_host_local to (_nts, 1, _ink, _nso, _nso)
// Copy back to Sigma_tskij_local_host
MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 0, 0, sigma_tau.win());
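The reduction inside flops_achieved() uses the root-in-place pattern: rank 0 passes MPI_IN_PLACE as the send buffer so its own timing is folded into the sum, while every other rank only contributes its value. A minimal standalone illustration of that pattern (assumes an MPI installation; not taken from this commit):

// Root-in-place MPI_Reduce: rank 0 accumulates all per-rank timings into its own buffer.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    double local_time = 1.0 + rank;   // stand-in for a per-rank GPU timing
    if (rank == 0) {
        // Root reuses its buffer as both input and output.
        MPI_Reduce(MPI_IN_PLACE, &local_time, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
        std::printf("summed GPU time: %f s\n", local_time);
    } else {
        // Non-root ranks only send; the receive buffer is ignored.
        MPI_Reduce(&local_time, nullptr, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}

Compiled with mpicxx and launched with mpirun, rank 0 prints the timings summed over all ranks.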
