Skip to content

Optimisation Guide

kaanolgu edited this page Aug 3, 2023 · 1 revision

Attempts for Optimising the Code

This page accompanies the paper (DOI to follow) and showcases the code optimisations that were tried and their effect on the performance metrics. All test results were obtained with the RMAT-19-32 dataset using 4 compute units.

Device Pointers

         // Old code: both bounds are read through the same pointer
         // (usm_nodes_start), at indices idx and idx + 1.
         unsigned int nodes_start = usm_nodes_start[idx];
        unsigned int nodes_end = usm_nodes_start[idx +1 ];

        // New code with split device pointers: two separate device_ptr wrappers
        // are built over the same array, the second one shifted by one element,
        // so that both bounds are read at the same index idx through different
        // pointers.
        // NOTE(review): the new version also adds `offset`, which the old
        // snippet does not use — presumably a per-compute-unit base index; confirm.
        device_ptr<unsigned int> DevicePtr_start(usm_nodes_start+offset);  
        device_ptr<unsigned int> DevicePtr_end(usm_nodes_start + 1+offset);

        unsigned int nodes_start = DevicePtr_start[idx];
        unsigned int nodes_end = DevicePtr_end[idx];

Events as vectors

Vector version – execution time: 0.10 s; single (separate events) version – execution time: 0.08 s

// Vector code
std::vector<event> e_vec;
e_vec.push_back(e1);
e_vec.push_back(e2);
e_vec.push_back(e3);

auto e =q.single_task<class Task>( e_vec, [=](){
...

// Separate events
 auto e =q.single_task<class Task>( {e1,e2,e3}, [=]() {
 ...

Single Task vs NDRange vs Parallel_for

Throughput: parallel_for > single_task > NDRange

  // Single-task variant: one kernel invocation loops over all no_of_nodes
  // entries and resets every non-zero flag in usm_updating_mask to 0.
  q.single_task<class SingleTask>(  [=]() [[intel::kernel_args_restrict]] {
           // Ask the compiler to unroll the loop below by a factor of 16.
           #pragma unroll 16
           for(int tid =0; tid < no_of_nodes; tid++){
             // Read the flag first; the store only happens when it is set.
             unsigned int condition = usm_updating_mask[tid];
             if(condition){
               usm_updating_mask[tid]=0;  
            }
           }
         });
  // Range parallel_for variant: one work-item per node. Each work-item reads
  // its own flag in usm_updating_mask and resets it to 0 if it is set.
  // The kernel lambda has to accept the work-item index; `tid` is that index
  // (the original snippet declared the lambda with no parameter, leaving
  // `tid` undeclared).
  q.parallel_for<class ParallelFor>(no_of_nodes, [=](id<1> tid) [[intel::kernel_args_restrict]] {
             // Read the flag first; the store only happens when it is set.
             unsigned int condition = usm_updating_mask[tid];
             if(condition){
               usm_updating_mask[tid]=0;
             }
         });

  // ND-range variant: work-items are launched in work-groups of BLOCK_SIZE.
  int BLOCK_SIZE = 512;
  // Round the global size up to the next multiple of BLOCK_SIZE so that it is
  // evenly divisible by the work-group size.
  int global_work_size = (no_of_nodes + BLOCK_SIZE - 1) / BLOCK_SIZE * BLOCK_SIZE;
  range<1> gws (global_work_size);
  range<1> lws (BLOCK_SIZE);
  q.parallel_for<class NDRange>(nd_range<1>(gws, lws), [=] (nd_item<1> item) [[intel::kernel_args_restrict]]  {
           int gid = item.get_global_id();
           // Because of the round-up above, trailing work-items can have
           // gid >= no_of_nodes; they must not touch the mask.
           if(gid<no_of_nodes){
             // Index with gid: it is the only index declared in this kernel
             // (the original snippet used an undeclared `tid` here).
             unsigned int condition = usm_updating_mask[gid];
             if(condition){
               usm_updating_mask[gid]=0;
             }
           }
         });