-
Notifications
You must be signed in to change notification settings - Fork 12
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[DO NOT MERGE] Parallel execution #259
base: master
Are you sure you want to change the base?
Changes from all commits
472ea51
14164f6
97a8116
20fa0ba
54cac5f
5e51beb
1a1c7e9
45fbc57
2555e2e
417baa8
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -89,6 +89,8 @@ def __init__(self): | |
# list of pre/post ID arrays that are not needed in device memory | ||
self.delete_synaptic_pre = {} | ||
self.delete_synaptic_post = {} | ||
# dictionary to store parallelization information | ||
self.stream_info = {} | ||
# The following nested dictionary collects all codeobjects that use random | ||
# number generation (RNG). | ||
self.codeobjects_with_rng = { | ||
|
@@ -359,6 +361,7 @@ def code_object(self, owner, name, abstract_code, variables, template_name, | |
template_kwds["sm_multiplier"] = prefs["devices.cuda_standalone.SM_multiplier"] | ||
template_kwds["syn_launch_bounds"] = prefs["devices.cuda_standalone.syn_launch_bounds"] | ||
template_kwds["calc_occupancy"] = prefs["devices.cuda_standalone.calc_occupancy"] | ||
template_kwds["stream_info"] = self.stream_info | ||
if template_name in ["threshold", "spikegenerator"]: | ||
template_kwds["extra_threshold_kernel"] = prefs["devices.cuda_standalone.extra_threshold_kernel"] | ||
codeobj = super(CUDAStandaloneDevice, self).code_object(owner, name, abstract_code, variables, | ||
|
@@ -374,7 +377,7 @@ def check_openmp_compatible(self, nb_threads): | |
if nb_threads > 0: | ||
raise NotImplementedError("Using OpenMP in a CUDA standalone project is not supported") | ||
|
||
def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks): | ||
def generate_objects_source(self, writer, arange_arrays, synapses, static_array_specs, networks, stream_info): | ||
sm_multiplier = prefs.devices.cuda_standalone.SM_multiplier | ||
num_parallel_blocks = prefs.devices.cuda_standalone.parallel_blocks | ||
curand_generator_type = prefs.devices.cuda_standalone.random_number_generator_type | ||
|
@@ -393,6 +396,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_ | |
for syn in synapses: | ||
if syn.multisynaptic_index is not None: | ||
multisyn_vars.append(syn.variables[syn.multisynaptic_index]) | ||
# get number of unique streams | ||
|
||
num_stream = max(Counter(stream_info).values()) | ||
arr_tmp = self.code_object_class().templater.objects( | ||
None, None, | ||
array_specs=self.arrays, | ||
|
@@ -415,7 +421,9 @@ def generate_objects_source(self, writer, arange_arrays, synapses, static_array_ | |
eventspace_arrays=self.eventspace_arrays, | ||
spikegenerator_eventspaces=self.spikegenerator_eventspaces, | ||
multisynaptic_idx_vars=multisyn_vars, | ||
profiled_codeobjects=self.profiled_codeobjects) | ||
profiled_codeobjects=self.profiled_codeobjects, | ||
parallelize=True, | ||
stream_size=num_stream) | ||
# Reinsert deleted entries, in case we use self.arrays later? maybe unnecessary... | ||
self.arrays.update(self.eventspace_arrays) | ||
writer.write('objects.*', arr_tmp) | ||
|
@@ -445,7 +453,8 @@ def generate_main_source(self, writer): | |
# For codeobjects run every tick, this happens in the init() of | ||
# the random number buffer called at first clock cycle of the network | ||
main_lines.append('random_number_buffer.ensure_enough_curand_states();') | ||
main_lines.append(f'_run_{codeobj.name}();') | ||
# add stream - default | ||
main_lines.append(f'_run_{codeobj.name}(0);') | ||
elif func == 'after_run_code_object': | ||
codeobj, = args | ||
main_lines.append(f'_after_run_{codeobj.name}();') | ||
|
@@ -986,10 +995,14 @@ def generate_network_source(self, writer): | |
maximum_run_time = self._maximum_run_time | ||
if maximum_run_time is not None: | ||
maximum_run_time = float(maximum_run_time) | ||
num_stream = max(Counter(self.stream_info).values()) | ||
network_tmp = self.code_object_class().templater.network(None, None, | ||
maximum_run_time=maximum_run_time, | ||
eventspace_arrays=self.eventspace_arrays, | ||
spikegenerator_eventspaces=self.spikegenerator_eventspaces) | ||
spikegenerator_eventspaces=self.spikegenerator_eventspaces, | ||
parallelize = True, | ||
stream_info = self.stream_info, | ||
num_stream= num_stream) | ||
writer.write('network.*', network_tmp) | ||
|
||
def generate_synapses_classes_source(self, writer): | ||
|
@@ -1310,7 +1323,7 @@ def build(self, directory='output', | |
|
||
self.generate_objects_source(self.writer, self.arange_arrays, | ||
net_synapses, self.static_array_specs, | ||
self.networks) | ||
self.networks, self.stream_info) | ||
self.generate_network_source(self.writer) | ||
self.generate_synapses_classes_source(self.writer) | ||
self.generate_run_source(self.writer) | ||
|
@@ -1382,6 +1395,25 @@ def network_run(self, net, duration, report=None, report_period=10*second, | |
self.clocks.update(net._clocks) | ||
net.t_ = float(t_end) | ||
|
||
|
||
# Create dictionary for parallelisation with stream | ||
streams_organization = defaultdict(list) | ||
for obj in net.sorted_objects: | ||
streams_organization[(obj.when, obj.order)].append(obj) | ||
|
||
# associate each code object with a particular stream | ||
streams_details = defaultdict(list) | ||
count = 1 | ||
for key in streams_organization: | ||
for object in streams_organization[key]: | ||
streams_details[object.name] = count | ||
count +=1 | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. As discussed, let's make the default 0. Or do we even need a default? Can't we just pass |
||
|
||
self.stream_info = streams_details | ||
self.stream_info['default'] = 0 | ||
|
||
|
||
|
||
# TODO: remove this horrible hack | ||
for clock in self.clocks: | ||
if clock.name=='clock': | ||
|
@@ -1516,11 +1548,21 @@ def network_run(self, net, duration, report=None, report_period=10*second, | |
|
||
# create all random numbers needed for the next clock cycle | ||
for clock in net._clocks: | ||
run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer);') | ||
run_lines.append(f'{net.name}.add(&{clock.name}, _run_random_number_buffer, {self.stream_info["default"]});') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The random number buffer is a special case. It is not generated from [missing code reference]. For context: The random number buffer has a fixed size of memory on the GPU (which can be controlled via preference). It generates random numbers from the host, knowing how many random numbers the kernels will require. The kernels then use this data for multiple time steps (where the Each random number generation call should generate enough random numbers to occupy the entire GPU. So no need for concurrent kernel execution here at all. |
||
|
||
all_clocks = set() | ||
# TODO add for every code object -> add where in the list are there. | ||
# TODO create new dic (code object, position in list) | ||
for clock, codeobj in code_objects: | ||
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name});') | ||
# add this position as additional number here | ||
# check if codeobj.name has _codeobject in it | ||
name = codeobj.name | ||
if "_codeobject" in codeobj.name: | ||
name = codeobj.name[:-11] | ||
if name in self.stream_info.keys(): | ||
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info[name]});') | ||
else: | ||
run_lines.append(f'{net.name}.add(&{clock.name}, _run_{codeobj.name}, {self.stream_info["default"]});') | ||
all_clocks.add(clock) | ||
|
||
# Under some rare circumstances (e.g. a NeuronGroup only defining a | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should become a preference later on. Just putting it here as TODO, not to forget.