ORNL · mkstoyanov · Feb 22, 2024 · Feb 16, 2024 · Feb 16, 2024 · Feb 20, 2024
diff --git a/SparseGrids/tsgEnumerates.hpp b/SparseGrids/tsgEnumerates.hpp
@@ -54,6 +54,9 @@
 #include <type_traits>
 #include <cassert>
 #include <limits>
+#include <array>
+
+#include <chrono>
 
 #include "TasmanianConfig.hpp" // contains build options passed down from CMake
 #include "tsgUtils.hpp" // contains small utilities

diff --git a/SparseGrids/tsgGridLocalPolynomial.cpp b/SparseGrids/tsgGridLocalPolynomial.cpp
@@ -734,11 +734,93 @@ void GridLocalPolynomial::evaluateHierarchicalFunctions(const double x[], int nu
 void GridLocalPolynomial::recomputeSurpluses(){
     surpluses = Data2D<double>(num_outputs, points.getNumIndexes(), std::vector<double>(values.begin(), values.end()));
 
-    Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get());
+    // There are two available algorithms here:
+    // - global sparse Kronecker (kron), implemented here in recomputeSurpluses()
+    // - sparse matrix in matrix-free form (mat), implemented in updateSurpluses()
+    //
+    // (kron) has the additional restriction that it will only work when the hierarchy is complete
+    // i.e., all point (multi-indexes) have all of their parents
+    // this is guaranteed for a full-tensor grid, non-adaptive grid, or a grid adapted with the stable refinement strategy
+    // other refinement strategies or using dynamic refinement (even with a stable refinement strategy)
+    // may yield a hierarchy-complete grid, but this is not mathematically guaranteed
+    //
+    // computing the dagUp allows us to check (at runtime) if the grid is_complete, and exclude (kron) if incomplete
+    //    the completeness check can be done on the fly but it does incur cost
+    //
+    // approximate computational cost, let n be the number of points and d be the number of dimensions
+    // (kron) d n n^(1/d) due to standard Kronecker reasons, except it is hard to find the "effective" n due to sparsity
+    // (mat) d^2 n log(n) since there are d log(n) ancestors and basis functions are product of d one dimensional functions
+    //    naturally, there are constants and those also depend on the system, e.g., number of threads, thread scheduling, cache ...
+    //    (mat) has a more even load per thread and parallelizes better
+    //
+    // for d = 1, the algorithms are the same, except (kron) explicitly forms the matrix and the matrix free (mat) version is much better
+    // for d = 2, the (mat) algorithm is still faster
+    // for d = 3, and sufficiently large size the (mat) algorithm still wins
+    //            the breaking point depends on n, the order and system
+    // for d = 4 and above, the (kron) algorithm is much faster
+    //           it is possible that for sufficiently large n (mat) will win again
+    //           but tested on 12 cores CPUs (Intel and AMD) up to n = 3.5 million, the (kron) method is over 2x faster
+    // higher dimensions will favor (kron) even more
+
+    if (num_dimensions <= 2 or (num_dimensions == 3 and points.getNumIndexes() > 2000000)) {
+        Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get());
+        std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
+        updateSurpluses(points, top_level, level, dagUp);
+        return;
+    }
+
+    bool is_complete = true;
+    Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get(), is_complete);
+
+    if (not is_complete) {
+        // incomplete hierarchy, must use the slow algorithm
+        std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
+        updateSurpluses(points, top_level, level, dagUp);
+        return;
+    }
+
+    int num_nodes = 1 + *std::max_element(points.begin(), points.end());
+
+    std::vector<int> vpntr, vindx;
+    std::vector<double> vvals;
+    rule->van_matrix(num_nodes, vpntr, vindx, vvals);
 
-    std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
+    std::vector<std::vector<int>> map;
+    std::vector<std::vector<int>> lines1d;
+    MultiIndexManipulations::resortIndexes(points, map, lines1d);
 
-    updateSurpluses(points, top_level, level, dagUp);
+    for(int d=num_dimensions-1; d>=0; d--) {
+        #pragma omp parallel for schedule(dynamic)
+        for(int job = 0; job < static_cast<int>(lines1d[d].size() - 1); job++) {
+            for(int i=lines1d[d][job]+1; i<lines1d[d][job+1]; i++) {
+                double *row_strip = surpluses.getStrip(map[d][i]);
+
+                int row = points.getIndex(map[d][i])[d];
+                int im  = vpntr[row];
+                int ijx = lines1d[d][job];
+                int ix  = points.getIndex(map[d][ijx])[d];
+
+                while(vindx[im] < row or ix < row) {
+                    if (vindx[im] < ix) {
+                        ++im; // move the index of the matrix pattern (missing entry)
+                    } else if (ix < vindx[im]) {
+                        // entry not connected, move to the next one
+                        ++ijx;
+                        ix = points.getIndex(map[d][ijx])[d];
+                    } else {
+                        double const *col_strip = surpluses.getStrip(map[d][ijx]);
+                        double const v = vvals[im];
+                        for(int k=0; k<num_outputs; k++)
+                            row_strip[k] -= v * col_strip[k];
+
+                        ++im;
+                        ++ijx;
+                        ix = points.getIndex(map[d][ijx])[d];
+                    }
+                }
+            }
+        }
+    }
 }
 
 void GridLocalPolynomial::updateSurpluses(MultiIndexSet const &work, int max_level, std::vector<int> const &level, Data2D<int> const &dagUp){

diff --git a/SparseGrids/tsgGridLocalPolynomial.hpp b/SparseGrids/tsgGridLocalPolynomial.hpp
@@ -142,6 +142,7 @@ class GridLocalPolynomial : public BaseCanonicalGrid{
     //! \brief Looks for a batch of constructed points and processes all that will result in a connected graph.
     void loadConstructedPoints();
 
+    //! \brief Fast algorithm, uses global Kronecker algorithm to recompute all surpluses
     void recomputeSurpluses();
 
     /*!
@@ -155,6 +156,9 @@ class GridLocalPolynomial : public BaseCanonicalGrid{
      * - \b dagUp must have been computed using \b MultiIndexManipulations::computeDAGup(\b work, \b rule, \b dagUp)
      *
      * Note: adjusting the \b level vector allows to update the surpluses for only a portion of the graph.
+     *
+     * Note: see the comments inside recomputeSurpluses() for the performance comparison between different algorithms
+     *       also note that this method can be used to partially update, i.e., update the surpluses for some of the indexes
      */
     void updateSurpluses(MultiIndexSet const &work, int max_level, std::vector<int> const &level, Data2D<int> const &dagUp);
 

diff --git a/SparseGrids/tsgHierarchyManipulator.cpp b/SparseGrids/tsgHierarchyManipulator.cpp
@@ -94,6 +94,89 @@ Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomia
         return parents;
     }
 }
+Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule, bool &is_complete){
+    size_t num_dimensions = mset.getNumDimensions();
+    int num_points = mset.getNumIndexes();
+    if (rule->getMaxNumParents() > 1){ // allow for multiple parents and level 0 may have more than one node
+        int max_parents = rule->getMaxNumParents() * (int) num_dimensions;
+        Data2D<int> parents(max_parents, num_points, -1);
+        int level0_offset = rule->getNumPoints(0);
+        int any_fail = 0; // count if there are failures
+        #pragma omp parallel
+        {
+            int fail = 0; // local thread count fails
+
+            #pragma omp for schedule(static)
+            for(int i=0; i<num_points; i++){
+                const int *p = mset.getIndex(i);
+                std::vector<int> dad(num_dimensions);
+                std::copy_n(p, num_dimensions, dad.data());
+                int *pp = parents.getStrip(i);
+                for(size_t j=0; j<num_dimensions; j++){
+                    if (dad[j] >= level0_offset){
+                        int current = p[j];
+                        dad[j] = rule->getParent(current);
+                        pp[2*j] = mset.getSlot(dad);
+                        if (pp[2*j] == -1)
+                            fail = 1;
+                        while ((dad[j] >= level0_offset) && (pp[2*j] == -1)){
+                            current = dad[j];
+                            dad[j] = rule->getParent(current);
+                            pp[2*j] = mset.getSlot(dad);
+                        }
+                        dad[j] = rule->getStepParent(current);
+                        if (dad[j] != -1){
+                            pp[2*j + 1] = mset.getSlot(dad);
+                            if (pp[2*j + 1] == -1)
+                                fail = 1;
+                        }
+                        dad[j] = p[j];
+                    }
+                }
+            }
+
+            #pragma omp atomic
+            any_fail += fail;
+        }
+        is_complete = (any_fail == 0);
+        return parents;
+    }else{ // this assumes that level zero has only one node
+        Data2D<int> parents((int) num_dimensions, num_points);
+        int any_fail = 0; // count if there are failures
+        #pragma omp parallel
+        {
+            int fail = 0; // local thread count fails
+
+            #pragma omp for schedule(static)
+            for(int i=0; i<num_points; i++){
+                const int *p = mset.getIndex(i);
+                std::vector<int> dad(num_dimensions);
+                std::copy_n(p, num_dimensions, dad.data());
+                int *pp = parents.getStrip(i);
+                for(size_t j=0; j<num_dimensions; j++){
+                    if (dad[j] == 0){
+                        pp[j] = -1;
+                    }else{
+                        dad[j] = rule->getParent(dad[j]);
+                        pp[j] = mset.getSlot(dad.data());
+                        if (pp[j] == -1)
+                            fail = 1;
+                        while((dad[j] != 0) && (pp[j] == -1)){
+                            dad[j] = rule->getParent(dad[j]);
+                            pp[j] = mset.getSlot(dad);
+                        }
+                        dad[j] = p[j];
+                    }
+                }
+            }
+
+            #pragma omp atomic
+            any_fail += fail;
+        }
+        is_complete = (any_fail == 0);
+        return parents;
+    }
+}
 
 Data2D<int> computeDAGDown(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule){
     size_t num_dimensions = mset.getNumDimensions();

diff --git a/SparseGrids/tsgHierarchyManipulator.hpp b/SparseGrids/tsgHierarchyManipulator.hpp
@@ -85,6 +85,17 @@ namespace HierarchyManipulations{
  * \endinternal
  */
 Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule);
+/*!
+ * \internal
+ * \ingroup TasmanianHierarchyManipulations
+ * \brief Variant that also check if all points have all parents
+ *
+ * This is merged together so it will do only one pass over the data.
+ *
+ * On exit, \b is_complete will indicate whether there are points with missing parents.
+ * \endinternal
+ */
+Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule, bool &is_complete);
 
 /*!
  * \internal