better implementation of the van matrix

ORNL · Feb 22, 2024 · 234aea4 · 234aea4
1 parent de3b275
commit 234aea4
Show file tree

Hide file tree

Showing 5 changed files with 319 additions and 53 deletions.
diff --git a/SparseGrids/tsgEnumerates.hpp b/SparseGrids/tsgEnumerates.hpp
@@ -54,6 +54,9 @@
 #include <type_traits>
 #include <cassert>
 #include <limits>
+#include <array>
+
+#include <chrono>
 
 #include "TasmanianConfig.hpp" // contains build options passed down from CMake
 #include "tsgUtils.hpp" // contains small utilities

diff --git a/SparseGrids/tsgGridLocalPolynomial.cpp b/SparseGrids/tsgGridLocalPolynomial.cpp
@@ -731,38 +731,44 @@ void GridLocalPolynomial::evaluateHierarchicalFunctions(const double x[], int nu
     }
 }
 
-void GridLocalPolynomial::build_van_matrix1d(std::vector<int> &vpntr, std::vector<int> &vindx, std::vector<double> &vvals) {
-    vindx.clear();
-    vvals.clear();
-
-    //int max_level = *std::max_element(points.begin(), points.end());
-
-    // up to top-level
-    //int num_nodes = rule->getNumPoints(max_level);
-    int num_nodes = 1 + *std::max_element(points.begin(), points.end());
-    vpntr.resize(num_nodes + 1);
-
-    // TODO: switch to a graph-based algorithm
-    for(int i=0; i<num_nodes; i++) {
-        vpntr[i] = static_cast<int>(vindx.size());
-        double x = rule->getNode(i);
-        for(int j=0; j<i; j++) {
-            bool supported;
-            double v = rule->evalSupport(j, x, supported);
-            if (supported) {
-                vindx.push_back(j);
-                vvals.push_back(v);
-            }
-        }
-        vindx.push_back(i);
-        vvals.push_back(1.0);
-    }
-    vpntr.back() = static_cast<int>(vindx.size());
-}
-
 void GridLocalPolynomial::recomputeSurpluses(){
     surpluses = Data2D<double>(num_outputs, points.getNumIndexes(), std::vector<double>(values.begin(), values.end()));
 
+    // There are two available algorithms here:
+    // - global sparse Kronecker (kron), implemented here in recomputeSurpluses()
+    // - sparse matrix in matrix-free form (mat), implemented in updateSurpluses()
+    //
+    // (kron) has the additional restriction that it will only work when the hierarchy is complete
+    // i.e., all point (multi-indexes) have all of their parents
+    // this is guaranteed for a full-tensor grid, non-adaptive grid, or a grid adapted with the stable refinement strategy
+    // other refinement strategies or using dynamic refinement (even with a stable refinement strategy)
+    // may yield a hierarchy-complete grid, but this is not mathematically guaranteed
+    //
+    // computing the dagUp allows us to check (at runtime) if the grid is_complete, and exclude (kron) if incomplete
+    //    the completeness check can be done on the fly but it does incur cost
+    //
+    // approximate computational cost, let n be the number of points and d be the number of dimensions
+    // (kron) d n n^(1/d) due to standard Kronecker reasons, except it is hard to find the "effective" n due to sparsity
+    // (mat) d^2 n log(n) since there are d log(n) ancestors and basis functions are product of d one dimensional functions
+    //    naturally, there are constants and those also depend on the system, e.g., number of threads, thread scheduling, cache ...
+    //    (mat) has a more even load per thread and parallelizes better
+    //
+    // for d = 1, the algorithms are the same, except (kron) explicitly forms the matrix and the matrix free (mat) version is much better
+    // for d = 2, the (mat) algorithm is still faster
+    // for d = 3, and sufficiently large size the (mat) algorithm still wins
+    //            the breaking point depends on n, the order and system
+    // for d = 4 and above, the (kron) algorithm is much faster
+    //           it is possible that for sufficiently large n (mat) will win again
+    //           but tested on 12 cores CPUs (Intel and AMD) up to n = 3.5 million, the (kron) method is over 2x faster
+    // higher dimensions will favor (kron) even more
+
+    if (num_dimensions <= 2 or (num_dimensions == 3 and points.getNumIndexes() > 2000000)) {
+        Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get());
+        std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
+        updateSurpluses(points, top_level, level, dagUp);
+        return;
+    }
+
     bool is_complete = true;
     Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get(), is_complete);
 
@@ -773,16 +779,18 @@ void GridLocalPolynomial::recomputeSurpluses(){
         return;
     }
 
+    int num_nodes = 1 + *std::max_element(points.begin(), points.end());
+
     std::vector<int> vpntr, vindx;
     std::vector<double> vvals;
-    build_van_matrix1d(vpntr, vindx, vvals);
+    rule->van_matrix(num_nodes, vpntr, vindx, vvals);
 
     std::vector<std::vector<int>> map;
     std::vector<std::vector<int>> lines1d;
     MultiIndexManipulations::resortIndexes(points, map, lines1d);
 
     for(int d=num_dimensions-1; d>=0; d--) {
-        #pragma omp parallel for
+        #pragma omp parallel for schedule(dynamic)
         for(int job = 0; job < static_cast<int>(lines1d[d].size() - 1); job++) {
             for(int i=lines1d[d][job]+1; i<lines1d[d][job+1]; i++) {
                 double *row_strip = surpluses.getStrip(map[d][i]);

diff --git a/SparseGrids/tsgGridLocalPolynomial.hpp b/SparseGrids/tsgGridLocalPolynomial.hpp
@@ -142,9 +142,6 @@ class GridLocalPolynomial : public BaseCanonicalGrid{
     //! \brief Looks for a batch of constructed points and processes all that will result in a connected graph.
     void loadConstructedPoints();
 
-    //! \brief Construct a sparse matrix for the 1d basis
-    void build_van_matrix1d(std::vector<int> &pntr, std::vector<int> &indx, std::vector<double> &vals);
-
     //! \brief Fast algorithm, uses global Kronecker algorithm to recompute all surpluses
     void recomputeSurpluses();
 
@@ -160,7 +157,8 @@ class GridLocalPolynomial : public BaseCanonicalGrid{
      *
      * Note: adjusting the \b level vector allows to update the surpluses for only a portion of the graph.
      *
-     * Note: uses a slow algorithm but good for updating only some of the indexes
+     * Note: see the comments inside recomputeSurpluses() for the performance comparison between different algorithms
+     *       also note that this method can be used to partially update, i.e., update the surpluses for some of the indexes
      */
     void updateSurpluses(MultiIndexSet const &work, int max_level, std::vector<int> const &level, Data2D<int> const &dagUp);
 

diff --git a/SparseGrids/tsgHierarchyManipulator.cpp b/SparseGrids/tsgHierarchyManipulator.cpp
@@ -178,24 +178,6 @@ Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomia
     }
 }
 
-bool checkComplete(MultiIndexSet const &mset, Data2D<int> const &dagUp, const BaseRuleLocalPolynomial *rule){
-    int const num_no_parents = rule->getNumPoints(0);
-    int const num_dimensions = mset.getNumDimensions();
-    int const max_parents    = rule->getMaxNumParents();
-
-    for(int i=0; i<mset.getNumIndexes(); i++) {
-        for(int d=0; d<num_dimensions; d++) {
-            if (mset.getIndex(i)[d] >= num_no_parents) { // expect a parent
-                for(int j=0; j<max_parents; j++)
-                    if (dagUp.getStrip(i)[d * max_parents + j] == -1)
-                        return false;
-            }
-        }
-    }
-
-    return true;
-}
-
 Data2D<int> computeDAGDown(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule){
     size_t num_dimensions = mset.getNumDimensions();
     int max_1d_kids = rule->getMaxNumKids();