Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fast localp coefficients recompute #755

Merged
merged 3 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions SparseGrids/tsgEnumerates.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@
#include <type_traits>
#include <cassert>
#include <limits>
#include <array>

#include <chrono>

#include "TasmanianConfig.hpp" // contains build options passed down from CMake
#include "tsgUtils.hpp" // contains small utilities
Expand Down
88 changes: 85 additions & 3 deletions SparseGrids/tsgGridLocalPolynomial.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -734,11 +734,93 @@ void GridLocalPolynomial::evaluateHierarchicalFunctions(const double x[], int nu
void GridLocalPolynomial::recomputeSurpluses(){
surpluses = Data2D<double>(num_outputs, points.getNumIndexes(), std::vector<double>(values.begin(), values.end()));

Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get());
// There are two available algorithms here:
// - global sparse Kronecker (kron), implemented here in recomputeSurpluses()
// - sparse matrix in matrix-free form (mat), implemented in updateSurpluses()
//
// (kron) has the additional restriction that it will only work when the hierarchy is complete
// i.e., all point (multi-indexes) have all of their parents
// this is guaranteed for a full-tensor grid, non-adaptive grid, or a grid adapted with the stable refinement strategy
// other refinement strategies or using dynamic refinement (even with a stable refinement strategy)
// may yield a hierarchy-complete grid, but this is not mathematically guaranteed
//
// computing the dagUp allows us to check (at runtime) if the grid is_complete, and exclude (kron) if incomplete
// the completeness check can be done on the fly but it does incur cost
//
// approximate computational cost, let n be the number of points and d be the number of dimensions
// (kron) d n n^(1/d) due to standard Kronecker reasons, except it is hard to find the "effective" n due to sparsity
// (mat) d^2 n log(n) since there are d log(n) ancestors and basis functions are product of d one dimensional functions
// naturally, there are constants and those also depend on the system, e.g., number of threads, thread scheduling, cache ...
// (mat) has a more even load per thread and parallelizes better
//
// for d = 1, the algorithms are the same, except (kron) explicitly forms the matrix and the matrix free (mat) version is much better
// for d = 2, the (mat) algorithm is still faster
// for d = 3, and sufficiently large size the (mat) algorithm still wins
// the breaking point depends on n, the order and system
// for d = 4 and above, the (kron) algorithm is much faster
// it is possible that for sufficiently large n (mat) will win again
// but tested on 12 cores CPUs (Intel and AMD) up to n = 3.5 million, the (kron) method is over 2x faster
// higher dimensions will favor (kron) even more

if (num_dimensions <= 2 or (num_dimensions == 3 and points.getNumIndexes() > 2000000)) {
Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get());
std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
updateSurpluses(points, top_level, level, dagUp);
return;
}

bool is_complete = true;
Data2D<int> dagUp = HierarchyManipulations::computeDAGup(points, rule.get(), is_complete);

if (not is_complete) {
// incomplete hierarchy, must use the slow algorithm
std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
updateSurpluses(points, top_level, level, dagUp);
return;
}

int num_nodes = 1 + *std::max_element(points.begin(), points.end());

std::vector<int> vpntr, vindx;
std::vector<double> vvals;
rule->van_matrix(num_nodes, vpntr, vindx, vvals);

std::vector<int> level = HierarchyManipulations::computeLevels(points, rule.get());
std::vector<std::vector<int>> map;
std::vector<std::vector<int>> lines1d;
MultiIndexManipulations::resortIndexes(points, map, lines1d);

updateSurpluses(points, top_level, level, dagUp);
for(int d=num_dimensions-1; d>=0; d--) {
#pragma omp parallel for schedule(dynamic)
for(int job = 0; job < static_cast<int>(lines1d[d].size() - 1); job++) {
for(int i=lines1d[d][job]+1; i<lines1d[d][job+1]; i++) {
double *row_strip = surpluses.getStrip(map[d][i]);

int row = points.getIndex(map[d][i])[d];
int im = vpntr[row];
int ijx = lines1d[d][job];
int ix = points.getIndex(map[d][ijx])[d];

while(vindx[im] < row or ix < row) {
if (vindx[im] < ix) {
++im; // move the index of the matrix pattern (missing entry)
} else if (ix < vindx[im]) {
// entry not connected, move to the next one
++ijx;
ix = points.getIndex(map[d][ijx])[d];
} else {
double const *col_strip = surpluses.getStrip(map[d][ijx]);
double const v = vvals[im];
for(int k=0; k<num_outputs; k++)
row_strip[k] -= v * col_strip[k];

++im;
++ijx;
ix = points.getIndex(map[d][ijx])[d];
}
}
}
}
}
}

void GridLocalPolynomial::updateSurpluses(MultiIndexSet const &work, int max_level, std::vector<int> const &level, Data2D<int> const &dagUp){
Expand Down
4 changes: 4 additions & 0 deletions SparseGrids/tsgGridLocalPolynomial.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ class GridLocalPolynomial : public BaseCanonicalGrid{
//! \brief Looks for a batch of constructed points and processes all that will result in a connected graph.
void loadConstructedPoints();

//! \brief Fast algorithm, uses global Kronecker algorithm to recompute all surpluses
void recomputeSurpluses();

/*!
Expand All @@ -155,6 +156,9 @@ class GridLocalPolynomial : public BaseCanonicalGrid{
* - \b dagUp must have been computed using \b MultiIndexManipulations::computeDAGup(\b work, \b rule, \b dagUp)
*
* Note: adjusting the \b level vector allows to update the surpluses for only a portion of the graph.
*
* Note: see the comments inside recomputeSurpluses() for the performance comparison between different algorithms
* also note that this method can be used to partially update, i.e., update the surpluses for some of the indexes
*/
void updateSurpluses(MultiIndexSet const &work, int max_level, std::vector<int> const &level, Data2D<int> const &dagUp);

Expand Down
83 changes: 83 additions & 0 deletions SparseGrids/tsgHierarchyManipulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,89 @@ Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomia
return parents;
}
}
Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule, bool &is_complete){
size_t num_dimensions = mset.getNumDimensions();
int num_points = mset.getNumIndexes();
if (rule->getMaxNumParents() > 1){ // allow for multiple parents and level 0 may have more than one node
int max_parents = rule->getMaxNumParents() * (int) num_dimensions;
Data2D<int> parents(max_parents, num_points, -1);
int level0_offset = rule->getNumPoints(0);
int any_fail = 0; // count if there are failures
#pragma omp parallel
{
int fail = 0; // local thread count fails

#pragma omp for schedule(static)
for(int i=0; i<num_points; i++){
const int *p = mset.getIndex(i);
std::vector<int> dad(num_dimensions);
std::copy_n(p, num_dimensions, dad.data());
int *pp = parents.getStrip(i);
for(size_t j=0; j<num_dimensions; j++){
if (dad[j] >= level0_offset){
int current = p[j];
dad[j] = rule->getParent(current);
pp[2*j] = mset.getSlot(dad);
if (pp[2*j] == -1)
fail = 1;
while ((dad[j] >= level0_offset) && (pp[2*j] == -1)){
current = dad[j];
dad[j] = rule->getParent(current);
pp[2*j] = mset.getSlot(dad);
}
dad[j] = rule->getStepParent(current);
if (dad[j] != -1){
pp[2*j + 1] = mset.getSlot(dad);
if (pp[2*j + 1] == -1)
fail = 1;
}
dad[j] = p[j];
}
}
}

#pragma omp atomic
any_fail += fail;
}
is_complete = (any_fail == 0);
return parents;
}else{ // this assumes that level zero has only one node
Data2D<int> parents((int) num_dimensions, num_points);
int any_fail = 0; // count if there are failures
#pragma omp parallel
{
int fail = 0; // local thread count fails

#pragma omp for schedule(static)
for(int i=0; i<num_points; i++){
const int *p = mset.getIndex(i);
std::vector<int> dad(num_dimensions);
std::copy_n(p, num_dimensions, dad.data());
int *pp = parents.getStrip(i);
for(size_t j=0; j<num_dimensions; j++){
if (dad[j] == 0){
pp[j] = -1;
}else{
dad[j] = rule->getParent(dad[j]);
pp[j] = mset.getSlot(dad.data());
if (pp[j] == -1)
fail = 1;
while((dad[j] != 0) && (pp[j] == -1)){
dad[j] = rule->getParent(dad[j]);
pp[j] = mset.getSlot(dad);
}
dad[j] = p[j];
}
}
}

#pragma omp atomic
any_fail += fail;
}
is_complete = (any_fail == 0);
return parents;
}
}

Data2D<int> computeDAGDown(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule){
size_t num_dimensions = mset.getNumDimensions();
Expand Down
11 changes: 11 additions & 0 deletions SparseGrids/tsgHierarchyManipulator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,17 @@ namespace HierarchyManipulations{
* \endinternal
*/
Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule);
/*!
* \internal
* \ingroup TasmanianHierarchyManipulations
* \brief Variant that also check if all points have all parents
*
* This is merged together so it will do only one pass over the data.
*
* On exit, \b is_complete will indicate whether there are points with missing parents.
* \endinternal
*/
Data2D<int> computeDAGup(MultiIndexSet const &mset, const BaseRuleLocalPolynomial *rule, bool &is_complete);

/*!
* \internal
Expand Down
Loading
Loading