-
Notifications
You must be signed in to change notification settings - Fork 88
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge numerical LU factorization kernels
This adds numerical LU factorization kernels to initialize the factors from the matrix and run the factorization. It uses the Csr lookup structures to implement the required sparse vector additions, and a sync-free waiting approach similar to our custom triangular solvers. Related PR: #1058
- Loading branch information
Showing
28 changed files
with
517,101 additions
and
42 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
/*******************************<GINKGO LICENSE>****************************** | ||
Copyright (c) 2017-2022, the Ginkgo authors | ||
All rights reserved. | ||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions | ||
are met: | ||
1. Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
2. Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
3. Neither the name of the copyright holder nor the names of its | ||
contributors may be used to endorse or promote products derived from | ||
this software without specific prior written permission. | ||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS | ||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | ||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | ||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
******************************<GINKGO LICENSE>*******************************/ | ||
|
||
template <typename ValueType, typename IndexType> | ||
__device__ __forceinline__ | ||
std::enable_if_t<!is_complex_s<ValueType>::value, ValueType> | ||
load(const ValueType* values, IndexType index) | ||
{ | ||
const volatile ValueType* val = values + index; | ||
return *val; | ||
} | ||
|
||
template <typename ValueType, typename IndexType> | ||
__device__ __forceinline__ std::enable_if_t< | ||
std::is_floating_point<ValueType>::value, thrust::complex<ValueType>> | ||
load(const thrust::complex<ValueType>* values, IndexType index) | ||
{ | ||
auto real = reinterpret_cast<const ValueType*>(values); | ||
auto imag = real + 1; | ||
return {load(real, 2 * index), load(imag, 2 * index)}; | ||
} | ||
|
||
template <typename ValueType, typename IndexType> | ||
__device__ __forceinline__ | ||
std::enable_if_t<!is_complex_s<ValueType>::value, void> | ||
store(ValueType* values, IndexType index, ValueType value) | ||
{ | ||
volatile ValueType* val = values + index; | ||
*val = value; | ||
} | ||
|
||
template <typename ValueType, typename IndexType> | ||
__device__ __forceinline__ void store(thrust::complex<ValueType>* values, | ||
IndexType index, | ||
thrust::complex<ValueType> value) | ||
{ | ||
auto real = reinterpret_cast<ValueType*>(values); | ||
auto imag = real + 1; | ||
store(real, 2 * index, value.real()); | ||
store(imag, 2 * index, value.imag()); | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,206 @@ | ||
/*******************************<GINKGO LICENSE>****************************** | ||
Copyright (c) 2017-2022, the Ginkgo authors | ||
All rights reserved. | ||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions | ||
are met: | ||
1. Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
2. Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
3. Neither the name of the copyright holder nor the names of its | ||
contributors may be used to endorse or promote products derived from | ||
this software without specific prior written permission. | ||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS | ||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | ||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | ||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
******************************<GINKGO LICENSE>*******************************/ | ||
|
||
namespace kernel { | ||
|
||
|
||
template <typename ValueType, typename IndexType> | ||
__global__ __launch_bounds__(default_block_size) void initialize( | ||
const IndexType* __restrict__ mtx_row_ptrs, | ||
const IndexType* __restrict__ mtx_cols, | ||
const ValueType* __restrict__ mtx_vals, | ||
const IndexType* __restrict__ factor_row_ptrs, | ||
const IndexType* __restrict__ factor_cols, | ||
const IndexType* __restrict__ factor_storage_offsets, | ||
const int32* __restrict__ factor_storage, | ||
const int64* __restrict__ factor_row_descs, | ||
ValueType* __restrict__ factor_vals, IndexType* __restrict__ diag_idxs, | ||
size_type num_rows) | ||
{ | ||
const auto row = thread::get_subwarp_id_flat<config::warp_size>(); | ||
if (row >= num_rows) { | ||
return; | ||
} | ||
const auto warp = | ||
group::tiled_partition<config::warp_size>(group::this_thread_block()); | ||
// first zero out this row of the factor | ||
const auto factor_begin = factor_row_ptrs[row]; | ||
const auto factor_end = factor_row_ptrs[row + 1]; | ||
const auto lane = static_cast<int>(warp.thread_rank()); | ||
for (auto nz = factor_begin + lane; nz < factor_end; | ||
nz += config::warp_size) { | ||
factor_vals[nz] = zero<ValueType>(); | ||
} | ||
warp.sync(); | ||
// then fill in the values from mtx | ||
gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{ | ||
factor_row_ptrs, factor_cols, factor_storage_offsets, | ||
factor_storage, factor_row_descs, row}; | ||
const auto row_begin = mtx_row_ptrs[row]; | ||
const auto row_end = mtx_row_ptrs[row + 1]; | ||
for (auto nz = row_begin + lane; nz < row_end; nz += config::warp_size) { | ||
const auto col = mtx_cols[nz]; | ||
const auto val = mtx_vals[nz]; | ||
factor_vals[lookup.lookup_unsafe(col) + factor_begin] = val; | ||
} | ||
if (lane == 0) { | ||
diag_idxs[row] = lookup.lookup_unsafe(row) + factor_begin; | ||
} | ||
} | ||
|
||
|
||
template <typename ValueType, typename IndexType> | ||
__global__ __launch_bounds__(default_block_size) void factorize( | ||
const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols, | ||
const IndexType* __restrict__ storage_offsets, | ||
const int32* __restrict__ storage, const int64* __restrict__ row_descs, | ||
const IndexType* __restrict__ diag_idxs, ValueType* __restrict__ vals, | ||
int* __restrict__ ready, int* __restrict__ block_counter, | ||
size_type num_rows) | ||
{ | ||
__shared__ int threadblock_offset; | ||
__shared__ int sh_ready[default_block_size / config::warp_size]; | ||
if (threadIdx.x == 0) { | ||
threadblock_offset = atomic_add(block_counter, 1) * default_block_size; | ||
} | ||
if (threadIdx.x < default_block_size / config::warp_size) { | ||
sh_ready[threadIdx.x] = 0; | ||
} | ||
__syncthreads(); | ||
const auto row = (threadblock_offset + static_cast<int>(threadIdx.x)) / | ||
config::warp_size; | ||
if (row >= num_rows) { | ||
return; | ||
} | ||
const auto block = threadblock_offset / default_block_size; | ||
const auto warp = | ||
group::tiled_partition<config::warp_size>(group::this_thread_block()); | ||
const auto lane = warp.thread_rank(); | ||
const auto row_begin = row_ptrs[row]; | ||
const auto row_diag = diag_idxs[row]; | ||
const auto row_end = row_ptrs[row + 1]; | ||
const auto row_local = row % (default_block_size / config::warp_size); | ||
gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{ | ||
row_ptrs, cols, storage_offsets, storage, row_descs, row}; | ||
// for each lower triangular entry: eliminate with corresponding row | ||
for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) { | ||
const auto dep = cols[lower_nz]; | ||
const auto dep_block = dep / (default_block_size / config::warp_size); | ||
const auto dep_local = dep % (default_block_size / config::warp_size); | ||
auto val = vals[lower_nz]; | ||
const auto diag_idx = diag_idxs[dep]; | ||
const auto dep_end = row_ptrs[dep + 1]; | ||
// assert(dep < row); | ||
if (dep_block == block) { | ||
// wait for a local dependency | ||
while (!load(sh_ready, dep_local)) { | ||
__threadfence(); | ||
} | ||
} else { | ||
// wait for a global dependency | ||
while (!load(ready, dep)) { | ||
__threadfence(); | ||
} | ||
} | ||
__threadfence(); | ||
const auto diag = vals[diag_idx]; | ||
const auto scale = val / diag; | ||
if (lane == 0) { | ||
vals[lower_nz] = scale; | ||
} | ||
// subtract all entries past the diagonal | ||
for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end; | ||
upper_nz += config::warp_size) { | ||
const auto upper_col = cols[upper_nz]; | ||
const auto upper_val = vals[upper_nz]; | ||
const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin; | ||
vals[output_pos] -= scale * upper_val; | ||
} | ||
} | ||
__threadfence(); | ||
// notify local warps | ||
store(sh_ready, row_local, 1); | ||
// notify other blocks | ||
store(ready, row, 1); | ||
} | ||
|
||
|
||
} // namespace kernel | ||
|
||
|
||
template <typename ValueType, typename IndexType> | ||
void initialize(std::shared_ptr<const DefaultExecutor> exec, | ||
const matrix::Csr<ValueType, IndexType>* mtx, | ||
const IndexType* factor_lookup_offsets, | ||
const int64* factor_lookup_descs, | ||
const int32* factor_lookup_storage, IndexType* diag_idxs, | ||
matrix::Csr<ValueType, IndexType>* factors) | ||
{ | ||
const auto num_rows = mtx->get_size()[0]; | ||
if (num_rows > 0) { | ||
const auto num_blocks = | ||
ceildiv(num_rows, default_block_size / config::warp_size); | ||
kernel::initialize<<<num_blocks, default_block_size>>>( | ||
mtx->get_const_row_ptrs(), mtx->get_const_col_idxs(), | ||
as_device_type(mtx->get_const_values()), | ||
factors->get_const_row_ptrs(), factors->get_const_col_idxs(), | ||
factor_lookup_offsets, factor_lookup_storage, factor_lookup_descs, | ||
as_device_type(factors->get_values()), diag_idxs, num_rows); | ||
} | ||
} | ||
|
||
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE); | ||
|
||
|
||
template <typename ValueType, typename IndexType> | ||
void factorize(std::shared_ptr<const DefaultExecutor> exec, | ||
const IndexType* lookup_offsets, const int64* lookup_descs, | ||
const int32* lookup_storage, const IndexType* diag_idxs, | ||
matrix::Csr<ValueType, IndexType>* factors, | ||
array<int>& tmp_storage) | ||
{ | ||
const auto num_rows = factors->get_size()[0]; | ||
if (num_rows > 0) { | ||
tmp_storage.resize_and_reset(num_rows + 1); | ||
components::fill_array(exec, tmp_storage.get_data(), num_rows + 1, | ||
int{}); | ||
const auto num_blocks = | ||
ceildiv(num_rows, default_block_size / config::warp_size); | ||
kernel::factorize<<<num_blocks, default_block_size>>>( | ||
factors->get_const_row_ptrs(), factors->get_const_col_idxs(), | ||
lookup_offsets, lookup_storage, lookup_descs, diag_idxs, | ||
as_device_type(factors->get_values()), tmp_storage.get_data(), | ||
tmp_storage.get_data() + num_rows, num_rows); | ||
} | ||
} | ||
|
||
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
/*******************************<GINKGO LICENSE>****************************** | ||
Copyright (c) 2017-2022, the Ginkgo authors | ||
All rights reserved. | ||
Redistribution and use in source and binary forms, with or without | ||
modification, are permitted provided that the following conditions | ||
are met: | ||
1. Redistributions of source code must retain the above copyright | ||
notice, this list of conditions and the following disclaimer. | ||
2. Redistributions in binary form must reproduce the above copyright | ||
notice, this list of conditions and the following disclaimer in the | ||
documentation and/or other materials provided with the distribution. | ||
3. Neither the name of the copyright holder nor the names of its | ||
contributors may be used to endorse or promote products derived from | ||
this software without specific prior written permission. | ||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS | ||
IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED | ||
TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | ||
PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | ||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | ||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | ||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | ||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | ||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | ||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | ||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | ||
******************************<GINKGO LICENSE>*******************************/ | ||
|
||
#ifndef GKO_CORE_FACTORIZATION_LU_KERNELS_HPP_ | ||
#define GKO_CORE_FACTORIZATION_LU_KERNELS_HPP_ | ||
|
||
|
||
#include <memory> | ||
|
||
|
||
#include <ginkgo/core/base/executor.hpp> | ||
#include <ginkgo/core/base/types.hpp> | ||
#include <ginkgo/core/matrix/csr.hpp> | ||
|
||
|
||
#include "core/base/kernel_declaration.hpp" | ||
|
||
|
||
namespace gko { | ||
namespace kernels { | ||
|
||
|
||
#define GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType) \ | ||
void initialize(std::shared_ptr<const DefaultExecutor> exec, \ | ||
const matrix::Csr<ValueType, IndexType>* mtx, \ | ||
const IndexType* factor_lookup_offsets, \ | ||
const int64* factor_lookup_descs, \ | ||
const int32* factor_lookup_storage, IndexType* diag_idxs, \ | ||
matrix::Csr<ValueType, IndexType>* factors) | ||
|
||
|
||
#define GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType) \ | ||
void factorize( \ | ||
std::shared_ptr<const DefaultExecutor> exec, \ | ||
const IndexType* lookup_storage_offsets, const int64* lookup_descs, \ | ||
const int32* lookup_storage, const IndexType* diag_idxs, \ | ||
matrix::Csr<ValueType, IndexType>* factors, array<int>& tmp_storage) | ||
|
||
|
||
#define GKO_DECLARE_ALL_AS_TEMPLATES \ | ||
template <typename ValueType, typename IndexType> \ | ||
GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType); \ | ||
template <typename ValueType, typename IndexType> \ | ||
GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType) | ||
|
||
|
||
GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(lu_factorization, | ||
GKO_DECLARE_ALL_AS_TEMPLATES); | ||
|
||
|
||
#undef GKO_DECLARE_ALL_AS_TEMPLATES | ||
|
||
|
||
} // namespace kernels | ||
} // namespace gko | ||
|
||
|
||
#endif // GKO_CORE_FACTORIZATION_LU_KERNELS_HPP_ |
Oops, something went wrong.