mirror of https://github.com/OPM/opm-simulators.git
synced 2025-02-25 18:55:30 -06:00
Add an OPM implementation of ILU0
Improve file structure in cuistl, run clang-format
This commit is contained in:
parent ad595fed5e
commit 7a30aaa46e
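
Reviewer note, not part of the original commit message: judging from the factory registrations in this diff, the new GPU ILU0 preconditioner is exposed under the key "OPMCUILU0" and reads two boolean options, "split_matrix" and "tune_gpu_kernels", both defaulting to true. Below is a minimal sketch of how the class added by this commit might be constructed directly; the helper name makeOpmCuILU0 and the Matrix/Vector template parameters are illustrative placeholders and are not names introduced by the diff.

```cpp
// Sketch only, based on the constructor signature and the factory creators in this commit.
#include <memory>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>

template <class Matrix, class Vector>
auto makeOpmCuILU0(const Matrix& matrix, bool splitMatrix = true, bool tuneGpuKernels = true)
{
    using field_type = typename Vector::field_type;
    using GpuILU0 = Opm::cuistl::OpmCuILU0<Matrix,
                                           Opm::cuistl::CuVector<field_type>,
                                           Opm::cuistl::CuVector<field_type>>;
    // Construct the GPU ILU0 from the CPU-side Dune matrix, as the factory does with op.getmat().
    auto ilu0 = std::make_shared<GpuILU0>(matrix, splitMatrix, tuneGpuKernels);
    // Adapt it so it can be applied to CPU-side Dune vectors, mirroring the creators below.
    return std::make_shared<Opm::cuistl::PreconditionerAdapter<Vector, Vector, GpuILU0>>(ilu0);
}
```

The factory lambdas added below do essentially the same thing; the MPI path additionally wraps the adapter in a CuBlockPreconditioner.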
@@ -205,16 +205,21 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/cusparse_matrix_operations.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/CuSparseHandle.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuBuffer.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.cu)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuVector.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuView.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg detail/vector_operations.cu)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSparseMatrix.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuDILU.cpp)
+  ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg OpmCuILU0.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuJac.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg CuSeqILU0.cpp)
   ADD_CUDA_OR_HIP_FILE(MAIN_SOURCE_FILES opm/simulators/linalg set_device.cpp)

   # HEADERS
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/coloringAndReorderingUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cuda_safe_call.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_matrix_operations.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/cusparse_safe_call.hpp)
@@ -223,7 +228,11 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuBlasHandle.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/CuSparseHandle.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuBuffer.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/DILUKernels.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/ILU0Kernels.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditionerKernels/JacKernels.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuDILU.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg OpmCuILU0.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuJac.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuVector.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuView.hpp)
@@ -238,6 +247,8 @@ if (HAVE_CUDA)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/vector_operations.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/has_function.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/preconditioner_should_call_post_pre.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/deviceBlockOperations.hpp)
+  ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/gpuThreadUtils.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg PreconditionerAdapter.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg CuSeqILU0.hpp)
   ADD_CUDA_OR_HIP_FILE(PUBLIC_HEADER_FILES opm/simulators/linalg detail/fix_zero_diagonal.hpp)
@@ -24,6 +24,7 @@
 #if USE_HIP
 #include <opm/simulators/linalg/hipistl/CuBlockPreconditioner.hpp>
 #include <opm/simulators/linalg/hipistl/CuDILU.hpp>
+#include <opm/simulators/linalg/hipistl/OpmCuILU0.hpp>
 #include <opm/simulators/linalg/hipistl/CuJac.hpp>
 #include <opm/simulators/linalg/hipistl/CuSeqILU0.hpp>
 #include <opm/simulators/linalg/hipistl/PreconditionerAdapter.hpp>
@@ -32,6 +33,7 @@
 #else
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
 #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
+#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
 #include <opm/simulators/linalg/cuistl/CuJac.hpp>
 #include <opm/simulators/linalg/cuistl/CuSeqILU0.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
@@ -349,7 +349,6 @@ struct StandardPreconditioners {
         F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
             const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
-            // const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
             using field_type = typename V::field_type;
             using CuDILU = typename cuistl::CuDILU<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
             auto cuDILU = std::make_shared<CuDILU>(op.getmat(), split_matrix, tune_gpu_kernels);
@@ -358,6 +357,18 @@ struct StandardPreconditioners {
             auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
             return wrapped;
         });
+
+        F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+            const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+            using field_type = typename V::field_type;
+            using OpmCuILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
+            auto cuilu0 = std::make_shared<OpmCuILU0>(op.getmat(), split_matrix, tune_gpu_kernels);
+
+            auto adapted = std::make_shared<cuistl::PreconditionerAdapter<V, V, OpmCuILU0>>(cuilu0);
+            auto wrapped = std::make_shared<cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            return wrapped;
+        });
 #endif
     }

@@ -605,6 +616,15 @@ struct StandardPreconditioners<Operator, Dune::Amg::SequentialInformation> {
                 std::make_shared<CUJac>(op.getmat(), w));
         });

+        F::addCreator("OPMCUILU0", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            const bool split_matrix = prm.get<bool>("split_matrix", true);
+            const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
+            using field_type = typename V::field_type;
+            using CUILU0 = typename cuistl::OpmCuILU0<M, cuistl::CuVector<field_type>, cuistl::CuVector<field_type>>;
+
+            return std::make_shared<cuistl::PreconditionerAdapter<V, V, CUILU0>>(std::make_shared<CUILU0>(op.getmat(), split_matrix, tune_gpu_kernels));
+        });
+
         F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
             const bool split_matrix = prm.get<bool>("split_matrix", true);
             const bool tune_gpu_kernels = prm.get<bool>("tune_gpu_kernels", true);
@@ -16,110 +16,23 @@
     You should have received a copy of the GNU General Public License
     along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
+#include <chrono>
+#include <config.h>
 #include <dune/common/fmatrix.hh>
 #include <dune/istl/bcrsmatrix.hh>
 #include <fmt/core.h>
+#include <limits>
 #include <opm/common/ErrorMacros.hpp>
+#include <opm/common/TimingMacros.hpp>
+#include <opm/simulators/linalg/GraphColoring.hpp>
 #include <opm/simulators/linalg/cuistl/CuDILU.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
+#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>
-#include <vector>
-#include <config.h>
-#include <chrono>
-#include <limits>
 #include <tuple>
-
-namespace
-{
-std::vector<int>
-createReorderedToNatural(Opm::SparseTable<size_t>& levelSets)
-{
-    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
-    int globCnt = 0;
-    for (auto row : levelSets) {
-        for (auto col : row) {
-            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
-                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
-            res[globCnt++] = static_cast<int>(col);
-        }
-    }
-    return res;
-}
-
-std::vector<int>
-createNaturalToReordered(Opm::SparseTable<size_t>& levelSets)
-{
-    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
-    int globCnt = 0;
-    for (auto row : levelSets) {
-        for (auto col : row) {
-            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
-                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
-            res[col] = globCnt++;
-        }
-    }
-    return res;
-}
-
-template <class M, class field_type, class GPUM>
-void
-createReorderedMatrix(const M& naturalMatrix,
-                      std::vector<int>& reorderedToNatural,
-                      std::unique_ptr<GPUM>& reorderedGpuMat)
-{
-    M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
-    for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
-        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
-        // For elements in A
-        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
-            dstRowIt.insert(elem.index());
-        }
-    }
-
-    reorderedGpuMat.reset(new auto (GPUM::fromMatrix(reorderedMatrix, true)));
-}
-
-template <class M, class field_type, class GPUM>
-void
-extractLowerAndUpperMatrices(const M& naturalMatrix,
-                             std::vector<int>& reorderedToNatural,
-                             std::unique_ptr<GPUM>& lower,
-                             std::unique_ptr<GPUM>& upper)
-{
-    const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;
-
-    M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
-    M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
-
-    for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
-         lowerIt != reorderedLower.createend();
-         ++lowerIt, ++upperIt) {
-
-        auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];
-
-        for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
-            if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
-                lowerIt.insert(elem.index());
-            } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
-                upperIt.insert(elem.index());
-            }
-        }
-    }
-
-    lower.reset(new auto (GPUM::fromMatrix(reorderedLower, true)));
-    upper.reset(new auto (GPUM::fromMatrix(reorderedUpper, true)));
-    return;
-}
-
-} // NAMESPACE
-
 namespace Opm::cuistl
 {

@@ -127,8 +40,8 @@ template <class M, class X, class Y, int l>
 CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
     : m_cpuMatrix(A)
     , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
-    , m_reorderedToNatural(createReorderedToNatural(m_levelSets))
-    , m_naturalToReordered(createNaturalToReordered(m_levelSets))
+    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
+    , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
     , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
     , m_gpuNaturalToReorder(m_naturalToReordered)
     , m_gpuReorderToNatural(m_reorderedToNatural)
@@ -154,12 +67,12 @@ CuDILU<M, X, Y, l>::CuDILU(const M& A, bool splitMatrix, bool tuneKernels)
                              m_gpuMatrix.nonzeroes(),
                              A.nonzeroes()));
     if (m_splitMatrix) {
-        m_gpuMatrixReorderedDiag.reset(new auto(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N())));
-        extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(
-            m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper);
+        m_gpuMatrixReorderedDiag = std::make_unique<CuVector<field_type>>(blocksize_ * blocksize_ * m_cpuMatrix.N());
+        std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
+            = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
+                                                                                              m_reorderedToNatural);
     } else {
-        createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
-            m_cpuMatrix, m_reorderedToNatural, m_gpuMatrixReordered);
+        m_gpuMatrixReordered = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
+            m_cpuMatrix, m_reorderedToNatural);
     }
     computeDiagAndMoveReorderedData();

@@ -188,7 +101,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            detail::computeLowerSolveLevelSetSplit<field_type, blocksize_>(
+            detail::DILU::solveLowerLevelSetSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -200,7 +113,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 v.data(),
                 m_applyThreadBlockSize);
         } else {
-            detail::computeLowerSolveLevelSet<field_type, blocksize_>(
+            detail::DILU::solveLowerLevelSet<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
@@ -221,7 +134,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
         const int numOfRowsInLevel = m_levelSets[level].size();
         levelStartIdx -= numOfRowsInLevel;
         if (m_splitMatrix) {
-            detail::computeUpperSolveLevelSetSplit<field_type, blocksize_>(
+            detail::DILU::solveUpperLevelSetSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                 m_gpuMatrixReorderedUpper->getRowIndices().data(),
                 m_gpuMatrixReorderedUpper->getColumnIndices().data(),
@@ -232,7 +145,7 @@ CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
                 v.data(),
                 m_applyThreadBlockSize);
         } else {
-            detail::computeUpperSolveLevelSet<field_type, blocksize_>(
+            detail::DILU::solveUpperLevelSet<field_type, blocksize_>(
                 m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
@@ -304,7 +217,7 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
     for (int level = 0; level < m_levelSets.size(); ++level) {
         const int numOfRowsInLevel = m_levelSets[level].size();
         if (m_splitMatrix) {
-            detail::computeDiluDiagonalSplit<field_type, blocksize_>(
+            detail::DILU::computeDiluDiagonalSplit<field_type, blocksize_>(
                 m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                 m_gpuMatrixReorderedLower->getRowIndices().data(),
                 m_gpuMatrixReorderedLower->getColumnIndices().data(),
@@ -319,7 +232,8 @@ CuDILU<M, X, Y, l>::computeDiagAndMoveReorderedData()
                 m_gpuDInv.data(),
                 m_updateThreadBlockSize);
         } else {
-            detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered->getNonZeroValues().data(),
+            detail::DILU::computeDiluDiagonal<field_type, blocksize_>(
+                m_gpuMatrixReordered->getNonZeroValues().data(),
                 m_gpuMatrixReordered->getRowIndices().data(),
                 m_gpuMatrixReordered->getColumnIndices().data(),
                 m_gpuReorderToNatural.data(),
@@ -360,7 +274,8 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
             std::ignore = cudaDeviceSynchronize();
             auto afterUpdate = std::chrono::high_resolution_clock::now();
             if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
+                long long durationInMicroSec
+                    = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
                 if (durationInMicroSec < bestUpdateTime) {
                     bestUpdateTime = durationInMicroSec;
                     bestUpdateBlockSize = thrBlockSize;
@@ -373,7 +288,8 @@ CuDILU<M, X, Y, l>::tuneThreadBlockSizes()
             std::ignore = cudaDeviceSynchronize();
             auto afterApply = std::chrono::high_resolution_clock::now();
             if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
-                long long durationInMicroSec = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
+                long long durationInMicroSec
+                    = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
                 if (durationInMicroSec < bestApplyTime) {
                     bestApplyTime = durationInMicroSec;
                     bestApplyBlockSize = thrBlockSize;
@@ -19,17 +19,11 @@
 #ifndef OPM_CUDILU_HPP
 #define OPM_CUDILU_HPP

-#include <dune/istl/preconditioner.hh>
+#include <memory>
 #include <opm/grid/utility/SparseTable.hpp>
-#include <opm/simulators/linalg/GraphColoring.hpp>
 #include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
 #include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
-#include <optional>
 #include <vector>
-#include <memory>


@@ -16,27 +16,13 @@
     You should have received a copy of the GNU General Public License
     along with OPM.  If not, see <http://www.gnu.org/licenses/>.
 */
-#include <cuda.h>
-#include <cuda_runtime.h>
-#include <cusparse.h>
 #include <dune/common/fmatrix.hh>
-#include <dune/common/fvector.hh>
 #include <dune/istl/bcrsmatrix.hh>
-#include <dune/istl/bvector.hh>
 #include <fmt/core.h>
 #include <opm/common/ErrorMacros.hpp>
 #include <opm/simulators/linalg/cuistl/CuJac.hpp>
 #include <opm/simulators/linalg/cuistl/CuVector.hpp>
-#include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
-#include <opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_constants.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/detail/cusparse_wrapper.hpp>
-#include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
-#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
+#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
 #include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
 #include <opm/simulators/linalg/matrixblock.hh>

@@ -67,11 +53,7 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
                              A.nonzeroes()));

     // Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened
-    detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
-                                                                                m_gpuMatrix.getRowIndices().data(),
-                                                                                m_gpuMatrix.getColumnIndices().data(),
-                                                                                m_gpuMatrix.N(),
-                                                                                m_diagInvFlattened.data());
+    invertDiagonalAndFlatten();
 }

 template <class M, class X, class Y, int l>
@@ -111,7 +93,15 @@ void
 CuJac<M, X, Y, l>::update()
 {
     m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
-    detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
+    invertDiagonalAndFlatten();
+}
+
+template <class M, class X, class Y, int l>
+void
+CuJac<M, X, Y, l>::invertDiagonalAndFlatten()
+{
+    detail::JAC::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(
+        m_gpuMatrix.getNonZeroValues().data(),
         m_gpuMatrix.getRowIndices().data(),
         m_gpuMatrix.getColumnIndices().data(),
         m_gpuMatrix.N(),
@@ -103,6 +103,8 @@ private:
     CuSparseMatrix<field_type> m_gpuMatrix;
     //! \brief the diagonal of cuMatrix inverted, and then flattened to fit in a vector
     CuVector<field_type> m_diagInvFlattened;
+
+    void invertDiagonalAndFlatten();
 };
 } // end namespace Opm::cuistl

opm/simulators/linalg/cuistl/OpmCuILU0.cpp (new file, 318 lines)
@@ -0,0 +1,318 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
*/
#include <chrono>
#include <config.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <limits>
#include <opm/common/ErrorMacros.hpp>
#include <opm/common/TimingMacros.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/OpmCuILU0.hpp>
#include <opm/simulators/linalg/cuistl/detail/coloringAndReorderingUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <tuple>
namespace Opm::cuistl
{

template <class M, class X, class Y, int l>
OpmCuILU0<M, X, Y, l>::OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels)
    : m_cpuMatrix(A)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(detail::createReorderedToNatural(m_levelSets))
    , m_naturalToReordered(detail::createNaturalToReordered(m_levelSets))
    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
    , m_gpuMatrixReorderedLower(nullptr)
    , m_gpuMatrixReorderedUpper(nullptr)
    , m_gpuNaturalToReorder(m_naturalToReordered)
    , m_gpuReorderToNatural(m_reorderedToNatural)
    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
    , m_splitMatrix(splitMatrix)
    , m_tuneThreadBlockSizes(tuneKernels)
{
    // TODO: Should in some way verify that this matrix is symmetric, only do it debug mode?
    // Some sanity check
    OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
                 fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
    OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
                 fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.blockSize(),
                             A[0][0].N()));
    OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
                 fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.dim(),
                             A.N() * A[0][0].N()));
    OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
                 fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}. ",
                             m_gpuMatrix.nonzeroes(),
                             A.nonzeroes()));
    if (m_splitMatrix) {
        m_gpuMatrixReorderedDiag.emplace(CuVector<field_type>(blocksize_ * blocksize_ * m_cpuMatrix.N()));
        std::tie(m_gpuMatrixReorderedLower, m_gpuMatrixReorderedUpper)
            = detail::extractLowerAndUpperMatrices<M, field_type, CuSparseMatrix<field_type>>(m_cpuMatrix,
                                                                                              m_reorderedToNatural);
    } else {
        m_gpuReorderedLU = detail::createReorderedMatrix<M, field_type, CuSparseMatrix<field_type>>(
            m_cpuMatrix, m_reorderedToNatural);
    }
    LUFactorizeAndMoveData();

#ifdef USE_HIP
    if (m_tuneThreadBlockSizes) {
        tuneThreadBlockSizes();
    }
#endif
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
{
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::apply(X& v, const Y& d)
{
    OPM_TIMEBLOCK(prec_apply);
    {
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
            const int numOfRowsInLevel = m_levelSets[level].size();
            if (m_splitMatrix) {
                detail::ILU0::solveLowerLevelSetSplit<field_type, blocksize_>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
                    m_gpuReorderToNatural.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
                    d.data(),
                    v.data(),
                    m_applyThreadBlockSize);
            } else {
                detail::ILU0::solveLowerLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                         m_gpuReorderedLU->getRowIndices().data(),
                                                                         m_gpuReorderedLU->getColumnIndices().data(),
                                                                         m_gpuReorderToNatural.data(),
                                                                         levelStartIdx,
                                                                         numOfRowsInLevel,
                                                                         d.data(),
                                                                         v.data(),
                                                                         m_applyThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }

        levelStartIdx = m_cpuMatrix.N();
        for (int level = m_levelSets.size() - 1; level >= 0; --level) {
            const int numOfRowsInLevel = m_levelSets[level].size();
            levelStartIdx -= numOfRowsInLevel;
            if (m_splitMatrix) {
                detail::ILU0::solveUpperLevelSetSplit<field_type, blocksize_>(
                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                    m_gpuReorderToNatural.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
                    m_gpuMatrixReorderedDiag.value().data(),
                    v.data(),
                    m_applyThreadBlockSize);
            } else {
                detail::ILU0::solveUpperLevelSet<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                         m_gpuReorderedLU->getRowIndices().data(),
                                                                         m_gpuReorderedLU->getColumnIndices().data(),
                                                                         m_gpuReorderToNatural.data(),
                                                                         levelStartIdx,
                                                                         numOfRowsInLevel,
                                                                         v.data(),
                                                                         m_applyThreadBlockSize);
            }
        }
    }
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::post([[maybe_unused]] X& x)
{
}

template <class M, class X, class Y, int l>
Dune::SolverCategory::Category
OpmCuILU0<M, X, Y, l>::category() const
{
    return Dune::SolverCategory::sequential;
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::update()
{
    OPM_TIMEBLOCK(prec_update);
    {
        m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the gpu
        LUFactorizeAndMoveData();
    }
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::LUFactorizeAndMoveData()
{
    OPM_TIMEBLOCK(prec_update);
    {
        if (m_splitMatrix) {
            detail::copyMatDataToReorderedSplit<field_type, blocksize_>(
                m_gpuMatrix.getNonZeroValues().data(),
                m_gpuMatrix.getRowIndices().data(),
                m_gpuMatrix.getColumnIndices().data(),
                m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                m_gpuMatrixReorderedLower->getRowIndices().data(),
                m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                m_gpuMatrixReorderedUpper->getRowIndices().data(),
                m_gpuMatrixReorderedDiag.value().data(),
                m_gpuNaturalToReorder.data(),
                m_gpuMatrixReorderedLower->N(),
                m_updateThreadBlockSize);
        } else {
            detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                                   m_gpuMatrix.getRowIndices().data(),
                                                                   m_gpuReorderedLU->getNonZeroValues().data(),
                                                                   m_gpuReorderedLU->getRowIndices().data(),
                                                                   m_gpuNaturalToReorder.data(),
                                                                   m_gpuReorderedLU->N(),
                                                                   m_updateThreadBlockSize);
        }
        int levelStartIdx = 0;
        for (int level = 0; level < m_levelSets.size(); ++level) {
            const int numOfRowsInLevel = m_levelSets[level].size();

            if (m_splitMatrix) {
                detail::ILU0::LUFactorizationSplit<field_type, blocksize_>(
                    m_gpuMatrixReorderedLower->getNonZeroValues().data(),
                    m_gpuMatrixReorderedLower->getRowIndices().data(),
                    m_gpuMatrixReorderedLower->getColumnIndices().data(),
                    m_gpuMatrixReorderedUpper->getNonZeroValues().data(),
                    m_gpuMatrixReorderedUpper->getRowIndices().data(),
                    m_gpuMatrixReorderedUpper->getColumnIndices().data(),
                    m_gpuMatrixReorderedDiag.value().data(),
                    m_gpuReorderToNatural.data(),
                    m_gpuNaturalToReorder.data(),
                    levelStartIdx,
                    numOfRowsInLevel,
                    m_updateThreadBlockSize);

            } else {
                detail::ILU0::LUFactorization<field_type, blocksize_>(m_gpuReorderedLU->getNonZeroValues().data(),
                                                                      m_gpuReorderedLU->getRowIndices().data(),
                                                                      m_gpuReorderedLU->getColumnIndices().data(),
                                                                      m_gpuNaturalToReorder.data(),
                                                                      m_gpuReorderToNatural.data(),
                                                                      numOfRowsInLevel,
                                                                      levelStartIdx,
                                                                      m_updateThreadBlockSize);
            }
            levelStartIdx += numOfRowsInLevel;
        }
    }
}

template <class M, class X, class Y, int l>
void
OpmCuILU0<M, X, Y, l>::tuneThreadBlockSizes()
{
    // TODO generalize this tuning process in a function separate of the class
    long long bestApplyTime = std::numeric_limits<long long>::max();
    long long bestUpdateTime = std::numeric_limits<long long>::max();
    int bestApplyBlockSize = -1;
    int bestUpdateBlockSize = -1;
    int interval = 64;

    // temporary buffers for the apply
    CuVector<field_type> tmpV(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    CuVector<field_type> tmpD(m_gpuMatrix.N() * m_gpuMatrix.blockSize());
    tmpD = 1;

    for (int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
        // sometimes the first kernel launch kan be slower, so take the time twice
        for (int i = 0; i < 2; ++i) {

            auto beforeUpdate = std::chrono::high_resolution_clock::now();
            m_updateThreadBlockSize = thrBlockSize;
            update();
            std::ignore = cudaDeviceSynchronize();
            auto afterUpdate = std::chrono::high_resolution_clock::now();
            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
                long long durationInMicroSec
                    = std::chrono::duration_cast<std::chrono::microseconds>(afterUpdate - beforeUpdate).count();
                if (durationInMicroSec < bestUpdateTime) {
                    bestUpdateTime = durationInMicroSec;
                    bestUpdateBlockSize = thrBlockSize;
                }
            }

            auto beforeApply = std::chrono::high_resolution_clock::now();
            m_applyThreadBlockSize = thrBlockSize;
            apply(tmpV, tmpD);
            std::ignore = cudaDeviceSynchronize();
            auto afterApply = std::chrono::high_resolution_clock::now();
            if (cudaSuccess == cudaGetLastError()) { // kernel launch was valid
                long long durationInMicroSec
                    = std::chrono::duration_cast<std::chrono::microseconds>(afterApply - beforeApply).count();
                if (durationInMicroSec < bestApplyTime) {
                    bestApplyTime = durationInMicroSec;
                    bestApplyBlockSize = thrBlockSize;
                }
            }
        }
    }

    m_applyThreadBlockSize = bestApplyBlockSize;
    m_updateThreadBlockSize = bestUpdateBlockSize;
}

} // namespace Opm::cuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim)                                                                    \
    template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>,        \
                                            ::Opm::cuistl::CuVector<realtype>,                                         \
                                            ::Opm::cuistl::CuVector<realtype>>;                                        \
    template class ::Opm::cuistl::OpmCuILU0<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>,         \
                                            ::Opm::cuistl::CuVector<realtype>,                                         \
                                            ::Opm::cuistl::CuVector<realtype>>

INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);
INSTANTIATE_CUDILU_DUNE(double, 3);
INSTANTIATE_CUDILU_DUNE(double, 4);
INSTANTIATE_CUDILU_DUNE(double, 5);
INSTANTIATE_CUDILU_DUNE(double, 6);

INSTANTIATE_CUDILU_DUNE(float, 1);
INSTANTIATE_CUDILU_DUNE(float, 2);
INSTANTIATE_CUDILU_DUNE(float, 3);
INSTANTIATE_CUDILU_DUNE(float, 4);
INSTANTIATE_CUDILU_DUNE(float, 5);
INSTANTIATE_CUDILU_DUNE(float, 6);
opm/simulators/linalg/cuistl/OpmCuILU0.hpp (new file, 139 lines)
@@ -0,0 +1,139 @@
/*
  Copyright 2022-2023 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM.  If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUILU0_OPM_Impl_HPP
#define OPM_CUILU0_OPM_Impl_HPP

#include <memory>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <optional>
#include <type_traits>
#include <vector>


namespace Opm::cuistl
{
//! \brief ILU0 preconditioner on the GPU.
//!
//! \tparam M The matrix type to operate on
//! \tparam X Type of the update
//! \tparam Y Type of the defect
//! \tparam l Ignored. Just there to have the same number of template arguments
//!    as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class OpmCuILU0 : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
    //! \brief The matrix type the preconditioner is for.
    using matrix_type = typename std::remove_const<M>::type;
    //! \brief The domain type of the preconditioner.
    using domain_type = X;
    //! \brief The range type of the preconditioner.
    using range_type = Y;
    //! \brief The field type of the preconditioner.
    using field_type = typename X::field_type;
    //! \brief The GPU matrix type
    using CuMat = CuSparseMatrix<field_type>;

    //! \brief Constructor.
    //!
    //! Constructor gets all parameters to operate the prec.
    //! \param A The matrix to operate on.
    //! \param w The relaxation factor.
    //!
    explicit OpmCuILU0(const M& A, bool splitMatrix, bool tuneKernels);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
    void pre(X& x, Y& b) override;

    //! \brief Apply the preconditoner.
    void apply(X& v, const Y& d) override;

    //! \brief Post processing
    //! \note Does nothing at the moment
    void post(X& x) override;

    //! Category of the preconditioner (see SolverCategory::Category)
    Dune::SolverCategory::Category category() const override;

    //! \brief Updates the matrix data.
    void update() final;

    //! \brief Compute LU factorization, and update the data of the reordered matrix
    void LUFactorizeAndMoveData();

    //! \brief function that will experimentally tune the thread block sizes of the important cuda kernels
    void tuneThreadBlockSizes();

    //! \returns false
    static constexpr bool shouldCallPre()
    {
        return false;
    }

    //! \returns false
    static constexpr bool shouldCallPost()
    {
        return false;
    }


private:
    //! \brief Reference to the underlying matrix
    const M& m_cpuMatrix;
    //! \brief size_t describing the dimensions of the square block elements
    static constexpr const size_t blocksize_ = matrix_type::block_type::cols;
    //! \brief SparseTable storing each row by level
    Opm::SparseTable<size_t> m_levelSets;
    //! \brief converts from index in reordered structure to index natural ordered structure
    std::vector<int> m_reorderedToNatural;
    //! \brief converts from index in natural ordered structure to index reordered strucutre
    std::vector<int> m_naturalToReordered;
    //! \brief The A matrix stored on the gpu, and its reordred version
    CuMat m_gpuMatrix;
    std::unique_ptr<CuMat> m_gpuReorderedLU;
    //! \brief If matrix splitting is enabled, then we store the lower and upper part separately
    std::unique_ptr<CuMat> m_gpuMatrixReorderedLower;
    std::unique_ptr<CuMat> m_gpuMatrixReorderedUpper;
    //! \brief If matrix splitting is enabled, we also store the diagonal separately
    std::optional<CuVector<field_type>> m_gpuMatrixReorderedDiag;
    //! row conversion from natural to reordered matrix indices stored on the GPU
    CuVector<int> m_gpuNaturalToReorder;
    //! row conversion from reordered to natural matrix indices stored on the GPU
    CuVector<int> m_gpuReorderToNatural;
    //! \brief Stores the inverted diagonal that we use in ILU0
    CuVector<field_type> m_gpuDInv;
    //! \brief Bool storing whether or not we should store matrices in a split format
    bool m_splitMatrix;
    //! \brief Bool storing whether or not we will tune the threadblock sizes. Only used for AMD cards
    bool m_tuneThreadBlockSizes;
    //! \brief variables storing the threadblocksizes to use if using the tuned sizes and AMD cards
    //! The default value of -1 indicates that we have not calibrated and selected a value yet
    int m_applyThreadBlockSize = -1;
    int m_updateThreadBlockSize = -1;
};
} // end namespace Opm::cuistl

#endif
@ -0,0 +1,110 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2024 SINTEF AS
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_COLORING_AND_REORDERING_UTILS_HPP
#define OPM_COLORING_AND_REORDERING_UTILS_HPP

#include <fmt/core.h>
#include <memory>
#include <opm/common/ErrorMacros.hpp>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <tuple>
#include <vector>
/*
This file contains a collection of utility functions used in the GPU implementation of ILU and DILU.
The functions deal with creating the mappings between reordered and natural indices, as well as
extracting sparsity structures from Dune matrices and creating cusparse matrix indices.
*/
namespace Opm::cuistl::detail
{
inline std::vector<int>
createReorderedToNatural(const Opm::SparseTable<size_t>& levelSets)
{
    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[globCnt++] = static_cast<int>(col);
        }
    }
    return res;
}

inline std::vector<int>
createNaturalToReordered(const Opm::SparseTable<size_t>& levelSets)
{
    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[col] = globCnt++;
        }
    }
    return res;
}

template <class M, class field_type, class GPUM>
inline std::unique_ptr<GPUM>
createReorderedMatrix(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
    M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
    for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
        // For elements in A
        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
            dstRowIt.insert(elem.index());
        }
    }
    return std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedMatrix, true)));
}

template <class M, class field_type, class GPUM>
inline std::tuple<std::unique_ptr<GPUM>, std::unique_ptr<GPUM>>
extractLowerAndUpperMatrices(const M& naturalMatrix, const std::vector<int>& reorderedToNatural)
{
    const size_t new_nnz = (naturalMatrix.nonzeroes() - naturalMatrix.N()) / 2;

    M reorderedLower(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);
    M reorderedUpper(naturalMatrix.N(), naturalMatrix.N(), new_nnz, M::row_wise);

    for (auto lowerIt = reorderedLower.createbegin(), upperIt = reorderedUpper.createbegin();
         lowerIt != reorderedLower.createend();
         ++lowerIt, ++upperIt) {

        auto srcRow = naturalMatrix.begin() + reorderedToNatural[lowerIt.index()];

        for (auto elem = srcRow->begin(); elem != srcRow->end(); ++elem) {
            if (elem.index() < srcRow.index()) { // add index to lower matrix if under the diagonal
                lowerIt.insert(elem.index());
            } else if (elem.index() > srcRow.index()) { // add element to upper matrix if above the diagonal
                upperIt.insert(elem.index());
            }
        }
    }

    return {std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedLower, true))),
            std::unique_ptr<GPUM>(new auto(GPUM::fromMatrix(reorderedUpper, true)))};
}
} // namespace Opm::cuistl::detail

#endif
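A minimal host-side sketch of how the two mapping tables relate; the level sets {1, 2} and {0} below are a made-up example, not data from this commit:

// Hypothetical example: for level sets {1, 2} and {0} on a 3-row matrix,
// createReorderedToNatural would give {1, 2, 0} (reordered row r holds natural row reorderedToNatural[r])
// and createNaturalToReordered would give {2, 0, 1} (natural row n is stored at position naturalToReordered[n]).
#include <cassert>
#include <vector>

int main()
{
    const std::vector<int> reorderedToNatural {1, 2, 0};
    const std::vector<int> naturalToReordered {2, 0, 1};
    for (int r = 0; r < 3; ++r) {
        // The two tables are inverse permutations of each other.
        assert(naturalToReordered[reorderedToNatural[r]] == r);
    }
    return 0;
}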
@ -16,425 +16,17 @@
You should have received a copy of the GNU General Public License
along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <stdexcept>

namespace Opm::cuistl::detail
{
namespace
{

// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
    if (blocksize == 1) {
        dstBlock[0] = 1.0 / (srcBlock[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
        dstBlock[0] = srcBlock[3] * detInv;
        dstBlock[1] = -srcBlock[1] * detInv;
        dstBlock[2] = -srcBlock[2] * detInv;
        dstBlock[3] = srcBlock[0] * detInv;
    } else if (blocksize == 3) {
        // based on Dune implementation
        T t4 = srcBlock[0] * srcBlock[4];
        T t6 = srcBlock[0] * srcBlock[5];
        T t8 = srcBlock[1] * srcBlock[3];
        T t10 = srcBlock[2] * srcBlock[3];
        T t12 = srcBlock[1] * srcBlock[6];
        T t14 = srcBlock[2] * srcBlock[6];

        T t17 = 1.0
            / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
               - t14 * srcBlock[4]); // t17 is 1/determinant

        dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
        dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
        dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
        dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
        dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
        dstBlock[5] = -(t6 - t10) * t17;
        dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
        dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
        dstBlock[8] = (t4 - t8) * t17;
    }
}

// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void invBlockInPlace(T* __restrict__ block)
{
    if (blocksize == 1) {
        block[0] = 1.0 / (block[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);

        T temp = block[0];
        block[0] = block[3] * detInv;
        block[1] = -block[1] * detInv;
        block[2] = -block[2] * detInv;
        block[3] = temp * detInv;
    } else if (blocksize == 3) {
        T t4 = block[0] * block[4];
        T t6 = block[0] * block[5];
        T t8 = block[1] * block[3];
        T t10 = block[2] * block[3];
        T t12 = block[1] * block[6];
        T t14 = block[2] * block[6];

        T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
        T t17 = T(1.0) / det;

        T matrix01 = block[1];
        T matrix00 = block[0];
        T matrix10 = block[3];
        T matrix11 = block[4];

        block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
        block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
        block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
        block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
        block[4] = (matrix00 * block[8] - t14) * t17;
        block[5] = -(t6 - t10) * t17;
        block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
        block[7] = -(matrix00 * block[7] - t12) * t17;
        block[8] = (t4 - t8) * t17;
    }
}

enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
    for (int i = 0; i < blocksize; ++i) {
        if (OP == MVType::SET) {
            c[i] = 0;
        }

        for (int j = 0; j < blocksize; ++j) {
            if (OP == MVType::SET || OP == MVType::PLUS) {
                c[i] += A[i * blocksize + j] * b[j];
            } else if (OP == MVType::MINUS) {
                c[i] -= A[i * blocksize + j] * b[j];
            }
        }
    }
}

template <class T, int blocksize>
__device__ __forceinline__ void mv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void umv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void mmv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}

// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst)
{

    T tmp[blocksize * blocksize] = {0};
    // tmp = A*B
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    // dst = tmp*C
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void
cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
    const auto row = blockDim.x * blockIdx.x + threadIdx.x;

    if (row < numberOfRows) {
        size_t nnzIdx = rowIndices[row];
        size_t nnzIdxLim = rowIndices[row + 1];

        // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
        // zero element
        while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
            ++nnzIdx;
        }

        // diagBlock points to the start of where the diagonal block is stored
        T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
        // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
        // element should be stored
        T* vecBlock = &vec[blocksize * blocksize * row];

        invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
    }
}

template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSet(T* mat,
                                            int* rowIndices,
                                            int* colIndices,
                                            int* indexConversion,
                                            int startIdx,
                                            int rowsInLevelSet,
                                            const T* dInv,
                                            const T* d,
                                            T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeLowerSolveLevelSetSplit(T* mat,
                                                 int* rowIndices,
                                                 int* colIndices,
                                                 int* indexConversion,
                                                 int startIdx,
                                                 int rowsInLevelSet,
                                                 const T* dInv,
                                                 const T* d,
                                                 T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        // TODO: remove the first condition in the for loop
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}


template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSet(T* mat,
                                            int* rowIndices,
                                            int* colIndices,
                                            int* indexConversion,
                                            int startIdx,
                                            int rowsInLevelSet,
                                            const T* dInv,
                                            T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeUpperSolveLevelSetSplit(T* mat,
                                                 int* rowIndices,
                                                 int* colIndices,
                                                 int* indexConversion,
                                                 int startIdx,
                                                 int rowsInLevelSet,
                                                 const T* dInv,
                                                 T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
                                      int* rowIndices,
                                      int* colIndices,
                                      int* reorderedToNatural,
                                      int* naturalToReordered,
                                      const int startIdx,
                                      int rowsInLevelSet,
                                      T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t nnzIdx = rowIndices[reorderedRowIdx];

        int diagIdx = nnzIdx;
        while (colIndices[diagIdx] != naturalRowIdx) {
            ++diagIdx;
        }

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = naturalToReordered[colIndices[block]];
            // find element with indices swapped
            // Binary search over block in the right row, [rowIndices[col], rowindices[col+1]-1] defines the range
            // we binary search over
            int left = rowIndices[col];
            int right = rowIndices[col + 1] - 1;
            int mid = left + (right - left) / 2; // overflow-safe average;

            while (left <= right) {
                const int col = colIndices[mid];

                if (col == naturalRowIdx) {
                    break;
                } else if (col < naturalRowIdx) {
                    left = mid + 1;
                } else {
                    right = mid - 1;
                }
                mid = left + (right - left) / 2; // overflow-safe average
            }

            const int symOpposite = mid;

            mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &mat[symOpposite * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
                                           int* lowerRowIndices,
                                           int* lowerColIndices,
                                           T* reorderedUpperMat,
                                           int* upperRowIndices,
                                           int* upperColIndices,
                                           T* diagonal,
                                           int* reorderedToNatural,
                                           int* naturalToReordered,
                                           const int startIdx,
                                           int rowsInLevelSet,
                                           T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
        const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
            const int col = naturalToReordered[lowerColIndices[block]];

            int symOppositeIdx = upperRowIndices[col];
            for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
                if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
                    break;
                }
            }

            const int symOppositeBlock = symOppositeIdx;

            mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void cuMoveDataToReordered(
    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows)
@ -505,195 +97,21 @@ namespace
        }
    }
}

// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
{
    if (suggestedThrBlockSize != -1) {
        return suggestedThrBlockSize;
    }
    // Use cuda API to maximize occupancy, otherwise we just pick a thread block size if it is not tuned
#if USE_HIP
    return 512;
#else
    int blockSize = 1024;
    int tmpGridSize;
    std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
    return blockSize;
#endif
}

inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
    return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}

} // namespace

template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
    if (blocksize <= 3) {
        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
        int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
        cuInvertDiagonalAndFlatten<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
computeLowerSolveLevelSet(T* reorderedMat,
                          int* rowIndices,
                          int* colIndices,
                          int* indexConversion,
                          int startIdx,
                          int rowsInLevelSet,
                          const T* dInv,
                          const T* d,
                          T* v,
                          int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeLowerSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}


template <class T, int blocksize>
void
computeLowerSolveLevelSetSplit(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               const T* d,
                               T* v,
                               int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeLowerSolveLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeLowerSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
computeUpperSolveLevelSet(T* reorderedMat,
                          int* rowIndices,
                          int* colIndices,
                          int* indexConversion,
                          int startIdx,
                          int rowsInLevelSet,
                          const T* dInv,
                          T* v,
                          int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeUpperSolveLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
computeUpperSolveLevelSetSplit(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               T* v,
                               int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeUpperSolveLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuComputeUpperSolveLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
                    int* rowIndices,
                    int* colIndices,
                    int* reorderedToNatural,
                    int* naturalToReordered,
                    const int startIdx,
                    int rowsInLevelSet,
                    T* dInv,
                    int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
        int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonal<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
                                                 rowIndices,
                                                 colIndices,
                                                 reorderedToNatural,
                                                 naturalToReordered,
                                                 startIdx,
                                                 rowsInLevelSet,
                                                 dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
                         int* lowerRowIndices,
                         int* lowerColIndices,
                         T* reorderedUpperMat,
                         int* upperRowIndices,
                         int* upperColIndices,
                         T* diagonal,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         const int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = getCudaRecomendedThreadBlockSize(cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
        int nThreadBlocks = getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
                                                                                     reorderedUpperMat,
                                                                                     upperRowIndices,
                                                                                     upperColIndices,
                                                                                     diagonal,
                                                                                     reorderedToNatural,
                                                                                     naturalToReordered,
                                                                                     startIdx,
                                                                                     rowsInLevelSet,
                                                                                     dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

template <class T, int blocksize>
void
copyMatDataToReordered(
    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows,
    int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
}

template <class T, int blocksize>
void
copyMatDataToReordered(T* srcMatrix,
                       int* srcRowIndices,
                       T* dstMatrix,
                       int* dstRowIndices,
                       int* naturalToReordered,
                       size_t numberOfRows,
                       int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuMoveDataToReordered<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReordered<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
}
@ -712,8 +130,9 @@ copyMatDataToReorderedSplit(T* srcMatrix,
                            size_t numberOfRows,
                            int thrBlockSize)
{
    int threadBlockSize = getCudaRecomendedThreadBlockSize(cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = getNumberOfBlocks(numberOfRows, threadBlockSize);
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuMoveDataToReorderedSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
    cuMoveDataToReorderedSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(srcMatrix,
                                                                                 srcRowIndices,
                                                                                 srcColumnIndices,
@ -727,16 +146,8 @@ copyMatDataToReorderedSplit(T* srcMatrix,
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);                                \
    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t, int);                       \
    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int); \
    template void copyMatDataToReorderedSplit<T, blocksize>(T*, int*, int*, T*, int*, T*, int*, T*, int*, size_t, int);
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int);            \
    template void computeDiluDiagonalSplit<T, blocksize>(                                                            \
        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int);                                    \
    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);        \
    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int); \
    template void computeUpperSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);   \
    template void computeLowerSolveLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
@ -22,189 +22,6 @@
#include <vector>
namespace Opm::cuistl::detail
{

/**
 * @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in d_vec
 * @param mat pointer to GPU memory containing nonzerovalues of the sparse matrix
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param numberOfRows Integer describing the number of rows in the matrix
 * @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
 */
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
 * that rows in the same level sets are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void computeLowerSolveLevelSet(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               const T* d,
                               T* v,
                               int threadBlockSize);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
 * such that rows in the same level sets are contiguous. This matrix is assumed to be strictly lower triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedUpperMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void computeLowerSolveLevelSetSplit(T* reorderedUpperMat,
                                    int* rowIndices,
                                    int* colIndices,
                                    int* indexConversion,
                                    int startIdx,
                                    int rowsInLevelSet,
                                    const T* dInv,
                                    const T* d,
                                    T* v,
                                    int threadBlockSize);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
 * that rows in the same level sets are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
 * solve
 */
template <class T, int blocksize>
void computeUpperSolveLevelSet(T* reorderedMat,
                               int* rowIndices,
                               int* colIndices,
                               int* indexConversion,
                               int startIdx,
                               int rowsInLevelSet,
                               const T* dInv,
                               T* v,
                               int threadBlockSize);
template <class T, int blocksize>

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered
 * such that rows in the same level sets are contiguous. This matrix is assumed to be strictly upper triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
 * in the natural ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the lower solve. To begin with it should store the output from the lower
 * solve
 */
void computeUpperSolveLevelSetSplit(T* reorderedUpperMat,
                                    int* rowIndices,
                                    int* colIndices,
                                    int* indexConversion,
                                    int startIdx,
                                    int rowsInLevelSet,
                                    const T* dInv,
                                    T* v,
                                    int threadBlockSize);

/**
 * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
 * that rows in the same level sets are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding
 * index in the natural ordered matrix
 * @param naturalToReordered Integer array containing mapping an index in the natural ordered matrix to its
 * corresponding index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
                         int* rowIndices,
                         int* colIndices,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int threadBlockSize);
template <class T, int blocksize>

/**
 * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedLowerMat pointer to GPU memory containing nonzerovalues of the strictly lower triangular sparse
 * matrix. The matrix reordered such that rows in the same level sets are contiguous
 * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with bsr format
 * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with bsr format
 * @param reorderedUpperMat pointer to GPU memory containing nonzerovalues of the strictly upper triangular sparse
 * matrix. The matrix reordered such that rows in the same level sets are contiguous
 * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with bsr format
 * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with bsr format
 * @param reorderedToNatural Integer array containing mapping an index in the reordered matrix to its corresponding
 * index in the natural ordered matrix
 * @param diagonal The diagonal elements of the reordered matrix
 * @param naturalToReordered Integer array containing mapping an index in the natural ordered matrix to its
 * corresponding index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
void computeDiluDiagonalSplit(T* reorderedLowerMat,
                              int* lowerRowIndices,
                              int* lowerColIndices,
                              T* reorderedUpperMat,
                              int* upperRowIndices,
                              int* upperColIndices,
                              T* diagonal,
                              int* reorderedToNatural,
                              int* naturalToReordered,
                              int startIdx,
                              int rowsInLevelSet,
                              T* dInv,
                              int threadBlockSize);

/**
 * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
 * @param srcMatrix The source matrix we will copy data from
@ -216,26 +33,40 @@ void computeDiluDiagonalSplit(T* reorderedLowerMat,
 * @param numberOfRows The number of rows in the matrices
 */
template <class T, int blocksize>
void copyMatDataToReordered(
    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows, int threadBlockSize);
void copyMatDataToReordered(T* srcMatrix,
                            int* srcRowIndices,
                            T* dstMatrix,
                            int* dstRowIndices,
                            int* naturalToReordered,
                            size_t numberOfRows,
                            int threadBlockSize);

/**
 * @brief Reorders the elements of a matrix by copying them from one matrix to a split matrix using a permutation list
 * @param srcMatrix The source matrix we will copy data from
 * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
 * @param [out] dstLowerMatrix The destination of entries that originate from the strictly lower triangular matrix
 * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination lower matrix compliant with
 * bsr format
 * @param [out] dstUpperMatrix The destination of entries that originate from the strictly upper triangular matrix
 * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination upper matrix compliant with
 * bsr format
 * @param [out] dstDiag The destination buffer for the diagonal part of the matrix
 * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
 * @param numberOfRows The number of rows in the matrices
 */
template <class T, int blocksize>
void copyMatDataToReorderedSplit(
    T* srcMatrix, int* srcRowIndices, int* srcColumnIndices, T* dstLowerMatrix, int* dstLowerRowIndices, T* dstUpperMatrix, int* dstUpperRowIndices, T* dstDiag, int* naturalToReordered, size_t numberOfRows, int threadBlockSize);
void copyMatDataToReorderedSplit(T* srcMatrix,
                                 int* srcRowIndices,
                                 int* srcColumnIndices,
                                 T* dstLowerMatrix,
                                 int* dstLowerRowIndices,
                                 T* dstUpperMatrix,
                                 int* dstUpperRowIndices,
                                 T* dstDiag,
                                 int* naturalToReordered,
                                 size_t numberOfRows,
                                 int threadBlockSize);

} // namespace Opm::cuistl::detail
#endif
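As a rough CPU analogue of the reordering copy declared above (a sketch that assumes BSR-style row pointers with blocksize 1; copyRowsToReordered and its arguments are hypothetical names, not part of the commit):

// Hypothetical CPU sketch: each source row's values are copied to the row position
// given by the naturalToReordered permutation, mirroring what the GPU copy does.
#include <cstddef>
#include <vector>

void copyRowsToReordered(const std::vector<double>& srcValues,
                         const std::vector<int>& srcRowIndices,
                         std::vector<double>& dstValues,
                         const std::vector<int>& dstRowIndices,
                         const std::vector<int>& naturalToReordered,
                         std::size_t numberOfRows)
{
    for (std::size_t srcRow = 0; srcRow < numberOfRows; ++srcRow) {
        const int dstRow = naturalToReordered[srcRow];
        const int rowLength = srcRowIndices[srcRow + 1] - srcRowIndices[srcRow];
        for (int i = 0; i < rowLength; ++i) {
            dstValues[dstRowIndices[dstRow] + i] = srcValues[srcRowIndices[srcRow] + i];
        }
    }
}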
234 opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp Normal file
@ -0,0 +1,234 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/

#ifndef OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP
#define OPM_CUISTL_DEVICE_BLOCK_OPERATIONS_HPP

#include <config.h>
#include <cuda_runtime.h>

/*
This file provides inlineable functions intended for CUDA kernels operating on block matrix elements.
The functions provide various matrix operations that are used by the preconditioners.
*/

namespace
{
// TODO: figure out if this can be generalized effectively, this seems excessively verbose
// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
{
    if (blocksize == 1) {
        dstBlock[0] = 1.0 / (srcBlock[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
        dstBlock[0] = srcBlock[3] * detInv;
        dstBlock[1] = -srcBlock[1] * detInv;
        dstBlock[2] = -srcBlock[2] * detInv;
        dstBlock[3] = srcBlock[0] * detInv;
    } else if (blocksize == 3) {
        // based on Dune implementation
        T t4 = srcBlock[0] * srcBlock[4];
        T t6 = srcBlock[0] * srcBlock[5];
        T t8 = srcBlock[1] * srcBlock[3];
        T t10 = srcBlock[2] * srcBlock[3];
        T t12 = srcBlock[1] * srcBlock[6];
        T t14 = srcBlock[2] * srcBlock[6];

        T t17 = 1.0
            / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
               - t14 * srcBlock[4]); // t17 is 1/determinant

        dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
        dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
        dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
        dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
        dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
        dstBlock[5] = -(t6 - t10) * t17;
        dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
        dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
        dstBlock[8] = (t4 - t8) * t17;
    }
}

// explicit formulas based on Dune cpu code
template <class T, int blocksize>
__device__ __forceinline__ void
invBlockInPlace(T* __restrict__ block)
{
    if (blocksize == 1) {
        block[0] = 1.0 / (block[0]);
    } else if (blocksize == 2) {
        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);

        T temp = block[0];
        block[0] = block[3] * detInv;
        block[1] = -block[1] * detInv;
        block[2] = -block[2] * detInv;
        block[3] = temp * detInv;
    } else if (blocksize == 3) {
        T t4 = block[0] * block[4];
        T t6 = block[0] * block[5];
        T t8 = block[1] * block[3];
        T t10 = block[2] * block[3];
        T t12 = block[1] * block[6];
        T t14 = block[2] * block[6];

        T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
        T t17 = T(1.0) / det;

        T matrix01 = block[1];
        T matrix00 = block[0];
        T matrix10 = block[3];
        T matrix11 = block[4];

        block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
        block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
        block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
        block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
        block[4] = (matrix00 * block[8] - t14) * t17;
        block[5] = -(t6 - t10) * t17;
        block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
        block[7] = -(matrix00 * block[7] - t12) * t17;
        block[8] = (t4 - t8) * t17;
    }
}

enum class MVType { SET, PLUS, MINUS };
// SET: c = A*b
// PLUS: c += A*b
// MINUS: c -= A*b
template <class T, int blocksize, MVType OP>
__device__ __forceinline__ void
matrixVectorProductWithAction(const T* A, const T* b, T* c)
{
    for (int i = 0; i < blocksize; ++i) {
        if (OP == MVType::SET) {
            c[i] = 0;
        }

        for (int j = 0; j < blocksize; ++j) {
            if (OP == MVType::SET || OP == MVType::PLUS) {
                c[i] += A[i * blocksize + j] * b[j];
            } else if (OP == MVType::MINUS) {
                c[i] -= A[i * blocksize + j] * b[j];
            }
        }
    }
}

template <class T, int blocksize>
__device__ __forceinline__ void
mv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void
umv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
}

template <class T, int blocksize>
__device__ __forceinline__ void
mmv(const T* a, const T* b, T* c)
{
    matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
}

// dst -= A*B*C
template <class T, int blocksize>
__device__ __forceinline__ void
mmx2Subtraction(T* A, T* B, T* C, T* dst)
{

    T tmp[blocksize * blocksize] = {0};
    // tmp = A*B
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    // dst = tmp*C
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
            }
        }
    }
}

// C = A*B, assumes the three buffers do not overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmNoOverlap(T* A, T* B, T* C)
{
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                C[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }
}

// C = A*B, buffers may overlap
template <class T, int blocksize>
__device__ __forceinline__ void
mmOverlap(T* A, T* B, T* C)
{
    T tmp[blocksize * blocksize] = {0};
    for (int i = 0; i < blocksize; ++i) {
        for (int k = 0; k < blocksize; ++k) {
            for (int j = 0; j < blocksize; ++j) {
                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
            }
        }
    }

    for (int i = 0; i < blocksize; ++i) {
        for (int j = 0; j < blocksize; ++j) {
            C[i * blocksize + j] = tmp[i * blocksize + j];
        }
    }
}

// A -= B
template <class T, int blocksize>
__device__ __forceinline__ void
matrixSubtraction(T* A, T* B)
{

    for (int i = 0; i < blocksize; ++i) {
        for (int j = 0; j < blocksize; ++j) {
            A[i * blocksize + j] -= B[i * blocksize + j];
        }
    }
}
} // namespace

#endif
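A small host-side check of the explicit 2x2 inverse formula used by invBlockInPlace and invBlockOutOfPlace above (a standalone sketch with made-up numbers, not part of the commit):

// Host-side sanity check of the 2x2 block inverse: inv([a b; c d]) = 1/(ad - bc) * [d -b; -c a].
#include <cassert>
#include <cmath>

int main()
{
    const double src[4] = {4.0, 7.0, 2.0, 6.0}; // row-major [a b; c d]
    const double detInv = 1.0 / (src[0] * src[3] - src[1] * src[2]);
    const double dst[4] = {src[3] * detInv, -src[1] * detInv, -src[2] * detInv, src[0] * detInv};

    // The first row of A * inv(A) should be (1, 0).
    const double prod00 = src[0] * dst[0] + src[1] * dst[2];
    const double prod01 = src[0] * dst[1] + src[1] * dst[3];
    assert(std::abs(prod00 - 1.0) < 1e-12);
    assert(std::abs(prod01) < 1e-12);
    return 0;
}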
66
opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp
Normal file
66
opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp
Normal file
@ -0,0 +1,66 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_GPU_THREAD_UTILS_HPP
#define OPM_GPU_THREAD_UTILS_HPP
#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>

/*
    This file provides some logic for handling how to choose the correct thread-block size
*/
namespace Opm::cuistl::detail
{
constexpr inline size_t
getThreads([[maybe_unused]] size_t numberOfRows)
{
    return 1024;
}

inline size_t
getBlocks(size_t numberOfRows)
{
    const auto threads = getThreads(numberOfRows);
    return (numberOfRows + threads - 1) / threads;
}

// Kernel here is the function object of the cuda kernel
template <class Kernel>
inline int
getCudaRecomendedThreadBlockSize(Kernel k, int suggestedThrBlockSize = -1)
{
    if (suggestedThrBlockSize != -1) {
        return suggestedThrBlockSize;
    }
    int blockSize;
    int tmpGridSize;
    OPM_CUDA_SAFE_CALL(cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0));
    return blockSize;
}

inline int
getNumberOfBlocks(int wantedThreads, int threadBlockSize)
{
    return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
}

} // namespace Opm::cuistl::detail

#endif
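The kernel wrappers in the two .cu files below all size their launches with these helpers. A minimal sketch of the pattern (the kernel dummyKernel and the function launchDummy are hypothetical, used only for illustration):

#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>

// Hypothetical kernel used only to illustrate the launch-sizing helpers.
__global__ void dummyKernel(double* x, int n)
{
    const int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        x[i] *= 2.0;
    }
}

void launchDummy(double* x, int n, int suggestedThrBlockSize = -1)
{
    // Ask the occupancy API (or the caller) for a block size, then derive
    // the grid size so that every row gets one thread.
    const int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(dummyKernel, suggestedThrBlockSize);
    const int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(n, threadBlockSize);
    dummyKernel<<<nThreadBlocks, threadBlockSize>>>(x, n);
}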
@ -0,0 +1,437 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>
#include <stdexcept>

namespace Opm::cuistl::detail::DILU
{
namespace
{

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
                                     int* rowIndices,
                                     int* colIndices,
                                     int* indexConversion,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* dInv,
                                     const T* d,
                                     T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* dInv,
                                          const T* d,
                                          T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        // TODO: remove the first condition in the for loop
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}


template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(T* mat,
                                     int* rowIndices,
                                     int* colIndices,
                                     int* indexConversion,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* dInv,
                                     T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* dInv,
                                          T* v)
{
    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedRowIdx];
        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedRowIdx];

        T rhs[blocksize] = {0};
        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonal(T* mat,
                                      int* rowIndices,
                                      int* colIndices,
                                      int* reorderedToNatural,
                                      int* naturalToReordered,
                                      const int startIdx,
                                      int rowsInLevelSet,
                                      T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t nnzIdx = rowIndices[reorderedRowIdx];

        int diagIdx = nnzIdx;
        while (colIndices[diagIdx] != naturalRowIdx) {
            ++diagIdx;
        }

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
            const int col = naturalToReordered[colIndices[block]];
            // find element with indices swapped
            // Binary search over blocks in the right row, [rowIndices[col], rowIndices[col+1]-1] defines the range
            // we binary search over
            int left = rowIndices[col];
            int right = rowIndices[col + 1] - 1;
            int mid;

            while (left <= right) {
                mid = left + (right - left) / 2; // overflow-safe average
                const int col = colIndices[mid];

                if (col == naturalRowIdx) {
                    break;
                } else if (col < naturalRowIdx) {
                    left = mid + 1;
                } else {
                    right = mid - 1;
                }
            }

            const int symOpposite = mid;

            mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &mat[symOpposite * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}

template <class T, int blocksize>
__global__ void cuComputeDiluDiagonalSplit(T* reorderedLowerMat,
                                           int* lowerRowIndices,
                                           int* lowerColIndices,
                                           T* reorderedUpperMat,
                                           int* upperRowIndices,
                                           int* upperColIndices,
                                           T* diagonal,
                                           int* reorderedToNatural,
                                           int* naturalToReordered,
                                           const int startIdx,
                                           int rowsInLevelSet,
                                           T* dInv)
{
    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
        const size_t lowerRowStart = lowerRowIndices[reorderedRowIdx];
        const size_t lowerRowEnd = lowerRowIndices[reorderedRowIdx + 1];

        T dInvTmp[blocksize * blocksize];
        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInvTmp[i * blocksize + j] = diagonal[reorderedRowIdx * blocksize * blocksize + i * blocksize + j];
            }
        }

        for (int block = lowerRowStart; block < lowerRowEnd; ++block) {
            const int col = naturalToReordered[lowerColIndices[block]];

            int symOppositeIdx = upperRowIndices[col];
            for (; symOppositeIdx < upperRowIndices[col + 1]; ++symOppositeIdx) {
                if (naturalRowIdx == upperColIndices[symOppositeIdx]) {
                    break;
                }
            }

            const int symOppositeBlock = symOppositeIdx;

            mmx2Subtraction<T, blocksize>(&reorderedLowerMat[block * blocksize * blocksize],
                                          &dInv[col * blocksize * blocksize],
                                          &reorderedUpperMat[symOppositeBlock * blocksize * blocksize],
                                          dInvTmp);
        }

        invBlockInPlace<T, blocksize>(dInvTmp);

        for (int i = 0; i < blocksize; ++i) {
            for (int j = 0; j < blocksize; ++j) {
                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
            }
        }
    }
}
} // namespace

// perform the lower solve for all rows in the same level set
template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   const T* dInv,
                   const T* d,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}


template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        const T* d,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   const T* dInv,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
computeDiluDiagonal(T* reorderedMat,
                    int* rowIndices,
                    int* colIndices,
                    int* reorderedToNatural,
                    int* naturalToReordered,
                    const int startIdx,
                    int rowsInLevelSet,
                    T* dInv,
                    int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonal<T, blocksize>, thrBlockSize);
        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonal<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedMat,
                                                                                rowIndices,
                                                                                colIndices,
                                                                                reorderedToNatural,
                                                                                naturalToReordered,
                                                                                startIdx,
                                                                                rowsInLevelSet,
                                                                                dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

template <class T, int blocksize>
void
computeDiluDiagonalSplit(T* reorderedLowerMat,
                         int* lowerRowIndices,
                         int* lowerColIndices,
                         T* reorderedUpperMat,
                         int* upperRowIndices,
                         int* upperColIndices,
                         T* diagonal,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         const int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int thrBlockSize)
{
    if (blocksize <= 3) {
        int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
            cuComputeDiluDiagonalSplit<T, blocksize>, thrBlockSize);
        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
        cuComputeDiluDiagonalSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                                     lowerRowIndices,
                                                                                     lowerColIndices,
                                                                                     reorderedUpperMat,
                                                                                     upperRowIndices,
                                                                                     upperColIndices,
                                                                                     diagonal,
                                                                                     reorderedToNatural,
                                                                                     naturalToReordered,
                                                                                     startIdx,
                                                                                     rowsInLevelSet,
                                                                                     dInv);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*, int);            \
    template void computeDiluDiagonalSplit<T, blocksize>(                                                             \
        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, T*, int);                                    \
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);               \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);     \
    template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);          \
    template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*, int);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::DILU
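Not part of the patch: the wrappers above are launched once per level set. A rough host-side sketch of how a caller (such as the CuDILU preconditioner) could drive them, with a lower sweep followed by an upper sweep; the applyDilu function and the levelSetSizes vector are assumptions made only for illustration, and all pointers are device pointers laid out as described in DILUKernels.hpp.

#include <vector>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/DILUKernels.hpp>

// Sketch: apply the DILU preconditioner by sweeping the level sets forward
// (lower solve) and then backward (upper solve).
template <class T, int blocksize>
void applyDilu(T* reorderedMat, int* rowIndices, int* colIndices, int* reorderedToNatural,
               const std::vector<int>& levelSetSizes, const T* dInv, const T* d, T* v)
{
    int startIdx = 0;
    for (const int rowsInLevelSet : levelSetSizes) {
        // Rows inside one level set have no dependencies on each other.
        Opm::cuistl::detail::DILU::solveLowerLevelSet<T, blocksize>(
            reorderedMat, rowIndices, colIndices, reorderedToNatural, startIdx, rowsInLevelSet, dInv, d, v, -1);
        startIdx += rowsInLevelSet;
    }
    for (auto it = levelSetSizes.rbegin(); it != levelSetSizes.rend(); ++it) {
        startIdx -= *it;
        Opm::cuistl::detail::DILU::solveUpperLevelSet<T, blocksize>(
            reorderedMat, rowIndices, colIndices, reorderedToNatural, startIdx, *it, dInv, v, -1);
    }
}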
@ -0,0 +1,202 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_DILU_KERNELS_HPP
#define OPM_DILU_KERNELS_HPP

#include <cstddef>
#include <cuda.h>
#include <cuda_runtime.h>
#include <vector>

namespace Opm::cuistl::detail::DILU
{

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
 * such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void solveLowerLevelSet(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        const T* d,
                        T* v,
                        int threadBlockSize);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
 * reorderedUpperMat
 * @param d Stores the defect
 * @param [out] v Will store the results of the lower solve
 */
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* dInv,
                             const T* d,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
 * such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
 * solve
 */
template <class T, int blocksize>
void solveUpperLevelSet(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
                        int threadBlockSize);

/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
 * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
 * solve
 */
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedUpperMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* dInv,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
 * such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
 * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
template <class T, int blocksize>
void computeDiluDiagonal(T* reorderedMat,
                         int* rowIndices,
                         int* colIndices,
                         int* reorderedToNatural,
                         int* naturalToReordered,
                         int startIdx,
                         int rowsInLevelSet,
                         T* dInv,
                         int threadBlockSize);

/**
 * @brief Computes the ILU0 of the diagonal elements of the split reordered matrix and stores it in a reordered vector
 * containing the diagonal blocks
 * @param reorderedLowerMat pointer to GPU memory containing nonzero values of the strictly lower triangular sparse
 * matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with bsr format
 * @param lowerColIndices Pointer to vector on GPU containing col indices of the lower matrix compliant with bsr format
 * @param reorderedUpperMat pointer to GPU memory containing nonzero values of the strictly upper triangular sparse
 * matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with bsr format
 * @param upperColIndices Pointer to vector on GPU containing col indices of the upper matrix compliant with bsr format
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param diagonal The diagonal elements of the reordered matrix
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
 * function
 * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
 */
template <class T, int blocksize>
void computeDiluDiagonalSplit(T* reorderedLowerMat,
                              int* lowerRowIndices,
                              int* lowerColIndices,
                              T* reorderedUpperMat,
                              int* upperRowIndices,
                              int* upperColIndices,
                              T* diagonal,
                              int* reorderedToNatural,
                              int* naturalToReordered,
                              int startIdx,
                              int rowsInLevelSet,
                              T* dInv,
                              int threadBlockSize);

} // namespace Opm::cuistl::detail::DILU
#endif
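The *Split variants above assume the reordered matrix has been split into a strictly lower part, the diagonal blocks, and a strictly upper part, each with its own BSR-style index arrays. A small illustration of that layout (the concrete matrix and the identity reordering are assumptions made only for this example, not data from the patch):

// Natural-order matrix (blocksize = 1):
//     [ 4  1  0 ]
//     [ 2  5  0 ]
//     [ 0  3  6 ]
double lowerValues[]     = {2.0, 3.0};      // strictly lower blocks, row by row
int    lowerRowIndices[] = {0, 0, 1, 2};    // row i owns blocks [lowerRowIndices[i], lowerRowIndices[i + 1])
int    lowerColIndices[] = {0, 1};
double diagonal[]        = {4.0, 5.0, 6.0}; // one blocksize*blocksize block per row
double upperValues[]     = {1.0};           // strictly upper blocks, row by row
int    upperRowIndices[] = {0, 1, 1, 1};
int    upperColIndices[] = {1};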
@ -0,0 +1,476 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/ILU0Kernels.hpp>
#include <stdexcept>

/*
    The LU factorization and apply step is written based on the Dune implementations
*/

namespace Opm::cuistl::detail::ILU0
{
namespace
{
template <class T, int blocksize>
__global__ void cuLUFactorization(T* srcMatrix,
                                  int* srcRowIndices,
                                  int* srcColumnIndices,
                                  int* naturalToReordered,
                                  int* reorderedToNatual,
                                  size_t rowsInLevelSet,
                                  int startIdx)
{
    auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    constexpr int scalarsInBlock = blocksize * blocksize;

    if (reorderedIdx < rowsInLevelSet + startIdx) {
        int naturalIdx = reorderedToNatual[reorderedIdx];
        // for each element under the diagonal
        int endOfRowI = srcRowIndices[reorderedIdx + 1];
        int ij;
        for (ij = srcRowIndices[reorderedIdx];
             ij < srcRowIndices[reorderedIdx + 1] && srcColumnIndices[ij] < naturalIdx;
             ++ij) {
            // consider the block we are looking at to be A_ij
            // we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
            // already inverted. Find the A_jj block
            int jj; // index in bcsr of the A_jj element
            int j = naturalToReordered[srcColumnIndices[ij]];
            int startOfRowJ = srcRowIndices[j];
            int endOfRowJ = srcRowIndices[j + 1];
            for (int blockInRowJ = startOfRowJ; blockInRowJ < endOfRowJ; ++blockInRowJ) {
                if (srcColumnIndices[blockInRowJ] == srcColumnIndices[ij]) {
                    jj = blockInRowJ;
                    break;
                }
            }

            // the DUNE implementation is inplace, and I need to take care to make sure
            // this out of place implementation is equivalent
            mmOverlap<T, blocksize>(
                &srcMatrix[ij * scalarsInBlock], &srcMatrix[jj * scalarsInBlock], &srcMatrix[ij * scalarsInBlock]);

            // we have now accounted for an element under the diagonal and the diagonal element above it
            // now iterate over the blocks that align in the column
            int jk = jj + 1;
            int ik = ij + 1;
            // this code is NOT gpu friendly, thread divergence will probably slow this down significantly...
            while (ik != endOfRowI && jk != endOfRowJ) {
                if (srcColumnIndices[ik] == srcColumnIndices[jk]) {
                    // A_jk = A_ij * A_jk
                    T tmp[scalarsInBlock] = {0};
                    mmNoOverlap<T, blocksize>(
                        &srcMatrix[ij * scalarsInBlock], &srcMatrix[jk * scalarsInBlock], tmp);
                    matrixSubtraction<T, blocksize>(&srcMatrix[ik * scalarsInBlock], tmp);
                    ++ik;
                    ++jk;
                } else {
                    if (srcColumnIndices[ik] < srcColumnIndices[jk]) {
                        ++ik;
                    } else {
                        ++jk;
                    }
                }
            }
        }
        invBlockInPlace<T, blocksize>(&srcMatrix[ij * scalarsInBlock]);
    }
}

// An index in the BCSR matrix can be in one of three states: under, on, or above the diagonal
enum class POSITION_TYPE { UNDER_DIAG, ON_DIAG, ABOVE_DIAG };
// this handles incrementation of an index on a row that is in a lower/upper split datastructure.
// The point is that if we are in the lower or upper part we increment as usual.
// if we reach the diagonal, we update the state; the index is not needed.
// when we increment from the diagonal, we set it to be the first block in the upper part of that row
__device__ __forceinline__ void
incrementAcrossSplitStructure(int& idx, POSITION_TYPE& currentState, int lastLowerIdx, int firstUpperIdx)
{
    if (currentState == POSITION_TYPE::UNDER_DIAG) {
        if (idx != lastLowerIdx - 1) { // we should increment to be on the diagonal
            ++idx;
        } else {
            currentState = POSITION_TYPE::ON_DIAG;
        }
    } else if (currentState == POSITION_TYPE::ON_DIAG) {
        idx = firstUpperIdx;
        currentState = POSITION_TYPE::ABOVE_DIAG;
    } else { // currentState = POSITION_TYPE::ABOVE_DIAG
        ++idx;
    }
}

template <class T, int blocksize>
__global__ void cuLUFactorizationSplit(T* reorderedLowerMat,
                                       int* lowerRowIndices,
                                       int* lowerColIndices,
                                       T* reorderedUpperMat,
                                       int* upperRowIndices,
                                       int* upperColIndices,
                                       T* diagonal,
                                       int* reorderedToNatural,
                                       int* naturalToReordered,
                                       const int startIdx,
                                       int rowsInLevelSet)
{
    auto reorderedIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
    constexpr int scalarsInBlock = blocksize * blocksize;

    if (reorderedIdx < rowsInLevelSet + startIdx) {
        // if (reorderedIdx < rowsInLevelSet){
        int naturalIdx = reorderedToNatural[reorderedIdx];
        // for each element under the diagonal
        int endOfRowILower = lowerRowIndices[reorderedIdx + 1];
        int endOfRowIUpper = upperRowIndices[reorderedIdx + 1];
        int startOfRowILower = lowerRowIndices[reorderedIdx];
        int startOfRowIUpper = upperRowIndices[reorderedIdx];
        for (int ij = startOfRowILower; ij < endOfRowILower; ++ij) {

            // consider the block we are looking at to be A_ij
            // we want to set B_ij = A_ij * A_jj, where A_jj is a diagonal element on a previous row, meaning it is
            // already inverted. Find the A_jj block
            int j = naturalToReordered[lowerColIndices[ij]];
            int endOfRowJ = upperRowIndices[j + 1];
            int startOfRowJ = upperRowIndices[j];

            // the DUNE implementation is inplace, and I need to take care to make sure
            // this out of place implementation is equivalent
            mmOverlap<T, blocksize>(&reorderedLowerMat[ij * scalarsInBlock],
                                    &diagonal[j * scalarsInBlock],
                                    &reorderedLowerMat[ij * scalarsInBlock]);

            // we have now accounted for an element under the diagonal and the diagonal element above it
            // now iterate over the blocks that align in the column
            int jk = startOfRowJ;
            POSITION_TYPE ikState = POSITION_TYPE::UNDER_DIAG; // it is guaranteed that the ij element we consider
                                                               // is under the diagonal
            int ik = ij;
            incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
            // this code is NOT gpu friendly, thread divergence will probably slow this down significantly...

            // checking for ik != endOfRowUpper is not sufficient alone because
            // the same block index might be found in the lower part of the matrix
            while (!(ik == endOfRowIUpper && ikState == POSITION_TYPE::ABOVE_DIAG) && jk != endOfRowJ) {

                T* ikBlockPtr;
                int ikColumn;
                if (ikState == POSITION_TYPE::UNDER_DIAG) {
                    ikBlockPtr = &reorderedLowerMat[ik * scalarsInBlock];
                    ikColumn = lowerColIndices[ik];
                } else if (ikState == POSITION_TYPE::ON_DIAG) {
                    ikBlockPtr = &diagonal[reorderedIdx * scalarsInBlock];
                    ikColumn = naturalIdx;
                } else { // ikState == POSITION_TYPE::ABOVE_DIAG
                    ikBlockPtr = &reorderedUpperMat[ik * scalarsInBlock];
                    ikColumn = upperColIndices[ik];
                }

                if (ikColumn == upperColIndices[jk]) {
                    // A_jk = A_ij * A_jk
                    T tmp[scalarsInBlock] = {0};

                    mmNoOverlap<T, blocksize>(
                        &reorderedLowerMat[ij * scalarsInBlock], &reorderedUpperMat[jk * scalarsInBlock], tmp);
                    matrixSubtraction<T, blocksize>(ikBlockPtr, tmp);
                    incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
                    ++jk;
                } else {
                    if (ikColumn < upperColIndices[jk]) {
                        incrementAcrossSplitStructure(ik, ikState, endOfRowILower, startOfRowIUpper);
                    } else {
                        ++jk;
                    }
                }
            }
        }
        invBlockInPlace<T, blocksize>(&diagonal[reorderedIdx * scalarsInBlock]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSet(T* mat,
                                     int* rowIndices,
                                     int* colIndices,
                                     int* indexConversion,
                                     int startIdx,
                                     int rowsInLevelSet,
                                     const T* d,
                                     T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdx = rowIndices[reorderedIdx];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        int block = nnzIdx;
        for (; colIndices[block] < naturalRowIdx; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }
        for (int i = 0; i < blocksize; ++i) {
            v[naturalRowIdx * blocksize + i] = rhs[i];
        }
    }
}

template <class T, int blocksize>
__global__ void cuSolveUpperLevelSet(
    T* mat, int* rowIndices, int* colIndices, int* indexConversion, int startIdx, int rowsInLevelSet, T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {

        const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = v[naturalRowIdx * blocksize + i];
        }

        int block = nnzIdxLim - 1;
        for (; colIndices[block] > naturalRowIdx; --block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&mat[block * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}

template <class T, int blocksize>
__global__ void cuSolveLowerLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* d,
                                          T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedIdx];
        const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = d[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }
        for (int i = 0; i < blocksize; ++i) {
            v[naturalRowIdx * blocksize + i] = rhs[i];
        }
    }
}

template <class T, int blocksize>
__global__ void cuSolveUpperLevelSetSplit(T* mat,
                                          int* rowIndices,
                                          int* colIndices,
                                          int* indexConversion,
                                          int startIdx,
                                          int rowsInLevelSet,
                                          const T* dInv,
                                          T* v)
{
    auto reorderedIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
    if (reorderedIdx < rowsInLevelSet + startIdx) {
        const size_t nnzIdx = rowIndices[reorderedIdx];
        const size_t nnzIdxLim = rowIndices[reorderedIdx + 1];
        const int naturalRowIdx = indexConversion[reorderedIdx];

        T rhs[blocksize];
        for (int i = 0; i < blocksize; i++) {
            rhs[i] = v[naturalRowIdx * blocksize + i];
        }

        for (int block = nnzIdx; block < nnzIdxLim; ++block) {
            const int col = colIndices[block];
            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
        }

        mv<T, blocksize>(&dInv[reorderedIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
    }
}
} // namespace

template <class T, int blocksize>
void
solveLowerLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   const T* d,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveLowerLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSet(T* reorderedMat,
                   int* rowIndices,
                   int* colIndices,
                   int* indexConversion,
                   int startIdx,
                   int rowsInLevelSet,
                   T* v,
                   int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuSolveUpperLevelSet<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSet<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, v);
}

template <class T, int blocksize>
void
solveLowerLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* d,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveLowerLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveLowerLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, d, v);
}
// perform the upper solve for all rows in the same level set
template <class T, int blocksize>
void
solveUpperLevelSetSplit(T* reorderedMat,
                        int* rowIndices,
                        int* colIndices,
                        int* indexConversion,
                        int startIdx,
                        int rowsInLevelSet,
                        const T* dInv,
                        T* v,
                        int thrBlockSize)
{
    int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(
        cuSolveUpperLevelSetSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuSolveUpperLevelSetSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
}

template <class T, int blocksize>
void
LUFactorization(T* srcMatrix,
                int* srcRowIndices,
                int* srcColumnIndices,
                int* naturalToReordered,
                int* reorderedToNatual,
                size_t rowsInLevelSet,
                int startIdx,
                int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorization<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuLUFactorization<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(
        srcMatrix, srcRowIndices, srcColumnIndices, naturalToReordered, reorderedToNatual, rowsInLevelSet, startIdx);
}

template <class T, int blocksize>
void
LUFactorizationSplit(T* reorderedLowerMat,
                     int* lowerRowIndices,
                     int* lowerColIndices,
                     T* reorderedUpperMat,
                     int* upperRowIndices,
                     int* upperColIndices,
                     T* diagonal,
                     int* reorderedToNatural,
                     int* naturalToReordered,
                     const int startIdx,
                     int rowsInLevelSet,
                     int thrBlockSize)
{
    int threadBlockSize
        = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuLUFactorizationSplit<T, blocksize>, thrBlockSize);
    int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(rowsInLevelSet, threadBlockSize);
    cuLUFactorizationSplit<T, blocksize><<<nThreadBlocks, threadBlockSize>>>(reorderedLowerMat,
                                                                             lowerRowIndices,
                                                                             lowerColIndices,
                                                                             reorderedUpperMat,
                                                                             upperRowIndices,
                                                                             upperColIndices,
                                                                             diagonal,
                                                                             reorderedToNatural,
                                                                             naturalToReordered,
                                                                             startIdx,
                                                                             rowsInLevelSet);
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                     \
    template void solveUpperLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, T*, int);                         \
    template void solveLowerLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);               \
    template void LUFactorization<T, blocksize>(T*, int*, int*, int*, int*, size_t, int, int);                       \
    template void LUFactorizationSplit<T, blocksize>(                                                                 \
        T*, int*, int*, T*, int*, int*, T*, int*, int*, const int, int, int);                                        \
    template void solveUpperLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);          \
    template void solveLowerLevelSetSplit<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*, int);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);
} // namespace Opm::cuistl::detail::ILU0
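Not part of the patch: a small host-side analogue of incrementAcrossSplitStructure, showing the column order in which cuLUFactorizationSplit visits the blocks of one row stored in the lower/diagonal/upper split. The concrete row contents and the main function are made up for illustration.

#include <cstdio>

// Host-side copy of the state machine: walk a row's lower blocks, then the
// diagonal block, then the upper blocks, in ascending column order.
enum class PositionType { UnderDiag, OnDiag, AboveDiag };

void increment(int& idx, PositionType& state, int lastLowerIdx, int firstUpperIdx)
{
    if (state == PositionType::UnderDiag) {
        if (idx != lastLowerIdx - 1) {
            ++idx;
        } else {
            state = PositionType::OnDiag;
        }
    } else if (state == PositionType::OnDiag) {
        idx = firstUpperIdx;
        state = PositionType::AboveDiag;
    } else {
        ++idx;
    }
}

int main()
{
    // Example row (natural index 2) with lower blocks in columns {0, 1} and
    // upper blocks in columns {3, 5}.
    const int lowerCols[] = {0, 1};
    const int upperCols[] = {3, 5};
    const int lastLowerIdx = 2;  // one-past-the-end of this row in the lower matrix
    const int firstUpperIdx = 0; // start of this row in the upper matrix
    const int endUpperIdx = 2;   // one-past-the-end of this row in the upper matrix

    int idx = 0;
    PositionType state = PositionType::UnderDiag;
    while (!(state == PositionType::AboveDiag && idx == endUpperIdx)) {
        if (state == PositionType::UnderDiag) {
            std::printf("lower col %d\n", lowerCols[idx]);
        } else if (state == PositionType::OnDiag) {
            std::printf("diagonal col 2\n");
        } else {
            std::printf("upper col %d\n", upperCols[idx]);
        }
        increment(idx, state, lastLowerIdx, firstUpperIdx);
    }
    // Prints the blocks of the row in ascending column order: 0, 1, 2, 3, 5.
}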
@ -0,0 +1,193 @@
|
|||||||
|
/*
|
||||||
|
Copyright 2024 SINTEF AS
|
||||||
|
|
||||||
|
This file is part of the Open Porous Media project (OPM).
|
||||||
|
|
||||||
|
OPM is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
OPM is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with OPM. If not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
#ifndef OPM_ILU0_KERNELS_HPP
|
||||||
|
#define OPM_ILU0_KERNELS_HPP
|
||||||
|
#include <cstddef>
|
||||||
|
#include <vector>
|
||||||
|
namespace Opm::cuistl::detail::ILU0
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Perform a upper solve on certain rows in a matrix that can safely be computed in parallel
|
||||||
|
* @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
|
||||||
|
* that rows in the same level sets are contiguous
|
||||||
|
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
|
||||||
|
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
|
||||||
|
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
|
||||||
|
* in the natural ordered matrix
|
||||||
|
* @param startIdx Index of the first row of the matrix to be solve
|
||||||
|
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
|
||||||
|
* function
|
||||||
|
* @param [out] v Will store the results of the upper solve
|
||||||
|
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
|
||||||
|
*/
|
||||||
|
template <class T, int blocksize>
|
||||||
|
void solveUpperLevelSet(T* reorderedMat,
|
||||||
|
int* rowIndices,
|
||||||
|
int* colIndices,
|
||||||
|
int* indexConversion,
|
||||||
|
int startIdx,
|
||||||
|
int rowsInLevelSet,
|
||||||
|
T* v,
|
||||||
|
int threadBlockSize);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
|
||||||
|
* @param reorderedMat pointer to GPU memory containing nonzerovalues of the sparse matrix. The matrix reordered such
|
||||||
|
* that rows in the same level sets are contiguous
|
||||||
|
* @param rowIndices Pointer to vector on GPU containing row indices compliant wiht bsr format
|
||||||
|
* @param colIndices Pointer to vector on GPU containing col indices compliant wiht bsr format
|
||||||
|
* @param indexConversion Integer array containing mapping an index in the reordered matrix to its corresponding index
|
||||||
|
* in the natural ordered matrix
|
||||||
|
* @param startIdx Index of the first row of the matrix to be solve
|
||||||
|
* @param rowsInLevelSet Number of rows in this level set, which number the amount of rows solved in parallel by this
|
||||||
|
* function
|
||||||
|
* @param d Stores the defect
|
||||||
|
* @param [out] v Will store the results of the lower solve
|
||||||
|
* @param threadBlockSize The number of threads per threadblock. Leave as -1 if no blocksize is already chosen
|
||||||
|
*/
|
||||||
|
template <class T, int blocksize>
|
||||||
|
void solveLowerLevelSet(T* reorderedMat,
|
||||||
|
int* rowIndices,
|
||||||
|
int* colIndices,
|
||||||
|
int* indexConversion,
|
||||||
|
int startIdx,
|
||||||
|
int rowsInLevelSet,
|
||||||
|
const T* d,
|
||||||
|
T* v,
|
||||||
|
int threadBlockSize);
|
||||||
|
|
||||||
|
/**
 * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedUpperMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly upper triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param dInv The diagonal elements of the LU factorization. Stores the inverse of the diagonal entries
 * @param [out] v Will store the result of the upper solve. To begin with it should store the output from the lower
 * solve
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void solveUpperLevelSetSplit(T* reorderedMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* dInv,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
 * @param reorderedLowerMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous. This matrix is assumed to be strictly lower triangular
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param indexConversion Integer array mapping an index in the reordered matrix to its corresponding index
 * in the naturally ordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param d Stores the defect
 * @param [out] v Will store the result of the lower solve
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void solveLowerLevelSetSplit(T* reorderedLowerMat,
                             int* rowIndices,
                             int* colIndices,
                             int* indexConversion,
                             int startIdx,
                             int rowsInLevelSet,
                             const T* d,
                             T* v,
                             int threadBlockSize);

/**
 * @brief Computes the ILU factorization of the input BCSR matrix, which is stored in a reordered way. The diagonal
 * elements store the inverse of the diagonal entries
 * @param [out] reorderedMat pointer to GPU memory containing the nonzero values of the sparse matrix. The matrix is
 * reordered such that rows in the same level set are contiguous
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param columnIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param startIdx Index of the first row of the matrix to be solved
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void LUFactorization(T* reorderedMat,
                     int* rowIndices,
                     int* columnIndices,
                     int* naturalToReordered,
                     int* reorderedToNatural,
                     size_t rowsInLevelSet,
                     int startIdx,
                     int threadBlockSize);

/**
 * @brief Computes the ILU0 factorization in-place of a BCSR matrix stored in a split format (lower, diagonal and upper
 * triangular part)
 * @param [out] reorderedLowerMat pointer to GPU memory containing the nonzero values of the strictly lower triangular
 * sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param lowerRowIndices Pointer to vector on GPU containing row indices of the lower matrix compliant with the BSR format
 * @param lowerColIndices Pointer to vector on GPU containing column indices of the lower matrix compliant with the BSR format
 * @param [out] reorderedUpperMat pointer to GPU memory containing the nonzero values of the strictly upper triangular
 * sparse matrix. The matrix is reordered such that rows in the same level set are contiguous
 * @param upperRowIndices Pointer to vector on GPU containing row indices of the upper matrix compliant with the BSR format
 * @param upperColIndices Pointer to vector on GPU containing column indices of the upper matrix compliant with the BSR format
 * @param [out] diagonal The diagonal elements of the ILU0 factorization, inverted
 * @param reorderedToNatural Integer array mapping an index in the reordered matrix to its corresponding
 * index in the naturally ordered matrix
 * @param naturalToReordered Integer array mapping an index in the naturally ordered matrix to its corresponding
 * index in the reordered matrix
 * @param startIdx Index of the first row of the matrix to be solved
 * @param rowsInLevelSet Number of rows in this level set, which is the number of rows solved in parallel by this
 * function
 * @param threadBlockSize The number of threads per thread block. Leave as -1 if no block size has been chosen
 */
template <class T, int blocksize>
void LUFactorizationSplit(T* reorderedLowerMat,
                          int* lowerRowIndices,
                          int* lowerColIndices,
                          T* reorderedUpperMat,
                          int* upperRowIndices,
                          int* upperColIndices,
                          T* diagonal,
                          int* reorderedToNatural,
                          int* naturalToReordered,
                          int startIdx,
                          int rowsInLevelSet,
                          int threadBlockSize);

} // namespace Opm::cuistl::detail::ILU0

#endif
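The split kernels above are the building blocks of a preconditioner apply: one lower solve per level set in ascending order, followed by one upper solve per level set in descending order using the inverted diagonal. A rough host-side sketch of that orchestration is shown below; the GpuIlu0Sketch struct and applyIlu0Sketch function are illustrative names only, not part of the OPM API, and the real driver lives in OpmCuILU0.cpp.

#include <vector>
// OPM headers (ILU0Kernels.hpp) are assumed to be available.

// Sketch only: a hypothetical bundle of device pointers for the reordered
// lower/upper factors, the inverted diagonal and the level-set layout.
struct GpuIlu0Sketch {
    double* lowerVals;  int* lowerRows;  int* lowerCols;  // strictly lower factor (reordered)
    double* upperVals;  int* upperRows;  int* upperCols;  // strictly upper factor (reordered)
    double* dInv;                                         // inverted diagonal blocks
    int* reorderedToNatural;                              // index conversion used by the kernels
    std::vector<int> levelSetSizes;                       // rows per level set, in solve order
};

template <int blocksize>
void applyIlu0Sketch(const GpuIlu0Sketch& p, const double* d, double* v)
{
    // Forward sweep: rows within a level set are independent, so each call
    // solves one whole level set in parallel on the GPU.
    int startIdx = 0;
    for (int rows : p.levelSetSizes) {
        Opm::cuistl::detail::ILU0::solveLowerLevelSetSplit<double, blocksize>(
            p.lowerVals, p.lowerRows, p.lowerCols, p.reorderedToNatural, startIdx, rows, d, v, -1);
        startIdx += rows;
    }
    // Backward sweep: walk the level sets in reverse and apply the inverted diagonal.
    for (auto it = p.levelSetSizes.rbegin(); it != p.levelSetSizes.rend(); ++it) {
        startIdx -= *it;
        Opm::cuistl::detail::ILU0::solveUpperLevelSetSplit<double, blocksize>(
            p.upperVals, p.upperRows, p.upperCols, p.reorderedToNatural, startIdx, *it, p.dInv, v, -1);
    }
}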
@ -0,0 +1,88 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <config.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/detail/deviceBlockOperations.hpp>
#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
#include <stdexcept>

namespace Opm::cuistl::detail::JAC
{
namespace
{
    template <class T, int blocksize>
    __global__ void
    cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
    {
        const auto row = blockDim.x * blockIdx.x + threadIdx.x;

        if (row < numberOfRows) {
            size_t nnzIdx = rowIndices[row];
            size_t nnzIdxLim = rowIndices[row + 1];

            // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
            // zero element
            while (nnzIdx < nnzIdxLim && colIndices[nnzIdx] != row) {
                ++nnzIdx;
            }

            // diagBlock points to the start of where the diagonal block is stored
            T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
            // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
            // element should be stored
            T* vecBlock = &vec[blocksize * blocksize * row];

            invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
        }
    }
} // namespace

template <class T, int blocksize>
void
invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
{
    if (blocksize <= 3) {
        int threadBlockSize
            = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(cuInvertDiagonalAndFlatten<T, blocksize>);
        int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfRows, threadBlockSize);
        cuInvertDiagonalAndFlatten<T, blocksize>
            <<<nThreadBlocks, threadBlockSize>>>(mat, rowIndices, colIndices, numberOfRows, vec);
    } else {
        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
    }
}

#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize)                                                                      \
    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*);

INSTANTIATE_KERNEL_WRAPPERS(float, 1);
INSTANTIATE_KERNEL_WRAPPERS(float, 2);
INSTANTIATE_KERNEL_WRAPPERS(float, 3);
INSTANTIATE_KERNEL_WRAPPERS(float, 4);
INSTANTIATE_KERNEL_WRAPPERS(float, 5);
INSTANTIATE_KERNEL_WRAPPERS(float, 6);
INSTANTIATE_KERNEL_WRAPPERS(double, 1);
INSTANTIATE_KERNEL_WRAPPERS(double, 2);
INSTANTIATE_KERNEL_WRAPPERS(double, 3);
INSTANTIATE_KERNEL_WRAPPERS(double, 4);
INSTANTIATE_KERNEL_WRAPPERS(double, 5);
INSTANTIATE_KERNEL_WRAPPERS(double, 6);

} // namespace Opm::cuistl::detail::JAC
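The kernel above delegates the per-block work to invBlockOutOfPlace from deviceBlockOperations.hpp, which is not part of this diff. For a 2x2 block the operation it has to perform amounts to the usual closed-form inverse written to a separate output buffer; the following standalone device function is an illustration of that computation only, not the OPM implementation:

// Illustration only: closed-form inverse of a 2x2 block, written "out of place"
// to a separate output buffer, as cuInvertDiagonalAndFlatten requires.
template <class T>
__device__ __host__ void invert2x2Sketch(const T* a, T* inv)
{
    // a = [a0 a1; a2 a3] stored row-major; the determinant must be nonzero.
    const T det = a[0] * a[3] - a[1] * a[2];
    const T idet = T(1.0) / det;
    inv[0] =  a[3] * idet;
    inv[1] = -a[1] * idet;
    inv[2] = -a[2] * idet;
    inv[3] =  a[0] * idet;
}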
@ -0,0 +1,38 @@
/*
  Copyright 2024 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_JAC_KERNELS_HPP
#define OPM_JAC_KERNELS_HPP
#include <cstddef>
#include <vector>
namespace Opm::cuistl::detail::JAC
{

/**
 * @brief This function receives a matrix, and the inverse of the matrix containing only its diagonal is stored in vec
 * @param mat pointer to GPU memory containing the nonzero values of the sparse matrix
 * @param rowIndices Pointer to vector on GPU containing row indices compliant with the BSR format
 * @param colIndices Pointer to vector on GPU containing column indices compliant with the BSR format
 * @param numberOfRows Integer describing the number of rows in the matrix
 * @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
 */
template <class T, int blocksize>
void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);

} // namespace Opm::cuistl::detail::JAC
#endif
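A typical call of this function appears in the updated unit test later in this commit; the pattern is roughly the following fragment, assuming B is a Dune::BCRSMatrix with N block rows of 3x3 blocks:

// Upload the matrix, then flatten and invert its diagonal blocks into dInvDiag.
auto m = Opm::cuistl::CuSparseMatrix<double>::fromMatrix(B);
Opm::cuistl::CuVector<double> dInvDiag(3 * 3 * N);

Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<double, 3>(
    m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());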
@ -16,14 +16,15 @@
 You should have received a copy of the GNU General Public License
 along with OPM. If not, see <http://www.gnu.org/licenses/>.
 */
+#include <config.h>
 #include <opm/common/ErrorMacros.hpp>
-#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
+#include <opm/simulators/linalg/cuistl/CuVector.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cublas_safe_call.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cublas_wrapper.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cuda_safe_call.hpp>
-#include <opm/simulators/linalg/cuistl/CuVector.hpp>
+#include <opm/simulators/linalg/cuistl/detail/gpuThreadUtils.hpp>
+#include <opm/simulators/linalg/cuistl/detail/vector_operations.hpp>
 #include <stdexcept>
-#include <config.h>
 
 namespace Opm::cuistl::detail
 {
@ -83,20 +84,8 @@ namespace
 }
 }
 
-constexpr inline size_t getThreads([[maybe_unused]] size_t numberOfElements)
-{
-return 1024;
-}
-
-inline size_t getBlocks(size_t numberOfElements)
-{
-const auto threads = getThreads(numberOfElements);
-return (numberOfElements + threads - 1) / threads;
-}
-
 template <class T>
-__global__ void
-prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
+__global__ void prepareSendBufKernel(const T* a, T* buffer, size_t numberOfElements, const int* indices)
 {
 const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
 
@ -105,8 +94,7 @@ namespace
 }
 }
 template <class T>
-__global__ void
-syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
+__global__ void syncFromRecvBufKernel(T* a, T* buffer, size_t numberOfElements, const int* indices)
 {
 const auto globalIndex = blockDim.x * blockIdx.x + threadIdx.x;
 
@ -116,33 +104,13 @@ namespace
 }
 } // namespace
 
-// Kernel here is the function object of the cuda kernel
-template <class Kernel>
-inline int getCudaRecomendedThreadBlockSize(Kernel k)
-{
-#if USE_HIP
-return 512;
-#else
-int blockSize;
-int tmpGridSize;
-std::ignore = cudaOccupancyMaxPotentialBlockSize(&tmpGridSize, &blockSize, k, 0, 0);
-return blockSize;
-#endif
-}
-
-inline int getNumberOfBlocks(int wantedThreads, int threadBlockSize)
-{
-return (wantedThreads + threadBlockSize - 1) / threadBlockSize;
-}
-
 template <class T>
 void
 setVectorValue(T* deviceData, size_t numberOfElements, const T& value)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-setVectorValueKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-deviceData, numberOfElements, value);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setVectorValueKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+setVectorValueKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, value);
 }
 
 template void setVectorValue(double*, size_t, const double&);
@ -153,10 +121,9 @@ template <class T>
 void
 setZeroAtIndexSet(T* deviceData, size_t numberOfElements, const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-setZeroAtIndexSetKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-deviceData, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(setZeroAtIndexSetKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+setZeroAtIndexSetKernel<<<nThreadBlocks, threadBlockSize>>>(deviceData, numberOfElements, indices);
 }
 template void setZeroAtIndexSet(double*, size_t, const int*);
 template void setZeroAtIndexSet(float*, size_t, const int*);
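With the file-local getThreads/getBlocks helpers gone, every wrapper in this file now follows the same launch pattern built on the shared helpers from gpuThreadUtils.hpp; getNumberOfBlocks(n, t) is the ceiling division (n + t - 1) / t that the removed getBlocks helper also used. As a sketch (someKernel is a placeholder for any of the kernels in this file):

// Pick a block size suggested by the occupancy API (or a fixed 512 on HIP),
// then cover all elements with ceil(numberOfElements / threadBlockSize) blocks.
int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(someKernel<T>);
int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
someKernel<<<nThreadBlocks, threadBlockSize>>>(/* kernel arguments */);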
@ -164,12 +131,16 @@ template void setZeroAtIndexSet(int*, size_t, const int*);
 
 template <class T>
 T
-innerProductAtIndices(cublasHandle_t cublasHandle, const T* deviceA, const T* deviceB, T* buffer, size_t numberOfElements, const int* indices)
+innerProductAtIndices(cublasHandle_t cublasHandle,
+                      const T* deviceA,
+                      const T* deviceB,
+                      T* buffer,
+                      size_t numberOfElements,
+                      const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-elementWiseMultiplyKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-deviceA, deviceB, buffer, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(elementWiseMultiplyKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+elementWiseMultiplyKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, deviceB, buffer, numberOfElements, indices);
 
 // TODO: [perf] Get rid of the allocation here.
 CuVector<T> oneVector(numberOfElements);
@ -184,11 +155,12 @@ template float innerProductAtIndices(cublasHandle_t, const float*, const float*,
 template int innerProductAtIndices(cublasHandle_t, const int*, const int*, int* buffer, size_t, const int*);
 
 template <class T>
-void prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
+void
+prepareSendBuf(const T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-prepareSendBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(prepareSendBufKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+prepareSendBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
 OPM_CUDA_SAFE_CALL(cudaDeviceSynchronize()); // The buffers are prepared for MPI. Wait for them to finish.
 }
 template void prepareSendBuf(const double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
@ -196,11 +168,12 @@ template void prepareSendBuf(const float* deviceA, float* buffer, size_t numberO
 template void prepareSendBuf(const int* deviceA, int* buffer, size_t numberOfElements, const int* indices);
 
 template <class T>
-void syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
+void
+syncFromRecvBuf(T* deviceA, T* buffer, size_t numberOfElements, const int* indices)
 {
-int threadBlockSize = getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-syncFromRecvBufKernel<<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(deviceA, buffer, numberOfElements, indices);
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(syncFromRecvBufKernel<T>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+syncFromRecvBufKernel<<<nThreadBlocks, threadBlockSize>>>(deviceA, buffer, numberOfElements, indices);
 // cudaDeviceSynchronize(); // Not needed, I guess...
 }
 template void syncFromRecvBuf(double* deviceA, double* buffer, size_t numberOfElements, const int* indices);
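prepareSendBuf and syncFromRecvBuf pack and unpack the values exchanged over MPI: the first gathers the entries selected by indices into a contiguous buffer, the second scatters received values back into the full vector. A CPU-side sketch of that index mapping, inferred from the names and signatures rather than copied from the kernels:

// Host-side illustration of the gather/scatter the two GPU wrappers perform.
template <class T>
void prepareSendBufHost(const T* a, T* buffer, std::size_t n, const int* indices)
{
    for (std::size_t i = 0; i < n; ++i)
        buffer[i] = a[indices[i]]; // gather into the contiguous send buffer
}

template <class T>
void syncFromRecvBufHost(T* a, const T* buffer, std::size_t n, const int* indices)
{
    for (std::size_t i = 0; i < n; ++i)
        a[indices[i]] = buffer[i]; // scatter received values back into place
}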
@ -217,30 +190,24 @@ weightedDiagMV(const T* squareBlockVector,
 T* dstVec)
 {
 switch (blocksize) {
-case 1:
-{
-int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-weightedDiagMV<T, 1><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
-}
-break;
-case 2:
-{
-int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-weightedDiagMV<T, 2><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
-}
-break;
-case 3:
-{
-int threadBlockSize = getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
-int nThreadBlocks = getNumberOfBlocks(numberOfElements, threadBlockSize);
-weightedDiagMV<T, 3><<<getBlocks(numberOfElements), getThreads(numberOfElements)>>>(
-squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
-}
-break;
+case 1: {
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 1>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+weightedDiagMV<T, 1>
+<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+} break;
+case 2: {
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 2>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+weightedDiagMV<T, 2>
+<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+} break;
+case 3: {
+int threadBlockSize = ::Opm::cuistl::detail::getCudaRecomendedThreadBlockSize(weightedDiagMV<T, 3>);
+int nThreadBlocks = ::Opm::cuistl::detail::getNumberOfBlocks(numberOfElements, threadBlockSize);
+weightedDiagMV<T, 3>
+<<<nThreadBlocks, threadBlockSize>>>(squareBlockVector, numberOfElements, relaxationFactor, srcVec, dstVec);
+} break;
 default:
 OPM_THROW(std::invalid_argument, "blockvector Hadamard product not implemented for blocksize>3");
 break;
@ -28,6 +28,7 @@
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
 #include <opm/simulators/linalg/cuistl/detail/fix_zero_diagonal.hpp>
+#include <opm/simulators/linalg/cuistl/detail/preconditionerKernels/JacKernels.hpp>
 
 using NumericTypes = boost::mpl::list<double, float>;
 
@ -87,7 +88,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
 Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
 Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
 
-Opm::cuistl::detail::invertDiagonalAndFlatten<T, 3>(
+Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 3>(
 m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
 
 std::vector<T> expectedInvDiag {-1.0 / 4.0,
@ -161,7 +162,7 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
 Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
 Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);
 
-Opm::cuistl::detail::invertDiagonalAndFlatten<T, 2>(
+Opm::cuistl::detail::JAC::invertDiagonalAndFlatten<T, 2>(
 m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());
 
 std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};