Mirror of https://github.com/OPM/opm-simulators.git (synced 2025-02-25 18:55:30 -06:00)
Commit 2626fbb84b
@@ -162,6 +162,7 @@ if(CUDA_FOUND)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuVector.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/detail/vector_operations.cu)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.cpp)
+  list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuDILU.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuJac.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/CuSeqILU0.cpp)
   list (APPEND MAIN_SOURCE_FILES opm/simulators/linalg/cuistl/set_device.cpp)
@@ -174,6 +175,7 @@ if(CUDA_FOUND)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/cuda_check_last_error.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuBlasHandle.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp)
+  list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuDILU.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuJac.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuVector.hpp)
   list (APPEND PUBLIC_HEADER_FILES opm/simulators/linalg/cuistl/CuSparseMatrix.hpp)

@@ -50,6 +50,7 @@
 #include <opm/simulators/linalg/cuistl/PreconditionerAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/PreconditionerConvertFieldTypeAdapter.hpp>
 #include <opm/simulators/linalg/cuistl/CuBlockPreconditioner.hpp>
+#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
 #include <opm/simulators/linalg/cuistl/CuJac.hpp>

 #endif
@@ -270,6 +271,16 @@ struct StandardPreconditioners
             auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
             return wrapped;
         });
+
+        F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t, const C& comm) {
+            using field_type = typename V::field_type;
+            using CuDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
+            auto cuDILU = std::make_shared<CuDILU>(op.getmat());
+
+            auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CuDILU>>(cuDILU);
+            auto wrapped = std::make_shared<Opm::cuistl::CuBlockPreconditioner<V, V, Comm>>(adapted, comm);
+            return wrapped;
+        });
 #endif
     }
@@ -490,6 +501,25 @@ struct StandardPreconditioners<Operator,Dune::Amg::SequentialInformation>
             using CUJac = typename Opm::cuistl::CuJac<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
             return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUJac>>(std::make_shared<CUJac>(op.getmat(), w));
         });
+
+        F::addCreator("CUDILU", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            using field_type = typename V::field_type;
+            using CUDILU = typename Opm::cuistl::CuDILU<M, Opm::cuistl::CuVector<field_type>, Opm::cuistl::CuVector<field_type>>;
+            return std::make_shared<Opm::cuistl::PreconditionerAdapter<V, V, CUDILU>>(std::make_shared<CUDILU>(op.getmat()));
+        });
+
+        F::addCreator("CUDILUFloat", [](const O& op, [[maybe_unused]] const P& prm, const std::function<V()>&, std::size_t) {
+            using block_type = typename V::block_type;
+            using VTo = Dune::BlockVector<Dune::FieldVector<float, block_type::dimension>>;
+            using matrix_type_to = typename Dune::BCRSMatrix<Dune::FieldMatrix<float, block_type::dimension, block_type::dimension>>;
+            using CuDILU = typename Opm::cuistl::CuDILU<matrix_type_to, Opm::cuistl::CuVector<float>, Opm::cuistl::CuVector<float>>;
+            using Adapter = typename Opm::cuistl::PreconditionerAdapter<VTo, VTo, CuDILU>;
+            using Converter = typename Opm::cuistl::PreconditionerConvertFieldTypeAdapter<Adapter, M, V, V>;
+            auto converted = std::make_shared<Converter>(op.getmat());
+            auto adapted = std::make_shared<Adapter>(std::make_shared<CuDILU>(converted->getConvertedMatrix()));
+            converted->setUnderlyingPreconditioner(adapted);
+            return converted;
+        });
 #endif
     }
};

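The creators registered above follow a plain string-keyed factory pattern: each name maps to a callback that builds the wrapped (GPU) preconditioner on demand. A minimal sketch of the mechanism — simplified, and not the actual PreconditionerFactory API:

    #include <functional>
    #include <map>
    #include <memory>
    #include <stdexcept>
    #include <string>

    template <class Product, class... Args>
    class ToyFactory
    {
    public:
        using Creator = std::function<std::shared_ptr<Product>(Args...)>;

        // Register a creator under a name such as "CUDILU".
        void addCreator(const std::string& name, Creator creator)
        {
            m_creators[name] = std::move(creator);
        }

        // Look up the name and invoke the stored callback.
        std::shared_ptr<Product> create(const std::string& name, Args... args) const
        {
            const auto it = m_creators.find(name);
            if (it == m_creators.end()) {
                throw std::invalid_argument("Unknown preconditioner: " + name);
            }
            return it->second(args...);
        }

    private:
        std::map<std::string, Creator> m_creators;
    };

The real factory is additionally parameterized on the operator, vector, and communication types; the string lookup shown here is the part these hunks extend with the "CUDILU" and "CUDILUFloat" entries.
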
opm/simulators/linalg/cuistl/CuDILU.cpp (new file, 235 lines)
@@ -0,0 +1,235 @@
/*
  Copyright 2022-2023 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#include <cuda.h>
#include <cuda_runtime.h>
#include <dune/common/fmatrix.hh>
#include <dune/istl/bcrsmatrix.hh>
#include <fmt/core.h>
#include <opm/common/ErrorMacros.hpp>
#include <opm/simulators/linalg/cuistl/CuDILU.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/CuVector.hpp>
#include <opm/simulators/linalg/cuistl/detail/cusparse_matrix_operations.hpp>
#include <opm/simulators/linalg/cuistl/detail/safe_conversion.hpp>
#include <opm/simulators/linalg/matrixblock.hh>
#include <vector>

namespace
{
std::vector<int>
createReorderedToNatural(Opm::SparseTable<size_t> levelSets)
{
    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[globCnt++] = static_cast<int>(col);
        }
    }
    return res;
}

std::vector<int>
createNaturalToReordered(Opm::SparseTable<size_t> levelSets)
{
    auto res = std::vector<int>(Opm::cuistl::detail::to_size_t(levelSets.dataSize()));
    int globCnt = 0;
    for (auto row : levelSets) {
        for (auto col : row) {
            OPM_ERROR_IF(Opm::cuistl::detail::to_size_t(globCnt) >= res.size(),
                         fmt::format("Internal error. globCnt = {}, res.size() = {}", globCnt, res.size()));
            res[col] = globCnt++;
        }
    }
    return res;
}
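// Illustrative example (not part of the patch): for the level sets {{0, 2}, {1}}
// the reordered sequence of natural row indices is 0, 2, 1, so
//     reorderedToNatural = [0, 2, 1]   (reordered index -> natural index)
//     naturalToReordered = [0, 2, 1]   (natural index -> reordered index; equal here by coincidence)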

// TODO: When this function is called we already have the natural ordered matrix on the GPU
// TODO: could it be possible to create the reordered one in a kernel to speed up the constructor?
template <class M, class field_type>
Opm::cuistl::CuSparseMatrix<field_type>
createReorderedMatrix(const M& naturalMatrix, std::vector<int> reorderedToNatural)
{
    M reorderedMatrix(naturalMatrix.N(), naturalMatrix.N(), naturalMatrix.nonzeroes(), M::row_wise);
    for (auto dstRowIt = reorderedMatrix.createbegin(); dstRowIt != reorderedMatrix.createend(); ++dstRowIt) {
        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
        // For elements in A
        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
            dstRowIt.insert(elem.index());
        }
    }

    // TODO: There is probably a faster way to copy by copying whole rows at a time
    for (auto dstRowIt = reorderedMatrix.begin(); dstRowIt != reorderedMatrix.end(); ++dstRowIt) {
        auto srcRow = naturalMatrix.begin() + reorderedToNatural[dstRowIt.index()];
        for (auto elem = srcRow->begin(); elem != srcRow->end(); elem++) {
            reorderedMatrix[dstRowIt.index()][elem.index()] = *elem;
        }
    }

    return Opm::cuistl::CuSparseMatrix<field_type>::fromMatrix(reorderedMatrix, true);
}

} // NAMESPACE

namespace Opm::cuistl
{

template <class M, class X, class Y, int l>
CuDILU<M, X, Y, l>::CuDILU(const M& A)
    : m_cpuMatrix(A)
    , m_levelSets(Opm::getMatrixRowColoring(m_cpuMatrix, Opm::ColoringType::LOWER))
    , m_reorderedToNatural(createReorderedToNatural(m_levelSets))
    , m_naturalToReordered(createNaturalToReordered(m_levelSets))
    , m_gpuMatrix(CuSparseMatrix<field_type>::fromMatrix(m_cpuMatrix, true))
    , m_gpuMatrixReordered(createReorderedMatrix<M, field_type>(m_cpuMatrix, m_reorderedToNatural))
    , m_gpuNaturalToReorder(m_naturalToReordered)
    , m_gpuReorderToNatural(m_reorderedToNatural)
    , m_gpuDInv(m_gpuMatrix.N() * m_gpuMatrix.blockSize() * m_gpuMatrix.blockSize())
{
    // TODO: Should we in some way verify that this matrix is symmetric? Perhaps only in debug mode?
    // Some sanity checks
    OPM_ERROR_IF(A.N() != m_gpuMatrix.N(),
                 fmt::format("CuSparse matrix not same size as DUNE matrix. {} vs {}.", m_gpuMatrix.N(), A.N()));
    OPM_ERROR_IF(A[0][0].N() != m_gpuMatrix.blockSize(),
                 fmt::format("CuSparse matrix not same blocksize as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.blockSize(),
                             A[0][0].N()));
    OPM_ERROR_IF(A.N() * A[0][0].N() != m_gpuMatrix.dim(),
                 fmt::format("CuSparse matrix not same dimension as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.dim(),
                             A.N() * A[0][0].N()));
    OPM_ERROR_IF(A.nonzeroes() != m_gpuMatrix.nonzeroes(),
                 fmt::format("CuSparse matrix not same number of non zeroes as DUNE matrix. {} vs {}.",
                             m_gpuMatrix.nonzeroes(),
                             A.nonzeroes()));
    update();
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::pre([[maybe_unused]] X& x, [[maybe_unused]] Y& b)
{
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::apply(X& v, const Y& d)
{
    OPM_TIMEBLOCK(prec_apply);
    int levelStartIdx = 0;
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        detail::computeLowerSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
                                                                  m_gpuMatrixReordered.getRowIndices().data(),
                                                                  m_gpuMatrixReordered.getColumnIndices().data(),
                                                                  m_gpuReorderToNatural.data(),
                                                                  levelStartIdx,
                                                                  numOfRowsInLevel,
                                                                  m_gpuDInv.data(),
                                                                  d.data(),
                                                                  v.data());
        levelStartIdx += numOfRowsInLevel;
    }

    levelStartIdx = m_cpuMatrix.N();
    // upper triangular solve: (D + U_A) v = D y
    for (int level = m_levelSets.size() - 1; level >= 0; --level) {
        const int numOfRowsInLevel = m_levelSets[level].size();
        levelStartIdx -= numOfRowsInLevel;
        detail::computeUpperSolveLevelSet<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
                                                                  m_gpuMatrixReordered.getRowIndices().data(),
                                                                  m_gpuMatrixReordered.getColumnIndices().data(),
                                                                  m_gpuReorderToNatural.data(),
                                                                  levelStartIdx,
                                                                  numOfRowsInLevel,
                                                                  m_gpuDInv.data(),
                                                                  v.data());
    }
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::post([[maybe_unused]] X& x)
{
}

template <class M, class X, class Y, int l>
Dune::SolverCategory::Category
CuDILU<M, X, Y, l>::category() const
{
    return Dune::SolverCategory::sequential;
}

template <class M, class X, class Y, int l>
void
CuDILU<M, X, Y, l>::update()
{
    OPM_TIMEBLOCK(prec_update);

    m_gpuMatrix.updateNonzeroValues(m_cpuMatrix, true); // send updated matrix to the GPU

    detail::copyMatDataToReordered<field_type, blocksize_>(m_gpuMatrix.getNonZeroValues().data(),
                                                           m_gpuMatrix.getRowIndices().data(),
                                                           m_gpuMatrixReordered.getNonZeroValues().data(),
                                                           m_gpuMatrixReordered.getRowIndices().data(),
                                                           m_gpuNaturalToReorder.data(),
                                                           m_gpuMatrixReordered.N());

    int levelStartIdx = 0;
    for (int level = 0; level < m_levelSets.size(); ++level) {
        const int numOfRowsInLevel = m_levelSets[level].size();

        detail::computeDiluDiagonal<field_type, blocksize_>(m_gpuMatrixReordered.getNonZeroValues().data(),
                                                            m_gpuMatrixReordered.getRowIndices().data(),
                                                            m_gpuMatrixReordered.getColumnIndices().data(),
                                                            m_gpuReorderToNatural.data(),
                                                            m_gpuNaturalToReorder.data(),
                                                            levelStartIdx,
                                                            numOfRowsInLevel,
                                                            m_gpuDInv.data());
        levelStartIdx += numOfRowsInLevel;
    }
}

} // namespace Opm::cuistl
#define INSTANTIATE_CUDILU_DUNE(realtype, blockdim)                                           \
    template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Dune::FieldMatrix<realtype, blockdim, blockdim>>, \
                                         ::Opm::cuistl::CuVector<realtype>,                   \
                                         ::Opm::cuistl::CuVector<realtype>>;                  \
    template class ::Opm::cuistl::CuDILU<Dune::BCRSMatrix<Opm::MatrixBlock<realtype, blockdim, blockdim>>, \
                                         ::Opm::cuistl::CuVector<realtype>,                   \
                                         ::Opm::cuistl::CuVector<realtype>>

INSTANTIATE_CUDILU_DUNE(double, 1);
INSTANTIATE_CUDILU_DUNE(double, 2);
INSTANTIATE_CUDILU_DUNE(double, 3);
INSTANTIATE_CUDILU_DUNE(double, 4);
INSTANTIATE_CUDILU_DUNE(double, 5);
INSTANTIATE_CUDILU_DUNE(double, 6);

INSTANTIATE_CUDILU_DUNE(float, 1);
INSTANTIATE_CUDILU_DUNE(float, 2);
INSTANTIATE_CUDILU_DUNE(float, 3);
INSTANTIATE_CUDILU_DUNE(float, 4);
INSTANTIATE_CUDILU_DUNE(float, 5);
INSTANTIATE_CUDILU_DUNE(float, 6);
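For orientation — this is the standard DILU construction that CuDILU.cpp implements, stated here for the reader rather than taken from the patch. Split A = D_A + L_A + U_A into its block diagonal, strictly lower, and strictly upper parts. DILU preconditions with

    M = (D + L_A) D^{-1} (D + U_A),

where the block diagonal D is chosen so that diag(M) = D_A, giving the recursion

    D_i = A_{ii} - \sum_{j < i} A_{ij} D_j^{-1} A_{ji}.

computeDiluDiagonal builds and stores D_i^{-1} (the mmx2Subtraction device helper handles the A_{ij} D_j^{-1} A_{ji} terms), and applying M^{-1} d is the forward solve (D + L_A) y = d followed by the backward solve (D + U_A) v = D y — exactly the two level-set sweeps in apply() above.
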
opm/simulators/linalg/cuistl/CuDILU.hpp (new file, 121 lines)
@@ -0,0 +1,121 @@
/*
  Copyright 2022-2023 SINTEF AS

  This file is part of the Open Porous Media project (OPM).

  OPM is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.

  OPM is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with OPM. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef OPM_CUDILU_HPP
#define OPM_CUDILU_HPP

#include <dune/istl/preconditioner.hh>
#include <opm/grid/utility/SparseTable.hpp>
#include <opm/simulators/linalg/GraphColoring.hpp>
#include <opm/simulators/linalg/PreconditionerWithUpdate.hpp>
#include <opm/simulators/linalg/cuistl/CuSparseMatrix.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuMatrixDescription.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseHandle.hpp>
#include <opm/simulators/linalg/cuistl/detail/CuSparseResource.hpp>
#include <vector>

namespace Opm::cuistl
{
//! \brief DILU preconditioner on the GPU.
//!
//! \tparam M The matrix type to operate on
//! \tparam X Type of the update
//! \tparam Y Type of the defect
//! \tparam l Ignored. Just there to have the same number of template arguments
//!    as other preconditioners.
//!
//! \note We assume X and Y are both CuVector<real_type>, but we leave them as template
//! arguments in case of future additions.
template <class M, class X, class Y, int l = 1>
class CuDILU : public Dune::PreconditionerWithUpdate<X, Y>
{
public:
    //! \brief The matrix type the preconditioner is for.
    using matrix_type = typename std::remove_const<M>::type;
    //! \brief The domain type of the preconditioner.
    using domain_type = X;
    //! \brief The range type of the preconditioner.
    using range_type = Y;
    //! \brief The field type of the preconditioner.
    using field_type = typename X::field_type;

    //! \brief Constructor.
    //!
    //! \param A The matrix to operate on.
    //!
    explicit CuDILU(const M& A);

    //! \brief Prepare the preconditioner.
    //! \note Does nothing at the time being.
    void pre(X& x, Y& b) override;

    //! \brief Apply the preconditioner.
    void apply(X& v, const Y& d) override;

    //! \brief Post processing
    //! \note Does nothing at the moment
    void post(X& x) override;

    //! Category of the preconditioner (see SolverCategory::Category)
    Dune::SolverCategory::Category category() const override;

    //! \brief Updates the matrix data.
    void update() final;

    //! \returns false
    static constexpr bool shouldCallPre()
    {
        return false;
    }

    //! \returns false
    static constexpr bool shouldCallPost()
    {
        return false;
    }

private:
    //! \brief Reference to the underlying matrix
    const M& m_cpuMatrix;
    //! \brief size_t describing the dimensions of the square block elements
    static constexpr const size_t blocksize_ = matrix_type::block_type::cols;
    //! \brief SparseTable storing each row by level
    Opm::SparseTable<size_t> m_levelSets;
    //! \brief Converts an index in the reordered structure to the index in the naturally ordered structure
    std::vector<int> m_reorderedToNatural;
    //! \brief Converts an index in the naturally ordered structure to the index in the reordered structure
    std::vector<int> m_naturalToReordered;
    //! \brief The A matrix stored on the GPU, and its reordered version
    CuSparseMatrix<field_type> m_gpuMatrix;
    CuSparseMatrix<field_type> m_gpuMatrixReordered;
    //! Row conversion from natural to reordered matrix indices stored on the GPU
    CuVector<int> m_gpuNaturalToReorder;
    //! Row conversion from reordered to natural matrix indices stored on the GPU
    CuVector<int> m_gpuReorderToNatural;
    //! \brief Stores the inverted diagonal that we use in DILU
    CuVector<field_type> m_gpuDInv;
};
} // end namespace Opm::cuistl

#endif
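A minimal usage sketch (illustrative only; it mirrors how the factory creators above wire things up, with a hypothetical, already-assembled matrix A of type Dune::BCRSMatrix<Dune::FieldMatrix<double, 3, 3>> and Dune block vectors v and d of matching size):

    using field_type = double;
    using CuVec = Opm::cuistl::CuVector<field_type>;
    using Matrix = Dune::BCRSMatrix<Dune::FieldMatrix<field_type, 3, 3>>;
    using Vector = Dune::BlockVector<Dune::FieldVector<field_type, 3>>;
    using GpuDILU = Opm::cuistl::CuDILU<Matrix, CuVec, CuVec>;

    // Construct the GPU preconditioner; the constructor copies A to the GPU,
    // computes the level sets, and runs the first update().
    auto cuDILU = std::make_shared<GpuDILU>(A);

    // The adapter lets the GPU preconditioner consume ordinary Dune vectors.
    auto adapted = std::make_shared<Opm::cuistl::PreconditionerAdapter<Vector, Vector, GpuDILU>>(cuDILU);
    adapted->apply(v, d); // v is approximately M^{-1} d

    // After the nonzero values of A change (same sparsity pattern):
    cuDILU->update();
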
@@ -67,12 +67,11 @@ CuJac<M, X, Y, l>::CuJac(const M& A, field_type w)
                              A.nonzeroes()));

     // Compute the inverted diagonal of A and store it in a vector format in m_diagInvFlattened
-    detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(),
-                                     m_gpuMatrix.getRowIndices().data(),
-                                     m_gpuMatrix.getColumnIndices().data(),
-                                     m_gpuMatrix.N(),
-                                     m_gpuMatrix.blockSize(),
-                                     m_diagInvFlattened.data());
+    detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
+                                                                                m_gpuMatrix.getRowIndices().data(),
+                                                                                m_gpuMatrix.getColumnIndices().data(),
+                                                                                m_gpuMatrix.N(),
+                                                                                m_diagInvFlattened.data());
 }

 template <class M, class X, class Y, int l>
@@ -90,12 +89,8 @@ CuJac<M, X, Y, l>::apply(X& v, const Y& d)

     // Compute the MV product where the matrix is diagonal and therefore stored as a vector.
     // The product is thus computed as a Hadamard product.
-    detail::weightedDiagMV(m_diagInvFlattened.data(),
-                           m_gpuMatrix.N(),
-                           m_gpuMatrix.blockSize(),
-                           m_relaxationFactor,
-                           d.data(),
-                           v.data());
+    detail::weightedDiagMV(
+        m_diagInvFlattened.data(), m_gpuMatrix.N(), m_gpuMatrix.blockSize(), m_relaxationFactor, d.data(), v.data());
 }

 template <class M, class X, class Y, int l>
@@ -116,12 +111,11 @@ void
 CuJac<M, X, Y, l>::update()
 {
     m_gpuMatrix.updateNonzeroValues(m_cpuMatrix);
-    detail::invertDiagonalAndFlatten(m_gpuMatrix.getNonZeroValues().data(),
-                                     m_gpuMatrix.getRowIndices().data(),
-                                     m_gpuMatrix.getColumnIndices().data(),
-                                     m_gpuMatrix.N(),
-                                     m_gpuMatrix.blockSize(),
-                                     m_diagInvFlattened.data());
+    detail::invertDiagonalAndFlatten<field_type, matrix_type::block_type::cols>(m_gpuMatrix.getNonZeroValues().data(),
+                                                                                m_gpuMatrix.getRowIndices().data(),
+                                                                                m_gpuMatrix.getColumnIndices().data(),
+                                                                                m_gpuMatrix.N(),
+                                                                                m_diagInvFlattened.data());
 }

 } // namespace Opm::cuistl

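The change running through these hunks is that the block size becomes a template parameter instead of a runtime argument, so every per-block loop in the kernels has a compile-time trip count that the compiler can fully unroll. A minimal sketch of the effect (hypothetical kernel, not from the patch):

    template <class T, int blocksize>
    __global__ void scaleBlocks(T* blocks, T w, size_t numberOfBlocks)
    {
        const auto idx = blockDim.x * blockIdx.x + threadIdx.x;
        if (idx < numberOfBlocks) {
            // blocksize is a compile-time constant, so this loop can be fully
            // unrolled and the block kept in registers; a runtime block size
            // would force a real loop with dynamic indexing.
            for (int i = 0; i < blocksize * blocksize; ++i) {
                blocks[idx * blocksize * blocksize + i] *= w;
            }
        }
    }
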
@@ -21,113 +21,323 @@
 #include <stdexcept>
 namespace Opm::cuistl::detail
 {

 namespace
 {
-// TODO: combine the three following kernels into one using template arguments so that
-// no compile time optimization is lost while making the code more compact
-template <class T>
-__global__ void cuInvertDiagonalAndFlattenBlocksize1(
-    T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
-{
+
+// TODO: figure out if this can be generalized effectively, this seems excessively verbose
+// explicit formulas based on Dune cpu code
+template <class T, int blocksize>
+__device__ __forceinline__ void invBlockOutOfPlace(const T* __restrict__ srcBlock, T* __restrict__ dstBlock)
+{
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
-    const int blocksize = 1;
-
-    if (row < numberOfRows) {
-        size_t nnzIdx = rowIndices[row];
-        size_t nnzIdxLim = rowIndices[row + 1];
-
-        // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
-        // zero element
-        while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
-            ++nnzIdx;
-        }
-
-        // diagBlock points to the start of where the diagonal block is stored
-        T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
-        // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
-        // element should be stored
-        T* vecBlock = &vec[blocksize * blocksize * row];
-
-        vecBlock[0] = 1.0 / (diagBlock[0]);
-    }
-}
-
-template <class T>
-__global__ void cuInvertDiagonalAndFlattenBlocksize2(
-    T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
-{
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
-    const int blocksize = 2;
-
-    if (row < numberOfRows) {
-        size_t nnzIdx = rowIndices[row];
-        size_t nnzIdxLim = rowIndices[row + 1];
-
-        // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
-        // zero element
-        while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
-            ++nnzIdx;
-        }
-
-        // diagBlock points to the start of where the diagonal block is stored
-        T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
-        // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
-        // element should be stored
-        T* vecBlock = &vec[blocksize * blocksize * row];
-
+    if (blocksize == 1) {
+        dstBlock[0] = 1.0 / (srcBlock[0]);
+    } else if (blocksize == 2) {
+        T detInv = 1.0 / (srcBlock[0] * srcBlock[3] - srcBlock[1] * srcBlock[2]);
+        dstBlock[0] = srcBlock[3] * detInv;
+        dstBlock[1] = -srcBlock[1] * detInv;
+        dstBlock[2] = -srcBlock[2] * detInv;
+        dstBlock[3] = srcBlock[0] * detInv;
+    } else if (blocksize == 3) {
         // based on Dune implementation
-        T detInv = 1.0 / (diagBlock[0] * diagBlock[3] - diagBlock[1] * diagBlock[2]);
-        vecBlock[0] = diagBlock[3] * detInv;
-        vecBlock[1] = -diagBlock[1] * detInv;
-        vecBlock[2] = -diagBlock[2] * detInv;
-        vecBlock[3] = diagBlock[0] * detInv;
-    }
-}
-template <class T>
-__global__ void cuInvertDiagonalAndFlattenBlocksize3(
-    T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
-{
-    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
-    const int blocksize = 3;
-
-    if (row < numberOfRows) {
-        size_t nnzIdx = rowIndices[row];
-        size_t nnzIdxLim = rowIndices[row + 1];
-
-        // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
-        // zero element
-        while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
-            ++nnzIdx;
-        }
-
-        // diagBlock points to the start of where the diagonal block is stored
-        T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
-        // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
-        // element should be stored
-        T* vecBlock = &vec[blocksize * blocksize * row];
-
-        // based on Dune implementation
-        T t4 = diagBlock[0] * diagBlock[4];
-        T t6 = diagBlock[0] * diagBlock[5];
-        T t8 = diagBlock[1] * diagBlock[3];
-        T t10 = diagBlock[2] * diagBlock[3];
-        T t12 = diagBlock[1] * diagBlock[6];
-        T t14 = diagBlock[2] * diagBlock[6];
+        T t4 = srcBlock[0] * srcBlock[4];
+        T t6 = srcBlock[0] * srcBlock[5];
+        T t8 = srcBlock[1] * srcBlock[3];
+        T t10 = srcBlock[2] * srcBlock[3];
+        T t12 = srcBlock[1] * srcBlock[6];
+        T t14 = srcBlock[2] * srcBlock[6];

         T t17 = 1.0
-            / (t4 * diagBlock[8] - t6 * diagBlock[7] - t8 * diagBlock[8] + t10 * diagBlock[7] + t12 * diagBlock[5]
-               - t14 * diagBlock[4]); // t17 is 1/determinant
+            / (t4 * srcBlock[8] - t6 * srcBlock[7] - t8 * srcBlock[8] + t10 * srcBlock[7] + t12 * srcBlock[5]
+               - t14 * srcBlock[4]); // t17 is 1/determinant

-        vecBlock[0] = (diagBlock[4] * diagBlock[8] - diagBlock[5] * diagBlock[7]) * t17;
-        vecBlock[1] = -(diagBlock[1] * diagBlock[8] - diagBlock[2] * diagBlock[7]) * t17;
-        vecBlock[2] = (diagBlock[1] * diagBlock[5] - diagBlock[2] * diagBlock[4]) * t17;
-        vecBlock[3] = -(diagBlock[3] * diagBlock[8] - diagBlock[5] * diagBlock[6]) * t17;
-        vecBlock[4] = (diagBlock[0] * diagBlock[8] - t14) * t17;
-        vecBlock[5] = -(t6 - t10) * t17;
-        vecBlock[6] = (diagBlock[3] * diagBlock[7] - diagBlock[4] * diagBlock[6]) * t17;
-        vecBlock[7] = -(diagBlock[0] * diagBlock[7] - t12) * t17;
-        vecBlock[8] = (t4 - t8) * t17;
+        dstBlock[0] = (srcBlock[4] * srcBlock[8] - srcBlock[5] * srcBlock[7]) * t17;
+        dstBlock[1] = -(srcBlock[1] * srcBlock[8] - srcBlock[2] * srcBlock[7]) * t17;
+        dstBlock[2] = (srcBlock[1] * srcBlock[5] - srcBlock[2] * srcBlock[4]) * t17;
+        dstBlock[3] = -(srcBlock[3] * srcBlock[8] - srcBlock[5] * srcBlock[6]) * t17;
+        dstBlock[4] = (srcBlock[0] * srcBlock[8] - t14) * t17;
+        dstBlock[5] = -(t6 - t10) * t17;
+        dstBlock[6] = (srcBlock[3] * srcBlock[7] - srcBlock[4] * srcBlock[6]) * t17;
+        dstBlock[7] = -(srcBlock[0] * srcBlock[7] - t12) * t17;
+        dstBlock[8] = (t4 - t8) * t17;
     }
 }
+
+// explicit formulas based on Dune cpu code
+template <class T, int blocksize>
+__device__ __forceinline__ void invBlockInPlace(T* __restrict__ block)
+{
+    if (blocksize == 1) {
+        block[0] = 1.0 / (block[0]);
+    } else if (blocksize == 2) {
+        T detInv = 1.0 / (block[0] * block[3] - block[1] * block[2]);
+
+        T temp = block[0];
+        block[0] = block[3] * detInv;
+        block[1] = -block[1] * detInv;
+        block[2] = -block[2] * detInv;
+        block[3] = temp * detInv;
+    } else if (blocksize == 3) {
+        T t4 = block[0] * block[4];
+        T t6 = block[0] * block[5];
+        T t8 = block[1] * block[3];
+        T t10 = block[2] * block[3];
+        T t12 = block[1] * block[6];
+        T t14 = block[2] * block[6];
+
+        T det = (t4 * block[8] - t6 * block[7] - t8 * block[8] + t10 * block[7] + t12 * block[5] - t14 * block[4]);
+        T t17 = T(1.0) / det;
+
+        T matrix01 = block[1];
+        T matrix00 = block[0];
+        T matrix10 = block[3];
+        T matrix11 = block[4];
+
+        block[0] = (block[4] * block[8] - block[5] * block[7]) * t17;
+        block[1] = -(block[1] * block[8] - block[2] * block[7]) * t17;
+        block[2] = (matrix01 * block[5] - block[2] * block[4]) * t17;
+        block[3] = -(block[3] * block[8] - block[5] * block[6]) * t17;
+        block[4] = (matrix00 * block[8] - t14) * t17;
+        block[5] = -(t6 - t10) * t17;
+        block[6] = (matrix10 * block[7] - matrix11 * block[6]) * t17;
+        block[7] = -(matrix00 * block[7] - t12) * t17;
+        block[8] = (t4 - t8) * t17;
+    }
+}
+
+enum class MVType { SET, PLUS, MINUS };
+// SET:   c  = A*b
+// PLUS:  c += A*b
+// MINUS: c -= A*b
+template <class T, int blocksize, MVType OP>
+__device__ __forceinline__ void matrixVectorProductWithAction(const T* A, const T* b, T* c)
+{
+    for (int i = 0; i < blocksize; ++i) {
+        if (OP == MVType::SET) {
+            c[i] = 0;
+        }
+
+        for (int j = 0; j < blocksize; ++j) {
+            if (OP == MVType::SET || OP == MVType::PLUS) {
+                c[i] += A[i * blocksize + j] * b[j];
+            } else if (OP == MVType::MINUS) {
+                c[i] -= A[i * blocksize + j] * b[j];
+            }
+        }
+    }
+}
+
+template <class T, int blocksize>
+__device__ __forceinline__ void mv(const T* a, const T* b, T* c)
+{
+    matrixVectorProductWithAction<T, blocksize, MVType::SET>(a, b, c);
+}
+
+template <class T, int blocksize>
+__device__ __forceinline__ void umv(const T* a, const T* b, T* c)
+{
+    matrixVectorProductWithAction<T, blocksize, MVType::PLUS>(a, b, c);
+}
+
+template <class T, int blocksize>
+__device__ __forceinline__ void mmv(const T* a, const T* b, T* c)
+{
+    matrixVectorProductWithAction<T, blocksize, MVType::MINUS>(a, b, c);
+}
+
+// dst -= A*B*C
+template <class T, int blocksize>
+__device__ __forceinline__ void mmx2Subtraction(T* A, T* B, T* C, T* dst)
+{
+
+    T tmp[blocksize * blocksize] = {0};
+    // tmp = A*B
+    for (int i = 0; i < blocksize; ++i) {
+        for (int k = 0; k < blocksize; ++k) {
+            for (int j = 0; j < blocksize; ++j) {
+                tmp[i * blocksize + j] += A[i * blocksize + k] * B[k * blocksize + j];
+            }
+        }
+    }
+
+    // dst -= tmp*C
+    for (int i = 0; i < blocksize; ++i) {
+        for (int k = 0; k < blocksize; ++k) {
+            for (int j = 0; j < blocksize; ++j) {
+                dst[i * blocksize + j] -= tmp[i * blocksize + k] * C[k * blocksize + j];
+            }
+        }
+    }
+}
+
+template <class T, int blocksize>
+__global__ void
+cuInvertDiagonalAndFlatten(T* matNonZeroValues, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
+{
+    const auto row = blockDim.x * blockIdx.x + threadIdx.x;
+
+    if (row < numberOfRows) {
+        size_t nnzIdx = rowIndices[row];
+        size_t nnzIdxLim = rowIndices[row + 1];
+
+        // this loop will cause some extra checks that we are within the limit in the case of the diagonal having a
+        // zero element
+        while (colIndices[nnzIdx] != row && nnzIdx <= nnzIdxLim) {
+            ++nnzIdx;
+        }
+
+        // diagBlock points to the start of where the diagonal block is stored
+        T* diagBlock = &matNonZeroValues[blocksize * blocksize * nnzIdx];
+        // vecBlock points to the start of the block element in the vector where the inverse of the diagonal block
+        // element should be stored
+        T* vecBlock = &vec[blocksize * blocksize * row];
+
+        invBlockOutOfPlace<T, blocksize>(diagBlock, vecBlock);
+    }
+}
+
+template <class T, int blocksize>
+__global__ void cuComputeLowerSolveLevelSet(T* mat,
+                                            int* rowIndices,
+                                            int* colIndices,
+                                            int* indexConversion,
+                                            int startIdx,
+                                            int rowsInLevelSet,
+                                            const T* dInv,
+                                            const T* d,
+                                            T* v)
+{
+    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
+    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
+
+        const size_t nnzIdx = rowIndices[reorderedRowIdx];
+        const int naturalRowIdx = indexConversion[reorderedRowIdx];
+
+        T rhs[blocksize];
+        for (int i = 0; i < blocksize; i++) {
+            rhs[i] = d[naturalRowIdx * blocksize + i];
+        }
+
+        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
+            const int col = colIndices[block];
+            mmv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+        }
+
+        mv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+    }
+}
+
+template <class T, int blocksize>
+__global__ void cuComputeUpperSolveLevelSet(T* mat,
+                                            int* rowIndices,
+                                            int* colIndices,
+                                            int* indexConversion,
+                                            int startIdx,
+                                            int rowsInLevelSet,
+                                            const T* dInv,
+                                            T* v)
+{
+    const auto reorderedRowIdx = startIdx + (blockDim.x * blockIdx.x + threadIdx.x);
+    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
+        const size_t nnzIdxLim = rowIndices[reorderedRowIdx + 1];
+        const int naturalRowIdx = indexConversion[reorderedRowIdx];
+
+        T rhs[blocksize] = {0};
+
+        for (int block = nnzIdxLim - 1; colIndices[block] > naturalRowIdx; --block) {
+            const int col = colIndices[block];
+            umv<T, blocksize>(&mat[block * blocksize * blocksize], &v[col * blocksize], rhs);
+        }
+
+        mmv<T, blocksize>(&dInv[reorderedRowIdx * blocksize * blocksize], rhs, &v[naturalRowIdx * blocksize]);
+    }
+}
+
+template <class T, int blocksize>
+__global__ void cuComputeDiluDiagonal(T* mat,
+                                      int* rowIndices,
+                                      int* colIndices,
+                                      int* reorderedToNatural,
+                                      int* naturalToReordered,
+                                      const int startIdx,
+                                      int rowsInLevelSet,
+                                      T* dInv)
+{
+    const auto reorderedRowIdx = startIdx + blockDim.x * blockIdx.x + threadIdx.x;
+    if (reorderedRowIdx < rowsInLevelSet + startIdx) {
+        const int naturalRowIdx = reorderedToNatural[reorderedRowIdx];
+        const size_t nnzIdx = rowIndices[reorderedRowIdx];
+
+        int diagIdx = nnzIdx;
+        while (colIndices[diagIdx] != naturalRowIdx) {
+            ++diagIdx;
+        }
+
+        T dInvTmp[blocksize * blocksize];
+        for (int i = 0; i < blocksize; ++i) {
+            for (int j = 0; j < blocksize; ++j) {
+                dInvTmp[i * blocksize + j] = mat[diagIdx * blocksize * blocksize + i * blocksize + j];
+            }
+        }
+
+        for (int block = nnzIdx; colIndices[block] < naturalRowIdx; ++block) {
+            const int col = naturalToReordered[colIndices[block]];
+            // find the element with indices swapped:
+            // binary search over the blocks in the right row; [rowIndices[col], rowIndices[col + 1] - 1]
+            // defines the range we search over
+            int left = rowIndices[col];
+            int right = rowIndices[col + 1] - 1;
+            int mid;
+
+            while (left <= right) {
+                mid = left + (right - left) / 2; // overflow-safe average
+                const int col = colIndices[mid];
+
+                if (col == naturalRowIdx) {
+                    break;
+                } else if (col < naturalRowIdx) {
+                    left = mid + 1;
+                } else {
+                    right = mid - 1;
+                }
+            }
+
+            const int symOpposite = mid;
+
+            mmx2Subtraction<T, blocksize>(&mat[block * blocksize * blocksize],
+                                          &dInv[col * blocksize * blocksize],
+                                          &mat[symOpposite * blocksize * blocksize],
+                                          dInvTmp);
+        }
+
+        invBlockInPlace<T, blocksize>(dInvTmp);
+
+        for (int i = 0; i < blocksize; ++i) {
+            for (int j = 0; j < blocksize; ++j) {
+                dInv[reorderedRowIdx * blocksize * blocksize + i * blocksize + j] = dInvTmp[i * blocksize + j];
+            }
+        }
+    }
+}
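(Reviewer note, not part of the patch: the binary search in cuComputeDiluDiagonal relies on a structurally symmetric sparsity pattern — the block at (col, row) must exist whenever the block at (row, col) does — which is also what the symmetry TODO in the CuDILU constructor alludes to.)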
+
+template <class T, int blocksize>
+__global__ void cuMoveDataToReordered(
+    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* indexConversion, size_t numberOfRows)
+{
+    const auto srcRow = blockDim.x * blockIdx.x + threadIdx.x;
+    if (srcRow < numberOfRows) {
+
+        const auto dstRow = indexConversion[srcRow];
+
+        for (int srcBlock = srcRowIndices[srcRow], dstBlock = dstRowIndices[dstRow];
+             srcBlock < srcRowIndices[srcRow + 1];
+             ++srcBlock, ++dstBlock) {
+            for (int i = 0; i < blocksize; ++i) {
+                for (int j = 0; j < blocksize; ++j) {
+                    dstMatrix[dstBlock * blocksize * blocksize + i * blocksize + j]
+                        = srcMatrix[srcBlock * blocksize * blocksize + i * blocksize + j];
+                }
+            }
+        }
+    }
+}

@@ -143,32 +353,103 @@ namespace
 }
 } // namespace

-template <class T>
+template <class T, int blocksize>
 void
-invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec)
+invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec)
 {
-    switch (blocksize) {
-    case 1:
-        cuInvertDiagonalAndFlattenBlocksize1<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
-            mat, rowIndices, colIndices, numberOfRows, vec);
-        break;
-    case 2:
-        cuInvertDiagonalAndFlattenBlocksize2<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
-            mat, rowIndices, colIndices, numberOfRows, vec);
-        break;
-    case 3:
-        cuInvertDiagonalAndFlattenBlocksize3<<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
-            mat, rowIndices, colIndices, numberOfRows, vec);
-        break;
-    default:
+    // TODO: Figure out why this did not produce an error or any output in the output stream or the DBG file
+    // when this case was forced to execute
+    if (blocksize <= 3) {
+        cuInvertDiagonalAndFlatten<T, blocksize>
+            <<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(mat, rowIndices, colIndices, numberOfRows, vec);
+    } else {
         OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
-        break;
     }
 }

-template void invertDiagonalAndFlatten(double*, int*, int*, size_t, size_t, double*);
-template void invertDiagonalAndFlatten(float*, int*, int*, size_t, size_t, float*);
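The launches above rely on the file's getBlocks/getThreads grid-sizing helpers, whose definitions sit in the unchanged part of the file and are not shown in this hunk. A typical shape for such helpers, assumed here purely for illustration:

    // Illustrative only -- the real definitions live in the unchanged part of this file.
    inline size_t getThreads([[maybe_unused]] size_t numberOfRows)
    {
        return 1024; // a common choice of threads per block
    }

    inline size_t getBlocks(size_t numberOfRows)
    {
        const auto threads = getThreads(numberOfRows);
        return (numberOfRows + threads - 1) / threads; // ceiling division: every row gets a thread
    }
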
+// perform the lower solve for all rows in the same level set
+template <class T, int blocksize>
+void
+computeLowerSolveLevelSet(T* reorderedMat,
+                          int* rowIndices,
+                          int* colIndices,
+                          int* indexConversion,
+                          int startIdx,
+                          int rowsInLevelSet,
+                          const T* dInv,
+                          const T* d,
+                          T* v)
+{
+    cuComputeLowerSolveLevelSet<T, blocksize><<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(
+        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, d, v);
+}
+
+// perform the upper solve for all rows in the same level set
+template <class T, int blocksize>
+void
+computeUpperSolveLevelSet(T* reorderedMat,
+                          int* rowIndices,
+                          int* colIndices,
+                          int* indexConversion,
+                          int startIdx,
+                          int rowsInLevelSet,
+                          const T* dInv,
+                          T* v)
+{
+    cuComputeUpperSolveLevelSet<T, blocksize><<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(
+        reorderedMat, rowIndices, colIndices, indexConversion, startIdx, rowsInLevelSet, dInv, v);
+}
+
+template <class T, int blocksize>
+void
+computeDiluDiagonal(T* reorderedMat,
+                    int* rowIndices,
+                    int* colIndices,
+                    int* reorderedToNatural,
+                    int* naturalToReordered,
+                    const int startIdx,
+                    int rowsInLevelSet,
+                    T* dInv)
+{
+    if (blocksize <= 3) {
+        cuComputeDiluDiagonal<T, blocksize>
+            <<<getBlocks(rowsInLevelSet), getThreads(rowsInLevelSet)>>>(reorderedMat,
+                                                                        rowIndices,
+                                                                        colIndices,
+                                                                        reorderedToNatural,
+                                                                        naturalToReordered,
+                                                                        startIdx,
+                                                                        rowsInLevelSet,
+                                                                        dInv);
+    } else {
+        OPM_THROW(std::invalid_argument, "Inverting diagonal is not implemented for blocksizes > 3");
+    }
+}
+
+template <class T, int blocksize>
+void
+copyMatDataToReordered(
+    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows)
+{
+    cuMoveDataToReordered<T, blocksize><<<getBlocks(numberOfRows), getThreads(numberOfRows)>>>(
+        srcMatrix, srcRowIndices, dstMatrix, dstRowIndices, naturalToReordered, numberOfRows);
+}
+
+#define INSTANTIATE_KERNEL_WRAPPERS(T, blocksize) \
+    template void invertDiagonalAndFlatten<T, blocksize>(T*, int*, int*, size_t, T*); \
+    template void copyMatDataToReordered<T, blocksize>(T*, int*, T*, int*, int*, size_t); \
+    template void computeDiluDiagonal<T, blocksize>(T*, int*, int*, int*, int*, const int, int, T*); \
+    template void computeUpperSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, T*); \
+    template void computeLowerSolveLevelSet<T, blocksize>(T*, int*, int*, int*, int, int, const T*, const T*, T*);
+
+INSTANTIATE_KERNEL_WRAPPERS(float, 1);
+INSTANTIATE_KERNEL_WRAPPERS(float, 2);
+INSTANTIATE_KERNEL_WRAPPERS(float, 3);
+INSTANTIATE_KERNEL_WRAPPERS(float, 4);
+INSTANTIATE_KERNEL_WRAPPERS(float, 5);
+INSTANTIATE_KERNEL_WRAPPERS(float, 6);
+INSTANTIATE_KERNEL_WRAPPERS(double, 1);
+INSTANTIATE_KERNEL_WRAPPERS(double, 2);
+INSTANTIATE_KERNEL_WRAPPERS(double, 3);
+INSTANTIATE_KERNEL_WRAPPERS(double, 4);
+INSTANTIATE_KERNEL_WRAPPERS(double, 5);
+INSTANTIATE_KERNEL_WRAPPERS(double, 6);
 } // namespace Opm::cuistl::detail

@@ -29,11 +29,101 @@ namespace Opm::cuistl::detail
  * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
  * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
  * @param numberOfRows Integer describing the number of rows in the matrix
- * @param blocksize Integer describing the sidelength of a block in the sparse matrix
  * @param[out] vec Pointer to the vector where the inverse of the diagonal matrix should be stored
  */
-template <class T>
-void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, size_t blocksize, T* vec);
+template <class T, int blocksize>
+void invertDiagonalAndFlatten(T* mat, int* rowIndices, int* colIndices, size_t numberOfRows, T* vec);
+
+/**
+ * @brief Perform a lower solve on certain rows in a matrix that can safely be computed in parallel
+ * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
+ * such that rows in the same level sets are contiguous
+ * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
+ * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
+ * @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its
+ * corresponding index in the naturally ordered matrix
+ * @param startIdx Index of the first row of the matrix to be solved
+ * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
+ * function
+ * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner. Must be reordered in the same way as
+ * reorderedMat
+ * @param d Stores the defect
+ * @param [out] v Will store the results of the lower solve
+ */
+template <class T, int blocksize>
+void computeLowerSolveLevelSet(T* reorderedMat,
+                               int* rowIndices,
+                               int* colIndices,
+                               int* indexConversion,
+                               int startIdx,
+                               int rowsInLevelSet,
+                               const T* dInv,
+                               const T* d,
+                               T* v);
+
+/**
+ * @brief Perform an upper solve on certain rows in a matrix that can safely be computed in parallel
+ * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
+ * such that rows in the same level sets are contiguous
+ * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
+ * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
+ * @param indexConversion Integer array containing the mapping from an index in the reordered matrix to its
+ * corresponding index in the naturally ordered matrix
+ * @param startIdx Index of the first row of the matrix to be solved
+ * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
+ * function
+ * @param dInv The diagonal matrix used by the Diagonal ILU preconditioner
+ * @param [out] v Will store the results of the upper solve. To begin with it should store the output from the lower
+ * solve
+ */
+template <class T, int blocksize>
+void computeUpperSolveLevelSet(T* reorderedMat,
+                               int* rowIndices,
+                               int* colIndices,
+                               int* indexConversion,
+                               int startIdx,
+                               int rowsInLevelSet,
+                               const T* dInv,
+                               T* v);
+
+/**
+ * @brief Computes the ILU0 of the diagonal elements of the reordered matrix and stores it in a reordered vector
+ * containing the diagonal blocks
+ * @param reorderedMat pointer to GPU memory containing nonzero values of the sparse matrix. The matrix is reordered
+ * such that rows in the same level sets are contiguous
+ * @param rowIndices Pointer to vector on GPU containing row indices compliant with bsr format
+ * @param colIndices Pointer to vector on GPU containing col indices compliant with bsr format
+ * @param reorderedToNatural Integer array containing the mapping from an index in the reordered matrix to its
+ * corresponding index in the naturally ordered matrix
+ * @param naturalToReordered Integer array containing the mapping from an index in the naturally ordered matrix to its
+ * corresponding index in the reordered matrix
+ * @param startIdx Index of the first row of the matrix to be solved
+ * @param rowsInLevelSet Number of rows in this level set, which equals the number of rows solved in parallel by this
+ * function
+ * @param [out] dInv The diagonal matrix used by the Diagonal ILU preconditioner
+ */
+template <class T, int blocksize>
+void computeDiluDiagonal(T* reorderedMat,
+                         int* rowIndices,
+                         int* colIndices,
+                         int* reorderedToNatural,
+                         int* naturalToReordered,
+                         int startIdx,
+                         int rowsInLevelSet,
+                         T* dInv);
+
+/**
+ * @brief Reorders the elements of a matrix by copying them from one matrix to another using a permutation list
+ * @param srcMatrix The source matrix we will copy data from
+ * @param srcRowIndices Pointer to vector on GPU containing row indices for the source matrix compliant with bsr format
+ * @param [out] dstMatrix The destination matrix that we copy data to
+ * @param dstRowIndices Pointer to vector on GPU containing row indices for the destination matrix compliant with bsr
+ * format
+ * @param naturalToReordered Permutation list that converts indices in the src matrix to the indices in the dst matrix
+ * @param numberOfRows The number of rows in the matrices
+ */
+template <class T, int blocksize>
+void copyMatDataToReordered(
+    T* srcMatrix, int* srcRowIndices, T* dstMatrix, int* dstRowIndices, int* naturalToReordered, size_t numberOfRows);
 } // namespace Opm::cuistl::detail
 #endif

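Background on the level-set scheduling used by these kernels (standard level scheduling, summarized here; it is not spelled out in the patch): in the lower solve, row i depends on every row j < i with a nonzero block A_{ij}, so rows are grouped by

    level(i) = 1 + max{ level(j) : j < i, A_{ij} != 0 }    (level(i) = 0 when row i has no such dependency)

and all rows within one level can be solved in parallel once the earlier levels are finished. For example, a 4x4 block matrix whose only strictly-lower blocks are A_{10}, A_{30}, and A_{21} yields the level sets {0}, {1, 3}, {2}. This grouping is what Opm::getMatrixRowColoring(..., ColoringType::LOWER) produces and what CuDILU stores in m_levelSets; the reordering kernels exist so that each level's rows are contiguous in memory when that level is processed by one kernel launch.
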
@@ -87,12 +87,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith3By3Blocks, T, Numeric
     Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
     Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);

-    Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(),
-                                                  m.getRowIndices().data(),
-                                                  m.getColumnIndices().data(),
-                                                  N,
-                                                  blocksize,
-                                                  dInvDiag.data());
+    Opm::cuistl::detail::invertDiagonalAndFlatten<T, 3>(
+        m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());

     std::vector<T> expectedInvDiag {-1.0 / 4.0,
                                     1.0 / 4.0,
@@ -165,12 +161,8 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
     Opm::cuistl::CuSparseMatrix<T> m = Opm::cuistl::CuSparseMatrix<T>::fromMatrix(B);
     Opm::cuistl::CuVector<T> dInvDiag(blocksize * blocksize * N);

-    Opm::cuistl::detail::invertDiagonalAndFlatten(m.getNonZeroValues().data(),
-                                                  m.getRowIndices().data(),
-                                                  m.getColumnIndices().data(),
-                                                  N,
-                                                  blocksize,
-                                                  dInvDiag.data());
+    Opm::cuistl::detail::invertDiagonalAndFlatten<T, 2>(
+        m.getNonZeroValues().data(), m.getRowIndices().data(), m.getColumnIndices().data(), N, dInvDiag.data());

     std::vector<T> expectedInvDiag {2.0, -2.0, -1.0 / 2.0, 1.0, -1.0, 0.0, 0.0, -1.0};
     std::vector<T> computedInvDiag = dInvDiag.asStdVector();
@@ -179,4 +171,4 @@ BOOST_AUTO_TEST_CASE_TEMPLATE(FlattenAndInvertDiagonalWith2By2Blocks, T, Numeric
     for (size_t i = 0; i < expectedInvDiag.size(); ++i) {
         BOOST_CHECK_CLOSE(expectedInvDiag[i], computedInvDiag[i], 1e-7);
     }
-}
+}